Merge pull request #311 from debbiemarkslab/develop

v0.2
debbiemarkslab · Sep 26, 2024 · 88978ee · 88978ee
2 parents ffc16e0 + 188db86
commit 88978ee
Show file tree

Hide file tree

Showing 25 changed files with 1,062 additions and 386 deletions.
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
@@ -8,12 +8,12 @@ jobs:
 
     strategy:
       matrix:
-        python-version: [3.6]
+        python-version: ['3.10', '3.11']
 
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
     - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
+      uses: actions/setup-python@v4
       with:
         python-version: ${{ matrix.python-version }}
     - name: Install and configure conda
@@ -29,14 +29,17 @@ jobs:
         source activate test-environment
     - name: Run setup.py
       run: |
-        python setup.py sdist --formats=zip -k
+        pip install build
+        python setup.py sdist  --formats=zip -k
+        python -m build
         find ./dist -iname "*.zip" -print0 | xargs -0 pip install
         pip install codecov
     - name: Download test files
       run: |
         wget https://marks.hms.harvard.edu/evcouplings_test_cases/data/evcouplings_test_cases.tar.gz
         tar -xf evcouplings_test_cases.tar.gz -C $HOME/
     - name: Run tests in headless xvfb environment
-      uses: GabrielBB/xvfb-action@v1
+      uses: coactions/setup-xvfb@v1
       with:
         run: coverage run -m unittest discover -s test -p "Test*.py"
+        working-directory: ./ #optional
diff --git a/.github/workflows/build_test_and_push.yml b/.github/workflows/build_test_and_push.yml
@@ -9,12 +9,12 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.6]
+        python-version: ['3.10', '3.11']
 
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
     - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
+      uses: actions/setup-python@v4
       with:
         python-version: ${{ matrix.python-version }}
     - name: Install and configure conda
@@ -31,16 +31,18 @@ jobs:
     - name: Run setup.py
       run: |
         python setup.py sdist --formats=zip -k
+        python setup.py bdist_wheel
         find ./dist -iname "*.zip" -print0 | xargs -0 pip install
         pip install codecov
     - name: Download test files
       run: |
         wget https://marks.hms.harvard.edu/evcouplings_test_cases/data/evcouplings_test_cases.tar.gz
         tar -xf evcouplings_test_cases.tar.gz -C $HOME/
     - name: Run tests in headless xvfb environment
-      uses: GabrielBB/xvfb-action@v1
+      uses: coactions/setup-xvfb@v1
       with:
         run: coverage run -m unittest discover -s test -p "Test*.py"
+        working-directory: ./ #optional
     - name: Publish evcouplings to test PyPI
       if: startsWith(github.ref, 'refs/tags')
       uses: pypa/gh-action-pypi-publish@master

diff --git a/README.md b/README.md
@@ -7,11 +7,12 @@ Predict protein structure, function and mutations using evolutionary sequence co
 
 ### Installing the Python package
 
-If you are simply interested in using EVcouplings as a library, installing the Python package is all you need to do (unless you use functions that depend on external tools). If you want to run the *evcouplings* application (alignment generation, model parameter inference, structure prediction, etc.) you will also need to follow the sections on installing external tools and databases.
+* If you are simply interested in using EVcouplings as a library, installing the Python package is all you need to do (unless you use functions that depend on external tools). 
+* If you want to run the *evcouplings* application (alignment generation, model parameter inference, structure prediction, etc.) you will also need to follow the sections on installing external tools and databases.
 
 #### Requirements
 
-EVcouplings requires a Python >= 3.5 installation. Since it depends on some packages that can be tricky to install using pip (numba, numpy, ...), we recommend using the [Anaconda Python distribution](https://www.continuum.io/downloads). In case you are creating a new conda environment or using miniconda, please make sure to run `conda install anaconda` before running pip, or otherwise the required packages will not be present.  
+EVcouplings actively supports Python >= 3.10 installations.  
 
 #### Installation
 
@@ -27,8 +28,6 @@ and to update to the latest version after previously installing EVcouplings from
 
     pip install -U --no-deps https://github.com/debbiemarkslab/EVcouplings/archive/develop.zip
 
-Installation will take seconds.
-
 ### External software tools
 
 *After installation and before running compute jobs, the paths to the respective binaries of the following external tools have to be set in your EVcouplings job configuration file(s).*
@@ -141,7 +140,7 @@ Hopf, T. A., Schärfe, C. P. I., Rodrigues, J. P. G. L. M., Green, A. G., Kohlba
 
 Hopf, T. A., Ingraham, J. B., Poelwijk, F.J., Schärfe, C.P.I., Springer, M., Sander, C., & Marks, D. S. (2017). Mutation effects predicted from sequence co-variation. *Nature Biotechnology* **35**, 128–135 doi:10.1038/nbt.3769
 
-Green, A. G. and Elhabashy, H., Brock, K. P., Maddamsetti, R., Kohlbacher, O., Marks, D. S. (2019) Proteom-scale discovery of protein interactions with residue-level resolution using sequence coevolution. BioRxiv (in review). https://doi.org/10.1101/791293
+Green, A. G. and Elhabashy, H., Brock, K. P., Maddamsetti, R., Kohlbacher, O., Marks, D. S. (2021) Large-scale discovery of protein interactions at residue resolution using co-evolution calculated from genomic sequences. *Nature Communications* **12**, 1396. https://doi.org/10.1038/s41467-021-21636-z
 
 ## Contributors
 

diff --git a/config/sample_config_complex.txt b/config/sample_config_complex.txt
@@ -487,6 +487,12 @@ environment:
     memory: 15000
     time: 2-0:0:0
 
+    # Special setting for "local" engine to define number of workers running in parallel
+    # (note that "cores" has to be defined above to make sure each job only uses a defined
+    # number of cores). If not defined or None, will default to number of cores / cores per job;
+    # otherwise specify integer to limit number of workers (1 for serial execution of subjobs)
+    # parallel_workers: 1
+
     # command that will be executed before running actual computation (can be used to set up environment)
     configuration:
 
@@ -500,7 +506,7 @@ databases:
     uniref90: /n/groups/marks/databases/jackhmmer/uniref90/uniref90_current.o2.fasta
 
     # URL to download sequences if sequence_file is not given. {} will be replaced by sequence_id.
-    sequence_download_url: http://www.uniprot.org/uniprot/{}.fasta
+    sequence_download_url: http://rest.uniprot.org/uniprot/{}.fasta
 
     # Directory with PDB MMTF structures (leave blank to fetch structures from web)
     pdb_mmtf_dir:

diff --git a/config/sample_config_monomer.txt b/config/sample_config_monomer.txt
@@ -384,6 +384,12 @@ environment:
     memory: 15000
     time: 2-0:0:0
 
+    # Special setting for "local" engine to define number of workers running in parallel
+    # (note that "cores" has to be defined above to make sure each job only uses a defined
+    # number of cores). If not defined or None, will default to number of cores / cores per job;
+    # otherwise specify integer to limit number of workers (1 for serial execution of subjobs)
+    # parallel_workers: 1
+
     # command that will be executed before running actual computation (can be used to set up environment)
     configuration:
 
@@ -397,7 +403,7 @@ databases:
     uniref90: /n/groups/marks/databases/jackhmmer/uniref90/uniref90_current.o2.fasta
 
     # URL to download sequences if sequence_file is not given. {} will be replaced by sequence_id.
-    sequence_download_url: http://www.uniprot.org/uniprot/{}.fasta
+    sequence_download_url: http://rest.uniprot.org/uniprot/{}.fasta
 
     # Directory with PDB MMTF structures (leave blank to fetch structures from web)
     pdb_mmtf_dir:

diff --git a/evcouplings/align/alignment.py b/evcouplings/align/alignment.py
@@ -9,6 +9,7 @@
 import re
 from collections import namedtuple, OrderedDict, defaultdict
 from copy import deepcopy
+from pathlib import Path
 
 import numpy as np
 from numba import jit
@@ -326,7 +327,7 @@ def write_a3m(sequences, fileobj, insert_gap=INSERT_GAP, width=80):
         fileobj.write(seq.replace(insert_gap, "") + "\n")
 
 
-def detect_format(fileobj):
+def detect_format(fileobj, filepath=""):
     """
     Detect if an alignment file is in FASTA or
     Stockholm format.
@@ -335,10 +336,12 @@ def detect_format(fileobj):
     ----------
     fileobj : file-like obj
         Alignment file for which to detect format
+    filepath : string or path-like obj
+        Path of alignment file
 
     Returns
     -------
-    format : {"fasta", "stockholm", None}
+    format : {"fasta", "a3m", "stockholm", None}
         Format of alignment, None if not detectable
     """
     for i, line in enumerate(fileobj):
@@ -348,6 +351,9 @@ def detect_format(fileobj):
 
         # This indicates a FASTA file
         if line.startswith(">"):
+            # A3M files have extension .a3m
+            if Path(filepath).suffix.lower() == ".a3m":
+                return "a3m"
             return "fasta"
 
         # Skip comment lines and empty lines for FASTA detection
@@ -422,7 +428,7 @@ def sequences_to_matrix(sequences):
 
     N = len(sequences)
     L = len(next(iter(sequences)))
-    matrix = np.empty((N, L), dtype=np.str)
+    matrix = np.empty((N, L), dtype=str)
 
     for i, seq in enumerate(sequences):
         if len(seq) != L:
@@ -569,7 +575,12 @@ def __init__(self, sequence_matrix, sequence_ids=None, annotation=None,
                 )
 
             # make sure we get rid of iterators etc.
-            self.ids = np.array(list(sequence_ids))
+            self.ids = list(sequence_ids)
+
+        # turn identifiers into numpy array for consistency with previous implementation,
+        # but use dtype object to avoid memory usage issues of numpy string datatypes (the
+        # longest identifier would otherwise define the memory used by every entry)
+        self.ids = np.array(self.ids, dtype=np.object_)
 
         self.id_to_index = {
             id_: i for i, id_ in enumerate(self.ids)
@@ -607,7 +618,7 @@ def from_dict(cls, sequences, **kwargs):
     @classmethod
     def from_file(cls, fileobj, format="fasta",
                   a3m_inserts="first", raise_hmmer_prefixes=True,
-                  **kwargs):
+                  split_header=False, **kwargs):
         """
         Construct an alignment object by reading in an
         alignment file.
@@ -625,6 +636,9 @@ def from_file(cls, fileobj, format="fasta",
             HMMER adds number prefixes to sequence identifiers in Stockholm
             files if identifiers are not unique. If True, the parser will
             raise an exception if a Stockholm alignment has such prefixes.
+        split_header: bool, optional (default: False)
+            Only store identifier portion of each header (before first whitespace)
+            in identifier list, rather than full header line
         **kwargs
             Additional arguments to be passed to class constructor
 
@@ -664,6 +678,12 @@ def from_file(cls, fileobj, format="fasta",
         else:
             raise ValueError("Invalid alignment format: {}".format(format))
 
+        # reduce header lines to identifiers if requested
+        if split_header:
+            seqs = {
+                header.split()[0]: seq for header, seq in seqs.items()
+            }
+
         return cls.from_dict(seqs, **kwargs)
 
     def __getitem__(self, index):
@@ -777,7 +797,8 @@ def select(self, columns=None, sequences=None):
     def apply(self, columns=None, sequences=None, func=np.char.lower):
         """
         Apply a function along columns and/or rows of alignment matrix,
-        or to entire matrix.
+        or to entire matrix. Note that column and row selections are
+        applied independently in this particular order.
 
         Parameters
         ----------
@@ -811,7 +832,7 @@ def apply(self, columns=None, sequences=None, func=np.char.lower):
                 mod_matrix[sequences, :] = func(mod_matrix[sequences, :])
 
         return Alignment(
-            mod_matrix, np.copy(self.ids), deepcopy(self.annotation),
+            mod_matrix, deepcopy(self.ids), deepcopy(self.annotation),
             alphabet=self.alphabet
         )
 

diff --git a/evcouplings/align/protocol.py b/evcouplings/align/protocol.py
@@ -8,7 +8,8 @@
 
 """
 
-from collections import OrderedDict, Iterable
+from collections import OrderedDict
+from collections.abc import Iterable
 import re
 from shutil import copy
 import os
@@ -522,11 +523,15 @@ def describe_frequencies(alignment, first_index, target_seq_index=None):
     fi = alignment.frequencies
     conservation = alignment.conservation()
 
-    fi_cols = {c: fi[:, i] for c, i in alignment.alphabet_map.items()}
+    # careful not to include any characters that are non-match state (e.g. lowercase letters)
+    fi_cols = {
+        c: fi[:, alignment.alphabet_map[c]] for c in alignment.alphabet
+    }
+
     if target_seq_index is not None:
         target_seq = alignment[target_seq_index]
     else:
-        target_seq = np.full((alignment.L), np.nan)
+        target_seq = np.full((alignment.L, ), np.nan)
 
     info = pd.DataFrame(
         {
@@ -539,6 +544,11 @@ def describe_frequencies(alignment, first_index, target_seq_index=None):
     # reorder columns
     info = info.loc[:, ["i", "A_i", "conservation"] + list(alignment.alphabet)]
 
+    # do not report values for lowercase columns
+    info.loc[
+        info.A_i.str.lower() == info.A_i, ["conservation"] + list(alignment.alphabet)
+    ] = np.nan
+
     return info
 
 
@@ -679,7 +689,7 @@ def existing(**kwargs):
 
     # first try to autodetect format of alignment
     with open(input_alignment) as f:
-        format = detect_format(f)
+        format = detect_format(f, filepath=input_alignment)
         if format is None:
             raise InvalidParameterError(
                 "Format of input alignment {} could not be "
@@ -1502,6 +1512,8 @@ def standard(**kwargs):
         annotation_file = prefix + "_annotation.csv"
         annotation = extract_header_annotation(ali_raw)
         annotation.to_csv(annotation_file, index=False)
+    else:
+        annotation_file = None
 
     # center alignment around focus/search sequence
     focus_cols = np.array([c != "-" for c in ali_raw[0]])
@@ -1516,9 +1528,11 @@ def standard(**kwargs):
     outcfg = {
         **jackhmmer_outcfg,
         **mod_outcfg,
-        "annotation_file": annotation_file
     }
 
+    if annotation_file is not None:
+        outcfg["annotation_file"] = annotation_file
+
     # dump output config to YAML file for debugging/logging
     write_config_file(prefix + ".align_standard.outcfg", outcfg)
 

diff --git a/evcouplings/compare/distances.py b/evcouplings/compare/distances.py
@@ -323,7 +323,7 @@ def _add_axis(df, axis):
         else:
             res_i = _add_axis(self.residues_i, "i")
             res_j = _add_axis(self.residues_j, "j")
-            residues = res_i.append(res_j)
+            residues = pd.concat([res_i, res_j])
 
         # save residue table
         residue_table_filename = filename + ".csv"
@@ -770,7 +770,7 @@ def _get_col_name(col_name):
             # extract coverage segments for all individual structures
             segments = {
                 _get_col_name(col_name): find_segments(series.dropna().sort_index().index)
-                for col_name, series in coverage_cols.iteritems()
+                for col_name, series in coverage_cols.items()
             }
 
             return segments
@@ -1273,9 +1273,7 @@ def _get_chains(sifts_result):
     # if no structures given, or path to files, load first
     structures = _prepare_structures(
         structures,
-        sifts_result_i.hits.pdb_id.append(
-            sifts_result_j.hits.pdb_id
-        ),
+        set(sifts_result_i.hits.pdb_id) | set(sifts_result_j.hits.pdb_id),
         raise_missing
     )