From b15c5833ed20249533b29e8cfcceb6eaa14d0fc1 Mon Sep 17 00:00:00 2001
From: Joseph Szymborski <joseph.szymborski@mail.mcgill.ca>
Date: Sat, 27 Jan 2024 17:14:25 -0500
Subject: [PATCH] Added Black dependency, formatted code

---
 intrepppid/__init__.py            | 32 ++++++++++-----------
 intrepppid/__main__.py            |  1 +
 intrepppid/classifier/__init__.py |  2 --
 intrepppid/cli/train.py           | 48 +++++++++++++++----------------
 intrepppid/data/ppi_oma.py        | 45 ++++++++++++++++-------------
 intrepppid/e2e/e2e_triplet.py     | 25 ++++++++++------
 intrepppid/encoders/awd_lstm.py   |  4 +--
 requirements.txt                  |  6 ++++
 8 files changed, 91 insertions(+), 72 deletions(-)

diff --git a/intrepppid/__init__.py b/intrepppid/__init__.py
index ead8043..fce269d 100644
--- a/intrepppid/__init__.py
+++ b/intrepppid/__init__.py
@@ -21,20 +21,20 @@
 
 
 def intrepppid_network(
-        steps_per_epoch: int,
-        vocab_size: int = 250,
-        embedding_size: int = 64,
-        rnn_num_layers: int = 2,
-        rnn_dropout_rate: float = 0.3,
-        variational_dropout: bool = False,
-        bi_reduce: str = "last",
-        embedding_droprate: float = 0.3,
-        num_epochs: int = 100,
-        do_rate: float = 0.3,
-        beta_classifier: int = 2,
-        lr: float = 1e-2,
-        use_projection: bool = False,
-        optimizer_type: str = "ranger21_xx"
+    steps_per_epoch: int,
+    vocab_size: int = 250,
+    embedding_size: int = 64,
+    rnn_num_layers: int = 2,
+    rnn_dropout_rate: float = 0.3,
+    variational_dropout: bool = False,
+    bi_reduce: str = "last",
+    embedding_droprate: float = 0.3,
+    num_epochs: int = 100,
+    do_rate: float = 0.3,
+    beta_classifier: int = 2,
+    lr: float = 1e-2,
+    use_projection: bool = False,
+    optimizer_type: str = "ranger21_xx",
 ):
     """
     This builds a PyTorch nn.Module which represents the INTREPPPID network as
@@ -67,7 +67,7 @@ def intrepppid_network(
         rnn_num_layers,
         rnn_dropout_rate,
         variational_dropout,
-        bi_reduce
+        bi_reduce,
     )
 
     head = MLPHead(embedding_size, do_rate)
@@ -82,7 +82,7 @@ def intrepppid_network(
         beta_classifier,
         use_projection,
         optimizer_type,
-        lr
+        lr,
     )
 
     return net
diff --git a/intrepppid/__main__.py b/intrepppid/__main__.py
index caab563..a7ebcc7 100644
--- a/intrepppid/__main__.py
+++ b/intrepppid/__main__.py
@@ -23,6 +23,7 @@ class Pipeline(object):
     """
     The INTREPPPID CLI
     """
+
     def __init__(self):
         self.train = Train
 
diff --git a/intrepppid/classifier/__init__.py b/intrepppid/classifier/__init__.py
index 2f53f17..2918f50 100644
--- a/intrepppid/classifier/__init__.py
+++ b/intrepppid/classifier/__init__.py
@@ -13,5 +13,3 @@
 # You should have received a copy of the GNU Affero General Public
 # License along with this programme. If not, see
 # <https://www.gnu.org/licenses/agpl-3.0.en.html>.
-
-
diff --git a/intrepppid/cli/train.py b/intrepppid/cli/train.py
index 90d5ffa..8f6d45c 100644
--- a/intrepppid/cli/train.py
+++ b/intrepppid/cli/train.py
@@ -26,30 +26,30 @@ class Train(object):
 
     @staticmethod
     def e2e_rnn_triplet(
-            ppi_dataset_path: Path,
-            sentencepiece_path: Path,
-            c_type: int,
-            num_epochs: int,
-            batch_size: int,
-            seed: Optional[int] = None,
-            vocab_size: int = 250,
-            trunc_len: int = 1500,
-            embedding_size: int = 64,
-            rnn_num_layers: int = 2,
-            rnn_dropout_rate: float = 0.3,
-            variational_dropout: bool = False,
-            bi_reduce: str = "last",
-            workers: int = 4,
-            embedding_droprate: float = 0.3,
-            do_rate: float = 0.3,
-            log_path: Path = Path("./logs/e2e_rnn_triplet"),
-            encoder_only_steps: int = -1,
-            classifier_warm_up: int = -1,
-            beta_classifier: float = 4.0,
-            lr: Union[float, str] = 1e-2,
-            use_projection: bool = False,
-            checkpoint_path: Optional[Path] = None,
-            optimizer_type: str = "ranger21"
+        ppi_dataset_path: Path,
+        sentencepiece_path: Path,
+        c_type: int,
+        num_epochs: int,
+        batch_size: int,
+        seed: Optional[int] = None,
+        vocab_size: int = 250,
+        trunc_len: int = 1500,
+        embedding_size: int = 64,
+        rnn_num_layers: int = 2,
+        rnn_dropout_rate: float = 0.3,
+        variational_dropout: bool = False,
+        bi_reduce: str = "last",
+        workers: int = 4,
+        embedding_droprate: float = 0.3,
+        do_rate: float = 0.3,
+        log_path: Path = Path("./logs/e2e_rnn_triplet"),
+        encoder_only_steps: int = -1,
+        classifier_warm_up: int = -1,
+        beta_classifier: float = 4.0,
+        lr: Union[float, str] = 1e-2,
+        use_projection: bool = False,
+        checkpoint_path: Optional[Path] = None,
+        optimizer_type: str = "ranger21",
     ):
         """
         Train INTREPPPID in an end-to-end fashion using an AWD-LSTM encoder and MLP classifier.
diff --git a/intrepppid/data/ppi_oma.py b/intrepppid/data/ppi_oma.py
index 24b7f5d..1389f94 100644
--- a/intrepppid/data/ppi_oma.py
+++ b/intrepppid/data/ppi_oma.py
@@ -37,7 +37,7 @@ def __init__(
         trunc_len=1000,
         sos=False,
         eos=False,
-        negative_omid=False
+        negative_omid=False,
     ):
         super().__init__()
 
@@ -60,10 +60,7 @@ def __init__(
 
         if self.negative_omid:
             with tb.open_file(self.dataset_path) as dataset:
-                self.all_omids = [
-                    x[0]
-                    for x in dataset.root.orthologs.iterrows()
-                ]
+                self.all_omids = [x[0] for x in dataset.root.orthologs.iterrows()]
 
     @staticmethod
     def static_encode(
@@ -189,7 +186,7 @@ def __init__(
         seed: int,
         sos: bool,
         eos: bool,
-        negative_omid: bool = False
+        negative_omid: bool = False,
     ):
         super().__init__()
 
@@ -226,7 +223,7 @@ def setup(self, stage=None):
             self.trunc_len,
             self.sos,
             self.eos,
-            self.negative_omid
+            self.negative_omid,
         )
         self.dataset_val = IntrepppidDataset2(
             self.dataset_path,
@@ -236,7 +233,7 @@ def setup(self, stage=None):
             self.trunc_len,
             self.sos,
             self.eos,
-            self.negative_omid
+            self.negative_omid,
         )
         self.dataset_test = IntrepppidDataset2(
             self.dataset_path,
@@ -246,7 +243,7 @@ def setup(self, stage=None):
             self.trunc_len,
             self.sos,
             self.eos,
-            self.negative_omid
+            self.negative_omid,
         )
 
     def train_dataloader(self):
@@ -284,7 +281,7 @@ def __init__(
         trunc_len: int = 1000,
         sos: bool = False,
         eos: bool = False,
-        negative_omid: bool = False
+        negative_omid: bool = False,
     ):
         """
         Builds a PyTorch dataset from an HDF5 dataset in the INTREPPPID format.
@@ -323,20 +320,28 @@ def __init__(
 
         with tb.open_file(self.dataset_path) as dataset:
             print("loading interactions...")
-            for row in dataset.root["interactions"][f"c{self.c_type}"][f"c{self.c_type}_{self.split}"]:
-                p1, p2, omid_pid, omid_id, label = row['protein_id1'].decode('utf8'), row['protein_id2'].decode('utf8'), row['omid_protein_id'].decode('utf8'), row['omid_id'], row['label']
+            for row in dataset.root["interactions"][f"c{self.c_type}"][
+                f"c{self.c_type}_{self.split}"
+            ]:
+                p1, p2, omid_pid, omid_id, label = (
+                    row["protein_id1"].decode("utf8"),
+                    row["protein_id2"].decode("utf8"),
+                    row["omid_protein_id"].decode("utf8"),
+                    row["omid_id"],
+                    row["label"],
+                )
                 self.interactions.append((p1, p2, omid_pid, omid_id, label))
 
             print("loading sequences...")
             for row in dataset.root.sequences.iterrows():
-                name = row['name'].decode("utf8")
-                sequence = row['sequence'].decode("utf8")
+                name = row["name"].decode("utf8")
+                sequence = row["sequence"].decode("utf8")
                 self.sequences[name] = sequence
 
             print("loading orthogroups...")
             for row in dataset.root.orthologs.iterrows():
-                ortholog_group_id = row['ortholog_group_id']
-                protein_id = row['protein_id'].decode("utf8")
+                ortholog_group_id = row["ortholog_group_id"]
+                protein_id = row["protein_id"].decode("utf8")
                 self.omid_members[ortholog_group_id].append(protein_id)
 
     @staticmethod
@@ -523,7 +528,7 @@ def __init__(
         seed: int,
         sos: bool,
         eos: bool,
-        negative_omid: bool = False
+        negative_omid: bool = False,
     ):
         """
         A `PyTorch Lightning <https://lightning.ai/docs/pytorch/stable/>`_ `Data Module <https://lightning.ai/docs/pytorch/1.9.3/api/pytorch_lightning.core.LightningDataModule.html>`_ for INTREPPPID datasets.
@@ -580,7 +585,7 @@ def setup(self, stage=None):
             self.trunc_len,
             self.sos,
             self.eos,
-            self.negative_omid
+            self.negative_omid,
         )
         self.dataset_val = IntrepppidDataset(
             self.dataset_path,
@@ -590,7 +595,7 @@ def setup(self, stage=None):
             self.trunc_len,
             self.sos,
             self.eos,
-            self.negative_omid
+            self.negative_omid,
         )
         self.dataset_test = IntrepppidDataset(
             self.dataset_path,
@@ -600,7 +605,7 @@ def setup(self, stage=None):
             self.trunc_len,
             self.sos,
             self.eos,
-            self.negative_omid
+            self.negative_omid,
         )
 
     def train_dataloader(self):
diff --git a/intrepppid/e2e/e2e_triplet.py b/intrepppid/e2e/e2e_triplet.py
index 37a44d9..aaee85e 100644
--- a/intrepppid/e2e/e2e_triplet.py
+++ b/intrepppid/e2e/e2e_triplet.py
@@ -52,7 +52,7 @@ def __init__(
         beta_classifier: float,
         use_projection: bool,
         optimizer_type: str,
-        lr: float
+        lr: float,
     ):
         """
         Create an end-to-end INTREPPPID network which uses a triplet loss for the orthologue task.
@@ -122,7 +122,9 @@ def step(self, batch, stage):
             z_omid_positive = self.encoder(omid_positive_seq)
             z_omid_negative = self.encoder(omid_negative_seq)
 
-        triplet_loss = self.triplet_criterion(z_omid_anchor, z_omid_positive, z_omid_negative)
+        triplet_loss = self.triplet_criterion(
+            z_omid_anchor, z_omid_positive, z_omid_negative
+        )
 
         y_hat = self(p1_seq, p2_seq).squeeze(1)
 
@@ -230,13 +232,20 @@ def configure_optimizers(self):
 
         elif self.optimizer_type == "adamw_1cycle":
             optimizer = AdamW(self.parameters(), lr=self.lr)
-            scheduler = OneCycleLR(optimizer, self.lr, epochs=self.num_epochs, steps_per_epoch=self.steps_per_epoch)
+            scheduler = OneCycleLR(
+                optimizer,
+                self.lr,
+                epochs=self.num_epochs,
+                steps_per_epoch=self.steps_per_epoch,
+            )
 
             return [optimizer], [scheduler]
 
         elif self.optimizer_type == "adamw_cosine":
             optimizer = AdamW(self.parameters(), lr=self.lr)
-            scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=2, eta_min=1e-6)
+            scheduler = CosineAnnealingWarmRestarts(
+                optimizer, T_0=10, T_mult=2, eta_min=1e-6
+            )
 
             return [optimizer], [scheduler]
 
@@ -310,7 +319,7 @@ def train_e2e_rnn_triplet(
         "checkpoint_path": checkpoint_path,
         "use_projection": use_projection,
         "seed": seed,
-        "optimizer_type": optimizer_type
+        "optimizer_type": optimizer_type,
     }
 
     with open(hyperparams_path, "w") as f:
@@ -327,7 +336,7 @@ def train_e2e_rnn_triplet(
         seed=seed,
         sos=False,
         eos=False,
-        negative_omid=True
+        negative_omid=True,
     )
 
     data_module.setup("training")
@@ -342,7 +351,7 @@ def train_e2e_rnn_triplet(
         rnn_num_layers,
         rnn_dropout_rate,
         variational_dropout,
-        bi_reduce
+        bi_reduce,
     )
 
     head = MLPHead(embedding_size, do_rate)
@@ -360,7 +369,7 @@ def train_e2e_rnn_triplet(
         beta_classifier,
         use_projection,
         optimizer_type,
-        lr
+        lr,
     )
 
     num_params = sum(p.numel() for p in net.parameters() if p.requires_grad)
diff --git a/intrepppid/encoders/awd_lstm.py b/intrepppid/encoders/awd_lstm.py
index d7bdf0f..8966fa4 100644
--- a/intrepppid/encoders/awd_lstm.py
+++ b/intrepppid/encoders/awd_lstm.py
@@ -114,7 +114,7 @@ def __init__(
         rnn_num_layers: int,
         rnn_dropout_rate: float,
         variational_dropout: bool,
-        bi_reduce: str
+        bi_reduce: str,
     ):
         """
         Represents an AWD-LSTM encoder.
@@ -135,7 +135,7 @@ def __init__(
             rnn_num_layers,
             rnn_dropout_rate,
             variational_dropout,
-            bi_reduce
+            bi_reduce,
         )
         self.projection = Projection(
             self.encoder.embedding_size, self.encoder.embedding_size * 2, 3
diff --git a/requirements.txt b/requirements.txt
index e167644..5d59622 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,10 +6,12 @@ async-timeout==4.0.3
 attrs==23.2.0
 Babel==2.14.0
 beautifulsoup4==4.12.3
+black==24.1.0
 blosc2==2.5.1
 cachetools==5.3.2
 certifi==2023.11.17
 charset-normalizer==3.3.2
+click==8.1.7
 contourpy==1.2.0
 cycler==0.12.1
 docutils==0.20.1
@@ -34,6 +36,7 @@ matplotlib==3.6.3
 mdurl==0.1.2
 msgpack==1.0.7
 multidict==6.0.4
+mypy-extensions==1.0.0
 ndindex==1.7
 numexpr==2.8.8
 numpy==1.26.3
@@ -45,7 +48,9 @@ oauthlib==3.2.2
 packaging==23.2
 pandas==2.2.0
 passlib==1.7.4
+pathspec==0.12.1
 pillow==10.2.0
+platformdirs==4.1.0
 plyvel==1.5.1
 protobuf==4.23.4
 py-cpuinfo==9.0.0
@@ -82,6 +87,7 @@ tensorboard==2.15.1
 tensorboard-data-server==0.7.2
 termcolor==2.4.0
 threadpoolctl==3.2.0
+tomli==2.0.1
 torch==1.13.1
 torchmetrics==0.11.1
 tqdm==4.64.1