From b15c5833ed20249533b29e8cfcceb6eaa14d0fc1 Mon Sep 17 00:00:00 2001 From: Joseph Szymborski Date: Sat, 27 Jan 2024 17:14:25 -0500 Subject: [PATCH] Added Black dependency, formatted code --- intrepppid/__init__.py | 32 ++++++++++----------- intrepppid/__main__.py | 1 + intrepppid/classifier/__init__.py | 2 -- intrepppid/cli/train.py | 48 +++++++++++++++---------------- intrepppid/data/ppi_oma.py | 45 ++++++++++++++++------------- intrepppid/e2e/e2e_triplet.py | 25 ++++++++++------ intrepppid/encoders/awd_lstm.py | 4 +-- requirements.txt | 6 ++++ 8 files changed, 91 insertions(+), 72 deletions(-) diff --git a/intrepppid/__init__.py b/intrepppid/__init__.py index ead8043..fce269d 100644 --- a/intrepppid/__init__.py +++ b/intrepppid/__init__.py @@ -21,20 +21,20 @@ def intrepppid_network( - steps_per_epoch: int, - vocab_size: int = 250, - embedding_size: int = 64, - rnn_num_layers: int = 2, - rnn_dropout_rate: float = 0.3, - variational_dropout: bool = False, - bi_reduce: str = "last", - embedding_droprate: float = 0.3, - num_epochs: int = 100, - do_rate: float = 0.3, - beta_classifier: int = 2, - lr: float = 1e-2, - use_projection: bool = False, - optimizer_type: str = "ranger21_xx" + steps_per_epoch: int, + vocab_size: int = 250, + embedding_size: int = 64, + rnn_num_layers: int = 2, + rnn_dropout_rate: float = 0.3, + variational_dropout: bool = False, + bi_reduce: str = "last", + embedding_droprate: float = 0.3, + num_epochs: int = 100, + do_rate: float = 0.3, + beta_classifier: int = 2, + lr: float = 1e-2, + use_projection: bool = False, + optimizer_type: str = "ranger21_xx", ): """ This builds a PyTorch nn.Module which represents the INTREPPPID network as @@ -67,7 +67,7 @@ def intrepppid_network( rnn_num_layers, rnn_dropout_rate, variational_dropout, - bi_reduce + bi_reduce, ) head = MLPHead(embedding_size, do_rate) @@ -82,7 +82,7 @@ def intrepppid_network( beta_classifier, use_projection, optimizer_type, - lr + lr, ) return net diff --git a/intrepppid/__main__.py b/intrepppid/__main__.py index caab563..a7ebcc7 100644 --- a/intrepppid/__main__.py +++ b/intrepppid/__main__.py @@ -23,6 +23,7 @@ class Pipeline(object): """ The INTREPPPID CLI """ + def __init__(self): self.train = Train diff --git a/intrepppid/classifier/__init__.py b/intrepppid/classifier/__init__.py index 2f53f17..2918f50 100644 --- a/intrepppid/classifier/__init__.py +++ b/intrepppid/classifier/__init__.py @@ -13,5 +13,3 @@ # You should have received a copy of the GNU Affero General Public # License along with this programme. If not, see # . - - diff --git a/intrepppid/cli/train.py b/intrepppid/cli/train.py index 90d5ffa..8f6d45c 100644 --- a/intrepppid/cli/train.py +++ b/intrepppid/cli/train.py @@ -26,30 +26,30 @@ class Train(object): @staticmethod def e2e_rnn_triplet( - ppi_dataset_path: Path, - sentencepiece_path: Path, - c_type: int, - num_epochs: int, - batch_size: int, - seed: Optional[int] = None, - vocab_size: int = 250, - trunc_len: int = 1500, - embedding_size: int = 64, - rnn_num_layers: int = 2, - rnn_dropout_rate: float = 0.3, - variational_dropout: bool = False, - bi_reduce: str = "last", - workers: int = 4, - embedding_droprate: float = 0.3, - do_rate: float = 0.3, - log_path: Path = Path("./logs/e2e_rnn_triplet"), - encoder_only_steps: int = -1, - classifier_warm_up: int = -1, - beta_classifier: float = 4.0, - lr: Union[float, str] = 1e-2, - use_projection: bool = False, - checkpoint_path: Optional[Path] = None, - optimizer_type: str = "ranger21" + ppi_dataset_path: Path, + sentencepiece_path: Path, + c_type: int, + num_epochs: int, + batch_size: int, + seed: Optional[int] = None, + vocab_size: int = 250, + trunc_len: int = 1500, + embedding_size: int = 64, + rnn_num_layers: int = 2, + rnn_dropout_rate: float = 0.3, + variational_dropout: bool = False, + bi_reduce: str = "last", + workers: int = 4, + embedding_droprate: float = 0.3, + do_rate: float = 0.3, + log_path: Path = Path("./logs/e2e_rnn_triplet"), + encoder_only_steps: int = -1, + classifier_warm_up: int = -1, + beta_classifier: float = 4.0, + lr: Union[float, str] = 1e-2, + use_projection: bool = False, + checkpoint_path: Optional[Path] = None, + optimizer_type: str = "ranger21", ): """ Train INTREPPPID in an end-to-end fashion using an AWD-LSTM encoder and MLP classifier. diff --git a/intrepppid/data/ppi_oma.py b/intrepppid/data/ppi_oma.py index 24b7f5d..1389f94 100644 --- a/intrepppid/data/ppi_oma.py +++ b/intrepppid/data/ppi_oma.py @@ -37,7 +37,7 @@ def __init__( trunc_len=1000, sos=False, eos=False, - negative_omid=False + negative_omid=False, ): super().__init__() @@ -60,10 +60,7 @@ def __init__( if self.negative_omid: with tb.open_file(self.dataset_path) as dataset: - self.all_omids = [ - x[0] - for x in dataset.root.orthologs.iterrows() - ] + self.all_omids = [x[0] for x in dataset.root.orthologs.iterrows()] @staticmethod def static_encode( @@ -189,7 +186,7 @@ def __init__( seed: int, sos: bool, eos: bool, - negative_omid: bool = False + negative_omid: bool = False, ): super().__init__() @@ -226,7 +223,7 @@ def setup(self, stage=None): self.trunc_len, self.sos, self.eos, - self.negative_omid + self.negative_omid, ) self.dataset_val = IntrepppidDataset2( self.dataset_path, @@ -236,7 +233,7 @@ def setup(self, stage=None): self.trunc_len, self.sos, self.eos, - self.negative_omid + self.negative_omid, ) self.dataset_test = IntrepppidDataset2( self.dataset_path, @@ -246,7 +243,7 @@ def setup(self, stage=None): self.trunc_len, self.sos, self.eos, - self.negative_omid + self.negative_omid, ) def train_dataloader(self): @@ -284,7 +281,7 @@ def __init__( trunc_len: int = 1000, sos: bool = False, eos: bool = False, - negative_omid: bool = False + negative_omid: bool = False, ): """ Builds a PyTorch dataset from an HDF5 dataset in the INTREPPPID format. @@ -323,20 +320,28 @@ def __init__( with tb.open_file(self.dataset_path) as dataset: print("loading interactions...") - for row in dataset.root["interactions"][f"c{self.c_type}"][f"c{self.c_type}_{self.split}"]: - p1, p2, omid_pid, omid_id, label = row['protein_id1'].decode('utf8'), row['protein_id2'].decode('utf8'), row['omid_protein_id'].decode('utf8'), row['omid_id'], row['label'] + for row in dataset.root["interactions"][f"c{self.c_type}"][ + f"c{self.c_type}_{self.split}" + ]: + p1, p2, omid_pid, omid_id, label = ( + row["protein_id1"].decode("utf8"), + row["protein_id2"].decode("utf8"), + row["omid_protein_id"].decode("utf8"), + row["omid_id"], + row["label"], + ) self.interactions.append((p1, p2, omid_pid, omid_id, label)) print("loading sequences...") for row in dataset.root.sequences.iterrows(): - name = row['name'].decode("utf8") - sequence = row['sequence'].decode("utf8") + name = row["name"].decode("utf8") + sequence = row["sequence"].decode("utf8") self.sequences[name] = sequence print("loading orthogroups...") for row in dataset.root.orthologs.iterrows(): - ortholog_group_id = row['ortholog_group_id'] - protein_id = row['protein_id'].decode("utf8") + ortholog_group_id = row["ortholog_group_id"] + protein_id = row["protein_id"].decode("utf8") self.omid_members[ortholog_group_id].append(protein_id) @staticmethod @@ -523,7 +528,7 @@ def __init__( seed: int, sos: bool, eos: bool, - negative_omid: bool = False + negative_omid: bool = False, ): """ A `PyTorch Lightning `_ `Data Module `_ for INTREPPPID datasets. @@ -580,7 +585,7 @@ def setup(self, stage=None): self.trunc_len, self.sos, self.eos, - self.negative_omid + self.negative_omid, ) self.dataset_val = IntrepppidDataset( self.dataset_path, @@ -590,7 +595,7 @@ def setup(self, stage=None): self.trunc_len, self.sos, self.eos, - self.negative_omid + self.negative_omid, ) self.dataset_test = IntrepppidDataset( self.dataset_path, @@ -600,7 +605,7 @@ def setup(self, stage=None): self.trunc_len, self.sos, self.eos, - self.negative_omid + self.negative_omid, ) def train_dataloader(self): diff --git a/intrepppid/e2e/e2e_triplet.py b/intrepppid/e2e/e2e_triplet.py index 37a44d9..aaee85e 100644 --- a/intrepppid/e2e/e2e_triplet.py +++ b/intrepppid/e2e/e2e_triplet.py @@ -52,7 +52,7 @@ def __init__( beta_classifier: float, use_projection: bool, optimizer_type: str, - lr: float + lr: float, ): """ Create an end-to-end INTREPPPID network which uses a triplet loss for the orthologue task. @@ -122,7 +122,9 @@ def step(self, batch, stage): z_omid_positive = self.encoder(omid_positive_seq) z_omid_negative = self.encoder(omid_negative_seq) - triplet_loss = self.triplet_criterion(z_omid_anchor, z_omid_positive, z_omid_negative) + triplet_loss = self.triplet_criterion( + z_omid_anchor, z_omid_positive, z_omid_negative + ) y_hat = self(p1_seq, p2_seq).squeeze(1) @@ -230,13 +232,20 @@ def configure_optimizers(self): elif self.optimizer_type == "adamw_1cycle": optimizer = AdamW(self.parameters(), lr=self.lr) - scheduler = OneCycleLR(optimizer, self.lr, epochs=self.num_epochs, steps_per_epoch=self.steps_per_epoch) + scheduler = OneCycleLR( + optimizer, + self.lr, + epochs=self.num_epochs, + steps_per_epoch=self.steps_per_epoch, + ) return [optimizer], [scheduler] elif self.optimizer_type == "adamw_cosine": optimizer = AdamW(self.parameters(), lr=self.lr) - scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=2, eta_min=1e-6) + scheduler = CosineAnnealingWarmRestarts( + optimizer, T_0=10, T_mult=2, eta_min=1e-6 + ) return [optimizer], [scheduler] @@ -310,7 +319,7 @@ def train_e2e_rnn_triplet( "checkpoint_path": checkpoint_path, "use_projection": use_projection, "seed": seed, - "optimizer_type": optimizer_type + "optimizer_type": optimizer_type, } with open(hyperparams_path, "w") as f: @@ -327,7 +336,7 @@ def train_e2e_rnn_triplet( seed=seed, sos=False, eos=False, - negative_omid=True + negative_omid=True, ) data_module.setup("training") @@ -342,7 +351,7 @@ def train_e2e_rnn_triplet( rnn_num_layers, rnn_dropout_rate, variational_dropout, - bi_reduce + bi_reduce, ) head = MLPHead(embedding_size, do_rate) @@ -360,7 +369,7 @@ def train_e2e_rnn_triplet( beta_classifier, use_projection, optimizer_type, - lr + lr, ) num_params = sum(p.numel() for p in net.parameters() if p.requires_grad) diff --git a/intrepppid/encoders/awd_lstm.py b/intrepppid/encoders/awd_lstm.py index d7bdf0f..8966fa4 100644 --- a/intrepppid/encoders/awd_lstm.py +++ b/intrepppid/encoders/awd_lstm.py @@ -114,7 +114,7 @@ def __init__( rnn_num_layers: int, rnn_dropout_rate: float, variational_dropout: bool, - bi_reduce: str + bi_reduce: str, ): """ Represents an AWD-LSTM encoder. @@ -135,7 +135,7 @@ def __init__( rnn_num_layers, rnn_dropout_rate, variational_dropout, - bi_reduce + bi_reduce, ) self.projection = Projection( self.encoder.embedding_size, self.encoder.embedding_size * 2, 3 diff --git a/requirements.txt b/requirements.txt index e167644..5d59622 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,10 +6,12 @@ async-timeout==4.0.3 attrs==23.2.0 Babel==2.14.0 beautifulsoup4==4.12.3 +black==24.1.0 blosc2==2.5.1 cachetools==5.3.2 certifi==2023.11.17 charset-normalizer==3.3.2 +click==8.1.7 contourpy==1.2.0 cycler==0.12.1 docutils==0.20.1 @@ -34,6 +36,7 @@ matplotlib==3.6.3 mdurl==0.1.2 msgpack==1.0.7 multidict==6.0.4 +mypy-extensions==1.0.0 ndindex==1.7 numexpr==2.8.8 numpy==1.26.3 @@ -45,7 +48,9 @@ oauthlib==3.2.2 packaging==23.2 pandas==2.2.0 passlib==1.7.4 +pathspec==0.12.1 pillow==10.2.0 +platformdirs==4.1.0 plyvel==1.5.1 protobuf==4.23.4 py-cpuinfo==9.0.0 @@ -82,6 +87,7 @@ tensorboard==2.15.1 tensorboard-data-server==0.7.2 termcolor==2.4.0 threadpoolctl==3.2.0 +tomli==2.0.1 torch==1.13.1 torchmetrics==0.11.1 tqdm==4.64.1