Added Black dependency, formatted code
jszym committed Jan 27, 2024
1 parent b42fd4a commit b15c583
Showing 8 changed files with 91 additions and 72 deletions.
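The diff below is a pure formatting pass plus the dependency pins it needs. The commit does not record the exact invocation, but a pass like this is typically produced by installing the pinned Black release and running it over the package; a minimal sketch, assuming the package directory is the target:

    pip install black==24.1.0
    black intrepppid/            # rewrite files in place
    black --check intrepppid/    # exit non-zero instead of rewriting; useful in CI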
32 changes: 16 additions & 16 deletions intrepppid/__init__.py
@@ -21,20 +21,20 @@


def intrepppid_network(
-        steps_per_epoch: int,
-        vocab_size: int = 250,
-        embedding_size: int = 64,
-        rnn_num_layers: int = 2,
-        rnn_dropout_rate: float = 0.3,
-        variational_dropout: bool = False,
-        bi_reduce: str = "last",
-        embedding_droprate: float = 0.3,
-        num_epochs: int = 100,
-        do_rate: float = 0.3,
-        beta_classifier: int = 2,
-        lr: float = 1e-2,
-        use_projection: bool = False,
-        optimizer_type: str = "ranger21_xx"
+    steps_per_epoch: int,
+    vocab_size: int = 250,
+    embedding_size: int = 64,
+    rnn_num_layers: int = 2,
+    rnn_dropout_rate: float = 0.3,
+    variational_dropout: bool = False,
+    bi_reduce: str = "last",
+    embedding_droprate: float = 0.3,
+    num_epochs: int = 100,
+    do_rate: float = 0.3,
+    beta_classifier: int = 2,
+    lr: float = 1e-2,
+    use_projection: bool = False,
+    optimizer_type: str = "ranger21_xx",
):
    """
    This builds a PyTorch nn.Module which represents the INTREPPPID network as
@@ -67,7 +67,7 @@ def intrepppid_network(
        rnn_num_layers,
        rnn_dropout_rate,
        variational_dropout,
-        bi_reduce
+        bi_reduce,
    )

    head = MLPHead(embedding_size, do_rate)
@@ -82,7 +82,7 @@ def intrepppid_network(
        beta_classifier,
        use_projection,
        optimizer_type,
-        lr
+        lr,
    )

    return net
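All three hunks above are the same mechanical Black rule: when a call or signature does not fit within the 88-column limit, Black splits it to one argument per line and appends a trailing comma after the last argument; on later runs, that "magic trailing comma" tells Black to keep the exploded layout rather than re-collapse it. A minimal before/after sketch (the function name is illustrative, not from this repository):

    # Before: over 88 columns, no trailing comma
    net = build_network(steps_per_epoch, vocab_size, embedding_size, rnn_num_layers, rnn_dropout_rate)

    # After Black: one argument per line, trailing comma pins the layout
    net = build_network(
        steps_per_epoch,
        vocab_size,
        embedding_size,
        rnn_num_layers,
        rnn_dropout_rate,
    )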
1 change: 1 addition & 0 deletions intrepppid/__main__.py
@@ -23,6 +23,7 @@ class Pipeline(object):
    """
    The INTREPPPID CLI
    """
+
    def __init__(self):
        self.train = Train
2 changes: 0 additions & 2 deletions intrepppid/classifier/__init__.py
@@ -13,5 +13,3 @@
# You should have received a copy of the GNU Affero General Public
# License along with this programme. If not, see
# <https://www.gnu.org/licenses/agpl-3.0.en.html>.
-
-
48 changes: 24 additions & 24 deletions intrepppid/cli/train.py
@@ -26,30 +26,30 @@ class Train(object):

    @staticmethod
    def e2e_rnn_triplet(
-            ppi_dataset_path: Path,
-            sentencepiece_path: Path,
-            c_type: int,
-            num_epochs: int,
-            batch_size: int,
-            seed: Optional[int] = None,
-            vocab_size: int = 250,
-            trunc_len: int = 1500,
-            embedding_size: int = 64,
-            rnn_num_layers: int = 2,
-            rnn_dropout_rate: float = 0.3,
-            variational_dropout: bool = False,
-            bi_reduce: str = "last",
-            workers: int = 4,
-            embedding_droprate: float = 0.3,
-            do_rate: float = 0.3,
-            log_path: Path = Path("./logs/e2e_rnn_triplet"),
-            encoder_only_steps: int = -1,
-            classifier_warm_up: int = -1,
-            beta_classifier: float = 4.0,
-            lr: Union[float, str] = 1e-2,
-            use_projection: bool = False,
-            checkpoint_path: Optional[Path] = None,
-            optimizer_type: str = "ranger21"
+        ppi_dataset_path: Path,
+        sentencepiece_path: Path,
+        c_type: int,
+        num_epochs: int,
+        batch_size: int,
+        seed: Optional[int] = None,
+        vocab_size: int = 250,
+        trunc_len: int = 1500,
+        embedding_size: int = 64,
+        rnn_num_layers: int = 2,
+        rnn_dropout_rate: float = 0.3,
+        variational_dropout: bool = False,
+        bi_reduce: str = "last",
+        workers: int = 4,
+        embedding_droprate: float = 0.3,
+        do_rate: float = 0.3,
+        log_path: Path = Path("./logs/e2e_rnn_triplet"),
+        encoder_only_steps: int = -1,
+        classifier_warm_up: int = -1,
+        beta_classifier: float = 4.0,
+        lr: Union[float, str] = 1e-2,
+        use_projection: bool = False,
+        checkpoint_path: Optional[Path] = None,
+        optimizer_type: str = "ranger21",
    ):
        """
        Train INTREPPPID in an end-to-end fashion using an AWD-LSTM encoder and MLP classifier.
45 changes: 25 additions & 20 deletions intrepppid/data/ppi_oma.py
@@ -37,7 +37,7 @@ def __init__(
        trunc_len=1000,
        sos=False,
        eos=False,
-        negative_omid=False
+        negative_omid=False,
    ):
        super().__init__()
@@ -60,10 +60,7 @@ def __init__(

        if self.negative_omid:
            with tb.open_file(self.dataset_path) as dataset:
-                self.all_omids = [
-                    x[0]
-                    for x in dataset.root.orthologs.iterrows()
-                ]
+                self.all_omids = [x[0] for x in dataset.root.orthologs.iterrows()]

    @staticmethod
    def static_encode(
@@ -189,7 +186,7 @@ def __init__(
        seed: int,
        sos: bool,
        eos: bool,
-        negative_omid: bool = False
+        negative_omid: bool = False,
    ):
        super().__init__()
@@ -226,7 +223,7 @@ def setup(self, stage=None):
            self.trunc_len,
            self.sos,
            self.eos,
-            self.negative_omid
+            self.negative_omid,
        )
        self.dataset_val = IntrepppidDataset2(
            self.dataset_path,
@@ -236,7 +233,7 @@
            self.trunc_len,
            self.sos,
            self.eos,
-            self.negative_omid
+            self.negative_omid,
        )
        self.dataset_test = IntrepppidDataset2(
            self.dataset_path,
@@ -246,7 +243,7 @@
            self.trunc_len,
            self.sos,
            self.eos,
-            self.negative_omid
+            self.negative_omid,
        )

    def train_dataloader(self):
@@ -284,7 +281,7 @@ def __init__(
        trunc_len: int = 1000,
        sos: bool = False,
        eos: bool = False,
-        negative_omid: bool = False
+        negative_omid: bool = False,
    ):
        """
        Builds a PyTorch dataset from an HDF5 dataset in the INTREPPPID format.
@@ -323,20 +320,28 @@ def __init__(

        with tb.open_file(self.dataset_path) as dataset:
            print("loading interactions...")
-            for row in dataset.root["interactions"][f"c{self.c_type}"][f"c{self.c_type}_{self.split}"]:
-                p1, p2, omid_pid, omid_id, label = row['protein_id1'].decode('utf8'), row['protein_id2'].decode('utf8'), row['omid_protein_id'].decode('utf8'), row['omid_id'], row['label']
+            for row in dataset.root["interactions"][f"c{self.c_type}"][
+                f"c{self.c_type}_{self.split}"
+            ]:
+                p1, p2, omid_pid, omid_id, label = (
+                    row["protein_id1"].decode("utf8"),
+                    row["protein_id2"].decode("utf8"),
+                    row["omid_protein_id"].decode("utf8"),
+                    row["omid_id"],
+                    row["label"],
+                )
                self.interactions.append((p1, p2, omid_pid, omid_id, label))

            print("loading sequences...")
            for row in dataset.root.sequences.iterrows():
-                name = row['name'].decode("utf8")
-                sequence = row['sequence'].decode("utf8")
+                name = row["name"].decode("utf8")
+                sequence = row["sequence"].decode("utf8")
                self.sequences[name] = sequence

            print("loading orthogroups...")
            for row in dataset.root.orthologs.iterrows():
-                ortholog_group_id = row['ortholog_group_id']
-                protein_id = row['protein_id'].decode("utf8")
+                ortholog_group_id = row["ortholog_group_id"]
+                protein_id = row["protein_id"].decode("utf8")
                self.omid_members[ortholog_group_id].append(protein_id)

    @staticmethod
@@ -523,7 +528,7 @@ def __init__(
        seed: int,
        sos: bool,
        eos: bool,
-        negative_omid: bool = False
+        negative_omid: bool = False,
    ):
        """
        A `PyTorch Lightning <https://lightning.ai/docs/pytorch/stable/>`_ `Data Module <https://lightning.ai/docs/pytorch/1.9.3/api/pytorch_lightning.core.LightningDataModule.html>`_ for INTREPPPID datasets.
@@ -580,7 +585,7 @@ def setup(self, stage=None):
            self.trunc_len,
            self.sos,
            self.eos,
-            self.negative_omid
+            self.negative_omid,
        )
        self.dataset_val = IntrepppidDataset(
            self.dataset_path,
@@ -590,7 +595,7 @@ def setup(self, stage=None):
            self.trunc_len,
            self.sos,
            self.eos,
-            self.negative_omid
+            self.negative_omid,
        )
        self.dataset_test = IntrepppidDataset(
            self.dataset_path,
@@ -600,7 +605,7 @@ def setup(self, stage=None):
            self.trunc_len,
            self.sos,
            self.eos,
-            self.negative_omid
+            self.negative_omid,
        )

    def train_dataloader(self):
25 changes: 17 additions & 8 deletions intrepppid/e2e/e2e_triplet.py
@@ -52,7 +52,7 @@ def __init__(
        beta_classifier: float,
        use_projection: bool,
        optimizer_type: str,
-        lr: float
+        lr: float,
    ):
        """
        Create an end-to-end INTREPPPID network which uses a triplet loss for the orthologue task.
@@ -122,7 +122,9 @@ def step(self, batch, stage):
        z_omid_positive = self.encoder(omid_positive_seq)
        z_omid_negative = self.encoder(omid_negative_seq)

-        triplet_loss = self.triplet_criterion(z_omid_anchor, z_omid_positive, z_omid_negative)
+        triplet_loss = self.triplet_criterion(
+            z_omid_anchor, z_omid_positive, z_omid_negative
+        )

        y_hat = self(p1_seq, p2_seq).squeeze(1)
@@ -230,13 +232,20 @@ def configure_optimizers(self):

        elif self.optimizer_type == "adamw_1cycle":
            optimizer = AdamW(self.parameters(), lr=self.lr)
-            scheduler = OneCycleLR(optimizer, self.lr, epochs=self.num_epochs, steps_per_epoch=self.steps_per_epoch)
+            scheduler = OneCycleLR(
+                optimizer,
+                self.lr,
+                epochs=self.num_epochs,
+                steps_per_epoch=self.steps_per_epoch,
+            )

            return [optimizer], [scheduler]

        elif self.optimizer_type == "adamw_cosine":
            optimizer = AdamW(self.parameters(), lr=self.lr)
-            scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=2, eta_min=1e-6)
+            scheduler = CosineAnnealingWarmRestarts(
+                optimizer, T_0=10, T_mult=2, eta_min=1e-6
+            )

            return [optimizer], [scheduler]
@@ -310,7 +319,7 @@ def train_e2e_rnn_triplet(
        "checkpoint_path": checkpoint_path,
        "use_projection": use_projection,
        "seed": seed,
-        "optimizer_type": optimizer_type
+        "optimizer_type": optimizer_type,
    }

    with open(hyperparams_path, "w") as f:
@@ -327,7 +336,7 @@
        seed=seed,
        sos=False,
        eos=False,
-        negative_omid=True
+        negative_omid=True,
    )

    data_module.setup("training")
@@ -342,7 +351,7 @@
        rnn_num_layers,
        rnn_dropout_rate,
        variational_dropout,
-        bi_reduce
+        bi_reduce,
    )

    head = MLPHead(embedding_size, do_rate)
@@ -360,7 +369,7 @@
        beta_classifier,
        use_projection,
        optimizer_type,
-        lr
+        lr,
    )

    num_params = sum(p.numel() for p in net.parameters() if p.requires_grad)
4 changes: 2 additions & 2 deletions intrepppid/encoders/awd_lstm.py
@@ -114,7 +114,7 @@ def __init__(
        rnn_num_layers: int,
        rnn_dropout_rate: float,
        variational_dropout: bool,
-        bi_reduce: str
+        bi_reduce: str,
    ):
        """
        Represents an AWD-LSTM encoder.
@@ -135,7 +135,7 @@ def __init__(
            rnn_num_layers,
            rnn_dropout_rate,
            variational_dropout,
-            bi_reduce
+            bi_reduce,
        )
        self.projection = Projection(
            self.encoder.embedding_size, self.encoder.embedding_size * 2, 3
6 changes: 6 additions & 0 deletions requirements.txt
@@ -6,10 +6,12 @@ async-timeout==4.0.3
attrs==23.2.0
Babel==2.14.0
beautifulsoup4==4.12.3
+black==24.1.0
blosc2==2.5.1
cachetools==5.3.2
certifi==2023.11.17
charset-normalizer==3.3.2
+click==8.1.7
contourpy==1.2.0
cycler==0.12.1
docutils==0.20.1
@@ -34,6 +36,7 @@ matplotlib==3.6.3
mdurl==0.1.2
msgpack==1.0.7
multidict==6.0.4
+mypy-extensions==1.0.0
ndindex==1.7
numexpr==2.8.8
numpy==1.26.3
@@ -45,7 +48,9 @@ oauthlib==3.2.2
packaging==23.2
pandas==2.2.0
passlib==1.7.4
+pathspec==0.12.1
pillow==10.2.0
+platformdirs==4.1.0
plyvel==1.5.1
protobuf==4.23.4
py-cpuinfo==9.0.0
@@ -82,6 +87,7 @@ tensorboard==2.15.1
tensorboard-data-server==0.7.2
termcolor==2.4.0
threadpoolctl==3.2.0
+tomli==2.0.1
torch==1.13.1
torchmetrics==0.11.1
tqdm==4.64.1
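Aside from black itself, the new pins are Black's own runtime dependencies rather than anything INTREPPPID imports directly: click (Black's CLI layer), pathspec (gitignore-style file discovery), platformdirs (cache directory lookup), mypy-extensions, and tomli (used to read pyproject.toml on Python < 3.11). With these pinned, a CI step can keep the tree formatted by running the check mode shown above (black --check intrepppid/).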
