Skip to content

Commit

Permalink
Update configs to support the new DockGen dataset as well as pocket-based experiments
Browse files Browse the repository at this point in the history
  • Loading branch information
amorehead committed Jun 3, 2024
1 parent 8121755 commit 07b3acd
Show file tree
Hide file tree
Showing 19 changed files with 55 additions and 24 deletions.
3 changes: 2 additions & 1 deletion configs/analysis/complex_alignment.yaml
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
method: neuralplexer # the method for which to align predictions - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`)
vina_binding_site_method: diffdock # the method to use for Vina binding site prediction - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`)
dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `casp15`)
dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
ensemble_ranking_method: consensus # the method with which to rank-order and select the top ensemble prediction for each target - NOTE: must be one of (`consensus`, `ff`)
input_data_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set # the input protein-ligand complex directory to recursively parse
output_dir: ${resolve_method_output_dir:${method},${dataset},${vina_binding_site_method},${ensemble_ranking_method},${repeat_index}} # the method's resolved output directory - presumably where the predictions to align are located (TODO confirm)
rank_to_align: 1 # the pose rank to align
aligned_filename_postfix: "_aligned" # the postfix to append to each aligned complex filename
force_process: false # whether to force processing of all complexes, even if they have already been processed
repeat_index: 1 # the repeat index which was used for inference
pocket_only_baseline: false # whether to prepare the pocket-only baseline
4 changes: 3 additions & 1 deletion configs/analysis/inference_analysis.yaml
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
full_report: true # whether to generate a full PoseBusters report (i.e. with all metrics) or a summary report (i.e. with only the most important metrics)
method: diffdock # the method for which to score predictions - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`, `vina`, `ensemble`)
vina_binding_site_method: diffdock # the method to use for Vina binding site prediction - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`)
dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `casp15`)
dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
ensemble_ranking_method: consensus # the method with which to rank-order and select the top ensemble prediction for each target - NOTE: must be one of (`consensus`, `ff`)
input_csv_path: ${resolve_method_input_csv_path:${method},${dataset}} # the input CSV filepath with which to run inference
input_data_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set # the input protein-ligand complex directory to recursively parse
posebusters_ccd_ids_filepath: ${oc.env:PROJECT_ROOT}/data/posebusters_pdb_ccd_ids.txt # the path to the PoseBusters PDB CCD IDs file that lists the targets that do not contain any crystal contacts
dockgen_test_ids_filepath: ${oc.env:PROJECT_ROOT}/data/dockgen_set/split_test_rmsd_filtered.txt # the path to the (ESMFold RMSD-filtered) DockGen test set IDs file
output_dir: ${resolve_method_output_dir:${method},${dataset},${vina_binding_site_method},${ensemble_ranking_method},${repeat_index}} # the method's resolved output directory - presumably where the predictions to analyze are located (TODO confirm)
repeat_index: 1 # the repeat index which was used for inference
pocket_only_baseline: false # whether to analyze the pocket-only baseline
5 changes: 5 additions & 0 deletions configs/data/binding_site_crop_preparation.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`)
input_data_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set # the input protein-ligand complex directory to recursively parse
input_protein_structure_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set/${dataset}_holo_aligned_esmfold_structures # the input protein structure directory to parse
protein_ligand_distance_threshold: 4.0 # the heavy-atom distance threshold (in Angstrom) to use for finding protein binding site residues in interaction with ligand heavy atoms
num_buffer_residues: 7 # the number of sequence-regional buffer residues to include around the native binding site residues
4 changes: 3 additions & 1 deletion configs/data/diffdock_input_preparation.yaml
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `casp15`)
dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
input_data_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set # the input protein-ligand complex directory to recursively parse
input_protein_structure_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set/${dataset}_holo_aligned_esmfold_structures # the input protein structure directory to parse
output_csv_path: ${oc.env:PROJECT_ROOT}/forks/DiffDock/inference/diffdock_${dataset}_inputs.csv # the output CSV filepath to which to write the parsed input data
posebusters_ccd_ids_filepath: ${oc.env:PROJECT_ROOT}/data/posebusters_pdb_ccd_ids.txt # the path to the PoseBusters PDB CCD IDs file that lists the targets that do not contain any crystal contacts
dockgen_test_ids_filepath: ${oc.env:PROJECT_ROOT}/data/dockgen_set/split_test.txt # the path to the DockGen test set IDs file
protein_filepath: null # the path to the protein structure file to use
ligand_smiles: null # the ligand SMILES string for which to predict the binding pose
input_id: null # the input ID to use for inference
pocket_only_baseline: false # whether to prepare the pocket-only baseline
3 changes: 2 additions & 1 deletion configs/data/dynamicbind_input_preparation.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
dataset: casp15 # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `casp15`)
dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
input_data_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set # the input protein-ligand complex directory to recursively parse
input_protein_data_dir: null # the input protein structure directory to recursively parse during inference
output_csv_dir: ${oc.env:PROJECT_ROOT}/forks/DynamicBind/inference/dynamicbind_${dataset}_inputs # the output CSV directory to which to write the parsed ligand SMILES strings
posebusters_ccd_ids_filepath: ${oc.env:PROJECT_ROOT}/data/posebusters_pdb_ccd_ids.txt # the path to the PoseBusters PDB CCD IDs file that lists the targets that do not contain any crystal contacts
dockgen_test_ids_filepath: ${oc.env:PROJECT_ROOT}/data/dockgen_set/split_test.txt # the path to the DockGen test set IDs file
protein_filepath: null # the path to the protein structure file to use
ligand_smiles: null # the ligand SMILES string for which to predict the binding pose
3 changes: 2 additions & 1 deletion configs/data/fabind_input_preparation.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
dataset: casp15 # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `casp15`)
dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
input_data_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set # the input protein-ligand complex directory to recursively parse
output_csv_path: ${oc.env:PROJECT_ROOT}/forks/FABind/inference/fabind_${dataset}_inputs.csv # the output CSV filepath to which to write the parsed input data
posebusters_ccd_ids_filepath: ${oc.env:PROJECT_ROOT}/data/posebusters_pdb_ccd_ids.txt # the path to the PoseBusters PDB CCD IDs file that lists the targets that do not contain any crystal contacts
dockgen_test_ids_filepath: ${oc.env:PROJECT_ROOT}/data/dockgen_set/split_test.txt # the path to the DockGen test set IDs file
4 changes: 3 additions & 1 deletion configs/data/neuralplexer_input_preparation.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
dataset: casp15 # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `casp15`)
dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
input_data_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set # the input protein-ligand complex directory to recursively parse
input_receptor_structure_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set/${dataset}_holo_aligned_esmfold_structures # if not `null`, the input template protein structure directory to parse
output_csv_path: ${oc.env:PROJECT_ROOT}/forks/NeuralPLexer/inference/neuralplexer_${dataset}_inputs.csv # the output CSV filepath to which to write the parsed input data
posebusters_ccd_ids_filepath: ${oc.env:PROJECT_ROOT}/data/posebusters_pdb_ccd_ids.txt # the path to the PoseBusters PDB CCD IDs file that lists the targets that do not contain any crystal contacts
dockgen_test_ids_filepath: ${oc.env:PROJECT_ROOT}/data/dockgen_set/split_test.txt # the path to the DockGen test set IDs file
input_receptor: null # the input protein sequence
input_ligand: null # the input ligand SMILES
input_template: null # the input template protein structure to optionally use
input_id: null # the input ID to use for inference
pocket_only_baseline: false # whether to prepare the pocket-only baseline
4 changes: 3 additions & 1 deletion configs/data/rfaa_input_preparation.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
dataset: casp15 # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `casp15`)
dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
input_data_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set # the input protein-ligand complex directory to recursively parse
output_scripts_path: ${oc.env:PROJECT_ROOT}/forks/RoseTTAFold-All-Atom/prediction_inputs/${dataset} # the output directory in which to save the input files
posebusters_ccd_ids_filepath: ${oc.env:PROJECT_ROOT}/data/posebusters_pdb_ccd_ids.txt # the path to the PoseBusters PDB CCD IDs file that lists the targets that do not contain any crystal contacts
dockgen_test_ids_filepath: ${oc.env:PROJECT_ROOT}/data/dockgen_set/split_test.txt # the path to the DockGen test set IDs file
protein_filepath: null # the path to the protein structure file to use
ligand_smiles: null # the ligand SMILES string for which to predict the binding pose
input_id: null # the input ID to use for inference
pocket_only_baseline: false # whether to prepare the pocket-only baseline
2 changes: 1 addition & 1 deletion configs/data/rfaa_output_extraction.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `casp15`)
dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
prediction_inputs_dir: ${oc.env:PROJECT_ROOT}/forks/RoseTTAFold-All-Atom/prediction_inputs/${dataset}
prediction_outputs_dir: ${oc.env:PROJECT_ROOT}/forks/RoseTTAFold-All-Atom/prediction_outputs/${dataset}_${repeat_index}
inference_outputs_dir: ${oc.env:PROJECT_ROOT}/forks/RoseTTAFold-All-Atom/inference/rfaa_${dataset}_outputs_${repeat_index}
Expand Down
2 changes: 1 addition & 1 deletion configs/model/diffdock_inference.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
cuda_device_index: 0 # the CUDA device to use for inference, or `null` to use CPU
python_exec_path: ${oc.env:PROJECT_ROOT}/forks/DiffDock/DiffDock/bin/python3 # the Python executable to use
diffdock_exec_dir: ${oc.env:PROJECT_ROOT}/forks/DiffDock # the DiffDock directory in which to execute the inference scripts
dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `casp15`)
dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
input_csv_path: ${oc.env:PROJECT_ROOT}/forks/DiffDock/inference/diffdock_${dataset}_inputs.csv # the input CSV filepath with which to run inference
inference_config_path: ${oc.env:PROJECT_ROOT}/forks/DiffDock/default_inference_args.yaml # the inference configuration file to use
output_dir: ${oc.env:PROJECT_ROOT}/forks/DiffDock/inference/diffdock_${dataset}_output_${repeat_index} # the output directory to which to save the inference results
Expand Down
3 changes: 2 additions & 1 deletion configs/model/dynamicbind_inference.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
cuda_device_index: 0 # the CUDA device to use for inference, or `null` to use CPU
python_exec_path: ${oc.env:PROJECT_ROOT}/forks/DynamicBind/DynamicBind/bin/python3 # the Python executable to use
dynamicbind_exec_dir: ${oc.env:PROJECT_ROOT}/forks/DynamicBind # the DynamicBind directory in which to execute the inference scripts
dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `casp15`)
dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
input_data_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set/${dataset}_holo_aligned_esmfold_structures # the input protein-ligand complex directory to recursively parse for protein inputs
input_ligand_csv_dir: ${oc.env:PROJECT_ROOT}/forks/DynamicBind/inference/dynamicbind_${dataset}_inputs # the input CSV directory with which to run inference
samples_per_complex: 40 # the number of samples to generate per complex
Expand All @@ -12,3 +12,4 @@ header: ${dataset} # name of the results directory to create
num_workers: 1 # the number of workers to use for native relaxation during inference
skip_existing: true # whether to skip existing predictions
repeat_index: 1 # the repeat index to use for inference
pocket_only_baseline: false # whether to run the pocket-only baseline
3 changes: 2 additions & 1 deletion configs/model/ensemble_generation.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,9 @@ casp_author: "001" # group number to report in CASP format
casp_method: "Ligand_Predictor" # the method name to report in CASP format
combine_casp_output_files: false # whether to combine the CASP protein and ligand output files into a single file
generate_hpc_scripts: false # whether to generate HPC scripts for running the ensemble generation; if `false`, then local scripts will be generated instead
pocket_only_baseline: false # whether to run ensemble generation with only pocket-based baseline methods
ensemble_benchmarking: false # whether to run ensemble benchmarking
ensemble_benchmarking_dataset: posebusters_benchmark # the dataset to use for ensemble benchmarking - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `casp15`)
ensemble_benchmarking_dataset: posebusters_benchmark # the dataset to use for ensemble benchmarking - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
ensemble_benchmarking_repeat_index: 1 # the repeat index to use for ensemble benchmarking
ensemble_ranking_method: consensus # the method with which to rank-order and select the top ensemble prediction for each target - NOTE: must be one of (`consensus`, `ff`)
ensemble_benchmarking_apo_protein_dir: ${oc.env:PROJECT_ROOT}/data/${ensemble_benchmarking_dataset}_set/${ensemble_benchmarking_dataset}_holo_aligned_esmfold_structures # the directory containing the apo proteins to use for ensemble benchmarking
Expand Down
3 changes: 2 additions & 1 deletion configs/model/fabind_inference.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
cuda_device_index: 0 # the CUDA device to use for inference, or `null` to use CPU
python_exec_path: ${oc.env:PROJECT_ROOT}/forks/FABind/FABind/bin/python3 # the Python executable to use
fabind_exec_dir: ${oc.env:PROJECT_ROOT}/forks/FABind/fabind # the FABind directory in which to execute the inference scripts
dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `casp15`)
dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
input_csv_path: ${oc.env:PROJECT_ROOT}/forks/FABind/inference/fabind_${dataset}_inputs.csv # the input CSV filepath with which to run inference
input_data_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set/${dataset}_holo_aligned_esmfold_structures # the input protein-ligand complex directory to recursively parse
num_threads: 1 # the number of threads to use for inference
Expand All @@ -10,3 +10,4 @@ save_mols_dir: ${oc.env:PROJECT_ROOT}/forks/FABind/inference/fabind_${dataset}_t
ckpt_path: ${oc.env:PROJECT_ROOT}/forks/FABind/ckpt/best_model.bin # the checkpoint path to use for inference
output_dir: ${oc.env:PROJECT_ROOT}/forks/FABind/inference/fabind_${dataset}_output_${repeat_index} # the output directory to which to save the inference results
repeat_index: 1 # the repeat index to use for inference
pocket_only_baseline: false # whether to run the pocket-only baseline
5 changes: 3 additions & 2 deletions configs/model/inference_relaxation.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
method: diffdock # the method for which to relax predictions - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`, `vina`, `tulip`)
vina_binding_site_method: diffdock # the method to use for Vina binding site prediction - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`)
dataset: posebusters_benchmark # the dataset for which to relax predictions - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `casp15`)
dataset: posebusters_benchmark # the dataset for which to relax predictions - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
ensemble_ranking_method: consensus # the method with which to rank-order and select the top ensemble prediction for each target - NOTE: must be one of (`consensus`, `ff`)
num_processes: 1 # the number of parallel processes to use for relaxation
temp_dir: ${method}_${dataset}_cache_dir # temporary directory
Expand All @@ -10,7 +10,7 @@ prep_only: false # only prepare the input files
platform: "fastest" # platform on which to run relaxation
cuda_device_index: 0 # CUDA device index
log_level: "INFO" # logging level
protein_dir: ${resolve_method_protein_dir:${method},${dataset},${repeat_index}} # the directory from which to load (potentially inferred) proteins
protein_dir: ${resolve_method_protein_dir:${method},${dataset},${repeat_index},${pocket_only_baseline}} # the directory from which to load (potentially inferred) proteins
ligand_dir: ${resolve_method_ligand_dir:${method},${dataset},${vina_binding_site_method},${repeat_index}} # the directory from which to load inferred ligands
output_dir: ${resolve_method_output_dir:${method},${dataset},${vina_binding_site_method},${ensemble_ranking_method},${repeat_index}} # the output directory to which to save the relaxed predictions
relax_protein: false # whether to relax the protein - NOTE: currently periodically yields unpredictable protein-ligand separation
Expand All @@ -24,3 +24,4 @@ max_final_e_value: 1000.0 # when relaxing the protein, maximum final energy valu
max_num_attempts: 5 # when relaxing the protein, maximum number of relaxation attempts to perform
skip_existing: true # whether to skip existing relaxed predictions
repeat_index: 1 # the repeat index which was used for inference
pocket_only_baseline: false # whether to relax predictions for the pocket-only baseline (also used to resolve `protein_dir`)
2 changes: 1 addition & 1 deletion configs/model/neuralplexer_inference.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
python_exec_path: ${oc.env:PROJECT_ROOT}/forks/NeuralPLexer/NeuralPLexer/bin/python3 # the Python executable to use
neuralplexer_exec_dir: ${oc.env:PROJECT_ROOT}/forks/NeuralPLexer # the NeuralPLexer directory in which to execute the inference scripts
dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `casp15`)
dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
input_csv_path: ${oc.env:PROJECT_ROOT}/forks/NeuralPLexer/inference/neuralplexer_${dataset}_inputs.csv # the input CSV filepath to which parsed input data has been written
skip_existing: true # whether to skip existing predictions
task: batched_structure_sampling # the task to run - NOTE: must be one of (`single_sample_trajectory`, `batched_structure_sampling`, `structure_prediction_benchmarking`, `pdbbind_benchmarking`, `binding_site_recovery_benchmarking`)
Expand Down
2 changes: 1 addition & 1 deletion configs/model/rfaa_inference.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
python_exec_path: ${oc.env:PROJECT_ROOT}/forks/RoseTTAFold-All-Atom/RFAA/bin/python3 # the Python executable to use
rfaa_exec_dir: ${oc.env:PROJECT_ROOT}/forks/RoseTTAFold-All-Atom # the RoseTTAFold-All-Atom directory in which to execute the inference scripts
dataset: casp15 # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `casp15`)
dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
input_dir: ${oc.env:PROJECT_ROOT}/forks/RoseTTAFold-All-Atom/prediction_inputs/${dataset} # the input directory with which to run inference
config_dir: ${oc.env:PROJECT_ROOT}/forks/RoseTTAFold-All-Atom/rf2aa/config/inference # the config directory with which to run inference
output_dir: ${oc.env:PROJECT_ROOT}/forks/RoseTTAFold-All-Atom/prediction_outputs/${dataset}_${repeat_index} # the output directory to which to save the inference results
Expand Down
Loading

0 comments on commit 07b3acd

Please sign in to comment.