Update configs to support the new DockGen dataset as well as pocket-based experiments
BioinfoMachineLearning · Jun 3, 2024 · 07b3acd · 07b3acd
1 parent 8121755
commit 07b3acd
Show file tree

Hide file tree

Showing 19 changed files with 55 additions and 24 deletions.
diff --git a/configs/analysis/complex_alignment.yaml b/configs/analysis/complex_alignment.yaml
@@ -1,10 +1,11 @@
 method: neuralplexer # the method for which to align predictions - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`)
 vina_binding_site_method: diffdock # the method to use for Vina binding site prediction - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`)
-dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `casp15`)
+dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
 ensemble_ranking_method: consensus # the method with which to rank-order and select the top ensemble prediction for each target - NOTE: must be one of (`consensus`, `ff`)
 input_data_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set # the input protein-ligand complex directory to recursively parse
 output_dir: ${resolve_method_output_dir:${method},${dataset},${vina_binding_site_method},${ensemble_ranking_method},${repeat_index}} # the output directory to which to save the relaxed predictions
 rank_to_align: 1 # the pose rank to align
 aligned_filename_postfix: "_aligned" # the postfix to append to each aligned complex filename
 force_process: false # whether to force processing of all complexes, even if they have already been processed
 repeat_index: 1 # the repeat index which was used for inference
+pocket_only_baseline: false # whether to prepare the pocket-only baseline
diff --git a/configs/analysis/inference_analysis.yaml b/configs/analysis/inference_analysis.yaml
@@ -1,10 +1,12 @@
 full_report: true # whether to generate a full PoseBusters report (i.e. with all metrics) or a summary report (i.e. with only the most important metrics)
 method: diffdock # the method for which to score predictions - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`, `vina`, `ensemble`)
 vina_binding_site_method: diffdock # the method to use for Vina binding site prediction - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`)
-dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `casp15`)
+dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
 ensemble_ranking_method: consensus # the method with which to rank-order and select the top ensemble prediction for each target - NOTE: must be one of (`consensus`, `ff`)
 input_csv_path: ${resolve_method_input_csv_path:${method},${dataset}} # the input CSV filepath with which to run inference
 input_data_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set # the input protein-ligand complex directory to recursively parse
 posebusters_ccd_ids_filepath: ${oc.env:PROJECT_ROOT}/data/posebusters_pdb_ccd_ids.txt # the path to the PoseBusters PDB CCD IDs file that lists the targets that do not contain any crystal contacts
+dockgen_test_ids_filepath: ${oc.env:PROJECT_ROOT}/data/dockgen_set/split_test_rmsd_filtered.txt # the path to the (ESMFold RMSD-filtered) DockGen test set IDs file
 output_dir: ${resolve_method_output_dir:${method},${dataset},${vina_binding_site_method},${ensemble_ranking_method},${repeat_index}} # the output directory to which to save the relaxed predictions
 repeat_index: 1 # the repeat index which was used for inference
+pocket_only_baseline: false # whether to analyze the pocket-only baseline
diff --git a/configs/data/binding_site_crop_preparation.yaml b/configs/data/binding_site_crop_preparation.yaml
@@ -0,0 +1,5 @@
+dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`)
+input_data_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set # the input protein-ligand complex directory to recursively parse
+input_protein_structure_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set/${dataset}_holo_aligned_esmfold_structures # the input protein structure directory to parse
+protein_ligand_distance_threshold: 4.0 # the heavy-atom distance threshold (in Angstrom) to use for finding protein binding site residues in interaction with ligand heavy atoms
+num_buffer_residues: 7 # the number of sequence-regional buffer residues to include around the native binding site residues
diff --git a/configs/data/diffdock_input_preparation.yaml b/configs/data/diffdock_input_preparation.yaml
@@ -1,8 +1,10 @@
-dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `casp15`)
+dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
 input_data_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set # the input protein-ligand complex directory to recursively parse
 input_protein_structure_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set/${dataset}_holo_aligned_esmfold_structures # the input protein structure directory to parse
 output_csv_path: ${oc.env:PROJECT_ROOT}/forks/DiffDock/inference/diffdock_${dataset}_inputs.csv # the output CSV filepath to which to write the parsed input data
 posebusters_ccd_ids_filepath: ${oc.env:PROJECT_ROOT}/data/posebusters_pdb_ccd_ids.txt # the path to the PoseBusters PDB CCD IDs file that lists the targets that do not contain any crystal contacts
+dockgen_test_ids_filepath: ${oc.env:PROJECT_ROOT}/data/dockgen_set/split_test.txt # the path to the DockGen test set IDs file
 protein_filepath: null # the path to the protein structure file to use
 ligand_smiles: null # the ligand SMILES string for which to predict the binding pose
 input_id: null # the input ID to use for inference
+pocket_only_baseline: false # whether to prepare the pocket-only baseline
diff --git a/configs/data/dynamicbind_input_preparation.yaml b/configs/data/dynamicbind_input_preparation.yaml
@@ -1,7 +1,8 @@
-dataset: casp15 # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `casp15`)
+dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
 input_data_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set # the input protein-ligand complex directory to recursively parse
 input_protein_data_dir: null # the input protein structure directory to recursively parse during inference
 output_csv_dir: ${oc.env:PROJECT_ROOT}/forks/DynamicBind/inference/dynamicbind_${dataset}_inputs # the output CSV directory to which to write the parsed ligand SMILES strings
 posebusters_ccd_ids_filepath: ${oc.env:PROJECT_ROOT}/data/posebusters_pdb_ccd_ids.txt # the path to the PoseBusters PDB CCD IDs file that lists the targets that do not contain any crystal contacts
+dockgen_test_ids_filepath: ${oc.env:PROJECT_ROOT}/data/dockgen_set/split_test.txt # the path to the DockGen test set IDs file
 protein_filepath: null # the path to the protein structure file to use
 ligand_smiles: null # the ligand SMILES string for which to predict the binding pose
diff --git a/configs/data/fabind_input_preparation.yaml b/configs/data/fabind_input_preparation.yaml
@@ -1,4 +1,5 @@
-dataset: casp15 # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `casp15`)
+dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
 input_data_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set # the input protein-ligand complex directory to recursively parse
 output_csv_path: ${oc.env:PROJECT_ROOT}/forks/FABind/inference/fabind_${dataset}_inputs.csv # the output CSV filepath to which to write the parsed input data
 posebusters_ccd_ids_filepath: ${oc.env:PROJECT_ROOT}/data/posebusters_pdb_ccd_ids.txt # the path to the PoseBusters PDB CCD IDs file that lists the targets that do not contain any crystal contacts
+dockgen_test_ids_filepath: ${oc.env:PROJECT_ROOT}/data/dockgen_set/split_test.txt # the path to the DockGen test set IDs file
diff --git a/configs/data/neuralplexer_input_preparation.yaml b/configs/data/neuralplexer_input_preparation.yaml
@@ -1,9 +1,11 @@
-dataset: casp15 # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `casp15`)
+dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
 input_data_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set # the input protein-ligand complex directory to recursively parse
 input_receptor_structure_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set/${dataset}_holo_aligned_esmfold_structures # if not `null`, the input template protein structure directory to parse
 output_csv_path: ${oc.env:PROJECT_ROOT}/forks/NeuralPLexer/inference/neuralplexer_${dataset}_inputs.csv # the output CSV filepath to which to write the parsed input data
 posebusters_ccd_ids_filepath: ${oc.env:PROJECT_ROOT}/data/posebusters_pdb_ccd_ids.txt # the path to the PoseBusters PDB CCD IDs file that lists the targets that do not contain any crystal contacts
+dockgen_test_ids_filepath: ${oc.env:PROJECT_ROOT}/data/dockgen_set/split_test.txt # the path to the DockGen test set IDs file
 input_receptor: null # the input protein sequence
 input_ligand: null # the input ligand SMILES
 input_template: null # the input template protein structure to optionally use
 input_id: null # the input ID to use for inference
+pocket_only_baseline: false # whether to prepare the pocket-only baseline
diff --git a/configs/data/rfaa_input_preparation.yaml b/configs/data/rfaa_input_preparation.yaml
@@ -1,7 +1,9 @@
-dataset: casp15 # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `casp15`)
+dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
 input_data_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set # the input protein-ligand complex directory to recursively parse
 output_scripts_path: ${oc.env:PROJECT_ROOT}/forks/RoseTTAFold-All-Atom/prediction_inputs/${dataset} # the output directory in which to save the input files
 posebusters_ccd_ids_filepath: ${oc.env:PROJECT_ROOT}/data/posebusters_pdb_ccd_ids.txt # the path to the PoseBusters PDB CCD IDs file that lists the targets that do not contain any crystal contacts
+dockgen_test_ids_filepath: ${oc.env:PROJECT_ROOT}/data/dockgen_set/split_test.txt # the path to the DockGen test set IDs file
 protein_filepath: null # the path to the protein structure file to use
 ligand_smiles: null # the ligand SMILES string for which to predict the binding pose
 input_id: null # the input ID to use for inference
+pocket_only_baseline: false # whether to prepare the pocket-only baseline
diff --git a/configs/data/rfaa_output_extraction.yaml b/configs/data/rfaa_output_extraction.yaml
@@ -1,4 +1,4 @@
-dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `casp15`)
+dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
 prediction_inputs_dir: ${oc.env:PROJECT_ROOT}/forks/RoseTTAFold-All-Atom/prediction_inputs/${dataset}
 prediction_outputs_dir: ${oc.env:PROJECT_ROOT}/forks/RoseTTAFold-All-Atom/prediction_outputs/${dataset}_${repeat_index}
 inference_outputs_dir: ${oc.env:PROJECT_ROOT}/forks/RoseTTAFold-All-Atom/inference/rfaa_${dataset}_outputs_${repeat_index}

diff --git a/configs/model/diffdock_inference.yaml b/configs/model/diffdock_inference.yaml
@@ -1,7 +1,7 @@
 cuda_device_index: 0 # the CUDA device to use for inference, or `null` to use CPU
 python_exec_path: ${oc.env:PROJECT_ROOT}/forks/DiffDock/DiffDock/bin/python3 # the Python executable to use
 diffdock_exec_dir: ${oc.env:PROJECT_ROOT}/forks/DiffDock # the DiffDock directory in which to execute the inference scripts
-dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `casp15`)
+dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
 input_csv_path: ${oc.env:PROJECT_ROOT}/forks/DiffDock/inference/diffdock_${dataset}_inputs.csv # the input CSV filepath with which to run inference
 inference_config_path: ${oc.env:PROJECT_ROOT}/forks/DiffDock/default_inference_args.yaml # the inference configuration file to use
 output_dir: ${oc.env:PROJECT_ROOT}/forks/DiffDock/inference/diffdock_${dataset}_output_${repeat_index} # the output directory to which to save the inference results

diff --git a/configs/model/dynamicbind_inference.yaml b/configs/model/dynamicbind_inference.yaml
@@ -1,7 +1,7 @@
 cuda_device_index: 0 # the CUDA device to use for inference, or `null` to use CPU
 python_exec_path: ${oc.env:PROJECT_ROOT}/forks/DynamicBind/DynamicBind/bin/python3 # the Python executable to use
 dynamicbind_exec_dir: ${oc.env:PROJECT_ROOT}/forks/DynamicBind # the DynamicBind directory in which to execute the inference scripts
-dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `casp15`)
+dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
 input_data_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set/${dataset}_holo_aligned_esmfold_structures # the input protein-ligand complex directory to recursively parse for protein inputs
 input_ligand_csv_dir: ${oc.env:PROJECT_ROOT}/forks/DynamicBind/inference/dynamicbind_${dataset}_inputs # the input CSV directory with which to run inference
 samples_per_complex: 40 # the number of samples to generate per complex
@@ -12,3 +12,4 @@ header: ${dataset} # name of the results directory to create
 num_workers: 1 # the number of workers to use for native relaxation during inference
 skip_existing: true # whether to skip existing predictions
 repeat_index: 1 # the repeat index to use for inference
+pocket_only_baseline: false # whether to run the pocket-only baseline
diff --git a/configs/model/ensemble_generation.yaml b/configs/model/ensemble_generation.yaml
@@ -37,8 +37,9 @@ casp_author: "001" # group number to report in CASP format
 casp_method: "Ligand_Predictor" # the method name to report in CASP format
 combine_casp_output_files: false # whether to combine the CASP protein and ligand output files into a single file
 generate_hpc_scripts: false # whether to generate HPC scripts for running the ensemble generation; if `false`, then local scripts will be generated instead
+pocket_only_baseline: false # whether to run ensemble generation with only pocket-based baseline methods
 ensemble_benchmarking: false # whether to run ensemble benchmarking
-ensemble_benchmarking_dataset: posebusters_benchmark # the dataset to use for ensemble benchmarking - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `casp15`)
+ensemble_benchmarking_dataset: posebusters_benchmark # the dataset to use for ensemble benchmarking - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
 ensemble_benchmarking_repeat_index: 1 # the repeat index to use for ensemble benchmarking
 ensemble_ranking_method: consensus # the method with which to rank-order and select the top ensemble prediction for each target - NOTE: must be one of (`consensus`, `ff`)
 ensemble_benchmarking_apo_protein_dir: ${oc.env:PROJECT_ROOT}/data/${ensemble_benchmarking_dataset}_set/${ensemble_benchmarking_dataset}_holo_aligned_esmfold_structures # the directory containing the apo proteins to use for ensemble benchmarking

diff --git a/configs/model/fabind_inference.yaml b/configs/model/fabind_inference.yaml
@@ -1,7 +1,7 @@
 cuda_device_index: 0 # the CUDA device to use for inference, or `null` to use CPU
 python_exec_path: ${oc.env:PROJECT_ROOT}/forks/FABind/FABind/bin/python3 # the Python executable to use
 fabind_exec_dir: ${oc.env:PROJECT_ROOT}/forks/FABind/fabind # the FABind directory in which to execute the inference scripts
-dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `casp15`)
+dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
 input_csv_path: ${oc.env:PROJECT_ROOT}/forks/FABind/inference/fabind_${dataset}_inputs.csv # the input CSV filepath with which to run inference
 input_data_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set/${dataset}_holo_aligned_esmfold_structures # the input protein-ligand complex directory to recursively parse
 num_threads: 1 # the number of threads to use for inference
@@ -10,3 +10,4 @@ save_mols_dir: ${oc.env:PROJECT_ROOT}/forks/FABind/inference/fabind_${dataset}_t
 ckpt_path: ${oc.env:PROJECT_ROOT}/forks/FABind/ckpt/best_model.bin # the checkpoint path to use for inference
 output_dir: ${oc.env:PROJECT_ROOT}/forks/FABind/inference/fabind_${dataset}_output_${repeat_index} # the output directory to which to save the inference results
 repeat_index: 1 # the repeat index to use for inference
+pocket_only_baseline: false # whether to run the pocket-only baseline
diff --git a/configs/model/inference_relaxation.yaml b/configs/model/inference_relaxation.yaml
@@ -1,6 +1,6 @@
 method: diffdock # the method for which to relax predictions - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`, `vina`, `tulip`)
 vina_binding_site_method: diffdock # the method to use for Vina binding site prediction - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`)
-dataset: posebusters_benchmark # the dataset for which to relax predictions - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `casp15`)
+dataset: posebusters_benchmark # the dataset for which to relax predictions - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
 ensemble_ranking_method: consensus # the method with which to rank-order and select the top ensemble prediction for each target - NOTE: must be one of (`consensus`, `ff`)
 num_processes: 1 # the number of parallel processes to use for relaxation
 temp_dir: ${method}_${dataset}_cache_dir # temporary directory
@@ -10,7 +10,7 @@ prep_only: false # only prepare the input files
 platform: "fastest" # platform on which to run relaxation
 cuda_device_index: 0 # CUDA device index
 log_level: "INFO" # logging level
-protein_dir: ${resolve_method_protein_dir:${method},${dataset},${repeat_index}} # the directory from which to load (potentially inferred) proteins
+protein_dir: ${resolve_method_protein_dir:${method},${dataset},${repeat_index},${pocket_only_baseline}} # the directory from which to load (potentially inferred) proteins
 ligand_dir: ${resolve_method_ligand_dir:${method},${dataset},${vina_binding_site_method},${repeat_index}} # the directory from which to load inferred ligands
 output_dir: ${resolve_method_output_dir:${method},${dataset},${vina_binding_site_method},${ensemble_ranking_method},${repeat_index}} # the output directory to which to save the relaxed predictions
 relax_protein: false # whether to relax the protein - NOTE: currently periodically yields unpredictable protein-ligand separation
@@ -24,3 +24,4 @@ max_final_e_value: 1000.0 # when relaxing the protein, maximum final energy valu
 max_num_attempts: 5 # when relaxing the protein, maximum number of relaxation attempts to perform
 skip_existing: true # whether to skip existing relaxed predictions
 repeat_index: 1 # the repeat index which was used for inference
+pocket_only_baseline: false # whether to prepare the pocket-only baseline
diff --git a/configs/model/neuralplexer_inference.yaml b/configs/model/neuralplexer_inference.yaml
@@ -1,6 +1,6 @@
 python_exec_path: ${oc.env:PROJECT_ROOT}/forks/NeuralPLexer/NeuralPLexer/bin/python3 # the Python executable to use
 neuralplexer_exec_dir: ${oc.env:PROJECT_ROOT}/forks/NeuralPLexer # the NeuralPLexer directory in which to execute the inference scripts
-dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `casp15`)
+dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
 input_csv_path: ${oc.env:PROJECT_ROOT}/forks/NeuralPLexer/inference/neuralplexer_${dataset}_inputs.csv # the input CSV filepath to which parsed input data has been written
 skip_existing: true # whether to skip existing predictions
 task: batched_structure_sampling # the task to run - NOTE: must be one of (`single_sample_trajectory`, `batched_structure_sampling`, `structure_prediction_benchmarking`, `pdbbind_benchmarking`, `binding_site_recovery_benchmarking`)

diff --git a/configs/model/rfaa_inference.yaml b/configs/model/rfaa_inference.yaml
@@ -1,6 +1,6 @@
 python_exec_path: ${oc.env:PROJECT_ROOT}/forks/RoseTTAFold-All-Atom/RFAA/bin/python3 # the Python executable to use
 rfaa_exec_dir: ${oc.env:PROJECT_ROOT}/forks/RoseTTAFold-All-Atom # the RoseTTAFold-All-Atom directory in which to execute the inference scripts
-dataset: casp15 # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `casp15`)
+dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
 input_dir: ${oc.env:PROJECT_ROOT}/forks/RoseTTAFold-All-Atom/prediction_inputs/${dataset} # the input directory with which to run inference
 config_dir: ${oc.env:PROJECT_ROOT}/forks/RoseTTAFold-All-Atom/rf2aa/config/inference # the config directory with which to run inference
 output_dir: ${oc.env:PROJECT_ROOT}/forks/RoseTTAFold-All-Atom/prediction_outputs/${dataset}_${repeat_index} # the output directory to which to save the inference results