diff --git a/config/config.yaml b/config/config.yaml index 52871a8..081ada5 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -1,14 +1,23 @@ -workdir: "/scratch/sjannalda/projects/PlastidTutorial" +# Working directory where all the analysis will run. This should be in "scratch" or similar folder +workdir: "/scratch/sjannalda/projects/PlastidPipelineTesting" samples: - ERR5529317: /scratch/sjannalda/projects/PlastidPipelineTesting/rawdata/ERR5529317/ERR5529317 - ERR5529436: /scratch/sjannalda/projects/PlastidPipelineTesting/rawdata/ERR5529436/ERR5529436 - ERR5554746: /scratch/sjannalda/projects/PlastidPipelineTesting/rawdata/ERR5554746/ERR5554746 - + #SRR17032099: /scratch/sjannalda/projects/PlastidPipelineTesting/rawdata/SRR17032099/SRR17032099 + SRR12917849: /scratch/sjannalda/projects/PlastidPipelineTesting/rawdata/SRR12917849/SRR12917849 + #SRR12917857: /scratch/sjannalda/projects/PlastidPipelineTesting/rawdata/SRR12917857/SRR12917857 + #ERR5529317: /scratch/sjannalda/projects/PlastidPipelineTesting/rawdata/ERR5529317/ERR5529317 + #ERR5529436: /scratch/sjannalda/projects/PlastidPipelineTesting/rawdata/ERR5529436/ERR5529436 + #ERR5554746: /scratch/sjannalda/projects/PlastidPipelineTesting/rawdata/ERR5554746/ERR5554746 + #ERR5529299: /scratch/sjannalda/projects/PlastidPipelineTesting/rawdata/ERR5529299/ERR5529299 + #Am09: /scratch/sjannalda/projects/Am09/run1/data/AM0909 + #Am21: /scratch/sjannalda/projects/Am21/run1/data/AM2134 + #SRR17032105: /scratch/sjannalda/projects/PlastidPipelineTesting/rawdata/SRR17032105/SRR17032105 + #SRR12432532: /scratch/sjannalda/projects/PlastidPipelineTesting/rawdata/SRR12432532/SRR12432532 ### Pipeline Options Trimming: TRUE Standardization: TRUE +Submission: TRUE @@ -54,4 +63,9 @@ MultiGenBank: True # Refers to generate multi-GenBank lsc_gene: "rbcL" # Gene located in LSC. Default is "rbcL" (previously used "psbA" with direction -1) lsc_gene_dir: 1 # Gene direction of lsc. 1 is forward, -1 is reverse ssc_gene: "ndhF" # Gene located in SSC -ssc_gene_dir: 1 # Gene direction of ssc. 1 is forward, -1 is reverse \ No newline at end of file +ssc_gene_dir: 1 # Gene direction of ssc. 1 is forward, -1 is reverse + + + +### Submission Options +metadata_file: "/home/sjannalda/bgbm/projects/PlastidPipeline/config/metadata.txt" \ No newline at end of file diff --git a/config/metadata.txt b/config/metadata.txt new file mode 100644 index 0000000..d569f8c --- /dev/null +++ b/config/metadata.txt @@ -0,0 +1,15 @@ +AUTHORS Siddarth Annaldasula, Katja Reichel +INSTITUTION Institut fuer Biologie-Botanik, Freie Universitaet Berlin, Altensteinstrasse 6, Berlin, Berlin 14195,Germany +SAMPLE AM09_01 +SOURCE Arnica montana L. silica-dried leaf sample +SPECIES Arnica montana +TAXONOMY cellular organisms; Eukaryota; Viridiplantae; Streptophyta; Streptophytina; Embryophyta; Tracheophyta; Euphyllophyta; Spermatophyta; Magnoliopsida; Mesangiospermae; eudicotyledons; Gunneridae; Pentapetalae; asterids; campanulids; Asterales; Asteraceae; Asteroideae; Heliantheae alliance; Madieae; Arnicinae; Arnica +TAXREF 436207 +SOURCE_MOD +/altitude="0 m" +/collected_by="Esther Sossai, Elke Zippel" +/collection_date="17-Jun-2023" +/country="Germany:Mecklenburg-Vorpommern:Barth" +/identified_by="Elke Zippel" +/lat_lon="54.393536 N 12.701389 E" +/tissue_type="leaf" \ No newline at end of file diff --git a/workflow/Snakefile b/workflow/Snakefile index 7972077..7fe06be 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -7,9 +7,13 @@ rule all: expand("{sample}/qc/filtering/filtering_report_{sample}.html", sample=config["samples"]), expand("{sample}/annotation/{sample}.standardardized.gb", sample=config["samples"]) if (config["Standardization"]) else expand("{sample}/annotation/{sample}.original.gb",sample=config["samples"]), expand("{sample}/annotation/{sample}.standardardized.fasta", sample=config["samples"]) if (config["Standardization"]) else expand("{sample}/assembly/{sample}.original.fasta",sample=config["samples"]), - #expand("{sample}/qc/backmapping/{sample}.backmapping.readcoverage.log", sample=config["samples"]), + expand("{sample}/annotation/{sample}.standardardized.cleaned.gb", sample=config["samples"]) if (config["Standardization"]) else expand("{sample}/annotation/{sample}.original.cleaned.gb",sample=config["samples"]), + expand("{sample}/annotation/{sample}.submission.gb", sample=config["samples"]) if (config["Submission"]) else "", + expand("{sample}/qc/backmapping/{sample}.backmapping.readcoverage.log", sample=config["samples"]), + + include: "rules/Preprocessing.smk" include: "rules/Trimming.smk" @@ -22,4 +26,6 @@ include: "rules/Annotation.smk" include: "rules/Backmapping.smk" +include: "rules/Submission.smk" + localrules: AnnotationPreStandardization, AnnotationPostStandardization diff --git a/workflow/envs/Annotation.yaml b/workflow/envs/Annotation.yaml index 1b8d819..669db29 100644 --- a/workflow/envs/Annotation.yaml +++ b/workflow/envs/Annotation.yaml @@ -13,8 +13,8 @@ dependencies: - biopython=1.81=py311h2582759_0 - brotli-python=1.1.0=py311hb755f60_0 - bzip2=1.0.8=h7f98852_4 - - ca-certificates=2023.7.22=hbcca054_0 - - certifi=2023.7.22=pyhd8ed1ab_0 + - ca-certificates=2024.2.2=hbcca054_0 + - certifi=2024.2.2=pyhd8ed1ab_0 - charset-normalizer=3.2.0=pyhd8ed1ab_0 - colorama=0.4.6=pyhd8ed1ab_0 - exceptiongroup=1.1.3=pyhd8ed1ab_0 @@ -38,19 +38,24 @@ dependencies: - libzlib=1.2.13=hd590300_5 - ncurses=6.4=hcb278e6_0 - numpy=1.26.0=py311h64a7726_0 - - openssl=3.1.3=hd590300_0 + - openssl=3.2.1=hd590300_0 - outcome=1.2.0=pyhd8ed1ab_0 - packaging=23.1=pyhd8ed1ab_0 + - pandas=2.2.0=py311h320fe9a_0 - pip=23.2.1=pyhd8ed1ab_0 - pysocks=1.7.1=pyha2e5f31_6 - python=3.11.5=hab00c5b_0_cpython + - python-dateutil=2.8.2=pyhd8ed1ab_0 - python-dotenv=1.0.0=pyhd8ed1ab_1 + - python-tzdata=2023.4=pyhd8ed1ab_0 - python_abi=3.11=4_cp311 + - pytz=2024.1=pyhd8ed1ab_0 - readline=8.2=h8228510_1 - requests=2.31.0=pyhd8ed1ab_0 - selenium=4.12.0=pyhd8ed1ab_0 - selenium-manager=4.12.0=he8a937b_0 - setuptools=68.2.2=pyhd8ed1ab_0 + - six=1.16.0=pyh6c4a22f_0 - sniffio=1.3.0=pyhd8ed1ab_0 - sortedcontainers=2.4.0=pyhd8ed1ab_0 - tk=8.6.12=h27826a3_0 diff --git a/workflow/envs/Annotation_old.yaml b/workflow/envs/Annotation_old.yaml new file mode 100644 index 0000000..1b8d819 --- /dev/null +++ b/workflow/envs/Annotation_old.yaml @@ -0,0 +1,67 @@ +name: annotation +channels: + - conda-forge + - bioconda + - defaults + - r + - default + - anaconda +dependencies: + - _libgcc_mutex=0.1=conda_forge + - _openmp_mutex=4.5=2_gnu + - attrs=23.1.0=pyh71513ae_1 + - biopython=1.81=py311h2582759_0 + - brotli-python=1.1.0=py311hb755f60_0 + - bzip2=1.0.8=h7f98852_4 + - ca-certificates=2023.7.22=hbcca054_0 + - certifi=2023.7.22=pyhd8ed1ab_0 + - charset-normalizer=3.2.0=pyhd8ed1ab_0 + - colorama=0.4.6=pyhd8ed1ab_0 + - exceptiongroup=1.1.3=pyhd8ed1ab_0 + - h11=0.14.0=pyhd8ed1ab_0 + - idna=3.4=pyhd8ed1ab_0 + - ld_impl_linux-64=2.40=h41732ed_0 + - libblas=3.9.0=18_linux64_openblas + - libcblas=3.9.0=18_linux64_openblas + - libexpat=2.5.0=hcb278e6_1 + - libffi=3.4.2=h7f98852_5 + - libgcc-ng=13.2.0=h807b86a_2 + - libgfortran-ng=13.2.0=h69a702a_2 + - libgfortran5=13.2.0=ha4646dd_2 + - libgomp=13.2.0=h807b86a_2 + - liblapack=3.9.0=18_linux64_openblas + - libnsl=2.0.0=h7f98852_0 + - libopenblas=0.3.24=pthreads_h413a1c8_0 + - libsqlite=3.43.0=h2797004_0 + - libstdcxx-ng=13.2.0=h7e041cc_2 + - libuuid=2.38.1=h0b41bf4_0 + - libzlib=1.2.13=hd590300_5 + - ncurses=6.4=hcb278e6_0 + - numpy=1.26.0=py311h64a7726_0 + - openssl=3.1.3=hd590300_0 + - outcome=1.2.0=pyhd8ed1ab_0 + - packaging=23.1=pyhd8ed1ab_0 + - pip=23.2.1=pyhd8ed1ab_0 + - pysocks=1.7.1=pyha2e5f31_6 + - python=3.11.5=hab00c5b_0_cpython + - python-dotenv=1.0.0=pyhd8ed1ab_1 + - python_abi=3.11=4_cp311 + - readline=8.2=h8228510_1 + - requests=2.31.0=pyhd8ed1ab_0 + - selenium=4.12.0=pyhd8ed1ab_0 + - selenium-manager=4.12.0=he8a937b_0 + - setuptools=68.2.2=pyhd8ed1ab_0 + - sniffio=1.3.0=pyhd8ed1ab_0 + - sortedcontainers=2.4.0=pyhd8ed1ab_0 + - tk=8.6.12=h27826a3_0 + - tqdm=4.66.1=pyhd8ed1ab_0 + - trio=0.22.2=py311h38be061_0 + - trio-websocket=0.10.4=pyhd8ed1ab_0 + - typing_extensions=4.8.0=pyha770c72_0 + - tzdata=2023c=h71feb2d_0 + - urllib3=2.0.5=pyhd8ed1ab_0 + - webdriver-manager=4.0.0=pyhd8ed1ab_0 + - wheel=0.41.2=pyhd8ed1ab_0 + - wsproto=1.2.0=pyhd8ed1ab_0 + - xz=5.2.6=h166bdaf_0 +prefix: /home/sjannalda/mambaforge-pypy3/envs/annotation diff --git a/workflow/rules/Annotation.smk b/workflow/rules/Annotation.smk index e1d2e61..46daa43 100644 --- a/workflow/rules/Annotation.smk +++ b/workflow/rules/Annotation.smk @@ -4,8 +4,8 @@ rule AnnotationPreStandardization: output: "{sample}/annotation/{sample}.original.gb" resources: - mem_mb=1000, - time="0-00:20:00", + mem_mb=2000, + time="0-00:30:00", chrome=1 conda: "../envs/Annotation.yaml" @@ -32,11 +32,46 @@ rule AnnotationPostStandardization: output: "{sample}/annotation/{sample}.standardardized.gb" resources: - mem_mb=1000, - time="0-00:20:00", + mem_mb=2000, + time="0-00:30:00", chrome=1 conda: "../envs/Annotation.yaml" localrule: True script: - "../scripts/GeSeqAutomation.py" \ No newline at end of file + "../scripts/GeSeqAutomation.py" + +if (not config["Standardization"]): + rule annotationQualityCheckInputFunc: + input: + "{sample}/assembly/{sample}.original.fasta", + "{sample}/annotation/{sample}.original.gb" + output: + "{sample}/annotation/{sample}.original.cleaned.gb", + "{sample}/annotation/{sample}.original.incorrect.gb" + log: + "{sample}/annotation/{sample}.original.incorrect.log" + resources: + mem_mb=1000, + time="0-0:20:00" + conda: + "../envs/Annotation.yaml" + script: + "../scripts/AnnotationQualityControl.py" +else: + rule annotationQualityCheckInputFunc: + input: + "{sample}/annotation/{sample}.standardardized.fasta", + "{sample}/annotation/{sample}.standardardized.gb" + output: + "{sample}/annotation/{sample}.standardardized.cleaned.gb", + "{sample}/annotation/{sample}.standardardized.incorrect.gb" + log: + "{sample}/annotation/{sample}.standardardized.incorrect.log" + resources: + mem_mb=1000, + time="0-0:20:00" + conda: + "../envs/Annotation.yaml" + script: + "../scripts/AnnotationQualityControl.py" \ No newline at end of file diff --git a/workflow/rules/Submission.smk b/workflow/rules/Submission.smk new file mode 100644 index 0000000..0e687f1 --- /dev/null +++ b/workflow/rules/Submission.smk @@ -0,0 +1,20 @@ +def annotationSubmissionNCBIInphutFunc(wildcards): + output = [config["metadata_file"]] + if (not config["Standardization"]): + return output + ["{sample}/annotation/{sample}.original.cleaned.gb","{sample}/annotation/{sample}.original.fasta"] + else: + return output + ["{sample}/annotation/{sample}.standardardized.cleaned.gb","{sample}/annotation/{sample}.standardardized.fasta"] + + +rule annotationSubmissionNCBI: + input: + gb=annotationSubmissionNCBIInphutFunc + output: + "{sample}/annotation/{sample}.submission.gb" + resources: + mem_mb=1000, + time="0-0:20:00" + conda: + "../envs/Annotation.yaml" + script: + "../scripts/AnnotationSubmission.py" \ No newline at end of file diff --git a/workflow/scripts/AnnotationQualityControl.py b/workflow/scripts/AnnotationQualityControl.py new file mode 100644 index 0000000..3e9fe36 --- /dev/null +++ b/workflow/scripts/AnnotationQualityControl.py @@ -0,0 +1,135 @@ +import os +import sys +from Bio import SeqIO +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord +from Bio.SeqFeature import FeatureLocation +from Bio.SeqFeature import SeqFeature + +##### Error Classes + +class NotDivisbleBy3Error(Exception): + pass + +class TranslatedSequenceDiffersFromTranscriptSequenceError(Exception): + pass + +class DoesntStartWithStartCodonError(Exception): + pass + +class DoesntEndWithStopCodonError(Exception): + pass + +##### Input/output paths + +genome_file_input = os.path.join(snakemake.config["workdir"],snakemake.input[0]) +genome = SeqIO.read(genome_file_input, 'fasta') + +annotation_file_input = os.path.join(snakemake.config["workdir"],snakemake.input[1]) +annotation = SeqIO.read(annotation_file_input, 'genbank') + + +correct_annotation_output = os.path.join(snakemake.config["workdir"],snakemake.output[0]) +incorrect_annotation_output = os.path.join(snakemake.config["workdir"],snakemake.output[1]) + +error_log_file = os.path.join(snakemake.config["workdir"],snakemake.log[0]) +sys.stderr = sys.stdout = open(error_log_file, "w+") + +##### The Code + +output_correct = [] +output_incorrect = [] +features_gene = [] +translation_correct = True + +for feature in annotation.features: + if (feature.type != "source"): + if (feature.type == "gene"): + # Encounter new gene, add to the output unless there was an error in the CDS + if (features_gene != [] and translation_correct): + output_correct += features_gene + else: + output_incorrect += features_gene + features_gene = [] + translation_correct = True + + # Choose with annatator to use, default is blatX + annotators = feature.qualifiers["annotator"][0].split(";")[0] + if (len(annotators.split(",")) > 1): + if ("blatX" in annotators): + annotator = "blatX" + else: + annotator = annotators.split(",")[0] + else: + annotator = annotators + + elif (feature.type in ["CDS","tRNA"]): + transcript_seq = feature.extract(genome).seq + translated_seq = feature.extract(genome).seq.translate() + + # Check if CDS regions are fine + try: + error_msg = "Feature type '%s' at position start '%d'." %(feature.type, feature.location.parts[0].start + 1) + if (len(transcript_seq) % 3 != 0): + raise NotDivisbleBy3Error(error_msg) + if (len(transcript_seq)/3 != len(translated_seq)): + raise TranslatedSequenceDiffersFromTranscriptSequenceError(error_msg) + if (translated_seq[0] != "M"): + raise DoesntStartWithStartCodonError(error_msg) + if (translated_seq[-1] != "*"): + raise DoesntEndWithStopCodonError(error_msg) + + except NotDivisbleBy3Error as ve: + print(f"ERROR! Sequence is not divisible by 3: {ve}") + translation_correct = False + + except TranslatedSequenceDiffersFromTranscriptSequenceError as fnfe: + print(f"ERROR! Translated transcript sequence differs from the translated sequence: {fnfe}") + translation_correct = False + + except DoesntStartWithStartCodonError as zde: + print(f"ERROR! Feature does not start with a start codon: {zde}") + translation_correct = False + + except DoesntEndWithStopCodonError as e: + print(f"ERROR! Feature does not end with a stop codon: {e}") + translation_correct = False + + # Choose only the features with the annotator of interest + if (annotator in feature.qualifiers["annotator"][0]): + if ("info" in feature.qualifiers): + feature.qualifiers.pop("info") + if ("annotator" in feature.qualifiers): + feature.qualifiers.pop("annotator") + features_gene.append(feature) + +if (features_gene != [] and translation_correct): + output_correct += features_gene +else: + output_incorrect += features_gene + + +##### Writing Correct GenBank Entries +record_correct = SeqRecord(seq = genome.seq, + id = annotation.id, + name = annotation.name, + description = annotation.description) +record_correct.annotations["molecule_type"] = "genomic DNA" +record_correct.features.extend(output_correct) + +with open(correct_annotation_output, "w+") as output_correct_file: + SeqIO.write(record_correct, output_correct_file, "genbank") + + +##### Writing Incorrect GenBank Entries +record_incorrect = SeqRecord(seq = genome.seq, + id = annotation.id, + name = annotation.name, + description = annotation.description) +record_incorrect.annotations["molecule_type"] = "genomic DNA" +record_incorrect.features.extend(output_incorrect) + +with open(incorrect_annotation_output, "w+") as output_incorrect_file: + SeqIO.write(record_incorrect, output_incorrect_file, "genbank") + +sys.stderr.close() \ No newline at end of file diff --git a/workflow/scripts/AnnotationSubmission.py b/workflow/scripts/AnnotationSubmission.py new file mode 100644 index 0000000..1f73de6 --- /dev/null +++ b/workflow/scripts/AnnotationSubmission.py @@ -0,0 +1,87 @@ +from Bio import SeqIO, Seq +from Bio.SeqRecord import SeqRecord +from Bio.SeqFeature import SeqFeature, FeatureLocation, Reference +import Bio +import pandas as pd +from datetime import datetime +import os + + +##### Input Files +annotation_file_input = os.path.join(snakemake.config["workdir"],snakemake.input[1]) +annotation = SeqIO.read(annotation_file_input, 'genbank') + +genome_file_input = os.path.join(snakemake.config["workdir"],snakemake.input[2]) +genome = SeqIO.read(genome_file_input, 'fasta') + +metadata = pd.read_csv(os.path.join(snakemake.config["workdir"],snakemake.input[0]), sep = "\t", header = None, names = ["fields","info"], index_col = 0) + +metadata = metadata.T +if ("SOURCE_MOD" in metadata.columns): + source_mod = list(metadata.columns[list(metadata.columns).index("SOURCE_MOD")+1:]) + metadata = metadata.drop(columns=source_mod + ["SOURCE_MOD"]) + #metadata["SOURCE_MOD"]["info"] = source_mod + metadata = metadata.to_dict() +else: + source_mod = None + +##### +seq = genome.seq +locus = metadata["SAMPLE"]["info"] +definition = "%s accession %s plastid, complete genome" %(metadata["SPECIES"]["info"],metadata["SAMPLE"]["info"]) + +##### +annotations = dict() +annotations["date"] = datetime.now().strftime("%d-%b-%Y").upper() +annotations["source"] = metadata["SOURCE"]["info"] +annotations["accessions"] = metadata["SAMPLE"]["info"] +annotations["organism"] = metadata["SPECIES"]["info"] +annotations["taxonomy"] = [metadata["TAXONOMY"]["info"]] +annotations["comment"] = """Preliminary file ? please verify! + Annotated plastome generated by PIPELINENAME. + Please cite the following sources in your publication: + 1- Siddarth Annaldasula, Thomas Borsch, Katja Reichel (2024): + Title, Journal, doi. + 2- Michael Tillich, Pascal Lehwark, Tommaso Pellizzer, Elena S. + Ulbricht-Jones, Axel Fischer, Ralph Bock, Stephan Greiner (2017): + GeSeq ? versatile and accurate annotation of organelle genomes. + Nucleic Acids Research. https://doi.org/10.1093/nar/gkx391 + 3- Jian-Jun Jin, Wen-Bin Yu, Jun-Bo Yang, Yu Song, Claude W. + dePamphlis, Ting-Shuang Yi, De-Zhu Li (2020) GetOrganelle: a fast + and versatile toolkit for accurate de novo assembly of organelle + genomes. Genome Biology. https://doi.org/10.1186/s13059-020-02154-5""" +annotations["molecule_type"] = "DNA" +annotations["topology"] = "linear" + +reference = Reference() +reference.authors = metadata["AUTHORS"]["info"] +reference.title = "Direct Submission" +reference.journal = metadata["INSTITUTION"]["info"] +annotations["references"] = [reference] + +##### +features_source_dict = {"organism":metadata["SPECIES"]["info"], + "organelle":"plastid:chloroplast", + "molecule_type":"genomic DNA", + "db_xref":"taxon:"+metadata["TAXREF"]["info"]} +if (source_mod != None): + for source_mod_line in source_mod: + source_mod_info = source_mod_line.replace("/","").split("=") + features_source_dict[source_mod_info[0]] = source_mod_info[1].strip('\"').strip("\'") + +feature_source = SeqFeature(FeatureLocation(0, len(seq)), type="source", qualifiers = features_source_dict) + + +##### Output File +record = SeqRecord(seq = seq, + id = locus, + name = locus, + description = definition, + annotations = annotations, + features = [feature_source]) +record.annotations["data_file_division"] = "PLN" +record.features.extend(annotation.features) + + + +SeqIO.write(record, os.path.join(snakemake.config["workdir"],snakemake.output[0]), "genbank") \ No newline at end of file diff --git a/workflow/scripts/GeSeqAutomation.py b/workflow/scripts/GeSeqAutomation.py index cbd3259..ca461f3 100644 --- a/workflow/scripts/GeSeqAutomation.py +++ b/workflow/scripts/GeSeqAutomation.py @@ -223,14 +223,15 @@ assert(1==0) - + +time.sleep(5) ##### Downloading GenBank file annotation_filename = results_block.find_element(By.XPATH,'//a[@data-gs-format="GenBank"]').get_attribute("data-gs_filename") results_block.find_element(By.XPATH,'//a[@data-gs-format="GenBank"]').click() -time.sleep(5) +time.sleep(10) download_file_popup = driver.find_element(By.ID,"io_dialog") download_file_popup.find_element(By.CLASS_NAME,"cms_button_download").click() -time.sleep(5) +time.sleep(30) driver.quit()