Fixing the GeSeq annotation selenium scraper and some other bugs

sannalda · Oct 11, 2023 · 6f3813d · 6f3813d
1 parent 24931d5
commit 6f3813d
Show file tree

Hide file tree

Showing 6 changed files with 51 additions and 58 deletions.
diff --git a/config/config.yaml b/config/config.yaml
@@ -1,32 +1,33 @@
 workdir: "/scratch/sjannalda/projects/PlastidPipelineTesting"
 
 samples:
-    ERR5529317: /scratch/sjannalda/projects/PlastidPipelineTesting/data/ERR5529317/ERR5529317
+    #ERR5529317: /scratch/sjannalda/projects/PlastidPipelineTesting/data/ERR5529317/ERR5529317
     #ERR5529436: /scratch/sjannalda/projects/PlastidPipelineTesting/data/ERR5529436/ERR5529436
-    #ERR5554746: /scratch/sjannalda/projects/PlastidPipelineTesting/data/ERR5554746/ERR5554746
+    ERR5554746: /scratch/sjannalda/projects/PlastidPipelineTesting/data/ERR5554746/ERR5554746
     #ERR5529299: /scratch/sjannalda/projects/PlastidPipelineTesting/data/ERR5529299/ERR5529299
     #Am09: /scratch/sjannalda/projects/Am09/run1/data/AM0909
     #Am21: /scratch/sjannalda/projects/Am21/run1/data/AM2134
     #SRR17032105: /scratch/sjannalda/projects/PlastidPipelineTesting/data/SRR17032105/SRR17032105
     #SRR12432532: /scratch/sjannalda/projects/PlastidPipelineTesting/data/SRR12432532/SRR12432532
 
+
+
 ### Pipeline Options
 Trimming: TRUE
 Standardization: TRUE
 
 
-### QC Options
-#qc_folder: "qc"
-
 
 ### Trimming Options
-adapter_trimming: "$EBROOTTRIMMOMATIC/adapters/TruSeq3-PE-2.fa" # Using adapters file provided by Trimmomatic, but you can provide your own file.
+adapter_trimming: "$EBROOTTRIMMOMATIC/adapters/TruSeq3-PE-2.fa" # Using adapters file provided by Trimmomatic on HPC, but you can provide your own file.
+
 
 
 ### Filtering Options
 MinQualityScore: 30
 
 
+
 ### Assembly Options 
 organelle_database_folder: "database" # Can either provide a path if already made, or it will be created within the workdir
 OrganelleDatabasesDownload: "embplant_pt,embplant_mt" # Apparently GetOrganelle needs both of these first before proceeding
@@ -40,7 +41,7 @@ PreGrouped: 1000000 # -P. The maximum number (integer) of high-covered reads to
 
 ### Annotation Options for GeSeq
 GenomeShape: "Linear" # Options: Linear, Circular
-SequenceSource: "Land" # Options: Land, Algae, Mito. Refers to Plastid(Land plants), Plastid (algae), or Mitochondria
+SequenceSource: "Land" # Options: Land, Algae, Mito. Refers to Plastid (Land plants), Plastid (algae), or Mitochondria
 AnnotateIR: True # Refers to annotate plastid Inverted Repeat (IR)
 AnnotateRPS12: True # Refers to annotate plastid trans-spliced rps12
 AnnotationChloe: True # Refers to support annotation by Chloë
@@ -52,6 +53,7 @@ MPIMP_RefSet: True # Refers to use MPI's reference set of chloroplast land plant
 MultiGenBank: True # Refers to generate multi-GenBank
 
 
+
 ### Standardization Options
 lsc_gene: "rbcL" # Gene located in LSC. Default is "rbcL" (previously used "psbA" with direction -1)
 lsc_gene_dir: 1 # Gene direction of lsc. 1 is forward, -1 is reverse

diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -7,7 +7,7 @@ rule all:
 		expand("{sample}/qc/filtering/filtering_report_{sample}.html", sample=config["samples"]),
 		expand("{sample}/annotation/{sample}.standardardized.gb", sample=config["samples"]) if (config["Standardization"]) else expand("{sample}/annotation/{sample}.original.gb",sample=config["samples"]),
 		expand("{sample}/annotation/{sample}.standardardized.fasta", sample=config["samples"]) if (config["Standardization"]) else expand("{sample}/assembly/{sample}.original.fasta",sample=config["samples"]),
-		expand("{sample}/qc/backmapping/{sample}.backmapping.readcoverage.log", sample=config["samples"]),
+		#expand("{sample}/qc/backmapping/{sample}.backmapping.readcoverage.log", sample=config["samples"]),
 
 
 include: "rules/Preprocessing.smk"

diff --git a/workflow/envs/Snakemake.yaml b/workflow/envs/Snakemake.yaml
@@ -28,7 +28,7 @@ dependencies:
   - c-ares=1.19.1=hd590300_0
   - ca-certificates=2023.7.22=hbcca054_0
   - cachetools=5.3.1=pyhd8ed1ab_0
-  - certifi=2023.7.22=pyhd8ed1ab_0
+  - certifi=2023.7.22=pyhd8ed1ab_0
   - cffi=1.15.1=py311h409f033_3
   - charset-normalizer=3.2.0=pyhd8ed1ab_0
   - coin-or-cbc=2.10.10=h9002f0b_0

diff --git a/workflow/rules/Annotation.smk b/workflow/rules/Annotation.smk
@@ -1,8 +1,12 @@
 rule AnnotationPreStandardization:
 	input:
-		rules.Assembly.output.assembly
+		"{sample}/assembly/{sample}.original.fasta"
 	output:
 		"{sample}/annotation/{sample}.original.gb"
+	resources:
+		mem_mb=1000,
+		time="0-00:20:00",
+		chrome=1
 	conda:
 		"../envs/Annotation.yaml"
 	localrule: True
@@ -16,17 +20,21 @@ rule StandardizationAnnotation:
 		"{sample}/annotation/{sample}.standardardized.fasta"
 	resources:
 		mem_mb=1000,
-		time="0-0:30:00"
+		time="0-0:20:00"
 	conda:
 		"../envs/Annotation.yaml"
 	script:
 		"../scripts/StandardizationAnnotation.py"
 
 rule AnnotationPostStandardization:
 	input:
-		rules.StandardizationAnnotation.output
+		"{sample}/annotation/{sample}.standardardized.fasta"
 	output:
 		"{sample}/annotation/{sample}.standardardized.gb"
+	resources:
+		mem_mb=1000,
+		time="0-00:20:00",
+		chrome=1
 	conda:
 		"../envs/Annotation.yaml"
 	localrule: True

diff --git a/workflow/scripts/GeSeqAutomation.py b/workflow/scripts/GeSeqAutomation.py
@@ -13,6 +13,7 @@
 from selenium.webdriver.chrome.service import Service as ChromiumService
 from webdriver_manager.chrome import ChromeDriverManager
 from webdriver_manager.core.os_manager import ChromeType
+from selenium.webdriver import ActionChains
 
 
 
@@ -37,15 +38,17 @@
 
 
 ##### Selenium Webdriver
-options = Options()
-options.BinaryLocation = "/usr/bin/chromium-browser" 
+#options = Options()
+#options.BinaryLocation = "/usr/bin/chromium-browser" 
 
 op = webdriver.ChromeOptions()
 op.add_argument("--headless")
 op.add_argument('--ignore-certificate-errors')
 op.add_argument("--no-sandbox")
 op.add_argument("--disable-dev-shm-usage")
+op.add_argument("--start-maximized")
 driver = webdriver.Chrome(options=op,service=ChromiumService(ChromeDriverManager(chrome_type=ChromeType.CHROMIUM).install()))
+#driver = webdriver.Chrome(options=op)
 driver.get("https://chlorobox.mpimp-golm.mpg.de/geseq.html")
 time.sleep(5)
 
@@ -163,12 +166,16 @@
 
 if (MultiGenBank):
     if (not output_options_block.find_element(By.ID,"multigenbank_enabled").is_selected()):
-        try:
-            multigenbank_button = WebDriverWait(driver, 20).until(EC.element_to_be_clickable(output_options_block.find_element(By.ID,"multigenbank_enabled")))
-            multigenbank_button.click()
-        except ElementClickInterceptedException:
-            print("Trying to click on the button again")
-            driver.execute_script("arguments[0].click()", multigenbank_button)
+        multigenbank_element = output_options_block.find_element(By.ID,"multigenbank_enabled")
+        actions = ActionChains(driver)
+        actions.move_to_element(multigenbank_element).click().perform()
+
+        #try:
+        #    multigenbank_button = WebDriverWait(driver, 20).until(EC.element_to_be_clickable(output_options_block.find_element(By.ID,"multigenbank_enabled")))
+        #    multigenbank_button.click()
+        #except ElementClickInterceptedException:
+        #    print("Trying to click on the button again")
+        #    driver.execute_script("arguments[0].click()", multigenbank_button)
 
 
 ### Actions
@@ -190,13 +197,20 @@
 
 ignored_exceptions=(NoSuchElementException,StaleElementReferenceException,)
 
+start_time = time.time()
 job_status = results_block.find_element(By.CLASS_NAME,"gs_jobstatus").text.strip()
 while(job_status != 'Status: finished'):
-    WebDriverWait(driver,5,ignored_exceptions=ignored_exceptions).until(EC.presence_of_element_located((By.CLASS_NAME,"gs_jobstatus")))
+    #WebDriverWait(driver,5,ignored_exceptions=ignored_exceptions).until(EC.presence_of_element_located((By.CLASS_NAME,"gs_jobstatus")))
     time.sleep(1)
-    job_status = results_block.find_element(By.CLASS_NAME,"gs_jobstatus").text.strip()
+    try:
+        job_status = results_block.find_element(By.CLASS_NAME,"gs_jobstatus").text.strip()
+    except StaleElementReferenceException as e:
+        pass
     print(job_status)
-
+    curr_time = time.time()
+    if (curr_time - start_time > 1000):
+        print("GeSeq took too long, exiting...")
+        assert(1==0)
 
 
 ##### Downloading GenBank file

diff --git a/workflow/scripts/StandardizationAnnotation.py b/workflow/scripts/StandardizationAnnotation.py
@@ -20,12 +20,6 @@
     if (i.type == "gene"):
         if (i.qualifiers["gene"][0]) == lsc_gene_name:
             lsc_gene = i
-        #if ((i.qualifiers["gene"][0]) == "rrn23") and (i.location.strand == 1):
-        #    rrn23_forward = i
-        #if ((i.qualifiers["gene"][0]) == "rrn23") and (i.location.strand == -1):
-        #    rrn23_reverse = i
-        #if (i.qualifiers["gene"][0]) == "ccsA":
-        #    ccsA = i
         if (i.qualifiers["gene"][0]) == ssc_gene_name:
             ssc_gene = i
     if (i.type == "repeat_region" and "IRA" in i.qualifiers["note"][0]):
@@ -45,8 +39,7 @@
 
 new_seq = ""
 
-### LSC # psbA should be -1
-# assert(psbA.location.start < IRA.location.start):
+### LSC 
 try:
     if (lsc_gene.location.strand == lsc_gene_dir):
         new_seq += start_IRAstart
@@ -55,53 +48,29 @@
 except NameError as error:
     print("%s not found in file...skipping" %lsc_gene_name)
 
-
-### rrn23 forward should be +1
-# assert(IRA.location.start < rrn23_forward.location.start < IRA.location.end):
-#try:
-#    if (rrn23_forward.location.strand == 1):
-#        new_seq += IRAstart_IRAend
-#    else:
-#        new_seq += IRAstart_IRAend.reverse_complement()
-#except NameError as error:
-#    print("rrn23 forward not found in file...skipping")
 new_seq += IRAstart_IRAend    
 
-### SSC # ccsA should be -1, ndhF should be +1, ccsA < ndhF
+### SSC 
 try:
     if (ssc_gene.location.strand == ssc_gene_dir):
         new_seq += IRAend_IRBstart
     else:
         new_seq += IRAend_IRBstart.reverse_complement()
 except NameError as error:
-    print("ccsA or ndhF not found in file...skipping")
-# assert(IRA.location.end < ccsA.location.start < ndhF.location.start < IRB.location.start)
-
-
-### rrn23 reverse should be +1
-# assert(IRB.location.start < rrn23_reverse.location.start < IRB.location.end)
-#try:
-#    if (rrn23_reverse.location.strand == -1):
-#        new_seq += IRBstart_IRBend
-#    else:
-#        new_seq += IRBstart_IRBend.reverse_complement()
-#except NameError as error:
-#    print("rr23 reverse not found in file...skipping")
+    print("%s not found in file...skipping" %ssc_gene_name)
 
 new_seq += IRBstart_IRBend
-
 new_seq += IRBend_end
 
 record = SeqRecord(
     new_seq,
-    id="Am09",
+    id=snakemake.wildcards["sample"],
     name="FastaStandardized",
     description=""
 )
 # NOTE: The header of the FASTA file must contain no blanks/spaces (for some odd reason) i.e. ">Am09_Chloroplasts" is allowed, but not ">Am09 Chloroplasts"
 
 
-
 ##### Output
 
 #annotation_filename_standardized = '/Users/SJAnnaldasula/Documents/BGBM/Plastid/%s_standardized.fasta' %os.path.splitext(annotation_filename)[0]