Skip to content

Commit

Permalink
Fixing the GeSeq annotation selenium scraper and some other bugs
Browse files Browse the repository at this point in the history
  • Loading branch information
Annaldasula committed Oct 11, 2023
1 parent 24931d5 commit 6f3813d
Show file tree
Hide file tree
Showing 6 changed files with 51 additions and 58 deletions.
16 changes: 9 additions & 7 deletions config/config.yaml
Original file line number Diff line number Diff line change
@@ -1,32 +1,33 @@
workdir: "/scratch/sjannalda/projects/PlastidPipelineTesting"

samples:
ERR5529317: /scratch/sjannalda/projects/PlastidPipelineTesting/data/ERR5529317/ERR5529317
#ERR5529317: /scratch/sjannalda/projects/PlastidPipelineTesting/data/ERR5529317/ERR5529317
#ERR5529436: /scratch/sjannalda/projects/PlastidPipelineTesting/data/ERR5529436/ERR5529436
#ERR5554746: /scratch/sjannalda/projects/PlastidPipelineTesting/data/ERR5554746/ERR5554746
ERR5554746: /scratch/sjannalda/projects/PlastidPipelineTesting/data/ERR5554746/ERR5554746
#ERR5529299: /scratch/sjannalda/projects/PlastidPipelineTesting/data/ERR5529299/ERR5529299
#Am09: /scratch/sjannalda/projects/Am09/run1/data/AM0909
#Am21: /scratch/sjannalda/projects/Am21/run1/data/AM2134
#SRR17032105: /scratch/sjannalda/projects/PlastidPipelineTesting/data/SRR17032105/SRR17032105
#SRR12432532: /scratch/sjannalda/projects/PlastidPipelineTesting/data/SRR12432532/SRR12432532



### Pipeline Options
Trimming: TRUE
Standardization: TRUE


### QC Options
#qc_folder: "qc"


### Trimming Options
adapter_trimming: "$EBROOTTRIMMOMATIC/adapters/TruSeq3-PE-2.fa" # Using adapters file provided by Trimmomatic, but you can provide your own file.
adapter_trimming: "$EBROOTTRIMMOMATIC/adapters/TruSeq3-PE-2.fa" # Using adapters file provided by Trimmomatic on HPC, but you can provide your own file.



### Filtering Options
MinQualityScore: 30



### Assembly Options
organelle_database_folder: "database" # Can either provide a path if already made, or it will be created within the workdir
OrganelleDatabasesDownload: "embplant_pt,embplant_mt" # Apparently GetOrganelle needs both of these first before proceeding
Expand All @@ -40,7 +41,7 @@ PreGrouped: 1000000 # -P. The maximum number (integer) of high-covered reads to

### Annotation Options for GeSeq
GenomeShape: "Linear" # Options: Linear, Circular
SequenceSource: "Land" # Options: Land, Algae, Mito. Refers to Plastid(Land plants), Plastid (algae), or Mitochondria
SequenceSource: "Land" # Options: Land, Algae, Mito. Refers to Plastid (Land plants), Plastid (algae), or Mitochondria
AnnotateIR: True # Refers to annotate plastid Inverted Repeat (IR)
AnnotateRPS12: True # Refers to annotate plastid trans-spliced rps12
AnnotationChloe: True # Refers to support annotation by Chloë
Expand All @@ -52,6 +53,7 @@ MPIMP_RefSet: True # Refers to use MPI's reference set of chloroplast land plant
MultiGenBank: True # Refers to generate multi-GenBank



### Standardization Options
lsc_gene: "rbcL" # Gene located in LSC. Default is "rbcL" (previously used "psbA" with direction -1)
lsc_gene_dir: 1 # Gene direction of lsc. 1 is forward, -1 is reverse
Expand Down
2 changes: 1 addition & 1 deletion workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ rule all:
expand("{sample}/qc/filtering/filtering_report_{sample}.html", sample=config["samples"]),
expand("{sample}/annotation/{sample}.standardardized.gb", sample=config["samples"]) if (config["Standardization"]) else expand("{sample}/annotation/{sample}.original.gb",sample=config["samples"]),
expand("{sample}/annotation/{sample}.standardardized.fasta", sample=config["samples"]) if (config["Standardization"]) else expand("{sample}/assembly/{sample}.original.fasta",sample=config["samples"]),
expand("{sample}/qc/backmapping/{sample}.backmapping.readcoverage.log", sample=config["samples"]),
#expand("{sample}/qc/backmapping/{sample}.backmapping.readcoverage.log", sample=config["samples"]),


include: "rules/Preprocessing.smk"
Expand Down
2 changes: 1 addition & 1 deletion workflow/envs/Snakemake.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ dependencies:
- c-ares=1.19.1=hd590300_0
- ca-certificates=2023.7.22=hbcca054_0
- cachetools=5.3.1=pyhd8ed1ab_0
- certifi=2023.7.22=pyhd8ed1ab_0
- certifi=2023.7.22=pyhd8ed1ab_0F
- cffi=1.15.1=py311h409f033_3
- charset-normalizer=3.2.0=pyhd8ed1ab_0
- coin-or-cbc=2.10.10=h9002f0b_0
Expand Down
14 changes: 11 additions & 3 deletions workflow/rules/Annotation.smk
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
rule AnnotationPreStandardization:
input:
rules.Assembly.output.assembly
"{sample}/assembly/{sample}.original.fasta"
output:
"{sample}/annotation/{sample}.original.gb"
resources:
mem_mb=1000,
time="0-00:20:00",
chrome=1
conda:
"../envs/Annotation.yaml"
localrule: True
Expand All @@ -16,17 +20,21 @@ rule StandardizationAnnotation:
"{sample}/annotation/{sample}.standardardized.fasta"
resources:
mem_mb=1000,
time="0-0:30:00"
time="0-0:20:00"
conda:
"../envs/Annotation.yaml"
script:
"../scripts/StandardizationAnnotation.py"

rule AnnotationPostStandardization:
input:
rules.StandardizationAnnotation.output
"{sample}/annotation/{sample}.standardardized.fasta"
output:
"{sample}/annotation/{sample}.standardardized.gb"
resources:
mem_mb=1000,
time="0-00:20:00",
chrome=1
conda:
"../envs/Annotation.yaml"
localrule: True
Expand Down
36 changes: 25 additions & 11 deletions workflow/scripts/GeSeqAutomation.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from selenium.webdriver.chrome.service import Service as ChromiumService
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.core.os_manager import ChromeType
from selenium.webdriver import ActionChains



Expand All @@ -37,15 +38,17 @@


##### Selenium Webdriver
options = Options()
options.BinaryLocation = "/usr/bin/chromium-browser"
#options = Options()
#options.BinaryLocation = "/usr/bin/chromium-browser"

op = webdriver.ChromeOptions()
op.add_argument("--headless")
op.add_argument('--ignore-certificate-errors')
op.add_argument("--no-sandbox")
op.add_argument("--disable-dev-shm-usage")
op.add_argument("--start-maximized")
driver = webdriver.Chrome(options=op,service=ChromiumService(ChromeDriverManager(chrome_type=ChromeType.CHROMIUM).install()))
#driver = webdriver.Chrome(options=op)
driver.get("https://chlorobox.mpimp-golm.mpg.de/geseq.html")
time.sleep(5)

Expand Down Expand Up @@ -163,12 +166,16 @@

if (MultiGenBank):
if (not output_options_block.find_element(By.ID,"multigenbank_enabled").is_selected()):
try:
multigenbank_button = WebDriverWait(driver, 20).until(EC.element_to_be_clickable(output_options_block.find_element(By.ID,"multigenbank_enabled")))
multigenbank_button.click()
except ElementClickInterceptedException:
print("Trying to click on the button again")
driver.execute_script("arguments[0].click()", multigenbank_button)
multigenbank_element = output_options_block.find_element(By.ID,"multigenbank_enabled")
actions = ActionChains(driver)
actions.move_to_element(multigenbank_element).click().perform()

#try:
# multigenbank_button = WebDriverWait(driver, 20).until(EC.element_to_be_clickable(output_options_block.find_element(By.ID,"multigenbank_enabled")))
# multigenbank_button.click()
#except ElementClickInterceptedException:
# print("Trying to click on the button again")
# driver.execute_script("arguments[0].click()", multigenbank_button)


### Actions
Expand All @@ -190,13 +197,20 @@

ignored_exceptions=(NoSuchElementException,StaleElementReferenceException,)

start_time = time.time()
job_status = results_block.find_element(By.CLASS_NAME,"gs_jobstatus").text.strip()
while(job_status != 'Status: finished'):
WebDriverWait(driver,5,ignored_exceptions=ignored_exceptions).until(EC.presence_of_element_located((By.CLASS_NAME,"gs_jobstatus")))
#WebDriverWait(driver,5,ignored_exceptions=ignored_exceptions).until(EC.presence_of_element_located((By.CLASS_NAME,"gs_jobstatus")))
time.sleep(1)
job_status = results_block.find_element(By.CLASS_NAME,"gs_jobstatus").text.strip()
try:
job_status = results_block.find_element(By.CLASS_NAME,"gs_jobstatus").text.strip()
except StaleElementReferenceException as e:
pass
print(job_status)

curr_time = time.time()
if (curr_time - start_time > 1000):
print("GeSeq took too long, exiting...")
assert(1==0)


##### Downloading GenBank file
Expand Down
39 changes: 4 additions & 35 deletions workflow/scripts/StandardizationAnnotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,6 @@
if (i.type == "gene"):
if (i.qualifiers["gene"][0]) == lsc_gene_name:
lsc_gene = i
#if ((i.qualifiers["gene"][0]) == "rrn23") and (i.location.strand == 1):
# rrn23_forward = i
#if ((i.qualifiers["gene"][0]) == "rrn23") and (i.location.strand == -1):
# rrn23_reverse = i
#if (i.qualifiers["gene"][0]) == "ccsA":
# ccsA = i
if (i.qualifiers["gene"][0]) == ssc_gene_name:
ssc_gene = i
if (i.type == "repeat_region" and "IRA" in i.qualifiers["note"][0]):
Expand All @@ -45,8 +39,7 @@

new_seq = ""

### LSC # psbA should be -1
# assert(psbA.location.start < IRA.location.start):
### LSC
try:
if (lsc_gene.location.strand == lsc_gene_dir):
new_seq += start_IRAstart
Expand All @@ -55,53 +48,29 @@
except NameError as error:
print("%s not found in file...skipping" %lsc_gene_name)


### rrn23 forward should be +1
# assert(IRA.location.start < rrn23_forward.location.start < IRA.location.end):
#try:
# if (rrn23_forward.location.strand == 1):
# new_seq += IRAstart_IRAend
# else:
# new_seq += IRAstart_IRAend.reverse_complement()
#except NameError as error:
# print("rrn23 forward not found in file...skipping")
new_seq += IRAstart_IRAend

### SSC # ccsA should be -1, ndhF should be +1, ccsA < ndhF
### SSC
try:
if (ssc_gene.location.strand == ssc_gene_dir):
new_seq += IRAend_IRBstart
else:
new_seq += IRAend_IRBstart.reverse_complement()
except NameError as error:
print("ccsA or ndhF not found in file...skipping")
# assert(IRA.location.end < ccsA.location.start < ndhF.location.start < IRB.location.start)


### rrn23 reverse should be +1
# assert(IRB.location.start < rrn23_reverse.location.start < IRB.location.end)
#try:
# if (rrn23_reverse.location.strand == -1):
# new_seq += IRBstart_IRBend
# else:
# new_seq += IRBstart_IRBend.reverse_complement()
#except NameError as error:
# print("rr23 reverse not found in file...skipping")
print("%s not found in file...skipping" %ssc_gene_name)

new_seq += IRBstart_IRBend

new_seq += IRBend_end

record = SeqRecord(
new_seq,
id="Am09",
id=snakemake.wildcards["sample"],
name="FastaStandardized",
description=""
)
# NOTE: The header of the FASTA file must contain no blanks/spaces (for some odd reason), e.g. ">Am09_Chloroplasts" is allowed, but not ">Am09 Chloroplasts".



##### Output

#annotation_filename_standardized = '/Users/SJAnnaldasula/Documents/BGBM/Plastid/%s_standardized.fasta' %os.path.splitext(annotation_filename)[0]
Expand Down

0 comments on commit 6f3813d

Please sign in to comment.