Skip to content

Commit

Permalink
switching to a previous version of gafpack
Browse files Browse the repository at this point in the history
  • Loading branch information
davidebolo1993 committed Aug 9, 2023
1 parent 3d23482 commit 50c5a0c
Show file tree
Hide file tree
Showing 12 changed files with 168 additions and 50 deletions.
1 change: 1 addition & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ ENV PATH /opt/minimap2-2.26_x64-linux:$PATH

RUN git clone https://github.com/ekg/gafpack.git \
&& cd gafpack \
&& git checkout ad31875b6914d964c6fd72d1bf334f0843538fb6 \
&& cargo install --force --path .

ENV PATH /opt/gafpack/target/release:$PATH
Expand Down
2 changes: 1 addition & 1 deletion cosigt_smk/config/slurm/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ cluster:
--gres=gpu:{resources.gpu}
default-resources:
- partition=cpuq
- mem_mb=1000
- mem_mb=3000
- time="00:05:00"
- gpu=0
max-jobs-per-second: 100
Expand Down
2 changes: 1 addition & 1 deletion cosigt_smk/workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@ df=(pd.read_table(config['samples'], dtype={'sample_id': str, 'cram':str})
.sort_index()
)

include: 'rules/extract.smk'
include: 'rules/pgrtk.smk'
include: 'rules/pggb.smk'
include: 'rules/extract.smk'
include: 'rules/odgi.smk'
include: 'rules/bwa-mem2.smk'
include: 'rules/gfainject.smk'
Expand Down
2 changes: 2 additions & 0 deletions cosigt_smk/workflow/envs/python.yml
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
channels:
- anaconda
- conda-forge
- bioconda
dependencies:
- python=3.11.0
- numpy=1.23.5
- pandas=1.5.2
- matplotlib=3.6.3
- scikit-learn=1.2.2
- scipy=1.10.1
- pyfaidx=0.7.2.1
6 changes: 3 additions & 3 deletions cosigt_smk/workflow/rules/bwa-mem2.smk
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@ rule bwa_mem2_index:
bwa index
'''
input:
rules.odgi_paths.output
rules.pgrtk_filter.output
output:
multiext('results/odgi/paths/fasta/{region}.fasta', '.bwt.2bit.64', '.pac', '.ann', '.amb', '.0123')
multiext('results/pgrtk/fasta/{region}.fa', '.bwt.2bit.64', '.pac', '.ann', '.amb', '.0123')
threads:
1
resources:
Expand All @@ -23,7 +23,7 @@ rule bwa_mem2_samtools_sort:
bwa-mem2 and sam-to-bam conversion with samtools
'''
input:
ref=rules.odgi_paths.output,
ref=rules.pgrtk_filter.output,
idx=rules.bwa_mem2_index.output,
sample=rules.samtools_fasta.output
output:
Expand Down
20 changes: 18 additions & 2 deletions cosigt_smk/workflow/rules/extract.smk
Original file line number Diff line number Diff line change
@@ -1,12 +1,27 @@
from glob import glob


rule get_proper_region:
input:
rules.faidx.output
output:
'results/samtools/bed/{region}.bed'
threads:
1
params:
path=config['path']
shell:
'''
grep {params.path} {input} | cut -f 1 | cut -d "_" -f 1,3-4 | tr '_' '\t' > {output}
'''

rule samtools_view:
'''
samtools view to extract the region
'''
input:
sample=lambda wildcards: glob('resources/alignment/{sample}.*am'.format(sample=wildcards.sample)),
region=lambda wildcards: glob('resources/regions/{region}.bed'.format(region=wildcards.region))
bed=rules.get_proper_region.output
output:
'results/samtools/view/{sample}/{region}.bam'
threads:
Expand All @@ -26,8 +41,9 @@ rule samtools_view:
-o {output} \
-T {params.ref} \
-@ {threads} \
-L {input.bed} \
-M \
{input.sample} \
{params.region}
'''

rule samtools_fasta:
Expand Down
21 changes: 1 addition & 20 deletions cosigt_smk/workflow/rules/odgi.smk
Original file line number Diff line number Diff line change
Expand Up @@ -55,26 +55,7 @@ rule odgi_paths_matrix:
'''
odgi paths \
-i {input} \
-H | cut -f 1,4- | pigz > {output}
'''

rule odgi_paths:
'''
odgi paths
'''
input:
rules.pggb.output
output:
'results/odgi/paths/fasta/{region}.fasta'
threads:
1
container:
'docker://pangenome/odgi:1689324294'
shell:
'''
odgi paths \
-i {input} \
-f > {output}
-H | cut -f 1,4- | gzip > {output}
'''

rule odgi_similarity:
Expand Down
3 changes: 2 additions & 1 deletion cosigt_smk/workflow/rules/pggb.smk
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ rule pggb:
pggb
'''
input:
fasta=rules.pgrtk_get_seq.output,
fasta=rules.pgrtk_filter.output,
index=rules.faidx.output
output:
'results/pggb/{region}.og'
Expand All @@ -26,5 +26,6 @@ rule pggb:
-k 419 \
-t {threads} \
-n 200 \
-A \
&& mv {params.prefix}/*smooth.final.og {output}
'''
82 changes: 74 additions & 8 deletions cosigt_smk/workflow/rules/pgrtk.smk
Original file line number Diff line number Diff line change
@@ -1,34 +1,100 @@
rule pgrtk_get_seq:
#rule pgrtk_get_seq:
# '''
# run pgrtk api - deprecating in favour of cli, but keeping this here just in case want to re-introduce at some point
# '''
# input:
# agc=config['agc'],
# region=lambda wildcards: glob('resources/regions/{region}.bed'.format(region=wildcards.region))
# output:
# 'results/pgrtk/{region}.fa'
# threads:
# config['pgrtk']['threads']
# container:
# 'docker://davidebolo1993/graph_genotyper:latest'
# params:
# padding=config['pgrtk']['padding']
# resources:
# mem_mb=config['pgrtk']['mem_mb'],
# time=config['pgrtk']['time']
# shell:
# '''
# python workflow/scripts/pypgrtk.py {input.agc} {input.region} {params.padding} {output}
# '''

rule pgrtk_fetch_seqs:
'''
run pgrtk api
run pgrtk fetch
'''
input:
agc=config['agc'],
region=lambda wildcards: glob('resources/regions/{region}.bed'.format(region=wildcards.region))
output:
'results/pgrtk/{region}.fa'
'results/pgrtk/fetch/{region}.fa'
threads:
config['pgrtk']['threads']
32
container:
'docker://davidebolo1993/graph_genotyper:latest'
resources:
mem_mb=config['pgrtk']['mem_mb'],
time=config['pgrtk']['time']
params:
padding=config['pgrtk']['padding']
agc_prefix=config['agc'].replace('.agc','')
shell:
'''
pgr-fetch-seqs {params.agc_prefix} -r {input.region} > {output}
'''

rule pgrtk_query:
'''
run pgrtk query
'''
input:
agc=config['agc'],
fasta=rules.pgrtk_fetch_seqs.output
output:
hit='results/pgrtk/query/{region}.000.hit',
fasta='results/pgrtk/query/{region}.000.fa'
threads:
32
container:
'docker://davidebolo1993/graph_genotyper:latest'
resources:
mem_mb=config['pgrtk']['mem_mb'],
time=config['pgrtk']['time']
params:
prefix='results/pgrtk/query/{region}',
agc_prefix=config['agc'].replace('.agc','')
shell:
'''
pgr-query {params.agc_prefix} {input.fasta} {params.prefix}
'''

rule pgrtk_filter:
'''
filter hits
'''
input:
hit=rules.pgrtk_query.output.hit,
fasta=rules.pgrtk_query.output.fasta
output:
'results/pgrtk/fasta/{region}.fa'
threads:
1
conda:
'../envs/python.yml'
shell:
'''
python workflow/scripts/pypgrtk.py {input.agc} {input.region} {params.padding} {output}
python workflow/scripts/filter_pgrtk.py {input.fasta} {input.hit} {output}
'''

rule faidx:
'''
samtools faidx
'''
input:
rules.pgrtk_get_seq.output
rules.pgrtk_filter.output
output:
'results/pgrtk/{region}.fa.fai'
'results/pgrtk/fasta/{region}.fa.fai'
threads:
1
container:
Expand Down
49 changes: 49 additions & 0 deletions cosigt_smk/workflow/scripts/filter_pgrtk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/usr/bin/python3 env

#standard libraries
import os
import sys
import pyfaidx

def main(input_fa,input_hit,output_fa):

fasta=pyfaidx.Fasta(input_fa)

with open(input_hit) as fin, open(output_fa, 'w') as fout:

for row in fin:

row = row.strip().split("\t")

if row[0][0] == "#":

continue

(q_idx, q_name, query_bgn, query_end, q_len, aln_anchor_count,
src, ctg, ctg_bgn, ctg_end, orientation, out_seq_name) = row
query_bgn = int(query_bgn)
query_end = int(query_end)
ctg_bgn = int(ctg_bgn)
ctg_end = int(ctg_end)
q_len = int(q_len)

#maybe need to check why this is 12000 - but itworks for test regions
if query_bgn > 12000 or q_len - query_end > 12000:

continue

if abs(ctg_end-ctg_bgn) > q_len * 3:

continue

key=[x for x in fasta.keys() if ctg+ '_' + str(ctg_bgn) in x]
seq=fasta[key[0]][:].seq
fout.write('>' + key[0].split('::')[1] + '\n' + seq + '\n')


if __name__ == '__main__':

input_fa=os.path.abspath(sys.argv[1])
input_hit=os.path.abspath(sys.argv[2])
output_fa=os.path.abspath(sys.argv[3])
main(input_fa,input_hit,output_fa)
18 changes: 9 additions & 9 deletions cosigt_smk/workflow/scripts/organize.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,7 @@ def default_parameters(args):

#pgrtk
d['pgrtk']=dict()
d['pgrtk']['padding'] = args.pgrtk_padding
d['pgrtk']['threads'] = args.pgrtk_threads
#d['pgrtk']['padding'] = args.pgrtk_padding
d['pgrtk']['mem_mb'] = args.pgrtk_memory
d['pgrtk']['time'] = double_quote(args.pgrtk_time)

Expand Down Expand Up @@ -130,14 +129,13 @@ def main():

#samtools extraction and sort
metrics.add_argument('--sam_threads', help='threads - samtools (view/sort) commands [5]',type=int, default=5)
metrics.add_argument('--sam_time', help='max time (hh:mm:ss) - samtools (view/sort) commands ["00:01:00"]',type=str, default='00:01:000')
metrics.add_argument('--sam_time', help='max time (hh:mm:ss) - samtools (view/sort) commands ["00:02:00"]',type=str, default='00:02:00')
metrics.add_argument('--sam_memory', help='max memory (mb) - samtools (view/sort) commands [2000]',type=int, default=2000)

#pgrtk
metrics.add_argument('--pgrtk_padding', help='padding (#bps) - pgrtk commands [100000]',type=int, default=100000)
metrics.add_argument('--pgrtk_threads', help='threads - pgrtk commands [10]',type=int, default=10)
metrics.add_argument('--pgrtk_time', help='max time (hh:mm:ss) - pgrtk commands ["00:05:00"]',type=str, default='00:05:00')
metrics.add_argument('--pgrtk_memory', help='max memory (mb) - samtools (view/sort) commands [30000]',type=int, default=30000)
metrics.add_argument('--pgrtk_padding', help='padding - pgrtk commands [200000]',type=int, default=200000)
metrics.add_argument('--pgrtk_time', help='max time (hh:mm:ss) - pgrtk commands ["00:10:00"]',type=str, default='00:10:00')
metrics.add_argument('--pgrtk_memory', help='max memory (mb) - samtools (view/sort) commands [35000]',type=int, default=35000)

#pggb
metrics.add_argument('--pggb_threads', help='threads - pggb command [32]',type=int, default=32)
Expand Down Expand Up @@ -263,16 +261,18 @@ def main():

#add to config
d['reference'] = out_reference_file
d['path'] = args.path

#add to config
d['region'] = list()


with open(args.roi) as bed_in:

for line in bed_in:

l=line.rstrip().split('\t')
region=l[0] + ':' + l[1] + '-' + l[2]
region=l[0] + '_' + str(int(l[1])-args.pgrtk_padding) + '_' + str(int(l[2])+args.pgrtk_padding)

#put regions in the config fille
d['region'].append(region)
Expand All @@ -282,7 +282,7 @@ def main():

with open(region_out, 'w') as out_region:

out_region.write(args.path +'_tagged.fa' + '\t' + l[0] + '_' + args.path + '\t' + l[1] + '\t' + l[2] + '\n')
out_region.write(region + '\t' + args.path +'_tagged.fa' + '\t' + l[0] + '_' + args.path + '\t' + str(int(l[1])-args.pgrtk_padding) + '\t' + str(int(l[2])+args.pgrtk_padding) + '\t0\n')


#dump config
Expand Down
Loading

0 comments on commit 50c5a0c

Please sign in to comment.