From 50c5a0c595a34410da9e8cf4f816c575e20a4769 Mon Sep 17 00:00:00 2001 From: davidebolo1993 Date: Wed, 9 Aug 2023 22:44:36 +0200 Subject: [PATCH] switching to a previous version of gafpack --- Dockerfile | 1 + cosigt_smk/config/slurm/config.yaml | 2 +- cosigt_smk/workflow/Snakefile | 2 +- cosigt_smk/workflow/envs/python.yml | 2 + cosigt_smk/workflow/rules/bwa-mem2.smk | 6 +- cosigt_smk/workflow/rules/extract.smk | 20 ++++- cosigt_smk/workflow/rules/odgi.smk | 21 +----- cosigt_smk/workflow/rules/pggb.smk | 3 +- cosigt_smk/workflow/rules/pgrtk.smk | 82 +++++++++++++++++++-- cosigt_smk/workflow/scripts/filter_pgrtk.py | 49 ++++++++++++ cosigt_smk/workflow/scripts/organize.py | 18 ++--- cosigt_smk/workflow/scripts/pypgrtk.py | 12 +-- 12 files changed, 168 insertions(+), 50 deletions(-) create mode 100644 cosigt_smk/workflow/scripts/filter_pgrtk.py diff --git a/Dockerfile b/Dockerfile index f59a425..f682917 100644 --- a/Dockerfile +++ b/Dockerfile @@ -113,6 +113,7 @@ ENV PATH /opt/minimap2-2.26_x64-linux:$PATH RUN git clone https://github.com/ekg/gafpack.git \ && cd gafpack \ + && git checkout ad31875b6914d964c6fd72d1bf334f0843538fb6 \ && cargo install --force --path . ENV PATH /opt/gafpack/target/release:$PATH diff --git a/cosigt_smk/config/slurm/config.yaml b/cosigt_smk/config/slurm/config.yaml index 9ccf36c..df40bb6 100644 --- a/cosigt_smk/config/slurm/config.yaml +++ b/cosigt_smk/config/slurm/config.yaml @@ -10,7 +10,7 @@ cluster: --gres=gpu:{resources.gpu} default-resources: - partition=cpuq - - mem_mb=1000 + - mem_mb=3000 - time="00:05:00" - gpu=0 max-jobs-per-second: 100 diff --git a/cosigt_smk/workflow/Snakefile b/cosigt_smk/workflow/Snakefile index 3b503af..f6d37e4 100644 --- a/cosigt_smk/workflow/Snakefile +++ b/cosigt_smk/workflow/Snakefile @@ -7,9 +7,9 @@ df=(pd.read_table(config['samples'], dtype={'sample_id': str, 'cram':str}) .sort_index() ) -include: 'rules/extract.smk' include: 'rules/pgrtk.smk' include: 'rules/pggb.smk' +include: 'rules/extract.smk' include: 'rules/odgi.smk' include: 'rules/bwa-mem2.smk' include: 'rules/gfainject.smk' diff --git a/cosigt_smk/workflow/envs/python.yml b/cosigt_smk/workflow/envs/python.yml index 383ddfa..a09fca2 100644 --- a/cosigt_smk/workflow/envs/python.yml +++ b/cosigt_smk/workflow/envs/python.yml @@ -1,6 +1,7 @@ channels: - anaconda - conda-forge + - bioconda dependencies: - python=3.11.0 - numpy=1.23.5 @@ -8,3 +9,4 @@ dependencies: - matplotlib=3.6.3 - scikit-learn=1.2.2 - scipy=1.10.1 + - pyfaidx=0.7.2.1 diff --git a/cosigt_smk/workflow/rules/bwa-mem2.smk b/cosigt_smk/workflow/rules/bwa-mem2.smk index 963b8c4..d0c697d 100644 --- a/cosigt_smk/workflow/rules/bwa-mem2.smk +++ b/cosigt_smk/workflow/rules/bwa-mem2.smk @@ -3,9 +3,9 @@ rule bwa_mem2_index: bwa index ''' input: - rules.odgi_paths.output + rules.pgrtk_filter.output output: - multiext('results/odgi/paths/fasta/{region}.fasta', '.bwt.2bit.64', '.pac', '.ann', '.amb', '.0123') + multiext('results/pgrtk/fasta/{region}.fa', '.bwt.2bit.64', '.pac', '.ann', '.amb', '.0123') threads: 1 resources: @@ -23,7 +23,7 @@ rule bwa_mem2_samtools_sort: bwa-mem2 and sam-to-bam conversion with samtools ''' input: - ref=rules.odgi_paths.output, + ref=rules.pgrtk_filter.output, idx=rules.bwa_mem2_index.output, sample=rules.samtools_fasta.output output: diff --git a/cosigt_smk/workflow/rules/extract.smk b/cosigt_smk/workflow/rules/extract.smk index ec4bec9..9798490 100644 --- a/cosigt_smk/workflow/rules/extract.smk +++ b/cosigt_smk/workflow/rules/extract.smk @@ -1,12 +1,27 @@ from glob import glob + +rule get_proper_region: + input: + rules.faidx.output + output: + 'results/samtools/bed/{region}.bed' + threads: + 1 + params: + path=config['path'] + shell: + ''' + grep {params.path} {input} | cut -f 1 | cut -d "_" -f 1,3-4 | tr '_' '\t' > {output} + ''' + rule samtools_view: ''' samtools view to extract the region ''' input: sample=lambda wildcards: glob('resources/alignment/{sample}.*am'.format(sample=wildcards.sample)), - region=lambda wildcards: glob('resources/regions/{region}.bed'.format(region=wildcards.region)) + bed=rules.get_proper_region.output output: 'results/samtools/view/{sample}/{region}.bam' threads: @@ -26,8 +41,9 @@ rule samtools_view: -o {output} \ -T {params.ref} \ -@ {threads} \ + -L {input.bed} \ + -M \ {input.sample} \ - {params.region} ''' rule samtools_fasta: diff --git a/cosigt_smk/workflow/rules/odgi.smk b/cosigt_smk/workflow/rules/odgi.smk index 675f5e9..17331e7 100644 --- a/cosigt_smk/workflow/rules/odgi.smk +++ b/cosigt_smk/workflow/rules/odgi.smk @@ -55,26 +55,7 @@ rule odgi_paths_matrix: ''' odgi paths \ -i {input} \ - -H | cut -f 1,4- | pigz > {output} - ''' - -rule odgi_paths: - ''' - odgi paths - ''' - input: - rules.pggb.output - output: - 'results/odgi/paths/fasta/{region}.fasta' - threads: - 1 - container: - 'docker://pangenome/odgi:1689324294' - shell: - ''' - odgi paths \ - -i {input} \ - -f > {output} + -H | cut -f 1,4- | gzip > {output} ''' rule odgi_similarity: diff --git a/cosigt_smk/workflow/rules/pggb.smk b/cosigt_smk/workflow/rules/pggb.smk index c6a373d..a09a4bd 100644 --- a/cosigt_smk/workflow/rules/pggb.smk +++ b/cosigt_smk/workflow/rules/pggb.smk @@ -3,7 +3,7 @@ rule pggb: pggb ''' input: - fasta=rules.pgrtk_get_seq.output, + fasta=rules.pgrtk_filter.output, index=rules.faidx.output output: 'results/pggb/{region}.og' @@ -26,5 +26,6 @@ rule pggb: -k 419 \ -t {threads} \ -n 200 \ + -A \ && mv {params.prefix}/*smooth.final.og {output} ''' diff --git a/cosigt_smk/workflow/rules/pgrtk.smk b/cosigt_smk/workflow/rules/pgrtk.smk index a79123a..491fc08 100644 --- a/cosigt_smk/workflow/rules/pgrtk.smk +++ b/cosigt_smk/workflow/rules/pgrtk.smk @@ -1,24 +1,90 @@ -rule pgrtk_get_seq: +#rule pgrtk_get_seq: +# ''' +# run pgrtk api - deprecating in favour of cli, but keeping this here just in case want to re-introduce at some point +# ''' +# input: +# agc=config['agc'], +# region=lambda wildcards: glob('resources/regions/{region}.bed'.format(region=wildcards.region)) +# output: +# 'results/pgrtk/{region}.fa' +# threads: +# config['pgrtk']['threads'] +# container: +# 'docker://davidebolo1993/graph_genotyper:latest' +# params: +# padding=config['pgrtk']['padding'] +# resources: +# mem_mb=config['pgrtk']['mem_mb'], +# time=config['pgrtk']['time'] +# shell: +# ''' +# python workflow/scripts/pypgrtk.py {input.agc} {input.region} {params.padding} {output} +# ''' + +rule pgrtk_fetch_seqs: ''' - run pgrtk api + run pgrtk fetch ''' input: agc=config['agc'], region=lambda wildcards: glob('resources/regions/{region}.bed'.format(region=wildcards.region)) output: - 'results/pgrtk/{region}.fa' + 'results/pgrtk/fetch/{region}.fa' threads: - config['pgrtk']['threads'] + 32 container: 'docker://davidebolo1993/graph_genotyper:latest' + resources: + mem_mb=config['pgrtk']['mem_mb'], + time=config['pgrtk']['time'] params: - padding=config['pgrtk']['padding'] + agc_prefix=config['agc'].replace('.agc','') + shell: + ''' + pgr-fetch-seqs {params.agc_prefix} -r {input.region} > {output} + ''' + +rule pgrtk_query: + ''' + run pgrtk query + ''' + input: + agc=config['agc'], + fasta=rules.pgrtk_fetch_seqs.output + output: + hit='results/pgrtk/query/{region}.000.hit', + fasta='results/pgrtk/query/{region}.000.fa' + threads: + 32 + container: + 'docker://davidebolo1993/graph_genotyper:latest' resources: mem_mb=config['pgrtk']['mem_mb'], time=config['pgrtk']['time'] + params: + prefix='results/pgrtk/query/{region}', + agc_prefix=config['agc'].replace('.agc','') + shell: + ''' + pgr-query {params.agc_prefix} {input.fasta} {params.prefix} + ''' + +rule pgrtk_filter: + ''' + filter hits + ''' + input: + hit=rules.pgrtk_query.output.hit, + fasta=rules.pgrtk_query.output.fasta + output: + 'results/pgrtk/fasta/{region}.fa' + threads: + 1 + conda: + '../envs/python.yml' shell: ''' - python workflow/scripts/pypgrtk.py {input.agc} {input.region} {params.padding} {output} + python workflow/scripts/filter_pgrtk.py {input.fasta} {input.hit} {output} ''' rule faidx: @@ -26,9 +92,9 @@ rule faidx: samtools faidx ''' input: - rules.pgrtk_get_seq.output + rules.pgrtk_filter.output output: - 'results/pgrtk/{region}.fa.fai' + 'results/pgrtk/fasta/{region}.fa.fai' threads: 1 container: diff --git a/cosigt_smk/workflow/scripts/filter_pgrtk.py b/cosigt_smk/workflow/scripts/filter_pgrtk.py new file mode 100644 index 0000000..b19bbf1 --- /dev/null +++ b/cosigt_smk/workflow/scripts/filter_pgrtk.py @@ -0,0 +1,49 @@ +#!/usr/bin/python3 env + +#standard libraries +import os +import sys +import pyfaidx + +def main(input_fa,input_hit,output_fa): + + fasta=pyfaidx.Fasta(input_fa) + + with open(input_hit) as fin, open(output_fa, 'w') as fout: + + for row in fin: + + row = row.strip().split("\t") + + if row[0][0] == "#": + + continue + + (q_idx, q_name, query_bgn, query_end, q_len, aln_anchor_count, + src, ctg, ctg_bgn, ctg_end, orientation, out_seq_name) = row + query_bgn = int(query_bgn) + query_end = int(query_end) + ctg_bgn = int(ctg_bgn) + ctg_end = int(ctg_end) + q_len = int(q_len) + + #maybe need to check why this is 12000 - but itworks for test regions + if query_bgn > 12000 or q_len - query_end > 12000: + + continue + + if abs(ctg_end-ctg_bgn) > q_len * 3: + + continue + + key=[x for x in fasta.keys() if ctg+ '_' + str(ctg_bgn) in x] + seq=fasta[key[0]][:].seq + fout.write('>' + key[0].split('::')[1] + '\n' + seq + '\n') + + +if __name__ == '__main__': + + input_fa=os.path.abspath(sys.argv[1]) + input_hit=os.path.abspath(sys.argv[2]) + output_fa=os.path.abspath(sys.argv[3]) + main(input_fa,input_hit,output_fa) \ No newline at end of file diff --git a/cosigt_smk/workflow/scripts/organize.py b/cosigt_smk/workflow/scripts/organize.py index 5f817b5..a356166 100644 --- a/cosigt_smk/workflow/scripts/organize.py +++ b/cosigt_smk/workflow/scripts/organize.py @@ -87,8 +87,7 @@ def default_parameters(args): #pgrtk d['pgrtk']=dict() - d['pgrtk']['padding'] = args.pgrtk_padding - d['pgrtk']['threads'] = args.pgrtk_threads + #d['pgrtk']['padding'] = args.pgrtk_padding d['pgrtk']['mem_mb'] = args.pgrtk_memory d['pgrtk']['time'] = double_quote(args.pgrtk_time) @@ -130,14 +129,13 @@ def main(): #samtools extraction and sort metrics.add_argument('--sam_threads', help='threads - samtools (view/sort) commands [5]',type=int, default=5) - metrics.add_argument('--sam_time', help='max time (hh:mm:ss) - samtools (view/sort) commands ["00:01:00"]',type=str, default='00:01:000') + metrics.add_argument('--sam_time', help='max time (hh:mm:ss) - samtools (view/sort) commands ["00:02:00"]',type=str, default='00:02:00') metrics.add_argument('--sam_memory', help='max memory (mb) - samtools (view/sort) commands [2000]',type=int, default=2000) #pgrtk - metrics.add_argument('--pgrtk_padding', help='padding (#bps) - pgrtk commands [100000]',type=int, default=100000) - metrics.add_argument('--pgrtk_threads', help='threads - pgrtk commands [10]',type=int, default=10) - metrics.add_argument('--pgrtk_time', help='max time (hh:mm:ss) - pgrtk commands ["00:05:00"]',type=str, default='00:05:00') - metrics.add_argument('--pgrtk_memory', help='max memory (mb) - samtools (view/sort) commands [30000]',type=int, default=30000) + metrics.add_argument('--pgrtk_padding', help='padding - pgrtk commands [200000]',type=int, default=200000) + metrics.add_argument('--pgrtk_time', help='max time (hh:mm:ss) - pgrtk commands ["00:10:00"]',type=str, default='00:10:00') + metrics.add_argument('--pgrtk_memory', help='max memory (mb) - samtools (view/sort) commands [35000]',type=int, default=35000) #pggb metrics.add_argument('--pggb_threads', help='threads - pggb command [32]',type=int, default=32) @@ -263,16 +261,18 @@ def main(): #add to config d['reference'] = out_reference_file + d['path'] = args.path #add to config d['region'] = list() + with open(args.roi) as bed_in: for line in bed_in: l=line.rstrip().split('\t') - region=l[0] + ':' + l[1] + '-' + l[2] + region=l[0] + '_' + str(int(l[1])-args.pgrtk_padding) + '_' + str(int(l[2])+args.pgrtk_padding) #put regions in the config fille d['region'].append(region) @@ -282,7 +282,7 @@ def main(): with open(region_out, 'w') as out_region: - out_region.write(args.path +'_tagged.fa' + '\t' + l[0] + '_' + args.path + '\t' + l[1] + '\t' + l[2] + '\n') + out_region.write(region + '\t' + args.path +'_tagged.fa' + '\t' + l[0] + '_' + args.path + '\t' + str(int(l[1])-args.pgrtk_padding) + '\t' + str(int(l[2])+args.pgrtk_padding) + '\t0\n') #dump config diff --git a/cosigt_smk/workflow/scripts/pypgrtk.py b/cosigt_smk/workflow/scripts/pypgrtk.py index d52d29d..d60c470 100644 --- a/cosigt_smk/workflow/scripts/pypgrtk.py +++ b/cosigt_smk/workflow/scripts/pypgrtk.py @@ -28,25 +28,26 @@ def main(agc_file,bed_file,padding,out_file): roi_seq = ref_db.get_sub_seq(ref_file_name, roi_chr, roi_b-padding, roi_e+padding) #find hits in the pangenomic reference - aln_range = pgrtk.query_sdb(sdb, roi_seq, merge_range_tol=100000) #force constant merging + aln_range = pgrtk.query_sdb(sdb, roi_seq, merge_range_tol=100000) seq_list=[] for k in list(aln_range.keys()): ctg_name, source, _ = sdb.seq_info[k] rgns = aln_range[k].copy() - rgns = pgrtk.merge_regions(rgns, tol=1000) for rgn in rgns: b, e,_, orientation, aln = rgn - aln.sort() - + print(ctg_name,b,e) + if aln[0][0][0] > padding or aln[-1][0][1] < padding + roi_len: + continue - if e-b < 0.75 * (roi_len + 2 * padding): + if e-b < 0.75 * (roi_len + 2 * padding): + continue seq = sdb.get_sub_seq(source, ctg_name, b, e) @@ -55,6 +56,7 @@ def main(agc_file,bed_file,padding,out_file): seq = pgrtk.rc_byte_seq(seq) + seq_list.append(('{}_{}_{}_{}'.format(ctg_name, b, e, orientation), seq)) f0 = open(out_file, 'w')