From 360abfa79fdb662fd7c738b5f3ce1e53579fea7e Mon Sep 17 00:00:00 2001 From: davidebolo1993 Date: Wed, 24 Apr 2024 14:14:00 +0200 Subject: [PATCH 1/4] switching to impg, serveral code improvements, adding benchmark --- cosigt_smk/README.md | 4 +- cosigt_smk/benchmarks/.gitignore | 0 cosigt_smk/workflow/Snakefile | 6 +- cosigt_smk/workflow/envs/python.yml | 12 -- cosigt_smk/workflow/envs/r.yml | 8 - cosigt_smk/workflow/rules/bedtools.smk | 26 ++++ cosigt_smk/workflow/rules/bwa-mem2.smk | 28 ++-- cosigt_smk/workflow/rules/cosigt.smk | 13 +- cosigt_smk/workflow/rules/extract.smk | 161 -------------------- cosigt_smk/workflow/rules/faidx.smk | 46 ++++++ cosigt_smk/workflow/rules/gafpack.smk | 6 +- cosigt_smk/workflow/rules/gfainject.smk | 14 +- cosigt_smk/workflow/rules/impg.smk | 30 ++++ cosigt_smk/workflow/rules/odgi.smk | 29 ++-- cosigt_smk/workflow/rules/pggb.smk | 10 +- cosigt_smk/workflow/rules/samtools.smk | 82 ++++++++++ cosigt_smk/workflow/scripts/organize.py | 112 +++++++------- cosigt_smk/workflow/scripts/plotbenchmark.r | 58 +++++++ 18 files changed, 367 insertions(+), 278 deletions(-) create mode 100644 cosigt_smk/benchmarks/.gitignore delete mode 100644 cosigt_smk/workflow/envs/python.yml delete mode 100644 cosigt_smk/workflow/envs/r.yml create mode 100644 cosigt_smk/workflow/rules/bedtools.smk delete mode 100644 cosigt_smk/workflow/rules/extract.smk create mode 100644 cosigt_smk/workflow/rules/faidx.smk create mode 100644 cosigt_smk/workflow/rules/impg.smk create mode 100644 cosigt_smk/workflow/rules/samtools.smk create mode 100644 cosigt_smk/workflow/scripts/plotbenchmark.r diff --git a/cosigt_smk/README.md b/cosigt_smk/README.md index b45c761..9fda98a 100644 --- a/cosigt_smk/README.md +++ b/cosigt_smk/README.md @@ -1,3 +1 @@ -# Snakemake pipeline - -Docs are in preparation +# Snakemake pipeline \ No newline at end of file diff --git a/cosigt_smk/benchmarks/.gitignore b/cosigt_smk/benchmarks/.gitignore new file mode 100644 index 0000000..e69de29 diff --git a/cosigt_smk/workflow/Snakefile b/cosigt_smk/workflow/Snakefile index 5c92d3f..e407774 100644 --- a/cosigt_smk/workflow/Snakefile +++ b/cosigt_smk/workflow/Snakefile @@ -1,4 +1,5 @@ import pandas as pd +from glob import glob configfile: 'config/config.yaml' @@ -7,7 +8,10 @@ df=(pd.read_table(config['samples'], dtype={'sample_id': str, 'cram':str}) .sort_index() ) -include: 'rules/extract.smk' +include: 'rules/impg.smk' +include: 'rules/bedtools.smk' +include: 'rules/samtools.smk' +include: 'rules/faidx.smk' include: 'rules/pggb.smk' include: 'rules/odgi.smk' include: 'rules/bwa-mem2.smk' diff --git a/cosigt_smk/workflow/envs/python.yml b/cosigt_smk/workflow/envs/python.yml deleted file mode 100644 index a09fca2..0000000 --- a/cosigt_smk/workflow/envs/python.yml +++ /dev/null @@ -1,12 +0,0 @@ -channels: - - anaconda - - conda-forge - - bioconda -dependencies: - - python=3.11.0 - - numpy=1.23.5 - - pandas=1.5.2 - - matplotlib=3.6.3 - - scikit-learn=1.2.2 - - scipy=1.10.1 - - pyfaidx=0.7.2.1 diff --git a/cosigt_smk/workflow/envs/r.yml b/cosigt_smk/workflow/envs/r.yml deleted file mode 100644 index de0a57d..0000000 --- a/cosigt_smk/workflow/envs/r.yml +++ /dev/null @@ -1,8 +0,0 @@ -channels: - - conda-forge -dependencies: - - r-base - - r-ggplot2 - - r-data.table - - r-ggforce - - r-rjson diff --git a/cosigt_smk/workflow/rules/bedtools.smk b/cosigt_smk/workflow/rules/bedtools.smk new file mode 100644 index 0000000..d647761 --- /dev/null +++ b/cosigt_smk/workflow/rules/bedtools.smk @@ -0,0 +1,26 @@ +rule bedtools_merge: + ''' + https://github.com/arq5x/bedtools2 + ''' + input: + rules.impg_project.output + output: + config['output'] + '/bedtools/{region}.bed' + threads: + 1 + resources: + mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'], + time=lambda wildcards, attempt: attempt * config['default']['time'] + container: + 'docker://davidebolo1993/cosigt_workflow:latest' + benchmark: + 'benchmarks/{region}.bedtools_merge.benchmark.txt' + + shell: + ''' + bedtools sort \ + -i {input} | \ + bedtools merge \ + -d 50000 \ + -i - > {output} + ''' \ No newline at end of file diff --git a/cosigt_smk/workflow/rules/bwa-mem2.smk b/cosigt_smk/workflow/rules/bwa-mem2.smk index d55a879..7a77d44 100644 --- a/cosigt_smk/workflow/rules/bwa-mem2.smk +++ b/cosigt_smk/workflow/rules/bwa-mem2.smk @@ -1,11 +1,11 @@ -rule bwa_mem2_index: +rule bwamem2_index: ''' - bwa index + https://github.com/bwa-mem2/bwa-mem2 ''' input: - rules.odgi_paths_fasta.output + rules.pyfaidx_extract.output output: - multiext(config['output'] + '/odgi/paths/fasta/{region}.fa', '.bwt.2bit.64', '.pac', '.ann', '.amb', '.0123') + multiext(config['output'] + '/pyfaidx/{region}.fasta', '.bwt.2bit.64', '.pac', '.ann', '.amb', '.0123') threads: 1 resources: @@ -13,18 +13,21 @@ rule bwa_mem2_index: time=lambda wildcards, attempt: attempt * config['bwa-mem2']['time'] container: 'docker://davidebolo1993/cosigt_workflow:latest' + benchmark: + 'benchmarks/{region}.bwamem2_index.benchmark.txt' shell: ''' bwa-mem2 index {input} ''' -rule bwa_mem2_samtools_sort: +rule bwamem2_mem_samtools_sort: ''' - bwa-mem2 and sam-to-bam conversion with samtools + https://github.com/bwa-mem2/bwa-mem2 + https://github.com/samtools/samtools ''' input: - ref=rules.odgi_paths_fasta.output, - idx=rules.bwa_mem2_index.output, + ref=rules.pyfaidx_extract.output, + idx=rules.bwamem2_index.output, sample=rules.samtools_fasta.output output: config['output'] + '/bwa-mem2/{sample}/{region}.realigned.bam' @@ -35,7 +38,14 @@ rule bwa_mem2_samtools_sort: time=lambda wildcards, attempt: attempt * config['bwa-mem2']['time'] container: 'docker://davidebolo1993/cosigt_workflow:latest' + benchmark: + 'benchmarks/{sample}.{region}.bwamem2_mem_samtools_sort.benchmark.txt' shell: ''' - bwa-mem2 mem -t {threads} {input.ref} {input.sample} | samtools sort -@ {threads} - > {output} + bwa-mem2 mem \ + -t {threads} \ + {input.ref} \ + {input.sample} | samtools sort \ + -@ {threads} \ + - > {output} ''' diff --git a/cosigt_smk/workflow/rules/cosigt.smk b/cosigt_smk/workflow/rules/cosigt.smk index 3ffd85f..411e6dc 100644 --- a/cosigt_smk/workflow/rules/cosigt.smk +++ b/cosigt_smk/workflow/rules/cosigt.smk @@ -1,11 +1,11 @@ rule cosigt_genotype: ''' - cosigt genotype + https://github.com/davidebolo1993/cosigt ''' input: zpath=rules.odgi_paths_matrix.output, xpack=rules.gafpack_coverage.output, - cluster=rules.cluster.output + cluster=rules.make_clusters.output output: geno=config['output'] + '/cosigt/{sample}/{region}/cosigt_genotype.tsv', combo=config['output'] + '/cosigt/{sample}/{region}/sorted_combos.tsv' @@ -18,7 +18,14 @@ rule cosigt_genotype: 'docker://davidebolo1993/cosigt_workflow:latest' params: prefix=config['output'] + '/cosigt/{sample}/{region}' + benchmark: + 'benchmarks/{sample}.{region}.cosigt_genotype.benchmark.txt' shell: ''' - cosigt -p {input.zpath} -g {input.xpack} -c {input.cluster} -b resources/extra/blacklist.txt -o {params.prefix} + cosigt \ + -p {input.zpath} \ + -g {input.xpack} \ + -c {input.cluster} \ + -b resources/extra/blacklist.txt \ + -o {params.prefix} ''' \ No newline at end of file diff --git a/cosigt_smk/workflow/rules/extract.smk b/cosigt_smk/workflow/rules/extract.smk deleted file mode 100644 index a8393c8..0000000 --- a/cosigt_smk/workflow/rules/extract.smk +++ /dev/null @@ -1,161 +0,0 @@ -from glob import glob -import os - -rule odgi_build: - ''' - odgi build - ''' - input: - config['graph'] - output: - config['output'] + '/odgi/build/' + os.path.basename(config['graph'].replace('.gfa', '.og')) - threads: - config['odgi']['threads'] - resources: - mem_mb=lambda wildcards, attempt: attempt * config['odgi']['mem_mb'], - time=lambda wildcards, attempt: attempt * config['odgi']['time'] - container: - 'docker://pangenome/odgi:1707818641' - shell: - ''' - odgi build \ - -g {input} \ - -o {output} \ - -t {threads} - ''' - -rule odgi_extract: - ''' - odgi extract - ''' - input: - graph=rules.odgi_build.output, - region=lambda wildcards: glob('resources/regions/{region}.bed'.format(region=wildcards.region)) - output: - config['output'] + '/odgi/extract/{region}.og' - threads: - config['odgi']['threads'] - resources: - mem_mb=lambda wildcards, attempt: attempt * config['odgi']['mem_mb'], - time=lambda wildcards, attempt: attempt * config['odgi']['time'] - container: - 'docker://pangenome/odgi:1707818641' - shell: - ''' - odgi extract \ - -i {input.graph} \ - -b {input.region} \ - -t {threads} \ - -O \ - -o {output} - ''' - - -rule odgi_paths_fasta: - ''' - odgi paths - ''' - input: - rules.odgi_extract.output - output: - config['output'] + '/odgi/paths/fasta/{region}.fa' - threads: - 1 - resources: - mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'], - time=lambda wildcards, attempt: attempt * config['default']['time'] - container: - 'docker://pangenome/odgi:1707818641' - shell: - ''' - odgi paths \ - -i {input} \ - -f > {output} - ''' - -rule faidx: - ''' - samtools faidx - ''' - input: - rules.odgi_paths_fasta.output - output: - config['output'] + '/odgi/paths/fasta/{region}.fa.fai' - resources: - mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'], - time=lambda wildcards, attempt: attempt * config['default']['time'] - container: - 'docker://davidebolo1993/cosigt_workflow:latest' - shell: - ''' - samtools faidx {input} - ''' - - -rule get_proper_region: - input: - rules.faidx.output - output: - config['output'] + '/samtools/bed/{region}.bed' - resources: - mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'], - time=lambda wildcards, attempt: attempt * config['default']['time'] - params: - path=config['path'] - shell: - ''' - grep {params.path} {input} | cut -f 1 | cut -d "#" -f 2 | tr ':' '\t'| tr '-' '\t' > {output} - ''' - -rule samtools_view: - ''' - samtools view to extract the region - ''' - input: - sample=lambda wildcards: glob('resources/alignment/{sample}.*am'.format(sample=wildcards.sample)), - bed=rules.get_proper_region.output - output: - config['output'] + '/samtools/view/{sample}/{region}.bam' - threads: - config['samtools']['threads'] - container: - 'docker://davidebolo1993/cosigt_workflow:latest' - params: - ref=config['reference'], - region='{region}' - resources: - mem_mb=lambda wildcards, attempt: attempt * config['samtools']['mem_mb'], - time=lambda wildcards, attempt: attempt * config['samtools']['time'] - shell: - ''' - samtools view \ - -O bam \ - -o {output} \ - -T {params.ref} \ - -@ {threads} \ - -L {input.bed} \ - -M \ - {input.sample} - ''' - -rule samtools_fasta: - ''' - samtools fasta to get fasta files from .bam - ''' - input: - rules.samtools_view.output - output: - config['output'] + '/samtools/fasta/{sample}/{region}.fasta.gz' - threads: - config['samtools']['threads'] - container: - 'docker://davidebolo1993/cosigt_workflow:latest' - resources: - mem_mb=lambda wildcards, attempt: attempt * config['samtools']['mem_mb'], - time=lambda wildcards, attempt: attempt * config['samtools']['time'] - shell: - ''' - samtools fasta \ - -@ {threads} \ - {input} | pigz > {output} - ''' \ No newline at end of file diff --git a/cosigt_smk/workflow/rules/faidx.smk b/cosigt_smk/workflow/rules/faidx.smk new file mode 100644 index 0000000..ff39c37 --- /dev/null +++ b/cosigt_smk/workflow/rules/faidx.smk @@ -0,0 +1,46 @@ +rule pyfaidx_extract: + ''' + https://github.com/mdshw5/pyfaidx/ + ''' + input: + fasta=config['assemblies'], + bed=rules.bedtools_merge.output + output: + config['output'] + '/pyfaidx/{region}.fasta' + threads: + 1 + resources: + mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'], + time=lambda wildcards, attempt: attempt * config['default']['time'] + container: + 'docker://davidebolo1993/cosigt_workflow:latest' + benchmark: + 'benchmarks/{region}.pyfaidx_extract.benchmark.txt' + shell: + ''' + faidx \ + --bed {input.bed} \ + {input.fasta} > {output} + ''' + +rule samtools_faidx_index: + ''' + https://github.com/samtools/samtools + ''' + input: + rules.pyfaidx_extract.output + output: + config['output'] + '/pyfaidx/{region}.fasta.fai' + threads: + 1 + resources: + mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'], + time=lambda wildcards, attempt: attempt * config['default']['time'] + container: + 'docker://davidebolo1993/cosigt_workflow:latest' + benchmark: + 'benchmarks/{region}.samtools_faidx_index.benchmark.txt' + shell: + ''' + samtools faidx {input} + ''' diff --git a/cosigt_smk/workflow/rules/gafpack.smk b/cosigt_smk/workflow/rules/gafpack.smk index b6a475f..4a9a048 100644 --- a/cosigt_smk/workflow/rules/gafpack.smk +++ b/cosigt_smk/workflow/rules/gafpack.smk @@ -1,10 +1,10 @@ rule gafpack_coverage: ''' - gafpack + https://github.com/ekg/gafpack ''' input: gfa=rules.odgi_view.output, - gaf=rules.gfa_inject.output + gaf=rules.gfainject_inject.output output: config['output'] + '/gafpack/{sample}/{region}.gafpack.gz' threads: @@ -14,6 +14,8 @@ rule gafpack_coverage: time=lambda wildcards, attempt: attempt * config['default']['time'] container: 'docker://davidebolo1993/cosigt_workflow:latest' + benchmark: + 'benchmarks/{sample}.{region}.gafpack_coverage.benchmark.txt' shell: ''' gafpack \ diff --git a/cosigt_smk/workflow/rules/gfainject.smk b/cosigt_smk/workflow/rules/gfainject.smk index 6e912a5..20bafa9 100644 --- a/cosigt_smk/workflow/rules/gfainject.smk +++ b/cosigt_smk/workflow/rules/gfainject.smk @@ -1,10 +1,10 @@ -rule gfa_inject: +rule gfainject_inject: ''' - gfainject + https://github.com/ekg/gfainject ''' input: gfa=rules.odgi_view.output, - bam=rules.bwa_mem2_samtools_sort.output + bam=rules.bwamem2_mem_samtools_sort.output output: config['output'] + '/gfainject/{sample}/{region}.gaf' threads: @@ -14,9 +14,11 @@ rule gfa_inject: time=lambda wildcards, attempt: attempt * config['default']['time'] container: 'docker://davidebolo1993/cosigt_workflow:latest' - params: - + benchmark: + 'benchmarks/{sample}.{region}.gfainject_inject.benchmark.txt' shell: ''' - gfainject --gfa {input.gfa} --bam {input.bam} > {output} + gfainject \ + --gfa {input.gfa} \ + --bam {input.bam} > {output} ''' \ No newline at end of file diff --git a/cosigt_smk/workflow/rules/impg.smk b/cosigt_smk/workflow/rules/impg.smk new file mode 100644 index 0000000..fb4382a --- /dev/null +++ b/cosigt_smk/workflow/rules/impg.smk @@ -0,0 +1,30 @@ +rule impg_project: + ''' + https://github.com/ekg/impg + ''' + input: + paf=config['paf'], + region=lambda wildcards: glob('resources/regions/{region}.bed'.format(region=wildcards.region)) + output: + config['output'] + '/impg/{region}.bed' + threads: + 1 + resources: + mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'], + time=lambda wildcards, attempt: attempt * config['default']['time'] + container: + 'docker://davidebolo1993/cosigt_workflow:latest' + benchmark: + 'benchmarks/{region}.impg_project.benchmark.txt' + params: + exclude='resources/extra/blacklist.txt' + shell: + ''' + impg \ + -p {input.paf} \ + -b {input.region} \ + -I | \ + cut -f 1-3 |\ + grep -v \ + -f {params.exclude} > {output} + ''' \ No newline at end of file diff --git a/cosigt_smk/workflow/rules/odgi.smk b/cosigt_smk/workflow/rules/odgi.smk index d52d2ce..3e58214 100644 --- a/cosigt_smk/workflow/rules/odgi.smk +++ b/cosigt_smk/workflow/rules/odgi.smk @@ -1,9 +1,9 @@ rule odgi_chop: ''' - odgi chop + https://github.com/pangenome/odgi ''' input: - rules.pggb.output + rules.pggb_construct.output output: config['output'] + '/odgi/chop/{region}.og' threads: @@ -13,6 +13,8 @@ rule odgi_chop: time=lambda wildcards, attempt: attempt * config['default']['time'] container: 'docker://pangenome/odgi:1707818641' + benchmark: + 'benchmarks/{region}.odgi_chop.benchmark.txt' shell: ''' odgi chop \ @@ -23,7 +25,7 @@ rule odgi_chop: rule odgi_view: ''' - odgi view + https://github.com/pangenome/odgi ''' input: rules.odgi_chop.output @@ -36,6 +38,8 @@ rule odgi_view: time=lambda wildcards, attempt: attempt * config['default']['time'] container: 'docker://pangenome/odgi:1707818641' + benchmark: + 'benchmarks/{region}.odgi_view.benchmark.txt' shell: ''' odgi view \ @@ -45,7 +49,7 @@ rule odgi_view: rule odgi_paths_matrix: ''' - odgi build non-binary haplotype matrix + https://github.com/pangenome/odgi ''' input: rules.odgi_chop.output @@ -58,19 +62,22 @@ rule odgi_paths_matrix: time=lambda wildcards, attempt: attempt * config['default']['time'] container: 'docker://pangenome/odgi:1707818641' + benchmark: + 'benchmarks/{region}.odgi_paths_matrix.benchmark.txt' shell: ''' odgi paths \ -i {input} \ - -H | cut -f 1,4- | gzip > {output} + -H | \ + cut -f 1,4- | gzip > {output} ''' rule odgi_similarity: ''' - odgi similarity + https://github.com/pangenome/odgi ''' input: - rules.pggb.output + rules.pggb_construct.output output: config['output'] + '/odgi/similarity/{region}.tsv' threads: @@ -80,15 +87,17 @@ rule odgi_similarity: time=lambda wildcards, attempt: attempt * config['default']['time'] container: 'docker://pangenome/odgi:1707818641' + benchmark: + 'benchmarks/{region}.odgi_similarity.benchmark.txt' shell: ''' odgi similarity \ -i {input} > {output} ''' -rule cluster: +rule make_clusters: ''' - cluster + https://github.com/davidebolo1993/cosigt ''' input: rules.odgi_similarity.output @@ -101,6 +110,8 @@ rule cluster: time=lambda wildcards, attempt: attempt * config['default']['time'] container: 'docker://davidebolo1993/cosigt_workflow:latest' + benchmark: + 'benchmarks/{region}.make_clusters.benchmark.txt' params: prefix=config['output'] + '/cluster/{region}' shell: diff --git a/cosigt_smk/workflow/rules/pggb.smk b/cosigt_smk/workflow/rules/pggb.smk index ebb9678..afdfca8 100644 --- a/cosigt_smk/workflow/rules/pggb.smk +++ b/cosigt_smk/workflow/rules/pggb.smk @@ -1,10 +1,10 @@ -rule pggb: +rule pggb_construct: ''' - pggb + https://github.com/pangenome/pggb ''' input: - fasta=rules.odgi_paths_fasta.output, - index=rules.faidx.output + fasta=rules.pyfaidx_extract.output, + index=rules.samtools_faidx_index.output output: config['output'] + '/pggb/{region}.og' threads: @@ -14,6 +14,8 @@ rule pggb: time=lambda wildcards, attempt: attempt * config['pggb']['time'] container: 'docker://pangenome/pggb:latest' + benchmark: + 'benchmarks/{region}.pggb_construct.benchmark.txt' params: prefix=config['output'] + '/pggb/{region}', flags=config['pggb']['params'] diff --git a/cosigt_smk/workflow/rules/samtools.smk b/cosigt_smk/workflow/rules/samtools.smk new file mode 100644 index 0000000..f1ae0c1 --- /dev/null +++ b/cosigt_smk/workflow/rules/samtools.smk @@ -0,0 +1,82 @@ +rule make_reference_bed: + ''' + https://github.com/davidebolo1993/cosigt + ''' + input: + rules.impg_project.output + output: + config['output'] + '/samtools/bed/{region}.bed' + threads: + 1 + resources: + mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'], + time=lambda wildcards, attempt: attempt * config['default']['time'] + benchmark: + 'benchmarks/{region}.make_reference_bed.benchmark.txt' + params: + path=config['path'] + shell: + ''' + grep {params.path} \ + {input} | \ + sed 's/#/\t/' | \ + rev | \ + cut -f 1-3 | \ + rev > {output} + ''' + +rule samtools_view: + ''' + https://github.com/samtools/samtools + ''' + input: + sample=lambda wildcards: glob('resources/alignments/{sample}.*am'.format(sample=wildcards.sample)), + bed=rules.make_reference_bed.output + output: + config['output'] + '/samtools/view/{sample}/{region}.bam' + threads: + config['samtools']['threads'] + resources: + mem_mb=lambda wildcards, attempt: attempt * config['samtools']['mem_mb'], + time=lambda wildcards, attempt: attempt * config['samtools']['time'] + container: + 'docker://davidebolo1993/cosigt_workflow:latest' + benchmark: + 'benchmarks/{sample}.{region}.samtools_view.benchmark.txt' + params: + ref=config['reference'] + shell: + ''' + samtools view \ + -O bam \ + -o {output} \ + -T {params.ref} \ + -@ {threads} \ + -L {input.bed} \ + -M \ + {input.sample} + ''' + +rule samtools_fasta: + ''' + https://github.com/samtools/samtools + ''' + input: + rules.samtools_view.output + output: + config['output'] + '/samtools/fasta/{sample}/{region}.fasta.gz' + threads: + config['samtools']['threads'] + resources: + mem_mb=lambda wildcards, attempt: attempt * config['samtools']['mem_mb'], + time=lambda wildcards, attempt: attempt * config['samtools']['time'] + container: + 'docker://davidebolo1993/cosigt_workflow:latest' + benchmark: + 'benchmarks/{sample}.{region}.samtools_fasta.benchmark.txt' + shell: + ''' + samtools fasta \ + -@ {threads} \ + {input} | pigz > {output} + ''' \ No newline at end of file diff --git a/cosigt_smk/workflow/scripts/organize.py b/cosigt_smk/workflow/scripts/organize.py index 937ab4e..c7f7651 100644 --- a/cosigt_smk/workflow/scripts/organize.py +++ b/cosigt_smk/workflow/scripts/organize.py @@ -11,7 +11,7 @@ class CustomFormat(HelpFormatter): ''' - custo help format + custom help format ''' def _format_action_invocation(self, action): @@ -49,11 +49,6 @@ def _get_default_metavar_for_optional(self, action): return action.dest.upper() - -def double_quote(time): - - return '"%s"' % time - def default_parameters(args): d=dict() @@ -75,7 +70,7 @@ def default_parameters(args): d['minimap2']['threads'] = args.aln_threads d['minimap2']['mem_mb'] = args.aln_memory d['minimap2']['time'] = args.aln_time - d['minimap2']['preset'] = args.aln_preset + d['minimap2']['preset'] = args.aln_preset #samtools d['samtools']=dict() @@ -83,12 +78,6 @@ def default_parameters(args): d['samtools']['mem_mb'] = args.sam_memory d['samtools']['time'] = args.sam_time - #odgi - d['odgi']=dict() - d['odgi']['threads'] = args.odgi_threads - d['odgi']['mem_mb'] = args.odgi_memory - d['odgi']['time'] = args.odgi_time - #pggb d['pggb']=dict() d['pggb']['threads'] = args.pggb_threads @@ -117,19 +106,18 @@ def main(): required = parser.add_argument_group('Required I/O arguments') - required.add_argument('-r','--reference', help='reference genome in FASTA format', metavar='FASTA', required=True) - required.add_argument('--gfa', help='variation graph in .gfa format', metavar='GFA', required=True) - required.add_argument('-a', '--alignment', help='folder with alignment files (bam,cram) - and their index - to use', metavar='FOLDER', required=True) - required.add_argument('--roi', help='one or more regions of interest in BED format', metavar='BED', required=True) - + required.add_argument('-a', '--alignments', help='folder with read-level alignment files (BAM,CRAM) - and their indexes (BAI/CSI,CRAI) - of the individuals to genotype', metavar='FOLDER', required=True) + required.add_argument('-r','--reference', help='reference FASTA file - the same the individuals to genotype are aligned to', metavar='FASTA', required=True) + required.add_argument('--fasta', help='chromosome-level assemblies in FASTA format', metavar='FASTA', required=True) + required.add_argument('--paf', help='pairwise alignment (PAF format) of the assemblies provided with --fasta', metavar='PAF', required=True) + required.add_argument('--roi', help='one or more regions of interest in BED format - first column is the assembly to use as reference (PanSN format, # delimiter)', metavar='BED', required=True) additional = parser.add_argument_group('Additional I/O arguments') - additional.add_argument('--blacklist', help='blacklist of samples (one per line) that should not be included in the analysis [None]', metavar='', required=False, default=None) - additional.add_argument('--path', help='path name in the pangenome graph to be used as a reference [grch38]',type=str, default='grch38') + + additional.add_argument('--blacklist', help='assemblies (one per line) that should not be included in the analysis [None]', metavar='', required=False, default=None) additional.add_argument('--binds', help='additional paths to bind for singularity in /path/1,/path/2 format [/localscratch]', type=str, default='/localscratch') additional.add_argument('--output', help='output folder [results]', metavar='FOLDER', default='results') - metrics = parser.add_argument_group('Specify #threads, memory and time requirements') #default @@ -137,27 +125,21 @@ def main(): metrics.add_argument('--std_memory', help='memory (mb) - default [1000]',type=int, default=1000) #alignment - metrics.add_argument('--aln_threads', help='threads - aligner [5]',type=int, default=5) + metrics.add_argument('--aln_threads', help='# threads - aligner [5]',type=int, default=5) metrics.add_argument('--aln_time', help='max time (minutes) - aligner [2]',type=int, default=5) metrics.add_argument('--aln_memory', help='max memory (mb) - aligner [5000]',type=int, default=10000) - metrics.add_argument('--aln_preset', help='long-read preset for minimap2 [map-ont] - ignore if not using the "long" branch of cosigt',type=str, default='map-ont') + metrics.add_argument('--aln_preset', help='preset for minimap2 [map-ont] - ignore if not using the long branch of cosigt', type=str, default='map-ont') - #samtools extraction and sort - metrics.add_argument('--sam_threads', help='threads - samtools (view) commands [2]',type=int, default=2) - metrics.add_argument('--sam_time', help='max time (minutes) - samtools (view) commands [5]',type=int, default=5) - metrics.add_argument('--sam_memory', help='max memory (mb) - samtools (view) commands [5000]',type=int, default=5000) + #samtools + metrics.add_argument('--sam_threads', help='# threads - samtools (view) [2]',type=int, default=2) + metrics.add_argument('--sam_time', help='max time (minutes) - samtools (view) [5]',type=int, default=5) + metrics.add_argument('--sam_memory', help='max memory (mb) - samtools (view) [5000]',type=int, default=5000) #pggb - metrics.add_argument('--pggb_threads', help='threads - pggb command [24]',type=int, default=24) - metrics.add_argument('--pggb_time', help='max time (minutes) - pggb commands [35]',type=int, default=35) - metrics.add_argument('--pggb_memory', help='max memory (mb) - pggb commands [30000]',type=int, default=30000) - metrics.add_argument('--pggb_params', help='additional parameters for pggb [-c 2 -n 94]',type=str, default='-c 2 -n 94') - - - #odgi build and extract - metrics.add_argument('--odgi_threads', help='threads - odgi (build/extract) commands [10]',type=int, default=10) - metrics.add_argument('--odgi_time', help='max time (minutes) - odgi (build/extract) commands [20]',type=int, default=20) - metrics.add_argument('--odgi_memory', help='max memory (mb) - odgi (build/extract) commands [20000]',type=int, default=20000) + metrics.add_argument('--pggb_threads', help='# threads - pggb [24]',type=int, default=24) + metrics.add_argument('--pggb_time', help='max time (minutes) - pggb [35]',type=int, default=35) + metrics.add_argument('--pggb_memory', help='max memory (mb) - pggb [30000]',type=int, default=30000) + metrics.add_argument('--pggb_params', help='additional parameters for pggb [-c 2]',type=str, default='-c 2') args = parser.parse_args() @@ -171,18 +153,19 @@ def main(): #config out_config_path=os.path.join(wd,'config') + os.makedirs(out_config_path,exist_ok=True) out_yaml_tmp=os.path.join(out_config_path, 'config.yaml.tmp') out_yaml=os.path.join(out_config_path, 'config.yaml') out_samples=os.path.join(out_config_path, 'samples.tsv') #resources out_resources=os.path.join(wd,'resources') - out_aln=os.path.join(out_resources, 'alignment') + out_aln=os.path.join(out_resources, 'alignments') os.makedirs(out_aln, exist_ok=True) - #out_agc=os.path.join(out_resources, 'agc') - #os.makedirs(out_agc, exist_ok=True) - out_gfa=os.path.join(out_resources, 'graph') - os.makedirs(out_gfa, exist_ok=True) + out_fasta=os.path.join(out_resources, 'assemblies') + os.makedirs(out_fasta, exist_ok=True) + out_paf=os.path.join(out_resources, 'paf') + os.makedirs(out_paf, exist_ok=True) out_ref=os.path.join(out_resources, 'reference') os.makedirs(out_ref, exist_ok=True) out_extra=os.path.join(out_resources, 'extra') @@ -191,8 +174,6 @@ def main(): out_regions=os.path.join(out_resources, 'regions') os.makedirs(out_regions,exist_ok=True) - #sing - out_sing=os.path.join(wd, 'singularity_bind_paths.csv') #blacklist of samples to exclude blcklst=[] @@ -211,24 +192,20 @@ def main(): open(blcklst_out, 'w').close() #symlink alignments - alns=sorted(glob.glob(args.alignment+'/*am*')) + alns=sorted(glob.glob(args.alignments+'/*am*')) with open(out_samples, 'w') as samples_out: - samples_out.write('sample_id\talignment\n') + samples_out.write('sample_id\talignments\n') for aln in alns: - if any(x.lower() in aln.lower() for x in blcklst): - - continue #skip blacklisted - bnaln=os.path.basename(aln) out_aln_file=os.path.join(out_aln, bnaln) try: - os.symlink(os.path.abspath(aln), out_aln_file) + os.symlink(os.path.abspath(aln), out_aln_file) #error out if this exists except: @@ -242,20 +219,34 @@ def main(): #add to config d['samples'] = out_samples - #symlink gfa - out_gfa_file=os.path.join(out_gfa, os.path.basename(args.gfa)) + #symlink paf + out_paf_file=os.path.join(out_paf, os.path.basename(args.paf)) try: - os.symlink(os.path.abspath(args.gfa), out_gfa_file) + os.symlink(os.path.abspath(args.paf), out_paf_file) except: pass #add to config - d['graph'] = out_gfa_file + d['paf'] = out_paf_file + + #symlink assemblies + + out_assemblies_file=os.path.join(out_fasta, os.path.basename(args.fasta)) + + try: + os.symlink(os.path.abspath(args.fasta), out_assemblies_file) + + except: + + pass + + #add to config + d['assemblies'] = out_assemblies_file #symlink reference out_reference_file=os.path.join(out_ref, os.path.basename(args.reference)) @@ -270,18 +261,19 @@ def main(): #add to config d['reference'] = out_reference_file - d['path'] = args.path #add to config d['region'] = list() - + d['path'] = '' + with open(args.roi) as bed_in: for line in bed_in: l=line.rstrip().split('\t') - region=l[0] + '_' + l[1] + '_' + l[2] + region=l[0].replace('#','_') + '_' + l[1] + '_' + l[2] + d['path'] = l[0] if d['path'] == '' else d['path'] #put regions in the config fille d['region'].append(region) @@ -291,7 +283,7 @@ def main(): with open(region_out, 'w') as out_region: - out_region.write(args.path +'#'+ l[0] + '\t' + l[1] + '\t' + l[2] + '\t' + region+'\n') + out_region.write(l[0] + '\t' + l[1] + '\t' + l[2]+'\n') #dump config @@ -310,7 +302,7 @@ def main(): os.remove(out_yaml_tmp) #write command - singpath=','.join([os.path.abspath(args.alignment),os.path.dirname(os.path.abspath(args.gfa)),os.path.dirname(os.path.abspath(args.reference)),args.binds]) + singpath=','.join(list(set([os.path.abspath(args.alignments),os.path.dirname(os.path.abspath(args.paf)),os.path.dirname(os.path.abspath(args.fasta)), os.path.dirname(os.path.abspath(args.reference)),args.binds]))) command_out=' '.join(['snakemake --profile config/slurm --singularity-args "-B '+ singpath + '" cosigt']) with open('snakemake.run.sh', 'w') as out: diff --git a/cosigt_smk/workflow/scripts/plotbenchmark.r b/cosigt_smk/workflow/scripts/plotbenchmark.r new file mode 100644 index 0000000..d64ec71 --- /dev/null +++ b/cosigt_smk/workflow/scripts/plotbenchmark.r @@ -0,0 +1,58 @@ +#!/usr/bin/Rscript + +library(ggplot2) + + +get_method <- function(x) { + + y<-tail(unlist(strsplit(x, ".", fixed=T)),3)[1] + +} + +get_region <- function(x) { + + y<-tail(unlist(strsplit(x, ".", fixed=T)),4)[1] + z<-tail(unlist(strsplit(y, "/")),1) + +} + +files<-sort(list.files("benchmarks", pattern=".benchmark.txt", full.names=T)) +method<-unique(do.call(c,lapply(files, get_method))) +region<-unique(do.call(c,lapply(files, get_region))) +benchmark_list<-list() + +l<-1 + +for (m in method) { + + for (r in region) { + + f1<-files[grep(m,files)] + f2<-f1[grep(r,f1)] + values<-c() + + for (i in c(1:length(f2))) { + + df<-data.table::fread(f2[i], header=T) + values<-c(values,df$s) + + } + + benchmark_list[[l]]<-data.frame(region=r,method=m,values=values) + l<-l+1 + + } + +} + + +benchmark_df<-do.call(rbind,benchmark_list) +p<-ggplot(benchmark_df, aes(x=method,y=values, fill=region)) + + geom_boxplot() + + theme_bw() + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), legend.position="top")+ + scale_y_log10(breaks=c(.1,1,10,100,1000)) + + labs(x="step", y="runtime (s)") + + +ggsave("benchmark.pdf", p, width=20) From f7aa0f47b6aa08ea01a8aabb7b436a4b569352b3 Mon Sep 17 00:00:00 2001 From: davidebolo1993 Date: Wed, 24 Apr 2024 14:18:45 +0200 Subject: [PATCH 2/4] mod bench --- cosigt_smk/benchmarks/.gitignore | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 cosigt_smk/benchmarks/.gitignore diff --git a/cosigt_smk/benchmarks/.gitignore b/cosigt_smk/benchmarks/.gitignore deleted file mode 100644 index e69de29..0000000 From 31e41ad3fdadfd8ee918bc98bedad62fcd145294 Mon Sep 17 00:00:00 2001 From: davidebolo1993 Date: Wed, 24 Apr 2024 14:19:41 +0200 Subject: [PATCH 3/4] mod .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 34c9e32..2d5c689 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,5 @@ cosigt_smk/.snakemake/* cosigt_smk/workflow/scripts/__pycache__ cosigt_smk/snakemake.run.sh cosigt_smk/config/* +cosigt_smk/benchmarks/* cosigt_smk/.* From e6583f7f3f2997852d07fb68388ea1a51f664338 Mon Sep 17 00:00:00 2001 From: davidebolo1993 Date: Fri, 26 Apr 2024 10:17:58 +0200 Subject: [PATCH 4/4] apply changes to impg --- cosigt_smk/workflow/rules/impg.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cosigt_smk/workflow/rules/impg.smk b/cosigt_smk/workflow/rules/impg.smk index fb4382a..0597152 100644 --- a/cosigt_smk/workflow/rules/impg.smk +++ b/cosigt_smk/workflow/rules/impg.smk @@ -23,7 +23,7 @@ rule impg_project: impg \ -p {input.paf} \ -b {input.region} \ - -I | \ + -x \ cut -f 1-3 |\ grep -v \ -f {params.exclude} > {output}