diff --git a/Dockerfile b/Dockerfile index f682917..b5069a3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -58,19 +58,6 @@ RUN cargo install --locked maturin #install numpy RUN pip3 install numpy -#pgr-tk installation -RUN wget https://github.com/GeneDx/pgr-tk/releases/download/v0.5.1/pgr-tk-v0.5.1.zip \ - && unzip pgr-tk-v0.5.1.zip \ - && rm pgr-tk-v0.5.1.zip \ - && chmod +x release/* \ - && chmod +x wheels/* \ - && pip3 install wheels/pgrtk-0.5.1-cp310-cp310-linux_x86_64.whl - -ENV PATH /opt/release:$PATH - -#install wheels - - #ln python to python3 -not used right now but, who knows? RUN ln -s /usr/bin/python3 /usr/bin/python diff --git a/cosigt.go b/cosigt.go index f6c22b0..a62efcd 100644 --- a/cosigt.go +++ b/cosigt.go @@ -11,35 +11,18 @@ import ( "sort" "strconv" "strings" + "path" + "github.com/akamensky/argparse" "gonum.org/v1/gonum/stat/combin" ) -//all these functions assume no errors in input files -func GetCosineSimilarity(A []float64, B []float64) float64 { - - var eucl_magn, dot_product, similarity float64 - - eucl_magn = GetMagnitude(A, B) - - if eucl_magn > 0 { - - dot_product = GetDotProduct(A, B) - similarity = dot_product / eucl_magn - - } else { - - similarity = 0 - - } - - return similarity -} func GetMagnitude(A []float64, B []float64) float64 { var A_len, B_len float64 + A_len = 0 B_len = 0 @@ -66,9 +49,42 @@ func GetDotProduct(A []float64, B []float64) float64 { return dot_product } + +func GetCosineSimilarity(A []float64, B []float64) float64 { + + /* + Simple calculation of cosine similarity + */ + + + var eucl_magn, dot_product, similarity float64 + + eucl_magn = GetMagnitude(A, B) + + if eucl_magn > 0 { + + dot_product = GetDotProduct(A, B) + similarity = dot_product / eucl_magn + + } else { + + similarity = 0 + + } + + return similarity +} + + + func ReadBlacklist(s string) []string { - ids := make([]string, 0, 10) //don't want to re-allocate too many times + /* + Samples we may want to exclude, one-per-line + */ + + + ids := make([]string, 0, 100) f, _ := os.Open(s) defer f.Close() @@ -87,6 +103,10 @@ func ReadBlacklist(s string) []string { func ReadGz(s string) ([]string, [][]float64) { + /* + Read the gzip-compressed files + */ + cov := make([][]float64, 0, 1000) id := make([]string, 0, 1000) @@ -124,6 +144,7 @@ func ReadGz(s string) ([]string, [][]float64) { id = append(id, rec[0]) } + return id, cov } @@ -176,22 +197,39 @@ func SumSlices(a []float64, b []float64) []float64 { func main() { - //I/O files + /* + Parsing of arguments with argparse + */ + + parser := argparse.NewParser("cosigt", "genotyping loci in pangenome graphs using cosine distance") + + p := parser.String("p", "paths", &argparse.Options{Required: true, Help: "gzip-compressed tsv file with path names and node coverages from odgi paths"}) + g := parser.String("g", "gaf", &argparse.Options{Required: true, Help: "gzip-compressed gaf (graph alignment format) file for a sample from gafpack"}) + b := parser.String("b", "blacklist", &argparse.Options{Required: false, Help: "txt file with names of paths to exclude (one per line)"}) + o := parser.String("o", "output", &argparse.Options{Required: true, Help: "folder prefix for output files"}) + + err := parser.Parse(os.Args) + + if err != nil { + + fmt.Print(parser.Usage(err)) + os.Exit(1) + + } //read first table - hapid, gcov := ReadGz(os.Args[1]) + hapid, gcov := ReadGz(*p) //read second table - smpl, bcov := ReadGz(os.Args[2]) + smpl, bcov := ReadGz(*g) //read blacklist - it can be empty or not - blck := ReadBlacklist(os.Args[3]) - //trace combinations we already use + blck := ReadBlacklist(*b) + //trace combinations we have already seen seen:=make(map[int]bool) - //store results in map m := make(map[string]float64) - + //generate all possible diploid combination n := len(hapid) - k := 2 + k := 2 //fixed ploidy - this can be however adjusted gen := combin.NewCombinationGenerator(n, k) for gen.Next() { @@ -204,8 +242,6 @@ func main() { indiv := (hapid[h1] + "-" + hapid[h2]) m[indiv] = GetCosineSimilarity(sum, bcov[0]) - //comment the following if we don't need to add homo - _,ok:=seen[h1] if !ok { @@ -228,7 +264,6 @@ func main() { } - //comment above if we don't need to add homo } @@ -250,12 +285,19 @@ func main() { }) //write combinations - _ = os.Mkdir(os.Args[4], os.ModePerm) - WriteMap(m, os.Args[4]+"/combos.tsv") + outpath:=path.Clean(*o) + _ = os.Mkdir(outpath, os.ModePerm) + + combos:=path.Clean(outpath + "/combos.tsv") + WriteMap(m, combos) //write best score - f, _ := os.Create(os.Args[4] + "/best_genotype.tsv") + best:=path.Clean(outpath + "/best_genotype.tsv") + f, _ := os.Create(best) defer f.Close() result := fmt.Sprintf("#sample\tbest_genotype\tbest_score\n%s\t%s\t%.16f\n", smpl[0], keys[0], m[keys[0]]) - _, _ = f.WriteString(result) + _, _ = f.WriteString(result) + + //all done + } diff --git a/cosigt_smk/config/slurm/config.yaml b/cosigt_smk/config/slurm/config.yaml index c04c6bf..0fac503 100644 --- a/cosigt_smk/config/slurm/config.yaml +++ b/cosigt_smk/config/slurm/config.yaml @@ -11,13 +11,13 @@ cluster: default-resources: - partition=cpuq - mem_mb=1000 - - time="00:10:00" + - time=10 - gpu=0 max-jobs-per-second: 10 max-status-checks-per-second: 20 local-cores: 1 latency-wait: 90 -jobs: 600 +jobs: 100 keep-going: True rerun-incomplete: True printshellcmds: True @@ -26,3 +26,4 @@ use-conda: True use-singularity: True verbose: True reason: True +retries: 4 diff --git a/cosigt_smk/logs/.gitignore b/cosigt_smk/logs/.gitignore deleted file mode 100644 index e69de29..0000000 diff --git a/cosigt_smk/snakemake.run.sh b/cosigt_smk/snakemake.run.sh new file mode 100644 index 0000000..81cd8f3 --- /dev/null +++ b/cosigt_smk/snakemake.run.sh @@ -0,0 +1 @@ +snakemake --profile config/slurm --singularity-args "-B /group/soranzo/davide.bolognini/working/crams,/group/soranzo/davide.bolognini/working/graph,/ssu/gassu/reference_data/human_genomes/GRCh38,/localscratch" cosigt diff --git a/cosigt_smk/workflow/Snakefile b/cosigt_smk/workflow/Snakefile index f6d37e4..d49f60b 100644 --- a/cosigt_smk/workflow/Snakefile +++ b/cosigt_smk/workflow/Snakefile @@ -7,21 +7,14 @@ df=(pd.read_table(config['samples'], dtype={'sample_id': str, 'cram':str}) .sort_index() ) -include: 'rules/pgrtk.smk' -include: 'rules/pggb.smk' include: 'rules/extract.smk' +include: 'rules/pggb.smk' include: 'rules/odgi.smk' include: 'rules/bwa-mem2.smk' include: 'rules/gfainject.smk' include: 'rules/gafpack.smk' include: 'rules/cosigt.smk' -include: 'rules/evaluate.smk' rule cosigt: input: - expand('results/cosigt/{sample}/{region}/best_genotype.tsv', sample=df['sample_id'].tolist(), region=config['region']), - -rule evaluation: - input: - expand('results/cosigt/evaluation/{region}/evaluation.pdf', region=config['region']), - expand('results/cosigt/evaluation/{region}/evaluation.jaccard.pdf', region=config['region']) \ No newline at end of file + expand('results/cosigt/{sample}/{region}/best_genotype.tsv', sample=df['sample_id'].tolist(), region=config['region']) diff --git a/cosigt_smk/workflow/rules/bwa-mem2.smk b/cosigt_smk/workflow/rules/bwa-mem2.smk index d0c697d..6ec9e79 100644 --- a/cosigt_smk/workflow/rules/bwa-mem2.smk +++ b/cosigt_smk/workflow/rules/bwa-mem2.smk @@ -3,9 +3,9 @@ rule bwa_mem2_index: bwa index ''' input: - rules.pgrtk_filter.output + rules.odgi_paths_fasta.output output: - multiext('results/pgrtk/fasta/{region}.fa', '.bwt.2bit.64', '.pac', '.ann', '.amb', '.0123') + multiext('results/odgi/paths/fasta/{region}.fa', '.bwt.2bit.64', '.pac', '.ann', '.amb', '.0123') threads: 1 resources: @@ -23,7 +23,7 @@ rule bwa_mem2_samtools_sort: bwa-mem2 and sam-to-bam conversion with samtools ''' input: - ref=rules.pgrtk_filter.output, + ref=rules.odgi_paths_fasta.output, idx=rules.bwa_mem2_index.output, sample=rules.samtools_fasta.output output: diff --git a/cosigt_smk/workflow/rules/cosigt.smk b/cosigt_smk/workflow/rules/cosigt.smk index 4b14cf7..170a1d3 100644 --- a/cosigt_smk/workflow/rules/cosigt.smk +++ b/cosigt_smk/workflow/rules/cosigt.smk @@ -16,6 +16,6 @@ rule cosigt_genotype: prefix='results/cosigt/{sample}/{region}' shell: ''' - cosigt {input.zpath} {input.xpack} resources/extra/bad_samples.txt {params.prefix} + cosigt -p {input.zpath} -g {input.xpack} -b resources/extra/blacklist.txt -o {params.prefix} ''' diff --git a/cosigt_smk/workflow/rules/extract.smk b/cosigt_smk/workflow/rules/extract.smk index 26c81c8..9c7bada 100644 --- a/cosigt_smk/workflow/rules/extract.smk +++ b/cosigt_smk/workflow/rules/extract.smk @@ -1,4 +1,93 @@ from glob import glob +import os + +rule odgi_build: + ''' + odgi build + ''' + input: + config['graph'] + output: + 'results/odgi/build/' + os.path.basename(config['graph'].replace('.gfa', '.og')) + threads: + config['odgi']['threads'] + resources: + mem_mb=lambda wildcards, attempt: attempt * config['odgi']['mem_mb'], + time=lambda wildcards, attempt: attempt * config['odgi']['time'] + container: + 'docker://pangenome/odgi:1707818641' + shell: + ''' + odgi build \ + -g {input} \ + -o {output} \ + -O \ + -t {threads} + ''' + +rule odgi_extract: + ''' + odgi extract + ''' + input: + graph=rules.odgi_build.output, + region=lambda wildcards: glob('resources/regions/{region}.bed'.format(region=wildcards.region)) + output: + 'results/odgi/extract/{region}.og' + threads: + config['odgi']['threads'] + resources: + mem_mb=lambda wildcards, attempt: attempt * config['odgi']['mem_mb'], + time=lambda wildcards, attempt: attempt * config['odgi']['time'] + container: + 'docker://pangenome/odgi:1707818641' + shell: + ''' + odgi extract \ + -i {input.graph} \ + -b {input.region} \ + -t {threads} \ + -O \ + -o {output} + ''' + + +rule odgi_paths_fasta: + ''' + odgi paths + ''' + input: + rules.odgi_extract.output + output: + 'results/odgi/paths/fasta/{region}.fa' + threads: + 1 + container: + 'docker://pangenome/odgi:1707818641' + shell: + ''' + odgi paths \ + -i {input} \ + -f > {output} + ''' + +rule faidx: + ''' + samtools faidx + ''' + input: + rules.odgi_paths_fasta.output + output: + 'results/odgi/paths/fasta/{region}.fa.fai' + threads: + 1 + container: + 'docker://davidebolo1993/graph_genotyper:latest' + shell: + ''' + samtools faidx {input} + ''' + rule get_proper_region: input: @@ -11,7 +100,7 @@ rule get_proper_region: path=config['path'] shell: ''' - grep {params.path} {input} | cut -f 1 | cut -d "_" -f 1,3-4 | tr '_' '\t' > {output} + grep {params.path} {input} | cut -f 1 | cut -d "#" -f 2 | tr ':' '\t'| tr '-' '\t' > {output} ''' rule samtools_view: @@ -31,8 +120,8 @@ rule samtools_view: ref=config['reference'], region='{region}' resources: - mem_mb=config['samtools']['mem_mb'], - time=config['samtools']['time'] + mem_mb=lambda wildcards, attempt: attempt * config['samtools']['mem_mb'], + time=lambda wildcards, attempt: attempt * config['samtools']['time'] shell: ''' samtools view \ @@ -42,7 +131,7 @@ rule samtools_view: -@ {threads} \ -L {input.bed} \ -M \ - {input.sample} \ + {input.sample} ''' rule samtools_fasta: @@ -58,8 +147,8 @@ rule samtools_fasta: container: 'docker://davidebolo1993/graph_genotyper:latest' resources: - mem_mb=config['samtools']['mem_mb'], - time=config['samtools']['time'] + mem_mb=lambda wildcards, attempt: attempt * config['samtools']['mem_mb'], + time=lambda wildcards, attempt: attempt * config['samtools']['time'] shell: ''' samtools fasta \ diff --git a/cosigt_smk/workflow/rules/odgi.smk b/cosigt_smk/workflow/rules/odgi.smk index 17331e7..4224e01 100644 --- a/cosigt_smk/workflow/rules/odgi.smk +++ b/cosigt_smk/workflow/rules/odgi.smk @@ -1,5 +1,3 @@ -import os - rule odgi_chop: ''' odgi chop @@ -11,7 +9,7 @@ rule odgi_chop: threads: 1 container: - 'docker://pangenome/odgi:1689324294' + 'docker://pangenome/odgi:1707818641' shell: ''' odgi chop \ @@ -31,7 +29,7 @@ rule odgi_view: threads: 1 container: - 'docker://pangenome/odgi:1689324294' + 'docker://pangenome/odgi:1707818641' shell: ''' odgi view \ @@ -50,7 +48,7 @@ rule odgi_paths_matrix: threads: 1 container: - 'docker://pangenome/odgi:1689324294' + 'docker://pangenome/odgi:1707818641' shell: ''' odgi paths \ @@ -69,7 +67,7 @@ rule odgi_similarity: threads: 1 container: - 'docker://pangenome/odgi:1689324294' + 'docker://pangenome/odgi:1707818641' shell: ''' odgi similarity \ diff --git a/cosigt_smk/workflow/rules/pggb.smk b/cosigt_smk/workflow/rules/pggb.smk index be6265b..cd9d2d2 100644 --- a/cosigt_smk/workflow/rules/pggb.smk +++ b/cosigt_smk/workflow/rules/pggb.smk @@ -3,7 +3,7 @@ rule pggb: pggb ''' input: - fasta=rules.pgrtk_filter.output, + fasta=rules.odgi_paths_fasta.output, index=rules.faidx.output output: 'results/pggb/{region}.og' @@ -12,8 +12,8 @@ rule pggb: container: 'docker://pangenome/pggb:latest' resources: - mem_mb=config['pggb']['mem_mb'], - time=config['pggb']['time'] + mem_mb=lambda wildcards, attempt: attempt * config['pggb']['mem_mb'], + time=lambda wildcards, attempt: attempt * config['pggb']['time'] params: prefix='results/pggb/{region}' shell: @@ -22,11 +22,7 @@ rule pggb: -i {input.fasta} \ -o {params.prefix} \ -t {threads} \ - -n 200 \ - -S \ - -A \ - -p 90 \ - -s 5k \ - -k 419 \ + -n 94 \ + -c 2 \ && mv {params.prefix}/*smooth.final.og {output} ''' diff --git a/cosigt_smk/workflow/rules/pgrtk.smk b/cosigt_smk/workflow/rules/pgrtk.smk deleted file mode 100644 index 491fc08..0000000 --- a/cosigt_smk/workflow/rules/pgrtk.smk +++ /dev/null @@ -1,105 +0,0 @@ -#rule pgrtk_get_seq: -# ''' -# run pgrtk api - deprecating in favour of cli, but keeping this here just in case want to re-introduce at some point -# ''' -# input: -# agc=config['agc'], -# region=lambda wildcards: glob('resources/regions/{region}.bed'.format(region=wildcards.region)) -# output: -# 'results/pgrtk/{region}.fa' -# threads: -# config['pgrtk']['threads'] -# container: -# 'docker://davidebolo1993/graph_genotyper:latest' -# params: -# padding=config['pgrtk']['padding'] -# resources: -# mem_mb=config['pgrtk']['mem_mb'], -# time=config['pgrtk']['time'] -# shell: -# ''' -# python workflow/scripts/pypgrtk.py {input.agc} {input.region} {params.padding} {output} -# ''' - -rule pgrtk_fetch_seqs: - ''' - run pgrtk fetch - ''' - input: - agc=config['agc'], - region=lambda wildcards: glob('resources/regions/{region}.bed'.format(region=wildcards.region)) - output: - 'results/pgrtk/fetch/{region}.fa' - threads: - 32 - container: - 'docker://davidebolo1993/graph_genotyper:latest' - resources: - mem_mb=config['pgrtk']['mem_mb'], - time=config['pgrtk']['time'] - params: - agc_prefix=config['agc'].replace('.agc','') - shell: - ''' - pgr-fetch-seqs {params.agc_prefix} -r {input.region} > {output} - ''' - -rule pgrtk_query: - ''' - run pgrtk query - ''' - input: - agc=config['agc'], - fasta=rules.pgrtk_fetch_seqs.output - output: - hit='results/pgrtk/query/{region}.000.hit', - fasta='results/pgrtk/query/{region}.000.fa' - threads: - 32 - container: - 'docker://davidebolo1993/graph_genotyper:latest' - resources: - mem_mb=config['pgrtk']['mem_mb'], - time=config['pgrtk']['time'] - params: - prefix='results/pgrtk/query/{region}', - agc_prefix=config['agc'].replace('.agc','') - shell: - ''' - pgr-query {params.agc_prefix} {input.fasta} {params.prefix} - ''' - -rule pgrtk_filter: - ''' - filter hits - ''' - input: - hit=rules.pgrtk_query.output.hit, - fasta=rules.pgrtk_query.output.fasta - output: - 'results/pgrtk/fasta/{region}.fa' - threads: - 1 - conda: - '../envs/python.yml' - shell: - ''' - python workflow/scripts/filter_pgrtk.py {input.fasta} {input.hit} {output} - ''' - -rule faidx: - ''' - samtools faidx - ''' - input: - rules.pgrtk_filter.output - output: - 'results/pgrtk/fasta/{region}.fa.fai' - threads: - 1 - container: - 'docker://davidebolo1993/graph_genotyper:latest' - shell: - ''' - samtools faidx {input} - ''' \ No newline at end of file diff --git a/cosigt_smk/workflow/rules/evaluate.smk b/cosigt_smk/workflow/scripts/evaluate.smk similarity index 100% rename from cosigt_smk/workflow/rules/evaluate.smk rename to cosigt_smk/workflow/scripts/evaluate.smk diff --git a/cosigt_smk/workflow/scripts/filter_pgrtk.py b/cosigt_smk/workflow/scripts/filter_pgrtk.py deleted file mode 100644 index b19bbf1..0000000 --- a/cosigt_smk/workflow/scripts/filter_pgrtk.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/python3 env - -#standard libraries -import os -import sys -import pyfaidx - -def main(input_fa,input_hit,output_fa): - - fasta=pyfaidx.Fasta(input_fa) - - with open(input_hit) as fin, open(output_fa, 'w') as fout: - - for row in fin: - - row = row.strip().split("\t") - - if row[0][0] == "#": - - continue - - (q_idx, q_name, query_bgn, query_end, q_len, aln_anchor_count, - src, ctg, ctg_bgn, ctg_end, orientation, out_seq_name) = row - query_bgn = int(query_bgn) - query_end = int(query_end) - ctg_bgn = int(ctg_bgn) - ctg_end = int(ctg_end) - q_len = int(q_len) - - #maybe need to check why this is 12000 - but itworks for test regions - if query_bgn > 12000 or q_len - query_end > 12000: - - continue - - if abs(ctg_end-ctg_bgn) > q_len * 3: - - continue - - key=[x for x in fasta.keys() if ctg+ '_' + str(ctg_bgn) in x] - seq=fasta[key[0]][:].seq - fout.write('>' + key[0].split('::')[1] + '\n' + seq + '\n') - - -if __name__ == '__main__': - - input_fa=os.path.abspath(sys.argv[1]) - input_hit=os.path.abspath(sys.argv[2]) - output_fa=os.path.abspath(sys.argv[3]) - main(input_fa,input_hit,output_fa) \ No newline at end of file diff --git a/cosigt_smk/workflow/scripts/organize.py b/cosigt_smk/workflow/scripts/organize.py index 4770221..aa307e5 100644 --- a/cosigt_smk/workflow/scripts/organize.py +++ b/cosigt_smk/workflow/scripts/organize.py @@ -1,12 +1,9 @@ #!/usr/bin/python3 env #standard libraries - import os -import sys import glob import yaml -import shutil import argparse from argparse import HelpFormatter @@ -65,37 +62,37 @@ def default_parameters(args): d['bwa-mem2']=dict() d['bwa-mem2']['threads'] = args.aln_threads d['bwa-mem2']['mem_mb'] = args.aln_memory - d['bwa-mem2']['time'] = double_quote(args.aln_time) + d['bwa-mem2']['time'] = args.aln_time #bwa-mem d['bwa']=dict() d['bwa']['threads'] = args.aln_threads d['bwa']['mem_mb'] = args.aln_memory - d['bwa']['time'] = double_quote(args.aln_time) + d['bwa']['time'] = args.aln_time #minimap2 d['minimap2']=dict() d['minimap2']['threads'] = args.aln_threads d['minimap2']['mem_mb'] = args.aln_memory - d['minimap2']['time'] = double_quote(args.aln_time) + d['minimap2']['time'] = args.aln_time #samtools d['samtools']=dict() d['samtools']['threads'] = args.sam_threads d['samtools']['mem_mb'] = args.sam_memory - d['samtools']['time'] = double_quote(args.sam_time) + d['samtools']['time'] = args.sam_time - #pgrtk - d['pgrtk']=dict() - #d['pgrtk']['padding'] = args.pgrtk_padding - d['pgrtk']['mem_mb'] = args.pgrtk_memory - d['pgrtk']['time'] = double_quote(args.pgrtk_time) + #odgi + d['odgi']=dict() + d['odgi']['threads'] = args.odgi_threads + d['odgi']['mem_mb'] = args.odgi_memory + d['odgi']['time'] = args.odgi_time #pggb d['pggb']=dict() d['pggb']['threads'] = args.pggb_threads d['pggb']['mem_mb'] = args.pggb_memory - d['pggb']['time'] = double_quote(args.pggb_time) + d['pggb']['time'] = args.pggb_time return d @@ -106,41 +103,41 @@ def main(): parse arguments and organize inputs for running cosigt properly without additional efforts from the users ''' - parser = argparse.ArgumentParser(prog='cosigt', description='''COsine SImilarity-based GenoTyper''', epilog='''This program was developed by Davide Bolognini at Human Technopole.''', formatter_class=CustomFormat) + parser = argparse.ArgumentParser(prog='cosigt', description='''COsine SImilarity-based GenoTyper''', epilog='''Developed by Davide Bolognini @ Human Technopole''', formatter_class=CustomFormat) required = parser.add_argument_group('Required I/O arguments') required.add_argument('-r','--reference', help='reference genome in FASTA format', metavar='FASTA', required=True) - required.add_argument('--agc', help='agc file from pgrtk - available at https://giab-data.s3.amazonaws.com/PGR-TK-Files/pgr-tk-HGRP-y1-evaluation-set-v0.tar', metavar='AGC', required=True) + required.add_argument('--gfa', help='variation graph in .gfa format', metavar='GFA', required=True) required.add_argument('-a', '--alignment', help='folder with alignment files (bam,cram) - and their index - to use', metavar='FOLDER', required=True) required.add_argument('--roi', help='one or more regions of interest in BED format', metavar='BED', required=True) additional = parser.add_argument_group('Additional I/O arguments') - additional.add_argument('--blacklist', help='blacklist of samples (one per line) that should not be included in the analysis [None]', metavar='', required=False, default=None) - additional.add_argument('--path', help='path name in the graph to use as a reference [hg38]',type=str, default='hg38') + additional.add_argument('--path', help='path name in the pangenome graph to be used as a reference [grch38]',type=str, default='grch38') + additional.add_argument('--binds', help='additional paths to bind for singularity in /path/1,/path/2 format', type=str, default='/localscratch') metrics = parser.add_argument_group('Specify #threads, memory and time requirements') #alignment metrics.add_argument('--aln_threads', help='threads - aligner [5]',type=int, default=5) - metrics.add_argument('--aln_time', help='max time (hh:mm:ss) - aligner ["00:02:30"]',type=str, default='00:02:30') - metrics.add_argument('--aln_memory', help='max memory (mb) - aligner[6000]',type=int, default=6000) + metrics.add_argument('--aln_time', help='max time (minutes) - aligner [2]',type=int, default=2) + metrics.add_argument('--aln_memory', help='max memory (mb) - aligner[5000]',type=int, default=5000) #samtools extraction and sort - metrics.add_argument('--sam_threads', help='threads - samtools (view/sort) commands [1]',type=int, default=1) - metrics.add_argument('--sam_time', help='max time (hh:mm:ss) - samtools (view/sort) commands ["00:08:00"]',type=str, default='00:08:00') + metrics.add_argument('--sam_threads', help='threads - samtools (view/sort) commands [2]',type=int, default=2) + metrics.add_argument('--sam_time', help='max time (minutes) - samtools (view/sort) commands [8]',type=int, default=8) metrics.add_argument('--sam_memory', help='max memory (mb) - samtools (view/sort) commands [1000]',type=int, default=1000) - #pgrtk - metrics.add_argument('--pgrtk_padding', help='padding - pgrtk commands [200000]',type=int, default=200000) - metrics.add_argument('--pgrtk_time', help='max time (hh:mm:ss) - pgrtk commands ["00:10:00"]',type=str, default='00:10:00') - metrics.add_argument('--pgrtk_memory', help='max memory (mb) - pgrtk commands [35000]',type=int, default=35000) - #pggb - metrics.add_argument('--pggb_threads', help='threads - pggb command [32]',type=int, default=32) - metrics.add_argument('--pggb_time', help='max time (hh:mm:ss) - pggb commands ["00:25:00"]',type=str, default='00:25:00') - metrics.add_argument('--pggb_memory', help='max memory (mb) - pggb commands [2000]',type=int, default=2000) + metrics.add_argument('--pggb_threads', help='threads - pggb command [24]',type=int, default=24) + metrics.add_argument('--pggb_time', help='max time (minutes) - pggb commands [35]',type=int, default=35) + metrics.add_argument('--pggb_memory', help='max memory (mb) - pggb commands [20000]',type=int, default=20000) + + #odgi build and extract + metrics.add_argument('--odgi_threads', help='threads - odgi (build/extract) commands [10]',type=int, default=10) + metrics.add_argument('--odgi_time', help='max time (minutes) - odgi (build/extract) commands [10]',type=int, default=10) + metrics.add_argument('--odgi_memory', help='max memory (mb) - odgi (build/extract) commands [30000]',type=int, default=30000) args = parser.parse_args() @@ -162,13 +159,15 @@ def main(): out_resources=os.path.join(wd,'resources') out_aln=os.path.join(out_resources, 'alignment') os.makedirs(out_aln, exist_ok=True) - out_agc=os.path.join(out_resources, 'agc') - os.makedirs(out_agc, exist_ok=True) + #out_agc=os.path.join(out_resources, 'agc') + #os.makedirs(out_agc, exist_ok=True) + out_gfa=os.path.join(out_resources, 'graph') + os.makedirs(out_gfa, exist_ok=True) out_ref=os.path.join(out_resources, 'reference') os.makedirs(out_ref, exist_ok=True) out_extra=os.path.join(out_resources, 'extra') os.makedirs(out_extra,exist_ok=True) - blcklst_out=os.path.join(out_extra, 'bad_samples.txt') + blcklst_out=os.path.join(out_extra, 'blacklist.txt') out_regions=os.path.join(out_resources, 'regions') os.makedirs(out_regions,exist_ok=True) @@ -176,7 +175,6 @@ def main(): out_sing=os.path.join(wd, 'singularity_bind_paths.csv') #blacklist of samples to exclude - blcklst=[] if args.blacklist is not None: @@ -190,9 +188,7 @@ def main(): else: - with open(blcklst_out, 'w') as bad_samples_out: - - pass + open(blcklst_out, 'w').close() #symlink alignments alns=sorted(glob.glob(args.alignment+'/*am*')) @@ -220,33 +216,26 @@ def main(): if aln.endswith('am'): #this is not an index, rather a true alignment - sample_name=bnaln.split('.')[0] + sample_name='.'.join(bnaln.split('.')[:-1]) samples_out.write(sample_name + '\t' + out_aln_file + '\n') #add to config d['samples'] = out_samples - #symlink agc - agc=os.path.abspath(args.agc) - agc_dir=os.path.dirname(agc) - agc_file=os.path.basename(agc) - agc_pattern=agc_file.replace('.agc', '') - agc_files=glob.glob(agc_dir + '/' +agc_pattern+'*') - - for f in agc_files: - - try: - - os.symlink(f, os.path.join(out_agc, os.path.basename(f))) + #symlink gfa + out_gfa_file=os.path.join(out_gfa, os.path.basename(args.gfa)) - except: + try: - pass + os.symlink(os.path.abspath(args.gfa), out_gfa_file) + + except: - out_agc_file=os.path.join(out_agc,agc_file) + pass #add to config - d['agc'] = out_agc_file + d['graph'] = out_gfa_file + #symlink reference out_reference_file=os.path.join(out_ref, os.path.basename(args.reference)) @@ -266,13 +255,13 @@ def main(): #add to config d['region'] = list() - with open(args.roi) as bed_in: for line in bed_in: l=line.rstrip().split('\t') - region=l[0] + '_' + str(int(l[1])-args.pgrtk_padding) + '_' + str(int(l[2])+args.pgrtk_padding) + + region=l[0] + '_' + l[1] + '_' + l[2] #put regions in the config fille d['region'].append(region) @@ -282,7 +271,7 @@ def main(): with open(region_out, 'w') as out_region: - out_region.write(region + '\t' + args.path +'_tagged.fa' + '\t' + l[0] + '_' + args.path + '\t' + str(int(l[1])-args.pgrtk_padding) + '\t' + str(int(l[2])+args.pgrtk_padding) + '\t0\n') + out_region.write(args.path +'#'+ l[0] + '\t' + l[1] + '\t' + l[2] + '\t' + region+'\n') #dump config @@ -291,7 +280,6 @@ def main(): yml_out.close() #remove single quotes - with open(out_yaml_tmp) as filein, open(out_yaml, 'w') as fileout: for line in filein: @@ -301,10 +289,13 @@ def main(): os.remove(out_yaml_tmp) - #write singularity paths for those using singularity - with open(out_sing, 'w') as singpath: + #write command + singpath=','.join([os.path.abspath(args.alignment),os.path.dirname(os.path.abspath(args.gfa)),os.path.dirname(os.path.abspath(args.reference)),args.binds]) + command_out=' '.join(['snakemake --profile config/slurm --singularity-args "-B '+ singpath + '" cosigt']) + + with open('snakemake.run.sh', 'w') as out: - singpath.write(','.join([os.path.abspath(args.alignment),os.path.dirname(os.path.abspath(args.agc)),os.path.dirname(os.path.abspath(args.reference)),'/localscratch']) + '\n') + out.write(command_out + '\n') if __name__ == '__main__': diff --git a/cosigt_smk/workflow/scripts/pypgrtk.py b/cosigt_smk/workflow/scripts/pypgrtk.py deleted file mode 100644 index d60c470..0000000 --- a/cosigt_smk/workflow/scripts/pypgrtk.py +++ /dev/null @@ -1,83 +0,0 @@ -#!/usr/bin/python3 - -import pgrtk -import os -import sys - -def main(agc_file,bed_file,padding,out_file): - - #load agc_file - ref_db=pgrtk.AGCFile(agc_file) - - #load region - with open(bed_file) as bed: - - for line in bed: - - fields=line.rstrip().split('\t') - ref_file_name, roi_chr, roi_b, roi_e = fields[0],fields[1], int(fields[2]), int(fields[3]) - roi_len=roi_e - roi_b - break - #not needed - but make sure we read just one line - - #sdb - sdb = pgrtk.SeqIndexDB() - sdb.load_from_agc_index(agc_file.replace('.agc', '')) - - #retrieve roi sequence - roi_seq = ref_db.get_sub_seq(ref_file_name, roi_chr, roi_b-padding, roi_e+padding) - - #find hits in the pangenomic reference - aln_range = pgrtk.query_sdb(sdb, roi_seq, merge_range_tol=100000) - seq_list=[] - - for k in list(aln_range.keys()): - - ctg_name, source, _ = sdb.seq_info[k] - rgns = aln_range[k].copy() - rgns = pgrtk.merge_regions(rgns, tol=1000) - - for rgn in rgns: - - b, e,_, orientation, aln = rgn - print(ctg_name,b,e) - - if aln[0][0][0] > padding or aln[-1][0][1] < padding + roi_len: - - continue - - if e-b < 0.75 * (roi_len + 2 * padding): - - continue - - seq = sdb.get_sub_seq(source, ctg_name, b, e) - - if orientation == 1: - - seq = pgrtk.rc_byte_seq(seq) - - - seq_list.append(('{}_{}_{}_{}'.format(ctg_name, b, e, orientation), seq)) - - f0 = open(out_file, 'w') - - for ctg, seq in seq_list: - - print('>{}'.format(ctg), file=f0) - print(pgrtk.u8_to_string(seq), file=f0) - - f0.close() - - -if __name__ == '__main__': - - agc_file=os.path.abspath(sys.argv[1]) - bed_file=os.path.abspath(sys.argv[2]) - padding=int(sys.argv[3]) - out_file=os.path.abspath(sys.argv[4]) - print(agc_file,bed_file,padding,out_file) - main(agc_file,bed_file,padding,out_file) - - - -