Skip to content

Commit

Permalink
workflow rework
Browse files Browse the repository at this point in the history
  • Loading branch information
davidebolo1993 committed Feb 28, 2024
1 parent e247261 commit c1aaab2
Show file tree
Hide file tree
Showing 16 changed files with 243 additions and 382 deletions.
13 changes: 0 additions & 13 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -58,19 +58,6 @@ RUN cargo install --locked maturin
#install numpy
RUN pip3 install numpy

#pgr-tk installation
RUN wget https://github.com/GeneDx/pgr-tk/releases/download/v0.5.1/pgr-tk-v0.5.1.zip \
&& unzip pgr-tk-v0.5.1.zip \
&& rm pgr-tk-v0.5.1.zip \
&& chmod +x release/* \
&& chmod +x wheels/* \
&& pip3 install wheels/pgrtk-0.5.1-cp310-cp310-linux_x86_64.whl

ENV PATH /opt/release:$PATH

#install wheels


#ln python to python3 -not used right now but, who knows?
RUN ln -s /usr/bin/python3 /usr/bin/python

Expand Down
114 changes: 78 additions & 36 deletions cosigt.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,35 +11,18 @@ import (
"sort"
"strconv"
"strings"
"path"

"github.com/akamensky/argparse"
"gonum.org/v1/gonum/stat/combin"
)

//all these functions assume no errors in input files

func GetCosineSimilarity(A []float64, B []float64) float64 {

var eucl_magn, dot_product, similarity float64

eucl_magn = GetMagnitude(A, B)

if eucl_magn > 0 {

dot_product = GetDotProduct(A, B)
similarity = dot_product / eucl_magn

} else {

similarity = 0

}

return similarity
}

func GetMagnitude(A []float64, B []float64) float64 {

var A_len, B_len float64

A_len = 0
B_len = 0

Expand All @@ -66,9 +49,42 @@ func GetDotProduct(A []float64, B []float64) float64 {
return dot_product
}


func GetCosineSimilarity(A []float64, B []float64) float64 {

/*
Simple calculation of cosine similarity
*/


var eucl_magn, dot_product, similarity float64

eucl_magn = GetMagnitude(A, B)

if eucl_magn > 0 {

dot_product = GetDotProduct(A, B)
similarity = dot_product / eucl_magn

} else {

similarity = 0

}

return similarity
}



func ReadBlacklist(s string) []string {

ids := make([]string, 0, 10) //don't want to re-allocate too many times
/*
Samples we may want to exclude, one-per-line
*/


ids := make([]string, 0, 100)

f, _ := os.Open(s)
defer f.Close()
Expand All @@ -87,6 +103,10 @@ func ReadBlacklist(s string) []string {

func ReadGz(s string) ([]string, [][]float64) {

/*
Read the gzip-compressed files
*/

cov := make([][]float64, 0, 1000)
id := make([]string, 0, 1000)

Expand Down Expand Up @@ -124,6 +144,7 @@ func ReadGz(s string) ([]string, [][]float64) {
id = append(id, rec[0])

}

return id, cov
}

Expand Down Expand Up @@ -176,22 +197,39 @@ func SumSlices(a []float64, b []float64) []float64 {

func main() {

//I/O files
/*
Parsing of arguments with argparse
*/

parser := argparse.NewParser("cosigt", "genotyping loci in pangenome graphs using cosine distance")

p := parser.String("p", "paths", &argparse.Options{Required: true, Help: "gzip-compressed tsv file with path names and node coverages from odgi paths"})
g := parser.String("g", "gaf", &argparse.Options{Required: true, Help: "gzip-compressed gaf (graph alignment format) file for a sample from gafpack"})
b := parser.String("b", "blacklist", &argparse.Options{Required: false, Help: "txt file with names of paths to exclude (one per line)"})
o := parser.String("o", "output", &argparse.Options{Required: true, Help: "folder prefix for output files"})

err := parser.Parse(os.Args)

if err != nil {

fmt.Print(parser.Usage(err))
os.Exit(1)

}

//read first table
hapid, gcov := ReadGz(os.Args[1])
hapid, gcov := ReadGz(*p)
//read second table
smpl, bcov := ReadGz(os.Args[2])
smpl, bcov := ReadGz(*g)
//read blacklist - it can be empty or not
blck := ReadBlacklist(os.Args[3])
//trace combinations we already use
blck := ReadBlacklist(*b)
//trace combinations we have already seen
seen:=make(map[int]bool)

//store results in map
m := make(map[string]float64)

//generate all possible diploid combination
n := len(hapid)
k := 2
k := 2 //fixed ploidy - this can be however adjusted
gen := combin.NewCombinationGenerator(n, k)

for gen.Next() {
Expand All @@ -204,8 +242,6 @@ func main() {
indiv := (hapid[h1] + "-" + hapid[h2])
m[indiv] = GetCosineSimilarity(sum, bcov[0])

//comment the following if we don't need to add homo

_,ok:=seen[h1]

if !ok {
Expand All @@ -228,7 +264,6 @@ func main() {

}

//comment above if we don't need to add homo

}

Expand All @@ -250,12 +285,19 @@ func main() {
})

//write combinations
_ = os.Mkdir(os.Args[4], os.ModePerm)
WriteMap(m, os.Args[4]+"/combos.tsv")
outpath:=path.Clean(*o)
_ = os.Mkdir(outpath, os.ModePerm)

combos:=path.Clean(outpath + "/combos.tsv")
WriteMap(m, combos)

//write best score
f, _ := os.Create(os.Args[4] + "/best_genotype.tsv")
best:=path.Clean(outpath + "/best_genotype.tsv")
f, _ := os.Create(best)
defer f.Close()
result := fmt.Sprintf("#sample\tbest_genotype\tbest_score\n%s\t%s\t%.16f\n", smpl[0], keys[0], m[keys[0]])
_, _ = f.WriteString(result)
_, _ = f.WriteString(result)

//all done

}
5 changes: 3 additions & 2 deletions cosigt_smk/config/slurm/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,13 @@ cluster:
default-resources:
- partition=cpuq
- mem_mb=1000
- time="00:10:00"
- time=10
- gpu=0
max-jobs-per-second: 10
max-status-checks-per-second: 20
local-cores: 1
latency-wait: 90
jobs: 600
jobs: 100
keep-going: True
rerun-incomplete: True
printshellcmds: True
Expand All @@ -26,3 +26,4 @@ use-conda: True
use-singularity: True
verbose: True
reason: True
retries: 4
Empty file removed cosigt_smk/logs/.gitignore
Empty file.
1 change: 1 addition & 0 deletions cosigt_smk/snakemake.run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
snakemake --profile config/slurm --singularity-args "-B /group/soranzo/davide.bolognini/working/crams,/group/soranzo/davide.bolognini/working/graph,/ssu/gassu/reference_data/human_genomes/GRCh38,/localscratch" cosigt
11 changes: 2 additions & 9 deletions cosigt_smk/workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,21 +7,14 @@ df=(pd.read_table(config['samples'], dtype={'sample_id': str, 'cram':str})
.sort_index()
)

include: 'rules/pgrtk.smk'
include: 'rules/pggb.smk'
include: 'rules/extract.smk'
include: 'rules/pggb.smk'
include: 'rules/odgi.smk'
include: 'rules/bwa-mem2.smk'
include: 'rules/gfainject.smk'
include: 'rules/gafpack.smk'
include: 'rules/cosigt.smk'
include: 'rules/evaluate.smk'

rule cosigt:
input:
expand('results/cosigt/{sample}/{region}/best_genotype.tsv', sample=df['sample_id'].tolist(), region=config['region']),

rule evaluation:
input:
expand('results/cosigt/evaluation/{region}/evaluation.pdf', region=config['region']),
expand('results/cosigt/evaluation/{region}/evaluation.jaccard.pdf', region=config['region'])
expand('results/cosigt/{sample}/{region}/best_genotype.tsv', sample=df['sample_id'].tolist(), region=config['region'])
6 changes: 3 additions & 3 deletions cosigt_smk/workflow/rules/bwa-mem2.smk
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@ rule bwa_mem2_index:
bwa index
'''
input:
rules.pgrtk_filter.output
rules.odgi_paths_fasta.output
output:
multiext('results/pgrtk/fasta/{region}.fa', '.bwt.2bit.64', '.pac', '.ann', '.amb', '.0123')
multiext('results/odgi/paths/fasta/{region}.fa', '.bwt.2bit.64', '.pac', '.ann', '.amb', '.0123')
threads:
1
resources:
Expand All @@ -23,7 +23,7 @@ rule bwa_mem2_samtools_sort:
bwa-mem2 and sam-to-bam conversion with samtools
'''
input:
ref=rules.pgrtk_filter.output,
ref=rules.odgi_paths_fasta.output,
idx=rules.bwa_mem2_index.output,
sample=rules.samtools_fasta.output
output:
Expand Down
2 changes: 1 addition & 1 deletion cosigt_smk/workflow/rules/cosigt.smk
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,6 @@ rule cosigt_genotype:
prefix='results/cosigt/{sample}/{region}'
shell:
'''
cosigt {input.zpath} {input.xpack} resources/extra/bad_samples.txt {params.prefix}
cosigt -p {input.zpath} -g {input.xpack} -b resources/extra/blacklist.txt -o {params.prefix}
'''

Loading

0 comments on commit c1aaab2

Please sign in to comment.