Skip to content

Commit

Permalink
updating workflow
Browse files Browse the repository at this point in the history
  • Loading branch information
davidebolo1993 committed Apr 3, 2024
1 parent 14a4390 commit 7c178c8
Show file tree
Hide file tree
Showing 15 changed files with 169 additions and 414 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/docker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,4 @@ jobs:
uses: docker/build-push-action@v2
with:
push: true
tags: davidebolo1993/graph_genotyper:latest
tags: davidebolo1993/cosigt_workflow:latest
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
go.sum
go.mod
cosigt
cosigt_smk/results/*
cosigt_smk/resources/*
cosigt_smk/logs/*
cosigt_smk/.snakemake/*
Expand Down
23 changes: 14 additions & 9 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -56,16 +56,20 @@ RUN rustup update
RUN cargo install --locked maturin

#install numpy
RUN pip3 install numpy
RUN pip3 install numpy \
pandas \
matplotlib \
scikit-learn \
scipy

#ln python to python3 -not used right now but, who knows?
RUN ln -s /usr/bin/python3 /usr/bin/python

##install samtools
RUN wget https://github.com/samtools/samtools/releases/download/1.18/samtools-1.18.tar.bz2 \
&& tar -jxvf samtools-1.18.tar.bz2 \
&& rm samtools-1.18.tar.bz2 \
&& cd samtools-1.18 \
RUN wget https://github.com/samtools/samtools/releases/download/1.19.2/samtools-1.19.2.tar.bz2 \
&& tar -jxvf samtools-1.19.2.tar.bz2 \
&& rm samtools-1.19.2.tar.bz2 \
&& cd samtools-1.19.2 \
&& ./configure \
&& make \
&& make install
Expand All @@ -89,14 +93,15 @@ ENV PATH /opt/bwa-mem2-2.2.1_x64-linux:$PATH

##install minimap2

RUN wget https://github.com/lh3/minimap2/releases/download/v2.26/minimap2-2.26_x64-linux.tar.bz2 \
&& tar -jxvf minimap2-2.26_x64-linux.tar.bz2 \
&& rm minimap2-2.26_x64-linux.tar.bz2
RUN wget https://github.com/lh3/minimap2/releases/download/v2.28/minimap2-2.28_x64-linux.tar.bz2 \
&& tar -jxvf minimap2-2.28_x64-linux.tar.bz2 \
&& rm minimap2-2.28_x64-linux.tar.bz2

ENV PATH /opt/minimap2-2.26_x64-linux:$PATH
ENV PATH /opt/minimap2-2.28_x64-linux:$PATH

##install gafpack
##checkout to a specific version
##we need to git bisect in order to understand what is not working properly with the newest version

RUN git clone https://github.com/ekg/gafpack.git \
&& cd gafpack \
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@

Workflow for genotyping from graph.

Docs [here](https://cosigtdoc.readthedocs.io/en/latest/)
Docs [here](https://davidebolo1993.github.io/cosigtdoc/)
107 changes: 77 additions & 30 deletions cosigt.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@ import (
"bufio"
"compress/gzip"
"encoding/csv"
"encoding/json"
"fmt"
"io"
"io/ioutil"
"math"
"os"
"sort"
Expand All @@ -21,6 +23,10 @@ import (

func GetMagnitude(A []float64, B []float64) float64 {

/*
Calculate magnitude
*/

var A_len, B_len float64

A_len = 0
Expand All @@ -38,6 +44,10 @@ func GetMagnitude(A []float64, B []float64) float64 {

func GetDotProduct(A []float64, B []float64) float64 {

/*
Calculate dot product
*/

var dot_product float64

for i := 0; i < len(A); i++ {
Expand All @@ -53,10 +63,9 @@ func GetDotProduct(A []float64, B []float64) float64 {
func GetCosineSimilarity(A []float64, B []float64) float64 {

/*
Simple calculation of cosine similarity
Calculate cosine similarity
*/


var eucl_magn, dot_product, similarity float64

eucl_magn = GetMagnitude(A, B)
Expand All @@ -80,10 +89,9 @@ func GetCosineSimilarity(A []float64, B []float64) float64 {
func ReadBlacklist(s string) []string {

/*
Samples we may want to exclude, one-per-line
Read file with paths to exclude - can be empty
*/


ids := make([]string, 0, 100)

f, _ := os.Open(s)
Expand All @@ -104,7 +112,7 @@ func ReadBlacklist(s string) []string {
func ReadGz(s string) ([]string, [][]float64) {

/*
Read the gzip-compressed files
Read gafpack and odgi paths files
*/

cov := make([][]float64, 0, 1000)
Expand Down Expand Up @@ -148,25 +156,70 @@ func ReadGz(s string) ([]string, [][]float64) {
return id, cov
}

func WriteMap(m map[string]float64, s string) {
func ReadJson(s string) map[string]string {

f, _ := os.Create(s)
defer f.Close()
/*
Read cluster json file
*/

clstr:=make(map[string]string)
jsonFile,_ := os.Open(s)
defer jsonFile.Close()
byteValue, _ := ioutil.ReadAll(jsonFile)
json.Unmarshal([]byte(byteValue), &clstr)

return clstr

}




func WriteResults(m map[string]float64, keys *[]string, clstr map[string]string, s string, id string) {

/*
Write results to sorted_combos and cosigt_genotype
*/

f1, _ := os.Create(s + "/sorted_combos.tsv")
defer f1.Close()

w := csv.NewWriter(f)
f2, _ := os.Create(s + "/cosigt_genotype.tsv")
defer f2.Close()

w := csv.NewWriter(f1)
w.Comma = '\t'
defer w.Flush()

for k, v := range m {
x := csv.NewWriter(f2)
x.Comma = '\t'
defer x.Flush()

for i,k:=range (*keys) {

haps:=strings.Split(k,"$")

if i == 0 {

//k, fmt.Sprintf("%.16f", v)
_ = x.Write([]string{"id", "h1", "h2", "c1", "c2", "cs"})
_ = w.Write([]string{"h1", "h2", "c1", "c2", "cs"})
_ = x.Write([]string{id,haps[0],haps[1],clstr[haps[0]],clstr[haps[1]],fmt.Sprintf("%.16f", m[k])})

}

_ = w.Write([]string{k, fmt.Sprintf("%.16f", v)})
_ = w.Write([]string{haps[0],haps[1],clstr[haps[0]],clstr[haps[1]],fmt.Sprintf("%.16f", m[k])})

}

}

func SliceContains(s string, ids []string) bool {

/*
Check if substring in any string
*/

for _, x := range ids {

if strings.Contains(s, x) {
Expand All @@ -182,6 +235,9 @@ func SliceContains(s string, ids []string) bool {
}

func SumSlices(a []float64, b []float64) []float64 {
/*
Sum slices of floats
*/

c := make([]float64, len(a))

Expand All @@ -206,6 +262,7 @@ func main() {
p := parser.String("p", "paths", &argparse.Options{Required: true, Help: "gzip-compressed tsv file with path names and node coverages from odgi paths"})
g := parser.String("g", "gaf", &argparse.Options{Required: true, Help: "gzip-compressed gaf (graph alignment format) file for a sample from gafpack"})
b := parser.String("b", "blacklist", &argparse.Options{Required: false, Help: "txt file with names of paths to exclude (one per line)"})
c := parser.String("c", "cluster", &argparse.Options{Required: false, Help: "cluster json file as generated with cluster.py"})
o := parser.String("o", "output", &argparse.Options{Required: true, Help: "folder prefix for output files"})

err := parser.Parse(os.Args)
Expand All @@ -221,15 +278,17 @@ func main() {
hapid, gcov := ReadGz(*p)
//read second table
smpl, bcov := ReadGz(*g)
//read blacklist - it can be empty or not
//read blacklist
blck := ReadBlacklist(*b)
//read cluster .json file
clstr:= ReadJson(*c)
//trace combinations we have already seen
seen:=make(map[int]bool)
//store results in map
m := make(map[string]float64)
//generate all possible diploid combination
n := len(hapid)
k := 2 //fixed ploidy - this can be however adjusted
k := 2 //fixed ploidy
gen := combin.NewCombinationGenerator(n, k)

for gen.Next() {
Expand All @@ -239,15 +298,15 @@ func main() {
if len(blck) == 0 || (!SliceContains(hapid[h1], blck) && !SliceContains(hapid[h2], blck)) { //nothing to blacklist or both ids not in blacklist

sum := SumSlices(gcov[h1], gcov[h2])
indiv := (hapid[h1] + "-" + hapid[h2])
indiv := (hapid[h1] + "$" + hapid[h2])
m[indiv] = GetCosineSimilarity(sum, bcov[0])

_,ok:=seen[h1]

if !ok {

sum=SumSlices(gcov[h1], gcov[h1])
indiv= (hapid[h1] + "-" + hapid[h1])
indiv= (hapid[h1] + "$" + hapid[h1])
m[indiv] = GetCosineSimilarity(sum, bcov[0])
seen[h1] = true

Expand All @@ -258,7 +317,7 @@ func main() {
if !ok {

sum=SumSlices(gcov[h2], gcov[h2])
indiv= (hapid[h2] + "-" + hapid[h2])
indiv= (hapid[h2] + "$" + hapid[h2])
m[indiv] = GetCosineSimilarity(sum, bcov[0])
seen[h2] = true

Expand All @@ -284,20 +343,8 @@ func main() {

})

//write combinations
//write genotype
outpath:=path.Clean(*o)
_ = os.Mkdir(outpath, os.ModePerm)

combos:=path.Clean(outpath + "/combos.tsv")
WriteMap(m, combos)

//write best score
best:=path.Clean(outpath + "/best_genotype.tsv")
f, _ := os.Create(best)
defer f.Close()
result := fmt.Sprintf("#sample\tbest_genotype\tbest_score\n%s\t%s\t%.16f\n", smpl[0], keys[0], m[keys[0]])
_, _ = f.WriteString(result)

//all done

WriteResults(m,&keys,clstr,outpath,smpl[0])
}
4 changes: 2 additions & 2 deletions cosigt_smk/workflow/rules/bwa-mem2.smk
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ rule bwa_mem2_index:
mem_mb=lambda wildcards, attempt: attempt * config['bwa-mem2']['mem_mb'],
time=lambda wildcards, attempt: attempt * config['bwa-mem2']['time']
container:
'docker://davidebolo1993/graph_genotyper:latest'
'docker://davidebolo1993/cosigt_workflow:latest'
shell:
'''
bwa-mem2 index {input}
Expand All @@ -34,7 +34,7 @@ rule bwa_mem2_samtools_sort:
mem_mb=lambda wildcards, attempt: attempt * config['bwa-mem2']['mem_mb'],
time=lambda wildcards, attempt: attempt * config['bwa-mem2']['time']
container:
'docker://davidebolo1993/graph_genotyper:latest'
'docker://davidebolo1993/cosigt_workflow:latest'
shell:
'''
bwa-mem2 mem -t {threads} {input.ref} {input.sample} | samtools sort -@ {threads} - > {output}
Expand Down
11 changes: 6 additions & 5 deletions cosigt_smk/workflow/rules/cosigt.smk
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,21 @@ rule cosigt_genotype:
'''
input:
zpath=rules.odgi_paths_matrix.output,
xpack=rules.gafpack_coverage.output
xpack=rules.gafpack_coverage.output,
cluster=rules.cluster.output
output:
geno=config['output'] + '/cosigt/{sample}/{region}/best_genotype.tsv',
combo=config['output'] + '/cosigt/{sample}/{region}/combos.tsv'
geno=config['output'] + '/cosigt/{sample}/{region}/cosigt_genotype.tsv',
combo=config['output'] + '/cosigt/{sample}/{region}/sorted_combos.tsv'
threads:
1
resources:
mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'],
time=lambda wildcards, attempt: attempt * config['default']['time']
container:
'docker://davidebolo1993/graph_genotyper:latest'
'docker://davidebolo1993/cosigt_workflow:latest'
params:
prefix=config['output'] + '/cosigt/{sample}/{region}'
shell:
'''
cosigt -p {input.zpath} -g {input.xpack} -b resources/extra/blacklist.txt -o {params.prefix}
cosigt -p {input.zpath} -g {input.xpack} -c {input.cluster} -b resources/extra/blacklist.txt -o {params.prefix}
'''
6 changes: 3 additions & 3 deletions cosigt_smk/workflow/rules/extract.smk
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ rule faidx:
mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'],
time=lambda wildcards, attempt: attempt * config['default']['time']
container:
'docker://davidebolo1993/graph_genotyper:latest'
'docker://davidebolo1993/cosigt_workflow:latest'
shell:
'''
samtools faidx {input}
Expand Down Expand Up @@ -119,7 +119,7 @@ rule samtools_view:
threads:
config['samtools']['threads']
container:
'docker://davidebolo1993/graph_genotyper:latest'
'docker://davidebolo1993/cosigt_workflow:latest'
params:
ref=config['reference'],
region='{region}'
Expand Down Expand Up @@ -149,7 +149,7 @@ rule samtools_fasta:
threads:
config['samtools']['threads']
container:
'docker://davidebolo1993/graph_genotyper:latest'
'docker://davidebolo1993/cosigt_workflow:latest'
resources:
mem_mb=lambda wildcards, attempt: attempt * config['samtools']['mem_mb'],
time=lambda wildcards, attempt: attempt * config['samtools']['time']
Expand Down
2 changes: 1 addition & 1 deletion cosigt_smk/workflow/rules/gafpack.smk
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ rule gafpack_coverage:
mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'],
time=lambda wildcards, attempt: attempt * config['default']['time']
container:
'docker://davidebolo1993/graph_genotyper:latest'
'docker://davidebolo1993/cosigt_workflow:latest'
shell:
'''
gafpack \
Expand Down
2 changes: 1 addition & 1 deletion cosigt_smk/workflow/rules/gfainject.smk
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ rule gfa_inject:
mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'],
time=lambda wildcards, attempt: attempt * config['default']['time']
container:
'docker://davidebolo1993/graph_genotyper:latest'
'docker://davidebolo1993/cosigt_workflow:latest'
params:

shell:
Expand Down
Loading

0 comments on commit 7c178c8

Please sign in to comment.