From 7c178c8a5bd58677645e5964e29ad6eb69c2fc53 Mon Sep 17 00:00:00 2001 From: davidebolo1993 Date: Wed, 3 Apr 2024 14:03:40 +0200 Subject: [PATCH] updating workflow --- .github/workflows/docker.yml | 2 +- .gitignore | 1 - Dockerfile | 23 +-- README.md | 2 +- cosigt.go | 107 ++++++++++---- cosigt_smk/workflow/rules/bwa-mem2.smk | 4 +- cosigt_smk/workflow/rules/cosigt.smk | 11 +- cosigt_smk/workflow/rules/extract.smk | 6 +- cosigt_smk/workflow/rules/gafpack.smk | 2 +- cosigt_smk/workflow/rules/gfainject.smk | 2 +- cosigt_smk/workflow/rules/odgi.smk | 26 +++- cosigt_smk/workflow/scripts/cluster.py | 72 +++++----- cosigt_smk/workflow/scripts/evaluate.smk | 83 ----------- cosigt_smk/workflow/scripts/make_heatmap.r | 83 ----------- cosigt_smk/workflow/scripts/tpr.py | 159 --------------------- 15 files changed, 169 insertions(+), 414 deletions(-) delete mode 100644 cosigt_smk/workflow/scripts/evaluate.smk delete mode 100644 cosigt_smk/workflow/scripts/make_heatmap.r delete mode 100644 cosigt_smk/workflow/scripts/tpr.py diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 8414079..8f0d121 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -22,4 +22,4 @@ jobs: uses: docker/build-push-action@v2 with: push: true - tags: davidebolo1993/graph_genotyper:latest + tags: davidebolo1993/cosigt_workflow:latest diff --git a/.gitignore b/.gitignore index 7dd2967..de5a0c1 100644 --- a/.gitignore +++ b/.gitignore @@ -3,7 +3,6 @@ go.sum go.mod cosigt -cosigt_smk/results/* cosigt_smk/resources/* cosigt_smk/logs/* cosigt_smk/.snakemake/* diff --git a/Dockerfile b/Dockerfile index b5069a3..59ec475 100644 --- a/Dockerfile +++ b/Dockerfile @@ -56,16 +56,20 @@ RUN rustup update RUN cargo install --locked maturin #install numpy -RUN pip3 install numpy +RUN pip3 install numpy \ + pandas \ + matplotlib \ + scikit-learn \ + scipy #ln python to python3 -not used right now but, who knows? RUN ln -s /usr/bin/python3 /usr/bin/python ##install samtools -RUN wget https://github.com/samtools/samtools/releases/download/1.18/samtools-1.18.tar.bz2 \ - && tar -jxvf samtools-1.18.tar.bz2 \ - && rm samtools-1.18.tar.bz2 \ - && cd samtools-1.18 \ +RUN wget https://github.com/samtools/samtools/releases/download/1.19.2/samtools-1.19.2.tar.bz2 \ + && tar -jxvf samtools-1.19.2.tar.bz2 \ + && rm samtools-1.19.2.tar.bz2 \ + && cd samtools-1.19.2 \ && ./configure \ && make \ && make install @@ -89,14 +93,15 @@ ENV PATH /opt/bwa-mem2-2.2.1_x64-linux:$PATH ##install minimap2 -RUN wget https://github.com/lh3/minimap2/releases/download/v2.26/minimap2-2.26_x64-linux.tar.bz2 \ - && tar -jxvf minimap2-2.26_x64-linux.tar.bz2 \ - && rm minimap2-2.26_x64-linux.tar.bz2 +RUN wget https://github.com/lh3/minimap2/releases/download/v2.28/minimap2-2.28_x64-linux.tar.bz2 \ + && tar -jxvf minimap2-2.28_x64-linux.tar.bz2 \ + && rm minimap2-2.28_x64-linux.tar.bz2 -ENV PATH /opt/minimap2-2.26_x64-linux:$PATH +ENV PATH /opt/minimap2-2.28_x64-linux:$PATH ##install gafpack ##checkout to a specific version +##we need to git bisect in order to understand what is not working properly with the newest version RUN git clone https://github.com/ekg/gafpack.git \ && cd gafpack \ diff --git a/README.md b/README.md index bcc8d02..29a72d5 100644 --- a/README.md +++ b/README.md @@ -2,4 +2,4 @@ Workflow for genotyping from graph. -Docs [here](https://cosigtdoc.readthedocs.io/en/latest/) +Docs [here](https://davidebolo1993.github.io/cosigtdoc/) diff --git a/cosigt.go b/cosigt.go index a62efcd..37c4ed6 100644 --- a/cosigt.go +++ b/cosigt.go @@ -4,8 +4,10 @@ import ( "bufio" "compress/gzip" "encoding/csv" + "encoding/json" "fmt" "io" + "io/ioutil" "math" "os" "sort" @@ -21,6 +23,10 @@ import ( func GetMagnitude(A []float64, B []float64) float64 { + /* + Calculate magnitude + */ + var A_len, B_len float64 A_len = 0 @@ -38,6 +44,10 @@ func GetMagnitude(A []float64, B []float64) float64 { func GetDotProduct(A []float64, B []float64) float64 { + /* + Calculate dot product + */ + var dot_product float64 for i := 0; i < len(A); i++ { @@ -53,10 +63,9 @@ func GetDotProduct(A []float64, B []float64) float64 { func GetCosineSimilarity(A []float64, B []float64) float64 { /* - Simple calculation of cosine similarity + Calculate cosine similarity */ - var eucl_magn, dot_product, similarity float64 eucl_magn = GetMagnitude(A, B) @@ -80,10 +89,9 @@ func GetCosineSimilarity(A []float64, B []float64) float64 { func ReadBlacklist(s string) []string { /* - Samples we may want to exclude, one-per-line + Read file with paths to exclude - can be empty */ - ids := make([]string, 0, 100) f, _ := os.Open(s) @@ -104,7 +112,7 @@ func ReadBlacklist(s string) []string { func ReadGz(s string) ([]string, [][]float64) { /* - Read the gzip-compressed files + Read gafpack and odgi paths files */ cov := make([][]float64, 0, 1000) @@ -148,18 +156,59 @@ func ReadGz(s string) ([]string, [][]float64) { return id, cov } -func WriteMap(m map[string]float64, s string) { +func ReadJson(s string) map[string]string { - f, _ := os.Create(s) - defer f.Close() + /* + Read cluster json file + */ + + clstr:=make(map[string]string) + jsonFile,_ := os.Open(s) + defer jsonFile.Close() + byteValue, _ := ioutil.ReadAll(jsonFile) + json.Unmarshal([]byte(byteValue), &clstr) + + return clstr + +} + + + + +func WriteResults(m map[string]float64, keys *[]string, clstr map[string]string, s string, id string) { + + /* + Write results to sorted_combos and cosigt_genotype + */ + + f1, _ := os.Create(s + "/sorted_combos.tsv") + defer f1.Close() - w := csv.NewWriter(f) + f2, _ := os.Create(s + "/cosigt_genotype.tsv") + defer f2.Close() + + w := csv.NewWriter(f1) w.Comma = '\t' defer w.Flush() - for k, v := range m { + x := csv.NewWriter(f2) + x.Comma = '\t' + defer x.Flush() + + for i,k:=range (*keys) { + + haps:=strings.Split(k,"$") + + if i == 0 { + + //k, fmt.Sprintf("%.16f", v) + _ = x.Write([]string{"id", "h1", "h2", "c1", "c2", "cs"}) + _ = w.Write([]string{"h1", "h2", "c1", "c2", "cs"}) + _ = x.Write([]string{id,haps[0],haps[1],clstr[haps[0]],clstr[haps[1]],fmt.Sprintf("%.16f", m[k])}) + + } - _ = w.Write([]string{k, fmt.Sprintf("%.16f", v)}) + _ = w.Write([]string{haps[0],haps[1],clstr[haps[0]],clstr[haps[1]],fmt.Sprintf("%.16f", m[k])}) } @@ -167,6 +216,10 @@ func WriteMap(m map[string]float64, s string) { func SliceContains(s string, ids []string) bool { + /* + Check if substring in any string + */ + for _, x := range ids { if strings.Contains(s, x) { @@ -182,6 +235,9 @@ func SliceContains(s string, ids []string) bool { } func SumSlices(a []float64, b []float64) []float64 { + /* + Sum slices of floats + */ c := make([]float64, len(a)) @@ -206,6 +262,7 @@ func main() { p := parser.String("p", "paths", &argparse.Options{Required: true, Help: "gzip-compressed tsv file with path names and node coverages from odgi paths"}) g := parser.String("g", "gaf", &argparse.Options{Required: true, Help: "gzip-compressed gaf (graph alignment format) file for a sample from gafpack"}) b := parser.String("b", "blacklist", &argparse.Options{Required: false, Help: "txt file with names of paths to exclude (one per line)"}) + c := parser.String("c", "cluster", &argparse.Options{Required: false, Help: "cluster json file as generated with cluster.py"}) o := parser.String("o", "output", &argparse.Options{Required: true, Help: "folder prefix for output files"}) err := parser.Parse(os.Args) @@ -221,15 +278,17 @@ func main() { hapid, gcov := ReadGz(*p) //read second table smpl, bcov := ReadGz(*g) - //read blacklist - it can be empty or not + //read blacklist blck := ReadBlacklist(*b) + //read cluster .json file + clstr:= ReadJson(*c) //trace combinations we have already seen seen:=make(map[int]bool) //store results in map m := make(map[string]float64) //generate all possible diploid combination n := len(hapid) - k := 2 //fixed ploidy - this can be however adjusted + k := 2 //fixed ploidy gen := combin.NewCombinationGenerator(n, k) for gen.Next() { @@ -239,7 +298,7 @@ func main() { if len(blck) == 0 || (!SliceContains(hapid[h1], blck) && !SliceContains(hapid[h2], blck)) { //nothing to blacklist or both ids not in blacklist sum := SumSlices(gcov[h1], gcov[h2]) - indiv := (hapid[h1] + "-" + hapid[h2]) + indiv := (hapid[h1] + "$" + hapid[h2]) m[indiv] = GetCosineSimilarity(sum, bcov[0]) _,ok:=seen[h1] @@ -247,7 +306,7 @@ func main() { if !ok { sum=SumSlices(gcov[h1], gcov[h1]) - indiv= (hapid[h1] + "-" + hapid[h1]) + indiv= (hapid[h1] + "$" + hapid[h1]) m[indiv] = GetCosineSimilarity(sum, bcov[0]) seen[h1] = true @@ -258,7 +317,7 @@ func main() { if !ok { sum=SumSlices(gcov[h2], gcov[h2]) - indiv= (hapid[h2] + "-" + hapid[h2]) + indiv= (hapid[h2] + "$" + hapid[h2]) m[indiv] = GetCosineSimilarity(sum, bcov[0]) seen[h2] = true @@ -284,20 +343,8 @@ func main() { }) - //write combinations + //write genotype outpath:=path.Clean(*o) _ = os.Mkdir(outpath, os.ModePerm) - - combos:=path.Clean(outpath + "/combos.tsv") - WriteMap(m, combos) - - //write best score - best:=path.Clean(outpath + "/best_genotype.tsv") - f, _ := os.Create(best) - defer f.Close() - result := fmt.Sprintf("#sample\tbest_genotype\tbest_score\n%s\t%s\t%.16f\n", smpl[0], keys[0], m[keys[0]]) - _, _ = f.WriteString(result) - - //all done - + WriteResults(m,&keys,clstr,outpath,smpl[0]) } diff --git a/cosigt_smk/workflow/rules/bwa-mem2.smk b/cosigt_smk/workflow/rules/bwa-mem2.smk index a1643a4..d55a879 100644 --- a/cosigt_smk/workflow/rules/bwa-mem2.smk +++ b/cosigt_smk/workflow/rules/bwa-mem2.smk @@ -12,7 +12,7 @@ rule bwa_mem2_index: mem_mb=lambda wildcards, attempt: attempt * config['bwa-mem2']['mem_mb'], time=lambda wildcards, attempt: attempt * config['bwa-mem2']['time'] container: - 'docker://davidebolo1993/graph_genotyper:latest' + 'docker://davidebolo1993/cosigt_workflow:latest' shell: ''' bwa-mem2 index {input} @@ -34,7 +34,7 @@ rule bwa_mem2_samtools_sort: mem_mb=lambda wildcards, attempt: attempt * config['bwa-mem2']['mem_mb'], time=lambda wildcards, attempt: attempt * config['bwa-mem2']['time'] container: - 'docker://davidebolo1993/graph_genotyper:latest' + 'docker://davidebolo1993/cosigt_workflow:latest' shell: ''' bwa-mem2 mem -t {threads} {input.ref} {input.sample} | samtools sort -@ {threads} - > {output} diff --git a/cosigt_smk/workflow/rules/cosigt.smk b/cosigt_smk/workflow/rules/cosigt.smk index d18d516..3ffd85f 100644 --- a/cosigt_smk/workflow/rules/cosigt.smk +++ b/cosigt_smk/workflow/rules/cosigt.smk @@ -4,20 +4,21 @@ rule cosigt_genotype: ''' input: zpath=rules.odgi_paths_matrix.output, - xpack=rules.gafpack_coverage.output + xpack=rules.gafpack_coverage.output, + cluster=rules.cluster.output output: - geno=config['output'] + '/cosigt/{sample}/{region}/best_genotype.tsv', - combo=config['output'] + '/cosigt/{sample}/{region}/combos.tsv' + geno=config['output'] + '/cosigt/{sample}/{region}/cosigt_genotype.tsv', + combo=config['output'] + '/cosigt/{sample}/{region}/sorted_combos.tsv' threads: 1 resources: mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'], time=lambda wildcards, attempt: attempt * config['default']['time'] container: - 'docker://davidebolo1993/graph_genotyper:latest' + 'docker://davidebolo1993/cosigt_workflow:latest' params: prefix=config['output'] + '/cosigt/{sample}/{region}' shell: ''' - cosigt -p {input.zpath} -g {input.xpack} -b resources/extra/blacklist.txt -o {params.prefix} + cosigt -p {input.zpath} -g {input.xpack} -c {input.cluster} -b resources/extra/blacklist.txt -o {params.prefix} ''' \ No newline at end of file diff --git a/cosigt_smk/workflow/rules/extract.smk b/cosigt_smk/workflow/rules/extract.smk index 8e3f204..a8393c8 100644 --- a/cosigt_smk/workflow/rules/extract.smk +++ b/cosigt_smk/workflow/rules/extract.smk @@ -85,7 +85,7 @@ rule faidx: mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'], time=lambda wildcards, attempt: attempt * config['default']['time'] container: - 'docker://davidebolo1993/graph_genotyper:latest' + 'docker://davidebolo1993/cosigt_workflow:latest' shell: ''' samtools faidx {input} @@ -119,7 +119,7 @@ rule samtools_view: threads: config['samtools']['threads'] container: - 'docker://davidebolo1993/graph_genotyper:latest' + 'docker://davidebolo1993/cosigt_workflow:latest' params: ref=config['reference'], region='{region}' @@ -149,7 +149,7 @@ rule samtools_fasta: threads: config['samtools']['threads'] container: - 'docker://davidebolo1993/graph_genotyper:latest' + 'docker://davidebolo1993/cosigt_workflow:latest' resources: mem_mb=lambda wildcards, attempt: attempt * config['samtools']['mem_mb'], time=lambda wildcards, attempt: attempt * config['samtools']['time'] diff --git a/cosigt_smk/workflow/rules/gafpack.smk b/cosigt_smk/workflow/rules/gafpack.smk index 39f7188..b6a475f 100644 --- a/cosigt_smk/workflow/rules/gafpack.smk +++ b/cosigt_smk/workflow/rules/gafpack.smk @@ -13,7 +13,7 @@ rule gafpack_coverage: mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'], time=lambda wildcards, attempt: attempt * config['default']['time'] container: - 'docker://davidebolo1993/graph_genotyper:latest' + 'docker://davidebolo1993/cosigt_workflow:latest' shell: ''' gafpack \ diff --git a/cosigt_smk/workflow/rules/gfainject.smk b/cosigt_smk/workflow/rules/gfainject.smk index cec7fcf..6e912a5 100644 --- a/cosigt_smk/workflow/rules/gfainject.smk +++ b/cosigt_smk/workflow/rules/gfainject.smk @@ -13,7 +13,7 @@ rule gfa_inject: mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'], time=lambda wildcards, attempt: attempt * config['default']['time'] container: - 'docker://davidebolo1993/graph_genotyper:latest' + 'docker://davidebolo1993/cosigt_workflow:latest' params: shell: diff --git a/cosigt_smk/workflow/rules/odgi.smk b/cosigt_smk/workflow/rules/odgi.smk index 3ed5747..8b2566d 100644 --- a/cosigt_smk/workflow/rules/odgi.smk +++ b/cosigt_smk/workflow/rules/odgi.smk @@ -84,4 +84,28 @@ rule odgi_similarity: ''' odgi similarity \ -i {input} > {output} - ''' \ No newline at end of file + ''' + +rule cluster: + ''' + cluster + ''' + input: + rules.odgi_similarity.output + output: + config['output'] + '/cluster/{region}.json' + threads: + 1 + resources: + mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'], + time=lambda wildcards, attempt: attempt * config['default']['time'] + container: + 'docker://davidebolo1993/cosigt_workflow:latest' + params: + prefix=config['output'] + '/cluster/{region}' + shell: + ''' + python workflow/scripts/cluster.py \ + {input} \ + {params.prefix} + ''' \ No newline at end of file diff --git a/cosigt_smk/workflow/scripts/cluster.py b/cosigt_smk/workflow/scripts/cluster.py index f26db1c..5810e44 100644 --- a/cosigt_smk/workflow/scripts/cluster.py +++ b/cosigt_smk/workflow/scripts/cluster.py @@ -1,6 +1,5 @@ #!/usr/bin/python3 -import os import sys import json import pandas as pd @@ -25,21 +24,11 @@ def agglomerative(mtx,prefix): perform agglomerative clustering and plot dendrogram ''' - sil_score=[] - result=[] - d=dict() - agg=AgglomerativeClustering(distance_threshold=0, n_clusters=None, metric='precomputed', linkage='average') cluster=agg.fit(mtx) linkage_matrix=LinkageMatrix(agg) - - plt.figure() - dendrogram = hierarchy.dendrogram(linkage_matrix) - plt.ylabel('jaccard dissimilarity') - plt.tick_params(labelbottom = False, bottom = False) - plt.savefig(os.path.join(prefix, 'dendrogram.jaccard.dissimilarity.pdf')) - plt.close() + sil_score=[] #all silhouette scores range_n_clusters = range(2,mtx.shape[0]) @@ -53,38 +42,53 @@ def agglomerative(mtx,prefix): #get best k=sorted(sil_score, key=lambda x: x[1], reverse=True)[0][0] - #plot best - plt.figure(figsize=(10,6)) - dendrogram = hierarchy.dendrogram(linkage_matrix, truncate_mode = 'lastp', p = k) - plt.xlabel('haplotype group') - plt.ylabel('jaccard dissimilarity') - plt.xticks(rotation = 45) - plt.savefig(os.path.join(prefix, 'dendrogram.jaccard.bestcut.pdf')) - plt.close() + #results + hapdict=dict() agg=AgglomerativeClustering(n_clusters=k, metric='precomputed', linkage='average') cluster=agg.fit(mtx) groups=set(cluster.labels_) + haplotype_labels=list() - #result - for g in groups: - - group=list(np.take(mtx.columns.tolist(),np.where(cluster.labels_ == g))[0]) - result.append(group) - - with open(os.path.join(prefix, 'dendrogram.jaccard.bestcut.tsv'), 'w') as outfile: + with open(prefix + '.clusters.tsv', 'w') as outfile: - for g in result: + outfile.write('group\thaplotypes\n') - outfile.write(','.join(g) + '\n') + for i,g in enumerate(groups): + + group=list(np.take(mtx.columns.tolist(),np.where(cluster.labels_ == g))[0]) - for l in g: + for h in group: - d[l]=[x for x in g if x != l] + hapdict[h] = 'HG'+str(i) + outfile.write('HG'+str(i) + '\t' + ','.join(group) + '\n') + haplotype_labels.append('HG'+str(i) + ' (' + str(len(group)) + ')') - with open(os.path.join(prefix, 'dendrogram.jaccard.bestcut.json'), 'w') as outfile: + with open(prefix + '.clusters.json', 'w') as outfile: - json.dump(d, outfile, indent = 4) + json.dump(hapdict, outfile, indent = 4) + + dendrogram = hierarchy.dendrogram(linkage_matrix, truncate_mode = 'lastp', p = k) + hgl = {dendrogram['leaves'][ii]: haplotype_labels[ii] for ii in range(len(dendrogram['leaves']))} + + #plot best + plt.figure(figsize=(10,6)) + hierarchy.dendrogram( + linkage_matrix, + truncate_mode='lastp', # show only the last p merged clusters + p=k, # show only the last p merged clusters + leaf_label_func=lambda x: hgl[x], + leaf_rotation=45, + leaf_font_size=5, + show_contracted=True + ) + plt.xlabel('haplotype group (# haplotypes)') + plt.ylabel('jaccard dissimilarity') + plt.xticks(rotation = 45) + plt.savefig(prefix + '.clusters.pdf') + plt.close() + + def LinkageMatrix(model): @@ -103,7 +107,7 @@ def LinkageMatrix(model): if child_idx < n_samples: - current_count += 1 # leaf node + current_count += 1 #leaf node else: diff --git a/cosigt_smk/workflow/scripts/evaluate.smk b/cosigt_smk/workflow/scripts/evaluate.smk deleted file mode 100644 index f0c9926..0000000 --- a/cosigt_smk/workflow/scripts/evaluate.smk +++ /dev/null @@ -1,83 +0,0 @@ -import pandas as pd - -df=(pd.read_table(config['samples'], dtype={'sample_id': str, 'cram':str}) - .set_index('sample_id', drop=False) - .sort_index() -) - -rule evaluate_cosigt: - ''' - cosigt evaluation - ''' - input: - rules.cosigt_genotype.output.combo - output: - 'results/cosigt/{sample}/{region}/evaluation.tsv' - threads: - 1 - params: - samplename='{sample}' - shell: - ''' - sort -k 2 -n -r {input} | awk -v var="{params.samplename}" -F '{params.samplename}' '{{print NR "\t" NF-1 "\t" var "\t" $0}}' | sed 's/haplotype1-/haplotype1_/g' | sed 's/haplotype2-/haplotype2_/g' | sed -E 's/([0-9])\-([0-9])/\\1_\\2/g' |tr '-' '\t' | sed 's/haplotype1_/haplotype1-/g' | sed 's/haplotype2_/haplotype2-/g' > {output} - ''' - -rule plot_evaluation: - ''' - plot evaluation. Need to wait for all the evaluations table to be there - ''' - input: - expand('results/cosigt/{sample}/{region}/evaluation.tsv', sample=df['sample_id'].tolist(),region=config['region']), - lambda wildcards: glob('results/cosigt/*/{region}/evaluation.tsv'.format(region=wildcards.region)) - output: - 'results/cosigt/evaluation/{region}/evaluation.pdf' - threads: - 1 - conda: - '../envs/python.yml' - params: - region='{region}' - shell: - ''' - python workflow/scripts/tpr.py results/cosigt '' {output} {params.region} - ''' - -rule get_best_n_clusters: - ''' - cluster haplotypes by similarity - ''' - input: - mtx=rules.odgi_similarity.output - output: - 'results/cosigt/evaluation/{region}/dendrogram.jaccard.bestcut.json' - threads: - 1 - conda: - '../envs/python.yml' - params: - region='{region}' - shell: - ''' - python workflow/scripts/cluster.py {input.mtx} results/cosigt/evaluation/{params.region} - ''' - -rule plot_evaluation_dendro_jaccard: - ''' - plot evaluation using helper dendrogram - based on jaccard distance - ''' - input: - expand('results/cosigt/{sample}/{region}/evaluation.tsv', sample=df['sample_id'].tolist(),region=config['region']), - lambda wildcards: glob('results/cosigt/*/{region}/evaluation.tsv'.format(region=wildcards.region)), - json=rules.get_best_n_clusters.output - output: - 'results/cosigt/evaluation/{region}/evaluation.jaccard.pdf' - threads: - 1 - conda: - '../envs/python.yml' - params: - region='{region}' - shell: - ''' - python workflow/scripts/tpr.py results/cosigt {input.json} {output} {params.region} - ''' diff --git a/cosigt_smk/workflow/scripts/make_heatmap.r b/cosigt_smk/workflow/scripts/make_heatmap.r deleted file mode 100644 index fb1a8a4..0000000 --- a/cosigt_smk/workflow/scripts/make_heatmap.r +++ /dev/null @@ -1,83 +0,0 @@ -library(data.table) -library(ggplot2) - -args <- commandArgs(trailingOnly = TRUE) - -df<-data.frame(fread(args[1])) -selected_paths<-df$path.name -df<-df[,c(2:ncol(df))] -df_out<-matrix(0, nrow=nrow(df)*ncol(df), ncol=3) - -n<-0 -for (i in c(1:nrow(df))) { - - print(i) - for (j in c(1:ncol(df))) { - - n<-n+1 - df_out[n,1]<- selected_paths[i] - df_out[n,2]<- j - df_out[n,3]<- df[i,j] - - } - -} -df<-data.frame(df_out) -colnames(df)<-c('x', 'y', 'z') - -df$y<-as.numeric(df$y) -df$z<-as.numeric(df$z) - - -p_all <- ggplot(df, aes(x = y, y = x, fill = z)) + - geom_raster() + - scale_fill_gradient2( - oob = scales::squish - ) + - theme_void() + - theme(axis.text.x = element_text(), axis.title.x=element_text())+ - theme(axis.text.y = element_text(size=5))+ - theme(legend.position = "none") + - labs(x="node #") - -ggsave("all.pdf", width=15, height=12) - -selected_paths<-c("chr1_hg19_103998686_104406594_0", "chr1_hg38_103456064_103863972_0", "chr1_chm13_103304997_103901127_0", "HG01175#1#JAHAMA010000003.1_33204707_33808232_1", "HG02109#1#JAHEPG010000026.1_16673035_17080816_0", "HG002#2#JAHKSD010000021.1_102757493_103165350_0", "HG02257#2#JAGYVH010000034.1_98190388_98857961_0") -df<-subset(df, x%in%selected_paths) -#df$x<-factor(df$x,levels=selected_paths) -df$y<-as.numeric(df$y) -df$z<-as.numeric(df$z) - -p <- ggplot(df, aes(x = y, y = x, fill = z)) + - geom_raster() + - scale_fill_gradient2( - oob = scales::squish - ) + - theme_void() + - theme(axis.text.x = element_text(), axis.title.x=element_text())+ - theme(axis.text.y = element_text())+ - theme(legend.position = "none") + - labs(x="node #") - -ggsave("zoom1.pdf", width=15, height=2) - -#zoom on selected nodes -#example_nodes -nodes<-c(9800:9850) -df_sub<-subset(df, y%in%nodes) - -p_zoom <- ggplot(df_sub, aes(x = y, y = x, fill = z)) + - geom_tile(color = "white", - lwd = .5, - linetype = 1) + - geom_text(aes(label = z), color = "white", size = 4) + - scale_fill_gradient2( - oob = scales::squish - ) + - theme_void() + - theme(axis.text.x = element_text(), axis.title.x=element_text())+ - theme(axis.text.y = element_text(size=5))+ - theme(legend.position = "none") + - labs(x="node #") - -ggsave("zoom2.pdf", width=15, height=2) diff --git a/cosigt_smk/workflow/scripts/tpr.py b/cosigt_smk/workflow/scripts/tpr.py deleted file mode 100644 index ef0b1ae..0000000 --- a/cosigt_smk/workflow/scripts/tpr.py +++ /dev/null @@ -1,159 +0,0 @@ -#!/usr/bin/python3 - -import json -import sys -import os -import glob -import pandas as pd -import numpy as np -import matplotlib.pyplot as plt - -def infiles(infolder,region): - - ''' - find evaluation tables - ''' - - return glob.glob(infolder + '/*/' + region +'/evaluation.tsv') - - -def read_json(injson): - - ''' - read json with similarities between haplotypes - ''' - - try: - - with open(injson, 'r') as j: - - data=json.load(j) - - return data - - except: - - return None - -def GetHaplotypes(sample,df): - - ''' - get possible haplotype names - ''' - - out=[] - - for k in set(df[3].tolist() + df[4].tolist()): - - if sample in k: - - out.append(k) - - return out - - - -def eval_sample(infile,json): - - ''' - evaluate - ''' - - df=pd.read_table(infile, sep='\t', header=None) - sample=list(df.head(1)[2])[0] - haplos=GetHaplotypes(sample,df) - vec=np.zeros(df.shape[0]) - - h1 = [haplos[0]] - - if json is not None: - - if h1[0] in json: - - h1.extend(json[h1[0]]) - - if len(haplos) > 1: - - h2=[haplos[1]] - - if json is not None: - - if h2[0] in json: - - h2.extend(json[h2[0]]) - - else: #h2 can be anything - - h2=list(set(df[3].tolist() + df[4].tolist())) - - for i,row in df.iterrows(): - - if (row[3] in h1 and row[4] in h2) or (row[3] in h2 and row[4] in h1): - - vec[i:]=1 - break - - else: - - vec[i] = 0 - - return (sample, vec) - - -def main(): - - ''' - calculate true positive rate - ''' - - #parallelize at region level - eval_files=infiles(os.path.abspath(sys.argv[1]),sys.argv[4]) - jsonfile=read_json(os.path.abspath(sys.argv[2])) - vectors,samples=[],[] - - for f in eval_files: - - sample,eval_vec=eval_sample(f,jsonfile) - vectors.append(eval_vec) - samples.append(sample) - - result=np.zeros(len(vectors[0])) - - with open(os.path.abspath(sys.argv[3]).replace('.pdf', '.badsamples.tsv'), 'w') as outreport: - - for i in range(len(vectors[0])): - - nogood=[] - sum=0 - - for l,el in enumerate(vectors): #for each combination, store results plus names of no_good samples - - if el[i] != 1: - - nogood.append(samples[l]) - - sum+=el[i] - - result[i] = sum/len(vectors) - outreport.write(str(i+1) + '\t' + ','.join(nogood) + '\n') - - if jsonfile is not None: - - cut=os.path.basename(sys.argv[2]).split('.')[1] - - else: - - cut = 'no' - - plt.title('Evaluation - ' + cut + ' dendrogram') - plt.ylim([0, 1]) - plt.plot(list(range(len(result))), result, marker='*') - plt.xlabel('Combination index') - plt.ylabel('True positive rate') - plt.savefig(os.path.abspath(sys.argv[3])) - - - -if __name__ == '__main__': - - main()