Skip to content

Commit

Permalink
adding debug rule, automated coverage estimation for plotting
Browse files Browse the repository at this point in the history
  • Loading branch information
davidebolo1993 committed Jul 25, 2024
1 parent 2e23758 commit 5d2f16a
Show file tree
Hide file tree
Showing 8 changed files with 316 additions and 4 deletions.
9 changes: 9 additions & 0 deletions cosigt_smk/workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,16 @@ include: 'rules/bwa-mem2.smk'
include: 'rules/gfainject.smk'
include: 'rules/gafpack.smk'
include: 'rules/cosigt.smk'
include: 'rules/megadepth.smk'

rule cosigt:
input:
expand(config['output'] + '/cosigt/{sample}/{region}/cosigt_genotype.tsv', sample=df['sample_id'].tolist(), region=config['region'])

rule annotate:
input:
expand(config['output'] + '/odgi/untangle/{region}.gggenes.tsv', region=config['region'])

rule debug:
input:
expand(config['output'] + '/megadepth/{sample}/{region}.all.bw', sample=df['sample_id'].tolist(), region=config['region'])
1 change: 1 addition & 0 deletions cosigt_smk/workflow/rules/bwa-mem2.smk
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,4 @@ rule bwamem2_mem_samtools_sort:
-@ {threads} \
- > {output}
'''

32 changes: 32 additions & 0 deletions cosigt_smk/workflow/rules/cosigt.smk
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,38 @@ rule cosigt_genotype:
prefix=config['output'] + '/cosigt/{sample}/{region}'
benchmark:
'benchmarks/{sample}.{region}.cosigt_genotype.benchmark.txt'
shell:
'''
cosigt \
-p {input.zpath} \
-g {input.xpack} \
-c {input.cluster} \
-b resources/extra/blacklist.txt \
-o {params.prefix}
'''

rule cosigt_genotype2:
'''
https://github.com/davidebolo1993/cosigt
'''
input:
zpath=rules.odgi_paths_matrix.output,
xpack=rules.gafpack_coverage.output,
cluster=rules.make_clusters2.output
output:
geno=config['output'] + '/cosigt2/{sample}/{region}/cosigt_genotype.tsv',
combo=config['output'] + '/cosigt2/{sample}/{region}/sorted_combos.tsv'
threads:
1
resources:
mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'],
time=lambda wildcards, attempt: attempt * config['default']['time']
container:
'docker://davidebolo1993/cosigt_workflow:latest'
params:
prefix=config['output'] + '/cosigt2/{sample}/{region}'
benchmark:
'benchmarks/{sample}.{region}.cosigt_genotype2.benchmark.txt'
shell:
'''
cosigt \
Expand Down
26 changes: 26 additions & 0 deletions cosigt_smk/workflow/rules/megadepth.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
rule megadepth_bam_to_bigwig:
'''
https://github.com/ChristopherWilks/megadepth
'''
input:
rules.bwamem2_mem_samtools_sort.output
output:
config['output'] + '/megadepth/{sample}/{region}.all.bw'
threads:
1
resources:
mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'],
time=lambda wildcards, attempt: attempt * config['default']['time']
container:
'docker://davidebolo1993/cosigt_workflow:latest'
benchmark:
'benchmarks/{sample}.{region}.megadepth_bam_to_bigwig.benchmark.txt'
params:
prefix=config['output'] + '/megadepth/{sample}/{region}
shell:
'''
megadepth \
--bigwig \
--prefix {params.prefix} \
{input}
'''
158 changes: 157 additions & 1 deletion cosigt_smk/workflow/rules/odgi.smk
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,33 @@ rule odgi_paths_matrix:
cut -f 1,4- | gzip > {output}
'''


rule cosine_distance_from_paths:
'''
https://github.com/davidebolo1993/cosigt
'''

input:
rules.odgi_paths_matrix.output
output:
config['output'] + '/cosine_paths/{region}.tsv'
threads:
1
resources:
mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'],
time=lambda wildcards, attempt: attempt * config['default']['time']
container:
'docker://davidebolo1993/cosigt_workflow:latest'
benchmark:
'benchmarks/{region}.cosine_distance_from_paths.benchmark.txt'
shell:
'''
python workflow/scripts/combos.py \
{input} \
{output}
'''


rule odgi_similarity:
'''
https://github.com/pangenome/odgi
Expand All @@ -95,6 +122,7 @@ rule odgi_similarity:
-i {input} > {output}
'''


rule make_clusters:
'''
https://github.com/davidebolo1993/cosigt
Expand All @@ -119,4 +147,132 @@ rule make_clusters:
python workflow/scripts/cluster.py \
{input} \
{params.prefix}
'''
'''


#plot structures when using rule annotate
rule odgi_procbed:
'''
https://github.com/pangenome/odgi
'''
input:
og=rules.pggb_construct.output,
bed='resources/annotations/{region}.bed'
output:
config['output'] + '/odgi/procbed/{region}.bed'
threads:
1
resources:
mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'],
time=lambda wildcards, attempt: attempt * config['default']['time']
container:
'docker://pangenome/odgi:1707818641'
benchmark:
'benchmarks/{region}.odgi_procbed.benchmark.txt'
shell:
'''
odgi procbed \
-i {input.og} \
-b {input.bed} > {output}
'''


rule annot_names:
'''
https://github.com/davidebolo1993/cosigt
'''
input:
rules.odgi_procbed.output
output:
config['output'] + '/odgi/procbed/{region}.names.txt'
threads:
1
resources:
mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'],
time=lambda wildcards, attempt: attempt * config['default']['time']
benchmark:
'benchmarks/{region}.annot_names.benchmark.txt'
shell:
'''
cat <(cut -f 4 {input}) <(cut -f 4 {input} | sed 's/$/_inv/g') > {output}
'''


rule odgi_inject:
'''
https://github.com/pangenome/odgi
'''
input:
og=rules.pggb_construct.output,
bed=rules.odgi_procbed.output
output:
config['output'] + '/odgi/inject/{region}.og'
threads:
1
resources:
mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'],
time=lambda wildcards, attempt: attempt * config['default']['time']
container:
'docker://pangenome/odgi:1707818641'
benchmark:
'benchmarks/{region}.odgi_inject.benchmark.txt'
shell:
'''
odgi inject \
-i {input.og} \
-b {input.bed} \
-o {output}
'''


rule odgi_flip:
'''
https://github.com/pangenome/odgi
'''
input:
rules.odgi_inject.output,
output:
config['output'] + '/odgi/flip/{region}.og'
threads:
1
resources:
mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'],
time=lambda wildcards, attempt: attempt * config['default']['time']
container:
'docker://pangenome/odgi:1707818641'
benchmark:
'benchmarks/{region}.odgi_flip.benchmark.txt'
shell:
'''
odgi flip \
-i {input} \
-o {output}
'''


rule odgi_untangle:
'''
https://github.com/pangenome/odgi
'''
input:
og=rules.odgi_flip.output,
annots=rules.annot_names.output
output:
config['output'] + '/odgi/untangle/{region}.gggenes.tsv'
threads:
1
resources:
mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'],
time=lambda wildcards, attempt: attempt * config['default']['time']
container:
'docker://pangenome/odgi:1707818641'
benchmark:
'benchmarks/{region}.odgi_untangle.benchmark.txt'
shell:
'''
odgi untangle \
-R {input.annots} \
-i {input.og} \
-j 0.3 \
-g > {output}
'''
7 changes: 4 additions & 3 deletions cosigt_smk/workflow/scripts/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ def read_tsv(tsv):
'''
read tsv with distances between haplotypes from odgi similarity
'''

return 1-pd.read_table(tsv).set_index(['group.a', 'group.b'])['jaccard.similarity'].sort_index().unstack()
return 1-pd.read_table(tsv).pivot(index='group.a',columns ='group.b',values='jaccard.similarity')

def agglomerative(mtx,prefix):

Expand All @@ -30,7 +30,8 @@ def agglomerative(mtx,prefix):

sil_score=[] #all silhouette scores

range_n_clusters = range(2,mtx.shape[0])
maxc=round(mtx.shape[0]/100*33)
range_n_clusters = range(2,maxc)

for i,n_clusters in enumerate(range_n_clusters):

Expand Down
22 changes: 22 additions & 0 deletions cosigt_smk/workflow/scripts/plotcoverage.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/usr/bin/Rscript

args <- commandArgs(trailingOnly = TRUE)

library(rtracklayer)
library(ggplot2)

files<-sort(list.files(args[1], pattern="*.all.bw", full.names=T,recursive=T))

for (f in files) {

name<-basename(dirname(f))
bw <- import.bw(f)
df<- data.frame(bw)
p<-ggplot(df, aes(x=start, y=score, group=seqnames)) +
geom_point(show.legend=F) +
geom_smooth(show.legend=F) +
facet_wrap(~seqnames, ncol=5, scales="free_x") +
theme_linedraw()
ggsave(paste0(f,".pdf"),width=20, height=20)

}
65 changes: 65 additions & 0 deletions cosigt_smk/workflow/scripts/plotgggenes.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#!/usr/bin/Rscript

args <- commandArgs(trailingOnly = TRUE)

library(ggplot2)
library(gggenes)
library(rjson)

x <- data.table::fread(args[1])
y <- fromJSON(file=args[2])
reference<-args[3]

if (!(reference %in% x$molecule)) {

x$strand<-abs(x$strand-1)

}

x$old_label<-x$molecule
x$molecule<-gsub("_inv", "", x$molecule)
c<-c()

for (i in c(1:nrow(x))) {

if (x$molecule[i] %in% names(y)) {

c<-c(c, y[[x$molecule[i]]])

} else {

c<-c(c, "reference")
}

}

x$c<-c
x<-x[order(c),]
x<-subset(x, c != "reference")

x$ymin<-0
x$ymax<-0
x$molecule<-as.numeric(factor(x$molecule, levels=unique(x$molecule)))

for (cl in unique(x$c)) {

s<-subset(x, (c == cl))
first<-as.numeric(head(s,1)$molecule)-0.4
last<-as.numeric(tail(s,1)$molecule)+0.4
x[which(x$c == cl),]$ymin<-first
x[which(x$c == cl),]$ymax<-last

}

p<-ggplot(x, aes(xmin=start, xmax=end, y=molecule, fill=gene, forward=strand, label=gene)) +
geom_gene_arrow(arrowhead_height = unit(3, "mm"), arrowhead_width = unit(1, "mm"), show.legend=FALSE) +
scale_fill_brewer(palette = "Set3") +
geom_rect(data= data.frame(ymin=unique(x$ymin),ymax=unique(x$ymax),cluster=unique(x$c)), inherit.aes = FALSE, mapping=aes(xmin = -Inf, xmax = +Inf, ymin = ymin, ymax = ymax, color = cluster), alpha=0.4, show.legend=TRUE, linewidth=1, fill=NA) +
#scale_color_brewer(palette = "Dark2") +
geom_gene_label(align = "right") +
theme_genes() +
labs(y="haplotype") +
scale_y_continuous(breaks=c(1:length(unique(x$old_label))), labels=unique(x$old_label))+
guides(fill = "none")

ggsave(paste0(args[3], ".pdf"), height=20, width=20)

0 comments on commit 5d2f16a

Please sign in to comment.