Commit

pore-c pipeline fixed
Dmitry-Antipov committed Aug 17, 2024
1 parent 613634b commit 49adbfe
Showing 5 changed files with 46 additions and 58 deletions.
9 changes: 6 additions & 3 deletions src/Snakefiles/8-hicPipeline.sm
@@ -510,7 +510,7 @@ cd 8-hicPipeline
cat > ./hic_phasing.sh <<EOF
#!/bin/sh
set -e
{PYTHON} {VERKKO}/scripts/hicverkko.py {params.NO_RDNA} {params.UNEVEN_DEPTH} .
{PYTHON} {VERKKO}/scripts/launch_phasing.py {params.NO_RDNA} {params.UNEVEN_DEPTH} .
EOF

chmod +x ./hic_phasing.sh
@@ -577,6 +577,7 @@ chmod +x ./rukki_hic.sh
'''

#TODO: python script params

rule HiC_rdnascaff:
input:
graph = '8-hicPipeline/unitigs.hpc.noseq.gfa',
@@ -585,12 +586,14 @@ rule HiC_rdnascaff:
unitigs_telo = '8-hicPipeline/unitigs.telo',
unitigs_nohpc50 = '8-hicPipeline/unitigs_nonhpc50.mashmap',
path_mashmap = rules.prepareScaffolding.output.path_mashmap,
prefiltered_bam = rules.scaffoldMergeBWA.output.alignments
prefiltered_aln = rules.scaffoldMergeBWA.output.alignments if config['withPOREC'] == "False" else rules.transformBWA.output.byread_mapping
output:
scaff_tsv_path = '8-hicPipeline/scaff_rukki.paths.tsv',
scaff_gaf_path = '8-hicPipeline/scaff_rukki.paths.gaf',
final_tsv_path = '8-hicPipeline/rukki.paths.tsv',
final_gaf_path = '8-hicPipeline/rukki.paths.gaf',
params:
withporec = config['withPOREC']
log:
err = '8-hicPipeline/hic_scaff.err'
threads:
@@ -607,7 +610,7 @@ cd 8-hicPipeline
cat > ./hic_scaff.sh <<EOF
#!/bin/sh
set -e
{PYTHON} {VERKKO}/scripts/rdna_scaff.py .
{PYTHON} {VERKKO}/scripts/launch_scaffolding.py . {params.withporec}


cp ../{output.scaff_tsv_path} ../{output.final_tsv_path}
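The change above wires the new Pore-C mode into the scaffolding rule: the alignment input is chosen with a Python conditional on config['withPOREC'], the same flag is exposed through params, and it is forwarded as the second argument of launch_scaffolding.py. A stripped-down Snakemake sketch of that pattern, with hypothetical rule and input file names (only scaff_rukki.paths.tsv and the script invocation come from the diff):

config.setdefault("withPOREC", "False")

rule scaffold_example:
    input:
        # Hi-C runs take the prefiltered BAM, Pore-C runs take the per-read mapping table
        aln = ("example_prefiltered_hic.bam" if config["withPOREC"] == "False"
               else "example_porec.byread.output")
    output:
        "scaff_rukki.paths.tsv"
    params:
        withporec = config["withPOREC"]
    shell:
        "python launch_scaffolding.py . {params.withporec}"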
4 changes: 2 additions & 2 deletions src/main.mk
@@ -285,10 +285,10 @@ FILES += \
scripts/circularize_ctgs.py -> ../lib/verkko/scripts/circularize_ctgs.py \
scripts/cluster.py -> ../lib/verkko/scripts/cluster.py \
scripts/graph_functions.py -> ../lib/verkko/scripts/graph_functions.py \
scripts/hicverkko.py -> ../lib/verkko/scripts/hicverkko.py \
scripts/launch_phasing.py -> ../lib/verkko/scripts/launch_phasing.py \
scripts/parse_sam_pairs.py -> ../lib/verkko/scripts/parse_sam_pairs.py \
scripts/hic_prefilter.py -> ../lib/verkko/scripts/hic_prefilter.py \
scripts/rdna_scaff.py -> ../lib/verkko/scripts/rdna_scaff.py \
scripts/launch_scaffolding.py -> ../lib/verkko/scripts/launch_scaffolding.py \
scripts/remove_nodes_add_telomere.py -> ../lib/verkko/scripts/remove_nodes_add_telomere.py \
scripts/scaffolding/logger_wrap.py -> ../lib/verkko/scripts/scaffolding/logger_wrap.py \
scripts/scaffolding/match_graph.py -> ../lib/verkko/scripts/scaffolding/match_graph.py \
File renamed without changes.
9 changes: 6 additions & 3 deletions src/scripts/{rdna_scaff.py → launch_scaffolding.py}
@@ -18,7 +18,7 @@
debug = False

run_dir = sys.argv[1]

porec = (sys.argv[2] == "True")
#TODO: deprecated
telomere_script = "/data/antipovd2/devel/scripts/verkko_output_analysis//getOnlyTelomere.sh"
hap_assign_script = "/data/antipovd2/devel/scripts/verkko_output_analysis//get.sh"
@@ -50,7 +50,10 @@
#path_mashmap = os.path.join(hicrun_dir, "paths2chm13100K.mashmap")
rukki_path_storage = path_storage.PathStorage(G)
rukki_path_storage.readFromFile(old_rukki_tsv_file)
#sf.try_to_scaff(rukki_paths, telomere_locations_file, os.path.join(hicrun_dir, "hic_mapping.byread.output"), os.path.join(hicrun_dir, "unitigs.matches"), G, indirectG, uncompressed_nodes)
sg = scaffold_graph.ScaffoldGraph(rukki_path_storage, telomere_locations_file, os.path.join(hicrun_dir, "hic_nodefiltered.bam"), os.path.join(hicrun_dir, "unitigs_nonhpc50.mashmap"), G, uncompressed_nodes, path_mashmap, logger)
if porec:
read_aln = os.path.join(hicrun_dir, "hic_mapping.byread.output")
else:
read_aln = os.path.join(hicrun_dir, "hic_nodefiltered.bam")
sg = scaffold_graph.ScaffoldGraph(rukki_path_storage, telomere_locations_file, read_aln, os.path.join(hicrun_dir, "unitigs_nonhpc50.mashmap"), G, uncompressed_nodes, path_mashmap, porec, logger)
res = sg.generateScaffolds()
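With the second positional argument added above, the launcher is now invoked roughly as python launch_scaffolding.py . True for Pore-C runs and python launch_scaffolding.py . False for Hi-C runs. A minimal, self-contained sketch of that argument handling and of the alignment-file switch — the helper function and its name are illustrative, not the script's real structure:

import os
import sys

def select_read_alignments(run_dir, porec):
    # Pore-C runs use the per-read mapping table; Hi-C runs use the prefiltered BAM.
    if porec:
        return os.path.join(run_dir, "hic_mapping.byread.output")
    return os.path.join(run_dir, "hic_nodefiltered.bam")

if __name__ == "__main__":
    run_dir = sys.argv[1]            # e.g. "." when run inside 8-hicPipeline
    porec = (sys.argv[2] == "True")  # the flag arrives as the literal string "True" or "False"
    print(select_read_alignments(run_dir, porec))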

82 changes: 32 additions & 50 deletions src/scripts/scaffolding/scaffold_graph.py
@@ -112,7 +112,7 @@ class ScaffoldGraph:
#to check whether paths have similar lengths, in cheating t2t connection
SIMILAR_LEN_FRAC = 0.7

def __init__(self, rukki_paths, telomere_locations_file, hic_alignment_file, matches_file, G, uncompressed_fasta, path_mashmap, logger):
def __init__(self, rukki_paths, telomere_locations_file, hic_alignment_file, matches_file, G, uncompressed_fasta, path_mashmap, porec, logger):
self.logger = logger_wrap.UpdatedAdapter(logger, self.__class__.__name__)
self.rukki_paths = rukki_paths
self.uncompressed_lens = gf.get_lengths(uncompressed_fasta)
@@ -126,17 +126,15 @@ def __init__(self, rukki_paths, telomere_locations_file, hic_alignment_file, mat
self.logger.debug(self.tel_nodes)

self.match_graph = match_graph.MatchGraph(matches_file, G, -239239239, ScaffoldGraph.MATCHGRAPH_LONG_NODE, ScaffoldGraph.MATCHGRAPH_MIN_ALIGNMENT, logger)


all_connections, unique_connections = self.get_connections_bam(hic_alignment_file, interesting_nodes, True)


if porec == True:
self.logger.info ("Loading pore-c alignments")
all_connections, unique_connections = self.get_connections_porec(hic_alignment_file, True)
else:
self.logger.info ("Loading Hi-C alignments")
all_connections, unique_connections = self.get_connections_bam(hic_alignment_file, True)
self.output_basename = "scaff_rukki.paths"


#presaved_pathscores = self.loadPresavedScores("precomputed.pathscores")

#Used for scaffolding starts and debug,
telomeric_ends = self.getTelomericEnds()
self.hic_alignment_file = hic_alignment_file

@@ -210,15 +208,6 @@ def isHaploidPath(self, paths, nor_path_id):
nor_node = or_node.strip("-+")

for next in self.match_graph.getHomologousNodes(nor_node, True):
''''
if len (nodes_to_path_ids[next]) > 1:
#soemthing weird, ignoring
continue
next_id = nodes_to_path_ids[next][0]
if not (next_id in homs):
homs[next_id] = 0
homs[next_id] += self.match_graph.getEdgeAttribute(nor_node, next, 'homology_len')
'''
total_hom += self.match_graph.getEdgeAttribute(nor_node, next, 'homology_len')
path_len = paths.getLength(nor_path_id)
#TODO: should exclude homozygous nodes here
@@ -380,7 +369,6 @@ def isNextByRef(self, from_path_id, to_path_id):
if aligned[mid].query_len * ScaffoldGraph.INTERMEDIATE_PATH_FRACTION < inconsistency_len:
return False
return True

return False

def forbiddenPair(self, from_path_id, to_path_id):
@@ -834,44 +822,38 @@ def outputScaffolds(self, final_paths):
tsv_file.write(final_paths.getPathTsv(path_id) + "\n")
gaf_file.write(final_paths.getPathGaf(path_id) + "\n")
return

#returns: dict {(start_id, end_id):[[start_pos1, end_pos1]]}. Coords not compressed!
def get_connections(self, alignment_file, use_multimappers:bool):
#TODO: this works with hic_mapping.byread.output file that should be exterminated after phasing refactoring
def get_connections_porec(self, alignment_file, use_multimappers:bool):
res = {}
unique_storage = {}
#A01660:39:HNYM7DSX3:1:1101:1696:29982 utig4-73 utig4-1056 1 16949880 78591191
ind = 0
for line in open (alignment_file):
ind += 1
if (ind % 1000000 == 0):
self.logger.debug (f"Processed {ind} alignments")
if (ind % 10000000 == 0):
self.logger.debug (f"Processed {ind} pore-c alignments")

arr = line.split()
if not (use_multimappers) and (arr[1].find(",") != -1 or arr[2].find(",") != -1):
continue
first = arr[1].split(",")
second = arr[2].split(",")
first_coords = arr[4].split(",")
second_coords = arr[5].split(",")
weight = 1 / (len (first) * len(second))
#efficiently 1/2, 1/3, 1/4, 2/2 are allowed for now.
if weight < 0.24:
continue

for i in range (0, len(first)):
node_f = first[i]
for j in range (0, len(second)):
node_s = second[j]
if not (node_f, node_s) in res:
res[(node_f, node_s)] = []
next = [int(first_coords[i]), int(second_coords[j]), weight]
res[(node_f, node_s)].append(next)

if not (node_s, node_f) in res:
res[(node_s, node_f)] = []
res[(node_s, node_f)].append([int(second_coords[j]), int(first_coords[i]), weight])
return res

def get_connections_bam(self, bam_filename, interesting_names, use_multimappers:bool):
first = arr[1]
second = arr[2]
first_coords = int(arr[4])
second_coords = int(arr[5])
if first > second:
first, second = second, first
first_coords, second_coords = second_coords, first_coords
weight = self.INT_NORMALIZATION
names_pair = (first, second)
coords_pair = (first_coords, second_coords)
if not names_pair in res:
res[names_pair] = []
if not names_pair in unique_storage:
unique_storage[names_pair] = []
res[names_pair].append((coords_pair[0], coords_pair[1], weight))
unique_storage[names_pair].append((coords_pair[0], coords_pair[1], weight))
self.logger.debug (f"Finally processed {ind} pore-c alignments")
return res, unique_storage

def get_connections_bam(self, bam_filename, use_multimappers:bool):
res = {}
unique_storage = {}
#A01660:39:HNYM7DSX3:1:1101:1696:29982 utig4-73 utig4-1056 1 16949880 78591191
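To make the new connection table concrete: each line of the Pore-C per-read output names a read, two unitigs, and one coordinate on each, and get_connections_porec above keys the result on a canonically ordered node pair, swapping the coordinates along with the names. A small worked sketch of that per-line parsing (the helper and the use of defaultdict are illustrative, and INT_NORMALIZATION's real value is not shown in this diff, so 1 is used as a placeholder):

from collections import defaultdict

INT_NORMALIZATION = 1  # placeholder weight; the class constant's value is not part of this diff

def add_porec_connection(res, line):
    arr = line.split()
    first, second = arr[1], arr[2]                        # the two unitig names
    first_coord, second_coord = int(arr[4]), int(arr[5])  # their alignment coordinates
    if first > second:                                    # canonical (sorted) pair key
        first, second = second, first
        first_coord, second_coord = second_coord, first_coord
    res[(first, second)].append((first_coord, second_coord, INT_NORMALIZATION))

res = defaultdict(list)
add_porec_connection(res, "A01660:39:HNYM7DSX3:1:1101:1696:29982 utig4-73 utig4-1056 1 16949880 78591191")
print(dict(res))  # {('utig4-1056', 'utig4-73'): [(78591191, 16949880, 1)]}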
