telomeres fixed, to_scaff removed
Dmitry-Antipov committed Jul 16, 2024
1 parent 79f474d commit 11b3da6
Showing 4 changed files with 76 additions and 56 deletions.
12 changes: 9 additions & 3 deletions src/scripts/graph_functions.py
@@ -203,16 +203,20 @@ def getComponentColors(G):
current_color += 1
return component_colors


def get_telomeric_nodes(telomere_locations_file, G):
aux_tel_nodes = set()
new_G = G.copy()
CLOSE_ENOUGH = 20000
with open(telomere_locations_file) as f:
for l in f:
parts = l.strip().split('\t')
telnode = ""
graph_node = parts[0]
if int(parts[1]) < 20000:

to_start = int(parts[1])
to_end = int(parts[3]) - int(parts[2])
#Do not want to add telomeres to both ends of a short telomeric node
if to_start < CLOSE_ENOUGH and to_start <= to_end:
telnode="telomere_"+graph_node+"+_start"
new_G.add_node(telnode, length=0, coverage=0)
new_G.add_edge(telnode, graph_node+'+', mid_length=G.nodes[graph_node+'+']['length'])
@@ -223,7 +227,7 @@ def get_telomeric_nodes(telomere_locations_file, G):
new_G.add_edge(graph_node+'-', telnode, mid_length=G.nodes[graph_node+'+']['length'])
aux_tel_nodes.add(telnode)

if int(parts[2])+20000 > G.nodes[graph_node + '+']['length']:
elif to_end < CLOSE_ENOUGH and to_end < to_start:
telnode="telomere_"+graph_node+"+_end"
new_G.add_node(telnode, length=0, coverage=0)
new_G.add_edge(graph_node+'+', telnode, mid_length=G.nodes[graph_node+'+']['length'])
@@ -233,6 +237,8 @@ def get_telomeric_nodes(telomere_locations_file, G):
new_G.add_node(telnode, length=0, coverage=0)
new_G.add_edge(telnode, graph_node+'-', mid_length=G.nodes[graph_node+'+']['length'])
aux_tel_nodes.add(telnode)
else:
sys.stderr.write(f"Warning: telomere location {l} is not close enough to any ends of the contig, skipping\n")
return aux_tel_nodes, new_G


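For reference, a minimal sketch (not part of the commit) of the end-selection rule the new code implements, assuming each telomere location line is tab-separated as node, telomere start, telomere end, node length:

CLOSE_ENOUGH = 20000

def classify_telomere_end(tel_start, tel_end, node_length, close_enough=CLOSE_ENOUGH):
    # illustrative helper, not the committed code
    to_start = tel_start                 # distance from the contig start to the telomere
    to_end = node_length - tel_end       # distance from the telomere to the contig end
    # a short telomeric node could satisfy both checks; only the closer end gets a telomere node
    if to_start < close_enough and to_start <= to_end:
        return "start"
    elif to_end < close_enough and to_end < to_start:
        return "end"
    return None                          # not close enough to either end; the caller warns and skips
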
2 changes: 1 addition & 1 deletion src/scripts/hic_prefilter.py
@@ -33,7 +33,7 @@
for cur_read in bamfile:
total_reads += 1
# awk '$7 != "*" && $7 != "="' | awk '$NF !~ "SA" && $NF !~ ($7 "," )' | awk '$5 != 0 || $NF ~ "XA"'
if cur_read.is_unmapped or cur_read.reference_name == cur_read.next_reference_name or (cur_read.has_tag("XA") and cur_read.get_tag("XA").find(cur_read.reference_name+",") != -1) or (not cur_read.has_tag("XA") and cur_read.mapping_quality == 0):
if cur_read.is_unmapped or cur_read.reference_name == cur_read.next_reference_name or (cur_read.has_tag("XA") and cur_read.get_tag("XA").find(cur_read.next_reference_name+",") != -1) or (not cur_read.has_tag("XA") and cur_read.mapping_quality == 0):
basic_filtered += 1
continue
cur_name = cur_read.query_name
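The corrected filter now checks the XA tag against the mate's contig (next_reference_name), matching the awk expression $NF !~ ($7 ",") in the comment above. A hedged restatement as a standalone predicate, assuming pysam-style reads:

import pysam  # reads are pysam.AlignedSegment objects in this sketch

def should_skip(read):
    # illustrative only; mirrors the condition in the diff
    if read.is_unmapped:
        return True
    if read.reference_name == read.next_reference_name:
        return True   # intra-contig pair, no scaffolding signal
    if read.has_tag("XA") and read.next_reference_name + "," in read.get_tag("XA"):
        return True   # alternative hit on the mate's contig makes the pair ambiguous
    if not read.has_tag("XA") and read.mapping_quality == 0:
        return True   # zero mapping quality without recorded alternatives
    return False
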
26 changes: 22 additions & 4 deletions src/scripts/scaffolding/match_graph.py
@@ -10,6 +10,7 @@ class HomologyInfo:
JUMP_JOINING_FRACTION = 0.5
#Do not want huge jumps anyway
JUMP_JOINING_ABSOLUTE = 5000000

def __init__(self, node1, node2, len1, len2, logger):
self.nodes = [node1, node2]
self.len = [len1, len2]
@@ -100,7 +101,7 @@ def fillCoverage(self):
while prev_int[start_i] != -1:
start_i = prev_int[start_i]
self.approximate_positions[ind] = [covered_intervals[start_i][0], covered_intervals[max_i][1]]

def getCoveredLen(self):
#in weird cases matches can be larger than the seq; avoiding that
return min(self.covered[0], self.covered[1], self.len[0], self.len[1])
@@ -160,11 +161,24 @@ def isValid(self, node1, node2):
#extracting length, sometimes we do not have this info in other places
def getLength(self, node):
return self.lens[node]


def getApproximatePositionLength(self, node1, node2, ind):
if not self.isValid(node1, node2):
return 0
return self.homologies[node1][node2].approximate_positions[ind][1] - self.homologies[node1][node2].approximate_positions[ind][0]


# homology_weight - large negative something, min_big_homology - minimal length of homology to be taken into account.
# Some shorter ones can still sometimes be used if they are in a regular bulge-like structure
# G can be both directed and undirected graph
class MatchGraph:

#Best match must exceed any competing match by a factor of CLEAR_BEST (1.5) to count as a clear homology
CLEAR_BEST = 1.5

#homologous intervals should cover at least 1/3 of at least one of the nodes in the pair
REQUIRED_COVERAGE_FRACTION = 1/3

def __init__(self, mashmap_sim, G, homology_weight, min_big_homology, min_alignment, logger):
self.matchGraph = nx.Graph()
self.hom_storage = HomologyStorage(logger, mashmap_sim, min_alignment)
@@ -210,14 +224,18 @@ def __init__(self, mashmap_sim, G, homology_weight, min_big_homology, min_alignm
#homology storage may be asymmetric, mashmap does not guarantee anything
#possibly should forcibly symmetrize...
if self.hom_storage.isValid(ec[0], ec[1]) and (not G.has_edge(ec[0], ec[1])):
long_enough = True
for i in range (0, 2):
best_homology = True
best_len = self.matchGraph.edges[ec]['homology_len']
for adj_node in self.matchGraph.neighbors(ec[i]):
if adj_node != ec[1 - i] and best_len * 0.8 < self.matchGraph.edges[ec[i], adj_node]['homology_len']:
if adj_node != ec[1 - i] and best_len < self.matchGraph.edges[ec[i], adj_node]['homology_len'] * MatchGraph.CLEAR_BEST:
best_homology = False
clear_best_match = clear_best_match or best_homology
if clear_best_match:
#we have the total length of homologous sequences without joining and approximate positions with joining; check if either is long enough
if max(best_len, self.hom_storage.getApproximatePositionLength(ec[0], ec[1], i)) < self.hom_storage.getLength(ec[i]) * MatchGraph.REQUIRED_COVERAGE_FRACTION:
long_enough = False
if clear_best_match and long_enough:
self.matchGraph.add_edge(ec[0], ec[1], weight = homology_weight, homology_len = best_len, intervals = self.hom_storage.homologies[ec[0]][ec[1]].filtered_intervals,
orientation = self.hom_storage.homologies[ec[0]][ec[1]].orientation)
else:
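A rough standalone sketch of the strengthened edge filter (illustrative names, not the committed code; it simplifies the committed loop by requiring the coverage fraction on both nodes): an edge is kept only if it is the clear best match for at least one of the two nodes and the homologous span is long enough relative to the node lengths.

CLEAR_BEST = 1.5
REQUIRED_COVERAGE_FRACTION = 1 / 3

def keep_homology_edge(best_len, competitor_lens, node_lens, approx_lens):
    # best_len: homology length of the candidate edge
    # competitor_lens[i]: homology lengths of the other partners of node i
    # node_lens[i], approx_lens[i]: node length and joined approximate-position length for node i
    clear_best = any(
        all(best_len >= CLEAR_BEST * other for other in competitor_lens[i])
        for i in range(2)
    )
    long_enough = all(
        max(best_len, approx_lens[i]) >= node_lens[i] * REQUIRED_COVERAGE_FRACTION
        for i in range(2)
    )
    return clear_best and long_enough
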
92 changes: 44 additions & 48 deletions src/scripts/scaffolding/scaffold_graph.py
@@ -58,8 +58,13 @@ class ScaffoldGraph:
#if one of the orientations is relatively close in the graph (< min(1/4 * path_length, CLOSE_IN_GRAPH)) and the other is really far (> 3/4 of the length), we move all connections from the far one to the close one
CLOSE_IN_GRAPH = 500000
#If paths are closer than CLOSE_IN_GRAPH, we significantly increase scores. Should be reconsidered when lots of gaps are present

#can be asymmetric because of the 1/4 path_length rule, possibly should reconsider it
CONNECTIVITY_MULTIPLICATIVE_BONUS = 2
#TODO: reconsider making it smaller than the min of the two paths, check whether anything goes wrong

#effectively was 4 in lots of cases because it was applied twice
#TODO recheck
CONNECTIVITY_MULTIPLICATIVE_BONUS = 4


#default values for MatchGraph construction
@@ -86,7 +91,7 @@ class ScaffoldGraph:

#Consecutive paths' scores are increased by this factor.
#TODO Possibly should be some high absolute constant too?
REFERENCE_MULTIPLICATIVE_BONUS = 4
REFERENCE_MULTIPLICATIVE_BONUS = 5
#Just too long/far
TOO_FAR = 1000000000

@@ -115,7 +120,7 @@ def __init__(self, rukki_paths, telomere_locations_file, hic_alignment_file, mat
#presaved_pathscores = self.loadPresavedScores("precomputed.pathscores")

#Used for scaffolding starts and debug,
self.to_scaff = self.get_paths_to_scaff(ScaffoldGraph.MIN_PATH_TO_SCAFFOLD)
telomeric_ends = self.getTelomericEnds()
self.hic_alignment_file = hic_alignment_file

self.match_graph = match_graph.MatchGraph(matches_file, G, -239239239, ScaffoldGraph.MATCHGRAPH_LONG_NODE, ScaffoldGraph.MATCHGRAPH_MIN_ALIGNMENT, logger)
@@ -139,13 +144,10 @@ def __init__(self, rukki_paths, telomere_locations_file, hic_alignment_file, mat
for id in rukki_paths.getPathIds():
for dir in ('-', '+'):
or_id = id + dir
#TODO: what about shorter without telomere
tels_ends = [True, True]
if id in self.to_scaff.keys():
if dir == '+':
tels_ends = self.to_scaff[id]
else:
tels_ends = [self.to_scaff[id][1], self.to_scaff[id][0]]
if dir == '+':
tels_ends = telomeric_ends[id]
else:
tels_ends = [telomeric_ends[id][1], telomeric_ends[id][0]]
self.scaffold_graph.add_node(or_id, telomere = tels_ends)
#possibly inefficient but welp
scores = {}
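The per-path telomere flags are computed once and then oriented per scaffold-graph node; a small sketch of that orientation step (hypothetical helper name, not part of the commit):

def oriented_tel_ends(tels_ends, direction):
    # tels_ends = [has_start_telomere, has_end_telomere] for the '+' orientation;
    # reversing the path swaps the two flags
    if direction == '+':
        return list(tels_ends)
    return [tels_ends[1], tels_ends[0]]

# e.g. oriented_tel_ends([True, False], '-') == [False, True]
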
@@ -179,8 +181,9 @@ def __init__(self, rukki_paths, telomere_locations_file, hic_alignment_file, mat
or_from_path_id = from_path_id + from_dir
or_to_path_id = to_path_id + to_dir
self.scaffold_graph.add_edge(or_from_path_id, or_to_path_id, weight = scores[from_path_id][to_path_id][from_dir + to_dir], unique_weight = unique_scores[from_path_id][to_path_id][from_dir + to_dir])
if from_path_id in self.to_scaff and to_path_id in self.to_scaff:
self.logger.debug (f"Counted scores {from_path_id} {to_path_id} {scores[from_path_id][to_path_id]}")
if self.rukki_paths.getLength(from_path_id) >= ScaffoldGraph.MIN_PATH_TO_SCAFFOLD and self.rukki_paths.getLength(to_path_id) >= ScaffoldGraph.MIN_PATH_TO_SCAFFOLD:
self.logger.debug (f"Final not-unique scores {from_path_id} {to_path_id} {scores[from_path_id][to_path_id]}")
self.logger.debug (f"Final unique scores {from_path_id} {to_path_id} {unique_scores[from_path_id][to_path_id]}")

#TODO: move all rc_<smth> somewhere, not right place

@@ -223,21 +226,6 @@ def getHaploidPaths(self):
self.logger.info(f"Found haploid path {nor_path_id} with homology {total_hom} and len {path_len} ")
return haploids


def getClosestTelomere(self, path, direction):
#From telomere to rc(path_end)
if direction == '+':
path = gf.rc_path(path)
closest = 1000000000

add_dist = 0
#Or use all nodes?
shortened_path = [path[0]]

for tel_node in self.tel_nodes:
closest = min(closest, self.nodeToPathDist(tel_node, shortened_path, False) + add_dist)
return closest/2

#Dist from node end to path. Allowed to enter the path not at its first node, but then the additional length within the path is added
#Optionally allows using homologous nodes (to improve across gaps)
def nodeToPathDist(self, node, path, check_homologous):
@@ -421,7 +409,9 @@ def generateScaffolds(self):
middle_paths = []
#to avoid outputting the same path twice
nor_used_path_ids = set()
for from_path_id in self.to_scaff:
for from_path_id in self.rukki_paths.getPathIds():
if self.rukki_paths.getLength(from_path_id) < ScaffoldGraph.MIN_PATH_TO_SCAFFOLD:
continue
for from_dir in ('-', '+'):
or_from_path_id = from_path_id + from_dir
if not self.scaffold_graph.nodes[or_from_path_id]['telomere'][1]:
@@ -761,6 +751,7 @@ def fixOrientation(self, path_ids, scores):
for i in range (0, 2):
#Do we need a length check here? Should it be the same as for the telomeric one?
#TODO: WIP

if self.rukki_paths.getLength(path_ids[i]) <= self.TOO_FAR:
for fixed_orientation in ('-', '+'):
shortest_paths = {'-':1000000000, '+':1000000000}
@@ -773,7 +764,11 @@ def fixOrientation(self, path_ids, scores):
if orient == '-':
to_check[i] = gf.rc_path(paths[i])
shortest_paths[orient] = self.pathDist(to_check[0], to_check[1], True)
self.logger.debug(f"Checking dists {to_check} index {i} dist {shortest_paths[orient]} cutoffs {min_cutoff} {max_cutoff}")
if (i == 0):
orient_pair = orient + fixed_orientation
else:
orient_pair = fixed_orientation + orient
self.logger.debug(f"Checking dists {path_ids} orientations: {orient_pair} index {i} dist {shortest_paths[orient]} cutoffs {min_cutoff} {max_cutoff}")

if shortest_paths['-'] < min_cutoff and shortest_paths['+'] > max_cutoff:
correct_or = "-"
@@ -799,10 +794,23 @@ def fixOrientation(self, path_ids, scores):
else:
if scores[incorrect_pair] >= ScaffoldGraph.MIN_LINKS * self.INT_NORMALIZATION and scores[incorrect_pair] > scores[correct_pair]:
self.logger.debug(f"Dangerous connectivity tuning pair too long to move {path_ids}, i {i} scores {scores}, tels {self.scaffold_graph.nodes[path_ids[i]+'+']['telomere']}")
scores[correct_pair] *= self.CONNECTIVITY_MULTIPLICATIVE_BONUS
#TODO: this may happen twice or once!!!
#scores[correct_pair] *= self.CONNECTIVITY_MULTIPLICATIVE_BONUS
scores[incorrect_pair] = 0
self.logger.debug (f"Connectivity tuned pair {path_ids}, scores {scores}, tels {self.scaffold_graph.nodes[path_ids[i]+'+']['telomere']}")

#self.logger.debug (f"Connectivity tuned pair {path_ids}, scores {scores}, tels {self.scaffold_graph.nodes[path_ids[i]+'+']['telomere']}")
#Code duplication:(
for first_or in ('-', '+'):
for second_or in ('-', '+'):
str = first_or + second_or
to_check = paths.copy()
if first_or == "-":
to_check[0] = gf.rc_path(paths[0])
if second_or == "-":
to_check[1] = gf.rc_path(paths[1])
dist_paths = self.pathDist(to_check[0], to_check[1], True)
if dist_paths < self.CLOSE_IN_GRAPH and dist_paths < self.rukki_paths.getLength(path_ids[0]) / 4 and dist_paths < self.rukki_paths.getLength(path_ids[1]) / 4:
scores[str] *= self.CONNECTIVITY_MULTIPLICATIVE_BONUS
self.logger.debug(f"Connectivity bonus applied pair {path_ids} orientations {str}, scores {scores}")

return scores
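A sketch of the bonus rule introduced above (illustrative function, constants as in this file): an orientation pair is boosted only when the graph distance between the oriented paths is below both the absolute cutoff and a quarter of each path's length.

CLOSE_IN_GRAPH = 500000
CONNECTIVITY_MULTIPLICATIVE_BONUS = 4

def apply_connectivity_bonus(scores, dists, len0, len1):
    # scores and dists are keyed by orientation pairs: '--', '-+', '+-', '++'
    for orient, dist in dists.items():
        if dist < CLOSE_IN_GRAPH and dist < len0 / 4 and dist < len1 / 4:
            scores[orient] *= CONNECTIVITY_MULTIPLICATIVE_BONUS
    return scores
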

@@ -834,14 +842,7 @@ def getNodePairConnections(self, pair, orientations, connections, shift_before,
if pair in connections:
for conn in connections[pair]:
cons.append(conn)
'''
if rc_pair in connections:
for coords, w in connections[rc_pair].items():
cons.append((coords[1], coords[0], w))
if pair in connections:
for coords, w in connections[pair].items():
cons.append((coords[0], coords[1], w))
'''

for conn in cons:
in_homo = False
for i in range (0, len(intervals[0])):
@@ -855,7 +856,7 @@ def getNodePairConnections(self, pair, orientations, connections, shift_before,
in_homo = True
break
if in_homo:
filtered+= 1
filtered += 1
continue
else:
not_filtered += 1
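The surrounding loop (partly elided in this view) drops Hi-C links whose coordinates fall inside homologous intervals; a rough sketch of that test, with an assumed interval format:

def in_homologous_interval(coord, intervals):
    # intervals: list of (start, end) homologous spans on the node (assumed format)
    return any(start <= coord <= end for start, end in intervals)
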
@@ -994,7 +995,7 @@ def getPresavedPathPairConnections(self, path_ids, presaved_scores):
return scores

#returns dict, {id:[present_start_telo, present_end_telo]}
def get_paths_to_scaff(self, long_enough):
def getTelomericEnds(self):
res = {}
for id in self.rukki_paths.paths:
total_l = self.rukki_paths.path_lengths[id]
@@ -1005,10 +1006,5 @@ def get_paths_to_scaff(self, long_enough):
tel_start = True
if self.upd_G.has_edge(self.rukki_paths.paths[id][-1], telomere):
tel_end = True
if tel_start and tel_end:
continue
#all long enough AND containing telomere
if total_l > long_enough:
res[id] = [tel_start, tel_end]
#print (f"will use path {paths.paths[id]} {tel_start} {tel_end}")
res[id] = [tel_start, tel_end]
return res
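Unlike the removed get_paths_to_scaff, the renamed helper records telomere flags for every path and leaves length filtering to the call sites; a hedged usage sketch (variable names are assumptions, not part of the commit):

telomeric_ends = scaffold_graph.getTelomericEnds()   # {path_id: [tel_start, tel_end]}
candidates = [
    path_id for path_id in telomeric_ends
    if rukki_paths.getLength(path_id) >= ScaffoldGraph.MIN_PATH_TO_SCAFFOLD
]   # length filtering now happens at the callers, as in __init__ and generateScaffolds
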
