From 33c20799215b6c51a3b954f936ac3c32cba6b1df Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Mon, 23 Sep 2024 21:53:04 -0400 Subject: [PATCH] speedup - counting only necessary distances --- src/scripts/scaffolding/match_graph.py | 23 ++++- src/scripts/scaffolding/path_storage.py | 16 +++ src/scripts/scaffolding/scaffold_graph.py | 120 +++++++++++++++++++--- 3 files changed, 146 insertions(+), 13 deletions(-) diff --git a/src/scripts/scaffolding/match_graph.py b/src/scripts/scaffolding/match_graph.py index 86465a7a..9e00d393 100755 --- a/src/scripts/scaffolding/match_graph.py +++ b/src/scripts/scaffolding/match_graph.py @@ -315,4 +315,25 @@ def isHomologousPath (self, paths, lens): self.logger.debug (f"Found homologous paths {paths} with homology size {hom_size}") return True else: - return False \ No newline at end of file + return False + + def getHomologousPaths(self, path_storage, path_id): + homologous_paths = [] + homologous_nodes = set() + for node in path_storage.getEdgeSequenceById(path_id): + if node in self.matchGraph.nodes: + for hom_node in self.matchGraph.neighbors(node): + if self.matchGraph.edges[node, hom_node]['weight'] < 0: + homologous_nodes.add(hom_node) + self.logger.debug(f"For path {path_id} counted homologous nodes {homologous_nodes}") + paths_to_check = set() + for node in homologous_nodes: + paths_to_check.update(path_storage.getPathsFromNode(node)) + self.logger.debug(f"For path {path_id} suspicious homologous paths {paths_to_check}") + + for susp_path_id in paths_to_check: + if self.isHomologousPath([path_storage.getPathById(path_id), path_storage.getPathById(susp_path_id)], [path_storage.getLength(path_id), path_storage.getLength(susp_path_id)]): + homologous_paths.append(susp_path_id) + self.logger.debug(f"For path {path_id} found {len(paths_to_check)} homologous paths {paths_to_check}") + + return homologous_paths diff --git a/src/scripts/scaffolding/path_storage.py b/src/scripts/scaffolding/path_storage.py index 69630e73..9f4beeda 100755 --- a/src/scripts/scaffolding/path_storage.py +++ b/src/scripts/scaffolding/path_storage.py @@ -22,6 +22,9 @@ def __init__(self, G): self.G = G self.pairwise_dists = {} + self.node_to_paths = {} + self.node_to_paths_counted = False + def getPathById(self, path_id): return self.paths[path_id] @@ -150,3 +153,16 @@ def writePathAsFasta(self, input_fasta, output_fasta): else: overlap = 0 f.write(f"\n") + + def getPathsFromNode(self, node_id): + if not self.node_to_paths_counted: + for path_id in self.paths: + for node in self.getEdgeSequenceById(path_id): + if not node in self.node_to_paths: + self.node_to_paths[node] = [] + self.node_to_paths[node].append(path_id) + self.node_to_paths_counted = True + if node_id in self.node_to_paths: + return self.node_to_paths[node_id] + else: + return [] \ No newline at end of file diff --git a/src/scripts/scaffolding/scaffold_graph.py b/src/scripts/scaffolding/scaffold_graph.py index a868b2eb..512dd1f8 100755 --- a/src/scripts/scaffolding/scaffold_graph.py +++ b/src/scripts/scaffolding/scaffold_graph.py @@ -103,6 +103,7 @@ class ScaffoldGraph: #Just too long/far TOO_FAR = 1000000000 + #efficiantly it is so because of bwa behaviour on XA tags but not used directly MAX_ALIGNMENTS = 6 @@ -112,6 +113,10 @@ class ScaffoldGraph: #to check whether paths have similar lengths, in cheating t2t connection SIMILAR_LEN_FRAC = 0.7 + #Constants to switch off distance counting for huge graphs + MAX_GRAPH_FOR_DISTANCES = 1000000 + MAX_COMPONENT_FOR_DISTANCES = 50000 + def __init__(self, rukki_paths, telomere_locations_file, hic_alignment_file, matches_file, G, uncompressed_fasta, path_mashmap, porec, logger): self.logger = logger_wrap.UpdatedAdapter(logger, self.__class__.__name__) self.rukki_paths = rukki_paths @@ -132,9 +137,11 @@ def __init__(self, rukki_paths, telomere_locations_file, hic_alignment_file, mat all_connections, unique_connections = self.get_connections_porec(hic_alignment_file, True) else: self.logger.info ("Loading Hi-C alignments") - all_connections, unique_connections = self.get_connections_bam(hic_alignment_file, True) - self.output_basename = "scaff_rukki.paths" + all_connections, unique_connections = self.get_connections_bam(hic_alignment_file, True) + self.all_connections = all_connections + self.unique_connections = unique_connections + self.output_basename = "scaff_rukki.paths" telomeric_ends = self.getTelomericEnds() self.hic_alignment_file = hic_alignment_file @@ -166,12 +173,25 @@ def __init__(self, rukki_paths, telomere_locations_file, hic_alignment_file, mat #possilby unefficient but whelp scores = {} unique_scores = {} - self.dists = dict(nx.all_pairs_dijkstra_path_length(self.upd_G, weight=lambda u, v, d: self.upd_G.edges[u, v]['mid_length'])) - self.logger.info("Pairwise distances in assembly graph calculated") + self.dists = {} + if G.number_of_nodes() > ScaffoldGraph.MAX_GRAPH_FOR_DISTANCES: + self.logger.info(f"Graph is too big {G.number_of_nodes()}, not calculating pairwise distances") + else: + max_comp_size = len(max(nx.weakly_connected_components(G), key=len)) + if max_comp_size > ScaffoldGraph.MAX_COMPONENT_FOR_DISTANCES: + self.logger.info(f"Biggest component is too big {len(max_comp_size)}, not calculating pairwise distances") + else: + self.logger.info("Calculating pairwise distances for assembly graph nodes") + self.dists = dict(nx.all_pairs_dijkstra_path_length(self.upd_G, weight=lambda u, v, d: self.upd_G.edges[u, v]['mid_length'])) + self.logger.info("Pairwise distances in assembly graph calculated") + self.haploids = self.getHaploidPaths(self.rukki_paths) #bam should be prefiltered #all_connections = self.get_connections_bam("../", True) + self.scores = {} + self.unique_scores = {} + ''' for from_path_id in self.rukki_paths.getPathIds(): scores[from_path_id] = {} unique_scores[from_path_id] = {} @@ -195,11 +215,9 @@ def __init__(self, rukki_paths, telomere_locations_file, hic_alignment_file, mat or_from_path_id = from_path_id + from_dir or_to_path_id = to_path_id + to_dir self.scaffold_graph.add_edge(or_from_path_id, or_to_path_id, weight = scores[from_path_id][to_path_id][from_dir + to_dir], unique_weight = unique_scores[from_path_id][to_path_id][from_dir + to_dir]) - if self.rukki_paths.getLength(from_path_id) >= ScaffoldGraph.MIN_PATH_TO_SCAFFOLD and self.rukki_paths.getLength(from_path_id) >= ScaffoldGraph.MIN_PATH_TO_SCAFFOLD: + if self.rukki_paths.getLength(from_path_id) >= ScaffoldGraph.MIN_PATH_TO_SCAFFOLD and self.rukki_paths.getLength(from_path_id) >= ScaffoldGraph.MIN_PATH_TO_SCAFFOLD: self.logger.debug (f"Final not-unique scores {from_path_id} {to_path_id} {scores[from_path_id][to_path_id]}") - self.logger.debug (f"Final unique scores {from_path_id} {to_path_id} {unique_scores[from_path_id][to_path_id]}") - -#TODO: move all rc_ somewhere, not right place + self.logger.debug (f"Final unique scores {from_path_id} {to_path_id} {unique_scores[from_path_id][to_path_id]}")''' def isHaploidPath(self, paths, nor_path_id): @@ -393,6 +411,50 @@ def forbiddenPair(self, from_path_id, to_path_id): return True return False + + def getScores(self, or_path_id, path_storage, type): + res = [] + if type == "weight": + cur_scores = self.scores + connections = self.all_connections + elif type == "unique_weight": + cur_scores = self.unique_scores + connections = self.unique_connections + else: + self.logger.error(f"Unknown type {type}") + return res + nor_path_id = gf.nor_path_id(or_path_id) + precounted = True + + homologous_paths = self.match_graph.getHomologousPaths(path_storage, nor_path_id) + #Counting only necessary scores, cashing in global dict + if not (or_path_id in cur_scores): + cur_scores[or_path_id] = {} + cur_scores[gf.rc_path_id(or_path_id)] = {} + precounted = False + for next_or_path in self.scaffold_graph.nodes(): + #Counting all orientations together so no need to double count + if next_or_path[-1] == "-": + continue + next_nor_path = gf.nor_path_id(next_or_path) + #TODO: optimize homology counting, once per all paths + if next_nor_path in homologous_paths: + continue + else: + if not precounted: + nor_scores = self.getPathPairConnections([nor_path_id, next_nor_path], connections, self.uncompressed_lens) + for from_dir in ('-', '+'): + for to_dir in ('-', '+'): + or_from_path_id = nor_path_id + from_dir + or_to_path_id = next_nor_path + to_dir + if nor_scores[from_dir + to_dir] > 0: + cur_scores[or_from_path_id][or_to_path_id] = nor_scores[from_dir + to_dir] + for next_or_path in self.scaffold_graph.nodes(): + if next_or_path in cur_scores[or_path_id]: + res.append([next_or_path, cur_scores[or_path_id][next_or_path]]) + res.sort(key=lambda x: x[1], reverse=True) + return res + #Main logic is here! # returns NONE/DONE/next_path_id #type: weight/unique_weight @@ -404,11 +466,21 @@ def findExtension(self, cur_path_id, type): if self.scaffold_graph.nodes[cur_path_id]['telomere'][1]: self.logger.info (f"Stopped at the telomere") return "DONE",0 + ''' for next_node in self.scaffold_graph.successors(cur_path_id): #specific hacks to avoid local_scores.append([next_node, self.scaffold_graph.edges[cur_path_id, next_node][type]]) - local_scores.sort(key=lambda x: x[1], reverse=True) + alt_scores = self.getScores(cur_path_id, self.rukki_paths, type) + + if len(alt_scores) > 0 and alt_scores[0]!= local_scores[0]: + self.logger.warning(f"Difference! local scores {local_scores[0]} alt scores {alt_scores[0]}") + for i in range (0,10): + self.logger.warning(f"Alt scores {i} {alt_scores[i]}") + elif len(alt_scores) == 0 and len(local_scores) > 0: + self.logger.warning(f"Local scores but no alt scores, best extension {local_scores[0]}") + ''' + local_scores = self.getScores(cur_path_id, self.rukki_paths, type) best_ind = -1 second_best_ind = -1 for i in range (0, len(local_scores)): @@ -504,10 +576,12 @@ def generateScaffolds(self): break else: hom_before = False + nor_new_path_id = gf.nor_path_id(next_path_id) + homologous_paths = self.match_graph.getHomologousPaths(self.rukki_paths, gf.nor_path_id(nor_new_path_id)) for prev_path_id in cur_scaffold: - nor_new_path_id = gf.nor_path_id(next_path_id) nor_prev_path_id = gf.nor_path_id(prev_path_id) - if self.match_graph.isHomologousPath([self.rukki_paths.getPathById(nor_new_path_id), self.rukki_paths.getPathById(nor_prev_path_id)], [self.rukki_paths.getLength(nor_new_path_id), self.rukki_paths.getLength(nor_prev_path_id)]): + if nor_prev_path_id in homologous_paths: +# self.match_graph.isHomologousPath([self.rukki_paths.getPathById(nor_new_path_id), self.rukki_paths.getPathById(nor_prev_path_id)], [self.rukki_paths.getLength(nor_new_path_id), self.rukki_paths.getLength(nor_prev_path_id)]): #TODO: really not normal if we see that best extension is homologous to some path in scaffold, deserves investigation self.logger.warning(f"Trying to extend, had homologous path in same scaffold before! {nor_new_path_id} {nor_prev_path_id}") hom_before = True @@ -574,9 +648,11 @@ def getAdditionalRephase(self, scaffolded_rephased, used_ids, paths): for nor_old_id in scaffolded_rephased: if not self.isHaploidPath(paths, nor_old_id): #very unoptimal but this should not be frequent + homologous_paths = self.match_graph.getHomologousPaths(paths, nor_old_id) for alt_path_id in paths.getPathIds(): #scaffolded path should not be significantly shorter than homologous alternative to recolor - if paths.getLength(nor_old_id) * 2 > paths.getLength(alt_path_id) and self.match_graph.isHomologousPath([paths.getPathById(nor_old_id), paths.getPathById(alt_path_id)], [paths.getLength(nor_old_id), paths.getLength(alt_path_id)]): + if paths.getLength(nor_old_id) * 2 > paths.getLength(alt_path_id) and alt_path_id in homologous_paths: + #and self.match_graph.isHomologousPath([paths.getPathById(nor_old_id), paths.getPathById(alt_path_id)], [paths.getLength(nor_old_id), paths.getLength(alt_path_id)]): #alt_path_id is homologous to something rephased and thus should be rephased too. old_label = paths.getLabel(alt_path_id) splitted_id = alt_path_id.split("_") @@ -1333,3 +1409,23 @@ def getTelomericEnds(self): tel_end = True res[id] = [tel_start, tel_end] return res + + #For each paths returns its connected component. If multiple, then 0 + def getPathColors(self, rukki_paths, graph): + components = sorted(nx.weakly_connected_components(self.upd_G),key=len, reverse=True) + node_colors = {} + path_colors = {} + for i in range (0, len(components)): + for node in components[i]: + node_colors[node] = i + 1 + for path_id in rukki_paths.getPathIds(): + current_colors = set() + for node in rukki_paths.getPathById(path_id): + current_colors.add(node_colors[gf.nor_node(node)]) + if len (current_colors) == 1: + path_colors[path_id] = current_colors.pop() + else: + path_colors[path_id] = 0 + return path_colors + + \ No newline at end of file