From e6bb6216151530903c27b0b2a4f666fdf82c32ea Mon Sep 17 00:00:00 2001 From: Sergey Koren Date: Wed, 25 Sep 2024 14:45:44 -0400 Subject: [PATCH] add RG tags to bam to delineate read types --- src/Snakefiles/7-combineConsensus.sm | 3 ++- src/scripts/bam_rename.py | 29 ++++++++++++++++++++++++---- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/src/Snakefiles/7-combineConsensus.sm b/src/Snakefiles/7-combineConsensus.sm index 2fd94ea..835529e 100644 --- a/src/Snakefiles/7-combineConsensus.sm +++ b/src/Snakefiles/7-combineConsensus.sm @@ -35,6 +35,7 @@ rule combineConsensus: packcns = combineConsensusI, tigmap = rules.buildPackages.output.tigmap, scfmap = rules.layoutContigs.output.scfmap, + layout = rules.layoutContigs.output.layout, hificov = rules.verkko.input.hifi5cov, finished = rules.buildPackages.output.finished, pathstsv = rules.rukki.output.pathstsv if config['ruk_enable'] == "True" else rules.emptyfile.output, @@ -121,7 +122,7 @@ if [ "{params.haveBAM}" = "True" ]; then echo "" mem_per_core=\`expr {resources.mem_gb} \/ {threads} | awk '{{if (\$1 < 1) print "1G"; else print \$1"G"}}'\` - {params.SAMTOOLS} merge -f -O bam --reference combined.fasta -@{threads} -u - {params.packbam} | {PYTHON} {VERKKO}/scripts/bam_rename.py ../{input.scfmap} ../{input.tigmap} {params.packcns} | {params.SAMTOOLS} sort -m \$mem_per_core -@{threads} -o ../{output.bam} + {params.SAMTOOLS} merge -f -O bam --reference combined.fasta -@{threads} -u - {params.packbam} | {PYTHON} {VERKKO}/scripts/bam_rename.py ../{input.layout} ../6-layoutContigs/ont-gapfill.txt ../{input.scfmap} ../{input.tigmap} {params.packcns} | {params.SAMTOOLS} sort -m \$mem_per_core -@{threads} -o ../{output.bam} else touch ../{output.bam} fi diff --git a/src/scripts/bam_rename.py b/src/scripts/bam_rename.py index 3e1825b..50e582b 100755 --- a/src/scripts/bam_rename.py +++ b/src/scripts/bam_rename.py @@ -8,14 +8,27 @@ # in the case of scaffolds, offset the start coordinate by the start of the sequence in the scaffold # bamfile = pysam.AlignmentFile("-", "rb") -scfmap = seq.readScfMap(sys.argv[1]) -namedict = seq.readNameMap(sys.argv[2]) +layout = sys.argv[1] +gap_info = sys.argv[2] +scfmap = seq.readScfMap(sys.argv[3]) +namedict = seq.readNameMap(sys.argv[4]) lens = dict() offsets = dict() names = dict() tigtohap = dict() +readtorg = dict() -for filename in sys.argv[3:]: +with open(layout) as f: + for l in f: + parts = l.strip().split('\t') + if len(parts) > 2: + readtorg[parts[0]] = "LA" if (int(parts[-1]) == 0) else "UL" +with open(gap_info) as f: + for l in f: + parts = l.strip().split('\t') + readtorg[parts[0]] = "UL-gap" + +for filename in sys.argv[5:]: sys.stderr.write("Starting file %s\n"%(filename)) inf = seq.openInput(filename) @@ -52,18 +65,26 @@ # drop the old reference names header['SQ'] = [sq_entry for sq_entry in header['SQ'] if sq_entry['SN'] in scfmap] +# add read tags to note which reads are LA, LA gapfill, or UL +if 'RG' not in header: + header['RG'] = [] +header['RG'].append({'ID' : "LA"}) +header['RG'].append({'ID' : "UL-gap"}) +header['RG'].append({'ID' : "UL"}) + # now loop through and update, keeping only the mapped reads output_bam = pysam.AlignmentFile('-', 'wb', header=header) for read in bamfile: if not read.is_unmapped: reference_name = bamfile.get_reference_name(read.reference_id) - if reference_name not in offsets or tigtohap[reference_name] not in output_bam.references: + if reference_name not in offsets or tigtohap[reference_name] not in output_bam.references or read.query_name not in readtorg: sys.stderr.write("Error: I didn't find how to translate %s\n"%(reference_name)) sys.exit(1) # make a copy of the existing read in the new bam, replacing the string reference new_read = pysam.AlignedSegment.fromstring(read.to_string().replace(f'\t{reference_name}\t', f'\t{tigtohap[reference_name]}\t'), output_bam.header) + new_read.set_tag('RG', readtorg[read.query_name]) # update the coordinate new_read.reference_start = read.reference_start + offsets[reference_name] output_bam.write(new_read)