Skip to content

Commit

Permalink
add RG tags to bam to delineate read types
Browse files Browse the repository at this point in the history
  • Loading branch information
skoren committed Sep 25, 2024
1 parent 5c8c62e commit e6bb621
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 5 deletions.
3 changes: 2 additions & 1 deletion src/Snakefiles/7-combineConsensus.sm
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ rule combineConsensus:
packcns = combineConsensusI,
tigmap = rules.buildPackages.output.tigmap,
scfmap = rules.layoutContigs.output.scfmap,
layout = rules.layoutContigs.output.layout,
hificov = rules.verkko.input.hifi5cov,
finished = rules.buildPackages.output.finished,
pathstsv = rules.rukki.output.pathstsv if config['ruk_enable'] == "True" else rules.emptyfile.output,
Expand Down Expand Up @@ -121,7 +122,7 @@ if [ "{params.haveBAM}" = "True" ]; then
echo ""
mem_per_core=\`expr {resources.mem_gb} \/ {threads} | awk '{{if (\$1 < 1) print "1G"; else print \$1"G"}}'\`

{params.SAMTOOLS} merge -f -O bam --reference combined.fasta -@{threads} -u - {params.packbam} | {PYTHON} {VERKKO}/scripts/bam_rename.py ../{input.scfmap} ../{input.tigmap} {params.packcns} | {params.SAMTOOLS} sort -m \$mem_per_core -@{threads} -o ../{output.bam}
{params.SAMTOOLS} merge -f -O bam --reference combined.fasta -@{threads} -u - {params.packbam} | {PYTHON} {VERKKO}/scripts/bam_rename.py ../{input.layout} ../6-layoutContigs/ont-gapfill.txt ../{input.scfmap} ../{input.tigmap} {params.packcns} | {params.SAMTOOLS} sort -m \$mem_per_core -@{threads} -o ../{output.bam}
else
touch ../{output.bam}
fi
Expand Down
29 changes: 25 additions & 4 deletions src/scripts/bam_rename.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,27 @@
# in the case of scaffolds, offset the start coordinate by the start of the sequence in the scaffold
#
bamfile = pysam.AlignmentFile("-", "rb")
scfmap = seq.readScfMap(sys.argv[1])
namedict = seq.readNameMap(sys.argv[2])
layout = sys.argv[1]
gap_info = sys.argv[2]
scfmap = seq.readScfMap(sys.argv[3])
namedict = seq.readNameMap(sys.argv[4])
lens = dict()
offsets = dict()
names = dict()
tigtohap = dict()
readtorg = dict()

for filename in sys.argv[3:]:
with open(layout) as f:
for l in f:
parts = l.strip().split('\t')
if len(parts) > 2:
readtorg[parts[0]] = "LA" if (int(parts[-1]) == 0) else "UL"
with open(gap_info) as f:
for l in f:
parts = l.strip().split('\t')
readtorg[parts[0]] = "UL-gap"

for filename in sys.argv[5:]:
sys.stderr.write("Starting file %s\n"%(filename))
inf = seq.openInput(filename)

Expand Down Expand Up @@ -52,18 +65,26 @@
# drop the old reference names
header['SQ'] = [sq_entry for sq_entry in header['SQ'] if sq_entry['SN'] in scfmap]

# add read tags to note which reads are LA, LA gapfill, or UL
if 'RG' not in header:
header['RG'] = []
header['RG'].append({'ID' : "LA"})
header['RG'].append({'ID' : "UL-gap"})
header['RG'].append({'ID' : "UL"})

# now loop through and update, keeping only the mapped reads
output_bam = pysam.AlignmentFile('-', 'wb', header=header)
for read in bamfile:
if not read.is_unmapped:
reference_name = bamfile.get_reference_name(read.reference_id)
if reference_name not in offsets or tigtohap[reference_name] not in output_bam.references:
if reference_name not in offsets or tigtohap[reference_name] not in output_bam.references or read.query_name not in readtorg:
sys.stderr.write("Error: I didn't find how to translate %s\n"%(reference_name))
sys.exit(1)

# make a copy of the existing read in the new bam, replacing the string reference
new_read = pysam.AlignedSegment.fromstring(read.to_string().replace(f'\t{reference_name}\t', f'\t{tigtohap[reference_name]}\t'), output_bam.header)

new_read.set_tag('RG', readtorg[read.query_name])
# update the coordinate
new_read.reference_start = read.reference_start + offsets[reference_name]
output_bam.write(new_read)
Expand Down

0 comments on commit e6bb621

Please sign in to comment.