Skip to content

Commit

Permalink
- add cigar utility method get_alignment_offsets to find the offsets …
Browse files Browse the repository at this point in the history
…of the aligned segment in the read

- tests included
  • Loading branch information
yfarjoun committed Sep 19, 2024
1 parent c13d3cc commit e371e66
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 1 deletion.
36 changes: 35 additions & 1 deletion fgpyo/sam/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,7 +368,7 @@ class CigarOp(enum.Enum):
code (int): The `~pysam` cigar operator code.
character (int): The single character cigar operator.
consumes_query (bool): True if this operator consumes query bases, False otherwise.
consumes_target (bool): True if this operator consumes target bases, False otherwise.
consumes_reference (bool): True if this operator consumes reference bases, False otherwise.
"""

M = (0, "M", True, True) #: Match or Mismatch the reference
Expand Down Expand Up @@ -547,6 +547,40 @@ def length_on_target(self) -> int:
"""Returns the length of the alignment on the target sequence."""
return sum([elem.length_on_target for elem in self.elements])

def get_alignment_offsets(self, reverse: bool = False) -> Tuple[int, int]:
"""
Get the starting and ending offset for the alignment based on the CIGAR string.
Arguments:
reverse: bool Whether to count from the end of the read sequence.
Default is False, which counts from the beginning of the read sequence.
Returns:
Tuple[int, int] The start and end of the _aligned part_ of the read.
0-based, open-ended offsets (to the read sequence.)
If `reverse` is True, the offsets count from the end of the read sequence.
If the Cigar contains no alignment operators that consume sequence bases, or
only clipping operators, the start and end offsets will be the same value (indicating
an empty region).
"""
start_offset: int = 0
end_offset: int = 0
cig_el: CigarElement
alignment_began = False
elements = self.elements if not reverse else reversed(self.elements)
for cig_el in elements:
if cig_el.operator.is_clipping and not alignment_began:
start_offset += cig_el.length_on_query
end_offset += cig_el.length_on_query
elif cig_el.operator.is_clipping:
break
else:
alignment_began = True
end_offset += cig_el.length_on_query

return start_offset, end_offset

def __getitem__(
self, index: Union[int, slice]
) -> Union[CigarElement, Tuple[CigarElement, ...]]:
Expand Down
48 changes: 48 additions & 0 deletions tests/fgpyo/sam/test_cigar.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,51 @@ def test_bad_index_raises_index_error(index: int) -> None:
def test_bad_index_raises_type_error(index: int) -> None:
with pytest.raises(TypeError):
cigar[index]


@pytest.mark.parametrize(
("cigar_string", "start", "end"),
{
("10M", 0, 10),
("10M10I", 0, 20),
("10X10I", 0, 20),
("10X10D", 0, 10),
("10=10D", 0, 10),
("10S10M", 10, 20),
("10H10M", 0, 10),
("10H10S10M", 10, 20),
("10H10S10M5S", 10, 20),
("10H10S10M5S10H", 10, 20),
("10H", 0, 0),
("10S", 10, 10),
("10S10H", 10, 10),
("5H10S10H", 10, 10),
},
)
def test_get_alignments(cigar_string, start, end):
cig = Cigar.from_cigarstring(cigar_string)
assert Cigar.get_alignment_offsets(cig, False) == (start, end)


@pytest.mark.parametrize(
("cigar_string", "start", "end"),
{
("10M", 0, 10),
("10M10I", 0, 20),
("10X10I", 0, 20),
("10X10D", 0, 10),
("10=10D", 0, 10),
("10S10M", 0, 10),
("10H10M", 0, 10),
("10H10S10M", 0, 10),
("10H10S10M5S", 5, 15),
("10H10S10M5S10H", 5, 15),
("10H", 0, 0),
("10S", 10, 10),
("10S10H", 10, 10),
("5H10S10H", 10, 10),
},
)
def test_get_alignments_reversed(cigar_string, start, end):
cig = Cigar.from_cigarstring(cigar_string)
assert Cigar.get_alignment_offsets(cig, True) == (start, end)

0 comments on commit e371e66

Please sign in to comment.