Skip to content

Commit

Permalink
PERF: use str.maketrans() for rc() in _make.py
Browse files Browse the repository at this point in the history
  • Loading branch information
fedarko committed Sep 4, 2023
1 parent 04297b0 commit e8230a5
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 9 deletions.
19 changes: 13 additions & 6 deletions wotplot/_make.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
from pydivsufsort import divsufsort
from ._scipy_sm_constructor_getter import get_sm_constructor

NT2COMP = {"A": "T", "C": "G", "T": "A", "G": "C"}
DNA = "ACGT"
RCDNA = "TGCA"
NT2COMP = str.maketrans(DNA, RCDNA)
# Appended to the end of a string when we create its suffix array. "$" occurs
# lexicographically before all of the DNA nucleotides, and including this
# character is helpful when creating suffix arrays -- see Chapter 9 of
Expand All @@ -16,10 +18,15 @@


def rc(seq):
out = ""
for i in range(len(seq) - 1, -1, -1):
out += NT2COMP[seq[i]]
return out
"""Computes the reverse-complement of a DNA string.
References
----------
Use str.translate(), with a translation table built by str.maketrans(), to
replace nucleotides with their complements; this approach (which should be
relatively efficient) is based on Devon Ryan's suggestion at
https://bioinformatics.stackexchange.com/a/3585.
"""
return seq.translate(NT2COMP)[::-1]


def _validate_and_stringify_seq(seq, k):
Expand All @@ -31,7 +38,7 @@ def _validate_and_stringify_seq(seq, k):
# silly question of what if a string contains both T and U??? It's
# easiest to just mandate that we only take in A/C/G/T strings -- this
# forces the user to do the conversion, so they can decide what to do.
if c not in NT2COMP:
if c not in DNA:
raise ValueError(
f"Input sequence contains character {c}; only DNA nucleotides "
"(A, C, G, T) are currently allowed."
Expand Down
10 changes: 7 additions & 3 deletions wotplot/tests/test_make_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,13 @@ def test_rc_good():

def test_rc_badchar():
# in practice, bad characters should already have been caught when make()
# validates input sequences. however, we may as well be careful here...
with pytest.raises(KeyError):
rc("ACGTM")
# validates input sequences, and the new method we use for computing
# reverse-complements (rc()) doesn't disallow non-DNA characters. Let's
# verify that these "bad characters" don't cause a crash, but keep in mind
# that these sorts of scenarios should never happen in practice (as of
# writing).
assert rc("ACGTM") == "MACGT"
assert rc("T G") == "C A"


# def test_get_kmer_dd_good():
Expand Down

0 comments on commit e8230a5

Please sign in to comment.