PERF: use str.maketrans() for rc() in _make.py

fedarko · Sep 4, 2023 · e8230a5 · e8230a5
1 parent 04297b0
commit e8230a5
Show file tree

Hide file tree

Showing 2 changed files with 20 additions and 9 deletions.
diff --git a/wotplot/_make.py b/wotplot/_make.py
@@ -2,7 +2,9 @@
 from pydivsufsort import divsufsort
 from ._scipy_sm_constructor_getter import get_sm_constructor
 
-NT2COMP = {"A": "T", "C": "G", "T": "A", "G": "C"}
+DNA = "ACGT"
+RCDNA = "TGCA"
+NT2COMP = str.maketrans(DNA, RCDNA)
 # Appended to the end of a string when we create its suffix array. "$" occurs
 # lexicographically before all of the DNA nucleotides, and including this
 # character is helpful when creating suffix arrays -- see Chapter 9 of
@@ -16,10 +18,15 @@
 
 
 def rc(seq):
-    out = ""
-    for i in range(len(seq) - 1, -1, -1):
-        out += NT2COMP[seq[i]]
-    return out
+    """Computes the reverse-complement of a DNA string.
+
+    References
+    ----------
+    Uses str.translate() with a str.maketrans() table to complement bases; this
+    approach (which should be relatively efficient) is based on Devon Ryan's
+    suggestion at https://bioinformatics.stackexchange.com/a/3585.
+    """
+    return seq.translate(NT2COMP)[::-1]
 
 
 def _validate_and_stringify_seq(seq, k):
@@ -31,7 +38,7 @@ def _validate_and_stringify_seq(seq, k):
         # silly question of what if a string contains both T and U??? It's
         # easiest to just mandate that we only take in A/C/G/T strings -- this
         # forces the user to do the conversion, so they can decide what to do.
-        if c not in NT2COMP:
+        if c not in DNA:
             raise ValueError(
                 f"Input sequence contains character {c}; only DNA nucleotides "
                 "(A, C, G, T) are currently allowed."

diff --git a/wotplot/tests/test_make_utils.py b/wotplot/tests/test_make_utils.py
@@ -17,9 +17,13 @@ def test_rc_good():
 
 def test_rc_badchar():
     # in practice, bad characters should already have been caught when make()
-    # validates input sequences. however, we may as well be careful here...
-    with pytest.raises(KeyError):
-        rc("ACGTM")
+    # validates input sequences. The str.translate()-based rc() doesn't reject
+    # non-DNA characters: it leaves them uncomplemented and just reverses them
+    # along with the rest of the string. Let's verify that these "bad
+    # characters" don't cause a crash, but keep in mind that these scenarios
+    # should never happen in practice (as of writing).
+    assert rc("ACGTM") == "MACGT"
+    assert rc("T G") == "C A"
 
 
 # def test_get_kmer_dd_good():