Vectorised version of simple align. Performance tests to come.

e2463bc9 · Etienne MORICE · 4dbdda67 · e2463bc9 · e2463bc9
Verified Commit e2463bc9 authored 6 years ago by Etienne MORICE
--- a/alignementseq.py
+++ b/alignementseq.py
@@ -6,6 +6,8 @@ Created on Tue Jan 15 15:02:16 2019
 @author: ariane.delrocq
 """
+import numpy as np
 import Bio.SubsMat.MatrixInfo
 import Bio.SeqIO
 from Bio.SeqRecord import SeqRecord
@@ -21,8 +23,9 @@ import Bio.pairwise2
 mat = Bio.SubsMat.MatrixInfo.blosum62
+def score(a, b, gap=-8):
-def score(a, b, gap=-8, id=1, mut=-1):
+    """Default scoring function for align
+    """
    if not a or not b:
        return gap
    else:
@@ -32,13 +35,16 @@ def score(a, b, gap=-8, id=1, mut=-1):
            return mat[a,b]
        except KeyError:
            return mat[b,a]
-#    elif a==b:
-#        return id
-#    else:
-#        return mut
 def align(seq1, seq2, score=score):
+    """Main implementation of align, with blosum scoring and no specific gap
+    extension penalties.
+    Args:
+        seq1: A SeqRecord object
+        seq2: Same.
+    """
    x = seq1.seq
    y = seq2.seq
    n=len(x)
@@ -106,8 +112,111 @@ def align(seq1, seq2, score=score):
    aseq2.seq = Seq(aly[::-1])
    return bestScore, aseq1, aseq2
+def dict_to_weight(subs_dict):
+    """Converts blosum dict to indexable numpy array.
+    The score function is the critical op here, we do not need a dict lookup in
+    the innermost loop.
+    Returns:
+        26 x 26 numpy array, [0,0] being s('A', 'A')
+    """
+    weight = np.zeros((26,26))
+    for (r1, r2), w in subs_dict.items():
+        r1 = ord(r1) - ord('A')
+        r2 = ord(r2) - ord('A')
+        weight[r1, r2] = weight[r2, r1] = w
+    return weight
+def vec_align(seq1, seq2, mat=dict_to_weight(Bio.SubsMat.MatrixInfo.blosum62), gap=-8):
+    """Proof of concept vector-style implementation of align, written mainly in numpy
+    arrays operations.
+    M(i,j) is the score of the best alignement containing exactly i letters of
+    x and j letters of y.
+    To allow vector operations, we rewrite M as S(s, k) with s=i+j and 
+    k=i-max(0,s-m)
+    Hence:
+        M(i,j) = S(i+j, i-max(0,i+j-m))
+        S(s, k) = M(k+max(0,s-m), min(s,m)-k)
+    The relation become:
+        M(i,j) = max( M(i,j-1)+gap, M(i-1,j)+gap, M(i-1,j-1)+s(i,j) )
+        S(s,k) = max(
+           S(s-1,k+1(s>m))+gap,
+           S(s-1,k-1+1(s>m))+gap,
+           S(s-2,k-1+1(s>m)+1(s>m+1)) + s(k+max(0,s-m),min(s,m)-k)
+        )
+    Note that S(s,k) does not depend anymore on S(k,.), allowing vector operation
+    Last index of the diag s can be written (useful for simplifications):
+        min(n,s)-max(0,s-m)
+        = min(n, m, s, n+m-s)
+        = min(m,s)-max(0,s-n)
+    """
+    n = len(seq1.seq)
+    m = len(seq2.seq)
+    x = np.array(list(ord(c)-ord('A') for c in seq1.seq))
+    y = np.array(list(ord(c)-ord('A') for c in seq2.seq))
+    mat = mat.flatten()
+    S = [ np.zeros(min(s,n,m,n+m-s)+1, dtype=np.int) for s in range(n+m+1) ]
+    S[0][0] = 0
+    S[1][0] = gap
+    S[1][1] = gap
+    for s in range(2,n+m+1):
+        # First/last cell must be treated differently when first line/col of M
+        if(s <= m):
+            S[s][0] = S[s-1][0] + gap
+        if(s <= n):
+            S[s][s-max(0,s-m)] = S[s-1][s-1-max(0,s-1-m)] + gap
+        # The offsets in the formula become begin/end of slices.
+        S[s][int(s<=m):len(S[s])-int(s<=n)] = np.maximum(np.maximum(
+            S[s-1][1:] + gap,
+            S[s-1][:-1] + gap),
+            S[s-2][int(s>m+1):len(S[s-2])-int(s>n+1)]
+            +mat[
+                x[max(0,s-m-1):min(n,s-1)] * 26 +
+                y[max(0,s-n-1):min(m,s-1)][::-1]
+                ]
+            )
+    # Trace back
+    score = S[n+m][0]
+    i,j = n,m
+    alx, aly = "",""
+    while i > 0 or j > 0:
+        if j > 0:
+            prev_score = S[i+j-1][i-max(0, i+j-m-1)]
+            if prev_score + gap == score:
+                score = prev_score
+                j -= 1
+                aly += seq2[j]
+                alx += '-'
+                continue
+        if i > 0:
+            prev_score = S[i+j-1][i-max(0, i+j-m-1)-1]
+            if prev_score + gap == score:
+                score = prev_score
+                i -= 1
+                alx += seq1[i]
+                aly += '-'
+                continue
+        score = S[i+j-2][i-max(0, i+j-m-2)-1]
+        i -= 1
+        j -= 1
+        alx += seq1[i]
+        aly += seq2[j]
+    aseq1 = SeqRecord(seq1)
+    aseq1.seq = Seq(alx[::-1])
+    aseq2 = SeqRecord(seq2)
+    aseq2.seq = Seq(aly[::-1])
+    return S[n+m][0], aseq1, aseq2
 def align2steps(seq1, seq2, d=8, e=4):
+    """Main implementation of align, handling also gap extensions.
+    """
    x = seq1.seq
    y = seq2.seq
    n=len(x)
@@ -228,6 +337,8 @@ def align2steps(seq1, seq2, d=8, e=4):
    return bestScore, aseq1, aseq2
 def test():
+    """Obsolete test function, to be moved to unit tests in a separate file.
+    """
 #    print(align("chat", "cat"))
 #    print(align("chat", "cgat"))
 #    print(align("chat", "at"))

--- a/alignementseq_tests.py
+++ b/alignementseq_tests.py
@@ -8,6 +8,7 @@ import zipfile
 import Bio.SeqIO
 import Bio.pairwise2
 from Bio.SeqRecord import SeqRecord
+from Bio.SubsMat.MatrixInfo import blosum62
 class AlignmentSeqTestCase(unittest.TestCase):
    def setUp(self):
@@ -28,9 +29,10 @@ class AlignmentSeqTestCase(unittest.TestCase):
            with zipfile.ZipFile(balibase_zippath) as balibase_zip:
                balibase_zip.extractall()
-    # Generator function to iterate over the first two sequences of each
-    # unaligned fasta file.
    def get_dataset_heads(self):
+        """Generator function to iterate over the first two sequences of each
+        unaligned fasta file.
+        """
        dataset_dir = os.path.join(self.balibase_path, "RV11.unaligned")
        for filename in os.listdir(dataset_dir):
            records = Bio.SeqIO.parse(
@@ -40,24 +42,26 @@ class AlignmentSeqTestCase(unittest.TestCase):
            seq2 = next(records)
            yield seq1, seq2, filename
-    # Strip strings of their '-' before comparing them
    def assertSameResidues(self, str1, str2):
+        """Strip strings of their '-' before comparing them
+        """
        return self.assertEqual(
            str(str1).translate({ord('-'):None}),
            str(str2).translate({ord('-'):None})
            )
-    # Test alignments with the simplest metric. As there can be a huge number of
-    # solutions, we check only that we got the right score, and one valid
-    # alignment.
    def test_simple_align(self):
+        """Test alignments with the simplest metric. As there can be a huge number of
+        solutions, we check only that we got the right score, and one valid
+        alignment.
+        """
        from alignementseq import align
        score_fn = lambda a,b : -2 if a == '' or b == '' else -1 if a != b else 1
        for s1, s2, filename in self.get_dataset_heads():
            score, r1, r2 = align(s1, s2, score_fn)
            (ex_r1, ex_r2, exp_score, *_) = Bio.pairwise2.align.globalms(
-                s1.seq, s2.seq, 1, -1, -2, -2)[0]
+                s1.seq, s2.seq, 1, -1, -2, -2, one_alignment_only=True)[0]
            try:
                self.assertSameResidues(r1.seq, s1.seq)
@@ -78,18 +82,30 @@ class AlignmentSeqTestCase(unittest.TestCase):
                raise
    def test_blosum_align(self):
-        from alignementseq import align
+        from alignementseq import align, vec_align
-        cases = [
+        for s1, s2, filename in self.get_dataset_heads():
-                ("CHAT", "CAT", 10, "CHAT", "C-AT"),
+            (ex_r1, ex_r2, exp_score, *_) = Bio.pairwise2.align.globalds(
-                ("CHAT", "CGAT", 16, "CHAT", "CGAT"),
+                s1.seq, s2.seq, blosum62, -8, -8, one_alignment_only=True)[0]
-                ("CHAT", "AT", -7, "CHAT", "--AT")
+            for method in align, vec_align:
-                ]
+                score, r1, r2 = method(s1, s2)
-        for s1, s2, exp_score, exp_r1, exp_r2 in cases:
+                try:
-            score, r1, r2 = align(SeqRecord(s1), SeqRecord(s2))
+                    self.assertSameResidues(r1.seq, s1.seq)
-            self.assertEqual(score, exp_score)
+                    self.assertSameResidues(r2.seq, s2.seq)
-            self.assertEqual(r1.seq, exp_r1)
+                    self.assertEqual(score, sum(
-            self.assertEqual(r2.seq, exp_r2)
+                            -8 if a == '-' or b == '-' else
+                            blosum62[a,b] if (a,b) in blosum62 else
+                            blosum62[b,a]
+                        for a,b in zip(r1.seq, r2.seq)))
+                    self.assertEqual(score, exp_score)
+                except AssertionError:
+                    print('from', filename, "with", method, ":")
+                    print(r1.seq)
+                    print(r2.seq)
+                    print('')
+                    print(ex_r1)
+                    print(ex_r2)
+                    raise
 if __name__ == '__main__':
    unittest.main()