Benchmark and switch to traceback matrix for vec_align for #10

60dc8f50 · Etienne MORICE · b2053a14 · 60dc8f50 · 60dc8f50 · 60dc8f50
Verified Commit 60dc8f50 authored 6 years ago by Etienne MORICE
--- a/alignementseq.py
+++ b/alignementseq.py
@@ -232,6 +232,111 @@ def vec_align(seq1, seq2, mat=dict_to_weight(Bio.SubsMat.MatrixInfo.blosum62), g
    aseq2.seq = Seq(aly[::-1])
    return S[n+m][0], aseq1, aseq2

+def vec_align2(seq1, seq2, mat=dict_to_weight(Bio.SubsMat.MatrixInfo.blosum62), gap=-8):
+    """Proof of concept vector-style implementation of align, written mainly in numpy
+    arrays operations.
+
+    M(i,j) is the score of the best alignement containing exactly i letters of
+    x and j letters of y.
+    To allow vector operations, we rewrite M as S(s, k) with s=i+j and 
+    k=i-max(0,s-m)
+    Hence:
+        M(i,j) = S(i+j, i-max(0,i+j-m))
+        S(s, k) = M(k+max(0,s-m), min(s,m)-k)
+    The relation become:
+        M(i,j) = max( M(i,j-1)+gap, M(i-1,j)+gap, M(i-1,j-1)+s(i,j) )
+        S(s,k) = max(
+           S(s-1,k+1(s>m))+gap,
+           S(s-1,k-1+1(s>m))+gap,
+           S(s-2,k-1+1(s>m)+1(s>m+1)) + s(k+max(0,s-m),min(s,m)-k)
+        )
+    Note that S(s,k) does not depend anymore on S(k,.), allowing vector operation
+    Last index of the diag s can be written (useful for simplifications):
+        min(n,s)-max(0,s-m)
+        = min(n, m, s, n+m-s)
+        = min(m,s)-max(0,s-n)
+    """
+
+    n = len(seq1.seq)
+    m = len(seq2.seq)
+    x = np.array(list(ord(c)-ord('A') for c in seq1.seq))
+    y = np.array(list(ord(c)-ord('A') for c in seq2.seq))
+    mat = mat.flatten()
+
+    # Traceback matrix, bit 0 = gap, bit 1 = gap on y side
+    T = [ np.empty(min(s,n,m,n+m-s)+1, dtype=np.byte) for s in range(n+m+1) ]
+    # Scores, we just have to remember three lines
+    S = tuple( np.empty(min(n,m)+1, dtype=np.int32) for s in range(3) )
+
+    S[0][0] = 0
+    S[1][0] = gap
+    S[1][1] = gap
+
+    # T[0][0] never used
+    T[1][0] = 1 | 0
+    T[1][1] = 1 | 2
+
+    x *= 26
+    for s in range(2,n+m+1):
+        # First/last cell must be treated differently when first line/col of M
+        if(s <= m):
+            S[2][0] = S[1][0] + gap
+            T[s][0] = 1 | 0
+        if(s <= n):
+            S[2][s-max(0,s-m)] = S[1][s-1-max(0,s-1-m)] + gap
+            T[s][s-max(0,s-m)] = 1 | 2
+
+        # The offsets in the formula become begin/end of slices.
+        gapped = S[1] + gap
+
+        max_gap = np.maximum(
+            gapped[1:len(T[s-1])],
+            gapped[:len(T[s-1])-1])
+        T[s][int(s<=m):len(T[s])-int(s<=n)] = np.equal(
+                max_gap,
+                gapped[:len(T[s-1])-1]
+                ) << 1
+
+        S[2][int(s<=m):len(T[s])-int(s<=n)] = np.maximum(max_gap,
+            S[0][int(s>m+1):len(T[s-2])-int(s>n+1)]
+            +mat[
+                x[max(0,s-m-1):min(n,s-1)] + y[max(0,s-n-1):min(m,s-1)][::-1]
+                ]
+            )
+        T[s][int(s<=m):len(T[s])-int(s<=n)] |= np.equal(
+                max_gap, 
+                S[2][int(s<=m):len(T[s])-int(s<=n)]
+                )
+
+        S = (S[1], S[2], S[0])
+
+    # Trace back
+    i,j = n,m
+    alx, aly = "",""
+    while i > 0 or j > 0:
+        is_gap = T[i+j][i-max(0,i+j-m)] & 1
+        if is_gap:
+            is_y_gap = T[i+j][i-max(0,i+j-m)] & 2
+            if is_y_gap:
+                i -= 1
+                alx += seq1[i]
+                aly += '-'
+            else:
+                j -= 1
+                aly += seq2[j]
+                alx += '-'
+        else:
+            i -= 1
+            j -= 1
+            alx += seq1[i]
+            aly += seq2[j]
+    
+    aseq1 = SeqRecord(seq1)
+    aseq1.seq = Seq(alx[::-1])
+    aseq2 = SeqRecord(seq2)
+    aseq2.seq = Seq(aly[::-1])
+    return S[1][0], aseq1, aseq2
+
 def align_dihedrals(seq1, seq2, angles1, angles2):
    """
    Align two sequences, by using a simple cost function using dihedral angles

--- a/alignementseq_multiple.py
+++ b/alignementseq_multiple.py
-from alignementseq import score
+from alignementseq import score, vec_align
 from Bio.SeqRecord import SeqRecord
 from Bio.Seq import Seq

+import tree_generator
+
+def multiple_align(sequences):
+    """Top-level multiple alignment routine
+
+    Arguments:
+        sequences: lists of SeqRecord objects
+
+    Returns:
+        List of aligned SeqRecord objects, with '-' where needed.
+    """
+
+    tree = tree_generator.tree_build(sequences, vec_align)
+    return multiple_align_from_linkage_matrix(sequences, tree)
+
 def multiple_align_from_tree(tree):
    """Multiple alignmnent from a SeqRecord tree
-    
+ 
    Arguments:
        tree: nested structure of python tuples, with SeqRecord objects as leafs
    """
@@ -27,7 +42,7 @@ def multiple_align_from_tree(tree):

 def multiple_align_from_linkage_matrix(sequences, tree):
    """Multiple alignmnent from a SeqRecord tree
-    
+ 
    Arguments:
        sequences: indexable container of SeqRecord objects to align
        tree: scipy.hierarchy linkage matrix
@@ -43,7 +58,7 @@ def multiple_align_from_linkage_matrix(sequences, tree):
        inode = int(inode)
        if inode < n:
            return [sequences[inode]]
-        ileft, iright, dist, size = tree[inode - n]
+        ileft, iright, _, _ = tree[inode - n]
        aleft = _join(ileft)
        aright = _join(iright)
        return join_alignments(aleft, aright)
@@ -58,12 +73,12 @@ def _score(a, b, base_score=score):
        if b == '-':
            return 0
        else:
-            return score('', b)
+            return base_score('', b)
    else:
        if b == '-':
-            return score('', a)
+            return base_score('', a)
        else:
-            return score(a, b)
+            return base_score(a, b)


 def _mscore(seqs1, seqs2, i, j, _score=_score):
@@ -135,9 +150,9 @@ def join_alignments(aleft, aright, _mscore=_mscore, gap=-8):
            for jseq in range(len(aright)):
                aseqs[len(aleft)+jseq] += aright[jseq][nextj]
        i,j = nexti,nextj
-    
-    ret = [ SeqRecord(id=s.id, seq="") for s in aleft ] + [ SeqRecord(id=s.id, seq="") for s in aright ]
+
+    ret = [ SeqRecord(id=s.id, seq="", description=s.description) 
+            for alignment in (aleft, aright) for s in alignment ]
    for rec, seq in zip(ret, aseqs):
        rec.seq = Seq(seq[::-1])
    return ret
-
--- a/alignementseq_tests.py
+++ b/alignementseq_tests.py
@@ -176,8 +176,7 @@ class BalibaseTestCase(unittest.TestCase):
            yield records, filename

 class AlignmentSeqTestCase(BalibaseTestCase):
-    """
-    TestCase for alignment correctness and quality
+    """ TestCase for alignment correctness and quality
    """

    def assert_same_residues(self, str1, str2):
@@ -226,13 +225,13 @@ class AlignmentSeqTestCase(BalibaseTestCase):
        """Tests alignments with blosum but no gap extension.
        """

-        from alignementseq import align, vec_align
+        from alignementseq import align, vec_align, vec_align2
        aligner = Bio.pairwise2.align
        
        for seq1, seq2, filename in self.get_dataset_heads():
            (ex_r1, ex_r2, exp_score, *_) = aligner.globalds(
                seq1.seq, seq2.seq, blosum62, -8, -8, one_alignment_only=True)[0]
-            for method in align, vec_align:
+            for method in align, vec_align, vec_align2:
                score, ret1, ret2 = method(seq1, seq2)
                try:
                    self.assert_same_residues(ret1.seq, seq1.seq)
@@ -328,37 +327,58 @@ class AlignmentSeqTestCase(BalibaseTestCase):
        for j in jobs:
            j.join()

-    def create_alignments_with(self, method, method_name=None):
-        """
-        Create fasta files with aligned sequences using the given method, under
-        balibase_path/<method_name>
-        """
-
-        dataset_dir = os.path.join(self.balibase_path, "RV11.unaligned")
-        out_dir = os.path.join(self.balibase_path, method_name or method.__name__)
-
-        for in_filename in os.listdir(dataset_dir):
-            sequences = list(Bio.SeqIO.parse(in_filename, "fasta"))
-
    def test_benchmark_multiple_align(self):
-        """Tests the multiple_align function (using blosum and gap extension)."""
+        """Perform multiple alignments with different methods and evaluate the
+        results with bali_score"""

        res_dir = "balibase_results"
        if not os.path.isdir(res_dir):
            os.mkdir(res_dir)

-        # Cached
+        # Define methods to test
+        from alignementseq_multiple import multiple_align
+        methods_test = [ multiple_align ]
+        methods_test_names = [ m.__name__ for m in methods_test ]
+        methods_eval_names = methods_test_names + ["clustalo"]
+
+        # Create clustalo alignment separately, and cache them
        self.create_clustalo_alignments()
         
+        # Align and eval with every other method
+        ## Directories
+        dataset_dir = os.path.join(self.balibase_path, "RV11.unaligned")
        ref_dataset_dir = os.path.join(self.balibase_path, "RV11.aligned")
        clustalo_dir = os.path.join(self.balibase_path, "clustalo")
+        for name in methods_test_names:
+            out_dir = os.path.join(self.balibase_path, name)
+            if not os.path.isdir(out_dir):
+                os.mkdir(out_dir)

        scores = []
-        for filename in os.listdir(ref_dataset_dir):
-            scores.append(self.bali_score(
-                    os.path.join(ref_dataset_dir, filename),
-                    os.path.join(clustalo_dir, filename)
-                    ))
+
+        for in_filename in os.listdir(dataset_dir):
+
+            # Read input
+            sequences = list(Bio.SeqIO.parse(
+                os.path.join(dataset_dir, in_filename),
+                "fasta"
+                ))
+
+            # Create output for all but clustalo
+            for method, name in zip(methods_test, methods_test_names):
+                out_dir = os.path.join(self.balibase_path, name)
+                with open(os.path.join(out_dir, in_filename), "w") as out_fd:
+                    aseqs = method(sequences)
+                    Bio.SeqIO.write(aseqs, out_fd, "fasta")
+
+            for name in methods_eval_names:
+                out_dir = os.path.join(self.balibase_path, name)
+                scores.append(self.bali_score(
+                        os.path.join(ref_dataset_dir, in_filename),
+                        os.path.join(out_dir, in_filename)
+                        ))
+                scores[-1]['method'] = name
+
        scores = pd.DataFrame(scores)
        scores.to_csv(os.path.join(res_dir, "clustalo.csv"))

@@ -439,7 +459,7 @@ class PerformanceTestCase(BalibaseTestCase):
    def test_performance(self):
        """Create timing graph comparing speed of simple alignment methods
        """
-        from alignementseq import align, vec_align
+        from alignementseq import align, vec_align, vec_align2
        aligner = Bio.pairwise2.align
        def biopython_align(seq1, seq2):
            return aligner.globalds(
@@ -447,7 +467,7 @@ class PerformanceTestCase(BalibaseTestCase):
                one_alignment_only=True)[0]


-        methods = (vec_align, align, biopython_align)
+        methods = (vec_align, align, biopython_align, vec_align2)
        times = pd.DataFrame(columns=("length1", "length2", "method", "time"),
                dtype=float)
        for seq1, seq2, filename in self.get_dataset_heads():

--- a/tree_generator.py
+++ b/tree_generator.py
@@ -8,10 +8,10 @@ Created on Tue Jan 29 14:59:19 2019
 import numpy as np
 from Bio.SeqRecord import SeqRecord
 from Bio.Cluster import treecluster
-import alignementseq
-from scipy.cluster.hierarchy import average, dendrogram
+#import alignementseq
+from scipy.cluster.hierarchy import average #, dendrogram

-seqs = [SeqRecord('TAACCCCAAAAGAACCAGA'), SeqRecord('TTCTGTAGTAGAGATGGAATTAAGAAAAAACCATCAACTATAACCCCAAAAGAACCAGA'), SeqRecord('TTCTGTAGTAGAGATGGAATTAAGAAAAAACCATCAACTATAACCCCAAAAGAACCAGA'),SeqRecord('GTAGTAGAGATGGAATTAAGAAAAAACCATCAACTATAACCCCAAGAGAACCAGA'), SeqRecord('GAGCCGGATGAGAAGAAACTCTCATGTCCGGTTCTGTAGTAGAGATGGAATTAAGAAAAAACCATCAACTATAACCCCAAGAGAACCAGA'), SeqRecord('TTTTCATTCGCGAGGAGCCGGATGAGAAGAAACTCTCATGTCCGGTTCTGTAGTAGAGATGGAATTAAGAAAAAACCATCAACTATAACCCCAAGAGAACCAGA')]
+_test_seqs = [SeqRecord('TAACCCCAAAAGAACCAGA'), SeqRecord('TTCTGTAGTAGAGATGGAATTAAGAAAAAACCATCAACTATAACCCCAAAAGAACCAGA'), SeqRecord('TTCTGTAGTAGAGATGGAATTAAGAAAAAACCATCAACTATAACCCCAAAAGAACCAGA'),SeqRecord('GTAGTAGAGATGGAATTAAGAAAAAACCATCAACTATAACCCCAAGAGAACCAGA'), SeqRecord('GAGCCGGATGAGAAGAAACTCTCATGTCCGGTTCTGTAGTAGAGATGGAATTAAGAAAAAACCATCAACTATAACCCCAAGAGAACCAGA'), SeqRecord('TTTTCATTCGCGAGGAGCCGGATGAGAAGAAACTCTCATGTCCGGTTCTGTAGTAGAGATGGAATTAAGAAAAAACCATCAACTATAACCCCAAGAGAACCAGA')]

 #Arguments : une liste de séquences SeqRecord, une fonction d'alignement
 score_gap = -8
@@ -20,14 +20,14 @@ def tree_build(seqs, align):
    M = np.zeros((len(seqs),len(seqs))) 
    for i in range (len(seqs)):
        for j in range (i):
-            M[i][j], a, b = align(seqs[i], seqs[j])
+            M[i][j], _, _ = align(seqs[i], seqs[j])
            M[i][j] = 1-abs(M[i][j]/(max(len_seqs)*(11-score_gap)))
-    print(type(M), M)
+#    print(type(M), M)
 #    tree1 = treecluster(None, method = 'a', dist = 'e', distancematrix=M)
    tree= average(M)
    return tree


-#tree = tree_build(seqs, alignementseq.align2steps)
+#tree = tree_build(_test_seqs, alignementseq.align2steps)
 #print(tree)
 #dn = dendrogram(tree)