Added more serious unit tests, found two errors.

* Forgotten an in-place modification of arguments in `align` * Wrong initialisation of first column in `align` (index confusion)

Added more serious unit tests, found two errors.
4dbdda67 · Etienne MORICE · 17c0571f · 4dbdda67 · 4dbdda67
Verified Commit 4dbdda67 authored 6 years ago by Etienne MORICE
--- a/alignementseq.py
+++ b/alignementseq.py
@@ -13,11 +13,11 @@ from Bio.Seq import Seq
 import Bio.pairwise2
-seqs=[]
+#seqs=[]
-handle = open("balibase/RV11.unaligned/BBS11001.fasta")
+#handle = open("balibase/RV11.unaligned/BBS11001.fasta")
-for seq in Bio.SeqIO.parse(handle, "fasta"):
+#for seq in Bio.SeqIO.parse(handle, "fasta"):
-    seqs.append(seq)
+#    seqs.append(seq)
-handle.close()
+#handle.close()
 mat = Bio.SubsMat.MatrixInfo.blosum62
@@ -51,7 +51,7 @@ def align(seq1, seq2, score=score):
        mm[i][0] = mm[i-1][0] + score('', x[i-1])
        path[i][0] = (i-1, 0)
    for j in range(1, m+1):
-        mm[0][j] = mm[j-1][0] + score('', y[j-1])
+        mm[0][j] = mm[0][j-1] + score('', y[j-1])
        path[0][j] = (0, j-1)
    # fill table
@@ -100,11 +100,11 @@ def align(seq1, seq2, score=score):
        alx+='-'
        aly+=y[j-1]
        j -=1
-    alx = alx[::-1]
+    aseq1 = SeqRecord(seq1)
-    aly = aly[::-1]
+    aseq1.seq = Seq(alx[::-1])
-    seq1.seq = Seq(alx)
+    aseq2 = SeqRecord(seq2)
-    seq2.seq = Seq(aly)
+    aseq2.seq = Seq(aly[::-1])
-    return bestScore, seq1, seq2
+    return bestScore, aseq1, aseq2
 def align2steps(seq1, seq2, d=8, e=4):

--- a/alignementseq_tests.py
+++ b/alignementseq_tests.py
+#!/usr/bin/env python3
 import unittest
 import urllib.request
 import os
 import zipfile
+import Bio.SeqIO
+import Bio.pairwise2
 from Bio.SeqRecord import SeqRecord
 class AlignmentSeqTestCase(unittest.TestCase):
    def setUp(self):
        balibase_zippath = "balibase.zip"
-        balibase_path = "balibase"
+        self.balibase_path = "balibase"
        testfile_path = os.path.join(
-            balibase_path, 
+            self.balibase_path, 
            "RV11.unaligned", "BBS11001.fasta")
-        if not os.path.isdir(balibase_path):
+        if not os.path.isdir(self.balibase_path):
-            os.mkdir(balibase_path)
+            os.mkdir(self.balibase_path)
        if not os.path.isfile(testfile_path):
            if not os.path.isfile(balibase_zippath):
                print("Fetching balibase archive from moodle...")
@@ -24,17 +28,65 @@ class AlignmentSeqTestCase(unittest.TestCase):
            with zipfile.ZipFile(balibase_zippath) as balibase_zip:
                balibase_zip.extractall()
+    # Generator function to iterate over the first two sequences of each
+    # unaligned fasta file.
+    def get_dataset_heads(self):
+        dataset_dir = os.path.join(self.balibase_path, "RV11.unaligned")
+        for filename in os.listdir(dataset_dir):
+            records = Bio.SeqIO.parse(
+                    os.path.join(dataset_dir, filename),
+                    "fasta")
+            seq1 = next(records)
+            seq2 = next(records)
+            yield seq1, seq2, filename
+    # Strip strings of their '-' before comparing them
+    def assertSameResidues(self, str1, str2):
+        return self.assertEqual(
+            str(str1).translate({ord('-'):None}),
+            str(str2).translate({ord('-'):None})
+            )
+    # Test alignments with the simplest metric. As there can be a huge number of
+    # solutions, we check only that we got the right score, and one valid
+    # alignment.
    def test_simple_align(self):
        from alignementseq import align
        score_fn = lambda a,b : -2 if a == '' or b == '' else -1 if a != b else 1
+        for s1, s2, filename in self.get_dataset_heads():
+            score, r1, r2 = align(s1, s2, score_fn)
+            (ex_r1, ex_r2, exp_score, *_) = Bio.pairwise2.align.globalms(
+                s1.seq, s2.seq, 1, -1, -2, -2)[0]
+            try:
+                self.assertSameResidues(r1.seq, s1.seq)
+                self.assertSameResidues(r2.seq, s2.seq)
+                self.assertEqual(score, sum(
+                        -2 if a == '-' or b == '-' else
+                        -1 if a != b else 
+                        1
+                    for a,b in zip(r1.seq, r2.seq)))
+                self.assertEqual(score, exp_score)
+            except AssertionError:
+                print('from', filename, ':')
+                print(r1.seq)
+                print(r2.seq)
+                print('')
+                print(ex_r1)
+                print(ex_r2)
+                raise
+    def test_blosum_align(self):
+        from alignementseq import align
        cases = [
-                ("CHAT", "CAT", 1, "CHAT", "C-AT"),
+                ("CHAT", "CAT", 10, "CHAT", "C-AT"),
-                ("CHAT", "CGAT", 2, "CHAT", "CGAT"),
+                ("CHAT", "CGAT", 16, "CHAT", "CGAT"),
-                ("CHAT", "AT", -2, "CHAT", "--AT")
+                ("CHAT", "AT", -7, "CHAT", "--AT")
                ]
        for s1, s2, exp_score, exp_r1, exp_r2 in cases:
-            score, r1, r2 = align(SeqRecord(s1), SeqRecord(s2), score_fn)
+            score, r1, r2 = align(SeqRecord(s1), SeqRecord(s2))
            self.assertEqual(score, exp_score)
            self.assertEqual(r1.seq, exp_r1)
            self.assertEqual(r2.seq, exp_r2)