alignementseq_tests.py

#!/usr/bin/env python3

import unittest
import urllib.request
import os
import zipfile

import Bio.SeqIO
import Bio.pairwise2
from Bio.SeqRecord import SeqRecord
from Bio.SubsMat.MatrixInfo import blosum62

class AlignmentSeqTestCase(unittest.TestCase):
    def setUp(self):
        balibase_zippath = "balibase.zip"
        self.balibase_path = "balibase"
        testfile_path = os.path.join(
            self.balibase_path, 
            "RV11.unaligned", "BBS11001.fasta")

        if not os.path.isdir(self.balibase_path):
            os.mkdir(self.balibase_path)
        if not os.path.isfile(testfile_path):
            if not os.path.isfile(balibase_zippath):
                print("Fetching balibase archive from moodle...")
                urllib.request.urlretrieve(
                    "https://moodle.polytechnique.fr/mod/resource/view.php?id=38570",
                    balibase_zippath)
            with zipfile.ZipFile(balibase_zippath) as balibase_zip:
                balibase_zip.extractall()

    def get_dataset_heads(self):
        """Generator function to iterate over the first two sequences of each
        unaligned fasta file.
        """
        dataset_dir = os.path.join(self.balibase_path, "RV11.unaligned")
        for filename in os.listdir(dataset_dir):
            records = Bio.SeqIO.parse(
                    os.path.join(dataset_dir, filename),
                    "fasta")
            seq1 = next(records)
            seq2 = next(records)
            yield seq1, seq2, filename

    def assertSameResidues(self, str1, str2):
        """Strip strings of their '-' before comparing them
        """
        return self.assertEqual(
            str(str1).translate({ord('-'):None}),
            str(str2).translate({ord('-'):None})
            )

    def test_simple_align(self):
        """Test alignments with the simplest metric. As there can be a huge number of
        solutions, we check only that we got the right score, and one valid
        alignment.
        """
        from alignementseq import align
        score_fn = lambda a,b : -2 if a == '' or b == '' else -1 if a != b else 1

        for s1, s2, filename in self.get_dataset_heads():
            score, r1, r2 = align(s1, s2, score_fn)
            (ex_r1, ex_r2, exp_score, *_) = Bio.pairwise2.align.globalms(
                s1.seq, s2.seq, 1, -1, -2, -2, one_alignment_only=True)[0]

            try:
                self.assertSameResidues(r1.seq, s1.seq)
                self.assertSameResidues(r2.seq, s2.seq)
                self.assertEqual(score, sum(
                        -2 if a == '-' or b == '-' else
                        -1 if a != b else 
                        1
                    for a,b in zip(r1.seq, r2.seq)))
                self.assertEqual(score, exp_score)
            except AssertionError:
                print('from', filename, ':')
                print(r1.seq)
                print(r2.seq)
                print('')
                print(ex_r1)
                print(ex_r2)
                raise

    def test_blosum_align(self):
        from alignementseq import align, vec_align
        
        for s1, s2, filename in self.get_dataset_heads():
            (ex_r1, ex_r2, exp_score, *_) = Bio.pairwise2.align.globalds(
                s1.seq, s2.seq, blosum62, -8, -8, one_alignment_only=True)[0]
            for method in align, vec_align:
                score, r1, r2 = method(s1, s2)
                try:
                    self.assertSameResidues(r1.seq, s1.seq)
                    self.assertSameResidues(r2.seq, s2.seq)
                    self.assertEqual(score, sum(
                            -8 if a == '-' or b == '-' else
                            blosum62[a,b] if (a,b) in blosum62 else
                            blosum62[b,a]
                        for a,b in zip(r1.seq, r2.seq)))
                    self.assertEqual(score, exp_score)
                except AssertionError:
                    print('from', filename, "with", method, ":")
                    print(r1.seq)
                    print(r2.seq)
                    print('')
                    print(ex_r1)
                    print(ex_r2)
                    raise

if __name__ == '__main__':
    unittest.main()