#!/usr/bin/env python3 import unittest import urllib.request import os import zipfile import Bio.SeqIO import Bio.pairwise2 from Bio.SeqRecord import SeqRecord from Bio.SubsMat.MatrixInfo import blosum62 class AlignmentSeqTestCase(unittest.TestCase): def setUp(self): balibase_zippath = "balibase.zip" self.balibase_path = "balibase" testfile_path = os.path.join( self.balibase_path, "RV11.unaligned", "BBS11001.fasta") if not os.path.isdir(self.balibase_path): os.mkdir(self.balibase_path) if not os.path.isfile(testfile_path): if not os.path.isfile(balibase_zippath): print("Fetching balibase archive from moodle...") urllib.request.urlretrieve( "https://moodle.polytechnique.fr/mod/resource/view.php?id=38570", balibase_zippath) with zipfile.ZipFile(balibase_zippath) as balibase_zip: balibase_zip.extractall() def get_dataset_heads(self): """Generator function to iterate over the first two sequences of each unaligned fasta file. """ dataset_dir = os.path.join(self.balibase_path, "RV11.unaligned") for filename in os.listdir(dataset_dir): records = Bio.SeqIO.parse( os.path.join(dataset_dir, filename), "fasta") seq1 = next(records) seq2 = next(records) yield seq1, seq2, filename def assertSameResidues(self, str1, str2): """Strip strings of their '-' before comparing them """ return self.assertEqual( str(str1).translate({ord('-'):None}), str(str2).translate({ord('-'):None}) ) def test_simple_align(self): """Test alignments with the simplest metric. As there can be a huge number of solutions, we check only that we got the right score, and one valid alignment. """ from alignementseq import align score_fn = lambda a,b : -2 if a == '' or b == '' else -1 if a != b else 1 for s1, s2, filename in self.get_dataset_heads(): score, r1, r2 = align(s1, s2, score_fn) (ex_r1, ex_r2, exp_score, *_) = Bio.pairwise2.align.globalms( s1.seq, s2.seq, 1, -1, -2, -2, one_alignment_only=True)[0] try: self.assertSameResidues(r1.seq, s1.seq) self.assertSameResidues(r2.seq, s2.seq) self.assertEqual(score, sum( -2 if a == '-' or b == '-' else -1 if a != b else 1 for a,b in zip(r1.seq, r2.seq))) self.assertEqual(score, exp_score) except AssertionError: print('from', filename, ':') print(r1.seq) print(r2.seq) print('') print(ex_r1) print(ex_r2) raise def test_blosum_align(self): from alignementseq import align, vec_align for s1, s2, filename in self.get_dataset_heads(): (ex_r1, ex_r2, exp_score, *_) = Bio.pairwise2.align.globalds( s1.seq, s2.seq, blosum62, -8, -8, one_alignment_only=True)[0] for method in align, vec_align: score, r1, r2 = method(s1, s2) try: self.assertSameResidues(r1.seq, s1.seq) self.assertSameResidues(r2.seq, s2.seq) self.assertEqual(score, sum( -8 if a == '-' or b == '-' else blosum62[a,b] if (a,b) in blosum62 else blosum62[b,a] for a,b in zip(r1.seq, r2.seq))) self.assertEqual(score, exp_score) except AssertionError: print('from', filename, "with", method, ":") print(r1.seq) print(r2.seq) print('') print(ex_r1) print(ex_r2) raise if __name__ == '__main__': unittest.main()