#!/usr/bin/env python3 import unittest import urllib.request import os import zipfile import Bio.SeqIO import Bio.pairwise2 from Bio.SeqRecord import SeqRecord class AlignmentSeqTestCase(unittest.TestCase): def setUp(self): balibase_zippath = "balibase.zip" self.balibase_path = "balibase" testfile_path = os.path.join( self.balibase_path, "RV11.unaligned", "BBS11001.fasta") if not os.path.isdir(self.balibase_path): os.mkdir(self.balibase_path) if not os.path.isfile(testfile_path): if not os.path.isfile(balibase_zippath): print("Fetching balibase archive from moodle...") urllib.request.urlretrieve( "https://moodle.polytechnique.fr/mod/resource/view.php?id=38570", balibase_zippath) with zipfile.ZipFile(balibase_zippath) as balibase_zip: balibase_zip.extractall() # Generator function to iterate over the first two sequences of each # unaligned fasta file. def get_dataset_heads(self): dataset_dir = os.path.join(self.balibase_path, "RV11.unaligned") for filename in os.listdir(dataset_dir): records = Bio.SeqIO.parse( os.path.join(dataset_dir, filename), "fasta") seq1 = next(records) seq2 = next(records) yield seq1, seq2, filename # Strip strings of their '-' before comparing them def assertSameResidues(self, str1, str2): return self.assertEqual( str(str1).translate({ord('-'):None}), str(str2).translate({ord('-'):None}) ) # Test alignments with the simplest metric. As there can be a huge number of # solutions, we check only that we got the right score, and one valid # alignment. def test_simple_align(self): from alignementseq import align score_fn = lambda a,b : -2 if a == '' or b == '' else -1 if a != b else 1 for s1, s2, filename in self.get_dataset_heads(): score, r1, r2 = align(s1, s2, score_fn) (ex_r1, ex_r2, exp_score, *_) = Bio.pairwise2.align.globalms( s1.seq, s2.seq, 1, -1, -2, -2)[0] try: self.assertSameResidues(r1.seq, s1.seq) self.assertSameResidues(r2.seq, s2.seq) self.assertEqual(score, sum( -2 if a == '-' or b == '-' else -1 if a != b else 1 for a,b in zip(r1.seq, r2.seq))) self.assertEqual(score, exp_score) except AssertionError: print('from', filename, ':') print(r1.seq) print(r2.seq) print('') print(ex_r1) print(ex_r2) raise def test_blosum_align(self): from alignementseq import align cases = [ ("CHAT", "CAT", 10, "CHAT", "C-AT"), ("CHAT", "CGAT", 16, "CHAT", "CGAT"), ("CHAT", "AT", -7, "CHAT", "--AT") ] for s1, s2, exp_score, exp_r1, exp_r2 in cases: score, r1, r2 = align(SeqRecord(s1), SeqRecord(s2)) self.assertEqual(score, exp_score) self.assertEqual(r1.seq, exp_r1) self.assertEqual(r2.seq, exp_r2) if __name__ == '__main__': unittest.main()