Newer
Older
#!/usr/bin/env python3
import unittest
import urllib.request
import os
import zipfile
import Bio.SeqIO
import Bio.pairwise2
from Bio.SeqRecord import SeqRecord
from Bio.SubsMat.MatrixInfo import blosum62
class AlignmentSeqTestCase(unittest.TestCase):
def setUp(self):
balibase_zippath = "balibase.zip"
self.balibase_path = "balibase"
testfile_path = os.path.join(
"RV11.unaligned", "BBS11001.fasta")
if not os.path.isdir(self.balibase_path):
os.mkdir(self.balibase_path)
if not os.path.isfile(testfile_path):
if not os.path.isfile(balibase_zippath):
print("Fetching balibase archive from moodle...")
urllib.request.urlretrieve(
"https://moodle.polytechnique.fr/mod/resource/view.php?id=38570",
balibase_zippath)
with zipfile.ZipFile(balibase_zippath) as balibase_zip:
balibase_zip.extractall()
def get_dataset_heads(self):
"""Generator function to iterate over the first two sequences of each
unaligned fasta file.
"""
dataset_dir = os.path.join(self.balibase_path, "RV11.unaligned")
for filename in os.listdir(dataset_dir):
records = Bio.SeqIO.parse(
os.path.join(dataset_dir, filename),
"fasta")
seq1 = next(records)
seq2 = next(records)
yield seq1, seq2, filename
def assertSameResidues(self, str1, str2):
"""Strip strings of their '-' before comparing them
"""
return self.assertEqual(
str(str1).translate({ord('-'):None}),
str(str2).translate({ord('-'):None})
)
"""Test alignments with the simplest metric. As there can be a huge number of
solutions, we check only that we got the right score, and one valid
alignment.
"""
from alignementseq import align
score_fn = lambda a,b : -2 if a == '' or b == '' else -1 if a != b else 1
for s1, s2, filename in self.get_dataset_heads():
score, r1, r2 = align(s1, s2, score_fn)
(ex_r1, ex_r2, exp_score, *_) = Bio.pairwise2.align.globalms(
s1.seq, s2.seq, 1, -1, -2, -2, one_alignment_only=True)[0]
try:
self.assertSameResidues(r1.seq, s1.seq)
self.assertSameResidues(r2.seq, s2.seq)
self.assertEqual(score, sum(
-2 if a == '-' or b == '-' else
-1 if a != b else
1
for a,b in zip(r1.seq, r2.seq)))
self.assertEqual(score, exp_score)
except AssertionError:
print('from', filename, ':')
print(r1.seq)
print(r2.seq)
print('')
print(ex_r1)
print(ex_r2)
raise
def test_blosum_align(self):
from alignementseq import align, vec_align
for s1, s2, filename in self.get_dataset_heads():
(ex_r1, ex_r2, exp_score, *_) = Bio.pairwise2.align.globalds(
s1.seq, s2.seq, blosum62, -8, -8, one_alignment_only=True)[0]
for method in align, vec_align:
score, r1, r2 = method(s1, s2)
try:
self.assertSameResidues(r1.seq, s1.seq)
self.assertSameResidues(r2.seq, s2.seq)
self.assertEqual(score, sum(
-8 if a == '-' or b == '-' else
blosum62[a,b] if (a,b) in blosum62 else
blosum62[b,a]
for a,b in zip(r1.seq, r2.seq)))
self.assertEqual(score, exp_score)
except AssertionError:
print('from', filename, "with", method, ":")
print(r1.seq)
print(r2.seq)
print('')
print(ex_r1)
print(ex_r2)
raise
if __name__ == '__main__':
unittest.main()