Skip to content
Snippets Groups Projects
Verified Commit 4dbdda67 authored by Etienne MORICE's avatar Etienne MORICE
Browse files

Added more serious unit tests, found two errors.

 * Forgotten an in-place modification of arguments in `align`
 * Wrong initialisation of first column in `align` (index confusion)
parent 17c0571f
No related branches found
No related tags found
No related merge requests found
...@@ -13,11 +13,11 @@ from Bio.Seq import Seq ...@@ -13,11 +13,11 @@ from Bio.Seq import Seq
import Bio.pairwise2 import Bio.pairwise2
seqs=[] #seqs=[]
handle = open("balibase/RV11.unaligned/BBS11001.fasta") #handle = open("balibase/RV11.unaligned/BBS11001.fasta")
for seq in Bio.SeqIO.parse(handle, "fasta"): #for seq in Bio.SeqIO.parse(handle, "fasta"):
seqs.append(seq) # seqs.append(seq)
handle.close() #handle.close()
mat = Bio.SubsMat.MatrixInfo.blosum62 mat = Bio.SubsMat.MatrixInfo.blosum62
...@@ -51,7 +51,7 @@ def align(seq1, seq2, score=score): ...@@ -51,7 +51,7 @@ def align(seq1, seq2, score=score):
mm[i][0] = mm[i-1][0] + score('', x[i-1]) mm[i][0] = mm[i-1][0] + score('', x[i-1])
path[i][0] = (i-1, 0) path[i][0] = (i-1, 0)
for j in range(1, m+1): for j in range(1, m+1):
mm[0][j] = mm[j-1][0] + score('', y[j-1]) mm[0][j] = mm[0][j-1] + score('', y[j-1])
path[0][j] = (0, j-1) path[0][j] = (0, j-1)
# fill table # fill table
...@@ -100,11 +100,11 @@ def align(seq1, seq2, score=score): ...@@ -100,11 +100,11 @@ def align(seq1, seq2, score=score):
alx+='-' alx+='-'
aly+=y[j-1] aly+=y[j-1]
j -=1 j -=1
alx = alx[::-1] aseq1 = SeqRecord(seq1)
aly = aly[::-1] aseq1.seq = Seq(alx[::-1])
seq1.seq = Seq(alx) aseq2 = SeqRecord(seq2)
seq2.seq = Seq(aly) aseq2.seq = Seq(aly[::-1])
return bestScore, seq1, seq2 return bestScore, aseq1, aseq2
def align2steps(seq1, seq2, d=8, e=4): def align2steps(seq1, seq2, d=8, e=4):
......
#!/usr/bin/env python3
import unittest import unittest
import urllib.request import urllib.request
import os import os
import zipfile import zipfile
import Bio.SeqIO
import Bio.pairwise2
from Bio.SeqRecord import SeqRecord from Bio.SeqRecord import SeqRecord
class AlignmentSeqTestCase(unittest.TestCase): class AlignmentSeqTestCase(unittest.TestCase):
def setUp(self): def setUp(self):
balibase_zippath = "balibase.zip" balibase_zippath = "balibase.zip"
balibase_path = "balibase" self.balibase_path = "balibase"
testfile_path = os.path.join( testfile_path = os.path.join(
balibase_path, self.balibase_path,
"RV11.unaligned", "BBS11001.fasta") "RV11.unaligned", "BBS11001.fasta")
if not os.path.isdir(balibase_path): if not os.path.isdir(self.balibase_path):
os.mkdir(balibase_path) os.mkdir(self.balibase_path)
if not os.path.isfile(testfile_path): if not os.path.isfile(testfile_path):
if not os.path.isfile(balibase_zippath): if not os.path.isfile(balibase_zippath):
print("Fetching balibase archive from moodle...") print("Fetching balibase archive from moodle...")
...@@ -24,17 +28,65 @@ class AlignmentSeqTestCase(unittest.TestCase): ...@@ -24,17 +28,65 @@ class AlignmentSeqTestCase(unittest.TestCase):
with zipfile.ZipFile(balibase_zippath) as balibase_zip: with zipfile.ZipFile(balibase_zippath) as balibase_zip:
balibase_zip.extractall() balibase_zip.extractall()
# Generator function to iterate over the first two sequences of each
# unaligned fasta file.
def get_dataset_heads(self):
dataset_dir = os.path.join(self.balibase_path, "RV11.unaligned")
for filename in os.listdir(dataset_dir):
records = Bio.SeqIO.parse(
os.path.join(dataset_dir, filename),
"fasta")
seq1 = next(records)
seq2 = next(records)
yield seq1, seq2, filename
# Strip strings of their '-' before comparing them
def assertSameResidues(self, str1, str2):
return self.assertEqual(
str(str1).translate({ord('-'):None}),
str(str2).translate({ord('-'):None})
)
# Test alignments with the simplest metric. As there can be a huge number of
# solutions, we check only that we got the right score, and one valid
# alignment.
def test_simple_align(self): def test_simple_align(self):
from alignementseq import align from alignementseq import align
score_fn = lambda a,b : -2 if a == '' or b == '' else -1 if a != b else 1 score_fn = lambda a,b : -2 if a == '' or b == '' else -1 if a != b else 1
for s1, s2, filename in self.get_dataset_heads():
score, r1, r2 = align(s1, s2, score_fn)
(ex_r1, ex_r2, exp_score, *_) = Bio.pairwise2.align.globalms(
s1.seq, s2.seq, 1, -1, -2, -2)[0]
try:
self.assertSameResidues(r1.seq, s1.seq)
self.assertSameResidues(r2.seq, s2.seq)
self.assertEqual(score, sum(
-2 if a == '-' or b == '-' else
-1 if a != b else
1
for a,b in zip(r1.seq, r2.seq)))
self.assertEqual(score, exp_score)
except AssertionError:
print('from', filename, ':')
print(r1.seq)
print(r2.seq)
print('')
print(ex_r1)
print(ex_r2)
raise
def test_blosum_align(self):
from alignementseq import align
cases = [ cases = [
("CHAT", "CAT", 1, "CHAT", "C-AT"), ("CHAT", "CAT", 10, "CHAT", "C-AT"),
("CHAT", "CGAT", 2, "CHAT", "CGAT"), ("CHAT", "CGAT", 16, "CHAT", "CGAT"),
("CHAT", "AT", -2, "CHAT", "--AT") ("CHAT", "AT", -7, "CHAT", "--AT")
] ]
for s1, s2, exp_score, exp_r1, exp_r2 in cases: for s1, s2, exp_score, exp_r1, exp_r2 in cases:
score, r1, r2 = align(SeqRecord(s1), SeqRecord(s2), score_fn) score, r1, r2 = align(SeqRecord(s1), SeqRecord(s2))
self.assertEqual(score, exp_score) self.assertEqual(score, exp_score)
self.assertEqual(r1.seq, exp_r1) self.assertEqual(r1.seq, exp_r1)
self.assertEqual(r2.seq, exp_r2) self.assertEqual(r2.seq, exp_r2)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment