Skip to content
Snippets Groups Projects
Verified Commit e2463bc9 authored by Etienne MORICE's avatar Etienne MORICE
Browse files

Vectorised version of simple align. Performance tests to come.

parent 4dbdda67
No related branches found
No related tags found
No related merge requests found
...@@ -6,6 +6,8 @@ Created on Tue Jan 15 15:02:16 2019 ...@@ -6,6 +6,8 @@ Created on Tue Jan 15 15:02:16 2019
@author: ariane.delrocq @author: ariane.delrocq
""" """
import numpy as np
import Bio.SubsMat.MatrixInfo import Bio.SubsMat.MatrixInfo
import Bio.SeqIO import Bio.SeqIO
from Bio.SeqRecord import SeqRecord from Bio.SeqRecord import SeqRecord
...@@ -21,8 +23,9 @@ import Bio.pairwise2 ...@@ -21,8 +23,9 @@ import Bio.pairwise2
mat = Bio.SubsMat.MatrixInfo.blosum62 mat = Bio.SubsMat.MatrixInfo.blosum62
def score(a, b, gap=-8):
def score(a, b, gap=-8, id=1, mut=-1): """Default scoring function for align
"""
if not a or not b: if not a or not b:
return gap return gap
else: else:
...@@ -32,13 +35,16 @@ def score(a, b, gap=-8, id=1, mut=-1): ...@@ -32,13 +35,16 @@ def score(a, b, gap=-8, id=1, mut=-1):
return mat[a,b] return mat[a,b]
except KeyError: except KeyError:
return mat[b,a] return mat[b,a]
# elif a==b:
# return id
# else:
# return mut
def align(seq1, seq2, score=score): def align(seq1, seq2, score=score):
"""Main implementation of align, with blosum scoring and no specific gap
extension penalties.
Args:
seq1: A SeqRecord object
seq2: Same.
"""
x = seq1.seq x = seq1.seq
y = seq2.seq y = seq2.seq
n=len(x) n=len(x)
...@@ -106,8 +112,111 @@ def align(seq1, seq2, score=score): ...@@ -106,8 +112,111 @@ def align(seq1, seq2, score=score):
aseq2.seq = Seq(aly[::-1]) aseq2.seq = Seq(aly[::-1])
return bestScore, aseq1, aseq2 return bestScore, aseq1, aseq2
def dict_to_weight(subs_dict):
"""Converts blosum dict to indexable numpy array.
The score function is the critical op here, we do not need a dict lookup in
the innermost loop.
Returns:
26 x 26 numpy array, [0,0] being s('A', 'A')
"""
weight = np.zeros((26,26))
for (r1, r2), w in subs_dict.items():
r1 = ord(r1) - ord('A')
r2 = ord(r2) - ord('A')
weight[r1, r2] = weight[r2, r1] = w
return weight
def vec_align(seq1, seq2, mat=dict_to_weight(Bio.SubsMat.MatrixInfo.blosum62), gap=-8):
"""Proof of concept vector-style implementation of align, written mainly in numpy
arrays operations.
M(i,j) is the score of the best alignement containing exactly i letters of
x and j letters of y.
To allow vector operations, we rewrite M as S(s, k) with s=i+j and
k=i-max(0,s-m)
Hence:
M(i,j) = S(i+j, i-max(0,i+j-m))
S(s, k) = M(k+max(0,s-m), min(s,m)-k)
The relation become:
M(i,j) = max( M(i,j-1)+gap, M(i-1,j)+gap, M(i-1,j-1)+s(i,j) )
S(s,k) = max(
S(s-1,k+1(s>m))+gap,
S(s-1,k-1+1(s>m))+gap,
S(s-2,k-1+1(s>m)+1(s>m+1)) + s(k+max(0,s-m),min(s,m)-k)
)
Note that S(s,k) does not depend anymore on S(k,.), allowing vector operation
Last index of the diag s can be written (useful for simplifications):
min(n,s)-max(0,s-m)
= min(n, m, s, n+m-s)
= min(m,s)-max(0,s-n)
"""
n = len(seq1.seq)
m = len(seq2.seq)
x = np.array(list(ord(c)-ord('A') for c in seq1.seq))
y = np.array(list(ord(c)-ord('A') for c in seq2.seq))
mat = mat.flatten()
S = [ np.zeros(min(s,n,m,n+m-s)+1, dtype=np.int) for s in range(n+m+1) ]
S[0][0] = 0
S[1][0] = gap
S[1][1] = gap
for s in range(2,n+m+1):
# First/last cell must be treated differently when first line/col of M
if(s <= m):
S[s][0] = S[s-1][0] + gap
if(s <= n):
S[s][s-max(0,s-m)] = S[s-1][s-1-max(0,s-1-m)] + gap
# The offsets in the formula become begin/end of slices.
S[s][int(s<=m):len(S[s])-int(s<=n)] = np.maximum(np.maximum(
S[s-1][1:] + gap,
S[s-1][:-1] + gap),
S[s-2][int(s>m+1):len(S[s-2])-int(s>n+1)]
+mat[
x[max(0,s-m-1):min(n,s-1)] * 26 +
y[max(0,s-n-1):min(m,s-1)][::-1]
]
)
# Trace back
score = S[n+m][0]
i,j = n,m
alx, aly = "",""
while i > 0 or j > 0:
if j > 0:
prev_score = S[i+j-1][i-max(0, i+j-m-1)]
if prev_score + gap == score:
score = prev_score
j -= 1
aly += seq2[j]
alx += '-'
continue
if i > 0:
prev_score = S[i+j-1][i-max(0, i+j-m-1)-1]
if prev_score + gap == score:
score = prev_score
i -= 1
alx += seq1[i]
aly += '-'
continue
score = S[i+j-2][i-max(0, i+j-m-2)-1]
i -= 1
j -= 1
alx += seq1[i]
aly += seq2[j]
aseq1 = SeqRecord(seq1)
aseq1.seq = Seq(alx[::-1])
aseq2 = SeqRecord(seq2)
aseq2.seq = Seq(aly[::-1])
return S[n+m][0], aseq1, aseq2
def align2steps(seq1, seq2, d=8, e=4): def align2steps(seq1, seq2, d=8, e=4):
"""Main implementation of align, handling also gap extensions.
"""
x = seq1.seq x = seq1.seq
y = seq2.seq y = seq2.seq
n=len(x) n=len(x)
...@@ -228,6 +337,8 @@ def align2steps(seq1, seq2, d=8, e=4): ...@@ -228,6 +337,8 @@ def align2steps(seq1, seq2, d=8, e=4):
return bestScore, aseq1, aseq2 return bestScore, aseq1, aseq2
def test(): def test():
"""Obsolete test function, to be moved to unit tests in a separate file.
"""
# print(align("chat", "cat")) # print(align("chat", "cat"))
# print(align("chat", "cgat")) # print(align("chat", "cgat"))
# print(align("chat", "at")) # print(align("chat", "at"))
......
...@@ -8,6 +8,7 @@ import zipfile ...@@ -8,6 +8,7 @@ import zipfile
import Bio.SeqIO import Bio.SeqIO
import Bio.pairwise2 import Bio.pairwise2
from Bio.SeqRecord import SeqRecord from Bio.SeqRecord import SeqRecord
from Bio.SubsMat.MatrixInfo import blosum62
class AlignmentSeqTestCase(unittest.TestCase): class AlignmentSeqTestCase(unittest.TestCase):
def setUp(self): def setUp(self):
...@@ -28,9 +29,10 @@ class AlignmentSeqTestCase(unittest.TestCase): ...@@ -28,9 +29,10 @@ class AlignmentSeqTestCase(unittest.TestCase):
with zipfile.ZipFile(balibase_zippath) as balibase_zip: with zipfile.ZipFile(balibase_zippath) as balibase_zip:
balibase_zip.extractall() balibase_zip.extractall()
# Generator function to iterate over the first two sequences of each
# unaligned fasta file.
def get_dataset_heads(self): def get_dataset_heads(self):
"""Generator function to iterate over the first two sequences of each
unaligned fasta file.
"""
dataset_dir = os.path.join(self.balibase_path, "RV11.unaligned") dataset_dir = os.path.join(self.balibase_path, "RV11.unaligned")
for filename in os.listdir(dataset_dir): for filename in os.listdir(dataset_dir):
records = Bio.SeqIO.parse( records = Bio.SeqIO.parse(
...@@ -40,24 +42,26 @@ class AlignmentSeqTestCase(unittest.TestCase): ...@@ -40,24 +42,26 @@ class AlignmentSeqTestCase(unittest.TestCase):
seq2 = next(records) seq2 = next(records)
yield seq1, seq2, filename yield seq1, seq2, filename
# Strip strings of their '-' before comparing them
def assertSameResidues(self, str1, str2): def assertSameResidues(self, str1, str2):
"""Strip strings of their '-' before comparing them
"""
return self.assertEqual( return self.assertEqual(
str(str1).translate({ord('-'):None}), str(str1).translate({ord('-'):None}),
str(str2).translate({ord('-'):None}) str(str2).translate({ord('-'):None})
) )
# Test alignments with the simplest metric. As there can be a huge number of
# solutions, we check only that we got the right score, and one valid
# alignment.
def test_simple_align(self): def test_simple_align(self):
"""Test alignments with the simplest metric. As there can be a huge number of
solutions, we check only that we got the right score, and one valid
alignment.
"""
from alignementseq import align from alignementseq import align
score_fn = lambda a,b : -2 if a == '' or b == '' else -1 if a != b else 1 score_fn = lambda a,b : -2 if a == '' or b == '' else -1 if a != b else 1
for s1, s2, filename in self.get_dataset_heads(): for s1, s2, filename in self.get_dataset_heads():
score, r1, r2 = align(s1, s2, score_fn) score, r1, r2 = align(s1, s2, score_fn)
(ex_r1, ex_r2, exp_score, *_) = Bio.pairwise2.align.globalms( (ex_r1, ex_r2, exp_score, *_) = Bio.pairwise2.align.globalms(
s1.seq, s2.seq, 1, -1, -2, -2)[0] s1.seq, s2.seq, 1, -1, -2, -2, one_alignment_only=True)[0]
try: try:
self.assertSameResidues(r1.seq, s1.seq) self.assertSameResidues(r1.seq, s1.seq)
...@@ -78,18 +82,30 @@ class AlignmentSeqTestCase(unittest.TestCase): ...@@ -78,18 +82,30 @@ class AlignmentSeqTestCase(unittest.TestCase):
raise raise
def test_blosum_align(self): def test_blosum_align(self):
from alignementseq import align from alignementseq import align, vec_align
cases = [ for s1, s2, filename in self.get_dataset_heads():
("CHAT", "CAT", 10, "CHAT", "C-AT"), (ex_r1, ex_r2, exp_score, *_) = Bio.pairwise2.align.globalds(
("CHAT", "CGAT", 16, "CHAT", "CGAT"), s1.seq, s2.seq, blosum62, -8, -8, one_alignment_only=True)[0]
("CHAT", "AT", -7, "CHAT", "--AT") for method in align, vec_align:
] score, r1, r2 = method(s1, s2)
for s1, s2, exp_score, exp_r1, exp_r2 in cases: try:
score, r1, r2 = align(SeqRecord(s1), SeqRecord(s2)) self.assertSameResidues(r1.seq, s1.seq)
self.assertEqual(score, exp_score) self.assertSameResidues(r2.seq, s2.seq)
self.assertEqual(r1.seq, exp_r1) self.assertEqual(score, sum(
self.assertEqual(r2.seq, exp_r2) -8 if a == '-' or b == '-' else
blosum62[a,b] if (a,b) in blosum62 else
blosum62[b,a]
for a,b in zip(r1.seq, r2.seq)))
self.assertEqual(score, exp_score)
except AssertionError:
print('from', filename, "with", method, ":")
print(r1.seq)
print(r2.seq)
print('')
print(ex_r1)
print(ex_r2)
raise
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment