Skip to content
Snippets Groups Projects
Verified Commit 60dc8f50 authored by Etienne MORICE's avatar Etienne MORICE
Browse files

Benchmark and switch to traceback matrix for vec_align for #10

parent b2053a14
No related branches found
No related tags found
No related merge requests found
......@@ -232,6 +232,111 @@ def vec_align(seq1, seq2, mat=dict_to_weight(Bio.SubsMat.MatrixInfo.blosum62), g
aseq2.seq = Seq(aly[::-1])
return S[n+m][0], aseq1, aseq2
def vec_align2(seq1, seq2, mat=dict_to_weight(Bio.SubsMat.MatrixInfo.blosum62), gap=-8):
"""Proof of concept vector-style implementation of align, written mainly in numpy
arrays operations.
M(i,j) is the score of the best alignement containing exactly i letters of
x and j letters of y.
To allow vector operations, we rewrite M as S(s, k) with s=i+j and
k=i-max(0,s-m)
Hence:
M(i,j) = S(i+j, i-max(0,i+j-m))
S(s, k) = M(k+max(0,s-m), min(s,m)-k)
The relation become:
M(i,j) = max( M(i,j-1)+gap, M(i-1,j)+gap, M(i-1,j-1)+s(i,j) )
S(s,k) = max(
S(s-1,k+1(s>m))+gap,
S(s-1,k-1+1(s>m))+gap,
S(s-2,k-1+1(s>m)+1(s>m+1)) + s(k+max(0,s-m),min(s,m)-k)
)
Note that S(s,k) does not depend anymore on S(k,.), allowing vector operation
Last index of the diag s can be written (useful for simplifications):
min(n,s)-max(0,s-m)
= min(n, m, s, n+m-s)
= min(m,s)-max(0,s-n)
"""
n = len(seq1.seq)
m = len(seq2.seq)
x = np.array(list(ord(c)-ord('A') for c in seq1.seq))
y = np.array(list(ord(c)-ord('A') for c in seq2.seq))
mat = mat.flatten()
# Traceback matrix, bit 0 = gap, bit 1 = gap on y side
T = [ np.empty(min(s,n,m,n+m-s)+1, dtype=np.byte) for s in range(n+m+1) ]
# Scores, we just have to remember three lines
S = tuple( np.empty(min(n,m)+1, dtype=np.int32) for s in range(3) )
S[0][0] = 0
S[1][0] = gap
S[1][1] = gap
# T[0][0] never used
T[1][0] = 1 | 0
T[1][1] = 1 | 2
x *= 26
for s in range(2,n+m+1):
# First/last cell must be treated differently when first line/col of M
if(s <= m):
S[2][0] = S[1][0] + gap
T[s][0] = 1 | 0
if(s <= n):
S[2][s-max(0,s-m)] = S[1][s-1-max(0,s-1-m)] + gap
T[s][s-max(0,s-m)] = 1 | 2
# The offsets in the formula become begin/end of slices.
gapped = S[1] + gap
max_gap = np.maximum(
gapped[1:len(T[s-1])],
gapped[:len(T[s-1])-1])
T[s][int(s<=m):len(T[s])-int(s<=n)] = np.equal(
max_gap,
gapped[:len(T[s-1])-1]
) << 1
S[2][int(s<=m):len(T[s])-int(s<=n)] = np.maximum(max_gap,
S[0][int(s>m+1):len(T[s-2])-int(s>n+1)]
+mat[
x[max(0,s-m-1):min(n,s-1)] + y[max(0,s-n-1):min(m,s-1)][::-1]
]
)
T[s][int(s<=m):len(T[s])-int(s<=n)] |= np.equal(
max_gap,
S[2][int(s<=m):len(T[s])-int(s<=n)]
)
S = (S[1], S[2], S[0])
# Trace back
i,j = n,m
alx, aly = "",""
while i > 0 or j > 0:
is_gap = T[i+j][i-max(0,i+j-m)] & 1
if is_gap:
is_y_gap = T[i+j][i-max(0,i+j-m)] & 2
if is_y_gap:
i -= 1
alx += seq1[i]
aly += '-'
else:
j -= 1
aly += seq2[j]
alx += '-'
else:
i -= 1
j -= 1
alx += seq1[i]
aly += seq2[j]
aseq1 = SeqRecord(seq1)
aseq1.seq = Seq(alx[::-1])
aseq2 = SeqRecord(seq2)
aseq2.seq = Seq(aly[::-1])
return S[1][0], aseq1, aseq2
def align_dihedrals(seq1, seq2, angles1, angles2):
"""
Align two sequences, by using a simple cost function using dihedral angles
......
from alignementseq import score
from alignementseq import score, vec_align
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
import tree_generator
def multiple_align(sequences):
"""Top-level multiple alignment routine
Arguments:
sequences: lists of SeqRecord objects
Returns:
List of aligned SeqRecord objects, with '-' where needed.
"""
tree = tree_generator.tree_build(sequences, vec_align)
return multiple_align_from_linkage_matrix(sequences, tree)
def multiple_align_from_tree(tree):
"""Multiple alignmnent from a SeqRecord tree
Arguments:
tree: nested structure of python tuples, with SeqRecord objects as leafs
"""
......@@ -27,7 +42,7 @@ def multiple_align_from_tree(tree):
def multiple_align_from_linkage_matrix(sequences, tree):
"""Multiple alignmnent from a SeqRecord tree
Arguments:
sequences: indexable container of SeqRecord objects to align
tree: scipy.hierarchy linkage matrix
......@@ -43,7 +58,7 @@ def multiple_align_from_linkage_matrix(sequences, tree):
inode = int(inode)
if inode < n:
return [sequences[inode]]
ileft, iright, dist, size = tree[inode - n]
ileft, iright, _, _ = tree[inode - n]
aleft = _join(ileft)
aright = _join(iright)
return join_alignments(aleft, aright)
......@@ -58,12 +73,12 @@ def _score(a, b, base_score=score):
if b == '-':
return 0
else:
return score('', b)
return base_score('', b)
else:
if b == '-':
return score('', a)
return base_score('', a)
else:
return score(a, b)
return base_score(a, b)
def _mscore(seqs1, seqs2, i, j, _score=_score):
......@@ -135,9 +150,9 @@ def join_alignments(aleft, aright, _mscore=_mscore, gap=-8):
for jseq in range(len(aright)):
aseqs[len(aleft)+jseq] += aright[jseq][nextj]
i,j = nexti,nextj
ret = [ SeqRecord(id=s.id, seq="") for s in aleft ] + [ SeqRecord(id=s.id, seq="") for s in aright ]
ret = [ SeqRecord(id=s.id, seq="", description=s.description)
for alignment in (aleft, aright) for s in alignment ]
for rec, seq in zip(ret, aseqs):
rec.seq = Seq(seq[::-1])
return ret
......@@ -176,8 +176,7 @@ class BalibaseTestCase(unittest.TestCase):
yield records, filename
class AlignmentSeqTestCase(BalibaseTestCase):
"""
TestCase for alignment correctness and quality
""" TestCase for alignment correctness and quality
"""
def assert_same_residues(self, str1, str2):
......@@ -226,13 +225,13 @@ class AlignmentSeqTestCase(BalibaseTestCase):
"""Tests alignments with blosum but no gap extension.
"""
from alignementseq import align, vec_align
from alignementseq import align, vec_align, vec_align2
aligner = Bio.pairwise2.align
for seq1, seq2, filename in self.get_dataset_heads():
(ex_r1, ex_r2, exp_score, *_) = aligner.globalds(
seq1.seq, seq2.seq, blosum62, -8, -8, one_alignment_only=True)[0]
for method in align, vec_align:
for method in align, vec_align, vec_align2:
score, ret1, ret2 = method(seq1, seq2)
try:
self.assert_same_residues(ret1.seq, seq1.seq)
......@@ -328,37 +327,58 @@ class AlignmentSeqTestCase(BalibaseTestCase):
for j in jobs:
j.join()
def create_alignments_with(self, method, method_name=None):
"""
Create fasta files with aligned sequences using the given method, under
balibase_path/<method_name>
"""
dataset_dir = os.path.join(self.balibase_path, "RV11.unaligned")
out_dir = os.path.join(self.balibase_path, method_name or method.__name__)
for in_filename in os.listdir(dataset_dir):
sequences = list(Bio.SeqIO.parse(in_filename, "fasta"))
def test_benchmark_multiple_align(self):
"""Tests the multiple_align function (using blosum and gap extension)."""
"""Perform multiple alignments with different methods and evaluate the
results with bali_score"""
res_dir = "balibase_results"
if not os.path.isdir(res_dir):
os.mkdir(res_dir)
# Cached
# Define methods to test
from alignementseq_multiple import multiple_align
methods_test = [ multiple_align ]
methods_test_names = [ m.__name__ for m in methods_test ]
methods_eval_names = methods_test_names + ["clustalo"]
# Create clustalo alignment separately, and cache them
self.create_clustalo_alignments()
# Align and eval with every other method
## Directories
dataset_dir = os.path.join(self.balibase_path, "RV11.unaligned")
ref_dataset_dir = os.path.join(self.balibase_path, "RV11.aligned")
clustalo_dir = os.path.join(self.balibase_path, "clustalo")
for name in methods_test_names:
out_dir = os.path.join(self.balibase_path, name)
if not os.path.isdir(out_dir):
os.mkdir(out_dir)
scores = []
for filename in os.listdir(ref_dataset_dir):
scores.append(self.bali_score(
os.path.join(ref_dataset_dir, filename),
os.path.join(clustalo_dir, filename)
))
for in_filename in os.listdir(dataset_dir):
# Read input
sequences = list(Bio.SeqIO.parse(
os.path.join(dataset_dir, in_filename),
"fasta"
))
# Create output for all but clustalo
for method, name in zip(methods_test, methods_test_names):
out_dir = os.path.join(self.balibase_path, name)
with open(os.path.join(out_dir, in_filename), "w") as out_fd:
aseqs = method(sequences)
Bio.SeqIO.write(aseqs, out_fd, "fasta")
for name in methods_eval_names:
out_dir = os.path.join(self.balibase_path, name)
scores.append(self.bali_score(
os.path.join(ref_dataset_dir, in_filename),
os.path.join(out_dir, in_filename)
))
scores[-1]['method'] = name
scores = pd.DataFrame(scores)
scores.to_csv(os.path.join(res_dir, "clustalo.csv"))
......@@ -439,7 +459,7 @@ class PerformanceTestCase(BalibaseTestCase):
def test_performance(self):
"""Create timing graph comparing speed of simple alignment methods
"""
from alignementseq import align, vec_align
from alignementseq import align, vec_align, vec_align2
aligner = Bio.pairwise2.align
def biopython_align(seq1, seq2):
return aligner.globalds(
......@@ -447,7 +467,7 @@ class PerformanceTestCase(BalibaseTestCase):
one_alignment_only=True)[0]
methods = (vec_align, align, biopython_align)
methods = (vec_align, align, biopython_align, vec_align2)
times = pd.DataFrame(columns=("length1", "length2", "method", "time"),
dtype=float)
for seq1, seq2, filename in self.get_dataset_heads():
......
......@@ -8,10 +8,10 @@ Created on Tue Jan 29 14:59:19 2019
import numpy as np
from Bio.SeqRecord import SeqRecord
from Bio.Cluster import treecluster
import alignementseq
from scipy.cluster.hierarchy import average, dendrogram
#import alignementseq
from scipy.cluster.hierarchy import average #, dendrogram
seqs = [SeqRecord('TAACCCCAAAAGAACCAGA'), SeqRecord('TTCTGTAGTAGAGATGGAATTAAGAAAAAACCATCAACTATAACCCCAAAAGAACCAGA'), SeqRecord('TTCTGTAGTAGAGATGGAATTAAGAAAAAACCATCAACTATAACCCCAAAAGAACCAGA'),SeqRecord('GTAGTAGAGATGGAATTAAGAAAAAACCATCAACTATAACCCCAAGAGAACCAGA'), SeqRecord('GAGCCGGATGAGAAGAAACTCTCATGTCCGGTTCTGTAGTAGAGATGGAATTAAGAAAAAACCATCAACTATAACCCCAAGAGAACCAGA'), SeqRecord('TTTTCATTCGCGAGGAGCCGGATGAGAAGAAACTCTCATGTCCGGTTCTGTAGTAGAGATGGAATTAAGAAAAAACCATCAACTATAACCCCAAGAGAACCAGA')]
_test_seqs = [SeqRecord('TAACCCCAAAAGAACCAGA'), SeqRecord('TTCTGTAGTAGAGATGGAATTAAGAAAAAACCATCAACTATAACCCCAAAAGAACCAGA'), SeqRecord('TTCTGTAGTAGAGATGGAATTAAGAAAAAACCATCAACTATAACCCCAAAAGAACCAGA'),SeqRecord('GTAGTAGAGATGGAATTAAGAAAAAACCATCAACTATAACCCCAAGAGAACCAGA'), SeqRecord('GAGCCGGATGAGAAGAAACTCTCATGTCCGGTTCTGTAGTAGAGATGGAATTAAGAAAAAACCATCAACTATAACCCCAAGAGAACCAGA'), SeqRecord('TTTTCATTCGCGAGGAGCCGGATGAGAAGAAACTCTCATGTCCGGTTCTGTAGTAGAGATGGAATTAAGAAAAAACCATCAACTATAACCCCAAGAGAACCAGA')]
#Arguments : une liste de séquences SeqRecord, une fonction d'alignement
score_gap = -8
......@@ -20,14 +20,14 @@ def tree_build(seqs, align):
M = np.zeros((len(seqs),len(seqs)))
for i in range (len(seqs)):
for j in range (i):
M[i][j], a, b = align(seqs[i], seqs[j])
M[i][j], _, _ = align(seqs[i], seqs[j])
M[i][j] = 1-abs(M[i][j]/(max(len_seqs)*(11-score_gap)))
print(type(M), M)
# print(type(M), M)
# tree1 = treecluster(None, method = 'a', dist = 'e', distancematrix=M)
tree= average(M)
return tree
#tree = tree_build(seqs, alignementseq.align2steps)
#tree = tree_build(_test_seqs, alignementseq.align2steps)
#print(tree)
#dn = dendrogram(tree)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment