Newer
Older
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jan 15 15:02:16 2019
@author: ariane.delrocq
"""
import numpy as np
import Bio.SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
import Bio.pairwise2
#seqs=[]
#handle = open("balibase/RV11.unaligned/BBS11001.fasta")
#for seq in Bio.SeqIO.parse(handle, "fasta"):
# seqs.append(seq)
#handle.close()
def score(a, b, gap=-8):
"""Default scoring function for align
"""
if not a or not b:
return gap
else:
a = a.upper()
b = b.upper()
try:
return mat[a,b]
except KeyError:
return mat[b,a]
def align(seq1, seq2, score=score):
"""Main implementation of align, with blosum scoring and no specific gap
extension penalties.
Args:
seq1: A SeqRecord object
seq2: Same.
"""
mm = [[0 for _ in range(m+1)] for _ in range(n+1)] # scores matrix
path = [[0 for _ in range(m+1)] for _ in range(n+1)] # last step for optimal score
# first line and col
for i in range(1, n+1):
mm[i][0] = mm[i-1][0] + score('', x[i-1])
mm[0][j] = mm[0][j-1] + score('', y[j-1])
path[0][j] = (0, j-1)
# fill table
for i in range(1, n+1):
for j in range(1, m+1):
a = x[i-1]
b = y[j-1]
s1 = mm[i-1][j-1] + score(a,b)
s2 = mm[i-1][j] + score('', a)
s3 = mm[i][j-1] + score('', b)
bestScore = mm[n][m]
alx = ""
aly = ""
i, j = n, m
while i > 0 and j > 0:
i2, j2 = path[i][j]
if i==i2:
alx+='-'
else:
alx+=x[i-1]
if j==j2:
aly+='-'
else:
aly+=y[j-1]
i,j = i2, j2
while i>0:
aly+='-'
alx+=x[i-1]
i -=1
while j>0:
alx+='-'
aly+=y[j-1]
j -=1
aseq1 = SeqRecord(seq1)
aseq1.seq = Seq(alx[::-1])
aseq2 = SeqRecord(seq2)
aseq2.seq = Seq(aly[::-1])
return bestScore, aseq1, aseq2
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
def dict_to_weight(subs_dict):
"""Converts blosum dict to indexable numpy array.
The score function is the critical op here, we do not need a dict lookup in
the innermost loop.
Returns:
26 x 26 numpy array, [0,0] being s('A', 'A')
"""
weight = np.zeros((26,26))
for (r1, r2), w in subs_dict.items():
r1 = ord(r1) - ord('A')
r2 = ord(r2) - ord('A')
weight[r1, r2] = weight[r2, r1] = w
return weight
def vec_align(seq1, seq2, mat=dict_to_weight(Bio.SubsMat.MatrixInfo.blosum62), gap=-8):
"""Proof of concept vector-style implementation of align, written mainly in numpy
arrays operations.
M(i,j) is the score of the best alignement containing exactly i letters of
x and j letters of y.
To allow vector operations, we rewrite M as S(s, k) with s=i+j and
k=i-max(0,s-m)
Hence:
M(i,j) = S(i+j, i-max(0,i+j-m))
S(s, k) = M(k+max(0,s-m), min(s,m)-k)
The relation become:
M(i,j) = max( M(i,j-1)+gap, M(i-1,j)+gap, M(i-1,j-1)+s(i,j) )
S(s,k) = max(
S(s-1,k+1(s>m))+gap,
S(s-1,k-1+1(s>m))+gap,
S(s-2,k-1+1(s>m)+1(s>m+1)) + s(k+max(0,s-m),min(s,m)-k)
)
Note that S(s,k) does not depend anymore on S(k,.), allowing vector operation
Last index of the diag s can be written (useful for simplifications):
min(n,s)-max(0,s-m)
= min(n, m, s, n+m-s)
= min(m,s)-max(0,s-n)
"""
n = len(seq1.seq)
m = len(seq2.seq)
x = np.array(list(ord(c)-ord('A') for c in seq1.seq))
y = np.array(list(ord(c)-ord('A') for c in seq2.seq))
mat = mat.flatten()
S = [ np.zeros(min(s,n,m,n+m-s)+1, dtype=np.int32) for s in range(n+m+1) ]
S[0][0] = 0
S[1][0] = gap
S[1][1] = gap
for s in range(2,n+m+1):
# First/last cell must be treated differently when first line/col of M
if(s <= m):
S[s][0] = S[s-1][0] + gap
if(s <= n):
S[s][s-max(0,s-m)] = S[s-1][s-1-max(0,s-1-m)] + gap
# The offsets in the formula become begin/end of slices.
S[s][int(s<=m):len(S[s])-int(s<=n)] = np.maximum(np.maximum(
S[s-2][int(s>m+1):len(S[s-2])-int(s>n+1)]
+mat[
x[max(0,s-m-1):min(n,s-1)] + y[max(0,s-n-1):min(m,s-1)][::-1]
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
]
)
# Trace back
score = S[n+m][0]
i,j = n,m
alx, aly = "",""
while i > 0 or j > 0:
if j > 0:
prev_score = S[i+j-1][i-max(0, i+j-m-1)]
if prev_score + gap == score:
score = prev_score
j -= 1
aly += seq2[j]
alx += '-'
continue
if i > 0:
prev_score = S[i+j-1][i-max(0, i+j-m-1)-1]
if prev_score + gap == score:
score = prev_score
i -= 1
alx += seq1[i]
aly += '-'
continue
score = S[i+j-2][i-max(0, i+j-m-2)-1]
i -= 1
j -= 1
alx += seq1[i]
aly += seq2[j]
aseq1 = SeqRecord(seq1)
aseq1.seq = Seq(alx[::-1])
aseq2 = SeqRecord(seq2)
aseq2.seq = Seq(aly[::-1])
return S[n+m][0], aseq1, aseq2
def align_dihedrals(seq1, seq2, struct1, struct2):
"""
Align two sequences, by using a simple cost function using dihedral angles
as well as amino acids names.
Arguments:
seq1: First sequence as SeqRecord
seq2:
struct1: First structure as Bio.PDB object
struct2:
"""
print("Not implemented")
"""Main implementation of align, handling also gap extensions.
"""
x = seq1.seq
y = seq2.seq
n=len(x)
m=len(y)
mm = [[0 for _ in range(m+1)] for _ in range(n+1)] # scores matrix
mx = [[0 for _ in range(m+1)] for _ in range(n+1)]
my = [[0 for _ in range(m+1)] for _ in range(n+1)]
pathm = [[0 for _ in range(m+1)] for _ in range(n+1)] # last step for optimal score
pathx = [[0 for _ in range(m+1)] for _ in range(n+1)]
pathy = [[0 for _ in range(m+1)] for _ in range(n+1)]
lower_bound = - (n+m) * max(e,d)
# first line and col
for i in range(1, n+1):
mx[i][0] = -d -(i-1)*e
pathx[i][0] = pathx
# Best alignment ending with no gap but with 0 letters of one seq does
# not exist... but is still used in the formulas below
mm[i][0] = lower_bound
# Same for best alignment with 0 letter of Y ending on a gap on the X
# side with a letter of Y
my[i][0] = lower_bound
my[0][j] = -d - (j-1)*e
pathy[0][j] = pathy
mm[0][j] = lower_bound
mx[0][j] = lower_bound
for i in range(1, n+1):
for j in range(1, m+1):
a = x[i-1]
b = y[j-1]
# find max for M
s1 = mm[i-1][j-1] + score(a,b)
s2 = mx[i-1][j-1] + score(a,b)
if s1 >= s2:
if s1 >= s3:
mm[i][j] = s1
else: # s2 > s1
if s2 >= s3:
mm[i][j] = s2
# find max for I_x
s4 = mm[i-1][j] - d
s5 = mx[i-1][j] - e
if s4 >= s5:
mx[i][j] = s4
# find max for I_y
s6 = mm[i][j-1] - d
s7 = my[i][j-1] - e
if s6 >= s7:
my[i][j] = s6
pathy[i][j] = pathy
bestScore, bestPath = max(
[(mm,pathm),(mx,pathx),(my,pathy)],
key=(lambda t:t[0][n][m]))
bestScore = bestScore[n][m]
prev_path = path[i][j]
if path == pathm:
i -= 1
j -= 1
alx += x[i]
aly += y[j]
elif path == pathy:
j -= 1
alx += '-'
aly += y[j]
i -= 1
alx += x[i]
aly += '-'
path = prev_path
while i>0:
aly+='-'
alx+=x[i-1]
i -=1
while j>0:
alx+='-'
aly+=y[j-1]
j -=1
alx = alx[::-1]
aly = aly[::-1]
aseq1 = SeqRecord(seq1)
aseq1.seq = Seq(alx)
aseq2 = SeqRecord(seq2)
aseq2.seq = Seq(aly)
return bestScore, aseq1, aseq2
"""Obsolete test function, to be moved to unit tests in a separate file.
"""
# print(align("chat", "cat"))
# print(align("chat", "cgat"))
# print(align("chat", "at"))
s, x, y = align2steps(seqs[0], seqs[1])
print(s)
print(x.seq)
print(y.seq)
with open("balibase/us/1.fasta", "w") as fd:
Bio.SeqIO.write((x, y), fd, "fasta")