Newer
Older
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jan 15 15:02:16 2019
@author: ariane.delrocq
"""
import Bio.SubsMat.MatrixInfo
import Bio.SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
seqs=[]
handle = open("balibase/RV11.unaligned/BBS11001.fasta")
for seq in Bio.SeqIO.parse(handle, "fasta"):
seqs.append(seq)
handle.close()
mat = Bio.SubsMat.MatrixInfo.blosum62
def score(a, b, gap=-8, id=1, mut=-1):
if not a or not b:
return gap
else:
a = a.upper()
b = b.upper()
try:
return mat[a,b]
except KeyError:
return mat[b,a]
# elif a==b:
# return id
# else:
# return mut
def align(seq1, seq2):
x = seq1.seq
y = seq2.seq
mm = [[0 for _ in range(m+1)] for _ in range(n+1)] # scores matrix
path = [[0 for _ in range(m+1)] for _ in range(n+1)] # last step for optimal score
# first line and col
for i in range(1, n+1):
mm[i][0] = mm[i-1][0] + score('', x[i-1])
mm[0][j] = mm[j-1][0] + score('', y[j-1])
path[0][j] = (0, j-1)
# fill table
for i in range(1, n+1):
for j in range(1, m+1):
a = x[i-1]
b = y[j-1]
s1 = mm[i-1][j-1] + score(a,b)
s2 = mm[i-1][j] + score('', a)
s3 = mm[i][j-1] + score('', b)
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
bestScore = mm[n][m]
alx = ""
aly = ""
i, j = n, m
while i > 0 and j > 0:
i2, j2 = path[i][j]
if i==i2:
alx+='-'
else:
alx+=x[i-1]
if j==j2:
aly+='-'
else:
aly+=y[j-1]
i,j = i2, j2
while i>0:
aly+='-'
alx+=x[i-1]
i -=1
while j>0:
alx+='-'
aly+=y[j-1]
j -=1
alx = alx[::-1]
aly = aly[::-1]
seq1.seq = Seq(alx)
seq2.seq = Seq(aly)
return bestScore, seq1, seq2
def align2steps(seq1, seq2, d=8, e=4):
x = seq1.seq
y = seq2.seq
n=len(x)
m=len(y)
mm = [[0 for _ in range(m+1)] for _ in range(n+1)] # scores matrix
mx = [[0 for _ in range(m+1)] for _ in range(n+1)]
my = [[0 for _ in range(m+1)] for _ in range(n+1)]
path = [[0 for _ in range(m+1)] for _ in range(n+1)] # last step for optimal score
# first line and col
for i in range(1, n+1):
mx[i][1] = my[i][1] = -d -(i-1)*e
path[i][0] = (i-1, 0)
for j in range(1, m+1):
mx[1][j] = my[1][i] = -d - (j-1)*e
path[0][j] = (0, j-1)
# fill table
for i in range(2, n+1):
for j in range(2, m+1):
a = x[i-1]
b = y[j-1]
# find max for M
s1 = mm[i-1][j-1] + score(a,b)
s2 = mx[i-1][j-1] + score(a,b)
s3 = my[i][j-1] + score(a,b)
if s1 >= s2:
if s1 >= s3:
mm[i][j] = s1
path[i][j] = (i-1, j-1)
else:
mm[i][j] = s3
path[i][j] = (i, j-1)
else: # s2 > s1
if s2 >= s3:
mm[i][j] = s2
path[i][j] = (i-1, j)
else:
mm[i][j] = s3
path[i][j] = (i, j-1)
# find max for I_x
s4 = mm[i-1][j] - d
s5 = mx[i-1][j] - e
if s4 >= s5:
mx[i][j] = s4
else:
mx[i][j] = s5
# find max for I_y
s6 = mm[i][j-1] - d
s7 = my[i][j-1] - e
if s6 >= s7:
my[i][j] = s6
else:
my[i][j] = s7
bestScore = mm[n][m]
alx = ""
aly = ""
i, j = n, m
while i > 0 and j > 0:
i2, j2 = path[i][j]
if i==i2:
alx+='-'
else:
alx+=x[i-1]
if j==j2:
aly+='-'
else:
aly+=y[j-1]
i,j = i2, j2
while i>0:
aly+='-'
alx+=x[i-1]
i -=1
while j>0:
alx+='-'
aly+=y[j-1]
j -=1
alx = alx[::-1]
aly = aly[::-1]
seq1.seq = Seq(alx)
seq2.seq = Seq(aly)
return bestScore, seq1, seq2
def test():
# print(align("chat", "cat"))
# print(align("chat", "cgat"))
# print(align("chat", "at"))
s, x, y = align(seqs[0], seqs[1])
print(s,x,y)
with open("balibase/us/1.fasta", "w") as fd:
Bio.SeqIO.write((x, y), fd, "fasta")
test()