#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Created on Tue Jan 15 15:02:16 2019 @author: ariane.delrocq """ import Bio.SubsMat.MatrixInfo import Bio.SeqIO from Bio.SeqRecord import SeqRecord from Bio.Seq import Seq seqs=[] handle = open("balibase/RV11.unaligned/BBS11001.fasta") for seq in Bio.SeqIO.parse(handle, "fasta"): seqs.append(seq) handle.close() mat = Bio.SubsMat.MatrixInfo.blosum62 def score(a, b, gap=-8, id=1, mut=-1): if not a or not b: return gap else: a = a.upper() b = b.upper() try: return mat[a,b] except KeyError: return mat[b,a] # elif a==b: # return id # else: # return mut def align(seq1, seq2): x = seq1.seq y = seq2.seq n=len(x) m=len(y) mm = [[0 for _ in range(m+1)] for _ in range(n+1)] # scores matrix path = [[0 for _ in range(m+1)] for _ in range(n+1)] # last step for optimal score # first line and col for i in range(1, n+1): mm[i][0] = mm[i-1][0] + score('', x[i-1]) path[i][0] = (i-1, 0) for j in range(1, m+1): mm[0][j] = mm[j-1][0] + score('', y[j-1]) path[0][j] = (0, j-1) # fill table for i in range(1, n+1): for j in range(1, m+1): a = x[i-1] b = y[j-1] s1 = mm[i-1][j-1] + score(a,b) s2 = mm[i-1][j] + score('', a) s3 = mm[i][j-1] + score('', b) if s1 >= s2: if s1 >= s3: mm[i][j] = s1 path[i][j] = (i-1, j-1) else: mm[i][j] = s3 path[i][j] = (i, j-1) else: # s2 > s1 if s2 >= s3: mm[i][j] = s2 path[i][j] = (i-1, j) else: mm[i][j] = s3 path[i][j] = (i, j-1) bestScore = mm[n][m] alx = "" aly = "" i, j = n, m while i > 0 and j > 0: i2, j2 = path[i][j] if i==i2: alx+='-' else: alx+=x[i-1] if j==j2: aly+='-' else: aly+=y[j-1] i,j = i2, j2 while i>0: aly+='-' alx+=x[i-1] i -=1 while j>0: alx+='-' aly+=y[j-1] j -=1 alx = alx[::-1] aly = aly[::-1] seq1.seq = Seq(alx) seq2.seq = Seq(aly) return bestScore, seq1, seq2 def align2steps(seq1, seq2, d=8, e=4): x = seq1.seq y = seq2.seq n=len(x) m=len(y) mm = [[0 for _ in range(m+1)] for _ in range(n+1)] # scores matrix mx = [[0 for _ in range(m+1)] for _ in range(n+1)] my = [[0 for _ in range(m+1)] for _ in range(n+1)] path = [[0 for _ in range(m+1)] for _ in range(n+1)] # last step for optimal score # first line and col for i in range(1, n+1): mx[i][1] = my[i][1] = -d -(i-1)*e path[i][0] = (i-1, 0) for j in range(1, m+1): mx[1][j] = my[1][i] = -d - (j-1)*e path[0][j] = (0, j-1) # fill table for i in range(2, n+1): for j in range(2, m+1): a = x[i-1] b = y[j-1] # find max for M s1 = mm[i-1][j-1] + score(a,b) s2 = mx[i-1][j-1] + score(a,b) s3 = my[i][j-1] + score(a,b) if s1 >= s2: if s1 >= s3: mm[i][j] = s1 path[i][j] = (i-1, j-1) else: mm[i][j] = s3 path[i][j] = (i, j-1) else: # s2 > s1 if s2 >= s3: mm[i][j] = s2 path[i][j] = (i-1, j) else: mm[i][j] = s3 path[i][j] = (i, j-1) # find max for I_x s4 = mm[i-1][j] - d s5 = mx[i-1][j] - e if s4 >= s5: mx[i][j] = s4 else: mx[i][j] = s5 # find max for I_y s6 = mm[i][j-1] - d s7 = my[i][j-1] - e if s6 >= s7: my[i][j] = s6 else: my[i][j] = s7 bestScore = mm[n][m] alx = "" aly = "" i, j = n, m while i > 0 and j > 0: i2, j2 = path[i][j] if i==i2: alx+='-' else: alx+=x[i-1] if j==j2: aly+='-' else: aly+=y[j-1] i,j = i2, j2 while i>0: aly+='-' alx+=x[i-1] i -=1 while j>0: alx+='-' aly+=y[j-1] j -=1 alx = alx[::-1] aly = aly[::-1] seq1.seq = Seq(alx) seq2.seq = Seq(aly) return bestScore, seq1, seq2 def test(): # print(align("chat", "cat")) # print(align("chat", "cgat")) # print(align("chat", "at")) s, x, y = align(seqs[0], seqs[1]) print(s,x,y) with open("balibase/us/1.fasta", "w") as fd: Bio.SeqIO.write((x, y), fd, "fasta") test()