#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Created on Tue Jan 15 15:02:16 2019 @author: ariane.delrocq """ import numpy as np import Bio.SubsMat.MatrixInfo import Bio.SeqIO from Bio.SeqRecord import SeqRecord from Bio.Seq import Seq import Bio.pairwise2 #seqs=[] #handle = open("balibase/RV11.unaligned/BBS11001.fasta") #for seq in Bio.SeqIO.parse(handle, "fasta"): # seqs.append(seq) #handle.close() mat = Bio.SubsMat.MatrixInfo.blosum62 def score(a, b, gap=-8): """Default scoring function for align """ if not a or not b: return gap else: a = a.upper() b = b.upper() try: return mat[a,b] except KeyError: return mat[b,a] def dihedral_score(angles1, angles2, phi0=1., psi0=1.): """ Tentative dihedral scoring component. """ phi1, psi1 = angles1 phi2, psi2 = angles2 s = 0 if phi1 is not None and phi2 is not None: err = (phi2 - phi1) / phi0 err2 = err*err s -= err2 / (1 + err2) if psi1 is not None and psi2 is not None: err = (psi2 - psi1) / psi0 err2 = err*err s -= err2 / (1 + err2) return s def align(seq1, seq2, score=score): """Main implementation of align, with blosum scoring and no specific gap extension penalties. Args: seq1: A SeqRecord object seq2: Same. Returns: A (score, aseq1, aseq2) tuple, holding alignment score and aligned sequences with '-' inserted. """ x = seq1.seq y = seq2.seq n=len(x) m=len(y) mm = [[0 for _ in range(m+1)] for _ in range(n+1)] # scores matrix path = [[0 for _ in range(m+1)] for _ in range(n+1)] # last step for optimal score # first line and col for i in range(1, n+1): mm[i][0] = mm[i-1][0] + score('', x[i-1]) path[i][0] = (i-1, 0) for j in range(1, m+1): mm[0][j] = mm[0][j-1] + score('', y[j-1]) path[0][j] = (0, j-1) # fill table for i in range(1, n+1): for j in range(1, m+1): a = x[i-1] b = y[j-1] s1 = mm[i-1][j-1] + score(a,b) s2 = mm[i-1][j] + score('', a) s3 = mm[i][j-1] + score('', b) if s1 >= s2: if s1 >= s3: mm[i][j] = s1 path[i][j] = (i-1, j-1) else: mm[i][j] = s3 path[i][j] = (i, j-1) else: # s2 > s1 if s2 >= s3: mm[i][j] = s2 path[i][j] = (i-1, j) else: mm[i][j] = s3 path[i][j] = (i, j-1) bestScore = mm[n][m] alx = "" aly = "" i, j = n, m while i > 0 and j > 0: i2, j2 = path[i][j] if i==i2: alx+='-' else: alx+=x[i-1] if j==j2: aly+='-' else: aly+=y[j-1] i,j = i2, j2 while i>0: aly+='-' alx+=x[i-1] i -=1 while j>0: alx+='-' aly+=y[j-1] j -=1 aseq1 = SeqRecord(seq1) aseq1.seq = Seq(alx[::-1]) aseq2 = SeqRecord(seq2) aseq2.seq = Seq(aly[::-1]) return bestScore, aseq1, aseq2 def dict_to_weight(subs_dict): """Converts blosum dict to indexable numpy array. The score function is the critical op here, we do not need a dict lookup in the innermost loop. Returns: 26 x 26 numpy array, [0,0] being s('A', 'A') """ weight = np.zeros((26,26)) for (r1, r2), w in subs_dict.items(): r1 = ord(r1) - ord('A') r2 = ord(r2) - ord('A') weight[r1, r2] = weight[r2, r1] = w return weight def vec_align(seq1, seq2, mat=dict_to_weight(Bio.SubsMat.MatrixInfo.blosum62), gap=-8): """Proof of concept vector-style implementation of align, written mainly in numpy arrays operations. M(i,j) is the score of the best alignement containing exactly i letters of x and j letters of y. To allow vector operations, we rewrite M as S(s, k) with s=i+j and k=i-max(0,s-m) Hence: M(i,j) = S(i+j, i-max(0,i+j-m)) S(s, k) = M(k+max(0,s-m), min(s,m)-k) The relation become: M(i,j) = max( M(i,j-1)+gap, M(i-1,j)+gap, M(i-1,j-1)+s(i,j) ) S(s,k) = max( S(s-1,k+1(s>m))+gap, S(s-1,k-1+1(s>m))+gap, S(s-2,k-1+1(s>m)+1(s>m+1)) + s(k+max(0,s-m),min(s,m)-k) ) Note that S(s,k) does not depend anymore on S(k,.), allowing vector operation Last index of the diag s can be written (useful for simplifications): min(n,s)-max(0,s-m) = min(n, m, s, n+m-s) = min(m,s)-max(0,s-n) """ n = len(seq1.seq) m = len(seq2.seq) x = np.array(list(ord(c)-ord('A') for c in seq1.seq)) y = np.array(list(ord(c)-ord('A') for c in seq2.seq)) mat = mat.flatten() S = [ np.zeros(min(s,n,m,n+m-s)+1, dtype=np.int32) for s in range(n+m+1) ] S[0][0] = 0 S[1][0] = gap S[1][1] = gap x *= 26 for s in range(2,n+m+1): # First/last cell must be treated differently when first line/col of M if(s <= m): S[s][0] = S[s-1][0] + gap if(s <= n): S[s][s-max(0,s-m)] = S[s-1][s-1-max(0,s-1-m)] + gap # The offsets in the formula become begin/end of slices. gapped = S[s-1] + gap S[s][int(s<=m):len(S[s])-int(s<=n)] = np.maximum(np.maximum( gapped[1:], gapped[:-1]), S[s-2][int(s>m+1):len(S[s-2])-int(s>n+1)] +mat[ x[max(0,s-m-1):min(n,s-1)] + y[max(0,s-n-1):min(m,s-1)][::-1] ] ) # Trace back score = S[n+m][0] i,j = n,m alx, aly = "","" while i > 0 or j > 0: if j > 0: prev_score = S[i+j-1][i-max(0, i+j-m-1)] if prev_score + gap == score: score = prev_score j -= 1 aly += seq2[j] alx += '-' continue if i > 0: prev_score = S[i+j-1][i-max(0, i+j-m-1)-1] if prev_score + gap == score: score = prev_score i -= 1 alx += seq1[i] aly += '-' continue score = S[i+j-2][i-max(0, i+j-m-2)-1] i -= 1 j -= 1 alx += seq1[i] aly += seq2[j] aseq1 = SeqRecord(seq1) aseq1.seq = Seq(alx[::-1]) aseq2 = SeqRecord(seq2) aseq2.seq = Seq(aly[::-1]) return S[n+m][0], aseq1, aseq2 def align_dihedrals(seq1, seq2, angles1, angles2): """ Align two sequences, by using a simple cost function using dihedral angles as well as amino acids names. Arguments: seq1: First sequence as SeqRecord seq2: angles1: Dihedral angles for seq1, as a zipped phi-psi list angles2: """ align2steps(seq1, seq2, d=8, e=4, angles=True, angles1=angles1, angles2=angles2) def align2steps(seq1, seq2, d=8, e=4, angles=False, angles1=None, angles2=None): """Main implementation of align, handling also gap extensions. Adatped to handle dihedral costs. """ x = seq1.seq y = seq2.seq n=len(x) m=len(y) mm = [[0 for _ in range(m+1)] for _ in range(n+1)] # scores matrix mx = [[0 for _ in range(m+1)] for _ in range(n+1)] my = [[0 for _ in range(m+1)] for _ in range(n+1)] pathm = [[0 for _ in range(m+1)] for _ in range(n+1)] # last step for optimal score pathx = [[0 for _ in range(m+1)] for _ in range(n+1)] pathy = [[0 for _ in range(m+1)] for _ in range(n+1)] lower_bound = - (n+m) * max(e,d) # first line and col for i in range(1, n+1): mx[i][0] = -d -(i-1)*e pathx[i][0] = pathx # Best alignment ending with no gap but with 0 letters of one seq does # not exist... but is still used in the formulas below mm[i][0] = lower_bound # Same for best alignment with 0 letter of Y ending on a gap on the X # side with a letter of Y my[i][0] = lower_bound for j in range(1, m+1): my[0][j] = -d - (j-1)*e pathy[0][j] = pathy mm[0][j] = lower_bound mx[0][j] = lower_bound # fill table for i in range(1, n+1): for j in range(1, m+1): a = x[i-1] b = y[j-1] s = score(a,b) if(angles): s += 10 * dihedral_score(angles1[i-1], angles2[j-1]) # find max for M s1 = mm[i-1][j-1] + s s2 = mx[i-1][j-1] + s s3 = my[i-1][j-1] + s if s1 >= s2: if s1 >= s3: mm[i][j] = s1 pathm[i][j] = pathm else: mm[i][j] = s3 pathm[i][j] = pathy else: # s2 > s1 if s2 >= s3: mm[i][j] = s2 pathm[i][j] = pathx else: mm[i][j] = s3 pathm[i][j] = pathy # find max for I_x s4 = mm[i-1][j] - d s5 = mx[i-1][j] - e if s4 >= s5: mx[i][j] = s4 pathx[i][j] = pathm else: mx[i][j] = s5 pathx[i][j] = pathx # find max for I_y s6 = mm[i][j-1] - d s7 = my[i][j-1] - e if s6 >= s7: my[i][j] = s6 pathy[i][j] = pathm else: my[i][j] = s7 pathy[i][j] = pathy bestScore, bestPath = max( [(mm,pathm),(mx,pathx),(my,pathy)], key=(lambda t:t[0][n][m])) bestScore = bestScore[n][m] alx = "" aly = "" i, j = n, m path = bestPath while i > 0 and j > 0: prev_path = path[i][j] if path == pathm: i -= 1 j -= 1 alx += x[i] aly += y[j] elif path == pathy: j -= 1 alx += '-' aly += y[j] else: i -= 1 alx += x[i] aly += '-' path = prev_path while i>0: aly+='-' alx+=x[i-1] i -=1 while j>0: alx+='-' aly+=y[j-1] j -=1 alx = alx[::-1] aly = aly[::-1] aseq1 = SeqRecord(seq1) aseq1.seq = Seq(alx) aseq2 = SeqRecord(seq2) aseq2.seq = Seq(aly) return bestScore, aseq1, aseq2 def test(): """Obsolete test function, to be moved to unit tests in a separate file. """ # print(align("chat", "cat")) # print(align("chat", "cgat")) # print(align("chat", "at")) s, x, y = align2steps(seqs[0], seqs[1]) print(s) print(x.seq) print(y.seq) with open("balibase/us/1.fasta", "w") as fd: Bio.SeqIO.write((x, y), fd, "fasta") print("Ref:") ref = Bio.pairwise2.align.globalds( seqs[0].seq, seqs[1].seq, Bio.SubsMat.MatrixInfo.blosum62, -8, -4) for x, y, s, *_ in ref: print(s) print(x) print(y)