Skip to content
Snippets Groups Projects
Verified Commit b110234a authored by Etienne MORICE's avatar Etienne MORICE
Browse files

Multiple align from linkage matrices

Refactored multiple align to isolate alignment from tree browsing logic,
and wrote a logic to browse scipy-style linkage matrix + tests +
updated deps
parent fb6ecdb9
No related branches found
No related tags found
No related merge requests found
...@@ -6,8 +6,48 @@ def multiple_align_from_tree(tree): ...@@ -6,8 +6,48 @@ def multiple_align_from_tree(tree):
Arguments: Arguments:
tree: nested structure of python tuples, with SeqRecord objects as leafs tree: nested structure of python tuples, with SeqRecord objects as leafs
""" """
def _join(node):
"""Recursive helper to browse tree
Arguments:
node: (node, node) | SeqRecord
"""
if not isinstance(node, tuple):
return [node]
left, right = node
aleft = _join(left)
aright = _join(right)
return join_alignments(aleft, aright)
return _join(tree) return _join(tree)
def multiple_align_from_linkage_matrix(sequences, tree):
"""Multiple alignmnent from a SeqRecord tree
Arguments:
sequences: indexable container of SeqRecord objects to align
tree: scipy.hierarchy linkage matrix
"""
n = len(sequences)
def _join(inode):
"""Recursive helper to browse tree
Arguments:
inode: index of node row in linkage matrix
"""
inode = int(inode)
if inode < n:
return [sequences[inode]]
ileft, iright, dist, size = tree[inode - n]
aleft = _join(ileft)
aright = _join(iright)
return join_alignments(aleft, aright)
return _join(n+len(tree)-1)
def _score(a, b, base_score=score): def _score(a, b, base_score=score):
"""Generalized one-letter score, with '-' as new letter. """Generalized one-letter score, with '-' as new letter.
""" """
...@@ -37,17 +77,13 @@ def _mscore_gap(seqs, i, gap): ...@@ -37,17 +77,13 @@ def _mscore_gap(seqs, i, gap):
return gap * sum(x[i] != '-' for x in seqs) return gap * sum(x[i] != '-' for x in seqs)
def _join(node, _mscore=_mscore, gap=-8): def join_alignments(aleft, aright, _mscore=_mscore, gap=-8):
""" """
Takes two multiple alignments as lists of SeqRecords, and align them.
Returns: Returns:
Multiple align as list of aligned SeqRecords Multiple align as list of aligned SeqRecords
""" """
if not isinstance(node, tuple):
return [node]
left, right = node
aleft = _join(left)
aright = _join(right)
n = len(aleft[0]) n = len(aleft[0])
m = len(aright[0]) m = len(aright[0])
......
biopython==1.73 biopython>=1.73
pandas==0.23.3 pandas>=0.23.3
matplotlib==2.0.0 matplotlib>=2.0.0
numpy>=1.16.1
scipy>=1.2.0
...@@ -2,6 +2,7 @@ import unittest ...@@ -2,6 +2,7 @@ import unittest
import alignementseq_multiple import alignementseq_multiple
from Bio.SeqRecord import SeqRecord
class MultipleAlignTestCase(unittest.TestCase): class MultipleAlignTestCase(unittest.TestCase):
def test_align_from_tree(self): def test_align_from_tree(self):
...@@ -9,3 +10,18 @@ class MultipleAlignTestCase(unittest.TestCase): ...@@ -9,3 +10,18 @@ class MultipleAlignTestCase(unittest.TestCase):
alignementseq_multiple.multiple_align_from_tree((("CHAT", "CAT"), "HER")), alignementseq_multiple.multiple_align_from_tree((("CHAT", "CAT"), "HER")),
['CHAT', 'C-AT', 'H-ER'] ['CHAT', 'C-AT', 'H-ER']
) )
def test_align_from_linkage_matrix(self):
import tree_generator
import alignementseq
seqs = [ SeqRecord(s) for s in ("CHAT", "CAT", "HER") ]
tree = tree_generator.tree_build(seqs, alignementseq.vec_align)
aseqs = alignementseq_multiple.multiple_align_from_linkage_matrix(
seqs,
tree
)
aseqs_ref = [ "CHAT", "C-AT", "H-ER" ]
aseqs_ref.sort()
aseqs.sort()
self.assertEqual(aseqs_ref, aseqs)
...@@ -11,7 +11,7 @@ from Bio.Cluster import treecluster ...@@ -11,7 +11,7 @@ from Bio.Cluster import treecluster
import alignementseq import alignementseq
from scipy.cluster.hierarchy import average, dendrogram from scipy.cluster.hierarchy import average, dendrogram
seqs = [SeqRecord('TAACCCCAAAAGAACCAGA'), SeqRecord('TTCTGTAGTAGAGATGGAATTAAGAAAAAACCATCAACTATAACCCCAAAAGAACCAGA'), SeqRecord('TTCTGTAGTAGAGATGGAATTAAGAAAAAACCATCAACTATAACCCCAAAAGAACCAGA'),SeqRecord('GTAGTAGAGATGGAATTAAGAAAAAACCATCAACTATAACCCCAAGAGAACCAGA'), SeqRecord('GAGCCGGATGAGAAGAAACTCTCATGTCCGGTTCTGTAGTAGAGATGGAATTAAGAAAAAACCATCAACTATAACCCCAAGAGAACCAGA'), SeqRecord('TTTTCATTCGCGAGGAGCCGGATGAGAAGAAACTCTCATGTCCGGTTCTGTAGTAGAGATGGAATTAAGAAAAAACCATCAACTATAACCCCAAGAGAACCAGA')] #seqs = [SeqRecord('TAACCCCAAAAGAACCAGA'), SeqRecord('TTCTGTAGTAGAGATGGAATTAAGAAAAAACCATCAACTATAACCCCAAAAGAACCAGA'), SeqRecord('TTCTGTAGTAGAGATGGAATTAAGAAAAAACCATCAACTATAACCCCAAAAGAACCAGA'),SeqRecord('GTAGTAGAGATGGAATTAAGAAAAAACCATCAACTATAACCCCAAGAGAACCAGA'), SeqRecord('GAGCCGGATGAGAAGAAACTCTCATGTCCGGTTCTGTAGTAGAGATGGAATTAAGAAAAAACCATCAACTATAACCCCAAGAGAACCAGA'), SeqRecord('TTTTCATTCGCGAGGAGCCGGATGAGAAGAAACTCTCATGTCCGGTTCTGTAGTAGAGATGGAATTAAGAAAAAACCATCAACTATAACCCCAAGAGAACCAGA')]
#Arguments : une liste de séquences, une fonction d'alignement #Arguments : une liste de séquences, une fonction d'alignement
score_gap = -8 score_gap = -8
...@@ -22,12 +22,12 @@ def tree_build(seqs, align): ...@@ -22,12 +22,12 @@ def tree_build(seqs, align):
for j in range (i): for j in range (i):
M[i][j], a, b = align(seqs[i], seqs[j]) M[i][j], a, b = align(seqs[i], seqs[j])
M[i][j] = 1-abs(M[i][j]/(max(len_seqs)*(11-score_gap))) M[i][j] = 1-abs(M[i][j]/(max(len_seqs)*(11-score_gap)))
print(type(M), M) # print(type(M), M)
# tree = treecluster(None, distancematrix=M, method = 'a', dist = 'e',) # tree = treecluster(None, distancematrix=M, method = 'a', dist = 'e',)
tree= average(M) tree= average(M)
return tree return tree
tree = tree_build(seqs, alignementseq.align2steps) #tree = tree_build(seqs, alignementseq.align2steps)
print(tree) #print(tree)
dn = dendrogram(tree) #dn = dendrogram(tree)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment