Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
B
BIOINF588
Manage
Activity
Members
Plan
Wiki
Code
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Deploy
Releases
Container Registry
Model registry
Analyze
Contributor analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Etienne MORICE
BIOINF588
Commits
60dc8f50
Verified
Commit
60dc8f50
authored
6 years ago
by
Etienne MORICE
Browse files
Options
Downloads
Patches
Plain Diff
Benchmark and switch to traceback matrix for vec_align for #10
parent
b2053a14
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
alignementseq.py
+105
-0
105 additions, 0 deletions
alignementseq.py
alignementseq_multiple.py
+25
-10
25 additions, 10 deletions
alignementseq_multiple.py
alignementseq_tests.py
+45
-25
45 additions, 25 deletions
alignementseq_tests.py
tree_generator.py
+6
-6
6 additions, 6 deletions
tree_generator.py
with
181 additions
and
41 deletions
alignementseq.py
+
105
−
0
View file @
60dc8f50
...
...
@@ -232,6 +232,111 @@ def vec_align(seq1, seq2, mat=dict_to_weight(Bio.SubsMat.MatrixInfo.blosum62), g
aseq2
.
seq
=
Seq
(
aly
[::
-
1
])
return
S
[
n
+
m
][
0
],
aseq1
,
aseq2
def
vec_align2
(
seq1
,
seq2
,
mat
=
dict_to_weight
(
Bio
.
SubsMat
.
MatrixInfo
.
blosum62
),
gap
=-
8
):
"""
Proof of concept vector-style implementation of align, written mainly in numpy
arrays operations.
M(i,j) is the score of the best alignement containing exactly i letters of
x and j letters of y.
To allow vector operations, we rewrite M as S(s, k) with s=i+j and
k=i-max(0,s-m)
Hence:
M(i,j) = S(i+j, i-max(0,i+j-m))
S(s, k) = M(k+max(0,s-m), min(s,m)-k)
The relation become:
M(i,j) = max( M(i,j-1)+gap, M(i-1,j)+gap, M(i-1,j-1)+s(i,j) )
S(s,k) = max(
S(s-1,k+1(s>m))+gap,
S(s-1,k-1+1(s>m))+gap,
S(s-2,k-1+1(s>m)+1(s>m+1)) + s(k+max(0,s-m),min(s,m)-k)
)
Note that S(s,k) does not depend anymore on S(k,.), allowing vector operation
Last index of the diag s can be written (useful for simplifications):
min(n,s)-max(0,s-m)
= min(n, m, s, n+m-s)
= min(m,s)-max(0,s-n)
"""
n
=
len
(
seq1
.
seq
)
m
=
len
(
seq2
.
seq
)
x
=
np
.
array
(
list
(
ord
(
c
)
-
ord
(
'
A
'
)
for
c
in
seq1
.
seq
))
y
=
np
.
array
(
list
(
ord
(
c
)
-
ord
(
'
A
'
)
for
c
in
seq2
.
seq
))
mat
=
mat
.
flatten
()
# Traceback matrix, bit 0 = gap, bit 1 = gap on y side
T
=
[
np
.
empty
(
min
(
s
,
n
,
m
,
n
+
m
-
s
)
+
1
,
dtype
=
np
.
byte
)
for
s
in
range
(
n
+
m
+
1
)
]
# Scores, we just have to remember three lines
S
=
tuple
(
np
.
empty
(
min
(
n
,
m
)
+
1
,
dtype
=
np
.
int32
)
for
s
in
range
(
3
)
)
S
[
0
][
0
]
=
0
S
[
1
][
0
]
=
gap
S
[
1
][
1
]
=
gap
# T[0][0] never used
T
[
1
][
0
]
=
1
|
0
T
[
1
][
1
]
=
1
|
2
x
*=
26
for
s
in
range
(
2
,
n
+
m
+
1
):
# First/last cell must be treated differently when first line/col of M
if
(
s
<=
m
):
S
[
2
][
0
]
=
S
[
1
][
0
]
+
gap
T
[
s
][
0
]
=
1
|
0
if
(
s
<=
n
):
S
[
2
][
s
-
max
(
0
,
s
-
m
)]
=
S
[
1
][
s
-
1
-
max
(
0
,
s
-
1
-
m
)]
+
gap
T
[
s
][
s
-
max
(
0
,
s
-
m
)]
=
1
|
2
# The offsets in the formula become begin/end of slices.
gapped
=
S
[
1
]
+
gap
max_gap
=
np
.
maximum
(
gapped
[
1
:
len
(
T
[
s
-
1
])],
gapped
[:
len
(
T
[
s
-
1
])
-
1
])
T
[
s
][
int
(
s
<=
m
):
len
(
T
[
s
])
-
int
(
s
<=
n
)]
=
np
.
equal
(
max_gap
,
gapped
[:
len
(
T
[
s
-
1
])
-
1
]
)
<<
1
S
[
2
][
int
(
s
<=
m
):
len
(
T
[
s
])
-
int
(
s
<=
n
)]
=
np
.
maximum
(
max_gap
,
S
[
0
][
int
(
s
>
m
+
1
):
len
(
T
[
s
-
2
])
-
int
(
s
>
n
+
1
)]
+
mat
[
x
[
max
(
0
,
s
-
m
-
1
):
min
(
n
,
s
-
1
)]
+
y
[
max
(
0
,
s
-
n
-
1
):
min
(
m
,
s
-
1
)][::
-
1
]
]
)
T
[
s
][
int
(
s
<=
m
):
len
(
T
[
s
])
-
int
(
s
<=
n
)]
|=
np
.
equal
(
max_gap
,
S
[
2
][
int
(
s
<=
m
):
len
(
T
[
s
])
-
int
(
s
<=
n
)]
)
S
=
(
S
[
1
],
S
[
2
],
S
[
0
])
# Trace back
i
,
j
=
n
,
m
alx
,
aly
=
""
,
""
while
i
>
0
or
j
>
0
:
is_gap
=
T
[
i
+
j
][
i
-
max
(
0
,
i
+
j
-
m
)]
&
1
if
is_gap
:
is_y_gap
=
T
[
i
+
j
][
i
-
max
(
0
,
i
+
j
-
m
)]
&
2
if
is_y_gap
:
i
-=
1
alx
+=
seq1
[
i
]
aly
+=
'
-
'
else
:
j
-=
1
aly
+=
seq2
[
j
]
alx
+=
'
-
'
else
:
i
-=
1
j
-=
1
alx
+=
seq1
[
i
]
aly
+=
seq2
[
j
]
aseq1
=
SeqRecord
(
seq1
)
aseq1
.
seq
=
Seq
(
alx
[::
-
1
])
aseq2
=
SeqRecord
(
seq2
)
aseq2
.
seq
=
Seq
(
aly
[::
-
1
])
return
S
[
1
][
0
],
aseq1
,
aseq2
def
align_dihedrals
(
seq1
,
seq2
,
angles1
,
angles2
):
"""
Align two sequences, by using a simple cost function using dihedral angles
...
...
This diff is collapsed.
Click to expand it.
alignementseq_multiple.py
+
25
−
10
View file @
60dc8f50
from
alignementseq
import
score
from
alignementseq
import
score
,
vec_align
from
Bio.SeqRecord
import
SeqRecord
from
Bio.Seq
import
Seq
import
tree_generator
def
multiple_align
(
sequences
):
"""
Top-level multiple alignment routine
Arguments:
sequences: lists of SeqRecord objects
Returns:
List of aligned SeqRecord objects, with
'
-
'
where needed.
"""
tree
=
tree_generator
.
tree_build
(
sequences
,
vec_align
)
return
multiple_align_from_linkage_matrix
(
sequences
,
tree
)
def
multiple_align_from_tree
(
tree
):
"""
Multiple alignmnent from a SeqRecord tree
Arguments:
tree: nested structure of python tuples, with SeqRecord objects as leafs
"""
...
...
@@ -27,7 +42,7 @@ def multiple_align_from_tree(tree):
def
multiple_align_from_linkage_matrix
(
sequences
,
tree
):
"""
Multiple alignmnent from a SeqRecord tree
Arguments:
sequences: indexable container of SeqRecord objects to align
tree: scipy.hierarchy linkage matrix
...
...
@@ -43,7 +58,7 @@ def multiple_align_from_linkage_matrix(sequences, tree):
inode
=
int
(
inode
)
if
inode
<
n
:
return
[
sequences
[
inode
]]
ileft
,
iright
,
dist
,
size
=
tree
[
inode
-
n
]
ileft
,
iright
,
_
,
_
=
tree
[
inode
-
n
]
aleft
=
_join
(
ileft
)
aright
=
_join
(
iright
)
return
join_alignments
(
aleft
,
aright
)
...
...
@@ -58,12 +73,12 @@ def _score(a, b, base_score=score):
if
b
==
'
-
'
:
return
0
else
:
return
score
(
''
,
b
)
return
base_
score
(
''
,
b
)
else
:
if
b
==
'
-
'
:
return
score
(
''
,
a
)
return
base_
score
(
''
,
a
)
else
:
return
score
(
a
,
b
)
return
base_
score
(
a
,
b
)
def
_mscore
(
seqs1
,
seqs2
,
i
,
j
,
_score
=
_score
):
...
...
@@ -135,9 +150,9 @@ def join_alignments(aleft, aright, _mscore=_mscore, gap=-8):
for
jseq
in
range
(
len
(
aright
)):
aseqs
[
len
(
aleft
)
+
jseq
]
+=
aright
[
jseq
][
nextj
]
i
,
j
=
nexti
,
nextj
ret
=
[
SeqRecord
(
id
=
s
.
id
,
seq
=
""
)
for
s
in
aleft
]
+
[
SeqRecord
(
id
=
s
.
id
,
seq
=
""
)
for
s
in
aright
]
ret
=
[
SeqRecord
(
id
=
s
.
id
,
seq
=
""
,
description
=
s
.
description
)
for
alignment
in
(
aleft
,
aright
)
for
s
in
alignment
]
for
rec
,
seq
in
zip
(
ret
,
aseqs
):
rec
.
seq
=
Seq
(
seq
[::
-
1
])
return
ret
This diff is collapsed.
Click to expand it.
alignementseq_tests.py
+
45
−
25
View file @
60dc8f50
...
...
@@ -176,8 +176,7 @@ class BalibaseTestCase(unittest.TestCase):
yield
records
,
filename
class
AlignmentSeqTestCase
(
BalibaseTestCase
):
"""
TestCase for alignment correctness and quality
"""
TestCase for alignment correctness and quality
"""
def
assert_same_residues
(
self
,
str1
,
str2
):
...
...
@@ -226,13 +225,13 @@ class AlignmentSeqTestCase(BalibaseTestCase):
"""
Tests alignments with blosum but no gap extension.
"""
from
alignementseq
import
align
,
vec_align
from
alignementseq
import
align
,
vec_align
,
vec_align2
aligner
=
Bio
.
pairwise2
.
align
for
seq1
,
seq2
,
filename
in
self
.
get_dataset_heads
():
(
ex_r1
,
ex_r2
,
exp_score
,
*
_
)
=
aligner
.
globalds
(
seq1
.
seq
,
seq2
.
seq
,
blosum62
,
-
8
,
-
8
,
one_alignment_only
=
True
)[
0
]
for
method
in
align
,
vec_align
:
for
method
in
align
,
vec_align
,
vec_align2
:
score
,
ret1
,
ret2
=
method
(
seq1
,
seq2
)
try
:
self
.
assert_same_residues
(
ret1
.
seq
,
seq1
.
seq
)
...
...
@@ -328,37 +327,58 @@ class AlignmentSeqTestCase(BalibaseTestCase):
for
j
in
jobs
:
j
.
join
()
def
create_alignments_with
(
self
,
method
,
method_name
=
None
):
"""
Create fasta files with aligned sequences using the given method, under
balibase_path/<method_name>
"""
dataset_dir
=
os
.
path
.
join
(
self
.
balibase_path
,
"
RV11.unaligned
"
)
out_dir
=
os
.
path
.
join
(
self
.
balibase_path
,
method_name
or
method
.
__name__
)
for
in_filename
in
os
.
listdir
(
dataset_dir
):
sequences
=
list
(
Bio
.
SeqIO
.
parse
(
in_filename
,
"
fasta
"
))
def
test_benchmark_multiple_align
(
self
):
"""
Tests the multiple_align function (using blosum and gap extension).
"""
"""
Perform multiple alignments with different methods and evaluate the
results with bali_score
"""
res_dir
=
"
balibase_results
"
if
not
os
.
path
.
isdir
(
res_dir
):
os
.
mkdir
(
res_dir
)
# Cached
# Define methods to test
from
alignementseq_multiple
import
multiple_align
methods_test
=
[
multiple_align
]
methods_test_names
=
[
m
.
__name__
for
m
in
methods_test
]
methods_eval_names
=
methods_test_names
+
[
"
clustalo
"
]
# Create clustalo alignment separately, and cache them
self
.
create_clustalo_alignments
()
# Align and eval with every other method
## Directories
dataset_dir
=
os
.
path
.
join
(
self
.
balibase_path
,
"
RV11.unaligned
"
)
ref_dataset_dir
=
os
.
path
.
join
(
self
.
balibase_path
,
"
RV11.aligned
"
)
clustalo_dir
=
os
.
path
.
join
(
self
.
balibase_path
,
"
clustalo
"
)
for
name
in
methods_test_names
:
out_dir
=
os
.
path
.
join
(
self
.
balibase_path
,
name
)
if
not
os
.
path
.
isdir
(
out_dir
):
os
.
mkdir
(
out_dir
)
scores
=
[]
for
filename
in
os
.
listdir
(
ref_dataset_dir
):
scores
.
append
(
self
.
bali_score
(
os
.
path
.
join
(
ref_dataset_dir
,
filename
),
os
.
path
.
join
(
clustalo_dir
,
filename
)
))
for
in_filename
in
os
.
listdir
(
dataset_dir
):
# Read input
sequences
=
list
(
Bio
.
SeqIO
.
parse
(
os
.
path
.
join
(
dataset_dir
,
in_filename
),
"
fasta
"
))
# Create output for all but clustalo
for
method
,
name
in
zip
(
methods_test
,
methods_test_names
):
out_dir
=
os
.
path
.
join
(
self
.
balibase_path
,
name
)
with
open
(
os
.
path
.
join
(
out_dir
,
in_filename
),
"
w
"
)
as
out_fd
:
aseqs
=
method
(
sequences
)
Bio
.
SeqIO
.
write
(
aseqs
,
out_fd
,
"
fasta
"
)
for
name
in
methods_eval_names
:
out_dir
=
os
.
path
.
join
(
self
.
balibase_path
,
name
)
scores
.
append
(
self
.
bali_score
(
os
.
path
.
join
(
ref_dataset_dir
,
in_filename
),
os
.
path
.
join
(
out_dir
,
in_filename
)
))
scores
[
-
1
][
'
method
'
]
=
name
scores
=
pd
.
DataFrame
(
scores
)
scores
.
to_csv
(
os
.
path
.
join
(
res_dir
,
"
clustalo.csv
"
))
...
...
@@ -439,7 +459,7 @@ class PerformanceTestCase(BalibaseTestCase):
def
test_performance
(
self
):
"""
Create timing graph comparing speed of simple alignment methods
"""
from
alignementseq
import
align
,
vec_align
from
alignementseq
import
align
,
vec_align
,
vec_align2
aligner
=
Bio
.
pairwise2
.
align
def
biopython_align
(
seq1
,
seq2
):
return
aligner
.
globalds
(
...
...
@@ -447,7 +467,7 @@ class PerformanceTestCase(BalibaseTestCase):
one_alignment_only
=
True
)[
0
]
methods
=
(
vec_align
,
align
,
biopython_align
)
methods
=
(
vec_align
,
align
,
biopython_align
,
vec_align2
)
times
=
pd
.
DataFrame
(
columns
=
(
"
length1
"
,
"
length2
"
,
"
method
"
,
"
time
"
),
dtype
=
float
)
for
seq1
,
seq2
,
filename
in
self
.
get_dataset_heads
():
...
...
This diff is collapsed.
Click to expand it.
tree_generator.py
+
6
−
6
View file @
60dc8f50
...
...
@@ -8,10 +8,10 @@ Created on Tue Jan 29 14:59:19 2019
import
numpy
as
np
from
Bio.SeqRecord
import
SeqRecord
from
Bio.Cluster
import
treecluster
import
alignementseq
from
scipy.cluster.hierarchy
import
average
,
dendrogram
#
import alignementseq
from
scipy.cluster.hierarchy
import
average
#
, dendrogram
seqs
=
[
SeqRecord
(
'
TAACCCCAAAAGAACCAGA
'
),
SeqRecord
(
'
TTCTGTAGTAGAGATGGAATTAAGAAAAAACCATCAACTATAACCCCAAAAGAACCAGA
'
),
SeqRecord
(
'
TTCTGTAGTAGAGATGGAATTAAGAAAAAACCATCAACTATAACCCCAAAAGAACCAGA
'
),
SeqRecord
(
'
GTAGTAGAGATGGAATTAAGAAAAAACCATCAACTATAACCCCAAGAGAACCAGA
'
),
SeqRecord
(
'
GAGCCGGATGAGAAGAAACTCTCATGTCCGGTTCTGTAGTAGAGATGGAATTAAGAAAAAACCATCAACTATAACCCCAAGAGAACCAGA
'
),
SeqRecord
(
'
TTTTCATTCGCGAGGAGCCGGATGAGAAGAAACTCTCATGTCCGGTTCTGTAGTAGAGATGGAATTAAGAAAAAACCATCAACTATAACCCCAAGAGAACCAGA
'
)]
_test_
seqs
=
[
SeqRecord
(
'
TAACCCCAAAAGAACCAGA
'
),
SeqRecord
(
'
TTCTGTAGTAGAGATGGAATTAAGAAAAAACCATCAACTATAACCCCAAAAGAACCAGA
'
),
SeqRecord
(
'
TTCTGTAGTAGAGATGGAATTAAGAAAAAACCATCAACTATAACCCCAAAAGAACCAGA
'
),
SeqRecord
(
'
GTAGTAGAGATGGAATTAAGAAAAAACCATCAACTATAACCCCAAGAGAACCAGA
'
),
SeqRecord
(
'
GAGCCGGATGAGAAGAAACTCTCATGTCCGGTTCTGTAGTAGAGATGGAATTAAGAAAAAACCATCAACTATAACCCCAAGAGAACCAGA
'
),
SeqRecord
(
'
TTTTCATTCGCGAGGAGCCGGATGAGAAGAAACTCTCATGTCCGGTTCTGTAGTAGAGATGGAATTAAGAAAAAACCATCAACTATAACCCCAAGAGAACCAGA
'
)]
#Arguments : une liste de séquences SeqRecord, une fonction d'alignement
score_gap
=
-
8
...
...
@@ -20,14 +20,14 @@ def tree_build(seqs, align):
M
=
np
.
zeros
((
len
(
seqs
),
len
(
seqs
)))
for
i
in
range
(
len
(
seqs
)):
for
j
in
range
(
i
):
M
[
i
][
j
],
a
,
b
=
align
(
seqs
[
i
],
seqs
[
j
])
M
[
i
][
j
],
_
,
_
=
align
(
seqs
[
i
],
seqs
[
j
])
M
[
i
][
j
]
=
1
-
abs
(
M
[
i
][
j
]
/
(
max
(
len_seqs
)
*
(
11
-
score_gap
)))
print
(
type
(
M
),
M
)
#
print(type(M), M)
# tree1 = treecluster(None, method = 'a', dist = 'e', distancematrix=M)
tree
=
average
(
M
)
return
tree
#tree = tree_build(seqs, alignementseq.align2steps)
#tree = tree_build(
_test_
seqs, alignementseq.align2steps)
#print(tree)
#dn = dendrogram(tree)
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment