Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
B
BIOINF588
Manage
Activity
Members
Plan
Wiki
Code
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Deploy
Releases
Container Registry
Model registry
Analyze
Contributor analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Etienne MORICE
BIOINF588
Commits
e2463bc9
Verified
Commit
e2463bc9
authored
6 years ago
by
Etienne MORICE
Browse files
Options
Downloads
Patches
Plain Diff
Vectorised version of simple align. Performance tests to come.
parent
4dbdda67
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
alignementseq.py
+118
-7
118 additions, 7 deletions
alignementseq.py
alignementseq_tests.py
+35
-19
35 additions, 19 deletions
alignementseq_tests.py
with
153 additions
and
26 deletions
alignementseq.py
+
118
−
7
View file @
e2463bc9
...
@@ -6,6 +6,8 @@ Created on Tue Jan 15 15:02:16 2019
...
@@ -6,6 +6,8 @@ Created on Tue Jan 15 15:02:16 2019
@author: ariane.delrocq
@author: ariane.delrocq
"""
"""
import
numpy
as
np
import
Bio.SubsMat.MatrixInfo
import
Bio.SubsMat.MatrixInfo
import
Bio.SeqIO
import
Bio.SeqIO
from
Bio.SeqRecord
import
SeqRecord
from
Bio.SeqRecord
import
SeqRecord
...
@@ -21,8 +23,9 @@ import Bio.pairwise2
...
@@ -21,8 +23,9 @@ import Bio.pairwise2
mat
=
Bio
.
SubsMat
.
MatrixInfo
.
blosum62
mat
=
Bio
.
SubsMat
.
MatrixInfo
.
blosum62
def
score
(
a
,
b
,
gap
=-
8
):
def
score
(
a
,
b
,
gap
=-
8
,
id
=
1
,
mut
=-
1
):
"""
Default scoring function for align
"""
if
not
a
or
not
b
:
if
not
a
or
not
b
:
return
gap
return
gap
else
:
else
:
...
@@ -32,13 +35,16 @@ def score(a, b, gap=-8, id=1, mut=-1):
...
@@ -32,13 +35,16 @@ def score(a, b, gap=-8, id=1, mut=-1):
return
mat
[
a
,
b
]
return
mat
[
a
,
b
]
except
KeyError
:
except
KeyError
:
return
mat
[
b
,
a
]
return
mat
[
b
,
a
]
# elif a==b:
# return id
# else:
# return mut
def
align
(
seq1
,
seq2
,
score
=
score
):
def
align
(
seq1
,
seq2
,
score
=
score
):
"""
Main implementation of align, with blosum scoring and no specific gap
extension penalties.
Args:
seq1: A SeqRecord object
seq2: Same.
"""
x
=
seq1
.
seq
x
=
seq1
.
seq
y
=
seq2
.
seq
y
=
seq2
.
seq
n
=
len
(
x
)
n
=
len
(
x
)
...
@@ -106,8 +112,111 @@ def align(seq1, seq2, score=score):
...
@@ -106,8 +112,111 @@ def align(seq1, seq2, score=score):
aseq2
.
seq
=
Seq
(
aly
[::
-
1
])
aseq2
.
seq
=
Seq
(
aly
[::
-
1
])
return
bestScore
,
aseq1
,
aseq2
return
bestScore
,
aseq1
,
aseq2
def
dict_to_weight
(
subs_dict
):
"""
Converts blosum dict to indexable numpy array.
The score function is the critical op here, we do not need a dict lookup in
the innermost loop.
Returns:
26 x 26 numpy array, [0,0] being s(
'
A
'
,
'
A
'
)
"""
weight
=
np
.
zeros
((
26
,
26
))
for
(
r1
,
r2
),
w
in
subs_dict
.
items
():
r1
=
ord
(
r1
)
-
ord
(
'
A
'
)
r2
=
ord
(
r2
)
-
ord
(
'
A
'
)
weight
[
r1
,
r2
]
=
weight
[
r2
,
r1
]
=
w
return
weight
def
vec_align
(
seq1
,
seq2
,
mat
=
dict_to_weight
(
Bio
.
SubsMat
.
MatrixInfo
.
blosum62
),
gap
=-
8
):
"""
Proof of concept vector-style implementation of align, written mainly in numpy
arrays operations.
M(i,j) is the score of the best alignement containing exactly i letters of
x and j letters of y.
To allow vector operations, we rewrite M as S(s, k) with s=i+j and
k=i-max(0,s-m)
Hence:
M(i,j) = S(i+j, i-max(0,i+j-m))
S(s, k) = M(k+max(0,s-m), min(s,m)-k)
The relation become:
M(i,j) = max( M(i,j-1)+gap, M(i-1,j)+gap, M(i-1,j-1)+s(i,j) )
S(s,k) = max(
S(s-1,k+1(s>m))+gap,
S(s-1,k-1+1(s>m))+gap,
S(s-2,k-1+1(s>m)+1(s>m+1)) + s(k+max(0,s-m),min(s,m)-k)
)
Note that S(s,k) does not depend anymore on S(k,.), allowing vector operation
Last index of the diag s can be written (useful for simplifications):
min(n,s)-max(0,s-m)
= min(n, m, s, n+m-s)
= min(m,s)-max(0,s-n)
"""
n
=
len
(
seq1
.
seq
)
m
=
len
(
seq2
.
seq
)
x
=
np
.
array
(
list
(
ord
(
c
)
-
ord
(
'
A
'
)
for
c
in
seq1
.
seq
))
y
=
np
.
array
(
list
(
ord
(
c
)
-
ord
(
'
A
'
)
for
c
in
seq2
.
seq
))
mat
=
mat
.
flatten
()
S
=
[
np
.
zeros
(
min
(
s
,
n
,
m
,
n
+
m
-
s
)
+
1
,
dtype
=
np
.
int
)
for
s
in
range
(
n
+
m
+
1
)
]
S
[
0
][
0
]
=
0
S
[
1
][
0
]
=
gap
S
[
1
][
1
]
=
gap
for
s
in
range
(
2
,
n
+
m
+
1
):
# First/last cell must be treated differently when first line/col of M
if
(
s
<=
m
):
S
[
s
][
0
]
=
S
[
s
-
1
][
0
]
+
gap
if
(
s
<=
n
):
S
[
s
][
s
-
max
(
0
,
s
-
m
)]
=
S
[
s
-
1
][
s
-
1
-
max
(
0
,
s
-
1
-
m
)]
+
gap
# The offsets in the formula become begin/end of slices.
S
[
s
][
int
(
s
<=
m
):
len
(
S
[
s
])
-
int
(
s
<=
n
)]
=
np
.
maximum
(
np
.
maximum
(
S
[
s
-
1
][
1
:]
+
gap
,
S
[
s
-
1
][:
-
1
]
+
gap
),
S
[
s
-
2
][
int
(
s
>
m
+
1
):
len
(
S
[
s
-
2
])
-
int
(
s
>
n
+
1
)]
+
mat
[
x
[
max
(
0
,
s
-
m
-
1
):
min
(
n
,
s
-
1
)]
*
26
+
y
[
max
(
0
,
s
-
n
-
1
):
min
(
m
,
s
-
1
)][::
-
1
]
]
)
# Trace back
score
=
S
[
n
+
m
][
0
]
i
,
j
=
n
,
m
alx
,
aly
=
""
,
""
while
i
>
0
or
j
>
0
:
if
j
>
0
:
prev_score
=
S
[
i
+
j
-
1
][
i
-
max
(
0
,
i
+
j
-
m
-
1
)]
if
prev_score
+
gap
==
score
:
score
=
prev_score
j
-=
1
aly
+=
seq2
[
j
]
alx
+=
'
-
'
continue
if
i
>
0
:
prev_score
=
S
[
i
+
j
-
1
][
i
-
max
(
0
,
i
+
j
-
m
-
1
)
-
1
]
if
prev_score
+
gap
==
score
:
score
=
prev_score
i
-=
1
alx
+=
seq1
[
i
]
aly
+=
'
-
'
continue
score
=
S
[
i
+
j
-
2
][
i
-
max
(
0
,
i
+
j
-
m
-
2
)
-
1
]
i
-=
1
j
-=
1
alx
+=
seq1
[
i
]
aly
+=
seq2
[
j
]
aseq1
=
SeqRecord
(
seq1
)
aseq1
.
seq
=
Seq
(
alx
[::
-
1
])
aseq2
=
SeqRecord
(
seq2
)
aseq2
.
seq
=
Seq
(
aly
[::
-
1
])
return
S
[
n
+
m
][
0
],
aseq1
,
aseq2
def
align2steps
(
seq1
,
seq2
,
d
=
8
,
e
=
4
):
def
align2steps
(
seq1
,
seq2
,
d
=
8
,
e
=
4
):
"""
Main implementation of align, handling also gap extensions.
"""
x
=
seq1
.
seq
x
=
seq1
.
seq
y
=
seq2
.
seq
y
=
seq2
.
seq
n
=
len
(
x
)
n
=
len
(
x
)
...
@@ -228,6 +337,8 @@ def align2steps(seq1, seq2, d=8, e=4):
...
@@ -228,6 +337,8 @@ def align2steps(seq1, seq2, d=8, e=4):
return
bestScore
,
aseq1
,
aseq2
return
bestScore
,
aseq1
,
aseq2
def
test
():
def
test
():
"""
Obsolete test function, to be moved to unit tests in a separate file.
"""
# print(align("chat", "cat"))
# print(align("chat", "cat"))
# print(align("chat", "cgat"))
# print(align("chat", "cgat"))
# print(align("chat", "at"))
# print(align("chat", "at"))
...
...
This diff is collapsed.
Click to expand it.
alignementseq_tests.py
+
35
−
19
View file @
e2463bc9
...
@@ -8,6 +8,7 @@ import zipfile
...
@@ -8,6 +8,7 @@ import zipfile
import
Bio.SeqIO
import
Bio.SeqIO
import
Bio.pairwise2
import
Bio.pairwise2
from
Bio.SeqRecord
import
SeqRecord
from
Bio.SeqRecord
import
SeqRecord
from
Bio.SubsMat.MatrixInfo
import
blosum62
class
AlignmentSeqTestCase
(
unittest
.
TestCase
):
class
AlignmentSeqTestCase
(
unittest
.
TestCase
):
def
setUp
(
self
):
def
setUp
(
self
):
...
@@ -28,9 +29,10 @@ class AlignmentSeqTestCase(unittest.TestCase):
...
@@ -28,9 +29,10 @@ class AlignmentSeqTestCase(unittest.TestCase):
with
zipfile
.
ZipFile
(
balibase_zippath
)
as
balibase_zip
:
with
zipfile
.
ZipFile
(
balibase_zippath
)
as
balibase_zip
:
balibase_zip
.
extractall
()
balibase_zip
.
extractall
()
# Generator function to iterate over the first two sequences of each
# unaligned fasta file.
def
get_dataset_heads
(
self
):
def
get_dataset_heads
(
self
):
"""
Generator function to iterate over the first two sequences of each
unaligned fasta file.
"""
dataset_dir
=
os
.
path
.
join
(
self
.
balibase_path
,
"
RV11.unaligned
"
)
dataset_dir
=
os
.
path
.
join
(
self
.
balibase_path
,
"
RV11.unaligned
"
)
for
filename
in
os
.
listdir
(
dataset_dir
):
for
filename
in
os
.
listdir
(
dataset_dir
):
records
=
Bio
.
SeqIO
.
parse
(
records
=
Bio
.
SeqIO
.
parse
(
...
@@ -40,24 +42,26 @@ class AlignmentSeqTestCase(unittest.TestCase):
...
@@ -40,24 +42,26 @@ class AlignmentSeqTestCase(unittest.TestCase):
seq2
=
next
(
records
)
seq2
=
next
(
records
)
yield
seq1
,
seq2
,
filename
yield
seq1
,
seq2
,
filename
# Strip strings of their '-' before comparing them
def
assertSameResidues
(
self
,
str1
,
str2
):
def
assertSameResidues
(
self
,
str1
,
str2
):
"""
Strip strings of their
'
-
'
before comparing them
"""
return
self
.
assertEqual
(
return
self
.
assertEqual
(
str
(
str1
).
translate
({
ord
(
'
-
'
):
None
}),
str
(
str1
).
translate
({
ord
(
'
-
'
):
None
}),
str
(
str2
).
translate
({
ord
(
'
-
'
):
None
})
str
(
str2
).
translate
({
ord
(
'
-
'
):
None
})
)
)
# Test alignments with the simplest metric. As there can be a huge number of
# solutions, we check only that we got the right score, and one valid
# alignment.
def
test_simple_align
(
self
):
def
test_simple_align
(
self
):
"""
Test alignments with the simplest metric. As there can be a huge number of
solutions, we check only that we got the right score, and one valid
alignment.
"""
from
alignementseq
import
align
from
alignementseq
import
align
score_fn
=
lambda
a
,
b
:
-
2
if
a
==
''
or
b
==
''
else
-
1
if
a
!=
b
else
1
score_fn
=
lambda
a
,
b
:
-
2
if
a
==
''
or
b
==
''
else
-
1
if
a
!=
b
else
1
for
s1
,
s2
,
filename
in
self
.
get_dataset_heads
():
for
s1
,
s2
,
filename
in
self
.
get_dataset_heads
():
score
,
r1
,
r2
=
align
(
s1
,
s2
,
score_fn
)
score
,
r1
,
r2
=
align
(
s1
,
s2
,
score_fn
)
(
ex_r1
,
ex_r2
,
exp_score
,
*
_
)
=
Bio
.
pairwise2
.
align
.
globalms
(
(
ex_r1
,
ex_r2
,
exp_score
,
*
_
)
=
Bio
.
pairwise2
.
align
.
globalms
(
s1
.
seq
,
s2
.
seq
,
1
,
-
1
,
-
2
,
-
2
)[
0
]
s1
.
seq
,
s2
.
seq
,
1
,
-
1
,
-
2
,
-
2
,
one_alignment_only
=
True
)[
0
]
try
:
try
:
self
.
assertSameResidues
(
r1
.
seq
,
s1
.
seq
)
self
.
assertSameResidues
(
r1
.
seq
,
s1
.
seq
)
...
@@ -78,18 +82,30 @@ class AlignmentSeqTestCase(unittest.TestCase):
...
@@ -78,18 +82,30 @@ class AlignmentSeqTestCase(unittest.TestCase):
raise
raise
def
test_blosum_align
(
self
):
def
test_blosum_align
(
self
):
from
alignementseq
import
align
from
alignementseq
import
align
,
vec_align
cases
=
[
for
s1
,
s2
,
filename
in
self
.
get_dataset_heads
():
(
"
CHAT
"
,
"
CAT
"
,
10
,
"
CHAT
"
,
"
C-AT
"
),
(
ex_r1
,
ex_r2
,
exp_score
,
*
_
)
=
Bio
.
pairwise2
.
align
.
globalds
(
(
"
CHAT
"
,
"
CGAT
"
,
16
,
"
CHAT
"
,
"
CGAT
"
),
s1
.
seq
,
s2
.
seq
,
blosum62
,
-
8
,
-
8
,
one_alignment_only
=
True
)[
0
]
(
"
CHAT
"
,
"
AT
"
,
-
7
,
"
CHAT
"
,
"
--AT
"
)
for
method
in
align
,
vec_align
:
]
score
,
r1
,
r2
=
method
(
s1
,
s2
)
for
s1
,
s2
,
exp_score
,
exp_r1
,
exp_r2
in
cases
:
try
:
score
,
r1
,
r2
=
align
(
SeqRecord
(
s1
),
SeqRecord
(
s2
))
self
.
assertSameResidues
(
r1
.
seq
,
s1
.
seq
)
self
.
assertEqual
(
score
,
exp_score
)
self
.
assertSameResidues
(
r2
.
seq
,
s2
.
seq
)
self
.
assertEqual
(
r1
.
seq
,
exp_r1
)
self
.
assertEqual
(
score
,
sum
(
self
.
assertEqual
(
r2
.
seq
,
exp_r2
)
-
8
if
a
==
'
-
'
or
b
==
'
-
'
else
blosum62
[
a
,
b
]
if
(
a
,
b
)
in
blosum62
else
blosum62
[
b
,
a
]
for
a
,
b
in
zip
(
r1
.
seq
,
r2
.
seq
)))
self
.
assertEqual
(
score
,
exp_score
)
except
AssertionError
:
print
(
'
from
'
,
filename
,
"
with
"
,
method
,
"
:
"
)
print
(
r1
.
seq
)
print
(
r2
.
seq
)
print
(
''
)
print
(
ex_r1
)
print
(
ex_r2
)
raise
if
__name__
==
'
__main__
'
:
if
__name__
==
'
__main__
'
:
unittest
.
main
()
unittest
.
main
()
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment