Skip to content
Snippets Groups Projects
Verified Commit 61a442bf authored by Etienne MORICE's avatar Etienne MORICE
Browse files

Trying to match structural information to balibase, many problems

parent ee758231
No related branches found
No related tags found
No related merge requests found
...@@ -4,3 +4,4 @@ __pycache__/ ...@@ -4,3 +4,4 @@ __pycache__/
balibase.zip balibase.zip
atlas atlas
timings.png timings.png
data/
...@@ -217,6 +217,20 @@ def vec_align(seq1, seq2, mat=dict_to_weight(Bio.SubsMat.MatrixInfo.blosum62), g ...@@ -217,6 +217,20 @@ def vec_align(seq1, seq2, mat=dict_to_weight(Bio.SubsMat.MatrixInfo.blosum62), g
aseq2.seq = Seq(aly[::-1]) aseq2.seq = Seq(aly[::-1])
return S[n+m][0], aseq1, aseq2 return S[n+m][0], aseq1, aseq2
def align_dihedrals(seq1, seq2, struct1, struct2):
"""
Align two sequences, by using a simple cost function using dihedral angles
as well as amino acids names.
Arguments:
seq1: First sequence as SeqRecord
seq2:
struct1: First structure as Bio.PDB object
struct2:
"""
print("Not implemented")
def align2steps(seq1, seq2, d=8, e=4): def align2steps(seq1, seq2, d=8, e=4):
"""Main implementation of align, handling also gap extensions. """Main implementation of align, handling also gap extensions.
""" """
......
...@@ -3,6 +3,8 @@ ...@@ -3,6 +3,8 @@
import unittest import unittest
import urllib.request import urllib.request
import os import os
import sys
import re
import zipfile import zipfile
import warnings import warnings
import timeit import timeit
...@@ -15,6 +17,8 @@ import matplotlib.pyplot ...@@ -15,6 +17,8 @@ import matplotlib.pyplot
import Bio.SeqIO import Bio.SeqIO
import Bio.pairwise2 import Bio.pairwise2
import Bio.Entrez import Bio.Entrez
import Bio.PDB
import Bio.PDB.mmtf
from Bio.SeqRecord import SeqRecord from Bio.SeqRecord import SeqRecord
from Bio.SubsMat.MatrixInfo import blosum62 from Bio.SubsMat.MatrixInfo import blosum62
...@@ -25,6 +29,8 @@ class BalibaseTestCase(unittest.TestCase): ...@@ -25,6 +29,8 @@ class BalibaseTestCase(unittest.TestCase):
def setUp(self): def setUp(self):
balibase_zippath = "balibase.zip" balibase_zippath = "balibase.zip"
self.balibase_path = "balibase" self.balibase_path = "balibase"
self.exclude_fastas = ["BBS11037.fasta"] # This one does not contain correct
# pdb codes as ids
testfile_path = os.path.join( testfile_path = os.path.join(
self.balibase_path, self.balibase_path,
"RV11.unaligned", "BBS11001.fasta") "RV11.unaligned", "BBS11001.fasta")
...@@ -40,18 +46,63 @@ class BalibaseTestCase(unittest.TestCase): ...@@ -40,18 +46,63 @@ class BalibaseTestCase(unittest.TestCase):
with zipfile.ZipFile(balibase_zippath) as balibase_zip: with zipfile.ZipFile(balibase_zippath) as balibase_zip:
balibase_zip.extractall() balibase_zip.extractall()
def get_dataset_heads(self): def get_dataset_heads(self, full=False):
"""Generator function to iterate over the first two sequences of each """Generator function to iterate over the first two sequences of each
unaligned fasta file. unaligned fasta file.
Arguments:
full (bool): also yield the original sequences ("BB" files) and not
only the blocks of interest ("BBS")
""" """
dataset_dir = os.path.join(self.balibase_path, "RV11.unaligned") dataset_dir = os.path.join(self.balibase_path, "RV11.unaligned")
for filename in os.listdir(dataset_dir): for filename in os.listdir(dataset_dir):
records = Bio.SeqIO.parse( if filename.startswith("BBS"):
os.path.join(dataset_dir, filename), records = Bio.SeqIO.parse(
"fasta") os.path.join(dataset_dir, filename),
seq1 = next(records) "fasta")
seq2 = next(records) seq1 = next(records)
yield seq1, seq2, filename seq2 = next(records)
if not full:
yield seq1, seq2, filename
else:
orecords = Bio.SeqIO.parse(
os.path.join(dataset_dir, "BB" + filename[3:]),
"fasta")
oseq1 = next(orecords)
oseq2 = next(orecords)
yield seq1, seq2, oseq1, oseq2, filename
def get_datasets_heads_with_struct(self):
"""Generator function to iterate over the first sequences of each
unaligned fasta file, and join the PDB to it
Returns:
Generator of (seq1, seq2, orig_seq1, orig_seq2, struct1, struct2,
name) lists, seq_ are the sequences of interest, orig_seq_ the full
sequences from which they were extracted, struct_ the PDB objects,
name the filename of the original fasta on disk (for debugging
mostly)
"""
parser = Bio.PDB.MMCIFParser()
for s1, s2, os1, os2, name in self.get_dataset_heads(full=True):
if name in self.exclude_fastas:
warnings.warn("Exluding file "+name+" according to exclusion list")
continue
item = [s1, s2, os1, os2]
for s in s1, s2:
fname = Bio.PDB.PDBList().retrieve_pdb_file(
s.id[0:4],
file_format="mmCif",
pdir="data"
)
with warnings.catch_warnings():
warnings.simplefilter("ignore")
struct = parser.get_structure(s.id[0:4], fname)
print(fname, s.id[0:4])
item.append(struct)
item.append(name)
yield item
class AlignmentSeqTestCase(BalibaseTestCase): class AlignmentSeqTestCase(BalibaseTestCase):
...@@ -178,7 +229,24 @@ class AlignmentSeqTestCase(BalibaseTestCase): ...@@ -178,7 +229,24 @@ class AlignmentSeqTestCase(BalibaseTestCase):
print(ex_r1) print(ex_r1)
print(ex_r2) print(ex_r2)
raise raise
def test_align_dihedrals(self):
for seq1, seq2, oseq1, oseq2, struct1, struct2, name in self.get_datasets_heads_with_struct():
skip = False
for seq, struct in ((oseq1, struct1), (oseq2, struct2)):
try:
chain = next(c for c in struct.get_chains() if
str(Bio.PDB.Polypeptide.Polypeptide(c).get_sequence()).startswith(str(seq.seq))
)
except StopIteration:
warnings.warn("No suitable chain found for seq id "+seq.id+" in structure "+struct.id+", skipping file.")
skip = True
break
polyp = Bio.PDB.Polypeptide.Polypeptide(chain)
if skip:
continue
# ============================================================================= # =============================================================================
# def test_multiple_align(self): # def test_multiple_align(self):
# """Tests the multiple_align function (using blosum and gap extension).""" # """Tests the multiple_align function (using blosum and gap extension)."""
......
Source diff could not be displayed: it is too large. Options to address this: view the blob.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment