Skip to content
Snippets Groups Projects
Verified Commit 61a442bf authored by Etienne MORICE's avatar Etienne MORICE
Browse files

Trying to match structural information to balibase, many problems

parent ee758231
No related branches found
No related tags found
No related merge requests found
......@@ -4,3 +4,4 @@ __pycache__/
balibase.zip
atlas
timings.png
data/
......@@ -217,6 +217,20 @@ def vec_align(seq1, seq2, mat=dict_to_weight(Bio.SubsMat.MatrixInfo.blosum62), g
aseq2.seq = Seq(aly[::-1])
return S[n+m][0], aseq1, aseq2
def align_dihedrals(seq1, seq2, struct1, struct2):
"""
Align two sequences, by using a simple cost function using dihedral angles
as well as amino acids names.
Arguments:
seq1: First sequence as SeqRecord
seq2:
struct1: First structure as Bio.PDB object
struct2:
"""
print("Not implemented")
def align2steps(seq1, seq2, d=8, e=4):
"""Main implementation of align, handling also gap extensions.
"""
......
......@@ -3,6 +3,8 @@
import unittest
import urllib.request
import os
import sys
import re
import zipfile
import warnings
import timeit
......@@ -15,6 +17,8 @@ import matplotlib.pyplot
import Bio.SeqIO
import Bio.pairwise2
import Bio.Entrez
import Bio.PDB
import Bio.PDB.mmtf
from Bio.SeqRecord import SeqRecord
from Bio.SubsMat.MatrixInfo import blosum62
......@@ -25,6 +29,8 @@ class BalibaseTestCase(unittest.TestCase):
def setUp(self):
balibase_zippath = "balibase.zip"
self.balibase_path = "balibase"
self.exclude_fastas = ["BBS11037.fasta"] # This one does not contain correct
# pdb codes as ids
testfile_path = os.path.join(
self.balibase_path,
"RV11.unaligned", "BBS11001.fasta")
......@@ -40,18 +46,63 @@ class BalibaseTestCase(unittest.TestCase):
with zipfile.ZipFile(balibase_zippath) as balibase_zip:
balibase_zip.extractall()
def get_dataset_heads(self):
def get_dataset_heads(self, full=False):
"""Generator function to iterate over the first two sequences of each
unaligned fasta file.
Arguments:
full (bool): also yield the original sequences ("BB" files) and not
only the blocks of interest ("BBS")
"""
dataset_dir = os.path.join(self.balibase_path, "RV11.unaligned")
for filename in os.listdir(dataset_dir):
records = Bio.SeqIO.parse(
os.path.join(dataset_dir, filename),
"fasta")
seq1 = next(records)
seq2 = next(records)
yield seq1, seq2, filename
if filename.startswith("BBS"):
records = Bio.SeqIO.parse(
os.path.join(dataset_dir, filename),
"fasta")
seq1 = next(records)
seq2 = next(records)
if not full:
yield seq1, seq2, filename
else:
orecords = Bio.SeqIO.parse(
os.path.join(dataset_dir, "BB" + filename[3:]),
"fasta")
oseq1 = next(orecords)
oseq2 = next(orecords)
yield seq1, seq2, oseq1, oseq2, filename
def get_datasets_heads_with_struct(self):
"""Generator function to iterate over the first sequences of each
unaligned fasta file, and join the PDB to it
Returns:
Generator of (seq1, seq2, orig_seq1, orig_seq2, struct1, struct2,
name) lists, seq_ are the sequences of interest, orig_seq_ the full
sequences from which they were extracted, struct_ the PDB objects,
name the filename of the original fasta on disk (for debugging
mostly)
"""
parser = Bio.PDB.MMCIFParser()
for s1, s2, os1, os2, name in self.get_dataset_heads(full=True):
if name in self.exclude_fastas:
warnings.warn("Exluding file "+name+" according to exclusion list")
continue
item = [s1, s2, os1, os2]
for s in s1, s2:
fname = Bio.PDB.PDBList().retrieve_pdb_file(
s.id[0:4],
file_format="mmCif",
pdir="data"
)
with warnings.catch_warnings():
warnings.simplefilter("ignore")
struct = parser.get_structure(s.id[0:4], fname)
print(fname, s.id[0:4])
item.append(struct)
item.append(name)
yield item
class AlignmentSeqTestCase(BalibaseTestCase):
......@@ -178,7 +229,24 @@ class AlignmentSeqTestCase(BalibaseTestCase):
print(ex_r1)
print(ex_r2)
raise
def test_align_dihedrals(self):
for seq1, seq2, oseq1, oseq2, struct1, struct2, name in self.get_datasets_heads_with_struct():
skip = False
for seq, struct in ((oseq1, struct1), (oseq2, struct2)):
try:
chain = next(c for c in struct.get_chains() if
str(Bio.PDB.Polypeptide.Polypeptide(c).get_sequence()).startswith(str(seq.seq))
)
except StopIteration:
warnings.warn("No suitable chain found for seq id "+seq.id+" in structure "+struct.id+", skipping file.")
skip = True
break
polyp = Bio.PDB.Polypeptide.Polypeptide(chain)
if skip:
continue
# =============================================================================
# def test_multiple_align(self):
# """Tests the multiple_align function (using blosum and gap extension)."""
......
Source diff could not be displayed: it is too large. Options to address this: view the blob.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment