Trying to match structural information to balibase, many problems

61a442bf · Etienne MORICE · ee758231 · 61a442bf · 61a442bf · 61a442bf
Verified Commit 61a442bf authored 6 years ago by Etienne MORICE
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@ __pycache__/
 balibase.zip
 atlas
 timings.png
+data/
--- a/alignementseq.py
+++ b/alignementseq.py
@@ -217,6 +217,20 @@ def vec_align(seq1, seq2, mat=dict_to_weight(Bio.SubsMat.MatrixInfo.blosum62), g
    aseq2.seq = Seq(aly[::-1])
    return S[n+m][0], aseq1, aseq2
+def align_dihedrals(seq1, seq2, struct1, struct2):
+    """
+    Align two sequences, by using a simple cost function using dihedral angles
+    as well as amino acids names.
+    Arguments:
+        seq1: First sequence as SeqRecord
+        seq2:
+        struct1: First structure as Bio.PDB object
+        struct2:
+    """
+    print("Not implemented")
 def align2steps(seq1, seq2, d=8, e=4):
    """Main implementation of align, handling also gap extensions.
    """

--- a/alignementseq_tests.py
+++ b/alignementseq_tests.py
@@ -3,6 +3,8 @@
 import unittest
 import urllib.request
 import os
+import sys
+import re
 import zipfile
 import warnings
 import timeit
@@ -15,6 +17,8 @@ import matplotlib.pyplot
 import Bio.SeqIO
 import Bio.pairwise2
 import Bio.Entrez
+import Bio.PDB
+import Bio.PDB.mmtf
 from Bio.SeqRecord import SeqRecord
 from Bio.SubsMat.MatrixInfo import blosum62
@@ -25,6 +29,8 @@ class BalibaseTestCase(unittest.TestCase):
    def setUp(self):
        balibase_zippath = "balibase.zip"
        self.balibase_path = "balibase"
+        self.exclude_fastas = ["BBS11037.fasta"] # This one does not contain correct
+        # pdb codes as ids
        testfile_path = os.path.join(
            self.balibase_path, 
            "RV11.unaligned", "BBS11001.fasta")
@@ -40,18 +46,63 @@ class BalibaseTestCase(unittest.TestCase):
            with zipfile.ZipFile(balibase_zippath) as balibase_zip:
                balibase_zip.extractall()
-    def get_dataset_heads(self):
+    def get_dataset_heads(self, full=False):
        """Generator function to iterate over the first two sequences of each
        unaligned fasta file.
+        Arguments:
+            full (bool): also yield the original sequences ("BB" files) and not
+            only the blocks of interest ("BBS")
        """
        dataset_dir = os.path.join(self.balibase_path, "RV11.unaligned")
        for filename in os.listdir(dataset_dir):
-            records = Bio.SeqIO.parse(
+            if filename.startswith("BBS"):
-                    os.path.join(dataset_dir, filename),
+                records = Bio.SeqIO.parse(
-                    "fasta")
+                        os.path.join(dataset_dir, filename),
-            seq1 = next(records)
+                        "fasta")
-            seq2 = next(records)
+                seq1 = next(records)
-            yield seq1, seq2, filename
+                seq2 = next(records)
+                if not full:
+                    yield seq1, seq2, filename
+                else:
+                    orecords = Bio.SeqIO.parse(
+                            os.path.join(dataset_dir, "BB" + filename[3:]),
+                            "fasta")
+                    oseq1 = next(orecords)
+                    oseq2 = next(orecords)
+                    yield seq1, seq2, oseq1, oseq2, filename
+    def get_datasets_heads_with_struct(self):
+        """Generator function to iterate over the first sequences of each
+        unaligned fasta file, and join the PDB to it
+        Returns:
+            Generator of (seq1, seq2, orig_seq1, orig_seq2, struct1, struct2,
+            name) lists, seq_ are the sequences of interest, orig_seq_ the full
+            sequences from which they were extracted, struct_ the PDB objects,
+            name the filename of the original fasta on disk (for debugging
+            mostly)
+        """
+        parser = Bio.PDB.MMCIFParser()
+        for s1, s2, os1, os2, name in self.get_dataset_heads(full=True):
+            if name in self.exclude_fastas:
+                warnings.warn("Exluding file "+name+" according to exclusion list")
+                continue
+            item = [s1, s2, os1, os2]
+            for s in s1, s2:
+                fname = Bio.PDB.PDBList().retrieve_pdb_file(
+                        s.id[0:4],
+                        file_format="mmCif",
+                        pdir="data"
+                        )
+                with warnings.catch_warnings():
+                    warnings.simplefilter("ignore")
+                    struct = parser.get_structure(s.id[0:4], fname)
+                print(fname, s.id[0:4])
+                item.append(struct)
+            item.append(name)
+            yield item
 class AlignmentSeqTestCase(BalibaseTestCase):
@@ -178,7 +229,24 @@ class AlignmentSeqTestCase(BalibaseTestCase):
                print(ex_r1)
                print(ex_r2)
                raise
+    def test_align_dihedrals(self):
+        for seq1, seq2, oseq1, oseq2, struct1, struct2, name in self.get_datasets_heads_with_struct():
+            skip = False
+            for seq, struct in ((oseq1, struct1), (oseq2, struct2)):
+                try:
+                    chain = next(c for c in struct.get_chains() if
+                            str(Bio.PDB.Polypeptide.Polypeptide(c).get_sequence()).startswith(str(seq.seq))
+                            )
+                except StopIteration:
+                    warnings.warn("No suitable chain found for seq id "+seq.id+" in structure "+struct.id+", skipping file.")
+                    skip = True
+                    break
+                polyp = Bio.PDB.Polypeptide.Polypeptide(chain)
+            if skip:
+                continue
 # =============================================================================
 #     def test_multiple_align(self):
 #         """Tests the multiple_align function (using blosum and gap extension)."""

--- a/data/2gaa.cif
+++ b/data/2gaa.cif