Benchmark for ClustalO

1b346096 · Etienne MORICE · c0e1e97e · 1b346096 · 1b346096 · 1b346096
Verified Commit 1b346096 authored 6 years ago by Etienne MORICE
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@ timings.png
 balibase.zip
 *.fasta
 data/
+balibase_results/
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -8,6 +8,9 @@ cache:
 test:
        script:
                - python3 -m unittest alignementseq_tests test_alignementseq_multiple
+        artifacts:
+                paths:
+                        - balibase_results/

 test_performance:
        script:

--- a/alignementseq_tests.py
+++ b/alignementseq_tests.py
@@ -30,7 +30,7 @@ import os
 import sys
 import io

-spec, m = None, None
+bali_spec, bali_m = None, None

 def bali_score(ref_fasta_path, test_fasta_path):
    """Portable wrapper for the bali_score.py script.
@@ -39,16 +39,16 @@ def bali_score(ref_fasta_path, test_fasta_path):
    Reproduces the results of the bali_score C program.
    """
    path = os.path.join("balibase", "bali_score.py")
-    global spec, m
-    if spec is None:
-        spec = importlib.util.spec_from_file_location("baliscore", path)
-        m = importlib.util.module_from_spec(spec)
+    global bali_spec, bali_m
+    if bali_spec is None:
+        bali_spec = importlib.util.spec_from_file_location("baliscore", path)
+        bali_m = importlib.util.module_from_spec(bali_spec)
    tmp_argv = sys.argv
    tmp_stdout = sys.stdout
    buf = sys.stdout = io.StringIO()
    sys.argv = [path, ref_fasta_path, test_fasta_path]
    try:
-        spec.loader.exec_module(m)
+        bali_spec.loader.exec_module(bali_m)
    finally:
        sys.argv = tmp_argv
        sys.stdout = tmp_stdout
@@ -162,8 +162,6 @@ class BalibaseTestCase(unittest.TestCase):
            item.append(name)
            yield item

-class AlignmentSeqTestCase(BalibaseTestCase):
-    
    def get_dataset_records(self):
        """Generator function to iterate over the record generator  of each
        unaligned fasta file.
@@ -175,6 +173,8 @@ class AlignmentSeqTestCase(BalibaseTestCase):
                    "fasta")
            yield records, filename

+class AlignmentSeqTestCase(BalibaseTestCase):
+
    def assertSameResidues(self, str1, str2):
        """Strip strings of their '-' before comparing them
        """
@@ -288,9 +288,13 @@ class AlignmentSeqTestCase(BalibaseTestCase):
                print(ex_r2)
                raise
 
-    def test_benchmark_multiple_align(self):
-        """Tests the multiple_align function (using blosum and gap extension)."""
-        from Bio import AlignIO
+    def create_clustalo_alignments(self):
+        """
+        Submit alignment jobs to the EMBL pipeline to get reference ClustalO
+        alignments.
+
+        Rate limiting is applied, typically 5 jobs/second.
+        """
        import clustalo as cl
     
        dataset_dir = os.path.join(self.balibase_path, "RV11.unaligned")
@@ -315,6 +319,29 @@ class AlignmentSeqTestCase(BalibaseTestCase):
            jobs.append(j)
        for j in jobs:
            j.join()
+
+    def test_benchmark_multiple_align(self):
+        """Scores the ClustalO alignments against the balibase references."""
+
+        res_dir = "balibase_results"
+        if not os.path.isdir(res_dir):
+            os.mkdir(res_dir)
+
+        # Cached
+        self.create_clustalo_alignments()
+         
+        ref_dataset_dir = os.path.join(self.balibase_path, "RV11.aligned")
+        clustalo_dir = os.path.join(self.balibase_path, "clustalo")
+
+        scores = []
+        for filename in os.listdir(ref_dataset_dir):
+            scores.append(bali_score(
+                    os.path.join(ref_dataset_dir, filename),
+                    os.path.join(clustalo_dir, filename)
+                    ))
+        scores = pd.DataFrame(scores)
+        scores.to_csv(os.path.join(res_dir, "clustalo.csv"))
+
 
    def test_align_dihedrals(self):
        """Iterates over the balibase dataset, match the chains from the mmcif