Skip to content
Snippets Groups Projects
projet_steyeart.py 5.86 KiB
Newer Older
Amaury LEROY's avatar
Amaury LEROY committed
# -*- coding: utf-8 -*-
"""
Created on Tue Jan 22 15:39:51 2019

@author: Amaury Leroy
"""
Amaury LEROY's avatar
Amaury LEROY committed
import numpy as np
Amaury LEROY's avatar
Amaury LEROY committed
import re
from Bio.ExPASy import Prosite
Amaury LEROY's avatar
Amaury LEROY committed
#help(Prosite)
import urllib.request
urllib.request.urlretrieve("ftp://ftp.expasy.org/databases/prosite/prosite.dat", "prosite.dat")

handle = open("prosite.dat")
records = Prosite.parse(handle)

 #for record in  records:
for record in (r for r in records if "P-loop" in r.description):
     print(record.accession)
     print(record.pattern)

handle.close()
Amaury LEROY's avatar
Amaury LEROY committed
    
#Questions 3/4/5
Amaury LEROY's avatar
Amaury LEROY committed
import subprocess
import Bio.SeqIO

def download_orf_code():
    urllib.request.urlretrieve("http://downloads.yeastgenome.org/sequence/S288C_reference/orf_dna/orf_coding_all.fasta.gz", "orf.fasta.gz")
    subprocess.call(["gunzip", "-k", "orf.fasta.gz"])
    return "orf.fasta.gz"

def read_fasta(filename):
    file = open(filename, "r")
    records = [seqrec for seqrec in Bio.SeqIO.parse(file, "fasta")]
    print(len(records))
    file.close()

filename = download_orf_code()
filename = "orf.fasta"
read_fasta(filename)


Amaury LEROY's avatar
Amaury LEROY committed
import gzip
with gzip.open("orf_coding_all.fasta.gz", "rt") as handle:
Amaury LEROY's avatar
Amaury LEROY committed
    sequences= list(Bio.SeqIO.parse(handle, "fasta") )
    print(sum(len(r) for r in Bio.SeqIO.parse(handle, "fasta")))
Amaury LEROY's avatar
Amaury LEROY committed

seq_trad=[]
for i in range(len(sequences)):
    seq_trad.append(sequences[i].seq.translate(table="Yeast Mitochondrial"))

with gzip.open("orf_trans_all.fasta.gz", "rt") as handle:
Amaury LEROY's avatar
Amaury LEROY committed
    seq_trad_true= list(Bio.SeqIO.parse(handle, "fasta") )
Amaury LEROY's avatar
Amaury LEROY committed

compteur = 0
Amaury LEROY's avatar
Amaury LEROY committed
index_true=[]
detail=[] #Nombre d'erreurs de tradu par séquence
Amaury LEROY's avatar
Amaury LEROY committed
for i in range(len(seq_trad))  :
Amaury LEROY's avatar
Amaury LEROY committed
    
Amaury LEROY's avatar
Amaury LEROY committed
    if str(seq_trad[i])!=str(seq_trad_true[i].seq):
        compteur+=1
Amaury LEROY's avatar
Amaury LEROY committed
    else:
        index_true.append(i)
    compt=0
    for j in range(len(seq_trad[i])):
        if seq_trad[i][j] != seq_trad_true[i].seq[j]:
            compt+=1
    detail.append(compt)
print(compteur, " erreur(s) de traduction") 
print(index_true, ": bons indices")
Amaury LEROY's avatar
Amaury LEROY committed


#Question 6. Il manque le stockage et l'histogramme horizontal
regex = "[AG]-x(4)-G-K-[ST]".translate({ord(k):v for k,v in {"-":"", "(":"{", ")":"}","x":"."}.items()})

creg = re.compile(regex)

Amaury LEROY's avatar
Amaury LEROY committed
#re.search(creg, "MSQQVGNSIRRKLVIVGDGACGKTCLLIVFSK")
re.search(creg, "MSQQVGNSIRRKLVIVGDGACGKTCLLIVFSKMSQQVGNSIRRKLVIVGDGACGKTCLLIVFSK")
#re.search(creg, str(seq_trad_true[0].seq))

n=0
file = open("ploops_proteins.txt", "w")
ploop_indices=set()
for index,sequence in enumerate(seq_trad_true):
    if re.search(creg, str(sequence.seq)):
        n+=1
        file.write(sequence.id+ " longueur: " +str(len(sequence.seq))+ "\n")
        ploop_indices.add(index)


file.close()        
print("nombre de protéines à P-loop: ", n)      

from collections import defaultdict

histo_genome = defaultdict(int)
histo_ploop = defaultdict(int)
nb_aa_genome=0
nb_aa_ploop=0
for i in range(len(seq_trad_true)):
    for c in seq_trad_true[i].seq:
        nb_aa_genome+=1
        histo_genome[c] += 1
        if i in ploop_indices:
            nb_aa_ploop+=1
            histo_ploop[c] +=1

for c,n in histo_genome.items():
    histo_genome[c]=float(n)/nb_aa_genome*100
for c,n in histo_ploop.items():
    histo_ploop[c]=float(n)/nb_aa_ploop*100


file = open("histogram.txt", "w")
for c,p in histo_genome.items():
    file.write(c+" ploops | "+ "p"*int(histo_ploop[c]*5)+ " {:.3}% \n".format(histo_ploop[c]))
    file.write(c+" genome | "+ "*"*int(p*5)+ " {:.3}% \n".format(p))

file.close()

import matplotlib.pyplot as plt

aa=[]
prop_g=[]
prop_ploop=[]
for c,p in histo_genome.items():
    aa.append(c)
    prop_g.append(p)
    prop_ploop.append(histo_ploop[c])
    
# data to plot
n_groups = len(aa)
 
# create plot
fig, ax = plt.subplots()
index = np.arange(n_groups)
bar_width = 0.45
opacity = 0.8
 
rects1 = plt.bar(index, prop_g, bar_width,
                 alpha=opacity,
                 color='b',
                 label='genome')
 
rects2 = plt.bar(index + bar_width, prop_ploop, bar_width,
                 alpha=opacity,
                 color='g',
                 label='ploop')

plt.xlabel('Acides Aminés')
plt.ylabel('Proportions')
plt.xticks(index + bar_width, aa)
plt.legend()

plt.tight_layout()
plt.show()



## Partie 1.4
import Bio.Restriction as res


unrestricted = []
for i in ploop_indices:
    rec = seq_trad_true[i]
    no_site = []
    if res.EcoRI.site not in rec.seq:
        no_site.append("EcoRI")
    if res.XhoI.site not in rec.seq:
        no_site.append("XhoI")
    if res.TaqI.site not in rec.seq:
        no_site.append("TaqI")
    if no_site:
        unrestricted.append((rec.id, no_site))

import os
#os.mkdir("cartes_restrictions")

for sequence in sequences:    
    ecorisite=[]
    i = sequence.seq.find(res.EcoRI.site)
    while i>-1:
        ecorisite.append(i)
        i = sequence.seq.find(res.EcoRI.site,i+1)   
    xhoi=[]
    j = sequence.seq.find(res.XhoI.site)
    while j>-1:
        xhoi.append(j)
        j = sequence.seq.find(res.XhoI.site,j+1)
    taqi=[]
    k = sequence.seq.find(res.TaqI.site)
    while k>-1:
        taqi.append(k)
        k = sequence.seq.find(res.TaqI.site,k+1)
    liste= [(indice,"EcoRI") for indice in ecorisite]+[(indice,"XhoI") for indice in xhoi]+[(indice,"TaqI") for indice in taqi]
    liste.sort(key=lambda x: x[0])
    if liste:  
        file= open("cartes_restrictions/cr_"+sequence.id+".txt","w")
        file.write(sequence.id + " \n")
        for element in liste:     
            file.write(str(element[0])+ " : "+ element[1] + " \n" )
        file.close()
        
import Bio.PDB as pdb
pdbl = pdb.PDBList()
pdbl.retrieve_pdb_file("2GAA", pdir="data")


parser = pdb.MMCIFParser()
structure=parser.get_structure("2GAA", "data/2gaa.cif")

print("Chains: ", len(list(structure.get_chains())))
print("Residues: ", len(list(structure.get_residues())))
print("Atoms: ", len(list(structure.get_atoms())))


import requests

url="http://www.ebi.ac.uk/pdbe-srv/view/entry"
def pdb_to_uniprot(pdb_id):
    pdb_mapping_response = requests.get(url, params={'query':pdb_id})