# -*- coding: utf-8 -*- """ Created on Tue Jan 22 15:39:51 2019 @author: Amaury Leroy """ import numpy as np import re from Bio.ExPASy import Prosite #help(Prosite) import urllib.request urllib.request.urlretrieve("ftp://ftp.expasy.org/databases/prosite/prosite.dat", "prosite.dat") handle = open("prosite.dat") records = Prosite.parse(handle) #for record in records: for record in (r for r in records if "P-loop" in r.description): print(record.accession) print(record.pattern) handle.close() #Questions 3/4/5 import subprocess import Bio.SeqIO def download_orf_code(): urllib.request.urlretrieve("http://downloads.yeastgenome.org/sequence/S288C_reference/orf_dna/orf_coding_all.fasta.gz", "orf.fasta.gz") subprocess.call(["gunzip", "-k", "orf.fasta.gz"]) return "orf.fasta.gz" def read_fasta(filename): file = open(filename, "r") records = [seqrec for seqrec in Bio.SeqIO.parse(file, "fasta")] print(len(records)) file.close() filename = download_orf_code() filename = "orf.fasta" read_fasta(filename) import gzip with gzip.open("orf_coding_all.fasta.gz", "rt") as handle: sequences= list(Bio.SeqIO.parse(handle, "fasta") ) print(sum(len(r) for r in Bio.SeqIO.parse(handle, "fasta"))) seq_trad=[] for i in range(len(sequences)): seq_trad.append(sequences[i].seq.translate(table="Yeast Mitochondrial")) with gzip.open("orf_trans_all.fasta.gz", "rt") as handle: seq_trad_true= list(Bio.SeqIO.parse(handle, "fasta") ) compteur = 0 index_true=[] detail=[] #Nombre d'erreurs de tradu par séquence for i in range(len(seq_trad)) : if str(seq_trad[i])!=str(seq_trad_true[i].seq): compteur+=1 else: index_true.append(i) compt=0 for j in range(len(seq_trad[i])): if seq_trad[i][j] != seq_trad_true[i].seq[j]: compt+=1 detail.append(compt) print(compteur, " erreur(s) de traduction") print(index_true, ": bons indices") #Question 6. Il manque le stockage et l'histogramme horizontal regex = "[AG]-x(4)-G-K-[ST]".translate({ord(k):v for k,v in {"-":"", "(":"{", ")":"}","x":"."}.items()}) creg = re.compile(regex) #re.search(creg, "MSQQVGNSIRRKLVIVGDGACGKTCLLIVFSK") re.search(creg, "MSQQVGNSIRRKLVIVGDGACGKTCLLIVFSKMSQQVGNSIRRKLVIVGDGACGKTCLLIVFSK") #re.search(creg, str(seq_trad_true[0].seq)) n=0 file = open("ploops_proteins.txt", "w") ploop_indices=set() for index,sequence in enumerate(seq_trad_true): if re.search(creg, str(sequence.seq)): n+=1 file.write(sequence.id+ " longueur: " +str(len(sequence.seq))+ "\n") ploop_indices.add(index) file.close() print("nombre de protéines à P-loop: ", n) from collections import defaultdict histo_genome = defaultdict(int) histo_ploop = defaultdict(int) nb_aa_genome=0 nb_aa_ploop=0 for i in range(len(seq_trad_true)): for c in seq_trad_true[i].seq: nb_aa_genome+=1 histo_genome[c] += 1 if i in ploop_indices: nb_aa_ploop+=1 histo_ploop[c] +=1 for c,n in histo_genome.items(): histo_genome[c]=float(n)/nb_aa_genome*100 for c,n in histo_ploop.items(): histo_ploop[c]=float(n)/nb_aa_ploop*100 file = open("histogram.txt", "w") for c,p in histo_genome.items(): file.write(c+" ploops | "+ "p"*int(histo_ploop[c]*5)+ " {:.3}% \n".format(histo_ploop[c])) file.write(c+" genome | "+ "*"*int(p*5)+ " {:.3}% \n".format(p)) file.close() import matplotlib.pyplot as plt aa=[] prop_g=[] prop_ploop=[] for c,p in histo_genome.items(): aa.append(c) prop_g.append(p) prop_ploop.append(histo_ploop[c]) # data to plot n_groups = len(aa) # create plot fig, ax = plt.subplots() index = np.arange(n_groups) bar_width = 0.45 opacity = 0.8 rects1 = plt.bar(index, prop_g, bar_width, alpha=opacity, color='b', label='genome') rects2 = plt.bar(index + bar_width, prop_ploop, bar_width, alpha=opacity, color='g', label='ploop') plt.xlabel('Acides Aminés') plt.ylabel('Proportions') plt.xticks(index + bar_width, aa) plt.legend() plt.tight_layout() plt.show() ## Partie 1.4 import Bio.Restriction as res unrestricted = [] for i in ploop_indices: rec = seq_trad_true[i] no_site = [] if res.EcoRI.site not in rec.seq: no_site.append("EcoRI") if res.XhoI.site not in rec.seq: no_site.append("XhoI") if res.TaqI.site not in rec.seq: no_site.append("TaqI") if no_site: unrestricted.append((rec.id, no_site)) import os #os.mkdir("cartes_restrictions") for sequence in sequences: ecorisite=[] i = sequence.seq.find(res.EcoRI.site) while i>-1: ecorisite.append(i) i = sequence.seq.find(res.EcoRI.site,i+1) xhoi=[] j = sequence.seq.find(res.XhoI.site) while j>-1: xhoi.append(j) j = sequence.seq.find(res.XhoI.site,j+1) taqi=[] k = sequence.seq.find(res.TaqI.site) while k>-1: taqi.append(k) k = sequence.seq.find(res.TaqI.site,k+1) liste= [(indice,"EcoRI") for indice in ecorisite]+[(indice,"XhoI") for indice in xhoi]+[(indice,"TaqI") for indice in taqi] liste.sort(key=lambda x: x[0]) if liste: file= open("cartes_restrictions/cr_"+sequence.id+".txt","w") file.write(sequence.id + " \n") for element in liste: file.write(str(element[0])+ " : "+ element[1] + " \n" ) file.close() import Bio.PDB as pdb pdbl = pdb.PDBList() pdbl.retrieve_pdb_file("2GAA", pdir="data") parser = pdb.MMCIFParser() structure=parser.get_structure("2GAA", "data/2gaa.cif") print("Chains: ", len(list(structure.get_chains()))) print("Residues: ", len(list(structure.get_residues()))) print("Atoms: ", len(list(structure.get_atoms()))) import requests url="http://www.ebi.ac.uk/pdbe-srv/view/entry" def pdb_to_uniprot(pdb_id): pdb_mapping_response = requests.get(url, params={'query':pdb_id})