#!/usr/bin/env python # coding: utf-8 # # Biopython # # ![biopython logo](http://biopython.org/assets/images/biopython_logo_white.png) # # ## A quick overview # ### [Sander Bollen](mailto://a.h.b.bollen@lumc.nl) # # What is Biopython? # # ## 'Python Tools for Computational Molecular Biology' # # - Fully open-source # - Actively developed # - Large community # # What can it do? # # Modules, classes and functions for manipulating biological data # # - File parsers and writers. # - Sequence files: fasta, fastq, genbank, abi, sff, etc. # - Alignment files: clustal, emboss, phylip, nexus, etc. # - Sequence search outputs: BLAST, HMMER, BLAT, etc. # - Phylogenetic trees: newick, nexus, phyloxml, etc. # - Sequence motifs: AlignAce, TRANSFAC, etc. # - Others: PDB files, etc. # - Access to remote resources (e.g., Entrez, NCBI BLAST). # - Application wrappers. # - A simple graphing tool. # - Simple algorithms (e.g., pairwise alignment, cluster analysis). # - References such as codon tables and IUPAC sequences. # # Where can I find more information? # # - [Biopython Homepage](http://biopython.org/) # - [Biopython development repository](http://github.com/biopython/biopython) # - [Biopython mailing list](http://lists.open-bio.org/pipermail/biopython/) # - [Biopython 'cookbook'](http://biopython.org/DIST/docs/tutorial/Tutorial.html) (essential reading!) # # Manipulating sequence data # # ## Seq and SeqRecord objects # # `Seq` and `SeqRecord` objects are the basis of all sequence manipulation in Biopython. # # * `Seq` is a raw sequence with an alphabet (e.g. DNA or RNA). # * `SeqRecord` is a sequence with metadata (e.g. names, ids, etc). This contains a `Seq` object. # # In[1]: from Bio.Seq import Seq from Bio.Alphabet import generic_dna # create a sequence, and store it in a variable my_sequence = Seq("ATGGCCCTGTGGATGCGCCTCCTGCCCCTG", generic_dna) print(my_sequence) # There are lots of built in methods that can be used to manipulate the sequence # # The sequence acts like a string in many ways # # In[2]: # get the length of a sequence print("length: {0}".format(len(my_sequence))) # In[3]: # slice and dice print(my_sequence[:10]) # In[4]: # change the case print(my_sequence.lower()) # In[5]: # concatenate the first and last 10 nucleotides print(my_sequence[:10] + my_sequence[-10:]) # But also has more sequence-specific methods # In[6]: # complement print(my_sequence.complement()) # In[7]: # reverse complement print(my_sequence.reverse_complement()) # In[8]: # transcribe from DNA to RNA rna = my_sequence.transcribe() print(rna) # In[9]: # Translate from nucleotide to protein protein = my_sequence.translate() print(protein) # # Manipulating Sequence Data # # ## Bio.SeqIO # # Input and output of sequence files. # # - `SeqIO.read` # - Read a file containing a single sequence # - `SeqIO.parse` # - Iterate over all sequences in a sequence file # - `SeqIO.write` # - write sequences to a file # In[10]: from Bio import SeqIO # read the first sequence # returns SeqRecord objects for record in SeqIO.parse("../data/records.fa", "fasta"): dna = record break print(dna) # Each record is an object with several fields, including: # # - `record.id` # - the sequence id # - `record.name` # - sequence name, usually the same as the id # - `record.description` # - sequence description # # The actual sequence is a separate object contained within the record which can be accessed using record.seq # # The sequence has an 'alphabet' associated with it which defines which letters are allowed. # # Different alphabets are used for DNA, RNA, protein etc. # In[11]: print(dna.seq) # In[12]: # we can then do our sequence manipulations on the `.seq` attribute of the record print(dna.seq.reverse_complement()) # Sequence records can easily be written to a file. # # Specifying the file type allows conversion between different formats. # # For example, to convert from a fastq file to fasta format: # In[13]: records = SeqIO.parse("../data/easy.fastq", "fastq") SeqIO.write(records, "tmp.fasta", "fasta") # ## Sequence alignment # # It is possible align sequences using biopython with various methods. # Some of these depend on external tools (e.g. `clustalw`), but simple pair-wise alignment is supported out of the box. # # In[14]: from Bio.pairwise2 import align, format_alignment # load fasta with insulin for several species as a handle ins_handle = SeqIO.parse("../data/ins.fa", "fasta") # In[15]: # make a list of records ins_records = [] for item in ins_handle: ins_records.append(item) # In[16]: # extract a little of the sequence for human and chimp human_ins_bit = ins_records[0][-45:] chimp_ins_bit = ins_records[1][-45:] # In[17]: # get the alignments with smith-waterman, without gap penalties alignments = align.localxx(human_ins_bit.seq, chimp_ins_bit.seq) # print the best alignment best = alignments[0] print(format_alignment(*best)) # In[18]: # get alignments with specified scores and penalties: # 2 for match, 4 for mismatch, #-2 for gap open, -0.5 for gap extend gap_alignments = align.localms(human_ins_bit.seq, chimp_ins_bit.seq, 2, -4, -2, -0.5) print(format_alignment(*gap_alignments[0])) # # Motis # # We can also get a consensus sequence given multiple sequences, and visualize the result. # # In[19]: motif_handle = SeqIO.parse("../data/motif.fa", "fasta") motif_records = [] for item in motif_handle: motif_records.append(item.seq[:9].upper()) # In[20]: from Bio import motifs # create a motif object motif = motifs.create(motif_records) # In[21]: print(motif.counts) # In[22]: print(motif.consensus) # In[23]: motif.weblogo("tmp.svg", format="SVG") # ![ll](tmp.svg) # # Remote files # # NCBI allow for remote querying of their Entrez database, and Biopython allows us to use their services from within python. # # We can use the Entrez.efetch utility to retrieve various records from one of NCBI's databases. # # A full list of these services and their documentation can be found on the [Entrez utilities help page](https://www.ncbi.nlm.nih.gov/books/NBK25500/) # In[24]: from Bio import Entrez # IMPORTANT: # # To monitor potential excessive use of their services, NCBI requests you to specify your email address with each request. # # With Biopython, you can set it once for your session like this: # In[25]: Entrez.email = 'python@lumc.nl' # Now we can make a query of the database. # # The Entrez.efetch function returns a file-like handle that instead of pointing to a local file, points to a remote resource. # In[26]: efetch_handle = Entrez.efetch(db="nucleotide", id="NM_005804", rettype="gb", retmode="text") # We can use the handle as if it were a normal file handle opened with ```open("filename", "r")```, and read from it using SeqIO.read() # In[27]: ncbi_record = SeqIO.read(efetch_handle, 'genbank') print(ncbi_record) # It is also possible to query for multiple records # In[28]: efetch_handle = Entrez.efetch(db="nucleotide", id=["NM_005804","NM_000967"], rettype="gb", retmode="text") # Which can then be iterated over using ```SeqIO.parse``` # In[29]: for record in SeqIO.parse(efetch_handle, 'genbank'): print(record.id, record.description) # # Remote Tools # # It is possible to use Biopyton with remote tools. # # For example, we can submit a BLAST search to the NCBI service. ([Documentation here](https://www.ncbi.nlm.nih.gov/BLAST/Doc/urlapi.html)) # # We will use qblast function in the Bio.Blast.NCBIWWW module to perform a BLAST search using the record we retrieved earlier. # # NOTE: It can take some time for the search results to become available # In[30]: from Bio.Blast.NCBIWWW import qblast blast_handle = qblast('blastn', 'nt', ncbi_record.seq) print(ncbi_record.seq) # We can the read from the file handle using the ```Bio.SearchIO``` module. # In[31]: from Bio import SearchIO qresult = SearchIO.read(blast_handle, 'blast-xml') print(qresult) # In[32]: print(qresult[0]) # In[33]: print(qresult[1]) # # That was just an overview # # This lesson was just a small taste of what can be done with Biopython. # # I strongly recommend looking at the [Biopython 'cookbook'](http://biopython.org/DIST/docs/tutorial/Tutorial.html) to get an idea of the wide range of things that you can do with it. # The lesson was based on previous material by [Guy Allard](mailto://w.g.allard@lumc.nl), [Wibowo Arindrarto](mailto://w.arindrarto@lumc.nl) and Martijn Vermaat. # # License: [Creative Commons Attribution 3.0 License (CC-by)](http://creativecommons.org/licenses/by/3.0) # In[ ]: