#!/usr/bin/env python
# coding: utf-8

# # Biopython
# 
# ![biopython logo](http://biopython.org/assets/images/biopython_logo_white.png)
# 
# ## A quick overview
# ### [Sander Bollen](mailto://a.h.b.bollen@lumc.nl)

# # What is Biopython?
# 
# ## 'Python Tools for Computational Molecular Biology'
# 
# - Fully open-source
# - Actively developed
# - Large community

# # What can it do?
# 
# Modules, classes and functions for manipulating biological data
# 
# - File parsers and writers.
#   - Sequence files: fasta, fastq, genbank, abi, sff, etc.
#   - Alignment files: clustal, emboss, phylip, nexus, etc.
#   - Sequence search outputs: BLAST, HMMER, BLAT, etc.
#   - Phylogenetic trees: newick, nexus, phyloxml, etc.
#   - Sequence motifs: AlignAce, TRANSFAC, etc.
#   - Others: PDB files, etc.
# - Access to remote resources (e.g., Entrez, NCBI BLAST).
# - Application wrappers.
# - A simple graphing tool.
# - Simple algorithms (e.g., pairwise alignment, cluster analysis).
# - References such as codon tables and IUPAC sequences.

# # Where can I find more information?
# 
# - [Biopython Homepage](http://biopython.org/)
# - [Biopython development repository](http://github.com/biopython/biopython)
# - [Biopython mailing list](http://lists.open-bio.org/pipermail/biopython/)
# - [Biopython 'cookbook'](http://biopython.org/DIST/docs/tutorial/Tutorial.html) (essential reading!)

# # Manipulating sequence data
# 
# ## Seq and SeqRecord objects
# 
# `Seq` and `SeqRecord` objects are the basis of all sequence manipulation in Biopython. 
# 
# * `Seq` is a raw sequence with an alphabet (e.g. DNA or RNA).
# * `SeqRecord` is a sequence with metadata (e.g. names, ids, etc). This contains a `Seq` object. 
# 

# In[1]:


from Bio.Seq import Seq
from Bio.Alphabet import generic_dna

# create a sequence, and store it in a variable

my_sequence = Seq("ATGGCCCTGTGGATGCGCCTCCTGCCCCTG", generic_dna)
print(my_sequence)


# There are lots of built in methods that can be used to manipulate the sequence
# 
# The sequence acts like a string in many ways
# 

# In[2]:


# get the length of a sequence
print("length: {0}".format(len(my_sequence)))


# In[3]:


# slice and dice
print(my_sequence[:10])


# In[4]:


# change the case
print(my_sequence.lower())


# In[5]:


# concatenate the first and last 10 nucleotides
print(my_sequence[:10] + my_sequence[-10:])


# But also has more sequence-specific methods

# In[6]:


# complement
print(my_sequence.complement())


# In[7]:


# reverse complement
print(my_sequence.reverse_complement())


# In[8]:


# transcribe from DNA to RNA
rna = my_sequence.transcribe()
print(rna)


# In[9]:


# Translate from nucleotide to protein
protein = my_sequence.translate()
print(protein)


# # Manipulating Sequence Data
# 
# ## Bio.SeqIO
# 
# Input and output of sequence files.
# 
# - `SeqIO.read`
#   - Read a file containing a single sequence
# - `SeqIO.parse`
#   - Iterate over all sequences in a sequence file
# - `SeqIO.write`
#   - write sequences to a file

# In[10]:


from Bio import SeqIO

# read the first sequence
# returns SeqRecord objects
for record in SeqIO.parse("../data/records.fa", "fasta"):
    dna = record
    break

print(dna)


# Each record is an object with several fields, including:
# 
# - `record.id`
#   - the sequence id
# - `record.name`
#   - sequence name, usually the same as the id
# - `record.description`
#   - sequence description
# 
# The actual sequence is a separate object contained within the record which can be accessed using record.seq
# 
# The sequence has an 'alphabet' associated with it which defines which letters are allowed.
# 
# Different alphabets are used for DNA, RNA, protein etc.

# In[11]:


print(dna.seq)


# In[12]:


# we can then do our sequence manipulations on the `.seq` attribute of the record

print(dna.seq.reverse_complement())


# Sequence records can easily be written to a file.
# 
# Specifying the file type allows conversion between different formats.
# 
# For example, to convert from a fastq file to fasta format:

# In[13]:


records = SeqIO.parse("../data/easy.fastq", "fastq")
SeqIO.write(records, "tmp.fasta", "fasta")


# ## Sequence alignment
# 
# It is possible align sequences using biopython with various methods.
# Some of these depend on external tools (e.g. `clustalw`), but simple pair-wise alignment is supported out of the box.
# 

# In[14]:


from Bio.pairwise2 import align, format_alignment

# load fasta with insulin for several species as a handle
ins_handle = SeqIO.parse("../data/ins.fa", "fasta")


# In[15]:


# make a list of records 
ins_records = []
for item in ins_handle:
    ins_records.append(item)


# In[16]:


# extract a little of the sequence for human and chimp

human_ins_bit = ins_records[0][-45:]
chimp_ins_bit = ins_records[1][-45:]


# In[17]:


# get the alignments with smith-waterman, without gap penalties
alignments = align.localxx(human_ins_bit.seq, chimp_ins_bit.seq)


# print the best alignment 
best = alignments[0]
print(format_alignment(*best))


# In[18]:


# get alignments with specified scores and penalties:
# 2 for match, 4 for mismatch, 
#-2 for gap open, -0.5 for gap extend
gap_alignments = align.localms(human_ins_bit.seq, 
                               chimp_ins_bit.seq, 
                               2, -4, -2, -0.5)

print(format_alignment(*gap_alignments[0]))


# # Motis
# 
# We can also get a consensus sequence given multiple sequences, and visualize the result. 
# 

# In[19]:


motif_handle = SeqIO.parse("../data/motif.fa", "fasta")
motif_records = []
for item in motif_handle:
    motif_records.append(item.seq[:9].upper())


# In[20]:


from Bio import motifs

# create a motif object
motif = motifs.create(motif_records)


# In[21]:


print(motif.counts)


# In[22]:


print(motif.consensus)


# In[23]:


motif.weblogo("tmp.svg", format="SVG")


# ![ll](tmp.svg)

# # Remote files
# 
# NCBI allow for remote querying of their Entrez database, and Biopython allows us to use their services from within python.
# 
# We can use the Entrez.efetch utility to retrieve various records from one of NCBI's databases.
# 
# A full list of these services and their documentation can be found on the [Entrez utilities help page](https://www.ncbi.nlm.nih.gov/books/NBK25500/)

# In[24]:


from Bio import Entrez


# IMPORTANT:
# 
# To monitor potential excessive use of their services, NCBI requests you to specify your email address with each request.
# 
# With Biopython, you can set it once for your session like this:

# In[25]:


Entrez.email = 'python@lumc.nl'


# Now we can make a query of the database.
# 
# The Entrez.efetch function returns a file-like handle that instead of pointing to a local file, points to a remote resource.

# In[26]:


efetch_handle = Entrez.efetch(db="nucleotide", id="NM_005804",
                              rettype="gb", retmode="text")


# We can use the handle as if it were a normal file handle opened with ```open("filename", "r")```, and read from it using SeqIO.read()

# In[27]:


ncbi_record = SeqIO.read(efetch_handle, 'genbank')

print(ncbi_record)


# It is also possible to query for multiple records

# In[28]:


efetch_handle = Entrez.efetch(db="nucleotide", id=["NM_005804","NM_000967"],
                              rettype="gb", retmode="text")


# Which can then be iterated over using ```SeqIO.parse```

# In[29]:


for record in SeqIO.parse(efetch_handle, 'genbank'):
    print(record.id, record.description)


# # Remote Tools
# 
# It is possible to use Biopyton with remote tools.
# 
# For example, we can submit a BLAST search to the NCBI service. ([Documentation here](https://www.ncbi.nlm.nih.gov/BLAST/Doc/urlapi.html))
# 
# We will use qblast function in the Bio.Blast.NCBIWWW module to perform a BLAST search using the record we retrieved earlier.
# 
# NOTE: It can take some time for the search results to become available

# In[30]:


from Bio.Blast.NCBIWWW import qblast
blast_handle = qblast('blastn', 'nt', ncbi_record.seq)

print(ncbi_record.seq)


# We can the read from the file handle using the ```Bio.SearchIO``` module.

# In[31]:


from Bio import SearchIO
qresult = SearchIO.read(blast_handle, 'blast-xml')
print(qresult)


# In[32]:


print(qresult[0])


# In[33]:


print(qresult[1])


# # That was just an overview
# 
# This lesson was just a small taste of what can be done with Biopython.
# 
# I strongly recommend looking at the [Biopython 'cookbook'](http://biopython.org/DIST/docs/tutorial/Tutorial.html) to get an idea of the wide range of things that you can do with it.

# The lesson was based on previous material by [Guy Allard](mailto://w.g.allard@lumc.nl), [Wibowo Arindrarto](mailto://w.arindrarto@lumc.nl) and Martijn Vermaat.
# 
# License: [Creative Commons Attribution 3.0 License (CC-by)](http://creativecommons.org/licenses/by/3.0)

# In[ ]: