#!/usr/bin/env python
# coding: utf-8

# # myChEMBL BLAST Tutorial
# 
# ### myChEMBL team, ChEMBL Group, EMBL-EBI.
# 
# This notebook is intended to illustrate the following:
# 
# * How to run a BLAST search and parse the results
# * Creating a basic Druggability Score and linking this score to a BLAST search
# 

# ## How to run a BLAST search and parse the results

# ### What is a BLAST search?
# 
# BLAST, the Basic Local Alignment Search Tool, allows a user to search with either a protein (amino acid based) or nucleotide (DNA or RNA based) sequence and find statistically significant similar sequences within a BLAST sequence database. The finer details of how BLAST works can reviewed at the [NCBI website](http://blast.ncbi.nlm.nih.gov/Blast.cgi) or [Wikipedia](http://en.wikipedia.org/wiki/BLAST).
# 
# ### Does ChEMBL provide any BLAST services?
# 
# Yes. The website allows a user to run a BLAST search against the [protein target sequences](https://www.ebi.ac.uk/chembl/target) or the [biotherapeutic sequences](https://www.ebi.ac.uk/chembl/). Users can also download all of the 'sequence databases' from the [ftpsite](ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/latest/) (look for the .fa.gz downloads) and run BLAST searches locally. Currently the ChEMBL Web Services do not provide this functionality, but this will be added in the future.
# 
# ### How do I run a BLAST search
# 
# If you want to search ChEMBL, you could use the ChEMBL web interface. Alternatively you could run the search locally working through the following steps:
# 
# 1. Download blast software from NCBI
# 2. Download/Create sequence database you wish to search against
# 3. Format sequence database
# 4. Run search
# 
# As you see, it is not too complicated and steps 1-3 have already carried out on this version of myChEMBL. Before we run a search we will first set up some parameters:

# In[1]:


import re

# Input parameters
blast_exe        = '/home/chembl/blast/ncbi-blast-2.2.29+/bin/blastp'
query_file       = '/tmp/test.fa'
eval_threshold   = 0.001
num_descriptions = 5 
num_alignments   = 5
database         = '/home/chembl/blast/chembl/chembl_21.fa'

# Output parameters
results_txt      = '/tmp/test.out'
results_xml      = '/tmp/test.xml'
results_csv      = '/tmp/test.csv'


# Query sequence used throughout this tutorial
# ** Feel free to edit the protein sequence below **
# ** DO NOT INCLUDE WHITESPACES IN SEQUENCE HEADER LINE ** 
# **
query_sequence = '''
>Q96P68_OXGR1_HUMAN
MNEPLDYLANASDFPDYAAAFGNCTDENIPLKMHYLPVIYGIIFLVGFPGNAVVISTYIF
KMRPWKSSTIIMLNLACTDLLYLTSLPFLIHYYASGENWIFGDFMCKFIRFSFHFNLYSS
ILFLTCFSIFRYCVIIHPMSCFSIHKTRCAVVACAVVWIISLVAVIPMTFLITSTNRTNR
SACLDLTSSDELNTIKWYNLILTATTFCLPLVIVTLCYTTIIHTLTHGLQTDSCLKQKAR
RLTILLLLAFYVCFLPFHILRVIRIESRLLSISCSIENQIHEAYIVSRPLAALNTFGNLL
LYVVVSDNFQQAVCSTVRCKVSGNLEQAKKISYSNNP
>Q86XF0_DHFRL1_HUMAN
MFLLLNCIVAVSQNMGIGKNGDLPRPPLRNEFRYFQRMTTTSSVEGKQNLVIMGRKTWFS
IPEKNRPLKDRINLVLSRELKEPPQGAHFLARSLDDALKLTERPELANKVDMIWIVGGSS
VYKEAMNHLGHLKLFVTRIMQDFESDTFFSEIDLEKYKLLPEYPGVLSDVQEGKHIKYKF
EVCEKDD
>Q9UKX5_ITGA11_HUMAN
MDLPRGLVVAWALSLWPGFTDTFNMDTRKPRVIPGSRTAFFGYTVQQHDISGNKWLVVGA
PLETNGYQKTGDVYKCPVIHGNCTKLNLGRVTLSNVSERKDNMRLGLSLATNPKDNSFLA
CSPLWSHECGSSYYTTGMCSRVNSNFRFSKTVAPALQRCQTYMDIVIVLDGSNSIYPWVE
VQHFLINILKKFYIGPGQIQVGVVQYGEDVVHEFHLNDYRSVKDVVEAASHIEQRGGTET
RTAFGIEFARSEAFQKGGRKGAKKVMIVITDGESHDSPDLEKVIQQSERDNVTRYAVAVL
GYYNRRGINPETFLNEIKYIASDPDDKHFFNVTDEAALKDIVDALGDRIFSLEGTNKNET
SFGLEMSQTGFSSHVVEDGVLLGAVGAYDWNGAVLKETSAGKVIPLRESYLKEFPEELKN
HGAYLGYTVTSVVSSRQGRVYVAGAPRFNHTGKVILFTMHNNRSLTIHQAMRGQQIGSYF
GSEITSVDIDGDGVTDVLLVGAPMYFNEGRERGKVYVYELRQNLFVYNGTLKDSHSYQNA
RFGSSIASVRDLNQDSYNDVVVGAPLEDNHAGAIYIFHGFRGSILKTPKQRITASELATG
LQYFGCSIHGQLDLNEDGLIDLAVGALGNAVILWSRPVVQINASLHFEPSKINIFHRDCK
RSGRDATCLAAFLCFTPIFLAPHFQTTTVGIRYNATMDERRYTPRAHLDEGGDRFTNRAV
LLSSGQELCERINFHVLDTADYVKPVTFSVEYSLEDPDHGPMLDDGWPTTLRVSVPFWNG
CNEDEHCVPDLVLDARSDLPTAMEYCQRVLRKPAQDCSAYTLSFDTTVFIIESTRQRVAV
EATLENRGENAYSTVLNISQSANLQFASLIQKEDSDGSIECVNEERRLQKQVCNVSYPFF
RAKAKVAFRLDFEFSKSIFLHHLEIELAAGSDSNERDSTKEDNVAPLRFHLKYEADVLFT
RSSSLSHYEVKPNSSLERYDGIGPPFSCIFRIQNLGLFPIHGMMMKITIPIATRSGNRLL
KLRDFLTDEANTSCNIWGNSTEYRPTPVEEDLRRAPQLNHSNSDVVSINCNIRLVPNQEI
NFHLLGNLWLRSLKALKYKSMKIMVNAALQRQFHSPFIFREEDPSRQIVFEISKQEDWQV
PIWIIVGSTLGGLLLLALLVLALWKLGFFRSARRRREPGLDPTPKVLE
>P06804_TNFA_MOUSE
MSTESMIRDVELAEEALPQKMGGFQNSRRCLCLSLFSFLLVAGATTLFCLLNFGVIGPQR
DEKFPNGLPLISSMAQTLTLRSSSQNSSDKPVAHVVANHQVEEQLEWLSQRANALLANGM
DLKDNQLVVPADGLYLVYSQVLFKGQGCPDYVLLTHTVSRFAISYQEKVNLLSAVKSPCP
KDTPEGAELKPWYEPIYLGGVFQLEKGDQLSAEVNLPKYLDFAESGQVYFGVIAL
>P48050_KCNJ4_HUMAN
MHGHSRNGQAHVPRRKRRNRFVKKNGQCNVYFANLSNKSQRYMADIFTTCVDTRWRYMLM
IFSAAFLVSWLFFGLLFWCIAFFHGDLEASPGVPAAGGPAAGGGGAAPVAPKPCIMHVNG
FLGAFLFSVETQTTIGYGFRCVTEECPLAVIAVVVQSIVGCVIDSFMIGTIMAKMARPKK
RAQTLLFSHHAVISVRDGKLCLMWRVGNLRKSHIVEAHVRAQLIKPYMTQEGEYLPLDQR
DLNVGYDIGLDRIFLVSPIIIVHEIDEDSPLYGMGKEELESEDFEIVVILEGMVEATAMT
TQARSSYLASEILWGHRFEPVVFEEKSHYKVDYSRFHKTYEVAGTPCCSARELQESKITV
LPAPPPPPSAFCYENELALMSQEEEEMEEEAAAAAAVAAGLGLEAGSKEEAGIIRMLEFG
SHLDLERMQASLPLDNISYRRESAI
>Q80Z70_SE1L1_RAT
MQVRVRLLLLLCAVLLGSAAASSDEETNQDESLDSKGALPTDGSVKDHTTGKVVLLARDL
LILKDSEVESLLQDEEESSKSQEEVSVTEDISFLDSPNPSSKTYEELKRVRKPVLTAIEG
TAHGEPCHFPFLFLDKEYDECTSDGREDGRLWCATTYDYKTDEKWGFCETEEDAAKRRQM
QEAEAIYQSGMKILNGSTRKNQKREAYRYLQKAAGMNHTKALERVSYALLFGDYLTQNIQ
AAKEMFEKLTEEGSPKGQTGLGFLYASGLGVNSSQAKALVYYTFGALGGNLIAHMVLGYR
YWAGIGVLQSCESALTHYRLVANHVASDISLTGGSVVQRIRLPDEVENPGMNSGMLEEDL
IQYYQFLAEKGDVQAQVGLGQLHLHGGRGVEQNHQRAFDYFNLAANAGNSHAMAFLGKMY
SEGSDIVPQSNETALHYFKKAADMGNPVGQSGLGMAYLYGRGVQVNYDLALKYFQKAAEQ
GWVDGQLQLGSMYYNGIGVKRDYKQALKYFNLASQGGHILAFYNLAQMHASGTGVMRSCH
TAVELFKNVCERGRWSERLMTAYNSYKDDDYNAAVVQYLLLAEQGYEVAQSNAAFILDQR
EATIVGENETYPRALLHWNRAASQGYTVARIKLGDYHFYGFGTDVDYETAFIHYRLASEQ
QHSAQAMFNLGYMHEKGLGIKQDIHLAKRFYDMAAEASPDAQVPVFLALCKLGVVYFLQY
IREANIRDLFTQLDMDQLLGPEWDLYLMTIIALLLGTVIAYRQRQHQDIPVPRPPGPRPA
PPQQEGPPEQQPPQ
>P33277_GAP1_SCHPO
MTKRHSGTLSSSVLPQTNRLSLLRNRESTSVLYTIDLDMESDVEDAFFHLDRELHDLKQQ
ISSQSKQNFVLERDVRYLDSKIALLIQNRMAQEEQHEFAKRLNDNYNAVKGSFPDDRKLQ
LYGALFFLLQSEPAYIASLVRRVKLFNMDALLQIVMFNIYGNQYESREEHLLLSLFQMVL
TTEFEATSDVLSLLRANTPVSRMLTTYTRRGPGQAYLRSILYQCINDVAIHPDLQLDIHP
LSVYRYLVNTGQLSPSEDDNLLTNEEVSEFPAVKNAIQERSAQLLLLTKRFLDAVLNSID
EIPYGIRWVCKLIRNLTNRLFPSISDSTICSLIGGFFFLRFVNPAIISPQTSMLLDSCPS
DNVRKTLATIAKIIQSVANGTSSTKTHLDVSFQPMLKEYEEKVHNLLRKLGNVGDFFEAL
ELDQYIALSKKSLALEMTVNEIYLTHEIILENLDNLYDPDSHVHLILQELGEPCKSVPQE
DNCLVTLPLYNRWDSSIPDLKQNLKVTREDILYVDAKTLFIQLLRLLPSGHPATRVPLDL
PLIADSVSSLKSMSLMKKGIRAIELLDELSTLRLVDKENRYEPLTSEVEKEFIDLDALYE
RIRAERDALQDVHRAICDHNEYLQTQLQIYGSYLNNARSQIKPSHSDSKGFSRGVGVVGI
KPKNIKSSNTVKLSSQQLKKESVLLNCTIPEFNVSNTYFTFSSPSTDNFVIAVYQRGHSK
VLVEVCICLDDVLQRRYASNPVVDLGFLTFEANKLYHLFEQLFLRK
>Q96PD4_IL17F_HUMAN
MTVKTLHGPAMVKYLLLSILGLAFLSEAAARKIPKVGHTFFQKPESCPPVPGGSMKLDIG
IINENQRVSMSRNIESRSTSPWNYTVTWDPNRYPSEVVQAQCRNLGCINAQGKEDISMNS
VPIQQETLVVRRKHQGCSVSFQLEKVLVTVGCTCVTPVIHHVQ
>P10144_GRAB_HUMAN
MQPILLLLAFLLLPRADAGEIIGGHEAKPHSRPYMAYLMIWDQKSLKRCGGFLIRDDFVL
TAAHCWGSSINVTLGAHNIKEQEPTQQFIPVKRPIPHPAYNPKNFSNDIMLLQLERKAKR
TRAVQPLRLPSNKAQVKPGQTCSVAGWGQTAPLGKHSHTLQEVKMTVQEDRKCESDLRHY
YDSTIELCVGDPEIKKTSFKGDSGGPLVCNKVAQGIVSYGRNNGMPPRACTKVSSFVHWI
KKTMKRY
'''

# We will use the query sequence lengths later - just store these for now
query_sequence_details = {}
query_sequence_order   = [];

seq_counter = 0
for seq in query_sequence.split('>'):
    seq = seq.strip(' \n\t')
    if(len(seq) == 0):
        continue
        
    seq_header  = seq.split('\n')[0].strip()
    seq_length  = len(''.join(seq.split('\n')[1:]))
    seq_counter = seq_counter+1
    query_sequence_details[seq_header] = {}
    query_sequence_details[seq_header]['seq_length']  = seq_length
    query_sequence_order.append(seq_header)
    
# Write test query sequence above to query_file location
text_file = open(query_file, "w")
text_file.write(query_sequence)
text_file.close()


# Now that we have defined some query parameters we can run a BLAST search. The query below will execute the BLAST search and the raw BLAST output will be printed afterwards (Note, the -num_descriptions and -num_alignments arguments are used to limit the size of the output in the this online notebook).

# In[2]:


# So lets try and run the 'raw' commandline version
get_ipython().system('$blast_exe -query $query_file -db $database -evalue $eval_threshold -num_descriptions $num_descriptions -num_alignments $num_alignments')

# Stdout should be printed below:


# ### That's great, but is there a way to run the command above programmatically?
# 
# Yes.
# 
# The output above can described as 'classic' BLAST output and though it is fairly easy to read, can be quite tricky to parse. To help us, there are many programming language specific libraries which allow you to run BLAST searches and easily parse the output, for example [BioPerl](http://www.bioperl.org), [BioJava](http://biojava.org) and [BioRuby](http://bioruby.org/). Do not worry, as we are working in a Python environment we have the [Biopython](http://biopython.org) library to our disposal, so lets get started. We can wrap a commandline BLAST search with the NcbiblastpCommandline method:

# In[3]:


from Bio.Blast.Applications import NcbiblastpCommandline

# The outfmt=5 value creates an XML formatted file
blastp_cmd = NcbiblastpCommandline(cmd=blast_exe, query=query_file, db=database, outfmt=5, out=results_xml, evalue=eval_threshold)
stdout, stderr = blastp_cmd()


# We can now parse the BLAST result file and create blast record object. This gives us access to the underlying BLAST result values, using the [Alignment](http://biopython.org/DIST/docs/api/Bio.Blast.Record.Alignment-class.html) and [HSP](http://biopython.org/DIST/docs/api/Bio.Blast.Record.HSP-class.html) classes:  

# In[4]:


from Bio.Blast import NCBIXML
result_handle = open(results_xml)
blast_records = NCBIXML.parse(result_handle)

E_VALUE_THRESH = 0.04
result_counter = 0

for blast_record in blast_records:
    for alignment in blast_record.alignments:
         result_counter+=1
         for hsp in alignment.hsps:
             if result_counter <= 5:
                 print '\n# Result ', result_counter, '#'
                 print 'Sequence:   ' + alignment.title
                 print 'Length:    ',  alignment.length
                 print 'E-Value:   ',  hsp.expect
                 print 'Score:     ',  hsp.score
                 print 'Identities:',  hsp.identities
                 print(hsp.query[0:75] + '...')
                 print(hsp.match[0:75] + '...')
                 print(hsp.sbjct[0:75] + '...')


# ### BLAST Data Processing with Pandas
# 
# Biopython is great and provides you with lots of additional functionality, but for the purpose of this tutorial we will now turn our attention to processing BLAST data using [pandas](http://pandas.pydata.org/). To get started we need to turn our BLAST output into a 'tabular' format. Fortunately we can create a CSV BLAST results file, so lets create this now (one thing to note about the BLAST CSV output, is that it does not include the BLAST alignments):

# In[5]:


# Create a blast output file in csv format so that it can easily be loaded by pandas
# The outfmt=10 value creates an CSV formatted file
get_ipython().system('$blast_exe -query $query_file -db $database -outfmt 10 -out $results_csv -evalue $eval_threshold')


# We can now load the BLAST results into a pandas dataframe. You should be able to map the result values (e.g. length, identity, e-value,..) in the table below to the earlier 'classic' and bioptyhon BLAST results:

# In[6]:


# Now load BLAST information into pandas dataframe
import pandas
from pandas import DataFrame, read_csv, merge
from pandas.io import sql
from pandas.io.sql import read_sql
# Limit the default pandas dataframe size  
pandas.set_option('display.max_rows', 10)

# Setup database connection to local ChEMBL instance
import psycopg2
con = psycopg2.connect(port=5432, user='chembl', dbname='chembl_21')

Location = results_csv
blast_df = read_csv(Location, names=['query', 'chembl_target_id', 'identity', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore'])
blast_df


# We have not done anything new yet, just presented the BLAST results in yet another format, so what next?

# ## Creating a basic Druggability Score and linking this score to a BLAST search

# The benefit of using a pandas dataframe is that it makes it very easy for us to join the BLAST resultset to another pandas dataframe resultset, in a similar way to how you might join 2 or more tables in an SQL query. So lets create some additional dataframes.
# 
# 
# First, lets get some additional about the ChEMBL targets, such as names, organism:

# In[7]:


# Select additional target information from the target_dictionary table 
sql1 = """
select td.chembl_id as chembl_target_id,
       td.pref_name,
       td.organism,
       td.tax_id,
       td.tid,
       td.target_type
 from target_dictionary td
"""

chembl_target_df = read_sql(sql1, con)
chembl_target_df


# Next, lets use the ChEMBL database to get a count FDA approved drugs that bind each of the targets in the database:

# In[8]:


# We can traverse the ChEMBL activities table to get the count of FDA approved molecules,
# which bind ChEMBL targets with high affinity
sql2 = """
select t.chembl_id as chembl_target_id,
       count(m.chembl_id) as drug_count
  from activities a,
       assays s,
       target_dictionary t,
       molecule_dictionary m
 where a.assay_id=s.assay_id
   and s.tid=t.tid
   and m.molregno=a.molregno
   and a.pchembl_value >= 6
   and s.confidence_score >= 8
   and m.max_phase = 4
   and m.therapeutic_flag=1
group by t.chembl_id
"""

chembl_drug_df = read_sql(sql2, con)
chembl_drug_df


# We can use the ChEMBL database again to get a count of 'drug-like' molecules that bind each of the targets in the database:

# In[9]:


# Similar to the previous query, but this time get the count of 'drug-like' compounds (defined by
# having no rule-of-5 violations), which bind ChEMBL targets with high affinity 
sql3 = """
select t.chembl_id as chembl_target_id,
       count(m.chembl_id) as druglike_count
  from activities a,
       assays s,
       target_dictionary t,
       molecule_dictionary m,
       compound_properties p
 where a.assay_id=s.assay_id
   and s.tid=t.tid
   and m.molregno=a.molregno
   and m.molregno=p.molregno
   and a.pchembl_value >= 6
   and s.confidence_score >= 8
   and p.num_ro5_violations=0
group by t.chembl_id
"""

chembl_druglike_df = read_sql(sql3, con)
chembl_druglike_df


# Finally, lets get the list of known drug-target interactions from the ChEMBL [Mechanism of Action](http://en.wikipedia.org/wiki/Mechanism_of_action) tables, as not all interactions will be captured in the activities table:

# In[10]:


# Get the count of molecules assoicated to a ChEMBL target through a known Mechanism of Action
sql4 = """
select td.chembl_id as chembl_target_id,
       count(distinct dm.molregno) as moa_count
 from drug_mechanism dm, 
      target_dictionary td
where dm.tid=td.tid
group by td.chembl_id
"""

chembl_moa_df = read_sql(sql4, con)
chembl_moa_df


# We now have 5 resultsets:
# 
# 1. The BLAST results (blast_df)
# 2. The additional ChEMBL Target information (chembl_target_df)
# 3. The FDA approved drug binding counts (chembl_drug_df)
# 4. The 'drug-like' binding counts (chembl_druglike_df)
# 5. The Mechanism-of-Action molecule counts (chembl_moa_df)
# 
# So we can now think about merging the resultsets together. By planned good fortune each of the resultsets share the attribute 'chembl_target_id', so lets us that to merge:

# In[11]:


# Carry out the merge and also only return columns we are interested in
rs_merge_df = merge(blast_df, 
                  chembl_target_df,   how='left', on='chembl_target_id' ).merge(
                  chembl_drug_df,     how='left', on='chembl_target_id' ).merge(
                  chembl_druglike_df, how='left', on='chembl_target_id' ).merge(
                  chembl_moa_df,      how='left', on='chembl_target_id')[[
                  'query', 'chembl_target_id','pref_name', 'organism', 'length', 'evalue', 'identity', 'bitscore', 'moa_count', 'drug_count', 'druglike_count' 
                  ]].fillna(0)

rs_merge_df


# So lets create a **really simple** score based on the information we have to predict in a target is likely to druggable:

# In[12]:


def druggability_score(query_sequence_length, align_length, identity, moa_count, drug_count, druglike_count):

    align_length = float(align_length)
    identity     = float(identity)   
    
    moa_score      = (align_length/query_sequence_length) * (identity/100) * (1 if (moa_count > 0) else 0)
    drug_score     = (align_length/query_sequence_length) * (identity/100) * (1 if (drug_count > 0) else 0) * 0.8
    druglike_score = (align_length/query_sequence_length) * (identity/100) * (1 if (druglike_count > 0) else 0) * 0.5
    total_score    = round((moa_score + drug_score + druglike_score),2)
    
    return (1 if (total_score > 1) else total_score)


# The cryptic 0.8 and 0.5 values are there to down weight the contribution of the drug_count and druglike_count values (I said it was simple). It is also assumed the mechanism of action information is a fact, i.e. it is known that this target binds 1 or more drugs, so no down weighting is applied. 
# 
# So lets add this new druggable score column to the results table:

# In[13]:


rs_merge_df['druggability_score'] = rs_merge_df.apply(lambda row: druggability_score(query_sequence_details[row['query']]['seq_length'],
                                                                                     row['length'], 
                                                                                     row['identity'], 
                                                                                     row['moa_count'], 
                                                                                     row['drug_count'], 
                                                                                     row['druglike_count']),axis=1)

rs_merge_df


# Great, we have an extra column in the data frame, which contains the Druggability Score. As sequence identity contributes significantly to the score, we could just take the max value for the druggability_score column and say this is its Druggability Score for this particular protein. So the predicted druggability_score for the first query sequence defined in query_sequence is:

# In[14]:


grouped_df = rs_merge_df.groupby('query')['druggability_score'].max().reset_index()
print grouped_df.ix[0]['query']+":",grouped_df.ix[0]['druggability_score']


# ### The Conclusion
# 
# We can wrap up this tutorial by presenting the Druggability Score for all sequences defined in query_sequence in a friendly pandas data frame:

# In[15]:


# Show all results in final table
pandas.set_option('display.max_rows', 500)

druggability_results_df = DataFrame({'query':query_sequence_order}).merge(
                                      grouped_df,
                                      how='left', 
                                      on='query').fillna('No BLAST hits')
druggability_results_df


# ### Caveats with Drugability Score presented in this tutorial
# 
# * Species information is ignored
# * No distinction is made between small molecule drugs and biotherapeutics, e.g. monoclonal antibodies
# * Multiple HSPs between query and target hits are ignored
# * It has not been tested beyond the scope of this notebook