#!/usr/bin/env python # coding: utf-8 # # myChEMBL BLAST Tutorial # # ### myChEMBL team, ChEMBL Group, EMBL-EBI. # # This notebook is intended to illustrate the following: # # * How to run a BLAST search and parse the results # * Creating a basic Druggability Score and linking this score to a BLAST search # # ## How to run a BLAST search and parse the results # ### What is a BLAST search? # # BLAST, the Basic Local Alignment Search Tool, allows a user to search with either a protein (amino acid based) or nucleotide (DNA or RNA based) sequence and find statistically significant similar sequences within a BLAST sequence database. The finer details of how BLAST works can reviewed at the [NCBI website](http://blast.ncbi.nlm.nih.gov/Blast.cgi) or [Wikipedia](http://en.wikipedia.org/wiki/BLAST). # # ### Does ChEMBL provide any BLAST services? # # Yes. The website allows a user to run a BLAST search against the [protein target sequences](https://www.ebi.ac.uk/chembl/target) or the [biotherapeutic sequences](https://www.ebi.ac.uk/chembl/). Users can also download all of the 'sequence databases' from the [ftpsite](ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/latest/) (look for the .fa.gz downloads) and run BLAST searches locally. Currently the ChEMBL Web Services do not provide this functionality, but this will be added in the future. # # ### How do I run a BLAST search # # If you want to search ChEMBL, you could use the ChEMBL web interface. Alternatively you could run the search locally working through the following steps: # # 1. Download blast software from NCBI # 2. Download/Create sequence database you wish to search against # 3. Format sequence database # 4. Run search # # As you see, it is not too complicated and steps 1-3 have already carried out on this version of myChEMBL. Before we run a search we will first set up some parameters: # In[1]: import re # Input parameters blast_exe = '/home/chembl/blast/ncbi-blast-2.2.29+/bin/blastp' query_file = '/tmp/test.fa' eval_threshold = 0.001 num_descriptions = 5 num_alignments = 5 database = '/home/chembl/blast/chembl/chembl_21.fa' # Output parameters results_txt = '/tmp/test.out' results_xml = '/tmp/test.xml' results_csv = '/tmp/test.csv' # Query sequence used throughout this tutorial # ** Feel free to edit the protein sequence below ** # ** DO NOT INCLUDE WHITESPACES IN SEQUENCE HEADER LINE ** # ** query_sequence = ''' >Q96P68_OXGR1_HUMAN MNEPLDYLANASDFPDYAAAFGNCTDENIPLKMHYLPVIYGIIFLVGFPGNAVVISTYIF KMRPWKSSTIIMLNLACTDLLYLTSLPFLIHYYASGENWIFGDFMCKFIRFSFHFNLYSS ILFLTCFSIFRYCVIIHPMSCFSIHKTRCAVVACAVVWIISLVAVIPMTFLITSTNRTNR SACLDLTSSDELNTIKWYNLILTATTFCLPLVIVTLCYTTIIHTLTHGLQTDSCLKQKAR RLTILLLLAFYVCFLPFHILRVIRIESRLLSISCSIENQIHEAYIVSRPLAALNTFGNLL LYVVVSDNFQQAVCSTVRCKVSGNLEQAKKISYSNNP >Q86XF0_DHFRL1_HUMAN MFLLLNCIVAVSQNMGIGKNGDLPRPPLRNEFRYFQRMTTTSSVEGKQNLVIMGRKTWFS IPEKNRPLKDRINLVLSRELKEPPQGAHFLARSLDDALKLTERPELANKVDMIWIVGGSS VYKEAMNHLGHLKLFVTRIMQDFESDTFFSEIDLEKYKLLPEYPGVLSDVQEGKHIKYKF EVCEKDD >Q9UKX5_ITGA11_HUMAN MDLPRGLVVAWALSLWPGFTDTFNMDTRKPRVIPGSRTAFFGYTVQQHDISGNKWLVVGA PLETNGYQKTGDVYKCPVIHGNCTKLNLGRVTLSNVSERKDNMRLGLSLATNPKDNSFLA CSPLWSHECGSSYYTTGMCSRVNSNFRFSKTVAPALQRCQTYMDIVIVLDGSNSIYPWVE VQHFLINILKKFYIGPGQIQVGVVQYGEDVVHEFHLNDYRSVKDVVEAASHIEQRGGTET RTAFGIEFARSEAFQKGGRKGAKKVMIVITDGESHDSPDLEKVIQQSERDNVTRYAVAVL GYYNRRGINPETFLNEIKYIASDPDDKHFFNVTDEAALKDIVDALGDRIFSLEGTNKNET SFGLEMSQTGFSSHVVEDGVLLGAVGAYDWNGAVLKETSAGKVIPLRESYLKEFPEELKN HGAYLGYTVTSVVSSRQGRVYVAGAPRFNHTGKVILFTMHNNRSLTIHQAMRGQQIGSYF GSEITSVDIDGDGVTDVLLVGAPMYFNEGRERGKVYVYELRQNLFVYNGTLKDSHSYQNA RFGSSIASVRDLNQDSYNDVVVGAPLEDNHAGAIYIFHGFRGSILKTPKQRITASELATG LQYFGCSIHGQLDLNEDGLIDLAVGALGNAVILWSRPVVQINASLHFEPSKINIFHRDCK RSGRDATCLAAFLCFTPIFLAPHFQTTTVGIRYNATMDERRYTPRAHLDEGGDRFTNRAV LLSSGQELCERINFHVLDTADYVKPVTFSVEYSLEDPDHGPMLDDGWPTTLRVSVPFWNG CNEDEHCVPDLVLDARSDLPTAMEYCQRVLRKPAQDCSAYTLSFDTTVFIIESTRQRVAV EATLENRGENAYSTVLNISQSANLQFASLIQKEDSDGSIECVNEERRLQKQVCNVSYPFF RAKAKVAFRLDFEFSKSIFLHHLEIELAAGSDSNERDSTKEDNVAPLRFHLKYEADVLFT RSSSLSHYEVKPNSSLERYDGIGPPFSCIFRIQNLGLFPIHGMMMKITIPIATRSGNRLL KLRDFLTDEANTSCNIWGNSTEYRPTPVEEDLRRAPQLNHSNSDVVSINCNIRLVPNQEI NFHLLGNLWLRSLKALKYKSMKIMVNAALQRQFHSPFIFREEDPSRQIVFEISKQEDWQV PIWIIVGSTLGGLLLLALLVLALWKLGFFRSARRRREPGLDPTPKVLE >P06804_TNFA_MOUSE MSTESMIRDVELAEEALPQKMGGFQNSRRCLCLSLFSFLLVAGATTLFCLLNFGVIGPQR DEKFPNGLPLISSMAQTLTLRSSSQNSSDKPVAHVVANHQVEEQLEWLSQRANALLANGM DLKDNQLVVPADGLYLVYSQVLFKGQGCPDYVLLTHTVSRFAISYQEKVNLLSAVKSPCP KDTPEGAELKPWYEPIYLGGVFQLEKGDQLSAEVNLPKYLDFAESGQVYFGVIAL >P48050_KCNJ4_HUMAN MHGHSRNGQAHVPRRKRRNRFVKKNGQCNVYFANLSNKSQRYMADIFTTCVDTRWRYMLM IFSAAFLVSWLFFGLLFWCIAFFHGDLEASPGVPAAGGPAAGGGGAAPVAPKPCIMHVNG FLGAFLFSVETQTTIGYGFRCVTEECPLAVIAVVVQSIVGCVIDSFMIGTIMAKMARPKK RAQTLLFSHHAVISVRDGKLCLMWRVGNLRKSHIVEAHVRAQLIKPYMTQEGEYLPLDQR DLNVGYDIGLDRIFLVSPIIIVHEIDEDSPLYGMGKEELESEDFEIVVILEGMVEATAMT TQARSSYLASEILWGHRFEPVVFEEKSHYKVDYSRFHKTYEVAGTPCCSARELQESKITV LPAPPPPPSAFCYENELALMSQEEEEMEEEAAAAAAVAAGLGLEAGSKEEAGIIRMLEFG SHLDLERMQASLPLDNISYRRESAI >Q80Z70_SE1L1_RAT MQVRVRLLLLLCAVLLGSAAASSDEETNQDESLDSKGALPTDGSVKDHTTGKVVLLARDL LILKDSEVESLLQDEEESSKSQEEVSVTEDISFLDSPNPSSKTYEELKRVRKPVLTAIEG TAHGEPCHFPFLFLDKEYDECTSDGREDGRLWCATTYDYKTDEKWGFCETEEDAAKRRQM QEAEAIYQSGMKILNGSTRKNQKREAYRYLQKAAGMNHTKALERVSYALLFGDYLTQNIQ AAKEMFEKLTEEGSPKGQTGLGFLYASGLGVNSSQAKALVYYTFGALGGNLIAHMVLGYR YWAGIGVLQSCESALTHYRLVANHVASDISLTGGSVVQRIRLPDEVENPGMNSGMLEEDL IQYYQFLAEKGDVQAQVGLGQLHLHGGRGVEQNHQRAFDYFNLAANAGNSHAMAFLGKMY SEGSDIVPQSNETALHYFKKAADMGNPVGQSGLGMAYLYGRGVQVNYDLALKYFQKAAEQ GWVDGQLQLGSMYYNGIGVKRDYKQALKYFNLASQGGHILAFYNLAQMHASGTGVMRSCH TAVELFKNVCERGRWSERLMTAYNSYKDDDYNAAVVQYLLLAEQGYEVAQSNAAFILDQR EATIVGENETYPRALLHWNRAASQGYTVARIKLGDYHFYGFGTDVDYETAFIHYRLASEQ QHSAQAMFNLGYMHEKGLGIKQDIHLAKRFYDMAAEASPDAQVPVFLALCKLGVVYFLQY IREANIRDLFTQLDMDQLLGPEWDLYLMTIIALLLGTVIAYRQRQHQDIPVPRPPGPRPA PPQQEGPPEQQPPQ >P33277_GAP1_SCHPO MTKRHSGTLSSSVLPQTNRLSLLRNRESTSVLYTIDLDMESDVEDAFFHLDRELHDLKQQ ISSQSKQNFVLERDVRYLDSKIALLIQNRMAQEEQHEFAKRLNDNYNAVKGSFPDDRKLQ LYGALFFLLQSEPAYIASLVRRVKLFNMDALLQIVMFNIYGNQYESREEHLLLSLFQMVL TTEFEATSDVLSLLRANTPVSRMLTTYTRRGPGQAYLRSILYQCINDVAIHPDLQLDIHP LSVYRYLVNTGQLSPSEDDNLLTNEEVSEFPAVKNAIQERSAQLLLLTKRFLDAVLNSID EIPYGIRWVCKLIRNLTNRLFPSISDSTICSLIGGFFFLRFVNPAIISPQTSMLLDSCPS DNVRKTLATIAKIIQSVANGTSSTKTHLDVSFQPMLKEYEEKVHNLLRKLGNVGDFFEAL ELDQYIALSKKSLALEMTVNEIYLTHEIILENLDNLYDPDSHVHLILQELGEPCKSVPQE DNCLVTLPLYNRWDSSIPDLKQNLKVTREDILYVDAKTLFIQLLRLLPSGHPATRVPLDL PLIADSVSSLKSMSLMKKGIRAIELLDELSTLRLVDKENRYEPLTSEVEKEFIDLDALYE RIRAERDALQDVHRAICDHNEYLQTQLQIYGSYLNNARSQIKPSHSDSKGFSRGVGVVGI KPKNIKSSNTVKLSSQQLKKESVLLNCTIPEFNVSNTYFTFSSPSTDNFVIAVYQRGHSK VLVEVCICLDDVLQRRYASNPVVDLGFLTFEANKLYHLFEQLFLRK >Q96PD4_IL17F_HUMAN MTVKTLHGPAMVKYLLLSILGLAFLSEAAARKIPKVGHTFFQKPESCPPVPGGSMKLDIG IINENQRVSMSRNIESRSTSPWNYTVTWDPNRYPSEVVQAQCRNLGCINAQGKEDISMNS VPIQQETLVVRRKHQGCSVSFQLEKVLVTVGCTCVTPVIHHVQ >P10144_GRAB_HUMAN MQPILLLLAFLLLPRADAGEIIGGHEAKPHSRPYMAYLMIWDQKSLKRCGGFLIRDDFVL TAAHCWGSSINVTLGAHNIKEQEPTQQFIPVKRPIPHPAYNPKNFSNDIMLLQLERKAKR TRAVQPLRLPSNKAQVKPGQTCSVAGWGQTAPLGKHSHTLQEVKMTVQEDRKCESDLRHY YDSTIELCVGDPEIKKTSFKGDSGGPLVCNKVAQGIVSYGRNNGMPPRACTKVSSFVHWI KKTMKRY ''' # We will use the query sequence lengths later - just store these for now query_sequence_details = {} query_sequence_order = []; seq_counter = 0 for seq in query_sequence.split('>'): seq = seq.strip(' \n\t') if(len(seq) == 0): continue seq_header = seq.split('\n')[0].strip() seq_length = len(''.join(seq.split('\n')[1:])) seq_counter = seq_counter+1 query_sequence_details[seq_header] = {} query_sequence_details[seq_header]['seq_length'] = seq_length query_sequence_order.append(seq_header) # Write test query sequence above to query_file location text_file = open(query_file, "w") text_file.write(query_sequence) text_file.close() # Now that we have defined some query parameters we can run a BLAST search. The query below will execute the BLAST search and the raw BLAST output will be printed afterwards (Note, the -num_descriptions and -num_alignments arguments are used to limit the size of the output in the this online notebook). # In[2]: # So lets try and run the 'raw' commandline version get_ipython().system('$blast_exe -query $query_file -db $database -evalue $eval_threshold -num_descriptions $num_descriptions -num_alignments $num_alignments') # Stdout should be printed below: # ### That's great, but is there a way to run the command above programmatically? # # Yes. # # The output above can described as 'classic' BLAST output and though it is fairly easy to read, can be quite tricky to parse. To help us, there are many programming language specific libraries which allow you to run BLAST searches and easily parse the output, for example [BioPerl](http://www.bioperl.org), [BioJava](http://biojava.org) and [BioRuby](http://bioruby.org/). Do not worry, as we are working in a Python environment we have the [Biopython](http://biopython.org) library to our disposal, so lets get started. We can wrap a commandline BLAST search with the NcbiblastpCommandline method: # In[3]: from Bio.Blast.Applications import NcbiblastpCommandline # The outfmt=5 value creates an XML formatted file blastp_cmd = NcbiblastpCommandline(cmd=blast_exe, query=query_file, db=database, outfmt=5, out=results_xml, evalue=eval_threshold) stdout, stderr = blastp_cmd() # We can now parse the BLAST result file and create blast record object. This gives us access to the underlying BLAST result values, using the [Alignment](http://biopython.org/DIST/docs/api/Bio.Blast.Record.Alignment-class.html) and [HSP](http://biopython.org/DIST/docs/api/Bio.Blast.Record.HSP-class.html) classes: # In[4]: from Bio.Blast import NCBIXML result_handle = open(results_xml) blast_records = NCBIXML.parse(result_handle) E_VALUE_THRESH = 0.04 result_counter = 0 for blast_record in blast_records: for alignment in blast_record.alignments: result_counter+=1 for hsp in alignment.hsps: if result_counter <= 5: print '\n# Result ', result_counter, '#' print 'Sequence: ' + alignment.title print 'Length: ', alignment.length print 'E-Value: ', hsp.expect print 'Score: ', hsp.score print 'Identities:', hsp.identities print(hsp.query[0:75] + '...') print(hsp.match[0:75] + '...') print(hsp.sbjct[0:75] + '...') # ### BLAST Data Processing with Pandas # # Biopython is great and provides you with lots of additional functionality, but for the purpose of this tutorial we will now turn our attention to processing BLAST data using [pandas](http://pandas.pydata.org/). To get started we need to turn our BLAST output into a 'tabular' format. Fortunately we can create a CSV BLAST results file, so lets create this now (one thing to note about the BLAST CSV output, is that it does not include the BLAST alignments): # In[5]: # Create a blast output file in csv format so that it can easily be loaded by pandas # The outfmt=10 value creates an CSV formatted file get_ipython().system('$blast_exe -query $query_file -db $database -outfmt 10 -out $results_csv -evalue $eval_threshold') # We can now load the BLAST results into a pandas dataframe. You should be able to map the result values (e.g. length, identity, e-value,..) in the table below to the earlier 'classic' and bioptyhon BLAST results: # In[6]: # Now load BLAST information into pandas dataframe import pandas from pandas import DataFrame, read_csv, merge from pandas.io import sql from pandas.io.sql import read_sql # Limit the default pandas dataframe size pandas.set_option('display.max_rows', 10) # Setup database connection to local ChEMBL instance import psycopg2 con = psycopg2.connect(port=5432, user='chembl', dbname='chembl_21') Location = results_csv blast_df = read_csv(Location, names=['query', 'chembl_target_id', 'identity', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore']) blast_df # We have not done anything new yet, just presented the BLAST results in yet another format, so what next? # ## Creating a basic Druggability Score and linking this score to a BLAST search # The benefit of using a pandas dataframe is that it makes it very easy for us to join the BLAST resultset to another pandas dataframe resultset, in a similar way to how you might join 2 or more tables in an SQL query. So lets create some additional dataframes. # # # First, lets get some additional about the ChEMBL targets, such as names, organism: # In[7]: # Select additional target information from the target_dictionary table sql1 = """ select td.chembl_id as chembl_target_id, td.pref_name, td.organism, td.tax_id, td.tid, td.target_type from target_dictionary td """ chembl_target_df = read_sql(sql1, con) chembl_target_df # Next, lets use the ChEMBL database to get a count FDA approved drugs that bind each of the targets in the database: # In[8]: # We can traverse the ChEMBL activities table to get the count of FDA approved molecules, # which bind ChEMBL targets with high affinity sql2 = """ select t.chembl_id as chembl_target_id, count(m.chembl_id) as drug_count from activities a, assays s, target_dictionary t, molecule_dictionary m where a.assay_id=s.assay_id and s.tid=t.tid and m.molregno=a.molregno and a.pchembl_value >= 6 and s.confidence_score >= 8 and m.max_phase = 4 and m.therapeutic_flag=1 group by t.chembl_id """ chembl_drug_df = read_sql(sql2, con) chembl_drug_df # We can use the ChEMBL database again to get a count of 'drug-like' molecules that bind each of the targets in the database: # In[9]: # Similar to the previous query, but this time get the count of 'drug-like' compounds (defined by # having no rule-of-5 violations), which bind ChEMBL targets with high affinity sql3 = """ select t.chembl_id as chembl_target_id, count(m.chembl_id) as druglike_count from activities a, assays s, target_dictionary t, molecule_dictionary m, compound_properties p where a.assay_id=s.assay_id and s.tid=t.tid and m.molregno=a.molregno and m.molregno=p.molregno and a.pchembl_value >= 6 and s.confidence_score >= 8 and p.num_ro5_violations=0 group by t.chembl_id """ chembl_druglike_df = read_sql(sql3, con) chembl_druglike_df # Finally, lets get the list of known drug-target interactions from the ChEMBL [Mechanism of Action](http://en.wikipedia.org/wiki/Mechanism_of_action) tables, as not all interactions will be captured in the activities table: # In[10]: # Get the count of molecules assoicated to a ChEMBL target through a known Mechanism of Action sql4 = """ select td.chembl_id as chembl_target_id, count(distinct dm.molregno) as moa_count from drug_mechanism dm, target_dictionary td where dm.tid=td.tid group by td.chembl_id """ chembl_moa_df = read_sql(sql4, con) chembl_moa_df # We now have 5 resultsets: # # 1. The BLAST results (blast_df) # 2. The additional ChEMBL Target information (chembl_target_df) # 3. The FDA approved drug binding counts (chembl_drug_df) # 4. The 'drug-like' binding counts (chembl_druglike_df) # 5. The Mechanism-of-Action molecule counts (chembl_moa_df) # # So we can now think about merging the resultsets together. By planned good fortune each of the resultsets share the attribute 'chembl_target_id', so lets us that to merge: # In[11]: # Carry out the merge and also only return columns we are interested in rs_merge_df = merge(blast_df, chembl_target_df, how='left', on='chembl_target_id' ).merge( chembl_drug_df, how='left', on='chembl_target_id' ).merge( chembl_druglike_df, how='left', on='chembl_target_id' ).merge( chembl_moa_df, how='left', on='chembl_target_id')[[ 'query', 'chembl_target_id','pref_name', 'organism', 'length', 'evalue', 'identity', 'bitscore', 'moa_count', 'drug_count', 'druglike_count' ]].fillna(0) rs_merge_df # So lets create a **really simple** score based on the information we have to predict in a target is likely to druggable: # In[12]: def druggability_score(query_sequence_length, align_length, identity, moa_count, drug_count, druglike_count): align_length = float(align_length) identity = float(identity) moa_score = (align_length/query_sequence_length) * (identity/100) * (1 if (moa_count > 0) else 0) drug_score = (align_length/query_sequence_length) * (identity/100) * (1 if (drug_count > 0) else 0) * 0.8 druglike_score = (align_length/query_sequence_length) * (identity/100) * (1 if (druglike_count > 0) else 0) * 0.5 total_score = round((moa_score + drug_score + druglike_score),2) return (1 if (total_score > 1) else total_score) # The cryptic 0.8 and 0.5 values are there to down weight the contribution of the drug_count and druglike_count values (I said it was simple). It is also assumed the mechanism of action information is a fact, i.e. it is known that this target binds 1 or more drugs, so no down weighting is applied. # # So lets add this new druggable score column to the results table: # In[13]: rs_merge_df['druggability_score'] = rs_merge_df.apply(lambda row: druggability_score(query_sequence_details[row['query']]['seq_length'], row['length'], row['identity'], row['moa_count'], row['drug_count'], row['druglike_count']),axis=1) rs_merge_df # Great, we have an extra column in the data frame, which contains the Druggability Score. As sequence identity contributes significantly to the score, we could just take the max value for the druggability_score column and say this is its Druggability Score for this particular protein. So the predicted druggability_score for the first query sequence defined in query_sequence is: # In[14]: grouped_df = rs_merge_df.groupby('query')['druggability_score'].max().reset_index() print grouped_df.ix[0]['query']+":",grouped_df.ix[0]['druggability_score'] # ### The Conclusion # # We can wrap up this tutorial by presenting the Druggability Score for all sequences defined in query_sequence in a friendly pandas data frame: # In[15]: # Show all results in final table pandas.set_option('display.max_rows', 500) druggability_results_df = DataFrame({'query':query_sequence_order}).merge( grouped_df, how='left', on='query').fillna('No BLAST hits') druggability_results_df # ### Caveats with Drugability Score presented in this tutorial # # * Species information is ignored # * No distinction is made between small molecule drugs and biotherapeutics, e.g. monoclonal antibodies # * Multiple HSPs between query and target hits are ignored # * It has not been tested beyond the scope of this notebook