#!/usr/bin/env python
# coding: utf-8

# # The GDSCTools library

# Note that in this notebook we need the following code, but this may not be required in a script or in a standard Python shell if pylab is already loaded.

# ## Regression methods

# Currently (v0.15), we have 3 classes available that implement 3 regression methods, namely:
#
# - GDSCElasticNet
# - GDSCRidge
# - GDSCLasso
#
# In the following we will only use the GDSCElasticNet class, but the other regression methods can be used in the same way.

# ## Features
#
# - Train a model for a given alpha with cross validation (CV)
# - Scan the alpha parameter, repeating the model training with CV
# - Find the best alpha parameter automatically using CV
# - Check the significance of the Pearson correlation against random models
# - Plot the importances and weights of the model coefficients
# - Identify drugs of interest automatically

# In[1]:

# For IPython notebooks only
get_ipython().run_line_magic('pylab', 'inline')
rcParams['figure.figsize'] = 10, 5
#import warnings
#warnings.simplefilter('ignore', 'DeprecationWarning')

# Then, we can import some functionalities from the GDSCTools library.

# In[2]:

from gdsctools import gdsctools_data
from gdsctools import regression
from gdsctools import GenomicFeatures, IC50  # optional

# You may import everything in one go as follows:
#
#     from gdsctools import *

# ## The Data

# Let us get some data to play with. First, we get the drug responses from the v17 data set (the *ic50* variable below). Second, we read the genomic features for the same version (*gf* below).

# An IC50 data set with 988 cell lines and Nd=265 drugs.

# In[3]:

# a smaller data set (test_IC50.csv) is also available; here we use the full v17 data set
ic50 = gdsctools_data("IC50_v17.csv.gz")
IC50(ic50)

# Genomic feature data with 988 cell lines and Nf features. This data set is quite large, hence we select only a subset of the features.

# In[4]:

gf = gdsctools_data("genomic_features_v17.csv.gz")
gf = GenomicFeatures(gf)
# to speed up computation, we keep only about 50 features:
gf.df = gf.df[["TISSUE_FACTOR", "MSI_FACTOR"] + list(gf.df.columns[219:271])]
gf

# ## The regression

# From the regression module, we can use the **GDSCElasticNet** class. It takes as input a drug response matrix and a genomic features matrix. See the ANOVA analysis or the http://gdsctools.readthedocs.org documentation for details about the data formats.

# In[5]:

gd = regression.GDSCElasticNet(ic50, gf)

# The data are not normalised. However, one can force the X data (the features) to be standardised (the mean is subtracted and the result is divided by the standard deviation). To do so, uncomment the following statement.

# In[6]:

# gd.scale = True

# **Note:** the ElasticNet, like the Lasso and Ridge methods, has an alpha parameter, which will be tuned and is of paramount importance in the next sections. The Elastic Net also has an l1_ratio parameter (0.5 by default). Let us emphasize the different terminology used in scikit-learn and in the R package glmnet:
#
# | sklearn  | glmnet |
# |----------|--------|
# | l1_ratio | alpha  |
# | alpha    | lambda |

# ## Get the best model

# Our first objective is to get the best model, that is, the best set of parameters (the alpha parameter). This can be achieved using a cross-validation strategy. In GDSCTools, this can be done in one go using the **runCV** method. Let us pick a drug identifier known to have a large Pearson correlation; its identifier is 1047 in the present data set.

# In[7]:

drugid = 1047
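# If you do not know which identifiers are available, they can be listed from the regression object itself. A minimal sketch (the *drugIds* attribute is also used later in this notebook; the slicing is only for illustration):

# In[ ]:

# show the first few drug identifiers available in this data set
list(gd.drugIds)[:5]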
# Just call the **runCV** method.

# In[8]:

res = gd.runCV(drugid, kfolds=10)

# This is the fastest way in GDSCTools to get the best alpha parameter (highest Pearson correlation). For performance reasons, no plot is shown. The returned object contains information such as the best alpha parameter.

# In[9]:

res.alpha, res.ln_alpha

# Note that in general we will use a log-scaled alpha value (*ln alpha*).

# ## Feature importances

# In[10]:

alpha = 0.01159
best_model = gd.get_model(alpha=alpha)
res = gd.plot_weight(drugid, best_model)

# Here we see that the TP53_mut feature is predominant.

# In[11]:

res = gd.plot_importance(drugid, model=gd.get_model(alpha=alpha))

# In[12]:

# keep only the features with a non-zero weight
res[res.weight != 0]

# In[13]:

# One CV for a given alpha;
# plots the prediction of the model for all data
scores = gd.fit(drugid, alpha=alpha)

# In[14]:

# repeat the CV fit 100 times to build a distribution of scores;
# with randomize_Y=True the drug response is randomised, giving a null distribution
all_scores = []
for this in range(100):
    scores = gd.fit(drugid, alpha=0.08, show=False, shuffle=True, randomize_Y=False)
    all_scores.extend(scores)

random_scores = []
for this in range(100):
    scores = gd.fit(drugid, alpha=0.08, show=False, shuffle=True, randomize_Y=True)
    random_scores.extend(scores)

# The scores contain the Pearson correlation for each k-fold.

# In[15]:

# the scores returned correspond to the N=10 k-fold scores
_ = hist(all_scores, bins=20)
_ = hist(random_scores, bins=20, color="r")
xlabel('scores for the N folds')

# ## Tuning alpha manually (and plotting)

# In[15]:

results = gd.tune_alpha(drugid, alpha_range=(-3.5, -1))

# In[16]:

results

# In[17]:

results["ln_alpha"]

# This is slower and **runCV** is recommended instead. Note that **runCV** gives a stochastic answer (the CV folds are shuffled) while fitCV does not.

# In[18]:

# run the cross validation on every drug to collect the best alpha and the Pearson correlation (Rp)
alphas = []
Rps = []
from easydev import Progress
pb = Progress(len(gd.drugIds))
count = 0
for drug in gd.drugIds:
    res = gd.runCV(drug, kfolds=10, verbose=False)
    alphas.append(res.alpha)
    Rps.append(res.Rp)
    count += 1
    pb.animate(count)

# In[19]:

plot(alphas, Rps, "o")
xlabel("alpha")
ylabel("Rp")

# In[20]:

best_alpha = 0.085

# In[21]:

import pandas as pd
df = pd.DataFrame({"Rps": Rps, "drug": gd.drugIds})
df[df.Rps > 0.38]

# In[22]:

# Checking the validity of runCV on N instances
res = gd.check_randomness(1047, N=20)

# We can also now look at the weights for that model:

# In[23]:

res = gd.boxplot(drugid, model=gd.get_model(alpha=best_alpha), n=10, bx_vert=False)

# In[24]:

res = gd.dendogram_coefficients(cmap="terrain")

# In[25]:

# for each drug, compare the real CV scores with randomised ones
tt = []
bayes = []
Rp = []
from easydev import Progress
pb = Progress(len(gd.drugIds))
for i, this in enumerate(gd.drugIds):
    res = gd.check_randomness(this, 4, show=False)
    tt.append(res['ttest_pval'])
    bayes.append(res['bayes_factor'])
    Rp.append(mean(res['scores']))
    pb.animate(i + 1)

# In[27]:

# cap infinite Bayes factors so they can be plotted
bayes = [x if not np.isinf(x) else 25 for x in bayes]
scatter(-log(tt), Rp, c=[1 + x for x in bayes], s=60, alpha=0.5)
colorbar()
ylabel("Rp"); xlabel("-log tt")
#xlim([0,40])
#ylim([0,25])
#plot(-log(tt), Rp, "ro", alpha=0.5)

# In[26]:

import pandas as pd
df = pd.DataFrame({"drugid": gd.drugIds, "Rp": Rp, "logtt": -log(tt), "bayes": bayes})

# In[27]:

df.query("Rp>0.3 and logtt>10 and bayes>=10")
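# As a closing example, here is a minimal sketch (reusing only the methods shown above, not a dedicated GDSCTools function) of how the drugs retained by this query could be fed back into the earlier workflow: **runCV** to find the best alpha, then **plot_weight** to inspect the coefficients.

# In[ ]:

for drug in df.query("Rp>0.3 and logtt>10 and bayes>=10").drugid:
    # find the best alpha for this drug by cross validation
    res = gd.runCV(drug, kfolds=10, verbose=False)
    # plot the weights of the corresponding best model
    gd.plot_weight(drug, gd.get_model(alpha=res.alpha))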