#!/usr/bin/env python
# coding: utf-8

# # slalom tutorial 

# In this notebook we illustrate how slalom can be used to identify biological drivers on the mESC cell cycle staged dataset.

# First, we load some modules and set some directories

# In[2]:


import os
import slalom
import pdb
from slalom import plotFactors, plotRelevance, plotLoadings, saveFA, dumpFA
get_ipython().run_line_magic('pylab', 'inline')

#specify where the hdf5 file is
data_dir = '../data/'
out_dir = './results'


# slalom expects an expression file, typically with log transformed gene expression values  as well as a gene set annotation. These data can be provided as single hdf5 file, which can be generated using separate R scripts (in the R folder). Alternatively, the expression matrix and the annotation can be loaded as text files in python. 
# 
# 
# 
# 
# 

# # Loading raw expression data and gene set annotations

# By default we here load the MsigDB annotation, which needs to be downloaded separately from the Broad web page. 

# In[3]:


####
#Option 1: load a pre-defined hdf5 file
#We provide an (optional) hdf file with the required data - this was generated using
#the R script write_slalom.R in the R folder
if 0:
    annoDB = 'MSigDB'
    dFile = os.path.join(data_dir,'Buettneretal2015.hdf5')
    data = slalom.load_hdf5(dFile, anno=annoDB)
    
####

####
#Option 2: load the annotation and the data directly
if 1:
    #Annotation file
    annoFile = os.path.join(data_dir,'h.all.v5.0.symbols.gmt.txt') #MSigDB
    annoDB   = 'MSigDB'
    if not os.path.exists(annoFile):
        raise Exception("MSigDB annotation file needs to be downloaded manually")
    #Note: the license of MSigDB does not permit redistribution of the raw annotation files.
    #Please register at http://software.broadinstitute.org/gsea/msigdb and download the
    #hallmark gene sets and place the file in data folder.

    #dataFile: csv file with log expresison values
    dataFile = os.path.join(data_dir,'Buettneretal.csv.gz') # note that the first column (row names) contains the cell cycle stage in numeric form
    data = slalom.utils.load_txt(dataFile=dataFile,annoFiles=annoFile,annoDBs=annoDB)
####

###alternatively the data can be loaded from the provided hdf5 file
#dFile = 'Buettneretal2015.hdf5'
#data = slalom.load_hdf5(dFile, data_dir=data_dir)


#print statistics for the loaded dataset
print ("Loaded {:d} cells, {:d} genes".format(data['Y'].shape[0],data['Y'].shape[1]))
print ("Annotation: {:d} terms".format(len(data['terms'])))


# # Initializing the slalom model

# In[4]:


#I: indicator matrix that assigns genes to pathways
I = data['I'] #if loaded from the hdf file change to I = data['IMSigDB']
#Y: log expresison values 
Y = data['Y']
#terms: ther names of the terms
terms = data['terms']

#gene_ids: the ids of the genes in Y
gene_ids = data['genes']

#initialize FA instance, here using a Gaussian noise model and fitting 3 dense hidden factors
FA = slalom.initFA(Y, terms,I, gene_ids=gene_ids, noise='gauss', nHidden=3, minGenes=15)


# ## Model training

# In[5]:


#model training
FA.train()

#print diagnostics
FA.printDiagnostics()


# Next, we plot the results, including  factor relevance, gene set augmentation and a scatter plot of the two most relevant factors, in this case G2M Checkpoint and P53 Pathway. 

# In[6]:


#plot results

fig = plotRelevance(FA)

#get factors; analogous getters are implemented for relevance and weights (see docs)
X = FA.getX(terms=['G2m checkpoint','P53 pathway'])

#scatter plot of the top two factors
fig = plotFactors(X=X, lab=data['lab'], terms=['G2m checkpoint','P53 pathway'], isCont=False)


#the same plot can be generated by passing the FA object to the plot function
#plotFactors(FA=FA,idx1=0,idx2=1, lab = data['lab'], isCont=False )


# In[7]:


#visualize changes for the G2m checkpoint 
fig = plotLoadings(FA, 'G2m checkpoint', n_genes=20)


# Finally, we can safe the model results as CSV files or using a .hdf5 file.

# 
# 

# In[17]:


#if not os.path.exists(out_dir):
#    os.makedirs(out_dir)
#out_file_name = os.path.join(out_dir,FA.getName()+'.hdf5')
#saveFA(FA, out_name=out_file_name