#!/usr/bin/env python # coding: utf-8 # # slalom tutorial # In this notebook we illustrate how slalom can be used to identify biological drivers on the mESC cell cycle staged dataset. # First, we load some modules and set some directories # In[2]: import os import slalom import pdb from slalom import plotFactors, plotRelevance, plotLoadings, saveFA, dumpFA get_ipython().run_line_magic('pylab', 'inline') #specify where the hdf5 file is data_dir = '../data/' out_dir = './results' # slalom expects an expression file, typically with log transformed gene expression values as well as a gene set annotation. These data can be provided as single hdf5 file, which can be generated using separate R scripts (in the R folder). Alternatively, the expression matrix and the annotation can be loaded as text files in python. # # # # # # # Loading raw expression data and gene set annotations # By default we here load the MsigDB annotation, which needs to be downloaded separately from the Broad web page. # In[3]: #### #Option 1: load a pre-defined hdf5 file #We provide an (optional) hdf file with the required data - this was generated using #the R script write_slalom.R in the R folder if 0: annoDB = 'MSigDB' dFile = os.path.join(data_dir,'Buettneretal2015.hdf5') data = slalom.load_hdf5(dFile, anno=annoDB) #### #### #Option 2: load the annotation and the data directly if 1: #Annotation file annoFile = os.path.join(data_dir,'h.all.v5.0.symbols.gmt.txt') #MSigDB annoDB = 'MSigDB' if not os.path.exists(annoFile): raise Exception("MSigDB annotation file needs to be downloaded manually") #Note: the license of MSigDB does not permit redistribution of the raw annotation files. #Please register at http://software.broadinstitute.org/gsea/msigdb and download the #hallmark gene sets and place the file in data folder. #dataFile: csv file with log expresison values dataFile = os.path.join(data_dir,'Buettneretal.csv.gz') # note that the first column (row names) contains the cell cycle stage in numeric form data = slalom.utils.load_txt(dataFile=dataFile,annoFiles=annoFile,annoDBs=annoDB) #### ###alternatively the data can be loaded from the provided hdf5 file #dFile = 'Buettneretal2015.hdf5' #data = slalom.load_hdf5(dFile, data_dir=data_dir) #print statistics for the loaded dataset print ("Loaded {:d} cells, {:d} genes".format(data['Y'].shape[0],data['Y'].shape[1])) print ("Annotation: {:d} terms".format(len(data['terms']))) # # Initializing the slalom model # In[4]: #I: indicator matrix that assigns genes to pathways I = data['I'] #if loaded from the hdf file change to I = data['IMSigDB'] #Y: log expresison values Y = data['Y'] #terms: ther names of the terms terms = data['terms'] #gene_ids: the ids of the genes in Y gene_ids = data['genes'] #initialize FA instance, here using a Gaussian noise model and fitting 3 dense hidden factors FA = slalom.initFA(Y, terms,I, gene_ids=gene_ids, noise='gauss', nHidden=3, minGenes=15) # ## Model training # In[5]: #model training FA.train() #print diagnostics FA.printDiagnostics() # Next, we plot the results, including factor relevance, gene set augmentation and a scatter plot of the two most relevant factors, in this case G2M Checkpoint and P53 Pathway. # In[6]: #plot results fig = plotRelevance(FA) #get factors; analogous getters are implemented for relevance and weights (see docs) X = FA.getX(terms=['G2m checkpoint','P53 pathway']) #scatter plot of the top two factors fig = plotFactors(X=X, lab=data['lab'], terms=['G2m checkpoint','P53 pathway'], isCont=False) #the same plot can be generated by passing the FA object to the plot function #plotFactors(FA=FA,idx1=0,idx2=1, lab = data['lab'], isCont=False ) # In[7]: #visualize changes for the G2m checkpoint fig = plotLoadings(FA, 'G2m checkpoint', n_genes=20) # Finally, we can safe the model results as CSV files or using a .hdf5 file. # # # In[17]: #if not os.path.exists(out_dir): # os.makedirs(out_dir) #out_file_name = os.path.join(out_dir,FA.getName()+'.hdf5') #saveFA(FA, out_name=out_file_name