#!/usr/bin/env python # coding: utf-8 # # Setting up # In[1]: # activiate inline plotting get_ipython().run_line_magic('matplotlib', 'inline') from setup import * # In[2]: if 0: file_name = tutorial_data.get_file('arab107') geno_reader = gr.genotype_reader_tables(file_name) pheno_reader = phr.pheno_reader_tables(file_name) #the data object allows to query specific genotype or phenotype data dataset = data.QTLData(geno_reader=geno_reader,pheno_reader=pheno_reader) # In[3]: if 1: #the data used in this study have been pre-converted into an hdf5 file. #to preprocess your own data, please use limix command line tool file_name = tutorial_data.get_file('BYxRM') geno_reader = gr.genotype_reader_tables(file_name) pheno_reader = phr.pheno_reader_tables(file_name) #the data object allows to query specific genotype or phenotype data dataset = data.QTLData(geno_reader=geno_reader,pheno_reader=pheno_reader) # In[4]: #choosing which phentoypes to model jointly ? phenotypes,sample_idx = dataset.getPhenotypes(intersection=True) phenotype_names = dataset.phenotype_ID pl.figure(figsize=[20,20]) Ce= sp.cov(phenotypes.values.T) pl.imshow(Ce,aspect='auto',interpolation='none') pl.xticks(sp.arange(len(phenotype_names)),phenotype_names,rotation=90) pl.yticks(sp.arange(len(phenotype_names)),phenotype_names,rotation=0) pl.colorbar() # In[5]: #select subset of phenotypes #flowering phenotypes (A. thaliana data) phenotype_names = ['5_FT10','6_FT16','7_FT22'] #a larger set of correlated and anti correlated traits: #phenotype_names = ['Ethanol','Congo_red','Galactose'] #YPD, different temperatures: phenotype_names = ['YPD:37C','YPD:15C','YPD:4C'] phenotype_query = "(phenotype_ID in %s)" % str(phenotype_names) data_subsample = dataset.subsample_phenotypes(phenotype_query=phenotype_query, intersection=True) sample_relatedness_unnormalized = data_subsample.getCovariance(normalize=True,center=True) sample_relatedness = sample_relatedness_unnormalized/ \ sample_relatedness_unnormalized.diagonal().mean() if 1: #subsample for speed? Is = sp.arange(dataset.geno_pos.shape[0]) #take every 10th SNP Is = Is[::10] data_subsample = data_subsample.subsample(cols_geno = Is) #get variables we need from data snps = data_subsample.getGenotypes(center=False,unit=False) position = data_subsample.getPos() position,chromBounds = data_util.estCumPos(position=position,offset=100000) phenotypes,sample_idx = data_subsample.getPhenotypes(phenotype_query=phenotype_query, intersection=True) phenotype_std = preprocess.rankStandardizeNormal(phenotypes.values) N = snps.shape[0] S = snps.shape[1] P = phenotypes.shape[1] print "loaded %d samples, %d phenotypes, %s snps" % (N,P,S) # In[6]: #preprocess phenotypes using boxcox if 0: phenotype_vals_boxcox, maxlog = preprocess.boxcox(phenotypes.values) phenotype_vals_boxcox -= phenotype_vals_boxcox.mean(axis=0) phenotype_vals_boxcox /= phenotype_vals_boxcox.std(axis=0) phenotypes.ix[:,:] = phenotype_vals_boxcox # # Correlation between phentypes # * We start by examining the correlation between phenotypes # In[7]: #pairwise corrrelations of the first 3 traits pl.figure(figsize=[15,5]) pl.subplot(1,3,1) pl.plot(phenotypes[phenotype_names[0]].values,phenotypes[phenotype_names[1]].values,'.') pl.xlabel(phenotype_names[0]) pl.ylabel(phenotype_names[1]) pl.subplot(1,3,2) pl.plot(phenotypes[phenotype_names[1]].values,phenotypes[phenotype_names[2]].values,'.') pl.xlabel(phenotype_names[1]) pl.ylabel(phenotype_names[2]) pl.subplot(1,3,3) pl.plot(phenotypes[phenotype_names[0]].values,phenotypes[phenotype_names[2]].values,'.') pl.xlabel(phenotype_names[0]) pl.ylabel(phenotype_names[2]) # In[8]: Ce = sp.corrcoef(phenotypes.T) sample_relatedness = data_subsample.getCovariance() #set parameters for the analysis N, G = phenotypes.shape # variance component model vc = var.VarianceDecomposition(phenotypes.values) vc.addFixedEffect() vc.addRandomEffect(K=sample_relatedness,trait_covar_type='freeform') vc.addRandomEffect(is_noise=True,trait_covar_type='freeform') vc.optimize() # retrieve geno and noise covariance matrix Cg = vc.getTraitCovar(0) Cn = vc.getTraitCovar(1) # In[9]: pl.figure(figsize=[15,5]) pl.subplot(1,3,1) pl.imshow(Ce,aspect='auto',interpolation='none',vmin=-1,vmax=1) pl.xticks(sp.arange(len(phenotype_names)),phenotypes.columns) pl.yticks(sp.arange(len(phenotype_names)),phenotypes.columns) pl.title('empirical correlation') pl.subplot(1,3,2) pl.imshow(Cg,aspect='auto',interpolation='none',vmin=-1,vmax=1) pl.xticks(sp.arange(len(phenotype_names)),phenotypes.columns) pl.yticks(sp.arange(len(phenotype_names)),phenotypes.columns) pl.title('genetic covariance') pl.subplot(1,3,3) pl.imshow(Cn,aspect='auto',interpolation='none',vmin=-1,vmax=1) pl.xticks(sp.arange(len(phenotype_names)),phenotypes.columns) pl.yticks(sp.arange(len(phenotype_names)),phenotypes.columns) pl.title('noise covariance') pl.colorbar() # In[10]: #convert P-values to a DataFrame for nice output writing: lmm = qtl.test_lmm(snps=snps[sample_idx],pheno=phenotypes.values,K=sample_relatedness) #convert P-values to a DataFrame for nice output writing: pvalues_lmm = pd.DataFrame(data=lmm.getPv().T,index=data_subsample.geno_ID, columns=phenotype_names) # In[11]: # Genome-wide manhatton plots for one phenotype: for p_ID in phenotype_names: pl.figure(figsize=[15,4]) plot_manhattan(posCum=position['pos_cum'],pv=pvalues_lmm[p_ID].values,chromBounds=chromBounds,thr_plotting=0.05) pl.title(p_ID) # #Any effect test # In[12]: P = phenotypes.values.shape[1] covs = None #covariates Acovs = None #the design matrix for the covariates Asnps = sp.eye(P) #the design matrix for the SNPs K1r = sample_relatedness #the first sample-sample covariance matrix (non-noise) K2r = sp.eye(N) #the second sample-sample covariance matrix (noise) K1c = None #the first phenotype-phenotype covariance matrix (non-noise) K2c = None #the second phenotype-phenotype covariance matrix (noise) covar_type = 'freeform' #the type of the trait/trait covariance to be estimated searchDelta = False #specify if delta should be optimized for each SNP test="lrt" #specify type of statistical test # Running the analysis # when cov are not set (None), LIMIX considers an intercept (covs=SP.ones((N,1))) lmm, pvalues = qtl.test_lmm_kronecker(snps=snps[sample_idx],phenos=phenotypes.values,covs=covs,Acovs=Acovs, Asnps=Asnps,K1r=K1r,trait_covar_type=covar_type) #convert P-values to a DataFrame for nice output writing: pvalues = pd.DataFrame(data=pvalues.T,index=data_subsample.geno_ID,columns=['multi_trait']) # In[13]: pl.figure(figsize=[15,4]) thr = 0.1/snps.shape[1] plot_manhattan(posCum=position['pos_cum'],thr=thr,pv=pvalues['multi_trait'].values,chromBounds=chromBounds,thr_plotting=0.05) pl.title('Any effect test') # #Testing for common effects # A common effect test is a 1 degree of freedom test and can be done by setting # \begin{equation} # \mathbf{A}_1^\text{(snp)} = \mathbf{1}_{1,P},\;\;\; # \mathbf{A}_0^\text{(snp)} = \mathbf{0} # \end{equation} # In[14]: P = phenotypes.values.shape[1] covs = None #covariates Acovs = None #the design matrix for the covariates Asnps = sp.ones((1,P)) #the design matrix for the SNPs K1r = sample_relatedness #the first sample-sample covariance matrix (non-noise) K2r = sp.eye(N) #the second sample-sample covariance matrix (noise) K1c = None #the first phenotype-phenotype covariance matrix (non-noise) K2c = None #the second phenotype-phenotype covariance matrix (noise) covar_type = 'freeform' #the type of the trait/trait covariance to be estimated searchDelta = False #specify if delta should be optimized for each SNP test="lrt" #specify type of statistical test # Running the analysis # when cov are not set (None), LIMIX considers an intercept (covs=SP.ones((N,1))) lmm, pvalues_common = qtl.test_lmm_kronecker(snps=snps[sample_idx],phenos=phenotypes.values,covs=covs,Acovs=Acovs, Asnps=Asnps,K1r=K1r,trait_covar_type=covar_type) #convert P-values to a DataFrame for nice output writing: pvalues_common = pd.DataFrame(data=pvalues_common.T,index=data_subsample.geno_ID,columns=['common']) # In[15]: pl.figure(figsize=[15,4]) plot_manhattan(posCum=position['pos_cum'],pv=pvalues_common['common'].values,chromBounds=chromBounds,thr_plotting=0.1) pl.title('common') # #Testing for GxE (specific effect test) # For a specifc effect test for trait $p$ # the alternative model is set to have both a common and a specific effect # for transcript $p$ from the SNP while the null model has only a common effect. # # It is a 1 degree of freedom test and, # in the particular case of $P=3$ traits and for $p=0$, it can be done by setting # \begin{equation} # \mathbf{A}_1^\text{(snp)} = # \begin{pmatrix} # 1 & 0 & 0 \\ # 1 & 1 & 1 # \end{pmatrix} # \;\;\;, # \mathbf{A}_0^\text{(snp)} = \mathbf{1}_{1,3} # \end{equation} # In[16]: Asnps0 = sp.ones((1,P)) #the null model design matrix for the SNPs Asnps1 = sp.zeros((2,P)) #the alternative model design matrix for the SNPs Asnps1[0,:] = 1.0 Asnps1[1,0] = 1.0 # Running the analysis # when cov are not set (None), LIMIX considers an intercept (covs=SP.ones((N,1))) pvalues_inter = qtl.test_interaction_lmm_kronecker(snps=snps[sample_idx],phenos=phenotypes.values,covs=covs,Acovs=Acovs, Asnps0=Asnps0,Asnps1=Asnps1,K1r=K1r,trait_covar_type=covar_type) print "Design(0): \n"+str(Asnps0) print "Design(Alt): \n"+str(Asnps1) #convert P-values to a DataFrame for nice output writing: pvalues_inter = pd.DataFrame(data=sp.concatenate(pvalues_inter).T,index=data_subsample.geno_ID, columns=["specific","null_common","alternative_any"]) # In[17]: pl.figure(figsize=[15,4]) plot_manhattan(posCum=position['pos_cum'],pv=pvalues_inter['specific'].values,chromBounds=chromBounds,thr_plotting=0.1) pl.title('specific') # In[18]: tests = ['Any effect test','Interaction effect test'] pl.figure(figsize=[15,10]) #lim = -1.2*sp.log10(SP.array([pvalues['alternative_any'].min(), # pvalues_inter['null_common'].min(),pvalues_inter['specific'].min()]).min()) plt = pl.subplot(2,1,1) plot_manhattan(position['pos_cum'],pvalues_inter['alternative_any'], chromBounds,colorS='k',colorNS='k',alphaNS=0.05,labelS='any') plot_manhattan(position['pos_cum'],pvalues_inter['null_common'], chromBounds,colorS='y',colorNS='y',alphaNS=0.05,labelS='common') plot_manhattan(position['pos_cum'],pvalues_inter['specific'], chromBounds,colorS='r',colorNS='r',alphaNS=0.05,labelS='specific') pl.legend(loc='upper left') # In[ ]: # In[ ]: # In[ ]: