%%capture %load_ext rpy2.ipython %matplotlib inline from IPython.display import HTML from matplotlib import pyplot import os, numpy as np np.random.seed(0) import pandas, statsmodels.api as sm from selection.algorithms.lasso import lasso, data_carving import itable HTML(""" """) # Run with commit a9eec2db226088bfd89e3d61f9fbc20d933ae83a # from http://github.com/jonathan-taylor/selective-inference if not os.path.exists("NRTI_DATA.txt"): NRTI = pandas.read_table("http://hivdb.stanford.edu/pages/published_analysis/genophenoPNAS2006/DATA/NRTI_DATA.txt", na_values="NA") else: NRTI = pandas.read_table("NRTI_DATA.txt") NRTI_specific = [] NRTI_muts = [] mixtures = np.zeros(NRTI.shape[0]) for i in range(1,241): d = NRTI['P%d' % i] for mut in np.unique(d): if mut not in ['-','.'] and len(mut) == 1: test = np.equal(d, mut) if test.sum() > 10: NRTI_specific.append(np.array(np.equal(d, mut))) NRTI_muts.append("P%d%s" % (i,mut)) NRTI_specific = NRTI.from_records(np.array(NRTI_specific).T, columns=NRTI_muts) # Next, standardize the data, keeping only those where Y is not missing X_NRTI = np.array(NRTI_specific, np.float) Y = NRTI['3TC'] # shorthand keep = ~np.isnan(Y).astype(np.bool) X_NRTI = X_NRTI[np.nonzero(keep)]; Y=Y[keep] Y = np.array(np.log(Y), np.float); Y -= Y.mean() X_NRTI -= X_NRTI.mean(0)[None, :]; X_NRTI /= X_NRTI.std(0)[None,:] X = X_NRTI # shorthand # Design matrix # Columns are site / amino acid pairs X.shape # Variable names NRTI_muts[:10], len(NRTI_muts) %%capture # Fit OLS model and plot coefficients n, p = X.shape ols_fit = sm.OLS(Y, X).fit() sigma_3TC = np.linalg.norm(ols_fit.resid) / np.sqrt(n-p-1) OLS_3TC = ols_fit.params fig_3TC = pyplot.figure(figsize=(24,12)) ax_3TC = fig_3TC.gca() ax_3TC.bar(np.arange(1, len(NRTI_muts)+1)-0.5, OLS_3TC, label='OLS', color='red', alpha=0.7) ax_3TC.set_xticks(range(1, (len(NRTI_muts)+1))) ax_3TC.set_xticklabels(NRTI_muts, rotation='vertical', fontsize=18) ax_3TC.set_xlim([-1, len(NRTI_muts)+1]) ax_3TC.set_title(r'OLS coefficients for 3TC resistance, $\hat{\sigma}$=%0.1e' % sigma_3TC, fontsize=50) ax_3TC.set_ylabel('Parameter estimates', fontsize=30) ax_3TC.legend(fontsize=30) ; fig_3TC # Choose a LASSO parameter lambda_theoretical = [] for i in range(10000): lambda_theoretical.append(np.fabs(np.dot(X.T, np.random.standard_normal(n))).max()) lambda_theoretical = np.round((1 * np.mean(lambda_theoretical) * sigma_3TC)) lambda_theoretical from selection.algorithms.lasso import lasso lasso_3TC = lasso(Y, X, lambda_theoretical, sigma=sigma_3TC) lasso_3TC.fit() active_3TC = [NRTI_muts[i] for i in lasso_3TC.active] active_3TC %%capture ax_3TC.bar(np.arange(1, len(NRTI_muts)+1)-0.5, lasso_3TC.soln, label='LASSO', color='gray') ax_3TC.set_title(r'LASSO coefficients ($\lambda=%0.1f$)' % lambda_theoretical, fontsize=50) ax_3TC.legend(fontsize=30, loc='upper left') fig_3TC %%capture selected_OLS = sm.OLS(Y, X[:,lasso_3TC.active]).fit() OLS_lower, OLS_upper = np.asarray(selected_OLS.conf_int()).T selective_lower = lasso_3TC.intervals['lower'] # [i[0] for _, _, _, i in lasso_3TC.intervals] selective_upper = lasso_3TC.intervals['upper'] # [i[1] for _, _, _, i in lasso_3TC.intervals] selective_intervals = ([['Mutation', 'OLS Lower', 'OLS Upper', 'Selective Lower', 'Selective Upper']] + zip(active_3TC, OLS_lower, OLS_upper, selective_lower, selective_upper)) %%capture fig_select = pyplot.figure(figsize=(24,12)) ax_select = fig_select.gca() ax_select.bar(np.arange(1, len(active_3TC)+1)-0.5, selected_OLS.params, color='grey', alpha=0.5) ax_select.set_xticks(range(1, (len(active_3TC)+1))) ax_select.set_xticklabels(active_3TC, rotation='vertical', fontsize=18) ax_select.set_xlim([0, len(active_3TC)+1]) ax_select.set_title(r'Selective' % sigma_3TC, fontsize=50) ax_select.set_ylabel('Parameter values', fontsize=30) ax_select.errorbar(np.arange(1, (len(active_3TC)+1)), selected_OLS.params, yerr=[selected_OLS.params - OLS_lower, OLS_upper - selected_OLS.params], capsize=10, capthick=5, fmt=None, label='OLS (no coverage guarantees)', elinewidth=3, ecolor='k') ax_select.legend(fontsize=30, loc='upper left', numpoints=1) ax_select.plot([0, len(active_3TC)+1], [0,0], 'k--') fig_select %%capture ax_select.errorbar(np.arange(1, (len(active_3TC)+1)), selected_OLS.params, yerr=[selected_OLS.params - selective_lower, selective_upper - selected_OLS.params], capsize=10, capthick=5, fmt=None, label='Selective', elinewidth=3, ecolor='r') ax_select.legend(fontsize=30, loc='upper left', numpoints=1) fig_select fig_select fig_select %%R -i X,Y library(lars) plot(lars(X, as.numeric(Y), type='lar')) fig_select selective_pvalues = pandas.DataFrame({'Mutation':active_3TC, 'Naive OLS':selected_OLS.pvalues, 'Selective':[p for _, p in lasso_3TC.active_pvalues]}) pvalue_table = itable.PrettyTable(selective_pvalues, tstyle=itable.TableStyle(theme="theme1"), center=True) signif_select = selective_pvalues['Selective'] < 0.05 signif_OLS = selective_pvalues['Naive OLS'] < 0.05 pvalue_table.set_cell_style(cols=[1], rows=np.nonzero(signif_OLS)[0], color='red', font_weight='bold', format_function=lambda p: "%0.3f" % p) pvalue_table.set_cell_style(cols=[1], rows=np.nonzero(~signif_OLS)[0], format_function=lambda p: "%0.3f" % p) pvalue_table.set_cell_style(cols=[2], rows=np.nonzero(signif_select)[0], color='red', font_weight='bold', format_function=lambda p: "%0.3f" % p) pvalue_table.set_cell_style(cols=[2], rows=np.nonzero(~signif_select)[0], format_function=lambda p: "%0.3f" % p) pvalue_table %%capture active_carve, selective_pval_carve, selective_interval_carve, split_pval, split_interval = [], [], [], [], [] results = data_carving(Y, X, lam_frac=1., sigma=sigma_3TC, burnin=5000, ndraw=20000, split_frac=0.9, splitting=True)[0] for result in results: active_carve.append(NRTI_muts[result[0]]) selective_pval_carve.append(result[1]) selective_interval_carve.append(result[2]) split_pval.append(result[3]) split_interval.append(result[4]) selective_interval_carve = np.array(selective_interval_carve).T split_interval = np.array(split_interval).T split_OLS = np.mean(split_interval, 0) fig_carve = pyplot.figure(figsize=(24,12)) ax_carve = fig_carve.gca() ax_carve.bar(np.arange(1, len(active_carve)+1)-0.5, split_OLS, color='grey', alpha=0.5) ax_carve.set_xticks(range(1, (len(active_carve)+1))) ax_carve.set_xticklabels([v[1:] for v in active_carve], rotation='vertical', fontsize=18) ax_carve.set_xlim([0, len(active_carve)+1]) ax_carve.set_title(r'Intervals of selected model using 90% for selection', fontsize=50) ax_carve.set_ylabel('Parameter values', fontsize=30) ax_carve.errorbar(np.arange(1, (len(active_carve)+1)), split_OLS, yerr=[split_OLS - split_interval[0], split_interval[1] - split_OLS], capsize=10, capthick=5, fmt=None, label='Data splitting', elinewidth=3, ecolor='k') ax_carve.legend(fontsize=30, loc='upper left', numpoints=1) ax_carve.plot([0, len(active_carve)+1], [0,0], 'k--') fig_carve %%capture ax_carve.errorbar(np.arange(1, (len(active_carve)+1)), split_OLS, yerr=[split_OLS - selective_interval_carve[0], selective_interval_carve[1] - split_OLS], capsize=10, capthick=5, fmt=None, label='Data carving', elinewidth=3, ecolor='r') ax_carve.legend(fontsize=30, loc='upper left', numpoints=1) fig_carve fig_carve carve_pvalues = pandas.DataFrame({'Mutation':active_carve, 'Data splitting':split_pval, 'Data carving':selective_pval_carve}) carve_pvalues = carve_pvalues.reindex_axis(['Mutation', 'Data splitting', 'Data carving'], axis=1) carve_pvalue_table = itable.PrettyTable(carve_pvalues, tstyle=itable.TableStyle(theme="theme1"), center=True) signif_carve = carve_pvalues['Data carving'] < 0.05 signif_split = carve_pvalues['Data splitting'] < 0.05 carve_pvalue_table.set_cell_style(cols=[1], rows=np.nonzero(signif_split)[0], color='red', font_weight='bold', format_function=lambda p: "%0.3f" % p) carve_pvalue_table.set_cell_style(cols=[1], rows=np.nonzero(~signif_split)[0], format_function=lambda p: "%0.3f" % p) carve_pvalue_table.set_cell_style(cols=[2], rows=np.nonzero(signif_carve)[0], color='red', font_weight='bold', format_function=lambda p: "%0.3f" % p) carve_pvalue_table.set_cell_style(cols=[2], rows=np.nonzero(~signif_carve)[0], format_function=lambda p: "%0.3f" % p) carve_pvalue_table fig_carve carve_pvalue_table import os def build(): cmd = ''' ipython nbconvert --to slides --profile=stats --reveal-prefix=http://statweb.stanford.edu/~jtaylo/reveal.js %(title)s.ipynb; mkdir www; cp -r %(title)s.slides.html www/index.html; cp fig_lasso*png www; cp convex.svg www; cp %(title)s.ipynb www; ''' % {'title':'sfu'} print cmd os.system(cmd) build()