# import necessary modules: import numpy as np import pandas as pd import patsy import seaborn as sns import statsmodels.api as sm import statsmodels.formula.api as smf import statsmodels.stats as sm_stats import matplotlib.pyplot as plt %matplotlib inline sns.set_style("white") # import the dataset csv file into a Pandas dataframe object: sleep = pd.read_csv('../Datasets/sleep.txt', sep='\t') sleep = sleep.dropna(axis=0) # drop on axis 0 (i.e. rows) sleep['log_BodyWt'] = np.log(sleep['BodyWt']) sleep.info() g = sns.jointplot("log_BodyWt", "TotalSleep", sleep) # this command returns both X and y (hence "dmatrices" plural). See "dmatrix" if you just want X. y, X = patsy.dmatrices('TotalSleep ~ log_BodyWt', sleep) y X betas = np.array([12., -4.]) # the . after the number makes it a floating-point number rather than an integer. yhat = np.inner(X, betas) # this plot uses straight matplotlib commands, not seaborn: plt.scatter(yhat, y) plt.plot((-30, 40), (-30, 40), c=".2", ls="--"); plt.xlabel('yhat (predicted)') plt.ylabel('y (real)') plt.show() errors = y - yhat # absolute errors for each data point squared_errors = errors ** 2 # squared errors sse = squared_errors.sum() # sum of squared errors sse fit = sm.GLM(y, X).fit() # fit the design matrices we specified above using Patsy... # let's look at the fit summary: print(fit.summary()) yhat_2 = fit.predict() # note that internally, the fit.predict() function does something similar # to our inner product between X and beta, above. plt.scatter(yhat_2, y) plt.plot((-30, 40), (-30, 40), c=".2", ls="--"); plt.xlabel('yhat_2 (predicted)') plt.ylabel('y (real)') plt.show() errors = y - yhat_2 # absolute errors for each data point squared_errors = errors ** 2 # squared errors sse2 = squared_errors.sum() # sum of squared errors sse2 fit_2 = smf.glm(formula='TotalSleep ~ log_BodyWt', data=sleep).fit() print(fit_2.summary()) # first we add to the sleep data frame, make a data frame of the stuff we want to plot: plots_df = sleep[['TotalSleep', 'log_BodyWt']].copy() plots_df['yhat'] = fit.predict() plots_df['resids'] = plots_df['TotalSleep'] - plots_df['yhat'] # note the fit object also contains various residuals. plots_df.head() # plot predictions against real data: plt.scatter(sleep['log_BodyWt'], sleep['TotalSleep'], c='k') plt.plot(sleep['log_BodyWt'], fit.predict(), c='r') plt.xlabel('Log Bodyweight') plt.ylabel('Total Sleep') plt.legend(['Prediction', 'Data']) plt.show() # plot residuals against bodyweight: resids = sleep['TotalSleep'] - fit.predict() plt.scatter(plots_df['log_BodyWt'], resids, c='k') plt.plot((-8, 10), (0, 0), 'r--') plt.xlabel('Log Bodyweight') plt.ylabel('Residuals') plt.tight_layout() plt.show() plt.hist(resids.values); sns.lmplot('log_BodyWt', 'TotalSleep', data=sleep) sns.despine(offset=10, trim=True); sns.residplot('log_BodyWt', 'TotalSleep', data=sleep) sns.despine(offset=10, trim=True); sleep['danger_cat'] = 'low' # set as a string, "low" for all. sleep.loc[sleep['Danger'] > 3, 'danger_cat'] = 'high' # set Danger=4 or 5 to "high" sleep['danger_cat'] # how does total sleep appear to change according to our new categories? sns.set_context('notebook') g = sns.boxplot(sleep['TotalSleep'], groupby=sleep['danger_cat']) sns.despine(ax=g, offset=10); # Note how "violinplot" can be a bit misleading here, due to few cases and the kernel density smooth: # g = sns.violinplot(sleep['TotalSleep'], groupby=sleep['danger_cat']) # sns.despine(ax=g, offset=10); plt.savefig('blah.pdf') y, X = patsy.dmatrices('TotalSleep ~ danger_cat', sleep) X fit = smf.glm(formula='TotalSleep ~ danger_cat', data=sleep).fit() print(fit.summary()) high_array = sleep['TotalSleep'][sleep['danger_cat']=='high'] low_array = sleep['TotalSleep'][sleep['danger_cat']=='low'] # how does this compare to an independent groups z-test? t, p = sm_stats.weightstats.ztest(high_array, low_array) 'Independent samples z-test: z = {t}, p = {p}'.format(t=np.round(t, decimals=3), p=p) 'Model values: z = {t}, p = {p}'.format( t=np.round(fit.tvalues[1], decimals=3), p=fit.pvalues[1]) fit_ols = smf.ols(formula='TotalSleep ~ danger_cat', data=sleep).fit() print(fit_ols.summary()) t, p, df = sm_stats.weightstats.ttest_ind(low_array, high_array) 'Independent samples t-test: t({df}) = {t}, p = {p}'.format( t=np.round(t, decimals=3), df=int(df), p=p) 'Model values: t = {t}, p = {p}'.format( t=np.round(fit_ols.tvalues[1], decimals=3), p=fit_ols.pvalues[1]) # using statsmodels, must first compute "descrstats": d1 = sm_stats.weightstats.DescrStatsW(high_array) t, p = d1.ztest_mean(0) 'Independent z-test: z = {t}, p = {p}'.format( t=np.round(t, decimals=3), p=p) 'Model values: z = {t}, p = {p}'.format( t=np.round(fit.tvalues[0], decimals=3), p=fit.pvalues[0]) # cell to check "high" alone high_only = sleep.copy() high_only = high_only[high_only['danger_cat']=='high'] fit_high = smf.glm(formula='TotalSleep ~ 1', data=high_only).fit() print(fit_high.summary()) sleep['life_cat'] = pd.qcut(sleep['LifeSpan'], [0, .33, .66, 1]) sleep['life_cat'] # category labels are a bit ugly, reflecting range of cuts: # change category labels, then convert to object: sleep['life_cat'] = sleep['life_cat'].cat.rename_categories(['short', 'med', 'long']) # rename categories sleep['life_cat'] = sleep['life_cat'].astype('object') # make it an object, rather than a category sleep['life_cat'] y, X = patsy.dmatrices('TotalSleep ~ life_cat', sleep) X sleep['life_cat'].head() X[0:5, :] # first five rows, and second two cols of X fit = smf.glm('TotalSleep ~ life_cat', sleep).fit() print(fit.summary()) long_mean = fit.params[0] short_mean = fit.params[0] + fit.params[2] med_mean = fit.params[0] + fit.params[1] print('the predictions are: \ \n long = {} \ \n med = {} \ \n short = {}'.format(np.round(long_mean, decimals=1), np.round(med_mean, decimals=1), np.round(short_mean, decimals=1))) g = sns.boxplot(sleep['TotalSleep'], groupby=sleep['life_cat']) sns.despine(ax=g, offset=10); from IPython.core.display import HTML def css_styling(): styles = open("../custom_style.css", "r").read() return HTML(styles) css_styling()