# import necessary modules:
import numpy as np
import pandas as pd
import patsy
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats as sm_stats
import matplotlib.pyplot as plt

%matplotlib inline

sns.set_style("white")

# import the dataset csv file into a Pandas dataframe object:
sleep = pd.read_csv('../Datasets/sleep.txt', sep='\t') 
sleep = sleep.dropna(axis=0)  # drop on axis 0 (i.e. rows)
sleep['log_BodyWt'] = np.log(sleep['BodyWt'])
sleep.info()

g = sns.jointplot("log_BodyWt", "TotalSleep", sleep)

# this command returns both X and y (hence "dmatrices" plural). See "dmatrix" if you just want X.
y, X = patsy.dmatrices('TotalSleep ~ log_BodyWt', sleep)

y

X

betas = np.array([12., -4.])   # the . after the number makes it a floating-point number rather than an integer.
yhat = np.inner(X, betas)


# this plot uses straight matplotlib commands, not seaborn:
plt.scatter(yhat, y)
plt.plot((-30, 40), (-30, 40), c=".2", ls="--");
plt.xlabel('yhat (predicted)')
plt.ylabel('y (real)')
plt.show()

errors = y - yhat  # absolute errors for each data point
squared_errors = errors ** 2  # squared errors
sse = squared_errors.sum()  # sum of squared errors
sse

fit = sm.GLM(y, X).fit() # fit the design matrices we specified above using Patsy...
# let's look at the fit summary:
print(fit.summary())

yhat_2 = fit.predict()

# note that internally, the fit.predict() function does something similar 
# to our inner product between X and beta, above.

plt.scatter(yhat_2, y)
plt.plot((-30, 40), (-30, 40), c=".2", ls="--");
plt.xlabel('yhat_2 (predicted)')
plt.ylabel('y (real)')
plt.show()

errors = y - yhat_2  # absolute errors for each data point
squared_errors = errors ** 2  # squared errors
sse2 = squared_errors.sum()  # sum of squared errors
sse2

fit_2 = smf.glm(formula='TotalSleep ~ log_BodyWt', data=sleep).fit()
print(fit_2.summary())

# first we add to the sleep data frame, make a data frame of the stuff we want to plot:
plots_df = sleep[['TotalSleep', 'log_BodyWt']].copy()
plots_df['yhat'] = fit.predict()
plots_df['resids'] = plots_df['TotalSleep'] - plots_df['yhat']  
# note the fit object also contains various residuals.
plots_df.head()

# plot predictions against real data:
plt.scatter(sleep['log_BodyWt'], sleep['TotalSleep'], c='k')
plt.plot(sleep['log_BodyWt'], fit.predict(), c='r')
plt.xlabel('Log Bodyweight')
plt.ylabel('Total Sleep')
plt.legend(['Prediction', 'Data'])
plt.show()

# plot residuals against bodyweight:
resids = sleep['TotalSleep'] - fit.predict()
plt.scatter(plots_df['log_BodyWt'], resids, c='k')
plt.plot((-8, 10), (0, 0), 'r--')
plt.xlabel('Log Bodyweight')
plt.ylabel('Residuals')
plt.tight_layout()
plt.show()

plt.hist(resids.values);

sns.lmplot('log_BodyWt', 'TotalSleep', data=sleep)
sns.despine(offset=10, trim=True);

sns.residplot('log_BodyWt', 'TotalSleep', data=sleep)
sns.despine(offset=10, trim=True);

sleep['danger_cat'] = 'low'  # set as a string, "low" for all.
sleep.loc[sleep['Danger'] > 3, 'danger_cat'] = 'high'  # set Danger=4 or 5 to "high"
sleep['danger_cat']

# how does total sleep appear to change according to our new categories?
sns.set_context('notebook')
g = sns.boxplot(sleep['TotalSleep'], groupby=sleep['danger_cat'])
sns.despine(ax=g, offset=10);

# Note how "violinplot" can be a bit misleading here, due to few cases and the kernel density smooth:
# g = sns.violinplot(sleep['TotalSleep'], groupby=sleep['danger_cat'])
# sns.despine(ax=g, offset=10);
plt.savefig('blah.pdf')

y, X = patsy.dmatrices('TotalSleep ~ danger_cat', sleep)
X

fit = smf.glm(formula='TotalSleep ~ danger_cat', data=sleep).fit()
print(fit.summary())

high_array = sleep['TotalSleep'][sleep['danger_cat']=='high']
low_array = sleep['TotalSleep'][sleep['danger_cat']=='low']

# how does this compare to an independent groups z-test?
t, p = sm_stats.weightstats.ztest(high_array, low_array)
'Independent samples z-test: z = {t}, p = {p}'.format(t=np.round(t, decimals=3), p=p)

'Model values: z = {t}, p = {p}'.format(
                        t=np.round(fit.tvalues[1], decimals=3), 
                        p=fit.pvalues[1])

fit_ols = smf.ols(formula='TotalSleep ~ danger_cat', data=sleep).fit()
print(fit_ols.summary())

t, p, df = sm_stats.weightstats.ttest_ind(low_array, high_array)
'Independent samples t-test: t({df}) = {t}, p = {p}'.format(
    t=np.round(t, decimals=3), df=int(df), p=p)

'Model values: t = {t}, p = {p}'.format(
                        t=np.round(fit_ols.tvalues[1], decimals=3), 
                        p=fit_ols.pvalues[1])

# using statsmodels, must first compute "descrstats":
d1 = sm_stats.weightstats.DescrStatsW(high_array)
t, p = d1.ztest_mean(0)
'Independent z-test: z = {t}, p = {p}'.format(
    t=np.round(t, decimals=3), p=p)

'Model values: z = {t}, p = {p}'.format(
                        t=np.round(fit.tvalues[0], decimals=3), 
                        p=fit.pvalues[0])


# cell to check "high" alone
high_only = sleep.copy()
high_only = high_only[high_only['danger_cat']=='high']
fit_high = smf.glm(formula='TotalSleep ~ 1', data=high_only).fit()
print(fit_high.summary())

sleep['life_cat'] = pd.qcut(sleep['LifeSpan'], [0, .33, .66, 1])
sleep['life_cat']

# category labels are a bit ugly, reflecting range of cuts:

# change category labels, then convert to object:
sleep['life_cat'] = sleep['life_cat'].cat.rename_categories(['short', 'med', 'long']) # rename categories
sleep['life_cat'] = sleep['life_cat'].astype('object')  # make it an object, rather than a category
sleep['life_cat']

y, X = patsy.dmatrices('TotalSleep ~ life_cat', sleep)
X

sleep['life_cat'].head()

X[0:5, :]  # first five rows, and second two cols of X

fit = smf.glm('TotalSleep ~ life_cat', sleep).fit()
print(fit.summary())

long_mean = fit.params[0]
short_mean = fit.params[0] + fit.params[2]
med_mean = fit.params[0] + fit.params[1]

print('the predictions are: \
      \n long = {} \
      \n med = {} \
      \n short = {}'.format(np.round(long_mean, decimals=1), 
                          np.round(med_mean, decimals=1), 
                          np.round(short_mean, decimals=1)))

g = sns.boxplot(sleep['TotalSleep'], groupby=sleep['life_cat'])
sns.despine(ax=g, offset=10);

from IPython.core.display import HTML

def css_styling():
    styles = open("../custom_style.css", "r").read()
    return HTML(styles)
css_styling()