%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import brewer2mpl
from matplotlib import rcParams

# colorbrewer2 Dark2 qualitative color table
dark2_cmap = brewer2mpl.get_map('Dark2', 'Qualitative', 7)
dark2_colors = dark2_cmap.mpl_colors

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 20
rcParams['patch.edgecolor'] = 'white'
rcParams['patch.facecolor'] = dark2_colors[0]
rcParams['font.family'] = 'StixGeneral'

def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Minimize chartjunk by stripping out unnecessary plot borders and axis ticks

    The top/right/left/bottom keywords toggle whether the corresponding plot border is drawn
    """
    ax = axes or plt.gca()
    ax.spines['top'].set_visible(top)
    ax.spines['right'].set_visible(right)
    ax.spines['left'].set_visible(left)
    ax.spines['bottom'].set_visible(bottom)

    # turn off all ticks
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')

    # now re-enable visibles
    if top:
        ax.xaxis.tick_top()
    if bottom:
        ax.xaxis.tick_bottom()
    if left:
        ax.yaxis.tick_left()
    if right:
        ax.yaxis.tick_right()

import csv

results = []
path = "train.csv"
with open(path, "r") as d:
    header = d.next().strip("\n").split(",")
    for line in d:
        results.append(line.strip("\n").split(","))
data = pd.DataFrame(data=np.asarray(results), columns=header)

from datetime import datetime, date, time

data['hour'] = data['datetime'].map(lambda x: (datetime.strptime(x, "%Y-%m-%d %H:%M:%S")).hour)
data['weekday'] = data['datetime'].map(lambda x: (datetime.strptime(x, "%Y-%m-%d %H:%M:%S")).weekday())
data['month'] = data['datetime'].map(lambda x: (datetime.strptime(x, "%Y-%m-%d %H:%M:%S")).month)
data.head()

data['temp'] = data['temp'].astype(float)
data['atemp'] = data['atemp'].astype(float)
data['humidity'] = data['humidity'].astype(float)
data['windspeed'] = data['windspeed'].astype(float)
data['casual'] = data['casual'].astype(int)
data['registered'] = data['registered'].astype(int)
data['count'] = data['count'].astype(int)

season = pd.get_dummies(data=data.iloc[:, 1], prefix='season')
holiday = pd.get_dummies(data=data.iloc[:, 2], prefix='holiday')
workingday = pd.get_dummies(data=data.iloc[:, 3], prefix='workingday')
weather = pd.get_dummies(data=data.iloc[:, 4], prefix='weather')
weekday = pd.get_dummies(data=data.iloc[:, 12], prefix='weekday')
hour = pd.get_dummies(data=data.iloc[:, 13], prefix='hour')
month = pd.get_dummies(data=data.iloc[:, 14], prefix='month')

finalDF = data.iloc[:, 5:9]  # numerical

# categorical
finalDF = finalDF.join(season)
finalDF = finalDF.join(holiday)
finalDF = finalDF.join(workingday)
finalDF = finalDF.join(weather)
finalDF = finalDF.join(weekday)
finalDF = finalDF.join(hour)
finalDF = finalDF.join(month)

finalDF = finalDF.join(data.iloc[:, 9:12])  # labels

print finalDF.shape
finalDF.head()
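# Aside (not part of the original pipeline): on a reasonably recent pandas, the manual csv
# parsing and dtype conversions above can usually be replaced by pandas' own reader, which
# parses the datetime column in one step. A minimal commented sketch, assuming the same
# "train.csv" layout and a pandas version with the .dt accessor:
# data = pd.read_csv("train.csv", parse_dates=['datetime'])
# data['hour'] = data['datetime'].dt.hour
# data['weekday'] = data['datetime'].dt.weekday
# data['month'] = data['datetime'].dt.month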
fig, axes = plt.subplots(figsize=(35, 14), nrows=2, ncols=3)
fig.subplots_adjust(hspace=0.3)

sum_dataREG = data.groupby('weekday').registered.sum()
count_dataREG = data.groupby('weekday').registered.count()
average_dataREG = sum_dataREG / count_dataREG
sum_dataCAS = data.groupby('weekday').casual.sum()
count_dataCAS = data.groupby('weekday').casual.count()
average_dataCAS = sum_dataCAS / count_dataCAS
index = np.arange(len(sum_dataCAS))
axes[0][0].bar(index, average_dataREG, color="r", label="registered users")
axes[0][0].bar(index, average_dataCAS, color="b", label="casual users")
axes[0][0].set_ylabel("Average bikes rented")
axes[0][0].set_xlabel("Weekday")
axes[0][0].set_title("Average number of rentals by day of the week")
axes[0][0].legend()
remove_border(axes[0][0])

sum_dataREG = data.groupby('hour').registered.sum()
count_dataREG = data.groupby('hour').registered.count()
average_dataREG = sum_dataREG / count_dataREG
sum_dataCAS = data.groupby('hour').casual.sum()
count_dataCAS = data.groupby('hour').casual.count()
average_dataCAS = sum_dataCAS / count_dataCAS
index = np.arange(len(sum_dataCAS))
axes[0][1].bar(index, average_dataREG, color="r", label="registered users")
axes[0][1].bar(index, average_dataCAS, color="b", label="casual users")
axes[0][1].set_ylabel("Average bikes rented")
axes[0][1].set_xlabel("Hour of day")
axes[0][1].set_title("Average number of rentals by hour of day")
axes[0][1].legend()
remove_border(axes[0][1])

sum_dataREG = data.groupby('month').registered.sum()
count_dataREG = data.groupby('month').registered.count()
average_dataREG = sum_dataREG / count_dataREG
sum_dataCAS = data.groupby('month').casual.sum()
count_dataCAS = data.groupby('month').casual.count()
average_dataCAS = sum_dataCAS / count_dataCAS
index = np.arange(len(sum_dataCAS))
axes[0][2].bar(index, average_dataREG, color="r", label="registered users")
axes[0][2].bar(index, average_dataCAS, color="b", label="casual users")
axes[0][2].set_ylabel("Average bikes rented")
axes[0][2].set_xlabel("Month of year")
axes[0][2].set_title("Average number of rentals by month of year")
axes[0][2].legend()
remove_border(axes[0][2])

sum_dataREG = data.groupby('workingday').registered.sum()
count_dataREG = data.groupby('workingday').registered.count()
average_dataREG = sum_dataREG / count_dataREG
sum_dataCAS = data.groupby('workingday').casual.sum()
count_dataCAS = data.groupby('workingday').casual.count()
average_dataCAS = sum_dataCAS / count_dataCAS
index = np.arange(len(sum_dataCAS))
axes[1][0].bar(index, average_dataREG, color="r", label="registered users")
axes[1][0].bar(index, average_dataCAS, color="b", label="casual users")
axes[1][0].set_ylabel("Average bikes rented")
axes[1][0].set_title("Average number of rentals by working day")
axes[1][0].legend()
axes[1][0].set_xticklabels(["", "", "Non-Working", "", "", "", "", "Working"])
remove_border(axes[1][0])

sum_dataREG = data.groupby('holiday').registered.sum()
count_dataREG = data.groupby('holiday').registered.count()
average_dataREG = sum_dataREG / count_dataREG
sum_dataCAS = data.groupby('holiday').casual.sum()
count_dataCAS = data.groupby('holiday').casual.count()
average_dataCAS = sum_dataCAS / count_dataCAS
index = np.arange(len(sum_dataCAS))
axes[1][1].bar(index, average_dataREG, color="r", label="registered users")
axes[1][1].bar(index, average_dataCAS, color="b", label="casual users")
axes[1][1].set_ylabel("Average bikes rented")
axes[1][1].set_title("Average number of rentals by holiday")
axes[1][1].legend()
axes[1][1].set_xticklabels(["", "", "Non-Holiday", "", "", "", "", "Holiday"])
remove_border(axes[1][1])

fig.delaxes(axes[1][2])
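# Aside: the sum/count/average pattern repeated in each panel above (and in the weather
# figure below) is just a groupby mean; a small helper along these lines would shrink each
# panel to one call. This helper is hypothetical and is not used by the original cells.
def plot_average_by(ax, frame, by, title, xlabel=None):
    """Bar-plot the mean registered and casual rentals grouped by `by` on the given axes."""
    avg = frame.groupby(by)[['registered', 'casual']].mean()
    idx = np.arange(len(avg))
    ax.bar(idx, avg['registered'], color="r", label="registered users")
    ax.bar(idx, avg['casual'], color="b", label="casual users")
    ax.set_ylabel("Average bikes rented")
    if xlabel is not None:
        ax.set_xlabel(xlabel)
    ax.set_title(title)
    ax.legend()
    remove_border(ax)
# Example usage:
# plot_average_by(axes[0][0], data, 'weekday', "Average number of rentals by day of the week", "Weekday")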
fig, axes = plt.subplots(figsize=(35, 16), nrows=2, ncols=3)
fig.subplots_adjust(hspace=0.5)

sum_dataREG = data.groupby('temp').registered.sum()
count_dataREG = data.groupby('temp').registered.count()
average_dataREG = sum_dataREG / count_dataREG
sum_dataCAS = data.groupby('temp').casual.sum()
count_dataCAS = data.groupby('temp').casual.count()
average_dataCAS = sum_dataCAS / count_dataCAS
index = np.arange(len(sum_dataCAS))
axes[0][0].bar(index, average_dataREG, color="r", label="registered users")
axes[0][0].bar(index, average_dataCAS, color="b", label="casual users")
remove_border(axes[0][0])
axes[0][0].set_ylabel("Average bikes rented")
axes[0][0].set_xlabel("Temperature (Celsius)")
axes[0][0].set_title("Average number of rentals according to temperature", y=1.1)
axes[0][0].legend()

sum_dataREG = data.groupby('weather').registered.sum()
count_dataREG = data.groupby('weather').registered.count()
average_dataREG = sum_dataREG / count_dataREG
sum_dataCAS = data.groupby('weather').casual.sum()
count_dataCAS = data.groupby('weather').casual.count()
average_dataCAS = sum_dataCAS / count_dataCAS
index = np.arange(len(sum_dataCAS))
axes[0][1].bar(index, average_dataREG, color="r", label="registered users")
axes[0][1].bar(index, average_dataCAS, color="b", label="casual users")
remove_border(axes[0][1])
axes[0][1].set_ylabel("Average bikes rented")
axes[0][1].set_xlabel("Weather")
axes[0][1].set_title("Average number of rentals according to weather encoding", y=1.1)
axes[0][1].legend()

sum_dataREG = data.groupby('season').registered.sum()
count_dataREG = data.groupby('season').registered.count()
average_dataREG = sum_dataREG / count_dataREG
sum_dataCAS = data.groupby('season').casual.sum()
count_dataCAS = data.groupby('season').casual.count()
average_dataCAS = sum_dataCAS / count_dataCAS
index = np.arange(len(sum_dataCAS))
axes[0][2].bar(index, average_dataREG, color="r", label="registered users")
axes[0][2].bar(index, average_dataCAS, color="b", label="casual users")
remove_border(axes[0][2])
axes[0][2].set_ylabel("Average bikes rented")
axes[0][2].set_xlabel("Season")
axes[0][2].set_title("Average number of rentals according to season encoding", y=1.1)
axes[0][2].legend()

sum_dataREG = data.groupby('atemp').registered.sum()
count_dataREG = data.groupby('atemp').registered.count()
average_dataREG = sum_dataREG / count_dataREG
sum_dataCAS = data.groupby('atemp').casual.sum()
count_dataCAS = data.groupby('atemp').casual.count()
average_dataCAS = sum_dataCAS / count_dataCAS
index = np.arange(len(sum_dataCAS))
axes[1][0].bar(index, average_dataREG, color="r", label="registered users")
axes[1][0].bar(index, average_dataCAS, color="b", label="casual users")
remove_border(axes[1][0])
axes[1][0].set_ylabel("Average bikes rented")
axes[1][0].set_xlabel("Feels Like Temperature (Celsius)")
axes[1][0].set_title("Average number of rentals according to 'feels like' temperature", y=1.1)
axes[1][0].legend()

sum_dataREG = data.groupby('humidity').registered.sum()
count_dataREG = data.groupby('humidity').registered.count()
average_dataREG = sum_dataREG / count_dataREG
sum_dataCAS = data.groupby('humidity').casual.sum()
count_dataCAS = data.groupby('humidity').casual.count()
average_dataCAS = sum_dataCAS / count_dataCAS
index = np.arange(len(sum_dataCAS))
axes[1][1].bar(index, average_dataREG, color="r", label="registered users")
axes[1][1].bar(index, average_dataCAS, color="b", label="casual users")
remove_border(axes[1][1])
axes[1][1].set_ylabel("Average bikes rented")
axes[1][1].set_xlabel("Humidity")
axes[1][1].set_title("Average number of rentals according to humidity", y=1.1)
axes[1][1].legend()
sum_dataREG = data.groupby('windspeed').registered.sum()
count_dataREG = data.groupby('windspeed').registered.count()
average_dataREG = sum_dataREG / count_dataREG
sum_dataCAS = data.groupby('windspeed').casual.sum()
count_dataCAS = data.groupby('windspeed').casual.count()
average_dataCAS = sum_dataCAS / count_dataCAS
index = np.arange(len(sum_dataCAS))
axes[1][2].bar(index, average_dataREG, color="r", label="registered users")
axes[1][2].bar(index, average_dataCAS, color="b", label="casual users")
remove_border(axes[1][2])
axes[1][2].set_ylabel("Average bikes rented")
axes[1][2].set_xlabel("Windspeed")
axes[1][2].set_title("Average number of rentals according to windspeed", y=1.1)
axes[1][2].legend()

weather_interest = ['temp', 'atemp', 'humidity', 'windspeed', 'season', 'weather', 'hour', 'month']
_ = pd.scatter_matrix(data[weather_interest].astype(float), figsize=(18, 18), alpha=0.8, diagonal='hist')

from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression

# Create datasets for total, casual and registered users
y_data = finalDF['count']
x_data = finalDF.drop('count', 1)
x_data = x_data.drop('registered', 1)
x_data = x_data.drop('casual', 1)

# casual
y_data_casual = finalDF['casual']
x_data_casual = finalDF.drop('casual', 1)
x_data_casual = x_data_casual.drop('registered', 1)
x_data_casual = x_data_casual.drop('count', 1)

# registered
y_data_registered = finalDF['registered']
x_data_registered = finalDF.drop('casual', 1)
x_data_registered = x_data_registered.drop('registered', 1)
x_data_registered = x_data_registered.drop('count', 1)

xtrain_reg, xtest_reg, ytrain_reg, ytest_reg = train_test_split(x_data_registered, y_data_registered)
xtrain_cas, xtest_cas, ytrain_cas, ytest_cas = train_test_split(x_data_casual, y_data_casual)
xtrain, xtest, ytrain, ytest = train_test_split(x_data, y_data)

clf = LinearRegression()
clf_cas = LinearRegression()
clf_reg = LinearRegression()
clf.fit(xtrain, ytrain)
clf_cas.fit(xtrain_cas, ytrain_cas)
clf_reg.fit(xtrain_reg, ytrain_reg)

# The check below searched for negative count predictions, as we assumed these were why we were unable to calculate our log error
# check = [e for e in clf.predict(xtest) if e <= -1]
# print check

print "Mean Squared Error ", np.mean((clf.predict(xtest) - ytest) ** 2)
print "Mean Squared Error casual ", np.mean((clf_cas.predict(xtest_cas) - ytest_cas) ** 2)
print "Mean Squared Error reg ", np.mean((clf_reg.predict(xtest_reg) - ytest_reg) ** 2), "\n"
print "Training R^2: %0.2f%%" % (100 * clf.score(xtrain, ytrain))
print "Test R^2: %0.2f%%" % (100 * clf.score(xtest, ytest)) + "\n"
print "Training R^2 casual: %0.2f%%" % (100 * clf_cas.score(xtrain_cas, ytrain_cas))
print "Test R^2 casual: %0.2f%%" % (100 * clf_cas.score(xtest_cas, ytest_cas)) + "\n"
print "Training R^2 reg: %0.2f%%" % (100 * clf_reg.score(xtrain_reg, ytrain_reg))
print "Test R^2 reg: %0.2f%%" % (100 * clf_reg.score(xtest_reg, ytest_reg))

plt.scatter(clf.predict(xtest), clf.predict(xtest) - ytest, c='g', s=40, label='Residual Testing')
plt.scatter(clf.predict(xtrain), clf.predict(xtrain) - ytrain, c='b', s=40, alpha=0.25, label='Residual Training')
plt.plot([-200, 600], [0, 0], c='r')
plt.legend()
plt.title('Residuals vs Bike Share Count')
plt.xlabel("Predicted count values")
plt.ylabel("Residuals")
remove_border()
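# Aside: the commented check above hints at the cause: an unconstrained linear model can
# predict negative counts, and log(1 + pred) is undefined for pred <= -1. One workaround is
# to clip predictions at zero before taking logs. A minimal commented sketch (an illustration
# only, not part of the models below):
# preds_clipped = np.maximum(clf.predict(xtest), 0)
# log_errors = np.log1p(preds_clipped) - np.log1p(ytest)
# print "RMSLE with clipped predictions:", np.sqrt(np.mean(log_errors ** 2))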
finalDF_mod = finalDF

# Build the first model using the total count as the label, so convert the count column to log counts
# Note: it is OK to convert directly to log(y) as min(y) is 1 and not 0 in the 'count' column
finalDF_mod['log_count'] = finalDF['count'].map(lambda x: np.log(x))

# Remove the count, registered counts and casual counts columns
finalDF_mod = finalDF.drop('count', 1)
finalDF_mod = finalDF_mod.drop('registered', 1)
finalDF_mod = finalDF_mod.drop('casual', 1)
finalDF_mod.head()

def rmsele(actual, pred):
    """
    Given a column of predictions and a column of actuals, calculate the RMSLE
    """
    squared_errors = (np.log(pred + 1) - np.log(actual + 1)) ** 2
    mean_squared = np.sum(squared_errors) / len(squared_errors)
    return np.sqrt(mean_squared)

x_data = finalDF_mod.drop('log_count', 1)
print x_data.shape
y_data = finalDF_mod['log_count']

xtrain, xtest, ytrain, ytest = train_test_split(x_data, y_data)
clf = LinearRegression()
clf.fit(xtrain, ytrain)

# Root Mean Squared Logarithmic Error (RMSLE)
print "Root Mean Squared Logarithmic Error: ", rmsele(np.exp(clf.predict(xtest)), np.exp(ytest))
print "Training R^2: %0.2f%%" % (100 * clf.score(xtrain, ytrain))
print "Test R^2: %0.2f%%" % (100 * clf.score(xtest, ytest)) + "\n"

plt.scatter(np.exp(clf.predict(xtest)), np.exp(ytest))
plt.title("Log-Linear Regression Model Prediction")
plt.ylabel("Actual Rental Counts")
plt.xlabel("Predicted Rental Counts")
plt.xlim(0, 1000)
plt.ylim(0, 1000)
remove_border()

plt.scatter(np.exp(clf.predict(xtest)), np.exp(clf.predict(xtest)) - np.exp(ytest), c='g', s=40, label='Residual Testing')
plt.scatter(np.exp(clf.predict(xtrain)), np.exp(clf.predict(xtrain)) - np.exp(ytrain), c='b', s=40, alpha=0.25, label='Residual Training')
plt.plot([0, 600], [0, 0], c='r')
plt.xlim(0, 600)
plt.legend()
plt.title('Residuals vs Bike Share Count')
plt.xlabel("Predicted count values")
plt.ylabel("Residuals")
remove_border()

test_results = []
path = "test.csv"
with open(path, "r") as d:
    test_header = d.next().strip("\n").split(",")
    for line in d:
        test_results.append(line.strip("\n").split(","))
test = pd.DataFrame(data=np.asarray(test_results), columns=test_header)

test['hour'] = test['datetime'].map(lambda x: (datetime.strptime(x, "%Y-%m-%d %H:%M:%S")).hour)
test['weekday'] = test['datetime'].map(lambda x: (datetime.strptime(x, "%Y-%m-%d %H:%M:%S")).weekday())
test['month'] = test['datetime'].map(lambda x: (datetime.strptime(x, "%Y-%m-%d %H:%M:%S")).month)

test['season'] = test['season'].astype(int)
test['holiday'] = test['holiday'].astype(int)
test['workingday'] = test['workingday'].astype(int)
test['weather'] = test['weather'].astype(int)
test['temp'] = test['temp'].astype(float)
test['atemp'] = test['atemp'].astype(float)
test['humidity'] = test['humidity'].astype(float)
test['windspeed'] = test['windspeed'].astype(float)

season_test = pd.get_dummies(data=test.iloc[:, 1], prefix='season')
holiday_test = pd.get_dummies(data=test.iloc[:, 2], prefix='holiday')
workingday_test = pd.get_dummies(data=test.iloc[:, 3], prefix='workingday')
weather_test = pd.get_dummies(data=test.iloc[:, 4], prefix='weather')
weekday_test = pd.get_dummies(data=test.iloc[:, 9], prefix='weekday')
hour_test = pd.get_dummies(data=test.iloc[:, 10], prefix='hour')
month_test = pd.get_dummies(data=test.iloc[:, 11], prefix='month')

finalDF_test = test.iloc[:, 5:9]  # numerical

# categorical
finalDF_test = finalDF_test.join(season_test)
finalDF_test = finalDF_test.join(holiday_test)
finalDF_test = finalDF_test.join(workingday_test)
finalDF_test = finalDF_test.join(weather_test)
finalDF_test = finalDF_test.join(weekday_test)
finalDF_test = finalDF_test.join(hour_test)
finalDF_test = finalDF_test.join(month_test)

print finalDF_test.shape
finalDF_test.head()

df = pd.DataFrame(data=zip(test['datetime'], np.exp(clf.predict(finalDF_test))), columns=["datetime", "count"])
pd.DataFrame.to_csv(df, 'submission.csv', index=False)
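# Aside: pd.get_dummies only creates columns for the levels present in each file, so if a
# categorical level (for example a rare weather code) appears in one file but not the other,
# finalDF_test's columns will not line up with the columns the model was trained on. A
# defensive commented sketch (hypothetical, not applied above):
# finalDF_test = finalDF_test.reindex(columns=x_data.columns, fill_value=0)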
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import train_test_split
from sklearn.metrics import make_scorer

finalDF_mod = data
finalDF_mod = data.drop('datetime', 1)
finalDF_mod = finalDF_mod.drop('count', 1)
finalDF_mod = finalDF_mod.drop('registered', 1)
finalDF_mod = finalDF_mod.drop('casual', 1)

y_data = data['count']
x_data = finalDF_mod
features = x_data.columns
xtrain, xtest, ytrain, ytest = train_test_split(x_data, y_data)

from sklearn.grid_search import GridSearchCV

# This helper function will make a callable that we can use in cross_val_score
rmsele_scorer = make_scorer(rmsele, greater_is_better=False)

# Parameter optimisation with grid search
tuned_parameters = [{'max_features': ['sqrt', 'log2', 'auto'],
                     'max_depth': [5, 8, 12],
                     'min_samples_leaf': [2, 5, 10]}]
rf = GridSearchCV(RandomForestRegressor(n_jobs=1, n_estimators=1000), tuned_parameters,
                  cv=3, verbose=2, scoring=rmsele_scorer).fit(xtrain, ytrain)
print 'Best Parameters:'
print rf.best_estimator_

# Model run with optimum parameters on a train/test split for performance evaluation and plotting
# rf_op = RandomForestRegressor(n_jobs=1, n_estimators=1000, min_samples_leaf=2, min_samples_split=2, max_features='auto', oob_score=True)
# rf_op.fit(xtrain, ytrain)
# preds = rf_op.predict(xtest)

# Model run with optimum parameters on the entire training set
rf_op = RandomForestRegressor(n_jobs=1, n_estimators=1000, min_samples_leaf=2, min_samples_split=2, max_features='auto', oob_score=True)
rf_op.fit(x_data, y_data)
preds = rf_op.predict(xtest)  # note: xtest rows were part of this fit, so these are in-sample predictions

plt.scatter(preds, ytest)
plt.title("Random Forest Model Prediction")
plt.ylabel("Actual Rental Counts")
plt.xlabel("Predicted Rental Counts")
plt.xlim(0, 1000)
plt.ylim(0, 1000)
remove_border()

# Root Mean Squared Logarithmic Error (RMSLE)
print "Root Mean Squared Logarithmic Error Train: ", rmsele(rf_op.predict(xtrain), ytrain)
print "Root Mean Squared Logarithmic Error Test: ", rmsele(rf_op.predict(xtest), ytest)
print "Training R^2: %0.2f%%" % (100 * rf_op.score(xtrain, ytrain))
print "Test R^2: %0.2f%%" % (100 * rf_op.score(xtest, ytest)) + "\n"

numfeat = len(features)
indices = np.argsort(rf_op.feature_importances_)[::-1][:numfeat]
plt.bar(xrange(numfeat), rf_op.feature_importances_[indices], align='center', alpha=.5)
plt.xticks(xrange(numfeat), features[indices], rotation='vertical', fontsize=12)
plt.xlim([-1, numfeat])
plt.ylabel('Feature percentages', fontsize=12)
plt.title('Feature importances computed by Random Forest', fontsize=16)
remove_border()

print zip(features, rf_op.feature_importances_)

# submit = test.drop('datetime', 1)
# print submit.shape, test.shape, xtrain.shape, xtest.shape
# dfForest = pd.DataFrame(data=zip(test['datetime'], rf_op.predict(submit)), columns=["datetime", "count"])
# dfForest[1:100]
# pd.DataFrame.to_csv(dfForest, 'submission_forest.csv', index=False)

ET_results = []
path = "submission_ET.csv"
with open(path, "r") as d:
    ET_header = d.next().strip("\n").split(",")
    for line in d:
        segment = line.strip("\n").split(",")
        segment[1] = str(round(float(segment[1])))
        line = segment[0] + "," + segment[1]
        ET_results.append(line.strip("\n").split(","))
ET_ints = pd.DataFrame(data=np.asarray(ET_results), columns=ET_header)
pd.DataFrame.to_csv(ET_ints, 'submission_ET_round.csv', index=False)
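# Aside: the manual re-parsing of submission_ET.csv above could also be done with pandas
# directly. A minimal commented sketch, assuming the file has a "count" column:
# ET_ints = pd.read_csv("submission_ET.csv")
# ET_ints['count'] = np.round(ET_ints['count'])
# ET_ints.to_csv('submission_ET_round.csv', index=False)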
from sklearn.ensemble import ExtraTreesRegressor

ET = ExtraTreesRegressor(n_estimators=1000, min_samples_split=2, min_samples_leaf=2, max_features='auto', n_jobs=1)
ET.fit(x_data, y_data)

# Root Mean Squared Logarithmic Error (RMSLE)
print "Root Mean Squared Logarithmic Error Train: ", rmsele(ET.predict(x_data), y_data)
print "Training R^2: %0.2f%%" % (100 * ET.score(x_data, y_data))

# submit = test.drop('datetime', 1)
# print submit.shape, test.shape, xtrain.shape, xtest.shape
# dfET = pd.DataFrame(data=zip(test['datetime'], ET.predict(submit)), columns=["datetime", "count"])
# dfET[1:100]
# pd.DataFrame.to_csv(dfET, 'submission_weird.csv', index=False)  # commented out: dfET is only created in the block above

from numpy import corrcoef, sum, log, arange
from pylab import pcolor, show, colorbar, xticks, yticks
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
R = corrcoef(x_data.T)
pcolor(R)
colorbar()
ax.set_xticks(np.arange(x_data.shape[1]) + 0.5, minor=False)  # align the labels with the correct column of data on the plot
ax.set_yticks(np.arange(x_data.shape[1]) + 0.5, minor=False)  # align the labels with the correct row of data on the plot
ax.set_xticklabels(x_data.columns, rotation=90)
ax.set_yticklabels(x_data.columns)
ax.set_title("Correlation Matrix of Features")

from sklearn.decomposition import PCA

pca = PCA(n_components=None)
pca_X = pca.fit_transform(x_data)
label = [("pca_" + str(e)) for e in range(pca_X.shape[1])]
# print pca_X.shape

fig, ax = plt.subplots()
R = corrcoef(pca_X.T)
pcolor(R)
colorbar()
ax.set_xticks(np.arange(pca_X.shape[1]) + 0.5, minor=False)  # align the labels with the correct column of data on the plot
ax.set_yticks(np.arange(pca_X.shape[1]) + 0.5, minor=False)  # align the labels with the correct row of data on the plot
ax.set_xticklabels(label, rotation=90)
ax.set_yticklabels(label)
ax.set_title("Correlation Matrix of PCA Components")

# Split the data into training and cross-validation test sets to evaluate the performance of PCA
x_ETtrain, x_ETtest, y_ETtrain, y_ETtest = train_test_split(pca_X, y_data)

# Re-run the extra trees model using the PCA components as the new features
ET.fit(x_ETtrain, y_ETtrain)
preds = ET.predict(x_ETtest)

plt.scatter(preds, y_ETtest)
plt.title("Extra Trees Model Prediction")
plt.ylabel("Actual Rental Counts")
plt.xlabel("Predicted Rental Counts")
plt.xlim(0, 1000)
plt.ylim(0, 1000)
remove_border()

# Root Mean Squared Logarithmic Error (RMSLE)
print "Root Mean Squared Logarithmic Error Train: ", rmsele(ET.predict(x_ETtrain), y_ETtrain)
print "Root Mean Squared Logarithmic Error Test: ", rmsele(ET.predict(x_ETtest), y_ETtest)
print "Training R^2: %0.2f%%" % (100 * ET.score(x_ETtrain, y_ETtrain))
print "Test R^2: %0.2f%%" % (100 * ET.score(x_ETtest, y_ETtest)) + "\n"

numfeat = len(features)
indices = np.argsort(ET.feature_importances_)[::-1][:numfeat]
plt.bar(xrange(numfeat), ET.feature_importances_[indices], align='center', alpha=.5)
plt.xticks(xrange(numfeat), np.array(label)[indices], rotation='vertical', fontsize=12)
plt.xlim([-1, numfeat])
plt.ylabel('Feature percentages', fontsize=12)
plt.title('Feature importances computed by Extra Trees', fontsize=16)
remove_border()
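# Aside: with n_components=None the PCA keeps every component, so the model above sees a
# rotation of the original features rather than a reduced set. The explained variance ratio
# shows how many components actually carry most of the variance; a quick commented check:
# print pca.explained_variance_ratio_
# print np.cumsum(pca.explained_variance_ratio_)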
x_ETtrain, x_ETtest, y_ETtrain, y_ETtest = train_test_split(x_data, y_data)
ET.fit(x_ETtrain, y_ETtrain)

# Root Mean Squared Logarithmic Error (RMSLE)
print "Root Mean Squared Logarithmic Error Train: ", rmsele(ET.predict(x_ETtrain), y_ETtrain)
print "Root Mean Squared Logarithmic Error Test: ", rmsele(ET.predict(x_ETtest), y_ETtest)
print "Training R^2: %0.2f%%" % (100 * ET.score(x_ETtrain, y_ETtrain))
print "Test R^2: %0.2f%%" % (100 * ET.score(x_ETtest, y_ETtest)) + "\n"

fig, axes = plt.subplots(figsize=(35, 30), nrows=4, ncols=3)
fig.subplots_adjust(hspace=0.5)
residual = list(ET.predict(x_ETtest).astype(float) - y_ETtest.astype(float))
for i in range(4):
    for j in range(3):
        if (i * 3 + j) < 11:
            axes[i][j].scatter(list(x_ETtest[:, i * 3 + j].astype(float)), residual, c='g', s=50, alpha=0.75)
            axes[i][j].set_title('Residuals vs ' + features[i * 3 + j])
            axes[i][j].set_xlabel(features[i * 3 + j])
            axes[i][j].set_ylabel("Residuals")
            remove_border(axes[i][j])
        else:
            fig.delaxes(axes[i][j])

finalDF_mod = data
finalDF_mod = data.drop('datetime', 1)
finalDF_mod = finalDF_mod.drop('count', 1)
finalDF_mod = finalDF_mod.drop('registered', 1)
finalDF_mod = finalDF_mod.drop('casual', 1)

y_data_reg = data['registered']
y_data_cas = data['casual']
x_data = finalDF_mod

# Use a single split so that the registered and casual rows line up when the two predictions are combined below
xreg_train, xreg_test, yreg_train, yreg_test, ycas_train, ycas_test = train_test_split(x_data, y_data_reg, y_data_cas)
xcas_train, xcas_test = xreg_train, xreg_test

# REGISTERED MODEL
ETreg = ExtraTreesRegressor(n_estimators=1000, min_samples_split=2, min_samples_leaf=2, max_features='auto', n_jobs=1)
ETreg.fit(xreg_train, yreg_train)

# CASUAL MODEL
ETcas = ExtraTreesRegressor(n_estimators=1000, min_samples_split=2, min_samples_leaf=2, max_features='auto', n_jobs=1)
ETcas.fit(xcas_train, ycas_train)

# Root Mean Squared Logarithmic Error (RMSLE)
print "CASUAL USERS MODEL"
print "Root Mean Squared Logarithmic Error Train: ", rmsele(ETcas.predict(xcas_train), ycas_train)
print "Root Mean Squared Logarithmic Error Test: ", rmsele(ETcas.predict(xcas_test), ycas_test)
print "Training R^2: %0.2f%%" % (100 * ETcas.score(xcas_train, ycas_train))
print "Test R^2: %0.2f%%" % (100 * ETcas.score(xcas_test, ycas_test)) + "\n"

print "REGISTERED USERS MODEL"
print "Root Mean Squared Logarithmic Error Train: ", rmsele(ETreg.predict(xreg_train), yreg_train)
print "Root Mean Squared Logarithmic Error Test: ", rmsele(ETreg.predict(xreg_test), yreg_test)
print "Training R^2: %0.2f%%" % (100 * ETreg.score(xreg_train, yreg_train))
print "Test R^2: %0.2f%%" % (100 * ETreg.score(xreg_test, yreg_test)) + "\n"

print "COMBINED MODEL"
print "Root Mean Squared Logarithmic Error Train: ", rmsele((ETreg.predict(xreg_train) + ETcas.predict(xcas_train)), yreg_train + ycas_train)
print "Root Mean Squared Logarithmic Error Test: ", rmsele((ETreg.predict(xreg_test) + ETcas.predict(xcas_test)), yreg_test + ycas_test)

# submit = test.drop('datetime', 1)
# print submit.shape, test.shape, xtrain.shape, xtest.shape
# dfcombo = pd.DataFrame(data=zip(test['datetime'], (ETreg.predict(submit) + ETcas.predict(submit))), columns=["datetime", "count"])
# dfcombo[1:100]
# pd.DataFrame.to_csv(dfcombo, 'submission_comboET.csv', index=False)  # commented out: dfcombo is only created in the block above

import pymc as mc  # Monte Carlo simulation library for Bayesian MCMC
import scipy.stats as stats  # statistical functions for our analysis

def poisson(x, beta, alpha=0):
    return np.exp(np.dot(beta, x) + alpha)

# We normalise the hours so that no single term dominates the higher-order modelling
def normalize(x):
    return (x - x.mean()) / x.std()

hour = data['hour'][0:3000]
hour_norm = normalize(hour)
hour_2 = hour ** 2
hour_2_norm = normalize(hour_2)
hour_3 = hour ** 3
hour_3_norm = normalize(hour_3)

# Y count data
y = data['registered'][0:3000]

# We assume Normal priors for alpha, beta, gamma, delta, which are the contributors to the lambda parameter of our eventual Poisson distribution
# We choose the Normal distribution as its random variables can take both positive and negative values
beta = mc.Normal("beta", 3, 0.5)
alpha = mc.Normal("alpha", 4, 0.5)
gamma = mc.Normal("gamma", -2.5, 0.5)
delta = mc.Normal("delta", 0, 1)
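# Note on the parameterisation: in PyMC 2, mc.Normal takes a precision tau rather than a
# variance, so mc.Normal("beta", 3, 0.5) is a Normal with mean 3 and standard deviation
# 1 / sqrt(0.5), roughly 1.41. A quick check of the implied prior spread:
# print 1 / np.sqrt(0.5)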
# Our deterministic function combines our priors to calculate the lambda rate parameter for our Poisson
@mc.deterministic
def rate_param(x=hour_norm, x2=hour_2_norm, x3=hour_3_norm, beta=beta, alpha=alpha, gamma=gamma, delta=delta):
    return np.exp(np.dot(beta, x) + np.dot(gamma, x2) + np.dot(delta, x3) + alpha)

# The observed data is modelled as a Poisson, with the observed=True argument to show that it should not be modified
# but tied to our dataset
observed = mc.Poisson("poisson_count", mu=rate_param, value=y, observed=True)

# We pass all our parameters to the model so that during the simulation they can be updated simultaneously
model = mc.Model([observed, beta, alpha, gamma, delta])

# MCMC simulation code
map_ = mc.MAP(model)
map_.fit()
mcmc = mc.MCMC(model)
mcmc.sample(120000, 50000, 2)

alpha_samples = mcmc.trace('alpha')[:, None]  # reshape as column vectors for later broadcasting
beta_samples = mcmc.trace('beta')[:, None]
gamma_samples = mcmc.trace('gamma')[:, None]
delta_samples = mcmc.trace('delta')[:, None]

# histograms of the samples:
plt.subplot(411)
plt.title(r"Posterior distributions of the variables $\alpha, \beta, \gamma, \delta$")
plt.ylabel("Occurrences")
plt.hist(alpha_samples, histtype='stepfilled', bins=35, alpha=0.85, label=r"posterior of $\alpha$", color="#A60628", normed=True)
plt.legend()
remove_border()

plt.subplot(412)
plt.ylabel("Occurrences")
plt.hist(beta_samples, histtype='stepfilled', bins=35, alpha=0.85, label=r"posterior of $\beta$", color="#7A68A6", normed=True)
plt.legend()
remove_border()

plt.subplot(413)
plt.ylabel("Occurrences")
plt.hist(gamma_samples, histtype='stepfilled', bins=35, alpha=0.85, label=r"posterior of $\gamma$", color="#A60628", normed=True)
plt.legend()
remove_border()

plt.subplot(414)
plt.ylabel("Occurrences")
plt.hist(delta_samples, histtype='stepfilled', bins=35, alpha=0.85, label=r"posterior of $\delta$", color="red", normed=True)
plt.legend()
remove_border()

t = np.linspace(hour_norm.min(), hour_norm.max(), 50)[:, None]
t2 = np.linspace(hour_2_norm.min(), hour_2_norm.max(), 50)[:, None]
t3 = np.linspace(hour_3_norm.min(), hour_3_norm.max(), 50)[:, None]
lambda_ = np.exp(np.dot(beta_samples, t.T) + np.dot(gamma_samples, t2.T) + np.dot(delta_samples, t3.T) + alpha_samples)
mean_count_t_ = lambda_.mean(axis=0)
# plt.plot(t, mean_count_t_)

# Simulate data points
simulated = mc.Poisson("poisson_sim", rate_param)
N = 10000
mcmc = mc.MCMC([simulated, alpha, beta, gamma, delta, observed])
mcmc.sample(N)

simulations = mcmc.trace("poisson_sim")[:]
for i in range(50):
    plt.scatter(hour, simulations[i, :], color="k", s=2, alpha=0.1)
remove_border()
plt.title("Simulated draws from Poisson distribution using lambda posterior")
plt.ylabel("Rental bikes count")
plt.xlabel("Hour")

df = pd.DataFrame(data=hour)
df['bikes'] = y
mean_data_hour = df.groupby('hour').bikes.mean()

plt.scatter(hour, y, color="k", s=50, alpha=0.2)
plt.plot(mean_data_hour.index, mean_data_hour.values, color='r', label="average")
plt.title("Posterior expected value of counts of bike rentals; plus realizations")
plt.legend(loc="upper left")
plt.xlim(0, 24)
plt.ylabel("Rental bikes count")
plt.xlabel("Hours")
remove_border()

from sklearn.cluster import KMeans

hour = np.asarray(hour)
n = 2  # Number of components of the mixture
kme = KMeans(n)
# This is not needed, but it helps convergence
# kme.fit(data[:, np.newaxis])
kme.fit(hour[:, np.newaxis])

# ndata = len(data)
ndata = len(hour)
dd = mc.Dirichlet('dd', theta=(1,) * n)
category = mc.Categorical('category', p=dd, size=ndata, value=kme.labels_)

# alphas = mc.Uniform("alphas", 0, 1, size=n)
alphas = mc.Gamma('alphas', alpha=1, beta=.0001, size=n)
means = mc.TruncatedNormal('means', kme.cluster_centers_[:, 0], 0.1, a=0.0, b=100000, size=n)

@mc.deterministic
def mean(category=category, means=means):
    return means[category]

@mc.deterministic
def alpha(category=category, alphas=alphas):
    return alphas[category]

obs = mc.NegativeBinomial('obs', mean, alpha, value=y, observed=True)
predictive = mc.NegativeBinomial('predictive', mean, alpha)

model = mc.Model({'dd': dd, 'category': category, 'alphas': alphas, 'means': means, 'predictive': predictive, 'obs': obs})
mcmc = mc.MCMC(model)
mcmc.sample(10000, 7000)

G = [hour[np.nonzero(np.round(mcmc.trace("category")[:].mean(axis=0)) == i)] for i in range(0, 2)]
plt.hist(G, bins=30, stacked=True)
remove_border()
plt.title("Assignment of category across hours")
plt.xlabel("Hour")

simulations = mcmc.trace("predictive")[:]
print simulations.shape
print simulations[0:100]

for i in range(100):
    plt.scatter(hour, simulations[i, :], color="k", s=2, alpha=0.1)
remove_border()
plt.title("Simulated draws from Negative Binomial distribution")
plt.ylabel("Rental bikes count")
plt.xlabel("Hours")