%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import brewer2mpl
from matplotlib import rcParams

# colorbrewer2 Dark2 qualitative color table
dark2_cmap = brewer2mpl.get_map('Dark2', 'Qualitative', 7)
dark2_colors = dark2_cmap.mpl_colors

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 20
rcParams['patch.edgecolor'] = 'white'
rcParams['patch.facecolor'] = dark2_colors[0]
rcParams['font.family'] = 'StixGeneral'

def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Minimize chartjunk by stripping out unnecessary plot borders and axis ticks

    The top/right/left/bottom keywords toggle whether the corresponding plot border is drawn
    """
    ax = axes or plt.gca()
    ax.spines['top'].set_visible(top)
    ax.spines['right'].set_visible(right)
    ax.spines['left'].set_visible(left)
    ax.spines['bottom'].set_visible(bottom)

    # turn off all ticks
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')

    # now re-enable visibles
    if top:
        ax.xaxis.tick_top()
    if bottom:
        ax.xaxis.tick_bottom()
    if left:
        ax.yaxis.tick_left()
    if right:
        ax.yaxis.tick_right()

import csv

results = []
path = "train.csv"
with open(path, "r") as d:
    header = d.next().strip("\n").split(",")
    for line in d:
        results.append(line.strip("\n").split(","))
data = pd.DataFrame(data=np.asarray(results), columns=header)

from datetime import datetime, date, time

data['hour'] = data['datetime'].map(lambda x: (datetime.strptime(x, "%Y-%m-%d %H:%M:%S")).hour)
data['weekday'] = data['datetime'].map(lambda x: (datetime.strptime(x, "%Y-%m-%d %H:%M:%S")).weekday())
data['month'] = data['datetime'].map(lambda x: (datetime.strptime(x, "%Y-%m-%d %H:%M:%S")).month)
data.head()

data['temp'] = data['temp'].astype(float)
data['atemp'] = data['atemp'].astype(float)
data['humidity'] = data['humidity'].astype(float)
data['windspeed'] = data['windspeed'].astype(float)
data['casual'] = data['casual'].astype(int)
data['registered'] = data['registered'].astype(int)
data['count'] = data['count'].astype(int)

season = pd.get_dummies(data=data.iloc[:, 1], prefix='season')
holiday = pd.get_dummies(data=data.iloc[:, 2], prefix='holiday')
workingday = pd.get_dummies(data=data.iloc[:, 3], prefix='workingday')
weather = pd.get_dummies(data=data.iloc[:, 4], prefix='weather')
weekday = pd.get_dummies(data=data.iloc[:, 12], prefix='weekday')
hour = pd.get_dummies(data=data.iloc[:, 13], prefix='hour')
month = pd.get_dummies(data=data.iloc[:, 14], prefix='month')

finalDF = data.iloc[:, 5:9]  # numerical

# categorical
finalDF = finalDF.join(season)
finalDF = finalDF.join(holiday)
finalDF = finalDF.join(workingday)
finalDF = finalDF.join(weather)
finalDF = finalDF.join(weekday)
finalDF = finalDF.join(hour)
finalDF = finalDF.join(month)

finalDF = finalDF.join(data.iloc[:, 9:12])  # labels

print finalDF.shape
finalDF.head()
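# Aside (not part of the original pipeline): on a reasonably recent pandas, the manual csv
# parsing and dtype conversions above can usually be replaced by pandas' own reader, which
# parses the datetime column in one step. A minimal commented sketch, assuming the same
# "train.csv" layout and a pandas version with the .dt accessor:
# data = pd.read_csv("train.csv", parse_dates=['datetime'])
# data['hour'] = data['datetime'].dt.hour
# data['weekday'] = data['datetime'].dt.weekday
# data['month'] = data['datetime'].dt.month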
fig, axes = plt.subplots(figsize=(35, 14), nrows=2, ncols=3)
fig.subplots_adjust(hspace=0.3)

sum_dataREG = data.groupby('weekday').registered.sum()
count_dataREG = data.groupby('weekday').registered.count()
average_dataREG = sum_dataREG / count_dataREG
sum_dataCAS = data.groupby('weekday').casual.sum()
count_dataCAS = data.groupby('weekday').casual.count()
average_dataCAS = sum_dataCAS / count_dataCAS
index = np.arange(len(sum_dataCAS))
axes[0][0].bar(index, average_dataREG, color="r", label="registered users")
axes[0][0].bar(index, average_dataCAS, color="b", label="casual users")
axes[0][0].set_ylabel("Average bikes rented")
axes[0][0].set_xlabel("Weekday")
axes[0][0].set_title("Average number of rentals by day of the week")
axes[0][0].legend()
remove_border(axes[0][0])

sum_dataREG = data.groupby('hour').registered.sum()
count_dataREG = data.groupby('hour').registered.count()
average_dataREG = sum_dataREG / count_dataREG
sum_dataCAS = data.groupby('hour').casual.sum()
count_dataCAS = data.groupby('hour').casual.count()
average_dataCAS = sum_dataCAS / count_dataCAS
index = np.arange(len(sum_dataCAS))
axes[0][1].bar(index, average_dataREG, color="r", label="registered users")
axes[0][1].bar(index, average_dataCAS, color="b", label="casual users")
axes[0][1].set_ylabel("Average bikes rented")
axes[0][1].set_xlabel("Hour of day")
axes[0][1].set_title("Average number of rentals by hour of day")
axes[0][1].legend()
remove_border(axes[0][1])

sum_dataREG = data.groupby('month').registered.sum()
count_dataREG = data.groupby('month').registered.count()
average_dataREG = sum_dataREG / count_dataREG
sum_dataCAS = data.groupby('month').casual.sum()
count_dataCAS = data.groupby('month').casual.count()
average_dataCAS = sum_dataCAS / count_dataCAS
index = np.arange(len(sum_dataCAS))
axes[0][2].bar(index, average_dataREG, color="r", label="registered users")
axes[0][2].bar(index, average_dataCAS, color="b", label="casual users")
axes[0][2].set_ylabel("Average bikes rented")
axes[0][2].set_xlabel("Month of year")
axes[0][2].set_title("Average number of rentals by month of year")
axes[0][2].legend()
remove_border(axes[0][2])

sum_dataREG = data.groupby('workingday').registered.sum()
count_dataREG = data.groupby('workingday').registered.count()
average_dataREG = sum_dataREG / count_dataREG
sum_dataCAS = data.groupby('workingday').casual.sum()
count_dataCAS = data.groupby('workingday').casual.count()
average_dataCAS = sum_dataCAS / count_dataCAS
index = np.arange(len(sum_dataCAS))
axes[1][0].bar(index, average_dataREG, color="r", label="registered users")
axes[1][0].bar(index, average_dataCAS, color="b", label="casual users")
axes[1][0].set_ylabel("Average bikes rented")
axes[1][0].set_title("Average number of rentals by working day")
axes[1][0].legend()
axes[1][0].set_xticklabels(["", "", "Non-Working", "", "", "", "", "Working"])
remove_border(axes[1][0])

sum_dataREG = data.groupby('holiday').registered.sum()
count_dataREG = data.groupby('holiday').registered.count()
average_dataREG = sum_dataREG / count_dataREG
sum_dataCAS = data.groupby('holiday').casual.sum()
count_dataCAS = data.groupby('holiday').casual.count()
average_dataCAS = sum_dataCAS / count_dataCAS
index = np.arange(len(sum_dataCAS))
axes[1][1].bar(index, average_dataREG, color="r", label="registered users")
axes[1][1].bar(index, average_dataCAS, color="b", label="casual users")
axes[1][1].set_ylabel("Average bikes rented")
axes[1][1].set_title("Average number of rentals by holiday")
axes[1][1].legend()
axes[1][1].set_xticklabels(["", "", "Non-Holiday", "", "", "", "", "Holiday"])
remove_border(axes[1][1])

fig.delaxes(axes[1][2])
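# Aside: the sum/count/average pattern repeated in each panel above (and in the weather
# figure below) is just a groupby mean; a small helper along these lines would shrink each
# panel to one call. This helper is hypothetical and is not used by the original cells.
def plot_average_by(ax, frame, by, title, xlabel=None):
    """Bar-plot the mean registered and casual rentals grouped by `by` on the given axes."""
    avg = frame.groupby(by)[['registered', 'casual']].mean()
    idx = np.arange(len(avg))
    ax.bar(idx, avg['registered'], color="r", label="registered users")
    ax.bar(idx, avg['casual'], color="b", label="casual users")
    ax.set_ylabel("Average bikes rented")
    if xlabel is not None:
        ax.set_xlabel(xlabel)
    ax.set_title(title)
    ax.legend()
    remove_border(ax)
# Example usage:
# plot_average_by(axes[0][0], data, 'weekday', "Average number of rentals by day of the week", "Weekday")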
fig, axes = plt.subplots(figsize=(35, 16), nrows=2, ncols=3)
fig.subplots_adjust(hspace=0.5)

sum_dataREG = data.groupby('temp').registered.sum()
count_dataREG = data.groupby('temp').registered.count()
average_dataREG = sum_dataREG / count_dataREG
sum_dataCAS = data.groupby('temp').casual.sum()
count_dataCAS = data.groupby('temp').casual.count()
average_dataCAS = sum_dataCAS / count_dataCAS
index = np.arange(len(sum_dataCAS))
axes[0][0].bar(index, average_dataREG, color="r", label="registered users")
axes[0][0].bar(index, average_dataCAS, color="b", label="casual users")
remove_border(axes[0][0])
axes[0][0].set_ylabel("Average bikes rented")
axes[0][0].set_xlabel("Temperature (Celsius)")
axes[0][0].set_title("Average number of rentals according to temperature", y=1.1)
axes[0][0].legend()

sum_dataREG = data.groupby('weather').registered.sum()
count_dataREG = data.groupby('weather').registered.count()
average_dataREG = sum_dataREG / count_dataREG
sum_dataCAS = data.groupby('weather').casual.sum()
count_dataCAS = data.groupby('weather').casual.count()
average_dataCAS = sum_dataCAS / count_dataCAS
index = np.arange(len(sum_dataCAS))
axes[0][1].bar(index, average_dataREG, color="r", label="registered users")
axes[0][1].bar(index, average_dataCAS, color="b", label="casual users")
remove_border(axes[0][1])
axes[0][1].set_ylabel("Average bikes rented")
axes[0][1].set_xlabel("Weather")
axes[0][1].set_title("Average number of rentals according to weather encoding", y=1.1)
axes[0][1].legend()

sum_dataREG = data.groupby('season').registered.sum()
count_dataREG = data.groupby('season').registered.count()
average_dataREG = sum_dataREG / count_dataREG
sum_dataCAS = data.groupby('season').casual.sum()
count_dataCAS = data.groupby('season').casual.count()
average_dataCAS = sum_dataCAS / count_dataCAS
index = np.arange(len(sum_dataCAS))
axes[0][2].bar(index, average_dataREG, color="r", label="registered users")
axes[0][2].bar(index, average_dataCAS, color="b", label="casual users")
remove_border(axes[0][2])
axes[0][2].set_ylabel("Average bikes rented")
axes[0][2].set_xlabel("Season")
axes[0][2].set_title("Average number of rentals according to season encoding", y=1.1)
axes[0][2].legend()

sum_dataREG = data.groupby('atemp').registered.sum()
count_dataREG = data.groupby('atemp').registered.count()
average_dataREG = sum_dataREG / count_dataREG
sum_dataCAS = data.groupby('atemp').casual.sum()
count_dataCAS = data.groupby('atemp').casual.count()
average_dataCAS = sum_dataCAS / count_dataCAS
index = np.arange(len(sum_dataCAS))
axes[1][0].bar(index, average_dataREG, color="r", label="registered users")
axes[1][0].bar(index, average_dataCAS, color="b", label="casual users")
remove_border(axes[1][0])
axes[1][0].set_ylabel("Average bikes rented")
axes[1][0].set_xlabel("Feels Like Temperature (Celsius)")
axes[1][0].set_title("Average number of rentals according to 'feels like' temperature", y=1.1)
axes[1][0].legend()

sum_dataREG = data.groupby('humidity').registered.sum()
count_dataREG = data.groupby('humidity').registered.count()
average_dataREG = sum_dataREG / count_dataREG
sum_dataCAS = data.groupby('humidity').casual.sum()
count_dataCAS = data.groupby('humidity').casual.count()
average_dataCAS = sum_dataCAS / count_dataCAS
index = np.arange(len(sum_dataCAS))
axes[1][1].bar(index, average_dataREG, color="r", label="registered users")
axes[1][1].bar(index, average_dataCAS, color="b", label="casual users")
remove_border(axes[1][1])
axes[1][1].set_ylabel("Average bikes rented")
axes[1][1].set_xlabel("Humidity")
axes[1][1].set_title("Average number of rentals according to humidity", y=1.1)
axes[1][1].legend()
sum_dataREG = data.groupby('windspeed').registered.sum()
count_dataREG = data.groupby('windspeed').registered.count()
average_dataREG = sum_dataREG / count_dataREG
sum_dataCAS = data.groupby('windspeed').casual.sum()
count_dataCAS = data.groupby('windspeed').casual.count()
average_dataCAS = sum_dataCAS / count_dataCAS
index = np.arange(len(sum_dataCAS))
axes[1][2].bar(index, average_dataREG, color="r", label="registered users")
axes[1][2].bar(index, average_dataCAS, color="b", label="casual users")
remove_border(axes[1][2])
axes[1][2].set_ylabel("Average bikes rented")
axes[1][2].set_xlabel("Windspeed")
axes[1][2].set_title("Average number of rentals according to windspeed", y=1.1)
axes[1][2].legend()

weather_interest = ['temp', 'atemp', 'humidity', 'windspeed', 'season', 'weather', 'hour', 'month']
_ = pd.scatter_matrix(data[weather_interest].astype(float), figsize=(18, 18), alpha=0.8, diagonal='hist')

from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression

# Create datasets for total, casual and registered users
y_data = finalDF['count']
x_data = finalDF.drop('count', 1)
x_data = x_data.drop('registered', 1)
x_data = x_data.drop('casual', 1)

# casual
y_data_casual = finalDF['casual']
x_data_casual = finalDF.drop('casual', 1)
x_data_casual = x_data_casual.drop('registered', 1)
x_data_casual = x_data_casual.drop('count', 1)

# registered
y_data_registered = finalDF['registered']
x_data_registered = finalDF.drop('casual', 1)
x_data_registered = x_data_registered.drop('registered', 1)
x_data_registered = x_data_registered.drop('count', 1)

xtrain_reg, xtest_reg, ytrain_reg, ytest_reg = train_test_split(x_data_registered, y_data_registered)
xtrain_cas, xtest_cas, ytrain_cas, ytest_cas = train_test_split(x_data_casual, y_data_casual)
xtrain, xtest, ytrain, ytest = train_test_split(x_data, y_data)

clf = LinearRegression()
clf_cas = LinearRegression()
clf_reg = LinearRegression()
clf.fit(xtrain, ytrain)
clf_cas.fit(xtrain_cas, ytrain_cas)
clf_reg.fit(xtrain_reg, ytrain_reg)

# The check below searched for negative count predictions, as we assumed these were why we were unable to calculate our log error
# check = [e for e in clf.predict(xtest) if e <= -1]
# print check

print "Mean Squared Error ", np.mean((clf.predict(xtest) - ytest) ** 2)
print "Mean Squared Error casual ", np.mean((clf_cas.predict(xtest_cas) - ytest_cas) ** 2)
print "Mean Squared Error reg ", np.mean((clf_reg.predict(xtest_reg) - ytest_reg) ** 2), "\n"
print "Training R^2: %0.2f%%" % (100 * clf.score(xtrain, ytrain))
print "Test R^2: %0.2f%%" % (100 * clf.score(xtest, ytest)) + "\n"
print "Training R^2 casual: %0.2f%%" % (100 * clf_cas.score(xtrain_cas, ytrain_cas))
print "Test R^2 casual: %0.2f%%" % (100 * clf_cas.score(xtest_cas, ytest_cas)) + "\n"
print "Training R^2 reg: %0.2f%%" % (100 * clf_reg.score(xtrain_reg, ytrain_reg))
print "Test R^2 reg: %0.2f%%" % (100 * clf_reg.score(xtest_reg, ytest_reg))

plt.scatter(clf.predict(xtest), clf.predict(xtest) - ytest, c='g', s=40, label='Residual Testing')
plt.scatter(clf.predict(xtrain), clf.predict(xtrain) - ytrain, c='b', s=40, alpha=0.25, label='Residual Training')
plt.plot([-200, 600], [0, 0], c='r')
plt.legend()
plt.title('Residuals vs Bike Share Count')
plt.xlabel("Predicted count values")
plt.ylabel("Residuals")
remove_border()
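# Aside: the commented check above hints at the cause: an unconstrained linear model can
# predict negative counts, and log(1 + pred) is undefined for pred <= -1. One workaround is
# to clip predictions at zero before taking logs. A minimal commented sketch (an illustration
# only, not part of the models below):
# preds_clipped = np.maximum(clf.predict(xtest), 0)
# log_errors = np.log1p(preds_clipped) - np.log1p(ytest)
# print "RMSLE with clipped predictions:", np.sqrt(np.mean(log_errors ** 2))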
finalDF_mod = finalDF

# Build the first model using the total count as the label, so convert the count column to log counts
# Note: it is OK to convert directly to log(y) as min(y) is 1 and not 0 in the 'count' column
finalDF_mod['log_count'] = finalDF['count'].map(lambda x: np.log(x))

# Remove the count, registered counts and casual counts columns
finalDF_mod = finalDF.drop('count', 1)
finalDF_mod = finalDF_mod.drop('registered', 1)
finalDF_mod = finalDF_mod.drop('casual', 1)
finalDF_mod.head()

def rmsele(actual, pred):
    """
    Given a column of predictions and a column of actuals, calculate the RMSLE
    """
    squared_errors = (np.log(pred + 1) - np.log(actual + 1)) ** 2
    mean_squared = np.sum(squared_errors) / len(squared_errors)
    return np.sqrt(mean_squared)

x_data = finalDF_mod.drop('log_count', 1)
print x_data.shape
y_data = finalDF_mod['log_count']

xtrain, xtest, ytrain, ytest = train_test_split(x_data, y_data)
clf = LinearRegression()
clf.fit(xtrain, ytrain)

# Root Mean Squared Logarithmic Error (RMSLE)
print "Root Mean Squared Logarithmic Error: ", rmsele(np.exp(clf.predict(xtest)), np.exp(ytest))
print "Training R^2: %0.2f%%" % (100 * clf.score(xtrain, ytrain))
print "Test R^2: %0.2f%%" % (100 * clf.score(xtest, ytest)) + "\n"

plt.scatter(np.exp(clf.predict(xtest)), np.exp(ytest))
plt.title("Log-Linear Regression Model Prediction")
plt.ylabel("Actual Rental Counts")
plt.xlabel("Predicted Rental Counts")
plt.xlim(0, 1000)
plt.ylim(0, 1000)
remove_border()

plt.scatter(np.exp(clf.predict(xtest)), np.exp(clf.predict(xtest)) - np.exp(ytest), c='g', s=40, label='Residual Testing')
plt.scatter(np.exp(clf.predict(xtrain)), np.exp(clf.predict(xtrain)) - np.exp(ytrain), c='b', s=40, alpha=0.25, label='Residual Training')
plt.plot([0, 600], [0, 0], c='r')
plt.xlim(0, 600)
plt.legend()
plt.title('Residuals vs Bike Share Count')
plt.xlabel("Predicted count values")
plt.ylabel("Residuals")
remove_border()

test_results = []
path = "test.csv"
with open(path, "r") as d:
    test_header = d.next().strip("\n").split(",")
    for line in d:
        test_results.append(line.strip("\n").split(","))
test = pd.DataFrame(data=np.asarray(test_results), columns=test_header)

test['hour'] = test['datetime'].map(lambda x: (datetime.strptime(x, "%Y-%m-%d %H:%M:%S")).hour)
test['weekday'] = test['datetime'].map(lambda x: (datetime.strptime(x, "%Y-%m-%d %H:%M:%S")).weekday())
test['month'] = test['datetime'].map(lambda x: (datetime.strptime(x, "%Y-%m-%d %H:%M:%S")).month)

test['season'] = test['season'].astype(int)
test['holiday'] = test['holiday'].astype(int)
test['workingday'] = test['workingday'].astype(int)
test['weather'] = test['weather'].astype(int)
test['temp'] = test['temp'].astype(float)
test['atemp'] = test['atemp'].astype(float)
test['humidity'] = test['humidity'].astype(float)
test['windspeed'] = test['windspeed'].astype(float)

season_test = pd.get_dummies(data=test.iloc[:, 1], prefix='season')
holiday_test = pd.get_dummies(data=test.iloc[:, 2], prefix='holiday')
workingday_test = pd.get_dummies(data=test.iloc[:, 3], prefix='workingday')
weather_test = pd.get_dummies(data=test.iloc[:, 4], prefix='weather')
weekday_test = pd.get_dummies(data=test.iloc[:, 9], prefix='weekday')
hour_test = pd.get_dummies(data=test.iloc[:, 10], prefix='hour')
month_test = pd.get_dummies(data=test.iloc[:, 11], prefix='month')

finalDF_test = test.iloc[:, 5:9]  # numerical

# categorical
finalDF_test = finalDF_test.join(season_test)
finalDF_test = finalDF_test.join(holiday_test)
finalDF_test = finalDF_test.join(workingday_test)
finalDF_test = finalDF_test.join(weather_test)
finalDF_test = finalDF_test.join(weekday_test)
finalDF_test = finalDF_test.join(hour_test)
finalDF_test = finalDF_test.join(month_test)

print finalDF_test.shape
finalDF_test.head()

df = pd.DataFrame(data=zip(test['datetime'], np.exp(clf.predict(finalDF_test))), columns=["datetime", "count"])
pd.DataFrame.to_csv(df, 'submission.csv', index=False)
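# Aside: pd.get_dummies only creates columns for the levels present in each file, so if a
# categorical level (for example a rare weather code) appears in one file but not the other,
# finalDF_test's columns will not line up with the columns the model was trained on. A
# defensive commented sketch (hypothetical, not applied above):
# finalDF_test = finalDF_test.reindex(columns=x_data.columns, fill_value=0)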
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import train_test_split
from sklearn.metrics import make_scorer

finalDF_mod = data
finalDF_mod = data.drop('datetime', 1)
finalDF_mod = finalDF_mod.drop('count', 1)
finalDF_mod = finalDF_mod.drop('registered', 1)
finalDF_mod = finalDF_mod.drop('casual', 1)

y_data = data['count']
x_data = finalDF_mod
features = x_data.columns
xtrain, xtest, ytrain, ytest = train_test_split(x_data, y_data)

from sklearn.grid_search import GridSearchCV

# This helper function will make a callable that we can use in cross_val_score
rmsele_scorer = make_scorer(rmsele, greater_is_better=False)

# Parameter optimisation with grid search
tuned_parameters = [{'max_features': ['sqrt', 'log2', 'auto'],
                     'max_depth': [5, 8, 12],
                     'min_samples_leaf': [2, 5, 10]}]
rf = GridSearchCV(RandomForestRegressor(n_jobs=1, n_estimators=1000), tuned_parameters,
                  cv=3, verbose=2, scoring=rmsele_scorer).fit(xtrain, ytrain)
print 'Best Parameters:'
print rf.best_estimator_

# Model run with optimum parameters on a train/test split for performance evaluation and plotting
# rf_op = RandomForestRegressor(n_jobs=1, n_estimators=1000, min_samples_leaf=2, min_samples_split=2, max_features='auto', oob_score=True)
# rf_op.fit(xtrain, ytrain)
# preds = rf_op.predict(xtest)

# Model run with optimum parameters on the entire training set
rf_op = RandomForestRegressor(n_jobs=1, n_estimators=1000, min_samples_leaf=2, min_samples_split=2, max_features='auto', oob_score=True)
rf_op.fit(x_data, y_data)
preds = rf_op.predict(xtest)  # note: xtest rows were part of this fit, so these are in-sample predictions

plt.scatter(preds, ytest)
plt.title("Random Forest Model Prediction")
plt.ylabel("Actual Rental Counts")
plt.xlabel("Predicted Rental Counts")
plt.xlim(0, 1000)
plt.ylim(0, 1000)
remove_border()

# Root Mean Squared Logarithmic Error (RMSLE)
print "Root Mean Squared Logarithmic Error Train: ", rmsele(rf_op.predict(xtrain), ytrain)
print "Root Mean Squared Logarithmic Error Test: ", rmsele(rf_op.predict(xtest), ytest)
print "Training R^2: %0.2f%%" % (100 * rf_op.score(xtrain, ytrain))
print "Test R^2: %0.2f%%" % (100 * rf_op.score(xtest, ytest)) + "\n"

numfeat = len(features)
indices = np.argsort(rf_op.feature_importances_)[::-1][:numfeat]
plt.bar(xrange(numfeat), rf_op.feature_importances_[indices], align='center', alpha=.5)
plt.xticks(xrange(numfeat), features[indices], rotation='vertical', fontsize=12)
plt.xlim([-1, numfeat])
plt.ylabel('Feature percentages', fontsize=12)
plt.title('Feature importances computed by Random Forest', fontsize=16)
remove_border()

print zip(features, rf_op.feature_importances_)

# submit = test.drop('datetime', 1)
# print submit.shape, test.shape, xtrain.shape, xtest.shape
# dfForest = pd.DataFrame(data=zip(test['datetime'], rf_op.predict(submit)), columns=["datetime", "count"])
# dfForest[1:100]
# pd.DataFrame.to_csv(dfForest, 'submission_forest.csv', index=False)

ET_results = []
path = "submission_ET.csv"
with open(path, "r") as d:
    ET_header = d.next().strip("\n").split(",")
    for line in d:
        segment = line.strip("\n").split(",")
        segment[1] = str(round(float(segment[1])))
        line = segment[0] + "," + segment[1]
        ET_results.append(line.strip("\n").split(","))
ET_ints = pd.DataFrame(data=np.asarray(ET_results), columns=ET_header)
pd.DataFrame.to_csv(ET_ints, 'submission_ET_round.csv', index=False)
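# Aside: the manual re-parsing of submission_ET.csv above could also be done with pandas
# directly. A minimal commented sketch, assuming the file has a "count" column:
# ET_ints = pd.read_csv("submission_ET.csv")
# ET_ints['count'] = np.round(ET_ints['count'])
# ET_ints.to_csv('submission_ET_round.csv', index=False)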
from sklearn.ensemble import ExtraTreesRegressor

ET = ExtraTreesRegressor(n_estimators=1000, min_samples_split=2, min_samples_leaf=2, max_features='auto', n_jobs=1)
ET.fit(x_data, y_data)

# Root Mean Squared Logarithmic Error (RMSLE)
print "Root Mean Squared Logarithmic Error Train: ", rmsele(ET.predict(x_data), y_data)
print "Training R^2: %0.2f%%" % (100 * ET.score(x_data, y_data))

# submit = test.drop('datetime', 1)
# print submit.shape, test.shape, xtrain.shape, xtest.shape
# dfET = pd.DataFrame(data=zip(test['datetime'], ET.predict(submit)), columns=["datetime", "count"])
# dfET[1:100]
# pd.DataFrame.to_csv(dfET, 'submission_weird.csv', index=False)  # commented out: dfET is only created in the block above

from numpy import corrcoef, sum, log, arange
from pylab import pcolor, show, colorbar, xticks, yticks
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
R = corrcoef(x_data.T)
pcolor(R)
colorbar()
ax.set_xticks(np.arange(x_data.shape[1]) + 0.5, minor=False)  # align the labels with the correct column of data on the plot
ax.set_yticks(np.arange(x_data.shape[1]) + 0.5, minor=False)  # align the labels with the correct row of data on the plot
ax.set_xticklabels(x_data.columns, rotation=90)
ax.set_yticklabels(x_data.columns)
ax.set_title("Correlation Matrix of Features")

from sklearn.decomposition import PCA

pca = PCA(n_components=None)
pca_X = pca.fit_transform(x_data)
label = [("pca_" + str(e)) for e in range(pca_X.shape[1])]
# print pca_X.shape

fig, ax = plt.subplots()
R = corrcoef(pca_X.T)
pcolor(R)
colorbar()
ax.set_xticks(np.arange(pca_X.shape[1]) + 0.5, minor=False)  # align the labels with the correct column of data on the plot
ax.set_yticks(np.arange(pca_X.shape[1]) + 0.5, minor=False)  # align the labels with the correct row of data on the plot
ax.set_xticklabels(label, rotation=90)
ax.set_yticklabels(label)
ax.set_title("Correlation Matrix of PCA Components")

# Split the data into training and cross-validation test sets to evaluate the performance of PCA
x_ETtrain, x_ETtest, y_ETtrain, y_ETtest = train_test_split(pca_X, y_data)

# Re-run the extra trees model using the PCA components as the new features
ET.fit(x_ETtrain, y_ETtrain)
preds = ET.predict(x_ETtest)

plt.scatter(preds, y_ETtest)
plt.title("Extra Trees Model Prediction")
plt.ylabel("Actual Rental Counts")
plt.xlabel("Predicted Rental Counts")
plt.xlim(0, 1000)
plt.ylim(0, 1000)
remove_border()

# Root Mean Squared Logarithmic Error (RMSLE)
print "Root Mean Squared Logarithmic Error Train: ", rmsele(ET.predict(x_ETtrain), y_ETtrain)
print "Root Mean Squared Logarithmic Error Test: ", rmsele(ET.predict(x_ETtest), y_ETtest)
print "Training R^2: %0.2f%%" % (100 * ET.score(x_ETtrain, y_ETtrain))
print "Test R^2: %0.2f%%" % (100 * ET.score(x_ETtest, y_ETtest)) + "\n"

numfeat = len(features)
indices = np.argsort(ET.feature_importances_)[::-1][:numfeat]
plt.bar(xrange(numfeat), ET.feature_importances_[indices], align='center', alpha=.5)
plt.xticks(xrange(numfeat), np.array(label)[indices], rotation='vertical', fontsize=12)
plt.xlim([-1, numfeat])
plt.ylabel('Feature percentages', fontsize=12)
plt.title('Feature importances computed by Extra Trees', fontsize=16)
remove_border()
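# Aside: with n_components=None the PCA keeps every component, so the model above sees a
# rotation of the original features rather than a reduced set. The explained variance ratio
# shows how many components actually carry most of the variance; a quick commented check:
# print pca.explained_variance_ratio_
# print np.cumsum(pca.explained_variance_ratio_)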
x_ETtrain, x_ETtest, y_ETtrain, y_ETtest = train_test_split(x_data, y_data)
ET.fit(x_ETtrain, y_ETtrain)

# Root Mean Squared Logarithmic Error (RMSLE)
print "Root Mean Squared Logarithmic Error Train: ", rmsele(ET.predict(x_ETtrain), y_ETtrain)
print "Root Mean Squared Logarithmic Error Test: ", rmsele(ET.predict(x_ETtest), y_ETtest)
print "Training R^2: %0.2f%%" % (100 * ET.score(x_ETtrain, y_ETtrain))
print "Test R^2: %0.2f%%" % (100 * ET.score(x_ETtest, y_ETtest)) + "\n"

fig, axes = plt.subplots(figsize=(35, 30), nrows=4, ncols=3)
fig.subplots_adjust(hspace=0.5)
residual = list(ET.predict(x_ETtest).astype(float) - y_ETtest.astype(float))
for i in range(4):
    for j in range(3):
        if (i * 3 + j) < 11:
            axes[i][j].scatter(list(x_ETtest[:, i * 3 + j].astype(float)), residual, c='g', s=50, alpha=0.75)
            axes[i][j].set_title('Residuals vs ' + features[i * 3 + j])
            axes[i][j].set_xlabel(features[i * 3 + j])
            axes[i][j].set_ylabel("Residuals")
            remove_border(axes[i][j])
        else:
            fig.delaxes(axes[i][j])

finalDF_mod = data
finalDF_mod = data.drop('datetime', 1)
finalDF_mod = finalDF_mod.drop('count', 1)
finalDF_mod = finalDF_mod.drop('registered', 1)
finalDF_mod = finalDF_mod.drop('casual', 1)

y_data_reg = data['registered']
y_data_cas = data['casual']
x_data = finalDF_mod

# Use a single split so that the registered and casual rows line up when the two predictions are combined below
xreg_train, xreg_test, yreg_train, yreg_test, ycas_train, ycas_test = train_test_split(x_data, y_data_reg, y_data_cas)
xcas_train, xcas_test = xreg_train, xreg_test

# REGISTERED MODEL
ETreg = ExtraTreesRegressor(n_estimators=1000, min_samples_split=2, min_samples_leaf=2, max_features='auto', n_jobs=1)
ETreg.fit(xreg_train, yreg_train)

# CASUAL MODEL
ETcas = ExtraTreesRegressor(n_estimators=1000, min_samples_split=2, min_samples_leaf=2, max_features='auto', n_jobs=1)
ETcas.fit(xcas_train, ycas_train)

# Root Mean Squared Logarithmic Error (RMSLE)
print "CASUAL USERS MODEL"
print "Root Mean Squared Logarithmic Error Train: ", rmsele(ETcas.predict(xcas_train), ycas_train)
print "Root Mean Squared Logarithmic Error Test: ", rmsele(ETcas.predict(xcas_test), ycas_test)
print "Training R^2: %0.2f%%" % (100 * ETcas.score(xcas_train, ycas_train))
print "Test R^2: %0.2f%%" % (100 * ETcas.score(xcas_test, ycas_test)) + "\n"

print "REGISTERED USERS MODEL"
print "Root Mean Squared Logarithmic Error Train: ", rmsele(ETreg.predict(xreg_train), yreg_train)
print "Root Mean Squared Logarithmic Error Test: ", rmsele(ETreg.predict(xreg_test), yreg_test)
print "Training R^2: %0.2f%%" % (100 * ETreg.score(xreg_train, yreg_train))
print "Test R^2: %0.2f%%" % (100 * ETreg.score(xreg_test, yreg_test)) + "\n"

print "COMBINED MODEL"
print "Root Mean Squared Logarithmic Error Train: ", rmsele((ETreg.predict(xreg_train) + ETcas.predict(xcas_train)), yreg_train + ycas_train)
print "Root Mean Squared Logarithmic Error Test: ", rmsele((ETreg.predict(xreg_test) + ETcas.predict(xcas_test)), yreg_test + ycas_test)

# submit = test.drop('datetime', 1)
# print submit.shape, test.shape, xtrain.shape, xtest.shape
# dfcombo = pd.DataFrame(data=zip(test['datetime'], (ETreg.predict(submit) + ETcas.predict(submit))), columns=["datetime", "count"])
# dfcombo[1:100]
# pd.DataFrame.to_csv(dfcombo, 'submission_comboET.csv', index=False)  # commented out: dfcombo is only created in the block above

import pymc as mc  # Monte Carlo simulation library for Bayesian MCMC
import scipy.stats as stats  # statistical functions for our analysis

def poisson(x, beta, alpha=0):
    return np.exp(np.dot(beta, x) + alpha)

# We normalise the hours so that no single term dominates the higher-order modelling
def normalize(x):
    return (x - x.mean()) / x.std()

hour = data['hour'][0:3000]
hour_norm = normalize(hour)
hour_2 = hour ** 2
hour_2_norm = normalize(hour_2)
hour_3 = hour ** 3
hour_3_norm = normalize(hour_3)

# Y count data
y = data['registered'][0:3000]

# We assume Normal priors for alpha, beta, gamma, delta, which are the contributors to the lambda parameter of our eventual Poisson distribution
# We choose the Normal distribution as its random variables can take both positive and negative values
beta = mc.Normal("beta", 3, 0.5)
alpha = mc.Normal("alpha", 4, 0.5)
gamma = mc.Normal("gamma", -2.5, 0.5)
delta = mc.Normal("delta", 0, 1)
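# Note on the parameterisation: in PyMC 2, mc.Normal takes a precision tau rather than a
# variance, so mc.Normal("beta", 3, 0.5) is a Normal with mean 3 and standard deviation
# 1 / sqrt(0.5), roughly 1.41. A quick check of the implied prior spread:
# print 1 / np.sqrt(0.5)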
# Our deterministic function combines our priors to calculate the lambda rate parameter for our Poisson
@mc.deterministic
def rate_param(x=hour_norm, x2=hour_2_norm, x3=hour_3_norm, beta=beta, alpha=alpha, gamma=gamma, delta=delta):
    return np.exp(np.dot(beta, x) + np.dot(gamma, x2) + np.dot(delta, x3) + alpha)

# The observed data is modelled as a Poisson, with the observed=True argument to show that it should not be modified
# but tied to our dataset
observed = mc.Poisson("poisson_count", mu=rate_param, value=y, observed=True)

# We pass all our parameters to the model so that during the simulation they can be updated simultaneously
model = mc.Model([observed, beta, alpha, gamma, delta])

# MCMC simulation code
map_ = mc.MAP(model)
map_.fit()
mcmc = mc.MCMC(model)
mcmc.sample(120000, 50000, 2)

alpha_samples = mcmc.trace('alpha')[:, None]  # reshape as column vectors for later broadcasting
beta_samples = mcmc.trace('beta')[:, None]
gamma_samples = mcmc.trace('gamma')[:, None]
delta_samples = mcmc.trace('delta')[:, None]

# histograms of the samples:
plt.subplot(411)
plt.title(r"Posterior distributions of the variables $\alpha, \beta, \gamma, \delta$")
plt.ylabel("Occurrences")
plt.hist(alpha_samples, histtype='stepfilled', bins=35, alpha=0.85, label=r"posterior of $\alpha$", color="#A60628", normed=True)
plt.legend()
remove_border()

plt.subplot(412)
plt.ylabel("Occurrences")
plt.hist(beta_samples, histtype='stepfilled', bins=35, alpha=0.85, label=r"posterior of $\beta$", color="#7A68A6", normed=True)
plt.legend()
remove_border()

plt.subplot(413)
plt.ylabel("Occurrences")
plt.hist(gamma_samples, histtype='stepfilled', bins=35, alpha=0.85, label=r"posterior of $\gamma$", color="#A60628", normed=True)
plt.legend()
remove_border()

plt.subplot(414)
plt.ylabel("Occurrences")
plt.hist(delta_samples, histtype='stepfilled', bins=35, alpha=0.85, label=r"posterior of $\delta$", color="red", normed=True)
plt.legend()
remove_border()

t = np.linspace(hour_norm.min(), hour_norm.max(), 50)[:, None]
t2 = np.linspace(hour_2_norm.min(), hour_2_norm.max(), 50)[:, None]
t3 = np.linspace(hour_3_norm.min(), hour_3_norm.max(), 50)[:, None]
lambda_ = np.exp(np.dot(beta_samples, t.T) + np.dot(gamma_samples, t2.T) + np.dot(delta_samples, t3.T) + alpha_samples)
mean_count_t_ = lambda_.mean(axis=0)
# plt.plot(t, mean_count_t_)

# Simulate data points
simulated = mc.Poisson("poisson_sim", rate_param)
N = 10000
mcmc = mc.MCMC([simulated, alpha, beta, gamma, delta, observed])
mcmc.sample(N)

simulations = mcmc.trace("poisson_sim")[:]
for i in range(50):
    plt.scatter(hour, simulations[i, :], color="k", s=2, alpha=0.1)
remove_border()
plt.title("Simulated draws from Poisson distribution using lambda posterior")
plt.ylabel("Rental bikes count")
plt.xlabel("Hour")

df = pd.DataFrame(data=hour)
df['bikes'] = y
mean_data_hour = df.groupby('hour').bikes.mean()

plt.scatter(hour, y, color="k", s=50, alpha=0.2)
plt.plot(mean_data_hour.index, mean_data_hour.values, color='r', label="average")
plt.title("Posterior expected value of counts of bike rentals; plus realizations")
plt.legend(loc="upper left")
plt.xlim(0, 24)
plt.ylabel("Rental bikes count")
plt.xlabel("Hours")
remove_border()

from sklearn.cluster import KMeans

hour = np.asarray(hour)
n = 2  # Number of components of the mixture
kme = KMeans(n)
# This is not needed, but it helps convergence
# kme.fit(data[:, np.newaxis])
kme.fit(hour[:, np.newaxis])

# ndata = len(data)
ndata = len(hour)
dd = mc.Dirichlet('dd', theta=(1,) * n)
category = mc.Categorical('category', p=dd, size=ndata, value=kme.labels_)

# alphas = mc.Uniform("alphas", 0, 1, size=n)
alphas = mc.Gamma('alphas', alpha=1, beta=.0001, size=n)
means = mc.TruncatedNormal('means', kme.cluster_centers_[:, 0], 0.1, a=0.0, b=100000, size=n)

@mc.deterministic
def mean(category=category, means=means):
    return means[category]

@mc.deterministic
def alpha(category=category, alphas=alphas):
    return alphas[category]

obs = mc.NegativeBinomial('obs', mean, alpha, value=y, observed=True)
predictive = mc.NegativeBinomial('predictive', mean, alpha)

model = mc.Model({'dd': dd, 'category': category, 'alphas': alphas, 'means': means, 'predictive': predictive, 'obs': obs})
mcmc = mc.MCMC(model)
mcmc.sample(10000, 7000)

G = [hour[np.nonzero(np.round(mcmc.trace("category")[:].mean(axis=0)) == i)] for i in range(0, 2)]
plt.hist(G, bins=30, stacked=True)
remove_border()
plt.title("Assignment of category across hours")
plt.xlabel("Hour")

simulations = mcmc.trace("predictive")[:]
print simulations.shape
print simulations[0:100]

for i in range(100):
    plt.scatter(hour, simulations[i, :], color="k", s=2, alpha=0.1)
remove_border()
plt.title("Simulated draws from Negative Binomial distribution")
plt.ylabel("Rental bikes count")
plt.xlabel("Hours")