This is an ipython notebook, which details work done in building a regressor on 'bike sharing demand' for Kaggle's 'Knowledge' competition.
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import brewer2mpl
from matplotlib import rcParams
#colorbrewer2 Dark2 qualitative color table
dark2_cmap = brewer2mpl.get_map('Dark2', 'Qualitative', 7)
dark2_colors = dark2_cmap.mpl_colors
rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 20
rcParams['patch.edgecolor'] = 'white'
rcParams['patch.facecolor'] = dark2_colors[0]
rcParams['font.family'] = 'StixGeneral'
def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
"""
Minimize chartjunk by stripping out unnecesasry plot borders and axis ticks
The top/right/left/bottom keywords toggle whether the corresponding plot border is drawn
"""
ax = axes or plt.gca()
ax.spines['top'].set_visible(top)
ax.spines['right'].set_visible(right)
ax.spines['left'].set_visible(left)
ax.spines['bottom'].set_visible(bottom)
#turn off all ticks
ax.yaxis.set_ticks_position('none')
ax.xaxis.set_ticks_position('none')
#now re-enable visibles
if top:
ax.xaxis.tick_top()
if bottom:
ax.xaxis.tick_bottom()
if left:
ax.yaxis.tick_left()
if right:
ax.yaxis.tick_right()
import csv
results = []
path = "train.csv"
with open(path, "r") as d:
header = d.next().strip("\n").split(",")
for line in d:
results.append(line.strip("\n").split(","))
data = pd.DataFrame(data=np.asarray(results), columns=header)
We are asked to predict the number of rentals during each hour of the day, so it makes sense to split the datetime column into 'hour of the day'. We also split it into 'day of the week' and 'month of the year' as we think these might be useful features for the model and might provide insights. For example there might be some useful information encoded in whether it is a weekday versus a weekend or also whether it is the start of the year and people start cycling as a New Year resolution.
from datetime import datetime, date, time
data['hour'] = data['datetime'].map(lambda x: (datetime.strptime(x, "%Y-%m-%d %H:%M:%S")).hour)
data['weekday'] = data['datetime'].map(lambda x: (datetime.strptime(x, "%Y-%m-%d %H:%M:%S")).weekday())
data['month'] = data['datetime'].map(lambda x: (datetime.strptime(x, "%Y-%m-%d %H:%M:%S")).month)
data.head()
datetime | season | holiday | workingday | weather | temp | atemp | humidity | windspeed | casual | registered | count | hour | weekday | month | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2011-01-01 00:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 81 | 0 | 3 | 13 | 16 | 0 | 5 | 1 |
1 | 2011-01-01 01:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0 | 8 | 32 | 40 | 1 | 5 | 1 |
2 | 2011-01-01 02:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0 | 5 | 27 | 32 | 2 | 5 | 1 |
3 | 2011-01-01 03:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0 | 3 | 10 | 13 | 3 | 5 | 1 |
4 | 2011-01-01 04:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0 | 0 | 1 | 1 | 4 | 5 | 1 |
We convert the numerical variables, namely 'temp', 'atemp', 'humidity' and 'windspeed' to floats. We also convert the count variables corresponding to our labelled examples, namely 'registered', 'casual' and 'count' as int.
data['temp'] = data['temp'].astype(float)
data['atemp'] = data['atemp'].astype(float)
data['humidity'] = data['humidity'].astype(float)
data['windspeed'] = data['windspeed'].astype(float)
data['casual'] = data['casual'].astype(int)
data['registered'] = data['registered'].astype(int)
data['count'] = data['count'].astype(int)
For the categorical variables, namely 'season', 'holiday', 'workingday', 'weather', 'weekday', hour', and 'month' we use a 0-1 hot encoding.
season = pd.get_dummies(data = data.iloc[:,1], prefix = 'season')
holiday = pd.get_dummies(data = data.iloc[:,2], prefix = 'holiday')
workingday = pd.get_dummies(data = data.iloc[:,3], prefix = 'workingday')
weather = pd.get_dummies(data = data.iloc[:,4], prefix = 'weather')
weekday = pd.get_dummies(data = data.iloc[:,12], prefix = 'weekday')
hour = pd.get_dummies(data = data.iloc[:,13], prefix = 'hour')
month = pd.get_dummies(data = data.iloc[:,14], prefix = 'month')
Then we reconstruct the dataframe.
finalDF = data.iloc[:,5:9] #numerical
#categorical
finalDF =finalDF.join(season)
finalDF = finalDF.join(holiday)
finalDF = finalDF.join(workingday)
finalDF = finalDF.join(weather)
finalDF = finalDF.join(weekday)
finalDF = finalDF.join(hour)
finalDF = finalDF.join(month)
finalDF = finalDF.join(data.iloc[:,9:12]) #labels
print finalDF.shape
finalDF.head()
(10886, 62)
temp | atemp | humidity | windspeed | season_1 | season_2 | season_3 | season_4 | holiday_0 | holiday_1 | ... | month_6 | month_7 | month_8 | month_9 | month_10 | month_11 | month_12 | casual | registered | count | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 9.84 | 14.395 | 81 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 13 | 16 |
1 | 9.02 | 13.635 | 80 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 32 | 40 |
2 | 9.02 | 13.635 | 80 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 27 | 32 |
3 | 9.84 | 14.395 | 75 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 10 | 13 |
4 | 9.84 | 14.395 | 75 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
5 rows × 62 columns
fig, axes=plt.subplots(figsize=(35, 14), nrows=2, ncols=3)
fig.subplots_adjust(hspace = 0.3)
sum_dataREG = data.groupby('weekday').registered.sum()
count_dataREG = data.groupby('weekday').registered.count()
average_dataREG = sum_dataREG / count_dataREG
sum_dataCAS = data.groupby('weekday').casual.sum()
count_dataCAS = data.groupby('weekday').casual.count()
average_dataCAS = sum_dataCAS / count_dataCAS
index = np.arange(len(sum_dataCAS))
axes[0][0].bar(index, average_dataREG, color = "r", label = "registered users")
axes[0][0].bar(index, average_dataCAS, color = "b", label = "casual users")
remove_border()
axes[0][0].set_ylabel("Total bikes rented")
axes[0][0].set_xlabel("Weekday")
axes[0][0].set_title("Average number of rentals by day of the week")
axes[0][0].legend()
remove_border(axes[0][0])
sum_dataREG = data.groupby('hour').registered.sum()
count_dataREG = data.groupby('hour').registered.count()
average_dataREG = sum_dataREG / count_dataREG
sum_dataCAS = data.groupby('hour').casual.sum()
count_dataCAS = data.groupby('hour').casual.count()
average_dataCAS = sum_dataCAS / count_dataCAS
index = np.arange(len(sum_dataCAS))
axes[0][1].bar(index, average_dataREG, color = "r", label = "registered users")
axes[0][1].bar(index, average_dataCAS, color = "b", label = "casual users")
remove_border()
axes[0][1].set_ylabel("Total bikes rented")
axes[0][1].set_xlabel("Hour of day")
axes[0][1].set_title("Average number of rentals by hour of day")
axes[0][1].legend()
remove_border(axes[0][1])
sum_dataREG = data.groupby('month').registered.sum()
count_dataREG = data.groupby('month').registered.count()
average_dataREG = sum_dataREG / count_dataREG
sum_dataCAS = data.groupby('month').casual.sum()
count_dataCAS = data.groupby('month').casual.count()
average_dataCAS = sum_dataCAS / count_dataCAS
index = np.arange(len(sum_dataCAS))
axes[0][2].bar(index, average_dataREG, color = "r", label = "registered users")
axes[0][2].bar(index, average_dataCAS, color = "b", label = "casual users")
remove_border()
axes[0][2].set_ylabel("Total bikes rented")
axes[0][2].set_xlabel("Month of year")
axes[0][2].set_title("Average number of rentals by month of year")
axes[0][2].legend()
remove_border(axes[0][2])
sum_dataREG = data.groupby('workingday').registered.sum()
count_dataREG = data.groupby('workingday').registered.count()
average_dataREG = sum_dataREG / count_dataREG
sum_dataCAS = data.groupby('workingday').casual.sum()
count_dataCAS = data.groupby('workingday').casual.count()
average_dataCAS = sum_dataCAS / count_dataCAS
index = np.arange(len(sum_dataCAS))
axes[1][0].bar(index, average_dataREG, color = "r", label = "registered users")
axes[1][0].bar(index, average_dataCAS, color = "b", label = "casual users")
remove_border()
axes[1][0].set_ylabel("Total bikes rented")
axes[1][0].set_title("Average number of rentals by working day")
axes[1][0].legend()
axes[1][0].axes.set_xticklabels(["","","Non-Working","","","","","Working"])
remove_border(axes[1][0])
sum_dataREG = data.groupby('holiday').registered.sum()
count_dataREG = data.groupby('holiday').registered.count()
average_dataREG = sum_dataREG / count_dataREG
sum_dataCAS = data.groupby('holiday').casual.sum()
count_dataCAS = data.groupby('holiday').casual.count()
average_dataCAS = sum_dataCAS / count_dataCAS
index = np.arange(len(sum_dataCAS))
axes[1][1].bar(index, average_dataREG, color = "r", label = "registered users")
axes[1][1].bar(index, average_dataCAS, color = "b", label = "casual users")
remove_border()
axes[1][1].set_ylabel("Total bikes rented")
axes[1][1].set_title("Average number of rentals by holiday")
axes[1][1].legend()
axes[1][1].axes.set_xticklabels(["","","Non-Holiday","","","","","Holiday"])
remove_border(axes[1][1])
fig.delaxes(axes[1][2])
The above bar plots shows there is a difference in user behaviour between the casual and registered users.
Indeed from the first plot we can see that the casual users rent bycicles proportionately more frequently at the weekend and also that there is a drop in bikes rented from the registered users.
It is interesting to observe from the second plot that the two user groups seem to come from separate distributions. The registered users seem to come from a bimodal distribution peaked some point early morning around 8am and some point later in the evening around 17 / 18 h, both corresponding to working hours. On the other hand the casual users seem to come from a different distribution which is single peaked showcasing a 'day-tripped' behaviour.
It is odd to note from the third plot that the rentals for Novemeber and December remain high whereas they drop in January and February for the registered users. We would expect that those months might show similar behaviours; perhaps this shows the importance of establishing a routine ? After having cycled all year, people stick to their cycling regime but after the holiday period, people find it more difficult to get back into it.
The two final plots show that there is an increase in casual user bike rentals during holidays and non working days and correspondingly a higher number of bikes rented by registered users in those days.
fig, axes=plt.subplots(figsize=(35, 16), nrows=2, ncols=3)
fig.subplots_adjust(hspace = 0.5)
sum_dataREG = data.groupby('temp').registered.sum()
count_dataREG = data.groupby('temp').registered.count()
average_dataREG = sum_dataREG / count_dataREG
sum_dataCAS = data.groupby('temp').casual.sum()
count_dataCAS = data.groupby('temp').casual.count()
average_dataCAS = sum_dataCAS / count_dataCAS
index = np.arange(len(sum_dataCAS))
axes[0][0].bar(index, average_dataREG, color = "r", label = "registered users")
axes[0][0].bar(index, average_dataCAS, color = "b", label = "casual users")
remove_border(axes[0][0])
axes[0][0].set_ylabel("Total bikes rented")
axes[0][0].set_xlabel("Temperature (Celcius)")
axes[0][0].set_title("Average number of rentals according to temperature", y=1.1)
axes[0][0].legend()
sum_dataREG = data.groupby('weather').registered.sum()
count_dataREG = data.groupby('weather').registered.count()
average_dataREG = sum_dataREG / count_dataREG
sum_dataCAS = data.groupby('weather').casual.sum()
count_dataCAS = data.groupby('weather').casual.count()
average_dataCAS = sum_dataCAS / count_dataCAS
index = np.arange(len(sum_dataCAS))
axes[0][1].bar(index, average_dataREG, color = "r", label = "registered users")
axes[0][1].bar(index, average_dataCAS, color = "b", label = "casual users")
remove_border(axes[0][1])
axes[0][1].set_ylabel("Total bikes rented")
axes[0][1].set_xlabel("Weather")
axes[0][1].set_title("Average number of rentals according to weather encoding", y=1.1)
axes[0][1].legend()
sum_dataREG = data.groupby('season').registered.sum()
count_dataREG = data.groupby('season').registered.count()
average_dataREG = sum_dataREG / count_dataREG
sum_dataCAS = data.groupby('season').casual.sum()
count_dataCAS = data.groupby('season').casual.count()
average_dataCAS = sum_dataCAS / count_dataCAS
index = np.arange(len(sum_dataCAS))
axes[0][2].bar(index, average_dataREG, color = "r", label = "registered users")
axes[0][2].bar(index, average_dataCAS, color = "b", label = "casual users")
remove_border(axes[0][2])
axes[0][2].set_ylabel("Total bikes rented")
axes[0][2].set_xlabel("Season")
axes[0][2].set_title("Average number of rentals according to season encoding", y=1.1)
axes[0][2].legend()
sum_dataREG = data.groupby('atemp').registered.sum()
count_dataREG = data.groupby('atemp').registered.count()
average_dataREG = sum_dataREG / count_dataREG
sum_dataCAS = data.groupby('atemp').casual.sum()
count_dataCAS = data.groupby('atemp').casual.count()
average_dataCAS = sum_dataCAS / count_dataCAS
index = np.arange(len(sum_dataCAS))
axes[1][0].bar(index, average_dataREG, color = "r", label = "registered users")
axes[1][0].bar(index, average_dataCAS, color = "b", label = "casual users")
remove_border(axes[1][0])
axes[1][0].set_ylabel("Total bikes rented")
axes[1][0].set_xlabel("Feels Like Temperature (Celcius)")
axes[1][0].set_title("Average number of rentals according to 'feels like' temperature", y=1.1)
axes[1][0].legend()
sum_dataREG = data.groupby('humidity').registered.sum()
count_dataREG = data.groupby('humidity').registered.count()
average_dataREG = sum_dataREG / count_dataREG
sum_dataCAS = data.groupby('humidity').casual.sum()
count_dataCAS = data.groupby('humidity').casual.count()
average_dataCAS = sum_dataCAS / count_dataCAS
index = np.arange(len(sum_dataCAS))
axes[1][1].bar(index, average_dataREG, color = "r", label = "registered users")
axes[1][1].bar(index, average_dataCAS, color = "b", label = "casual users")
remove_border(axes[1][1])
axes[1][1].set_ylabel("Total bikes rented")
axes[1][1].set_xlabel("Humidity")
axes[1][1].set_title("Average number of rentals according to humidity", y=1.1)
axes[1][1].legend()
sum_dataREG = data.groupby('windspeed').registered.sum()
count_dataREG = data.groupby('windspeed').registered.count()
average_dataREG = sum_dataREG / count_dataREG
sum_dataCAS = data.groupby('windspeed').casual.sum()
count_dataCAS = data.groupby('windspeed').casual.count()
average_dataCAS = sum_dataCAS / count_dataCAS
index = np.arange(len(sum_dataCAS))
axes[1][2].bar(index, average_dataREG, color = "r", label = "registered users")
axes[1][2].bar(index, average_dataCAS, color = "b", label = "casual users")
remove_border(axes[1][2])
axes[1][2].set_ylabel("Total bikes rented")
axes[1][2].set_xlabel("Windspeed")
axes[1][2].set_title("Average number of rentals according to windspeed", y=1.1)
axes[1][2].legend()
<matplotlib.legend.Legend at 0x12b345fd0>
As expected, the casual user bike rental decreases as the weather worsen with the season and as the temperature decreases - temperature seems to have a significant effect.
There is unusual behaviour at high windspeeds from casual users, this could be due to the likely low number of occurences of these windspeeds in the dataset.
We observe lesser rentals for both groups at high humidities.
weather_interest = ['temp','atemp','humidity','windspeed','season', 'weather', 'hour', 'month']
_ = pd.scatter_matrix(data[weather_interest].astype(float), figsize=(18,18), alpha=0.8, diagonal='hist')
The above scatter plot matrix was used to try and observe any unusual behaviour, as well as potential correlations between variables in the dataset
Correlations
Our first model is a simple linear regression. This is seen to be a poor choice, with the reasons why being explored below.
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression
#Create two datasets for casual and registered users
y_data = finalDF['count']
x_data = finalDF.drop('count', 1)
x_data = x_data.drop('registered', 1)
x_data = x_data.drop('casual', 1)
#casual
y_data_casual = finalDF['casual']
x_data_casual = finalDF.drop('casual',1)
x_data_casual = x_data_casual.drop('registered',1)
x_data_casual = x_data_casual.drop('count',1)
#registered
y_data_registered = finalDF['registered']
x_data_registered = finalDF.drop('casual',1)
x_data_registered = x_data_registered.drop('registered',1)
x_data_registered = x_data_registered.drop('count',1)
xtrain_reg, xtest_reg, ytrain_reg, ytest_reg = train_test_split(x_data_registered, y_data_registered)
xtrain_cas, xtest_cas, ytrain_cas, ytest_cas = train_test_split(x_data_casual, y_data_casual)
xtrain, xtest, ytrain, ytest = train_test_split(x_data, y_data)
We trained two seperate models, one for each of the 'casual' and 'registered' users. This allows us to take advantage of the fact that the behaviour of the groups appeared to be different from the plots above. A composite model of the two is also trained for comparison purposes.
clf = LinearRegression()
clf_cas = LinearRegression()
clf_reg = LinearRegression()
clf.fit(xtrain, ytrain)
clf_cas.fit(xtrain_cas, ytrain_cas)
clf_reg.fit(xtrain_reg, ytrain_reg)
#The below code searched for negative count predictions as we assumed this was why we were unable to calculate our log error
# check = [e for e in clf.predict(xtest) if e <= -1]
# print check
print "Squared Error ", np.mean((clf.predict(xtest)-ytest)**2)
print "Squared Error casual ", np.mean((clf_cas.predict(xtest_cas)-ytest_cas)**2)
print "Squared Error reg " , np.mean((clf_reg.predict(xtest_reg)-ytest_reg)**2), "\n"
print "Training accuracy: %0.2f%%" % (100*clf.score(xtrain, ytrain))
print "Test accuracy: %0.2f%%" % (100*clf.score(xtest, ytest)) + "\n"
print "Training accuracy casual: %0.2f%%" % (100*clf_cas.score(xtrain_cas, ytrain_cas))
print "Test accuracy casual: %0.2f%%" % (100*clf_cas.score(xtest_cas, ytest_cas)) + "\n"
print "Training accuracy reg: %0.2f%%" % (100*clf_reg.score(xtrain_reg, ytrain_reg))
print "Test accuracy reg: %0.2f%%" % (100*clf_reg.score(xtest_reg, ytest_reg))
Squared Error 11836.1071948 Squared Error casual 1132.00602578 Squared Error reg 8659.22159901 Training accuracy: 63.40% Test accuracy: 64.69% Training accuracy casual: 58.81% Test accuracy casual: 56.57% Training accuracy reg: 63.56% Test accuracy reg: 61.58%
The mean squared error incorporates the variance and the bias. The MSE of the single model is higher than the sum of the MSE of the two separate models combined. The single model shows a higher accuracy of the two separate models suggesting that is it slightly less biased, therefore suggesting a larger variance causing the larger MSE. This suggests that the two separate models are probably going to be more generalisable and suffer less from high variance and overfitting.
Below is a plot of the residuals.
plt.scatter(clf.predict(xtest), clf.predict(xtest)- ytest, c='g', s=40, label='Residual Testing')
plt.scatter(clf.predict(xtrain), clf.predict(xtrain)- ytrain, c='b', s=40, alpha = 0.25, label='Residual Training')
plt.plot([-200,600],[0,0], c='r')
plt.legend()
plt.title('Residuals vs Bike Share Count')
plt.xlabel("Predicted count values")
plt.ylabel("Residuals")
remove_border()
From the above analysis, we realise that modeling Y as a linear function of the regression coefficients may not be sensible and a more realistic model might be to use the natural log of the response variable, ln(Y), as a linear function of the coefficients instead. This will ensure that our response variable can only be defined in the interval [0, +inf)
finalDF_mod = finalDF
#Build the first model using total count as a label so convert the count column to log counts
#Note: it is ok to just directly convert to log(y) as min(y) is 1 and not 0 in the 'count' column
finalDF_mod['log_count'] = finalDF['count'].map(lambda x: np.log(x))
#Remove the count, the registered counts and the causal counts columns
finalDF_mod = finalDF.drop('count', 1)
finalDF_mod = finalDF_mod.drop('registered', 1)
finalDF_mod = finalDF_mod.drop('casual', 1)
finalDF_mod.head()
temp | atemp | humidity | windspeed | season_1 | season_2 | season_3 | season_4 | holiday_0 | holiday_1 | ... | month_4 | month_5 | month_6 | month_7 | month_8 | month_9 | month_10 | month_11 | month_12 | log_count | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 9.84 | 14.395 | 81 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.772589 |
1 | 9.02 | 13.635 | 80 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3.688879 |
2 | 9.02 | 13.635 | 80 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3.465736 |
3 | 9.84 | 14.395 | 75 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.564949 |
4 | 9.84 | 14.395 | 75 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.000000 |
5 rows × 60 columns
def rmsele(actual, pred):
"""
Given a column of predictions and a column of actuals, calculate the RMSELE
"""
squared_errors = (np.log(pred + 1) - np.log(actual + 1)) ** 2
mean_squared = np.sum(squared_errors) / len(squared_errors)
return np.sqrt(mean_squared)
x_data = finalDF_mod.drop('log_count', 1)
print x_data.shape
y_data = finalDF_mod['log_count']
xtrain, xtest, ytrain, ytest = train_test_split(x_data, y_data)
clf = LinearRegression()
clf.fit(xtrain, ytrain)
#Root Mean Squared Logarithmic Error (RMSLE)
print "Root Mean Squared Logarithmic Error: ", rmsele(np.exp(clf.predict(xtest)), np.exp(ytest))
print "Training accuracy: %0.2f%%" % (100*clf.score(xtrain, ytrain))
print "Test accuracy: %0.2f%%" % (100*clf.score(xtest, ytest)) + "\n"
(10886, 59) Root Mean Squared Logarithmic Error: 0.619990505154 Training accuracy: 80.27% Test accuracy: 80.08%
plt.scatter(np.exp(clf.predict(xtest)), np.exp(ytest))
plt.title("Poisson Regression Model Prediction")
plt.ylabel("Actual Rental Counts")
plt.xlabel("Predicted Rental Counts")
plt.xlim(0, 1000)
plt.ylim(0, 1000)
remove_border()
The above plot shows the predictions of our model against the actual values. We can see that the model appears to capture the general trends in the data, but struggles to predict rentals accurately expecially at higher predicted count values.
plt.scatter(np.exp(clf.predict(xtest)), np.exp(clf.predict(xtest))-np.exp(ytest), c='g', s=40, label='Residual Testing')
plt.scatter(np.exp(clf.predict(xtrain)), np.exp(clf.predict(xtrain))-np.exp(ytrain), c='b', s=40, alpha = 0.25, label='Residual Training')
plt.plot([0,600],[0,0], c='r')
plt.xlim(0,600)
plt.legend()
plt.title('Residuals vs Bike Share Count')
plt.xlabel("Predicted count values")
plt.ylabel("Residuals")
remove_border()
We can see from the above residuals plot that the mean is approximately averaged around zero and the heteroscedascity problem has been reduced. Also the non-linearity problem has been resolved.
We decide to use the above model as a first model for generating predictions for the Kaggle competition. The following code generates predictions from the test set, which has been pre-processed in the same way as the training set.
Note the above model was trained on a subset of the training data (as we used a train_test_split to cross validate our model and present the analysis here); for the submission, the model is trained over the full dataset.
Below is also the code to write the submission file with the rental count predictions per hour.
test_results = []
path = "test.csv"
with open(path, "r") as d:
test_header = d.next().strip("\n").split(",")
for line in d:
test_results.append(line.strip("\n").split(","))
test = pd.DataFrame(data=np.asarray(test_results), columns=test_header)
test['hour'] = test['datetime'].map(lambda x: (datetime.strptime(x, "%Y-%m-%d %H:%M:%S")).hour)
test['weekday'] = test['datetime'].map(lambda x: (datetime.strptime(x, "%Y-%m-%d %H:%M:%S")).weekday())
test['month'] = test['datetime'].map(lambda x: (datetime.strptime(x, "%Y-%m-%d %H:%M:%S")).month)
test['season'] = test['season'].astype(int)
test['holiday'] = test['holiday'].astype(int)
test['workingday'] = test['workingday'].astype(int)
test['weather'] = test['weather'].astype(int)
test['temp'] = test['temp'].astype(float)
test['atemp'] = test['atemp'].astype(float)
test['humidity'] = test['humidity'].astype(float)
test['windspeed'] = test['windspeed'].astype(float)
season_test = pd.get_dummies(data = test.iloc[:,1], prefix = 'season')
holiday_test = pd.get_dummies(data = test.iloc[:,2], prefix = 'holiday')
workingday_test = pd.get_dummies(data = test.iloc[:,3], prefix = 'workingday')
weather_test = pd.get_dummies(data = test.iloc[:,4], prefix = 'weather')
weekday_test = pd.get_dummies(data = test.iloc[:,9], prefix = 'weekday')
hour_test = pd.get_dummies(data = test.iloc[:,10], prefix = 'hour')
month_test = pd.get_dummies(data = test.iloc[:,11], prefix = 'month')
finalDF_test = test.iloc[:,5:9] #numerical
#categorical
finalDF_test = finalDF_test.join(season_test)
finalDF_test = finalDF_test.join(holiday_test)
finalDF_test = finalDF_test.join(workingday_test)
finalDF_test = finalDF_test.join(weather_test)
finalDF_test = finalDF_test.join(weekday_test)
finalDF_test = finalDF_test.join(hour_test)
finalDF_test = finalDF_test.join(month_test)
print finalDF_test.shape
finalDF_test.head()
(6493, 59)
temp | atemp | humidity | windspeed | season_1 | season_2 | season_3 | season_4 | holiday_0 | holiday_1 | ... | month_3 | month_4 | month_5 | month_6 | month_7 | month_8 | month_9 | month_10 | month_11 | month_12 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10.66 | 11.365 | 56 | 26.0027 | 1 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 10.66 | 13.635 | 56 | 0.0000 | 1 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 10.66 | 13.635 | 56 | 0.0000 | 1 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 10.66 | 12.880 | 56 | 11.0014 | 1 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 10.66 | 12.880 | 56 | 11.0014 | 1 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 59 columns
df = pd.DataFrame(data=zip(test['datetime'], np.exp(clf.predict(finalDF_test))), columns = ["datetime", "count"])
pd.DataFrame.to_csv(df, 'submission.csv', index = False)
We now implement random forest models to reduce the root mean squared logarithmic error. For this section we will use the non 'hot-encoded' dataframe, as the random forest will not be affected by using this data.
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import train_test_split
from sklearn.metrics import make_scorer
finalDF_mod = data
finalDF_mod = data.drop('datetime', 1)
finalDF_mod = finalDF_mod.drop('count', 1)
finalDF_mod = finalDF_mod.drop('registered', 1)
finalDF_mod = finalDF_mod.drop('casual', 1)
y_data = data['count']
x_data = finalDF_mod
features = x_data.columns
xtrain, xtest, ytrain, ytest = train_test_split(x_data, y_data)
from sklearn.grid_search import GridSearchCV
# This helper function will make a callable that we can use in cross_val_score
rmsele_scorer = make_scorer(rmsele, greater_is_better=False)
#Parameter Optimisation with grid search
tuned_parameters = [{'max_features': ['sqrt', 'log2', 'auto'], 'max_depth': [5, 8, 12], 'min_samples_leaf': [2, 5, 10]}]
rf = GridSearchCV(RandomForestRegressor(n_jobs=1, n_estimators=1000), tuned_parameters, cv=3, verbose=2, scoring=rmsele_scorer).fit(xtrain, ytrain)
Fitting 3 folds for each of 27 candidates, totalling 81 fits [GridSearchCV] max_features=sqrt, max_depth=5, min_samples_leaf=2 .............. [GridSearchCV] ..... max_features=sqrt, max_depth=5, min_samples_leaf=2 - 5.0s [GridSearchCV] max_features=sqrt, max_depth=5, min_samples_leaf=2 .............. [GridSearchCV] ..... max_features=sqrt, max_depth=5, min_samples_leaf=2 - 5.0s [GridSearchCV] max_features=sqrt, max_depth=5, min_samples_leaf=2 .............. [GridSearchCV] ..... max_features=sqrt, max_depth=5, min_samples_leaf=2 - 5.2s [GridSearchCV] max_features=sqrt, max_depth=5, min_samples_leaf=5 .............. [GridSearchCV] ..... max_features=sqrt, max_depth=5, min_samples_leaf=5 - 5.2s [GridSearchCV] max_features=sqrt, max_depth=5, min_samples_leaf=5 .............. [GridSearchCV] ..... max_features=sqrt, max_depth=5, min_samples_leaf=5 - 5.1s [GridSearchCV] max_features=sqrt, max_depth=5, min_samples_leaf=5 .............. [GridSearchCV] ..... max_features=sqrt, max_depth=5, min_samples_leaf=5 - 5.1s [GridSearchCV] max_features=sqrt, max_depth=5, min_samples_leaf=10 ............. [GridSearchCV] .... max_features=sqrt, max_depth=5, min_samples_leaf=10 - 4.9s [GridSearchCV] max_features=sqrt, max_depth=5, min_samples_leaf=10 ............. [GridSearchCV] .... max_features=sqrt, max_depth=5, min_samples_leaf=10 - 4.9s [GridSearchCV] max_features=sqrt, max_depth=5, min_samples_leaf=10 ............. [GridSearchCV] .... max_features=sqrt, max_depth=5, min_samples_leaf=10 - 5.1s [GridSearchCV] max_features=log2, max_depth=5, min_samples_leaf=2 .............. [GridSearchCV] ..... max_features=log2, max_depth=5, min_samples_leaf=2 - 5.2s [GridSearchCV] max_features=log2, max_depth=5, min_samples_leaf=2 .............. [GridSearchCV] ..... max_features=log2, max_depth=5, min_samples_leaf=2 - 5.2s [GridSearchCV] max_features=log2, max_depth=5, min_samples_leaf=2 .............. [GridSearchCV] ..... max_features=log2, max_depth=5, min_samples_leaf=2 - 5.0s [GridSearchCV] max_features=log2, max_depth=5, min_samples_leaf=5 .............. [GridSearchCV] ..... max_features=log2, max_depth=5, min_samples_leaf=5 - 5.0s [GridSearchCV] max_features=log2, max_depth=5, min_samples_leaf=5 .............. [GridSearchCV] ..... max_features=log2, max_depth=5, min_samples_leaf=5 - 4.9s [GridSearchCV] max_features=log2, max_depth=5, min_samples_leaf=5 .............. [GridSearchCV] ..... max_features=log2, max_depth=5, min_samples_leaf=5 - 4.9s [GridSearchCV] max_features=log2, max_depth=5, min_samples_leaf=10 ............. [GridSearchCV] .... max_features=log2, max_depth=5, min_samples_leaf=10 - 5.0s [GridSearchCV] max_features=log2, max_depth=5, min_samples_leaf=10 ............. [GridSearchCV] .... max_features=log2, max_depth=5, min_samples_leaf=10 - 4.9s [GridSearchCV] max_features=log2, max_depth=5, min_samples_leaf=10 ............. [GridSearchCV] .... max_features=log2, max_depth=5, min_samples_leaf=10 - 4.9s [GridSearchCV] max_features=auto, max_depth=5, min_samples_leaf=2 .............. [GridSearchCV] ..... max_features=auto, max_depth=5, min_samples_leaf=2 - 15.6s [GridSearchCV] max_features=auto, max_depth=5, min_samples_leaf=2 .............. [GridSearchCV] ..... max_features=auto, max_depth=5, min_samples_leaf=2 - 15.6s [GridSearchCV] max_features=auto, max_depth=5, min_samples_leaf=2 .............. [GridSearchCV] ..... max_features=auto, max_depth=5, min_samples_leaf=2 - 15.5s [GridSearchCV] max_features=auto, max_depth=5, min_samples_leaf=5 .............. [GridSearchCV] ..... max_features=auto, max_depth=5, min_samples_leaf=5 - 15.8s [GridSearchCV] max_features=auto, max_depth=5, min_samples_leaf=5 .............. [GridSearchCV] ..... max_features=auto, max_depth=5, min_samples_leaf=5 - 15.6s [GridSearchCV] max_features=auto, max_depth=5, min_samples_leaf=5 .............. [GridSearchCV] ..... max_features=auto, max_depth=5, min_samples_leaf=5 - 15.6s [GridSearchCV] max_features=auto, max_depth=5, min_samples_leaf=10 ............. [GridSearchCV] .... max_features=auto, max_depth=5, min_samples_leaf=10 - 15.8s [GridSearchCV] max_features=auto, max_depth=5, min_samples_leaf=10 ............. [GridSearchCV] .... max_features=auto, max_depth=5, min_samples_leaf=10 - 15.7s [GridSearchCV] max_features=auto, max_depth=5, min_samples_leaf=10 ............. [GridSearchCV] .... max_features=auto, max_depth=5, min_samples_leaf=10 - 15.6s [GridSearchCV] max_features=sqrt, max_depth=8, min_samples_leaf=2 .............. [GridSearchCV] ..... max_features=sqrt, max_depth=8, min_samples_leaf=2 - 7.5s [GridSearchCV] max_features=sqrt, max_depth=8, min_samples_leaf=2 .............. [GridSearchCV] ..... max_features=sqrt, max_depth=8, min_samples_leaf=2 - 7.6s [GridSearchCV] max_features=sqrt, max_depth=8, min_samples_leaf=2 .............. [GridSearchCV] ..... max_features=sqrt, max_depth=8, min_samples_leaf=2 - 7.3s [GridSearchCV] max_features=sqrt, max_depth=8, min_samples_leaf=5 .............. [GridSearchCV] ..... max_features=sqrt, max_depth=8, min_samples_leaf=5 - 7.2s [GridSearchCV] max_features=sqrt, max_depth=8, min_samples_leaf=5 .............. [GridSearchCV] ..... max_features=sqrt, max_depth=8, min_samples_leaf=5 - 7.8s [GridSearchCV] max_features=sqrt, max_depth=8, min_samples_leaf=5 .............. [GridSearchCV] ..... max_features=sqrt, max_depth=8, min_samples_leaf=5 - 7.8s [GridSearchCV] max_features=sqrt, max_depth=8, min_samples_leaf=10 ............. [GridSearchCV] .... max_features=sqrt, max_depth=8, min_samples_leaf=10 - 7.6s [GridSearchCV] max_features=sqrt, max_depth=8, min_samples_leaf=10 ............. [GridSearchCV] .... max_features=sqrt, max_depth=8, min_samples_leaf=10 - 8.0s [GridSearchCV] max_features=sqrt, max_depth=8, min_samples_leaf=10 ............. [GridSearchCV] .... max_features=sqrt, max_depth=8, min_samples_leaf=10 - 7.2s [GridSearchCV] max_features=log2, max_depth=8, min_samples_leaf=2 .............. [GridSearchCV] ..... max_features=log2, max_depth=8, min_samples_leaf=2 - 8.4s [GridSearchCV] max_features=log2, max_depth=8, min_samples_leaf=2 .............. [GridSearchCV] ..... max_features=log2, max_depth=8, min_samples_leaf=2 - 9.7s [GridSearchCV] max_features=log2, max_depth=8, min_samples_leaf=2 .............. [GridSearchCV] ..... max_features=log2, max_depth=8, min_samples_leaf=2 - 8.6s [GridSearchCV] max_features=log2, max_depth=8, min_samples_leaf=5 .............. [GridSearchCV] ..... max_features=log2, max_depth=8, min_samples_leaf=5 - 9.1s [GridSearchCV] max_features=log2, max_depth=8, min_samples_leaf=5 .............. [GridSearchCV] ..... max_features=log2, max_depth=8, min_samples_leaf=5 - 9.6s
[Parallel(n_jobs=1)]: Done 1 jobs | elapsed: 5.0s [Parallel(n_jobs=1)]: Done 41 jobs | elapsed: 5.8min
[GridSearchCV] max_features=log2, max_depth=8, min_samples_leaf=5 .............. [GridSearchCV] ..... max_features=log2, max_depth=8, min_samples_leaf=5 - 9.3s [GridSearchCV] max_features=log2, max_depth=8, min_samples_leaf=10 ............. [GridSearchCV] .... max_features=log2, max_depth=8, min_samples_leaf=10 - 7.8s [GridSearchCV] max_features=log2, max_depth=8, min_samples_leaf=10 ............. [GridSearchCV] .... max_features=log2, max_depth=8, min_samples_leaf=10 - 7.3s [GridSearchCV] max_features=log2, max_depth=8, min_samples_leaf=10 ............. [GridSearchCV] .... max_features=log2, max_depth=8, min_samples_leaf=10 - 7.3s [GridSearchCV] max_features=auto, max_depth=8, min_samples_leaf=2 .............. [GridSearchCV] ..... max_features=auto, max_depth=8, min_samples_leaf=2 - 23.7s [GridSearchCV] max_features=auto, max_depth=8, min_samples_leaf=2 .............. [GridSearchCV] ..... max_features=auto, max_depth=8, min_samples_leaf=2 - 23.3s [GridSearchCV] max_features=auto, max_depth=8, min_samples_leaf=2 .............. [GridSearchCV] ..... max_features=auto, max_depth=8, min_samples_leaf=2 - 24.3s [GridSearchCV] max_features=auto, max_depth=8, min_samples_leaf=5 .............. [GridSearchCV] ..... max_features=auto, max_depth=8, min_samples_leaf=5 - 23.5s [GridSearchCV] max_features=auto, max_depth=8, min_samples_leaf=5 .............. [GridSearchCV] ..... max_features=auto, max_depth=8, min_samples_leaf=5 - 23.4s [GridSearchCV] max_features=auto, max_depth=8, min_samples_leaf=5 .............. [GridSearchCV] ..... max_features=auto, max_depth=8, min_samples_leaf=5 - 23.5s [GridSearchCV] max_features=auto, max_depth=8, min_samples_leaf=10 ............. [GridSearchCV] .... max_features=auto, max_depth=8, min_samples_leaf=10 - 22.9s [GridSearchCV] max_features=auto, max_depth=8, min_samples_leaf=10 ............. [GridSearchCV] .... max_features=auto, max_depth=8, min_samples_leaf=10 - 23.1s [GridSearchCV] max_features=auto, max_depth=8, min_samples_leaf=10 ............. [GridSearchCV] .... max_features=auto, max_depth=8, min_samples_leaf=10 - 23.9s [GridSearchCV] max_features=sqrt, max_depth=12, min_samples_leaf=2 ............. [GridSearchCV] .... max_features=sqrt, max_depth=12, min_samples_leaf=2 - 11.4s [GridSearchCV] max_features=sqrt, max_depth=12, min_samples_leaf=2 ............. [GridSearchCV] .... max_features=sqrt, max_depth=12, min_samples_leaf=2 - 11.2s [GridSearchCV] max_features=sqrt, max_depth=12, min_samples_leaf=2 ............. [GridSearchCV] .... max_features=sqrt, max_depth=12, min_samples_leaf=2 - 12.5s [GridSearchCV] max_features=sqrt, max_depth=12, min_samples_leaf=5 ............. [GridSearchCV] .... max_features=sqrt, max_depth=12, min_samples_leaf=5 - 11.5s [GridSearchCV] max_features=sqrt, max_depth=12, min_samples_leaf=5 ............. [GridSearchCV] .... max_features=sqrt, max_depth=12, min_samples_leaf=5 - 10.3s [GridSearchCV] max_features=sqrt, max_depth=12, min_samples_leaf=5 ............. [GridSearchCV] .... max_features=sqrt, max_depth=12, min_samples_leaf=5 - 10.5s [GridSearchCV] max_features=sqrt, max_depth=12, min_samples_leaf=10 ............ [GridSearchCV] ... max_features=sqrt, max_depth=12, min_samples_leaf=10 - 10.6s [GridSearchCV] max_features=sqrt, max_depth=12, min_samples_leaf=10 ............ [GridSearchCV] ... max_features=sqrt, max_depth=12, min_samples_leaf=10 - 9.4s [GridSearchCV] max_features=sqrt, max_depth=12, min_samples_leaf=10 ............ [GridSearchCV] ... max_features=sqrt, max_depth=12, min_samples_leaf=10 - 9.4s [GridSearchCV] max_features=log2, max_depth=12, min_samples_leaf=2 ............. [GridSearchCV] .... max_features=log2, max_depth=12, min_samples_leaf=2 - 11.8s [GridSearchCV] max_features=log2, max_depth=12, min_samples_leaf=2 ............. [GridSearchCV] .... max_features=log2, max_depth=12, min_samples_leaf=2 - 11.4s [GridSearchCV] max_features=log2, max_depth=12, min_samples_leaf=2 ............. [GridSearchCV] .... max_features=log2, max_depth=12, min_samples_leaf=2 - 11.5s [GridSearchCV] max_features=log2, max_depth=12, min_samples_leaf=5 ............. [GridSearchCV] .... max_features=log2, max_depth=12, min_samples_leaf=5 - 10.4s [GridSearchCV] max_features=log2, max_depth=12, min_samples_leaf=5 ............. [GridSearchCV] .... max_features=log2, max_depth=12, min_samples_leaf=5 - 10.2s [GridSearchCV] max_features=log2, max_depth=12, min_samples_leaf=5 ............. [GridSearchCV] .... max_features=log2, max_depth=12, min_samples_leaf=5 - 10.1s [GridSearchCV] max_features=log2, max_depth=12, min_samples_leaf=10 ............ [GridSearchCV] ... max_features=log2, max_depth=12, min_samples_leaf=10 - 9.3s [GridSearchCV] max_features=log2, max_depth=12, min_samples_leaf=10 ............ [GridSearchCV] ... max_features=log2, max_depth=12, min_samples_leaf=10 - 9.4s [GridSearchCV] max_features=log2, max_depth=12, min_samples_leaf=10 ............ [GridSearchCV] ... max_features=log2, max_depth=12, min_samples_leaf=10 - 9.1s [GridSearchCV] max_features=auto, max_depth=12, min_samples_leaf=2 ............. [GridSearchCV] .... max_features=auto, max_depth=12, min_samples_leaf=2 - 35.3s [GridSearchCV] max_features=auto, max_depth=12, min_samples_leaf=2 ............. [GridSearchCV] .... max_features=auto, max_depth=12, min_samples_leaf=2 - 35.2s [GridSearchCV] max_features=auto, max_depth=12, min_samples_leaf=2 ............. [GridSearchCV] .... max_features=auto, max_depth=12, min_samples_leaf=2 - 35.0s [GridSearchCV] max_features=auto, max_depth=12, min_samples_leaf=5 ............. [GridSearchCV] .... max_features=auto, max_depth=12, min_samples_leaf=5 - 30.6s [GridSearchCV] max_features=auto, max_depth=12, min_samples_leaf=5 ............. [GridSearchCV] .... max_features=auto, max_depth=12, min_samples_leaf=5 - 30.4s [GridSearchCV] max_features=auto, max_depth=12, min_samples_leaf=5 ............. [GridSearchCV] .... max_features=auto, max_depth=12, min_samples_leaf=5 - 30.4s [GridSearchCV] max_features=auto, max_depth=12, min_samples_leaf=10 ............ [GridSearchCV] ... max_features=auto, max_depth=12, min_samples_leaf=10 - 27.6s [GridSearchCV] max_features=auto, max_depth=12, min_samples_leaf=10 ............ [GridSearchCV] ... max_features=auto, max_depth=12, min_samples_leaf=10 - 27.6s [GridSearchCV] max_features=auto, max_depth=12, min_samples_leaf=10 ............ [GridSearchCV] ... max_features=auto, max_depth=12, min_samples_leaf=10 - 27.6s
[Parallel(n_jobs=1)]: Done 81 out of 81 | elapsed: 17.7min finished
print 'Best Parameters:'
print rf.best_estimator_
Best Parameters: RandomForestRegressor(bootstrap=True, compute_importances=None, criterion=mse, max_depth=12, max_features=auto, min_density=None, min_samples_leaf=2, min_samples_split=2, n_estimators=1000, n_jobs=1, oob_score=False, random_state=None, verbose=0)
# Model Running with optimum parameters on train, test split for model performance evaluation and plotting
# rf_op = RandomForestRegressor(n_jobs=1, n_estimators=1000, min_samples_leaf=2, min_samples_split=2, max_features = 'auto', oob_score=True)
# rf_op.fit(xtrain, ytrain)
# preds = rf_op.predict(xtest)
# Model Running with optimum parameters on entire training set
rf_op = RandomForestRegressor(n_jobs=1, n_estimators=1000, min_samples_leaf=2, min_samples_split=2, max_features = 'auto', oob_score=True)
rf_op.fit(x_data, y_data)
RandomForestRegressor(bootstrap=True, compute_importances=None, criterion='mse', max_depth=None, max_features='auto', min_density=None, min_samples_leaf=2, min_samples_split=2, n_estimators=1000, n_jobs=1, oob_score=True, random_state=None, verbose=0)
plt.scatter(preds, ytest)
plt.title("Random Forest Model Prediction")
plt.ylabel("Actual Rental Counts")
plt.xlabel("Predicted Rental Counts")
plt.xlim(0, 1000)
plt.ylim(0, 1000)
remove_border()
The above plot is significantly improved relative to the first models, especially in terms of the prediction at higher count values.
#Root Mean Squared Logarithmic Error (RMSLE)
print "Root Mean Squared Logarithmic Error Train: ", rmsele(rf_op.predict(xtrain), ytrain)
print "Root Mean Squared Logarithmic Error Test: ", rmsele(rf_op.predict(xtest), ytest)
print "Training accuracy: %0.2f%%" % (100*rf_op.score(xtrain, ytrain))
print "Test accuracy: %0.2f%%" % (100*rf_op.score(xtest, ytest)) + "\n"
Root Mean Squared Logarithmic Error Train: 0.235220513018 Root Mean Squared Logarithmic Error Test: 0.397522765562 Training accuracy: 96.46% Test accuracy: 86.76%
The rmsle errors above are significantly improved from the values of ~(0.6-0.7) seen in the first model.
numfeat = len(features)
indices = np.argsort(rf_op.feature_importances_)[::-1][:numfeat]
plt.bar(xrange(numfeat), rf_op.feature_importances_[indices], align='center', alpha=.5)
plt.xticks(xrange(numfeat), features[indices], rotation='vertical', fontsize=12)
plt.xlim([-1, numfeat])
plt.ylabel('Feature percentages', fontsize=12)
plt.title('Feature importances computed by Random Forest', fontsize=16)
remove_border()
print zip(features, rf_op.feature_importances_)
[('season', 0.013386597371169855), ('holiday', 0.00012960342326433908), ('workingday', 0.01149803620318693), ('weather', 0.0044052148778971267), ('temp', 0.21428771441255937), ('atemp', 0.054088139749190682), ('humidity', 0.0041826736926212716), ('windspeed', 0.00036199940908252724), ('hour', 0.66710431583338592), ('weekday', 0.0099917898628404597), ('month', 0.020563915164800488)]
Hour has the largest effect on the prediction, followed by temperature as might be expected. The effect of the other predictors appears to be low in comparison and may offer some room for feature selection / transformation for model improvement.
# submit = test.drop('datetime', 1)
# print submit.shape, test.shape, xtrain.shape, xtest.shape
# dfForest = pd.DataFrame(data=zip(test['datetime'], rf_op.predict(submit)), columns = ["datetime", "count"])
# dfForest[1:100]
# pd.DataFrame.to_csv(dfForest, 'submission_forest.csv', index = False)
Code below checks whether integer submission yields a better score as counts should in practise not be floats....it does not. (marginally different however)
ET_results = []
path = "submission_ET.csv"
with open(path, "r") as d:
ET_header = d.next().strip("\n").split(",")
for line in d:
segment = line.strip("\n").split(",")
segment[1] = str(round(float(segment[1])))
line = segment[0] + "," + segment[1]
ET_results.append(line.strip("\n").split(","))
ET_ints = pd.DataFrame(data=np.asarray(ET_results), columns=ET_header)
pd.DataFrame.to_csv(ET_ints, 'submission_ET_round.csv', index = False)
The extra tree regressor in scikit learn implements more randomised trees than the random forest, some research suggest that this may slightly improve predictions. We implement this quickly before considering feature optimisation.
from sklearn.ensemble import ExtraTreesRegressor
ET = ExtraTreesRegressor(n_estimators=1000, min_samples_split=2, min_samples_leaf=2, max_features='auto', n_jobs=1)
ET.fit(x_data, y_data)
ExtraTreesRegressor(bootstrap=False, compute_importances=None, criterion='mse', max_depth=None, max_features='auto', min_density=None, min_samples_leaf=2, min_samples_split=2, n_estimators=1000, n_jobs=1, oob_score=False, random_state=None, verbose=0)
#Root Mean Squared Logarithmic Error (RMSLE)
print "Root Mean Squared Logarithmic Error Train: ", rmsele(ET.predict(x_data), y_data)
print "Training accuracy: %0.2f%%" % (100*ET.score(x_data, y_data))
Root Mean Squared Logarithmic Error Train: 0.170882517127 Training accuracy: 98.27%
The above errors are very slightly improved from the random forest and this was also seen when making predictions on Kaggle's test dataset.
# submit = test.drop('datetime', 1)
# print submit.shape, test.shape, xtrain.shape, xtest.shape
# dfET = pd.DataFrame(data=zip(test['datetime'], ET.predict(submit)), columns = ["datetime", "count"])
# dfET[1:100]
pd.DataFrame.to_csv(dfET, 'submission_weird.csv', index = False)
from numpy import corrcoef, sum, log, arange
from pylab import pcolor, show, colorbar, xticks, yticks
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
R = corrcoef(x_data.T)
pcolor(R)
colorbar()
ax.set_xticks(np.arange(x_data.shape[1]) + 0.5, minor=False) #code to align the labels with the correct column of data on plot
ax.set_yticks(np.arange(x_data.shape[1]) + 0.5, minor=False)#code to align the labels with the correct row of data on plot
ax.set_xticklabels(x_data.columns, rotation=90)
ax.set_yticklabels(x_data.columns)
ax.set_title("Correlation Matrix of Features")
<matplotlib.text.Text at 0x13b600d50>
We can see a high level of positive correlation between 'month' and 'season', as would be expected. Also there is a large correlation between 'temp' and 'atemp' which is again something to be expected.
We can also see a large negative correlation between 'workingday' and 'weekday' as well as a relatively high negative correlation between 'humidity' and 'windspeed'
There also appears to be a mild correlation between a number of other variables.
For the above reason, due to the high level of correlations, we implement PCA on the features prior to training Extra Tree.
from sklearn.decomposition import PCA
pca = PCA(n_components=None)
pca_X = pca.fit_transform(x_data)
label = [("pca_" + str(e)) for e in range(pca_X.shape[1])]
# print pca_X.shape
fig, ax = plt.subplots()
R = corrcoef(pca_X.T)
pcolor(R)
colorbar()
ax.set_xticks(np.arange(pca_X.shape[1]) + 0.5, minor=False) #code to align the labels with the correct column of data on plot
ax.set_yticks(np.arange(pca_X.shape[1]) + 0.5, minor=False)#code to align the labels with the correct row of data on plot
ax.set_xticklabels(label, rotation=90)
ax.set_yticklabels(label)
ax.set_title("Correlation Matrix of PCA Components")
<matplotlib.text.Text at 0x10b4ccb90>
PCA has removed the correlation that existed between the variables and generated another 11 features based on our initial features, that are uncorrelated. The reason 11 PCA components have been generated is because we specified 'n_components'=None, which sets to keep all components.
# Split the data into training and cross validation test set to evaluate the performance of PCA
x_ETtrain, x_ETtest, y_ETtrain, y_ETtest = train_test_split(pca_X, y_data)
#Re-run the extra tree model using the PCA components as the new features
ET.fit(x_ETtrain, y_ETtrain)
ExtraTreesRegressor(bootstrap=False, compute_importances=None, criterion='mse', max_depth=None, max_features='auto', min_density=None, min_samples_leaf=2, min_samples_split=2, n_estimators=1000, n_jobs=1, oob_score=False, random_state=None, verbose=0)
preds = ET.predict(x_ETtest)
plt.scatter(preds, y_ETtest)
plt.title("Extra Tree Model Prediction")
plt.ylabel("Actual Rental Counts")
plt.xlabel("Predicted Rental Counts")
plt.xlim(0, 1000)
plt.ylim(0, 1000)
remove_border()
#Root Mean Squared Logarithmic Error (RMSLE)
print "Root Mean Squared Logarithmic Error Train: ", rmsele(ET.predict(x_ETtrain), y_ETtrain)
print "Root Mean Squared Logarithmic Error Test: ", rmsele(ET.predict(x_ETtest), y_ETtest)
print "Training accuracy: %0.2f%%" % (100*ET.score(x_ETtrain, y_ETtrain))
print "Test accuracy: %0.2f%%" % (100*ET.score(x_ETtest, y_ETtest)) + "\n"
Root Mean Squared Logarithmic Error Train: 0.316552068911 Root Mean Squared Logarithmic Error Test: 0.731589334789 Training accuracy: 97.28% Test accuracy: 73.98%
numfeat = len(features)
indices = np.argsort(ET.feature_importances_)[::-1][:numfeat]
plt.bar(xrange(numfeat), ET.feature_importances_[indices], align='center', alpha=.5)
plt.xticks(xrange(numfeat), label, rotation='vertical', fontsize=12)
plt.xlim([-1, numfeat])
plt.ylabel('Feature percentages', fontsize=12)
plt.title('Feature importances computed by Extra Tree', fontsize=16)
remove_border()
PCA actually worsened the performance of the algorithm, as can be seen both from the RMSLE and the plots. It appears perhaps that a better way to improve features, might be to generate some additional ones.
We retrain using the best model that we have so far, namely the Extra Tree Regressor model, using all our features, not using PCA; and we plot the residuals against different features to identify which ones are performing particularly badly.
x_ETtrain, x_ETtest, y_ETtrain, y_ETtest = train_test_split(x_data, y_data)
ET.fit(x_ETtrain, y_ETtrain)
ExtraTreesRegressor(bootstrap=False, compute_importances=None, criterion='mse', max_depth=None, max_features='auto', min_density=None, min_samples_leaf=2, min_samples_split=2, n_estimators=1000, n_jobs=1, oob_score=False, random_state=None, verbose=0)
#Root Mean Squared Logarithmic Error (RMSLE)
print "Root Mean Squared Logarithmic Error Train: ", rmsele(ET.predict(x_ETtrain), y_ETtrain)
print "Root Mean Squared Logarithmic Error Test: ", rmsele(ET.predict(x_ETtest), y_ETtest)
print "Training accuracy: %0.2f%%" % (100*ET.score(x_ETtrain, y_ETtrain))
print "Test accuracy: %0.2f%%" % (100*ET.score(x_ETtest, y_ETtest)) + "\n"
Root Mean Squared Logarithmic Error Train: 0.173575313669 Root Mean Squared Logarithmic Error Test: 0.414387820597 Training accuracy: 98.20% Test accuracy: 87.17%
fig, axes=plt.subplots(figsize=(35, 30), nrows=4, ncols=3)
fig.subplots_adjust(hspace = 0.5)
residual = list(ET.predict(x_ETtest).astype(float)-y_ETtest.astype(float))
for i in range(4):
for j in range(3):
if (i*3 + j) < 11:
axes[i][j].scatter(list(x_ETtest[:, i*3 + j].astype(float)), residual, c='g', s=50, alpha = 0.75)
axes[i][j].set_title('Residuals vs ' + features[i*3 + j])
axes[i][j].set_xlabel(features[i*3 + j])
axes[i][j].set_ylabel("Residuals")
remove_border(axes[i][j])
else:
fig.delaxes(axes[i][j])
From the above plots, we can see that in general the residuals are fairly well behaved, averaged around zero and showing not particular trend with the feature variable. This is however not the case for the 'hour' variable and we come across the two peaks that we observed before, suggesting that it would be a good idea to split the model into the 'registered' and 'casual' users.
We now try to apply our best model so far, ie. the Extra Tree Regressor, to predict 'registered' and 'casual' user counts separately to see if we can get a better performance in that way.
finalDF_mod = data
finalDF_mod = data.drop('datetime', 1)
finalDF_mod = finalDF_mod.drop('count', 1)
finalDF_mod = finalDF_mod.drop('registered', 1)
finalDF_mod = finalDF_mod.drop('casual', 1)
y_data_reg = data['registered']
y_data_cas = data['casual']
x_data = finalDF_mod
#REGISTERED MODEL
ETreg = ExtraTreesRegressor(n_estimators=1000, min_samples_split=2, min_samples_leaf=2, max_features='auto', n_jobs=1)
xreg_train, xreg_test, yreg_train, yreg_test = train_test_split(x_data, y_data_reg)
ETreg.fit(xreg_train, yreg_train)
#CASUAL MODEL
ETcas = ExtraTreesRegressor(n_estimators=1000, min_samples_split=2, min_samples_leaf=2, max_features='auto', n_jobs=1)
xcas_train, xcas_test, ycas_train, ycas_test = train_test_split(x_data, y_data_cas)
ETcas.fit(xcas_train, ycas_train)
ExtraTreesRegressor(bootstrap=False, compute_importances=None, criterion='mse', max_depth=None, max_features='auto', min_density=None, min_samples_leaf=2, min_samples_split=2, n_estimators=1000, n_jobs=1, oob_score=False, random_state=None, verbose=0)
#Root Mean Squared Logarithmic Error (RMSLE)
print "CASUAL USERS MODEL"
print "Root Mean Squared Logarithmic Error Train: ", rmsele(ETcas.predict(xcas_train), ycas_train)
print "Root Mean Squared Logarithmic Error Test: ", rmsele(ETcas.predict(xcas_test), ycas_test)
print "Training accuracy: %0.2f%%" % (100*ETcas.score(xcas_train, ycas_train))
print "Test accuracy: %0.2f%%" % (100*ETcas.score(xcas_test, ycas_test)) + "\n"
print "REGISTERED USERS MODEL"
print "Root Mean Squared Logarithmic Error Train: ", rmsele(ETreg.predict(xreg_train), yreg_train)
print "Root Mean Squared Logarithmic Error Test: ", rmsele(ETreg.predict(xreg_test), yreg_test)
print "Training accuracy: %0.2f%%" % (100*ETreg.score(xreg_train, yreg_train))
print "Test accuracy: %0.2f%%" % (100*ETreg.score(xreg_test, yreg_test)) + "\n"
print "COMBINED MODEL"
print "Root Mean Squared Logarithmic Error Train: ", rmsele((ETreg.predict(xreg_train)+ETcas.predict(xcas_train)), yreg_train+ycas_train)
print "Root Mean Squared Logarithmic Error Test: ", rmsele((ETreg.predict(xreg_test)+ETcas.predict(xcas_test)), yreg_test+ycas_test)
CASUAL USERS MODEL Root Mean Squared Logarithmic Error Train: 0.242743805711 Root Mean Squared Logarithmic Error Test: 0.537978189659 Training accuracy: 98.48% Test accuracy: 89.09% REGISTERED USERS MODEL Root Mean Squared Logarithmic Error Train: 0.175938462826 Root Mean Squared Logarithmic Error Test: 0.408400060532 Training accuracy: 97.99% Test accuracy: 85.62% COMBINED MODEL Root Mean Squared Logarithmic Error Train: 0.13874436266 Root Mean Squared Logarithmic Error Test: 0.331791151225
The above model looks very promising by combining the two models. However on uploading this solution to Kaggle we discovered that the model had a higher rmsle than
# submit = test.drop('datetime', 1)
# print submit.shape, test.shape, xtrain.shape, xtest.shape
# dfcombo = pd.DataFrame(data=zip(test['datetime'], (ETreg.predict(submit)+ETcas.predict(submit))), columns = ["datetime", "count"])
# dfcombo[1:100]
pd.DataFrame.to_csv(dfcombo, 'submission_comboET.csv', index = False)
Count data is usually first modelled using a poisson distribution, as it is discrete and only defined on the positive line. In this section we consider MCMC parametric models to create the dataset.
import pymc as mc #Monte Carlo simulation libraray for Bayesian MCMC
import scipy.stats as stats #Import library which contains statistical functions for our analysis
def poisson(x, beta, alpha=0):
return np.exp(np.dot(beta, x) + alpha)
To first experiment with the model, we work with a small sample of the data (only including 3000 rows) and we only use the hour as a predictor variable. This would be a reasonable assumption due to its importance as a feature in our previous random forest model. We also limit ourselves to the 'registered' users count data to experiment modelling a bimodal distribution.
We also look at higher order terms for the hour to experiment with whether we can get a better fit to the data. The first model below considered below uses a third order polynomial model.
However at this stage we note there is a violation of the assumption of mean = variance within our dataset, which is required for a Poisson distribution. A Negative Binomial is normally a good alternate choice for modelling count data when the data is over-dispersed (high variance).
Regardless, as a starting exercise, we present our analysis using a Poisson distribuion:
# We normalise the hours so that no single term dominates the higher order modelling
def normalize(x):
return (x - x.mean()) / x.std()
hour = data['hour'][0:3000]
hour_norm = normalize(hour)
hour_2 = hour**2
hour_2_norm = normalize(hour_2)
hour_3 = hour**3
hour_3_norm = normalize(hour_3)
# Y count data
y = data['registered'][0:3000]
# We assume Normal priors for alpha, beta, gamma, delta which are the contributors to the lambda parameter of our eventual Poisson distribution
# We choose the Normal distribution as its random variables can take both positive and negative numbers
beta = mc.Normal("beta", 3, 0.5)
alpha = mc.Normal("alpha", 4, 0.5)
gamma = mc.Normal("gamma", -2.5, 0.5)
delta = mc.Normal("delta", 0, 1)
# Our deterministic functions combines our priors to calculate the lambda rate parameter for our Poisson
@mc.deterministic
def rate_param(x=hour_norm, x2=hour_2_norm, x3=hour_3_norm, beta=beta, alpha=alpha, gamma = gamma, delta = delta):
return np.exp(np.dot(beta, x) + np.dot(gamma, x2) + np.dot(delta, x3) + alpha)
# The observed data is modelled as a Poisson with the observed=True argument to show that this should not be modified
# but tied to our dataset
observed = mc.Poisson("poisson_count", mu=rate_param, value=y, observed=True)
# We pass all our parameters to the model so that during the simulation they can be updated simultaneously
model = mc.Model([observed, beta, alpha, gamma, delta])
We now run the simulation after defining our model.
# MCMC Simulation code
map_ = mc.MAP(model)
map_.fit()
mcmc = mc.MCMC(model)
mcmc.sample(120000, 50000, 2)
[-----------------100%-----------------] 120000 of 120000 complete in 155.3 sec
The plots below define the posterior probabilities of the hyper-parameters used in the estimation of the Poisson lambda parameter.
alpha_samples = mcmc.trace('alpha')[:, None] # best to make them 1d
beta_samples = mcmc.trace('beta')[:, None]
gamma_samples = mcmc.trace('gamma')[:, None]
delta_samples = mcmc.trace('delta')[:, None]
# histograms of the samples:
plt.subplot(411)
plt.title(r"Posterior distributions of the variables $\alpha, \beta$")
plt.ylabel("Occurrences")
plt.hist(alpha_samples, histtype='stepfilled', bins=35, alpha=0.85,
label=r"posterior of $\alpha$", color="#A60628", normed=True)
plt.legend()
remove_border()
plt.subplot(412)
plt.ylabel("Occurrences")
plt.hist(beta_samples, histtype='stepfilled', bins=35, alpha=0.85,
label=r"posterior of $\beta$", color="#7A68A6", normed=True)
plt.legend()
remove_border()
plt.subplot(413)
plt.ylabel("Occurrences")
plt.hist(gamma_samples, histtype='stepfilled', bins=35, alpha=0.85,
label=r"posterior of $\gamma$", color="#A60628", normed=True)
plt.legend()
remove_border()
plt.subplot(414)
plt.ylabel("Occurrences")
plt.hist(delta_samples, histtype='stepfilled', bins=35, alpha=0.85,
label=r"posterior of $\delta$", color="red", normed=True)
plt.legend()
remove_border()
As can be seen from these plots the distribution has selected a narrow range within which the true value likely exists.
Below we estimate the average value of our Poisson lambda parameter using the 35000 traces from the MCMC simulation.
Note that this is not a prediction of our bike rental count, only a prediction of the rate parameter that can be used to generate counts from a Poisson distribution.
t = np.linspace(hour_norm.min(), hour_norm.max(), 50)[:, None]
t2 = np.linspace(hour_2_norm.min(), hour_2_norm.max(), 50)[:, None]
t3 = np.linspace(hour_3_norm.min(), hour_3_norm.max(), 50)[:, None]
lambda_ = np.exp(np.dot(beta_samples, t.T) + np.dot(gamma_samples, t2.T) + np.dot(delta_samples, t3.T) + alpha_samples)
mean_count_t_ = lambda_.mean(axis=0)
# plt.plot(t, mean_count_t_)
We now simulate drawing count data from a Poisson distribution using the posterior distribution of our lambda parameter, sampling from the posterior distributions of the hyper parameters estimated above.
We then plot the data and can compare with our original data.
#Simulate data points
simulated = mc.Poisson("poisson_sim", rate_param)
N = 10000
mcmc = mc.MCMC([simulated, alpha, beta, gamma, delta, observed])
mcmc.sample(N)
[-----------------100%-----------------] 10000 of 10000 complete in 28.4 sec
Below is the plot of our simulated data; we plot 50 of the 10000 simulations, superimposed on one another.
simulations = mcmc.trace("poisson_sim")[:]
for i in range(50):
plt.scatter(hour, simulations[i, :], color="k", s=2, alpha=0.1)
remove_border()
plt.title("Simulated draws from Poisson distribution using lambda posterior")
plt.ylabel("Rental bikes count")
plt.xlabel("Hour")
<matplotlib.text.Text at 0x110740dd0>
The plot above shows a peak at around 4pm; the time at which this peak occurs is earlier than observed in our actual dataset. Also our actual dataset shows a peak earlier in the day which is not represented here at all. The earlier peak may be a reflection of the bimodal behaviour somehow getting forced into one average peak.
Below is the actual data for comparison.
df = pd.DataFrame(data=hour)
df['bikes'] = y
mean_data_hour = df.groupby('hour').bikes.mean()
plt.scatter(hour, y, color="k", s=50, alpha=0.2)
plt.plot(mean_data_hour.index, mean_data_hour.values, color='r', label = "average")
plt.title("Posterior expected value of counts of bike rentals; plus realizations")
plt.legend(loc="upper left")
plt.xlim(0, 24)
plt.ylabel("Rental bikes count")
plt.xlabel("Hours")
remove_border()
plt.legend()
<matplotlib.legend.Legend at 0x107b314d0>
From the analysis above, it appears that our single model cannot be optimised to generate a bimodal distribution. We therefore consider a mixture model.
The code below is work in progress...
from sklearn.cluster import KMeans
hour = np.asarray(hour)
n = 2 #Number of components of the mixture
kme = KMeans(n) # This is not needed but it is to help convergence
# kme.fit(data[:, np.newaxis])
kme.fit(hour[:, np.newaxis])
# ndata = len(data)
ndata = len(hour)
dd = mc.Dirichlet('dd', theta=(1,)*n)
category = mc.Categorical('category', p=dd, size=ndata, value=kme.labels_)
# alphas = mc.Uniform("alphas", 0, 1, size=n)
alphas = mc.Gamma('alphas', alpha=1, beta=.0001 ,size=n)
means = mc.TruncatedNormal('means', kme.cluster_centers_[:,0],0.1,a=0.0, b=100000, size=n)
@mc.deterministic
def mean(category=category, means=means):
return means[category]
@mc.deterministic
def alpha(category=category, alphas=alphas):
return alphas[category]
obs = mc.NegativeBinomial('obs', mean, alpha, value=y, observed = True)
predictive = mc.NegativeBinomial('predictive', mean, alpha)
model = mc.Model({'dd': dd,
'category': category,
'alphas': alphas,
'means': means,
'predictive':predictive,
'obs': obs})
mcmc = mc.MCMC(model)
mcmc.sample(10000, 7000)
[-----------------100%-----------------] 10000 of 10000 complete in 49.9 sec
G = [hour[np.nonzero(np.round(mcmc.trace("category")[:].mean(axis=0)) == i)] for i in range(0, 2) ]
plt.hist(G, bins=30, stacked = True)
remove_border()
plt.title("Assignment of category across hours")
plt.xlabel("Hour")
<matplotlib.text.Text at 0x1e31fea10>
simulations = mcmc.trace("predictive")[:]
print simulations.shape
print simulations[0:100]
for i in range(100):
plt.scatter(hour, simulations[i, :], color="k", s=2, alpha=0.1)
remove_border()
plt.title("Simulated draws from Negative Binomial distribution")
plt.ylabel("Rental bikes count")
plt.xlabel("Hours")
(10000, 3000) [[ 60 73 102 ..., 103 70 12] [ 61 203 13 ..., 45 83 1] [ 73 38 6 ..., 85 29 72] ..., [ 60 110 23 ..., 116 75 7] [ 68 82 4 ..., 74 3 97] [ 57 71 11 ..., 201 118 43]]
<matplotlib.text.Text at 0x1e6e5c050>