In this notebook, we'll train a Random Forest Regression model for predicting building energy consumption based on historical energy data and several weather variables. We'll use daily energy data and weather data to predict energy consumption.
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# The pandas option `display.mpl_style` was deprecated and later removed, so
# setting it fails on current pandas. matplotlib's own style sheets are the
# supported replacement; 'ggplot' is the closest match to the old pandas look.
plt.style.use('ggplot')
import seaborn as sns
import scipy as sp
import sklearn
import sklearn.cross_validation
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import cross_val_score
In this notebook, we'll train a Random Forest Regression model for predicting building energy consumption based on historical energy data and several weather variables. We'll use daily energy data and weather data to predict energy consumption.
# Read in original data: one daily table per utility.
# For each table the 'startDay'/'endDay' columns are dropped, and every row
# that still contains a missing value is discarded.
# `axis` is passed by keyword because pandas 2.0 no longer accepts it as a
# positional argument of DataFrame.drop.
electricity = (
    pd.read_excel('Data/dailyElectricityWithFeatures.xlsx')
    .drop(['startDay', 'endDay'], axis=1)
    .dropna()
)
chilledWater = (
    pd.read_excel('Data/dailyChilledWaterWithFeatures.xlsx')
    .drop(['startDay', 'endDay'], axis=1)
    .dropna()
)
steam = (
    pd.read_excel('Data/dailySteamWithFeatures.xlsx')
    .drop(['startDay', 'endDay'], axis=1)
    .dropna()
)
# Normalize data: subtract each column's own mean from that column, so every
# column (features and target alike) is centered on zero. No rescaling is done.
normalized_electricity = electricity.sub(electricity.mean(), axis='columns')
normalized_chilledWater = chilledWater.sub(chilledWater.mean(), axis='columns')
normalized_steam = steam.sub(steam.mean(), axis='columns')
We add a new column, `day_type`, that marks each day as either a working day or a weekend/holiday: working days are set to 0, and weekends and holidays are set to 1. US public holidays are listed here: http://www.officeholidays.com/countries/usa/. We could also flag vacation periods when school is not in session, but we won't, since we don't have this information.
# Initialize every day to 0 (working day) in all three tables; the column is
# float-valued because it is filled from np.zeros.
for frame in (normalized_electricity, normalized_chilledWater, normalized_steam):
    frame['day_type'] = np.zeros(len(frame))
# Set weekends to 1.
# `dayofweek` counts Monday as 0, so values 5 and 6 are Saturday and Sunday.
# The write goes through .loc so it lands in the frame itself: the chained
# form df['day_type'][mask] = 1 can assign to a temporary copy, and does
# nothing at all when pandas copy-on-write is enabled.
for frame in (normalized_electricity, normalized_chilledWater, normalized_steam):
    is_weekend = frame.index.dayofweek >= 5
    frame.loc[is_weekend, 'day_type'] = 1
# Set holidays to 1.
# US public holidays for 2012-2014, as ISO date strings.
# Thanksgiving 2013 fell on the fourth Thursday of November, i.e. 2013-11-28
# (it was previously listed as 2013-11-27).
holidays = ['2014-01-01','2014-01-20','2014-05-26','2014-07-04','2014-09-01','2014-11-11','2014-11-27','2014-12-25','2013-01-01',
            '2013-01-21','2013-05-27','2013-07-04','2013-09-02','2013-11-11','2013-11-28','2013-12-25','2012-01-01','2012-01-16',
            '2012-05-28','2012-07-04','2012-09-03','2012-11-12','2012-11-22','2012-12-25']

# Compare timestamps with timestamps: matching an object array of
# datetime.date against np.datetime64 behaves differently across numpy
# versions, whereas normalize() + isin() is well defined. normalize() sets the
# time of day to midnight so only the calendar date is compared.
holiday_dates = pd.to_datetime(holidays)
for frame in (normalized_electricity, normalized_chilledWater, normalized_steam):
    is_holiday = frame.index.normalize().isin(holiday_dates)
    # .loc writes into the frame itself; chained df['day_type'][mask] = 1 may
    # only modify a temporary copy.
    frame.loc[is_holiday, 'day_type'] = 1
Analysis of electricity data.
# Split train and test data:
#   train = 2012-01-01 .. 2013-12-31, test = 2014-01-01 .. 2014-10-31.
# Reindexing onto the full daily calendar inserts all-NaN rows for days that
# are absent from the table; dropna() then removes those rows again, leaving
# only the days that are actually present.
train_days = pd.date_range('2012-01-01', '2013-12-31', freq='D')
test_days = pd.date_range('2014-01-01', '2014-10-31', freq='D')
elect_train = normalized_electricity.reindex(train_days).dropna()
elect_test = normalized_electricity.reindex(test_days).dropna()

# Features: every column except the target, with a plain 0..n-1 row index.
# reset_index(drop=True) discards the date index directly instead of relying
# on it being materialized as a column named 'index'.
XX_elect_train = elect_train.drop('electricity-kWh', axis=1).reset_index(drop=True)
XX_elect_test = elect_test.drop('electricity-kWh', axis=1).reset_index(drop=True)

# Targets keep their date index.
YY_elect_train = elect_train['electricity-kWh']
YY_elect_test = elect_test['electricity-kWh']

# Formatted so the output is the same on Python 2 and Python 3.
print('{0} {1}'.format(XX_elect_train.shape, XX_elect_test.shape))
(634, 13) (294, 13)
# Preview the first rows of the electricity training features.
XX_elect_train.head()
RH-% | T-C | Tdew-C | pressure-mbar | solarRadiation-W/m2 | windDirection | windSpeed-m/s | humidityRatio-kg/kg | coolingDegrees | heatingDegrees | dehumidification | occupancy | day_type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 8.212490 | -4.533170 | -2.381563 | -6.369746 | -67.924759 | 28.249822 | 0.562182 | -0.001941 | -3.841443 | 2.072677 | -0.000885 | -0.671879 | 1 |
1 | -12.481350 | -5.873750 | -8.392976 | -16.701268 | -75.852295 | 45.912866 | 2.358178 | -0.003322 | -3.841443 | 3.413256 | -0.000885 | -0.371879 | 0 |
2 | -25.939684 | -14.915417 | -18.430476 | -9.201268 | -67.477295 | 95.079532 | 2.693826 | -0.005410 | -3.841443 | 12.454923 | -0.000885 | -0.371879 | 0 |
3 | -26.898017 | -18.790417 | -22.413809 | -3.076268 | -64.435629 | 78.829532 | 1.571140 | -0.005847 | -3.841443 | 16.329923 | -0.000885 | -0.371879 | 0 |
4 | -21.523017 | -12.290417 | -15.322143 | -9.284601 | -72.435629 | 50.496199 | 1.605862 | -0.004991 | -3.841443 | 9.829923 | -0.000885 | -0.371879 | 0 |
# Find the optimal number of trees.
# For each forest size from 1 to 40, run 10-fold cross-validation on the
# electricity training set and store the ten fold scores as column `n`.
scores = pd.DataFrame()
for n in range(1, 41):
    # min_samples_split=2 is the smallest legal value (a node needs at least
    # two samples to be split); scikit-learn >= 0.18 raises on the value 1.
    RF = RandomForestRegressor(n_estimators=n, max_depth=None, min_samples_split=2, random_state=0)
    score = cross_val_score(RF, XX_elect_train, YY_elect_train, cv=10)
    scores[n] = score

# One box per forest size, showing the spread of the fold scores.
sns.set_context("talk")
sns.set_style("white")
sns.boxplot(np.matrix(scores))
plt.xlabel("Number of trees")
plt.ylabel("Scores")
plt.title("The scores of the Random Forests for different number of trees.")
plt.xlim(0, 41)
plt.show()
Choose number of trees as 20.
# Use the optimal number of trees for prediction:
# min_samples_split=2 is the smallest legal value; scikit-learn >= 0.18
# raises on the value 1.
RF_e = RandomForestRegressor(n_estimators=20, max_depth=None, min_samples_split=2, random_state=0)
RF_e.fit(XX_elect_train, YY_elect_train)
YY_elect_pred = RF_e.predict(XX_elect_test)

# Actual vs. predicted series over the test rows (x axis is the row position
# of the test set), drawn on the axes created here.
fig, ax = plt.subplots(1, 1, figsize=(20, 10))
line1, = ax.plot(XX_elect_test.index, YY_elect_test, label='Actual consumption', color='k')
line2, = ax.plot(XX_elect_test.index, YY_elect_pred, label='RF Regression Prediction', color='r')
ax.set_xlabel('Feature index', fontsize=18)
ax.set_ylabel('Normalized electricity usage (kWh)', fontsize=18)
ax.set_title('Actual and RF predicted electricity usage', fontsize=20)
ax.legend([line1, line2], ['Actual consumption', 'RF Regression Prediction'], fontsize=18)
plt.show()

# Score of the fitted model on the held-out test set; the call form prints
# the same text on Python 2 and Python 3.
print(RF_e.score(XX_elect_test, YY_elect_test))
0.679872383518
# Plot actual vs. predicted usage.
fig = plt.figure(figsize=(8, 8))
# Black points plot the actual values against themselves, i.e. the y = x
# reference on which a perfect prediction would lie; red points are the model.
plt.scatter(YY_elect_test, YY_elect_test, c='k')
plt.scatter(YY_elect_test, YY_elect_pred, c='r')
plt.xlabel('Actual Elec. Usage (kWh): $Y_i$', fontsize=18)
# Raw strings: "\h" is not a valid escape sequence in a normal string literal.
plt.ylabel(r"Predicted Elec. Usage (kWh): $\hat{Y}_i$", fontsize=18)
plt.title(r"Energy vs Predicted Energy: $Y_i$ vs $\hat{Y}_i$", fontsize=20)
plt.show()
Analysis of chilled water data.
# Split chilled-water data into train and test sets:
#   train = 2012-01-01 .. 2013-12-31, test = 2014-01-01 .. 2014-10-31.
# Reindexing onto the full daily calendar inserts all-NaN rows for days that
# are absent from the table; dropna() then removes those rows again.
train_days = pd.date_range('2012-01-01', '2013-12-31', freq='D')
test_days = pd.date_range('2014-01-01', '2014-10-31', freq='D')
chilledw_train = normalized_chilledWater.reindex(train_days).dropna()
chilledw_test = normalized_chilledWater.reindex(test_days).dropna()

# Features: every column except the target, with a plain 0..n-1 row index.
# reset_index(drop=True) discards the date index directly instead of relying
# on it being materialized as a column named 'index'.
XX_chilledw_train = chilledw_train.drop('chilledWater-TonDays', axis=1).reset_index(drop=True)
XX_chilledw_test = chilledw_test.drop('chilledWater-TonDays', axis=1).reset_index(drop=True)

# Targets keep their date index.
YY_chilledw_train = chilledw_train['chilledWater-TonDays']
YY_chilledw_test = chilledw_test['chilledWater-TonDays']

# Formatted so the output is the same on Python 2 and Python 3.
print('{0} {1}'.format(XX_chilledw_train.shape, XX_chilledw_test.shape))
(705, 13) (294, 13)
# Find the optimal number of trees.
# For each forest size from 1 to 40, run 10-fold cross-validation on the
# chilled-water training set and store the ten fold scores as column `n`.
scores = pd.DataFrame()
for n in range(1, 41):
    # min_samples_split=2 is the smallest legal value (a node needs at least
    # two samples to be split); scikit-learn >= 0.18 raises on the value 1.
    rf = RandomForestRegressor(n_estimators=n, max_depth=None, min_samples_split=2, random_state=0)
    score = cross_val_score(rf, XX_chilledw_train, YY_chilledw_train, cv=10)
    scores[n] = score

# One box per forest size, showing the spread of the fold scores.
sns.set_context("talk")
sns.set_style("white")
sns.boxplot(np.matrix(scores))
plt.xlabel("Number of trees")
plt.ylabel("Scores")
plt.title("The scores of the Random Forests for different number of trees.")
plt.xlim(0, 41)
# Clip the y axis to [-1, 1]; scores outside that range are not shown.
plt.ylim(-1, 1)
plt.show()
Choose number of trees as 20.
# Use the optimal number of trees for prediction:
# min_samples_split=2 is the smallest legal value; scikit-learn >= 0.18
# raises on the value 1.
RF_w = RandomForestRegressor(n_estimators=20, max_depth=None, min_samples_split=2, random_state=0)
RF_w.fit(XX_chilledw_train, YY_chilledw_train)
YY_chilledw_pred = RF_w.predict(XX_chilledw_test)

# Actual vs. predicted series over the test rows (x axis is the row position
# of the test set), drawn on the axes created here.
fig, ax = plt.subplots(1, 1, figsize=(20, 10))
line1, = ax.plot(XX_chilledw_test.index, YY_chilledw_test, label='Actual consumption', color='k')
line2, = ax.plot(XX_chilledw_test.index, YY_chilledw_pred, label='RF Regression Prediction', color='r')
ax.set_xlabel('Feature index', fontsize=18)
ax.set_ylabel('Normalized chilled water usage (t)', fontsize=18)
ax.set_title('Actual and RF predicted chilled water usage', fontsize=20)
ax.legend([line1, line2], ['Actual consumption', 'RF Regression Prediction'], fontsize=18)
plt.show()

# Score of the fitted model on the held-out test set; the call form prints
# the same text on Python 2 and Python 3.
print(RF_w.score(XX_chilledw_test, YY_chilledw_test))
0.883316960112
# Plot actual vs. predicted usage.
fig = plt.figure(figsize=(8, 8))
# Black points plot the actual values against themselves, i.e. the y = x
# reference on which a perfect prediction would lie; red points are the model.
plt.scatter(YY_chilledw_test, YY_chilledw_test, c='k')
plt.scatter(YY_chilledw_test, YY_chilledw_pred, c='r')
plt.xlabel('Actual Water Usage (Ton): $Y_i$', fontsize=18)
# Raw strings: "\h" is not a valid escape sequence in a normal string literal.
plt.ylabel(r"Predicted Water Usage (Ton): $\hat{Y}_i$", fontsize=18)
plt.title(r"Water vs Predicted Water: $Y_i$ vs $\hat{Y}_i$", fontsize=18)
plt.show()
Analysis of steam data.
# Split steam data into train and test sets:
#   train = 2012-01-01 .. 2013-12-31, test = 2014-01-01 .. 2014-10-31.
# Reindexing onto the full daily calendar inserts all-NaN rows for days that
# are absent from the table; dropna() then removes those rows again.
train_days = pd.date_range('2012-01-01', '2013-12-31', freq='D')
test_days = pd.date_range('2014-01-01', '2014-10-31', freq='D')
steam_train = normalized_steam.reindex(train_days).dropna()
steam_test = normalized_steam.reindex(test_days).dropna()

# Features: every column except the target, with a plain 0..n-1 row index.
# reset_index(drop=True) discards the date index directly instead of relying
# on it being materialized as a column named 'index'.
XX_steam_train = steam_train.drop('steam-LBS', axis=1).reset_index(drop=True)
XX_steam_test = steam_test.drop('steam-LBS', axis=1).reset_index(drop=True)

# Targets keep their date index.
YY_steam_train = steam_train['steam-LBS']
YY_steam_test = steam_test['steam-LBS']

# Formatted so the output is the same on Python 2 and Python 3.
print('{0} {1}'.format(XX_steam_train.shape, XX_steam_test.shape))
(705, 13) (294, 13)
# Find the optimal number of trees.
# For each forest size from 1 to 40, run 10-fold cross-validation on the
# steam training set and store the ten fold scores as column `n`.
scores = pd.DataFrame()
for n in range(1, 41):
    # min_samples_split=2 is the smallest legal value (a node needs at least
    # two samples to be split); scikit-learn >= 0.18 raises on the value 1.
    Rf = RandomForestRegressor(n_estimators=n, max_depth=None, min_samples_split=2, random_state=0)
    score = cross_val_score(Rf, XX_steam_train, YY_steam_train, cv=10)
    scores[n] = score

# One box per forest size, showing the spread of the fold scores.
sns.set_context("talk")
sns.set_style("white")
sns.boxplot(np.matrix(scores))
plt.xlabel("Number of trees")
plt.ylabel("Scores")
plt.title("The scores of the Random Forests for different number of trees.")
plt.xlim(0, 41)
# Clip the y axis to [-1, 1]; scores outside that range are not shown.
plt.ylim(-1, 1)
plt.show()
Choose number of trees as 21.
# Use the optimal number of trees for prediction:
# min_samples_split=2 is the smallest legal value; scikit-learn >= 0.18
# raises on the value 1.
RF_s = RandomForestRegressor(n_estimators=21, max_depth=None, min_samples_split=2, random_state=0)
RF_s.fit(XX_steam_train, YY_steam_train)
YY_steam_pred = RF_s.predict(XX_steam_test)

# Actual vs. predicted series over the test rows (x axis is the row position
# of the test set), drawn on the axes created here.
fig, ax = plt.subplots(1, 1, figsize=(20, 10))
line1, = ax.plot(XX_steam_test.index, YY_steam_test, label='Actual consumption', color='k')
line2, = ax.plot(XX_steam_test.index, YY_steam_pred, label='RF Regression Prediction', color='r')
ax.set_xlabel('Feature index', fontsize=18)
ax.set_ylabel('Normalized steam usage (LBS)', fontsize=18)
ax.set_title('Actual and RF predicted steam usage', fontsize=20)
ax.legend([line1, line2], ['Actual consumption', 'RF Regression Prediction'], fontsize=18)
plt.show()

# Score of the fitted model on the held-out test set; the call form prints
# the same text on Python 2 and Python 3.
print(RF_s.score(XX_steam_test, YY_steam_test))
0.960889669644
# Plot actual vs. predicted usage.
fig = plt.figure(figsize=(8, 8))
# Black points plot the actual values against themselves, i.e. the y = x
# reference on which a perfect prediction would lie; red points are the model.
plt.scatter(YY_steam_test, YY_steam_test, c='k')
plt.scatter(YY_steam_test, YY_steam_pred, c='r')
plt.xlabel('Actual Steam Usage (LBS): $Y_i$', fontsize=18)
# Raw strings: "\h" is not a valid escape sequence in a normal string literal.
plt.ylabel(r"Predicted Steam Usage (LBS): $\hat{Y}_i$", fontsize=18)
plt.title(r"Steam vs Predicted Steam: $Y_i$ vs $\hat{Y}_i$", fontsize=18)
plt.show()