In this notebook, we'll train a Linear Regression model for predicting building energy consumption based on historical energy data, several weather variables, hour of the day, day of the week, weekends and holidays.
To do this, we'll fit the model to daily and hourly energy and weather data from 2012-01-01 to 2014-10-31.
# special IPython command to prepare the notebook for matplotlib
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# `display.mpl_style` was deprecated and later removed from pandas; on a release
# without it the assignment raises pandas' OptionError, which subclasses both
# AttributeError and KeyError. Fall back to a matplotlib style sheet in that case.
try:
    pd.options.display.mpl_style = 'default'
except (AttributeError, KeyError):
    plt.style.use('ggplot')
from sklearn.linear_model import LinearRegression
def loadFeatures(path, boundaryColumns):
    """Read one feature spreadsheet and drop its interval-boundary columns.

    Parameters
    ----------
    path : str
        Location of the .xlsx file to read.
    boundaryColumns : list of str
        Columns to remove after loading (the start/end stamps of each interval).

    Returns
    -------
    pandas.DataFrame
        The sheet contents without ``boundaryColumns``.
    """
    # NOTE(review): later cells use df.index.weekday etc., so the frame must come
    # back with a DatetimeIndex. Presumably the sheets hold the timestamps in an
    # unnamed first column that this pandas version promotes to the index; newer
    # pandas releases may need index_col=0 here -- confirm against the data files.
    # `axis` is passed by keyword: the positional form is rejected by pandas >= 2.0.
    return pd.read_excel(path).drop(boundaryColumns, axis=1)


# Daily tables carry startDay/endDay; hourly tables carry startTime/endTime.
dailyElectricity = loadFeatures('Data/dailyElectricityWithFeatures.xlsx', ['startDay', 'endDay'])
dailyChilledWater = loadFeatures('Data/dailyChilledWaterWithFeatures.xlsx', ['startDay', 'endDay'])
dailySteam = loadFeatures('Data/dailySteamWithFeatures.xlsx', ['startDay', 'endDay'])
hourlyElectricity = loadFeatures('Data/hourlyElectricityWithFeatures.xlsx', ['startTime', 'endTime'])
hourlyChilledWater = loadFeatures('Data/hourlyChilledWaterWithFeatures.xlsx', ['startTime', 'endTime'])
hourlySteam = loadFeatures('Data/hourlySteamWithFeatures.xlsx', ['startTime', 'endTime'])

# Display the first rows of one dataframe as a sanity check
dailyElectricity.head()
electricity-kWh | RH-% | T-C | Tdew-C | pressure-mbar | solarRadiation-W/m2 | windDirection | windSpeed-m/s | humidityRatio-kg/kg | coolingDegrees | heatingDegrees | dehumidification | occupancy | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2012-01-01 | 2800.244977 | 76.652174 | 7.173913 | 3.073913 | 1004.956522 | 95.260870 | 236.086957 | 4.118361 | 0.004796 | 0.086957 | 7.826087 | 0 | 0.0 |
2012-01-02 | 3168.974047 | 55.958333 | 5.833333 | -2.937500 | 994.625000 | 87.333333 | 253.750000 | 5.914357 | 0.003415 | 0.000000 | 9.166667 | 0 | 0.3 |
2012-01-03 | 5194.533376 | 42.500000 | -3.208333 | -12.975000 | 1002.125000 | 95.708333 | 302.916667 | 6.250005 | 0.001327 | 0.000000 | 18.208333 | 0 | 0.3 |
2012-01-04 | 5354.861935 | 41.541667 | -7.083333 | -16.958333 | 1008.250000 | 98.750000 | 286.666667 | 5.127319 | 0.000890 | 0.000000 | 22.083333 | 0 | 0.3 |
2012-01-05 | 5496.223993 | 46.916667 | -0.583333 | -9.866667 | 1002.041667 | 90.750000 | 258.333333 | 5.162041 | 0.001746 | 0.000000 | 15.583333 | 0 | 0.3 |
Adding new features to the dataframe: weekday, day of the year and week of the year.
def addDailyTimeFeatures(df):
    """Add calendar features derived from the frame's datetime index.

    Three columns are written to ``df`` itself (the frame is modified in place
    and also returned, so ``df = addDailyTimeFeatures(df)`` keeps working):

    * ``weekday`` -- day of the week from ``index.weekday`` (Monday is 0)
    * ``day``     -- day of the year
    * ``week``    -- ISO week of the year

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by a ``DatetimeIndex``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the three columns added.
    """
    stamps = df.index
    df['weekday'] = stamps.weekday
    df['day'] = stamps.dayofyear
    if hasattr(stamps, 'isocalendar'):
        # DatetimeIndex.weekofyear was removed in pandas 2.0; isocalendar() is its
        # replacement. Cast to int64 and take the raw values so the column has the
        # same dtype as before and is assigned positionally rather than by label.
        df['week'] = stamps.isocalendar().week.astype('int64').values
    else:
        # Older pandas releases only provide weekofyear.
        df['week'] = stamps.weekofyear
    return df
# Attach the calendar features to each daily dataset.
dailyElectricity = addDailyTimeFeatures(dailyElectricity)
dailyChilledWater = addDailyTimeFeatures(dailyChilledWater)
dailySteam = addDailyTimeFeatures(dailySteam)

# Daily electricity is modelled from the calendar features and occupancy only.
df_elect = dailyElectricity[['weekday', 'day', 'week', 'occupancy', 'electricity-kWh']]

# Chronological split: train on 2012-01 .. 2013-06, test on 2013-07 .. 2014-10.
# Building a frame from an existing frame plus an explicit index reindexes it onto
# that calendar; dropna() then removes every row that has a missing value.
elect_train = pd.DataFrame(data=df_elect, index=np.arange('2012-01', '2013-07', dtype='datetime64[D]')).dropna()
elect_test = pd.DataFrame(data=df_elect, index=np.arange('2013-07', '2014-11', dtype='datetime64[D]')).dropna()

# Features get a fresh 0..n-1 index (used as the x axis of the plots below);
# the targets keep their date index.
XX_elect_train = elect_train.drop('electricity-kWh', axis=1).reset_index(drop=True)
XX_elect_test = elect_test.drop('electricity-kWh', axis=1).reset_index(drop=True)

YY_elect_train = elect_train['electricity-kWh']
YY_elect_test = elect_test['electricity-kWh']

lr_elect = LinearRegression()
lr_elect.fit(XX_elect_train, YY_elect_train)

y_lr = lr_elect.predict(XX_elect_test)

# print(...) with a single pre-formatted string behaves the same on Python 2 and 3.
print("The test score R2:  {}".format(lr_elect.score(XX_elect_test, YY_elect_test)))

print("The Linear Regression coefficients are")
# list(...) because zip() is a lazy iterator on Python 3.
pd.DataFrame(list(zip(XX_elect_train.columns, lr_elect.coef_)), columns=['elect_features', 'linearRegr_Coefficients'])
The test score R2: 0.608937488563 The Linear Regression coefficients are
elect_features | linearRegr_Coefficients | |
---|---|---|
0 | weekday | -125.392163 |
1 | day | 0.550121 |
2 | week | -11.553215 |
3 | occupancy | 2830.298384 |
# Plot observed and predicted daily electricity usage over the test set
# y_lr is reused by the chilled water and steam cells, so recompute it here (as
# those cells do) instead of relying on which cell happened to run last.
y_lr = lr_elect.predict(XX_elect_test)
fig = plt.figure(figsize=(15,7))
plt.scatter(XX_elect_test.index, YY_elect_test, label='Observed', color='k')
plt.plot(XX_elect_test.index, y_lr, label='Predicted', color='g')
plt.xlabel('Test sample index')
plt.ylabel('Elec. Usage (kWh)')
plt.title('Daily Electricity: Observed and Predicted (test set)')
plt.legend(loc='upper right');
<matplotlib.legend.Legend at 0x32b713c8>
# Plot observed vs. Linear Regression predicted usage; the black line is y = x,
# i.e. where the points would fall for a perfect prediction.
# y_lr is shared with the other daily models, so recompute it for this one.
y_lr = lr_elect.predict(XX_elect_test)
fig = plt.figure(figsize=(6,6))
plt.plot(YY_elect_test, YY_elect_test, c='k')
plt.scatter(YY_elect_test, y_lr, c='g')
plt.xlabel('Observed Elec. Usage (kWh)')
# Raw strings: "\h" is not a valid escape sequence in a normal string literal.
plt.ylabel(r"Predicted Elec. Usage (kWh): $\hat{Y}_i$")
plt.title(r"Observed vs Predicted Elec.: $Y_i$ vs $\hat{Y}_i$");
<matplotlib.text.Text at 0x365fab00>
# Daily chilled water model: every column except the target is used as a feature.
# Chronological split: train on 2012-01 .. 2013-06, test on 2013-07 .. 2014-10.
# Building a frame from an existing frame plus an explicit index reindexes it onto
# that calendar; dropna() then removes every row that has a missing value.
chilledw_train = pd.DataFrame(data=dailyChilledWater, index=np.arange('2012-01', '2013-07', dtype='datetime64[D]')).dropna()
chilledw_test = pd.DataFrame(data=dailyChilledWater, index=np.arange('2013-07', '2014-11', dtype='datetime64[D]')).dropna()

# Features get a fresh 0..n-1 index; the targets keep their date index.
XX_chilledw_train = chilledw_train.drop('chilledWater-TonDays', axis=1).reset_index(drop=True)
XX_chilledw_test = chilledw_test.drop('chilledWater-TonDays', axis=1).reset_index(drop=True)

YY_chilledw_train = chilledw_train['chilledWater-TonDays']
YY_chilledw_test = chilledw_test['chilledWater-TonDays']

lr_chilledw = LinearRegression()
lr_chilledw.fit(XX_chilledw_train, YY_chilledw_train)

# print(...) with a single pre-formatted string behaves the same on Python 2 and 3.
print("The test score R2:  {}".format(lr_chilledw.score(XX_chilledw_test, YY_chilledw_test)))

print("The Linear Regression coefficients are")
# list(...) because zip() is a lazy iterator on Python 3.
pd.DataFrame(list(zip(XX_chilledw_train.columns, lr_chilledw.coef_)), columns=['chilledw_features', 'linearRegr_Coefficients'])
The test score R2: 0.830709188732 The Linear Regression coefficients are
chilledw_features | linearRegr_Coefficients | |
---|---|---|
0 | RH-% | 0.464299 |
1 | T-C | 6.062113 |
2 | Tdew-C | -2.486768 |
3 | pressure-mbar | -0.095268 |
4 | solarRadiation-W/m2 | 0.042885 |
5 | windDirection | -0.025036 |
6 | windSpeed-m/s | -1.166902 |
7 | humidityRatio-kg/kg | 1673.166705 |
8 | coolingDegrees | 2.853128 |
9 | heatingDegrees | 4.421394 |
10 | dehumidification | 2999.125771 |
11 | occupancy | 0.571356 |
12 | weekday | -2.461900 |
13 | day | -0.010718 |
14 | week | 0.122757 |
# Plot observed and predicted daily chilled water usage over the test set
y_lr = lr_chilledw.predict(XX_chilledw_test)
fig = plt.figure(figsize=(15,7))
plt.scatter(XX_chilledw_test.index, YY_chilledw_test, label='Observed', color='k')
plt.plot(XX_chilledw_test.index, y_lr, label='Predicted', color='r')
plt.xlabel('Test sample index')
plt.ylabel('Chilled Water Usage (TonDays)')
plt.title('Daily Chilled Water: Observed and Predicted (test set)')
plt.legend(loc='upper right');
<matplotlib.legend.Legend at 0x34131400>
# Plot observed vs. predicted usage; the black line is y = x, i.e. where the
# points would fall for a perfect prediction.
# y_lr is shared with the other daily models, so recompute it for this one.
y_lr = lr_chilledw.predict(XX_chilledw_test)
fig = plt.figure(figsize=(6,6))
plt.plot(YY_chilledw_test, YY_chilledw_test, c='k')
plt.scatter(YY_chilledw_test, y_lr, c='r')
plt.xlabel('Observed Chilled Water Usage (TonDays)')
# Raw strings: "\h" is not a valid escape sequence in a normal string literal.
plt.ylabel(r"Predicted Chilled Water Usage (TonDays): $\hat{Y}_i$")
plt.title(r"Observed vs Predicted Chilled Water: $Y_i$ vs $\hat{Y}_i$");
<matplotlib.text.Text at 0x31a16c88>
# Daily steam model: every column except the target is used as a feature.
# Chronological split: train on 2012-01 .. 2013-06, test on 2013-07 .. 2014-10.
# Building a frame from an existing frame plus an explicit index reindexes it onto
# that calendar; dropna() then removes every row that has a missing value.
steam_train = pd.DataFrame(data=dailySteam, index=np.arange('2012-01', '2013-07', dtype='datetime64[D]')).dropna()
steam_test = pd.DataFrame(data=dailySteam, index=np.arange('2013-07', '2014-11', dtype='datetime64[D]')).dropna()

# Features get a fresh 0..n-1 index; the targets keep their date index.
XX_steam_train = steam_train.drop('steam-LBS', axis=1).reset_index(drop=True)
XX_steam_test = steam_test.drop('steam-LBS', axis=1).reset_index(drop=True)

YY_steam_train = steam_train['steam-LBS']
YY_steam_test = steam_test['steam-LBS']

lr_steam = LinearRegression()
lr_steam.fit(XX_steam_train, YY_steam_train)

# print(...) with a single pre-formatted string behaves the same on Python 2 and 3.
print("The test score R2:  {}".format(lr_steam.score(XX_steam_test, YY_steam_test)))

print("The Linear Regression coefficients are")
# list(...) because zip() is a lazy iterator on Python 3.
pd.DataFrame(list(zip(XX_steam_train.columns, lr_steam.coef_)), columns=['steam_features', 'linearRegr_Coefficients'])
The test score R2: 0.942276415896 The Linear Regression coefficients are
steam_features | linearRegr_Coefficients | |
---|---|---|
0 | RH-% | 66.535470 |
1 | T-C | 458.096751 |
2 | Tdew-C | -951.521615 |
3 | pressure-mbar | -30.891470 |
4 | solarRadiation-W/m2 | -18.446292 |
5 | windDirection | -7.828922 |
6 | windSpeed-m/s | 251.824413 |
7 | humidityRatio-kg/kg | 857001.445663 |
8 | coolingDegrees | -99.989152 |
9 | heatingDegrees | 1794.351286 |
10 | dehumidification | -482120.622688 |
11 | occupancy | 3150.501909 |
12 | weekday | -531.583401 |
13 | day | -1.499061 |
14 | week | -43.000664 |
# Plot observed and predicted daily steam usage over the test set
y_lr = lr_steam.predict(XX_steam_test)
fig = plt.figure(figsize=(15,7))
plt.scatter(XX_steam_test.index, YY_steam_test, label='Observed', color='k')
plt.plot(XX_steam_test.index, y_lr, label='Predicted', color='g')
plt.xlabel('Test sample index')
plt.ylabel('Steam Usage (LBS)')
plt.title('Daily Steam: Observed and Predicted (test set)')
plt.legend(loc='upper right');
<matplotlib.legend.Legend at 0x35b66d68>
# Plot observed vs. predicted usage; the black line is y = x, i.e. where the
# points would fall for a perfect prediction.
# y_lr is shared with the other daily models, so recompute it for this one.
y_lr = lr_steam.predict(XX_steam_test)
fig = plt.figure(figsize=(6,6))
plt.plot(YY_steam_test, YY_steam_test, c='k')
plt.scatter(YY_steam_test, y_lr, c='g')
plt.xlabel('Observed Steam Usage (LBS)')
# Raw strings: "\h" is not a valid escape sequence in a normal string literal.
plt.ylabel(r"Predicted Steam Usage (LBS): $\hat{Y}_i$")
plt.title(r"Observed vs Predicted Steam: $Y_i$ vs $\hat{Y}_i$");
<matplotlib.text.Text at 0x3429a630>
Adding new features to the dataframe: hour, weekday, day of the year and week of the year.
def addHourlyTimeFeatures(df):
    """Add clock and calendar features derived from the frame's datetime index.

    Four columns are written to ``df`` itself (the frame is modified in place
    and also returned, so ``df = addHourlyTimeFeatures(df)`` keeps working):

    * ``hour``    -- hour of the day
    * ``weekday`` -- day of the week from ``index.weekday`` (Monday is 0)
    * ``day``     -- day of the year
    * ``week``    -- ISO week of the year

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by a ``DatetimeIndex``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the four columns added.
    """
    stamps = df.index
    df['hour'] = stamps.hour
    df['weekday'] = stamps.weekday
    df['day'] = stamps.dayofyear
    if hasattr(stamps, 'isocalendar'):
        # DatetimeIndex.weekofyear was removed in pandas 2.0; isocalendar() is its
        # replacement. Cast to int64 and take the raw values so the column has the
        # same dtype as before and is assigned positionally rather than by label.
        df['week'] = stamps.isocalendar().week.astype('int64').values
    else:
        # Older pandas releases only provide weekofyear.
        df['week'] = stamps.weekofyear
    return df
hourlyElectricity = addHourlyTimeFeatures(hourlyElectricity)

# Hourly electricity is modelled from the clock/calendar features, cosHour and occupancy.
df_hourlyelect = hourlyElectricity[['hour', 'weekday', 'day', 'week', 'cosHour',
                                    'occupancy', 'electricity-kWh']]

# Chronological split: train on 2014-01 .. 2014-09, test on 2014-10.
# Building a frame from an existing frame plus an explicit index reindexes it onto
# that hourly calendar; dropna() then removes every row that has a missing value.
hourlyelect_train = pd.DataFrame(data=df_hourlyelect, index=np.arange('2014-01-01 00:00:00', '2014-10-01 00:00:00', dtype='datetime64[h]')).dropna()
hourlyelect_test = pd.DataFrame(data=df_hourlyelect, index=np.arange('2014-10-01 00:00:00', '2014-11-01 00:00:00', dtype='datetime64[h]')).dropna()

# Features get a fresh 0..n-1 index; the targets keep their timestamp index.
XX_hourlyelect_train = hourlyelect_train.drop('electricity-kWh', axis=1).reset_index(drop=True)
XX_hourlyelect_test = hourlyelect_test.drop('electricity-kWh', axis=1).reset_index(drop=True)

YY_hourlyelect_train = hourlyelect_train['electricity-kWh']
YY_hourlyelect_test = hourlyelect_test['electricity-kWh']

lr_hourlyelect = LinearRegression()
lr_hourlyelect.fit(XX_hourlyelect_train, YY_hourlyelect_train)

y_hourlyelect_lr = lr_hourlyelect.predict(XX_hourlyelect_test)

# print(...) with a single pre-formatted string behaves the same on Python 2 and 3.
print("The test score R2:  {}".format(lr_hourlyelect.score(XX_hourlyelect_test, YY_hourlyelect_test)))

print("The Linear Regression coefficients are")
# list(...) because zip() is a lazy iterator on Python 3.
pd.DataFrame(list(zip(XX_hourlyelect_train.columns, lr_hourlyelect.coef_)), columns=['hourlyelect_features', 'linearRegr_Coefficients'])
The test score R2: 0.714713369958 The Linear Regression coefficients are
hourlyelect_features | linearRegr_Coefficients | |
---|---|---|
0 | hour | -0.287362 |
1 | weekday | -6.995868 |
2 | day | -0.309981 |
3 | week | 0.955127 |
4 | cosHour | -81.049080 |
5 | occupancy | 114.803110 |
# Plot observed and predicted hourly electricity usage over the test set
fig = plt.figure(figsize=(25,7))
plt.scatter(XX_hourlyelect_test.index, YY_hourlyelect_test, label='Observed', color='k')
plt.plot(XX_hourlyelect_test.index, y_hourlyelect_lr, label='Predicted', color='r')
plt.xlabel('Test sample index')
plt.ylabel('Elec. Usage (kWh)')
plt.title('Hourly Electricity: Observed and Predicted (test set)')
plt.legend(loc='upper right');
<matplotlib.legend.Legend at 0x3416b710>
# Plot observed vs. predicted usage; the black line is y = x, i.e. where the
# points would fall for a perfect prediction.
fig = plt.figure(figsize=(6,6))
plt.plot(YY_hourlyelect_test, YY_hourlyelect_test, c='k')
plt.scatter(YY_hourlyelect_test, y_hourlyelect_lr, c='r')
plt.xlabel('Observed Elec. Usage (kWh)')
# Raw strings: "\h" is not a valid escape sequence in a normal string literal.
plt.ylabel(r"Predicted Elec. Usage (kWh): $\hat{Y}_i$")
plt.title(r"Observed vs Predicted Elec.: $Y_i$ vs $\hat{Y}_i$");
<matplotlib.text.Text at 0x33ea2fd0>
# Hourly chilled water model: every column except the target is used as a feature.
# Chronological split: train on 2014-01 .. 2014-08, test on 2014-09 .. 2014-10.
# Building a frame from an existing frame plus an explicit index reindexes it onto
# that hourly calendar; dropna() then removes every row that has a missing value.
hourlychilledw_train = pd.DataFrame(data=hourlyChilledWater, index=np.arange('2014-01-01 00:00:00', '2014-09-01 00:00:00', dtype='datetime64[h]')).dropna()
hourlychilledw_test = pd.DataFrame(data=hourlyChilledWater, index=np.arange('2014-09-01 00:00:00', '2014-11-01 00:00:00', dtype='datetime64[h]')).dropna()

# Features get a fresh 0..n-1 index; the targets keep their timestamp index.
XX_hourlychilledw_train = hourlychilledw_train.drop('chilledWater-TonDays', axis=1).reset_index(drop=True)
XX_hourlychilledw_test = hourlychilledw_test.drop('chilledWater-TonDays', axis=1).reset_index(drop=True)

YY_hourlychilledw_train = hourlychilledw_train['chilledWater-TonDays']
YY_hourlychilledw_test = hourlychilledw_test['chilledWater-TonDays']

lr_hourlychilledw = LinearRegression()
lr_hourlychilledw.fit(XX_hourlychilledw_train, YY_hourlychilledw_train)

y_hourlychilledw_lr = lr_hourlychilledw.predict(XX_hourlychilledw_test)

# print(...) with a single pre-formatted string behaves the same on Python 2 and 3.
print("The test score R2:  {}".format(lr_hourlychilledw.score(XX_hourlychilledw_test, YY_hourlychilledw_test)))

print("The Linear Regression coefficients are")
# list(...) because zip() is a lazy iterator on Python 3.
pd.DataFrame(list(zip(XX_hourlychilledw_train.columns, lr_hourlychilledw.coef_)), columns=['hourlychilledw_features', 'linearRegr_Coefficients'])
The test score R2: 0.709930521875 The Linear Regression coefficients are
hourlychilledw_features | linearRegr_Coefficients | |
---|---|---|
0 | RH-% | -0.028198 |
1 | T-C | 0.459533 |
2 | Tdew-C | 0.166999 |
3 | pressure-mbar | -0.007099 |
4 | solarRadiation-W/m2 | 0.001003 |
5 | windDirection | -0.000382 |
6 | windSpeed-m/s | 0.004837 |
7 | humidityRatio-kg/kg | -91.425425 |
8 | coolingDegrees | -0.172407 |
9 | heatingDegrees | 0.603195 |
10 | dehumidification | 226.397306 |
11 | occupancy | 0.483000 |
12 | cosHour | -0.562715 |
# Plot observed and predicted hourly chilled water usage over the test set
fig = plt.figure(figsize=(15,7))
plt.scatter(XX_hourlychilledw_test.index, YY_hourlychilledw_test, label='Observed', color='k')
plt.plot(XX_hourlychilledw_test.index, y_hourlychilledw_lr, label='Predicted', color='g')
plt.xlabel('Test sample index')
plt.ylabel('Chilled Water Usage (TonDays)')
plt.title('Hourly Chilled Water: Observed and Predicted (test set)')
plt.legend(loc='upper right');
<matplotlib.legend.Legend at 0x36eeeda0>
# Plot observed vs. predicted usage; the black line is y = x, i.e. where the
# points would fall for a perfect prediction.
fig = plt.figure(figsize=(6,6))
plt.plot(YY_hourlychilledw_test, YY_hourlychilledw_test, c='k')
plt.scatter(YY_hourlychilledw_test, y_hourlychilledw_lr, c='g')
plt.xlabel('Observed Chilled Water Usage (TonDays)')
# Raw strings: "\h" is not a valid escape sequence in a normal string literal.
plt.ylabel(r"Predicted Chilled Water Usage (TonDays): $\hat{Y}_i$")
plt.title(r"Observed vs Predicted Chilled Water: $Y_i$ vs $\hat{Y}_i$");
<matplotlib.text.Text at 0x390844e0>
# Hourly steam model: every column except the target is used as a feature.
# Chronological split: train on 2012-01 .. 2014-01, test on 2014-02 .. 2014-10.
# Building a frame from an existing frame plus an explicit index reindexes it onto
# that hourly calendar; dropna() then removes every row that has a missing value.
hourlysteam_train = pd.DataFrame(data=hourlySteam, index=np.arange('2012-01-01 00:00:00', '2014-02-01 00:00:00', dtype='datetime64[h]')).dropna()
hourlysteam_test = pd.DataFrame(data=hourlySteam, index=np.arange('2014-02-01 00:00:00', '2014-11-01 00:00:00', dtype='datetime64[h]')).dropna()

# Features get a fresh 0..n-1 index; the targets keep their timestamp index.
XX_hourlysteam_train = hourlysteam_train.drop('steam-LBS', axis=1).reset_index(drop=True)
XX_hourlysteam_test = hourlysteam_test.drop('steam-LBS', axis=1).reset_index(drop=True)

YY_hourlysteam_train = hourlysteam_train['steam-LBS']
YY_hourlysteam_test = hourlysteam_test['steam-LBS']

lr_hourlysteam = LinearRegression()
lr_hourlysteam.fit(XX_hourlysteam_train, YY_hourlysteam_train)

y_hourlysteam_lr = lr_hourlysteam.predict(XX_hourlysteam_test)

# print(...) with a single pre-formatted string behaves the same on Python 2 and 3.
print("The test score R2:  {}".format(lr_hourlysteam.score(XX_hourlysteam_test, YY_hourlysteam_test)))

# Message worded like the other model cells.
print("The Linear Regression coefficients are")
# list(...) because zip() is a lazy iterator on Python 3.
pd.DataFrame(list(zip(XX_hourlysteam_train.columns, lr_hourlysteam.coef_)), columns=['hourlysteam_features', 'linearRegr_Coefficients'])
The test score R2: 0.764295430491 The coefficients Linear Regression are
hourlysteam_features | linearRegr_Coefficients | |
---|---|---|
0 | RH-% | 5.367666 |
1 | T-C | 8.577206 |
2 | Tdew-C | -54.743326 |
3 | pressure-mbar | -0.279591 |
4 | solarRadiation-W/m2 | 0.138138 |
5 | windDirection | 0.041451 |
6 | windSpeed-m/s | 13.943372 |
7 | humidityRatio-kg/kg | 75847.104324 |
8 | coolingDegrees | -31.597421 |
9 | heatingDegrees | 57.903822 |
10 | dehumidification | -8088.312347 |
11 | occupancy | 131.534596 |
12 | cosHour | -343.896782 |
# Plot observed and predicted hourly steam usage over the test set
fig = plt.figure(figsize=(15,7))
plt.scatter(XX_hourlysteam_test.index, YY_hourlysteam_test, label='Observed', color='k')
plt.plot(XX_hourlysteam_test.index, y_hourlysteam_lr, label='Predicted', color='r')
plt.xlabel('Test sample index')
plt.ylabel('Steam Usage (LBS)')
plt.title('Hourly Steam: Observed and Predicted (test set)')
plt.legend(loc='upper right');
<matplotlib.legend.Legend at 0x3ad09438>
# Plot observed vs. predicted usage; the black line is y = x, i.e. where the
# points would fall for a perfect prediction.
fig = plt.figure(figsize=(6,6))
plt.plot(YY_hourlysteam_test, YY_hourlysteam_test, c='k')
plt.scatter(YY_hourlysteam_test, y_hourlysteam_lr, c='r')
plt.xlabel('Observed Steam Usage (LBS)')
# Raw strings: "\h" is not a valid escape sequence in a normal string literal.
plt.ylabel(r"Predicted Steam Usage (LBS): $\hat{Y}_i$")
plt.title(r"Observed vs Predicted Steam: $Y_i$ vs $\hat{Y}_i$");
<matplotlib.text.Text at 0x3a6659b0>