# This notebook is for challenge 2 of the Cambridge Energy Data Lab GitHub tasks¶

In [395]:
import numpy
import pandas
import pylab


## Steps 1 & 2¶

• The data-set consists of two columns: a time stamp and the energy consumption

#### Load data file as pandas DataFrame¶

In [396]:
# Read the raw readings. The file is parsed without a header row, so the two
# columns are named explicitly and the first one is parsed as timestamps.
df_total_watt = pandas.read_csv(
    '../../data/total_watt.csv',
    header=None,
    names=['datetime', 'consumption'],
    parse_dates=['datetime'],
)


In [398]:
# Display the loaded frame to confirm the row count and the column dtypes.
df_total_watt

Out[398]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1601 entries, 0 to 1600
Data columns (total 2 columns):
datetime       1601  non-null values
consumption    1601  non-null values
dtypes: datetime64[ns](1), float64(1)

In [399]:
# Preview the first rows to check the timestamp and consumption columns.
df_total_watt.head()

Out[399]:
datetime consumption
0 2011-04-18 13:22:00 925.840614
1 2011-04-18 13:52:00 483.295892
2 2011-04-18 14:22:00 915.761634
3 2011-04-18 14:52:00 609.043491
4 2011-04-18 15:22:00 745.155434
In [405]:
# Summary statistics of the individual consumption readings.
df_total_watt.consumption.describe()

Out[405]:
count    1601.000000
mean      509.789108
std       788.525174
min        55.535874
25%       179.004211
50%       312.145277
75%       510.456476
max      9107.922620
dtype: float64

## Step 3¶

• visualise the data-set

#### Simply Visualize the Dataset as it is¶

Use a line plot instead of a bar chart because there are too many data points.

In [402]:
# Line plot of every reading against its timestamp, drawn on the current axes.
axes = pylab.gca()
axes.plot(df_total_watt.datetime, df_total_watt.consumption)
axes.set_ylabel('consumption (Wh)')
axes.set_xlabel('time')
axes.set_title('energy consumption per 30mins')

Out[402]:
<matplotlib.text.Text at 0x124ee3e10>

#### Visualize changes per 30mins¶

In [406]:
# Difference between each reading and the previous one, plotted over time.
consumption_change = df_total_watt.consumption.diff()
axes = pylab.gca()
axes.plot(df_total_watt.datetime, consumption_change)
axes.set_ylabel('consumption change (Wh)')
axes.set_xlabel('time')
axes.set_title('energy consumption change per 30mins')

Out[406]:
<matplotlib.text.Text at 0x127621e50>

## Step 4¶

• visualise the data-set as values per day

#### Group by date¶

In [407]:
# Tag every reading with its calendar date, then total the consumption per day.
df_total_watt['date'] = [dt.date() for dt in df_total_watt['datetime']]
# Sum only the 'consumption' column: a bare groupby(...).sum() would also try to
# add up the 'datetime' column, which newer pandas versions refuse to do.
df_total_watt_daily = df_total_watt.groupby('date')['consumption'].sum().reset_index()

In [408]:
# Preview the first rows of the per-day totals.
df_total_watt_daily.head()

Out[408]:
date consumption
0 2011-04-18 17105.982347
1 2011-04-19 30440.466453
2 2011-04-20 24027.226338
3 2011-04-21 17475.725746
4 2011-04-22 18776.278103
In [409]:
# One bar per calendar date showing that day's summed consumption.
axes = pylab.gca()
axes.bar(df_total_watt_daily.date, df_total_watt_daily.consumption)
axes.set_ylabel('consumption(Wh)')
axes.set_xlabel('date')
axes.set_title('energy consumption per day')

Out[409]:
<matplotlib.text.Text at 0x128062590>

## Step 5¶

• cluster the values per day into 3 groups: low, medium, and high energy consumption

#### Quickly check basic stats¶

In [362]:
# Summary statistics of the per-day totals (mean and std are used for the thresholds below).
df_total_watt_daily.consumption.describe()

Out[362]:
count       35.000000
mean     23319.210323
std      14109.683226
min       8278.602258
25%      13794.652107
50%      18776.278103
75%      25235.158554
max      66411.835632
dtype: float64

#### Check how total Wh/day is distributed¶

The distribution is skewed: most days fall in the range of 10,000 to 25,000 Wh.

In [640]:
# Histogram of the daily totals using fixed-width bins of `interval`.
count = {}
interval = 5000
consumption_ranges = range(0, 70000, interval)
daily_consumption = df_total_watt_daily['consumption']
for consumption_range in consumption_ranges:
    # Number of days whose total lies in [consumption_range, consumption_range + interval).
    in_bin = (daily_consumption >= consumption_range) & (daily_consumption < consumption_range + interval)
    count[consumption_range] = int(in_bin.sum())

# Pass plain lists rather than dict views, and anchor each bar at its lower bin
# edge so that it spans exactly the range it counts.
pylab.bar(list(count.keys()), list(count.values()), width=interval, align='edge')
pylab.ylabel('frequency (days)')
# Unit kept consistent with the other daily-total plots in this notebook, which use Wh.
pylab.xlabel('total consumption per day (Wh)')
pylab.title('distribution of energy consumption per day')

Out[640]:
<matplotlib.text.Text at 0x1328e3150>

#### Define high, low and medium as follows.¶

• high > mean + std
• low < mean - std
• mean - std <= medium <= mean + std

(*) As the chart above shows, consumption is not normally distributed, so the band within 1 standard deviation of the mean does not necessarily contain about 68% of the days in this case.

In [410]:
# High-consumption days: daily total strictly above mean + one standard deviation.
daily_consumption = df_total_watt_daily.consumption
high_threshold = daily_consumption.mean() + daily_consumption.std()
df_total_watt_daily_high = df_total_watt_daily[daily_consumption > high_threshold]

In [411]:
# Low-consumption days: daily total strictly below mean - one standard deviation.
daily_consumption = df_total_watt_daily.consumption
low_threshold = daily_consumption.mean() - daily_consumption.std()
df_total_watt_daily_low = df_total_watt_daily[daily_consumption < low_threshold]

In [412]:
# Medium-consumption days: daily total within one standard deviation of the mean
# (both bounds inclusive).
daily_consumption = df_total_watt_daily.consumption
upper_bound = daily_consumption.mean() + daily_consumption.std()
lower_bound = daily_consumption.mean() - daily_consumption.std()
within_band = (daily_consumption <= upper_bound) & (daily_consumption >= lower_bound)
df_total_watt_daily_middle = df_total_watt_daily[within_band]

In [413]:
# Days classified as low consumption.
df_total_watt_daily_low

Out[413]:
date consumption
16 2011-05-06 8278.602258
In [414]:
# Days classified as high consumption.
df_total_watt_daily_high

Out[414]:
date consumption
5 2011-04-23 41551.530034
6 2011-04-24 58647.570168
12 2011-04-30 66411.835632
13 2011-05-01 42598.899117
32 2011-05-22 51413.203602
33 2011-05-23 40378.805967
In [415]:
# First rows of the days classified as medium consumption.
df_total_watt_daily_middle.head()

Out[415]:
date consumption
0 2011-04-18 17105.982347
1 2011-04-19 30440.466453
2 2011-04-20 24027.226338
3 2011-04-21 17475.725746
4 2011-04-22 18776.278103

## Step 6¶

• visualise the clusters
In [416]:
# Daily totals coloured by mean/std cluster: green = middle, red = high, blue = low.
axes = pylab.gca()
middle = axes.bar(df_total_watt_daily_middle.date, df_total_watt_daily_middle.consumption, color='green')
high = axes.bar(df_total_watt_daily_high.date, df_total_watt_daily_high.consumption, color='red')
low = axes.bar(df_total_watt_daily_low.date, df_total_watt_daily_low.consumption, color='blue')
axes.set_ylabel('energy consumption per day (Wh)')
axes.set_xlabel('date')
axes.set_title('clustered energy consumption per day')
axes.legend((middle, high, low), ('middle','high', 'low'))

Out[416]:
<matplotlib.legend.Legend at 0x127d36550>

## (Optional) Use log values for categorization¶

#### Plot daily consumption in order¶

The ordered values suggest that the log of the consumption is more suitable for clustering.

In [598]:
# Daily totals sorted in ascending order and drawn against their rank.
# DataFrame.sort() has been removed from pandas; sort_values() is its replacement.
pylab.bar(range(df_total_watt_daily.consumption.count()), df_total_watt_daily.sort_values('consumption')['consumption'])
pylab.ylabel('energy consumption per day (Wh)')
pylab.xlabel('order')
pylab.title('energy consumption per day ordered by consumption')

Out[598]:
<matplotlib.text.Text at 0x131c79290>
In [597]:
# Same ordered bar chart as before, but with a logarithmic y-axis.
# DataFrame.sort() has been removed from pandas; sort_values() is its replacement.
pylab.bar(range(df_total_watt_daily.consumption.count()), df_total_watt_daily.sort_values('consumption')['consumption'],
          log=True)
pylab.ylabel('energy consumption per day (Wh)')
pylab.xlabel('order')
pylab.title('energy consumption per day ordered by consumption')

Out[597]:
<matplotlib.text.Text at 0x131a00310>

#### Use log value this time¶

• high > mean + std
• low < mean - std
• mean - std <= medium <= mean + std
In [560]:
# Natural log of each daily total. A bare `log` is not imported by any visible cell,
# so call numpy's vectorised implementation explicitly on the whole column.
df_total_watt_daily['consumption_log'] = numpy.log(df_total_watt_daily.consumption)

In [418]:
# Summary statistics of the log-transformed daily totals.
df_total_watt_daily['consumption_log'].describe()

Out[418]:
count    35.000000
mean      9.915222
std       0.517556
min       9.021429
25%       9.531914
50%       9.840350
75%      10.135928
max      11.103631
dtype: float64
In [419]:
# Same mean +/- one standard deviation split as before, applied to the log values.
log_consumption = df_total_watt_daily['consumption_log']
log_upper = log_consumption.mean() + log_consumption.std()
log_lower = log_consumption.mean() - log_consumption.std()
df_total_watt_daily_log_high = df_total_watt_daily[log_consumption > log_upper]
df_total_watt_daily_log_low = df_total_watt_daily[log_consumption < log_lower]
# Middle band includes both bounds.
df_total_watt_daily_log_middle = df_total_watt_daily[(log_consumption <= log_upper) & (log_consumption >= log_lower)]

In [420]:
# Daily totals coloured by log-based cluster: green = middle, red = high, blue = low.
axes = pylab.gca()
middle = axes.bar(df_total_watt_daily_log_middle.date, df_total_watt_daily_log_middle.consumption, color='green')
high = axes.bar(df_total_watt_daily_log_high.date, df_total_watt_daily_log_high.consumption, color='red')
low = axes.bar(df_total_watt_daily_log_low.date, df_total_watt_daily_log_low.consumption, color='blue')
axes.set_ylabel('energy consumption per day (Wh)')
axes.set_xlabel('date')
axes.set_title('clustered energy consumption per day')
axes.legend((middle, high, low), ('middle','high', 'low'))

Out[420]:
<matplotlib.legend.Legend at 0x12a023550>

## (Optional 2) Use K-means for Categorization¶

Simply create three clusters by k-means

#### k-means clustering using the 25%, 50% and 75% percentiles as initial centroids¶

In [463]:
from scipy.cluster.vq import kmeans2, whiten

In [578]:
# Rescale the daily totals with whiten(), then run k-means with three clusters
# seeded at the 25th, 50th and 75th percentiles of the rescaled values.
whitened = whiten(df_total_watt_daily.consumption.values)
quartiles = numpy.percentile(whitened, [25, 50, 75])
initial_centroids = numpy.array(quartiles)
centroids, labels = kmeans2(whitened, initial_centroids)

In [579]:
# Inspect the final centroids and the cluster index assigned to each day.
centroids, labels

Out[579]:
(array([ 0.97785871,  1.70177426,  3.60740752]),
array([0, 1, 1, 0, 1, 2, 2, 1, 1, 1, 0, 0, 2, 2, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 1, 1, 1, 0, 0, 0, 0, 0, 2, 2, 1]))
In [580]:
# Attach each day's k-means cluster index to the daily frame (adds a column in place).
df_total_watt_daily['labels'] = labels

In [581]:
# Split the days by k-means label: 0 -> low, 1 -> middle, 2 -> high.
day_labels = df_total_watt_daily['labels']
(df_total_watt_daily_cluster_low,
 df_total_watt_daily_cluster_middle,
 df_total_watt_daily_cluster_high) = [df_total_watt_daily[day_labels == cluster_id] for cluster_id in (0, 1, 2)]

In [582]:
# Daily totals coloured by k-means cluster: blue = low, green = middle, red = high.
axes = pylab.gca()
low = axes.bar(df_total_watt_daily_cluster_low.date, df_total_watt_daily_cluster_low.consumption, color='blue')
middle = axes.bar(df_total_watt_daily_cluster_middle.date, df_total_watt_daily_cluster_middle.consumption, color='green')
high = axes.bar(df_total_watt_daily_cluster_high.date, df_total_watt_daily_cluster_high.consumption, color='red')
axes.set_ylabel('energy consumption per day (Wh)')
axes.set_xlabel('date')
axes.set_title('clustered energy consumption per day')
axes.legend((middle, high, low), ('middle','high', 'low'))

Out[582]:
<matplotlib.legend.Legend at 0x12eb65610>