URL: https://github.com/camenergydatalab/EnergyDataSimulationChallenge
import numpy
import pandas
import pylab
# Load the raw readings. The file has no header row, so column names are
# supplied here and the timestamp column is parsed into datetimes.
df_total_watt = pandas.read_csv(
    '../../data/total_watt.csv',
    header=None,
    names=['datetime', 'consumption'],
    parse_dates=['datetime'],
)
df_total_watt
<class 'pandas.core.frame.DataFrame'> Int64Index: 1601 entries, 0 to 1600 Data columns (total 2 columns): datetime 1601 non-null values consumption 1601 non-null values dtypes: datetime64[ns](1), float64(1)
# Preview the first rows to check that both columns parsed as expected.
df_total_watt.head()
datetime | consumption | |
---|---|---|
0 | 2011-04-18 13:22:00 | 925.840614 |
1 | 2011-04-18 13:52:00 | 483.295892 |
2 | 2011-04-18 14:22:00 | 915.761634 |
3 | 2011-04-18 14:52:00 | 609.043491 |
4 | 2011-04-18 15:22:00 | 745.155434 |
# Summary statistics for the individual consumption readings.
df_total_watt.consumption.describe()
count 1601.000000 mean 509.789108 std 788.525174 min 55.535874 25% 179.004211 50% 312.145277 75% 510.456476 max 9107.922620 dtype: float64
Use a line plot instead of a bar chart because there are too many data points.
# Line plot of every reading against its timestamp.
pylab.plot(df_total_watt['datetime'], df_total_watt['consumption'])
pylab.xlabel('time')
pylab.ylabel('consumption (Wh)')
pylab.title('energy consumption per 30mins')
<matplotlib.text.Text at 0x124ee3e10>
# Plot the difference between each reading and the previous one.
consumption_delta = df_total_watt['consumption'].diff()
pylab.plot(df_total_watt['datetime'], consumption_delta)
pylab.xlabel('time')
pylab.ylabel('consumption change (Wh)')
pylab.title('energy consumption change per 30mins')
<matplotlib.text.Text at 0x127621e50>
# Tag every reading with its calendar date so readings can be grouped by day.
df_total_watt['date'] = [dt.date() for dt in df_total_watt['datetime']]
# Sum only the consumption column. Summing the whole frame would also try to
# add up the 'datetime' column, which recent pandas versions reject with a
# TypeError instead of silently dropping it.
df_total_watt_daily = (
    df_total_watt.groupby('date')['consumption'].sum().reset_index()
)
df_total_watt_daily.head()
date | consumption | |
---|---|---|
0 | 2011-04-18 | 17105.982347 |
1 | 2011-04-19 | 30440.466453 |
2 | 2011-04-20 | 24027.226338 |
3 | 2011-04-21 | 17475.725746 |
4 | 2011-04-22 | 18776.278103 |
# One bar per day showing the total consumption for that date.
pylab.bar(df_total_watt_daily['date'], df_total_watt_daily['consumption'])
pylab.xlabel('date')
pylab.ylabel('consumption(Wh)')
pylab.title('energy consumption per day')
<matplotlib.text.Text at 0x128062590>
# Summary statistics for the per-day consumption totals.
df_total_watt_daily.consumption.describe()
count 35.000000 mean 23319.210323 std 14109.683226 min 8278.602258 25% 13794.652107 50% 18776.278103 75% 25235.158554 max 66411.835632 dtype: float64
The distribution is skewed: most days fall in the range of 10,000 to 25,000 Wh.
# Histogram of the daily totals using fixed-width consumption bins.
interval = 5000
consumption_ranges = range(0, 70000, interval)
daily_consumption = df_total_watt_daily['consumption']
# Each bin is keyed by its lower edge and covers [edge, edge + interval).
count = {
    lower_edge: int(
        (
            (daily_consumption >= lower_edge)
            & (daily_consumption < lower_edge + interval)
        ).sum()
    )
    for lower_edge in consumption_ranges
}
# The keys are lower bin edges, so anchor each bar at its left edge; with
# centred bars every bin would be drawn half an interval too far left.
# Lists are passed rather than dict views so that older matplotlib versions
# can convert them to arrays.
pylab.bar(
    list(count.keys()),
    list(count.values()),
    width=interval,
    align='edge',
)
pylab.ylabel('frequency (days)')
# The daily totals are in Wh, matching the other charts, not kWh.
pylab.xlabel('total consumption per day (Wh)')
pylab.title('distribution of energy consumption per day')
<matplotlib.text.Text at 0x1328e3150>
(*) As the chart above shows, daily consumption is not normally distributed, so the band within one standard deviation of the mean does not correspond to about 68% of the days in this case.
# Split the days into three bands around the mean daily consumption, using
# one standard deviation on either side as the cut-offs.
daily_mean = df_total_watt_daily.consumption.mean()
daily_std = df_total_watt_daily.consumption.std()
upper_bound = daily_mean + daily_std
lower_bound = daily_mean - daily_std
df_total_watt_daily_high = df_total_watt_daily[
    df_total_watt_daily.consumption > upper_bound
]
df_total_watt_daily_low = df_total_watt_daily[
    df_total_watt_daily.consumption < lower_bound
]
# The middle band includes both cut-off values.
df_total_watt_daily_middle = df_total_watt_daily[
    (df_total_watt_daily.consumption >= lower_bound)
    & (df_total_watt_daily.consumption <= upper_bound)
]
df_total_watt_daily_low
date | consumption | |
---|---|---|
16 | 2011-05-06 | 8278.602258 |
# Days more than one standard deviation above the mean daily consumption.
df_total_watt_daily_high
date | consumption | |
---|---|---|
5 | 2011-04-23 | 41551.530034 |
6 | 2011-04-24 | 58647.570168 |
12 | 2011-04-30 | 66411.835632 |
13 | 2011-05-01 | 42598.899117 |
32 | 2011-05-22 | 51413.203602 |
33 | 2011-05-23 | 40378.805967 |
# First rows of the days within one standard deviation of the mean.
df_total_watt_daily_middle.head()
date | consumption | |
---|---|---|
0 | 2011-04-18 | 17105.982347 |
1 | 2011-04-19 | 30440.466453 |
2 | 2011-04-20 | 24027.226338 |
3 | 2011-04-21 | 17475.725746 |
4 | 2011-04-22 | 18776.278103 |
# Redraw the daily totals, coloured by the band each day falls in.
middle = pylab.bar(
    df_total_watt_daily_middle['date'],
    df_total_watt_daily_middle['consumption'],
    color='green',
)
high = pylab.bar(
    df_total_watt_daily_high['date'],
    df_total_watt_daily_high['consumption'],
    color='red',
)
low = pylab.bar(
    df_total_watt_daily_low['date'],
    df_total_watt_daily_low['consumption'],
    color='blue',
)
pylab.xlabel('date')
pylab.ylabel('energy consumption per day (Wh)')
pylab.title('clustered energy consumption per day')
pylab.legend([middle, high, low], ['middle', 'high', 'low'])
<matplotlib.legend.Legend at 0x127d36550>
It looks like the logarithm of the consumption is more suitable for clustering.
# Daily totals in ascending order, plotted against their rank.
# DataFrame.sort(column) no longer exists in pandas; sort_values replaces it.
sorted_consumption = df_total_watt_daily.sort_values('consumption')['consumption']
# Pass the plain array so bars are placed by position, not by the shuffled
# index labels left over from sorting.
pylab.bar(range(len(sorted_consumption)), sorted_consumption.values)
pylab.ylabel('energy consumption per day (Wh)')
pylab.xlabel('order')
pylab.title('energy consumption per day ordered by consumption')
<matplotlib.text.Text at 0x131c79290>
# Same ordered chart as before, but with a logarithmic y axis.
# DataFrame.sort(column) no longer exists in pandas; sort_values replaces it.
sorted_consumption = df_total_watt_daily.sort_values('consumption')['consumption']
# Pass the plain array so bars are placed by position, not by the shuffled
# index labels left over from sorting.
pylab.bar(
    range(len(sorted_consumption)),
    sorted_consumption.values,
    log=True,
)
pylab.ylabel('energy consumption per day (Wh)')
pylab.xlabel('order')
pylab.title('energy consumption per day ordered by consumption')
<matplotlib.text.Text at 0x131a00310>
# Natural logarithm of each daily total. numpy.log is applied to the whole
# column at once; the bare name `log` only resolves when the notebook has
# star-imported pylab into the namespace.
df_total_watt_daily['consumption_log'] = numpy.log(df_total_watt_daily.consumption)
df_total_watt_daily['consumption_log'].describe()
count 35.000000 mean 9.915222 std 0.517556 min 9.021429 25% 9.531914 50% 9.840350 75% 10.135928 max 11.103631 dtype: float64
# Split the days into three bands again, this time with the cut-offs at one
# standard deviation either side of the mean of the log-consumption.
log_consumption = df_total_watt_daily['consumption_log']
log_mean = log_consumption.mean()
log_std = log_consumption.std()
log_upper = log_mean + log_std
log_lower = log_mean - log_std
df_total_watt_daily_log_high = df_total_watt_daily[log_consumption > log_upper]
df_total_watt_daily_log_low = df_total_watt_daily[log_consumption < log_lower]
# The middle band includes both cut-off values.
df_total_watt_daily_log_middle = df_total_watt_daily[
    (log_consumption >= log_lower) & (log_consumption <= log_upper)
]
# Daily totals on the original scale, coloured by their log-based band.
middle = pylab.bar(
    df_total_watt_daily_log_middle['date'],
    df_total_watt_daily_log_middle['consumption'],
    color='green',
)
high = pylab.bar(
    df_total_watt_daily_log_high['date'],
    df_total_watt_daily_log_high['consumption'],
    color='red',
)
low = pylab.bar(
    df_total_watt_daily_log_low['date'],
    df_total_watt_daily_log_low['consumption'],
    color='blue',
)
pylab.xlabel('date')
pylab.ylabel('energy consumption per day (Wh)')
pylab.title('clustered energy consumption per day')
pylab.legend([middle, high, low], ['middle', 'high', 'low'])
<matplotlib.legend.Legend at 0x12a023550>
Simply create three clusters by k-means
from scipy.cluster.vq import kmeans2, whiten

# Rescale the daily totals with scipy's whiten before clustering.
whitened = whiten(df_total_watt_daily['consumption'].values)
# Seed the three centroids at the 25th, 50th and 75th percentiles of the
# rescaled data, in ascending order.
initial_centroids = numpy.asarray(numpy.percentile(whitened, [25, 50, 75]))
centroids, labels = kmeans2(whitened, initial_centroids)
centroids, labels
(array([ 0.97785871, 1.70177426, 3.60740752]), array([0, 1, 1, 0, 1, 2, 2, 1, 1, 1, 0, 0, 2, 2, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 2, 2, 1]))
# Attach the k-means label to each day, then split the days by cluster.
# Labels 0, 1 and 2 are treated as the low, middle and high clusters.
df_total_watt_daily['labels'] = labels
cluster_labels = df_total_watt_daily['labels']
df_total_watt_daily_cluster_low = df_total_watt_daily[cluster_labels == 0]
df_total_watt_daily_cluster_middle = df_total_watt_daily[cluster_labels == 1]
df_total_watt_daily_cluster_high = df_total_watt_daily[cluster_labels == 2]
# Daily totals coloured by the k-means cluster each day was assigned to.
low = pylab.bar(
    df_total_watt_daily_cluster_low['date'],
    df_total_watt_daily_cluster_low['consumption'],
    color='blue',
)
middle = pylab.bar(
    df_total_watt_daily_cluster_middle['date'],
    df_total_watt_daily_cluster_middle['consumption'],
    color='green',
)
high = pylab.bar(
    df_total_watt_daily_cluster_high['date'],
    df_total_watt_daily_cluster_high['consumption'],
    color='red',
)
pylab.xlabel('date')
pylab.ylabel('energy consumption per day (Wh)')
pylab.title('clustered energy consumption per day')
pylab.legend([middle, high, low], ['middle', 'high', 'low'])
<matplotlib.legend.Legend at 0x12eb65610>