# Exploratory analysis of the readings in total_watt.csv: the raw series,
# daily totals, a histogram of daily totals, and mean +/- one standard
# deviation splits of the days into low / middle / high usage (on the linear
# and on the log scale).
#
# Bare expressions such as ``df_total_watt.head()`` are kept for interactive
# (notebook / REPL) use, where their value is displayed; they have no effect
# when the file runs as a plain script.
import numpy
import pandas
import pylab

# The file has no header row, so column names are supplied here and the
# first column is parsed as timestamps.
df_total_watt = pandas.read_csv(
    '../../data/total_watt.csv',
    header=None,
    names=['datetime', 'consumption'],
    parse_dates=['datetime'],
)
df_total_watt
df_total_watt.head()
df_total_watt.consumption.describe()

# --- Raw series -----------------------------------------------------------
# Every plot opens its own figure so that successive plots do not pile up on
# a single set of axes when this runs outside a notebook.
pylab.figure()
pylab.plot(df_total_watt.datetime, df_total_watt.consumption)
pylab.ylabel('consumption (Wh)')
pylab.xlabel('time')
pylab.title('energy consumption per 30mins')

# Reading-to-reading change.
pylab.figure()
pylab.plot(df_total_watt.datetime, df_total_watt.consumption.diff())
pylab.ylabel('consumption change (Wh)')
pylab.xlabel('time')
pylab.title('energy consumption change per 30mins')

# --- Daily totals ---------------------------------------------------------
df_total_watt['date'] = df_total_watt['datetime'].dt.date
# Sum only the consumption column: the timestamp column cannot be summed,
# and recent pandas raises on it instead of silently dropping it.
df_total_watt_daily = (
    df_total_watt.groupby('date')['consumption'].sum().reset_index()
)
df_total_watt_daily.head()

pylab.figure()
pylab.bar(df_total_watt_daily.date, df_total_watt_daily.consumption)
pylab.ylabel('consumption(Wh)')
pylab.xlabel('date')
pylab.title('energy consumption per day')

df_total_watt_daily.consumption.describe()

# --- Histogram of daily totals --------------------------------------------
# count[lo] is the number of days whose total lies in [lo, lo + interval).
daily_consumption = df_total_watt_daily['consumption']
count = {}
interval = 5000
consumption_ranges = range(0, 70000, interval)
for consumption_range in consumption_ranges:
    in_bin = (
        (daily_consumption >= consumption_range)
        & (daily_consumption < consumption_range + interval)
    )
    count[consumption_range] = int(in_bin.sum())

pylab.figure()
# Keys are the lower edge of each bin, so bars must be edge-aligned to cover
# [lo, lo + interval); the default alignment would shift them half a bin.
pylab.bar(
    list(count.keys()),
    list(count.values()),
    width=interval,
    align='edge',
)
pylab.ylabel('frequency (days)')
# The axis shows the same daily totals that the other plots label in Wh.
pylab.xlabel('total consumption per day (Wh)')
pylab.title('distribution of energy consumption per day')

# --- Split days by mean +/- one standard deviation ------------------------
consumption_upper = daily_consumption.mean() + daily_consumption.std()
consumption_lower = daily_consumption.mean() - daily_consumption.std()

df_total_watt_daily_high = df_total_watt_daily[
    daily_consumption > consumption_upper
]
df_total_watt_daily_low = df_total_watt_daily[
    daily_consumption < consumption_lower
]
# Both bounds are inclusive for the middle band.
df_total_watt_daily_middle = df_total_watt_daily[
    (daily_consumption >= consumption_lower)
    & (daily_consumption <= consumption_upper)
]
df_total_watt_daily_low
df_total_watt_daily_high
df_total_watt_daily_middle.head()

pylab.figure()
middle = pylab.bar(
    df_total_watt_daily_middle.date,
    df_total_watt_daily_middle.consumption,
    color='green',
)
high = pylab.bar(
    df_total_watt_daily_high.date,
    df_total_watt_daily_high.consumption,
    color='red',
)
low = pylab.bar(
    df_total_watt_daily_low.date,
    df_total_watt_daily_low.consumption,
    color='blue',
)
pylab.ylabel('energy consumption per day (Wh)')
pylab.xlabel('date')
pylab.title('clustered energy consumption per day')
pylab.legend((middle, high, low), ('middle', 'high', 'low'))

# --- Daily totals in ascending order (linear, then log y-axis) ------------
# ``DataFrame.sort`` no longer exists in pandas; ``sort_values`` replaces it.
consumption_sorted = daily_consumption.sort_values()
day_rank = range(len(consumption_sorted))

pylab.figure()
pylab.bar(day_rank, consumption_sorted)
pylab.ylabel('energy consumption per day (Wh)')
pylab.xlabel('order')
pylab.title('energy consumption per day ordered by consumption')

pylab.figure()
pylab.bar(day_rank, consumption_sorted, log=True)
pylab.ylabel('energy consumption per day (Wh)')
pylab.xlabel('order')
pylab.title('energy consumption per day ordered by consumption')

# --- Same mean +/- std split, on the natural log of the daily total -------
df_total_watt_daily['consumption_log'] = numpy.log(
    df_total_watt_daily['consumption']
)
df_total_watt_daily['consumption_log'].describe()

log_consumption = df_total_watt_daily['consumption_log']
log_upper = log_consumption.mean() + log_consumption.std()
log_lower = log_consumption.mean() - log_consumption.std()

df_total_watt_daily_log_high = df_total_watt_daily[log_consumption > log_upper]
df_total_watt_daily_log_low = df_total_watt_daily[log_consumption < log_lower]
df_total_watt_daily_log_middle = df_total_watt_daily[
    (log_consumption >= log_lower) & (log_consumption <= log_upper)
]
# numpy is needed for the percentile seeds below; it is imported here, next
# to the scipy import, so this section does not rely on an interactive
# namespace providing it.
import numpy
from scipy.cluster.vq import kmeans2, whiten

# --- Log-scale low / middle / high plot -----------------------------------
# Keep only rows at or above mean - std of the log consumption; the upper
# bound was applied when df_total_watt_daily_log_middle was first built.
log_lower_bound = (
    df_total_watt_daily['consumption_log'].mean()
    - df_total_watt_daily['consumption_log'].std()
)
df_total_watt_daily_log_middle = df_total_watt_daily_log_middle[
    df_total_watt_daily_log_middle['consumption_log'] >= log_lower_bound
]

# A fresh figure keeps this plot from being drawn over earlier ones when the
# file runs outside a notebook.
pylab.figure()
middle = pylab.bar(
    df_total_watt_daily_log_middle.date,
    df_total_watt_daily_log_middle.consumption,
    color='green',
)
high = pylab.bar(
    df_total_watt_daily_log_high.date,
    df_total_watt_daily_log_high.consumption,
    color='red',
)
low = pylab.bar(
    df_total_watt_daily_log_low.date,
    df_total_watt_daily_log_low.consumption,
    color='blue',
)
pylab.ylabel('energy consumption per day (Wh)')
pylab.xlabel('date')
pylab.title('clustered energy consumption per day')
pylab.legend((middle, high, low), ('middle', 'high', 'low'))

# --- k-means clustering of the daily totals -------------------------------
# whiten() rescales the values to unit variance before clustering.
whitened = whiten(df_total_watt_daily.consumption.values)
# Seed the three centroids at the quartiles, in ascending order; the code
# below reads labels 0 / 1 / 2 as low / middle / high on that basis.
initial_centroids = numpy.percentile(whitened, [25, 50, 75])
centroids, labels = kmeans2(whitened, initial_centroids)
centroids, labels

df_total_watt_daily['labels'] = labels
df_total_watt_daily_cluster_low = df_total_watt_daily[
    df_total_watt_daily['labels'] == 0
]
df_total_watt_daily_cluster_middle = df_total_watt_daily[
    df_total_watt_daily['labels'] == 1
]
df_total_watt_daily_cluster_high = df_total_watt_daily[
    df_total_watt_daily['labels'] == 2
]

pylab.figure()
low = pylab.bar(
    df_total_watt_daily_cluster_low.date,
    df_total_watt_daily_cluster_low.consumption,
    color='blue',
)
middle = pylab.bar(
    df_total_watt_daily_cluster_middle.date,
    df_total_watt_daily_cluster_middle.consumption,
    color='green',
)
high = pylab.bar(
    df_total_watt_daily_cluster_high.date,
    df_total_watt_daily_cluster_high.consumption,
    color='red',
)
pylab.ylabel('energy consumption per day (Wh)')
pylab.xlabel('date')
pylab.title('clustered energy consumption per day')
pylab.legend((middle, high, low), ('middle', 'high', 'low'))