import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from dateutil.parser import parse
from datetime import datetime, date, time
# df = pd.read_csv("data/Data.csv")
df.Date = pd.to_datetime(df.Date,utc='GMT')
#df = df.set_index('GMT')
print df.head()
df.tail()
Date Temp H_Pcnt C_Pcnt Temp_Feels Precip Radiation \ 0 2004-01-01 05:00:00 39.3 73 2 33.2 0 0 1 2004-01-01 06:00:00 40.3 71 9 34.6 0 0 2 2004-01-01 07:00:00 40.7 74 13 34.9 0 0 3 2004-01-01 08:00:00 41.2 77 19 35.1 0 0 4 2004-01-01 09:00:00 40.5 81 18 34.2 0 0 Wind 0 9 1 8 2 9 3 10 4 10 [5 rows x 8 columns]
Date | Temp | H_Pcnt | C_Pcnt | Temp_Feels | Precip | Radiation | Wind | |
---|---|---|---|---|---|---|---|---|
87668 | 2014-01-01 01:00:00 | 30.1 | 61 | 0 | 22.9 | 0 | 0 | 7 |
87669 | 2014-01-01 02:00:00 | 29.3 | 61 | 0 | 22.8 | 0 | 0 | 6 |
87670 | 2014-01-01 03:00:00 | 28.7 | 61 | 1 | 23.6 | 0 | 0 | 5 |
87671 | 2014-01-01 04:00:00 | 28.3 | 60 | 1 | 24.8 | 0 | 0 | 3 |
87672 | 2014-01-01 05:00:00 | 28.2 | 59 | 1 | 25.3 | 0 | 0 | 3 |
5 rows × 8 columns
# create a column which is only the month, day and hour of day.
# This is used to find the average for each respective hour later
df['hour'] = df['Date'].apply(lambda t: "%d-%d-%d" % (t.month, t.day,t.hour))
#find average for each hour... This makes sense for all the columns except precip
g = df.groupby('hour').mean().reset_index()
# g = df.groupby('hour').agg({'Temp' : np.mean,'H_Pcnt' : np.mean,'C_Pcnt' : np.mean,'Temp_Feels' : np.mean,'Precip' : np.sum,'Radiation' : np.mean,'Wind' : np.mean}).reset_index()
g
hour | Temp | H_Pcnt | C_Pcnt | Temp_Feels | Precip | Radiation | Wind | |
---|---|---|---|---|---|---|---|---|
0 | 1-1-0 | 38.230000 | 82.000000 | 54.6 | 33.180000 | 0.003000 | 0.0 | 8.100000 |
1 | 1-1-1 | 38.260000 | 82.600000 | 51.8 | 32.700000 | 0.007000 | 0.0 | 8.700000 |
2 | 1-1-10 | 38.930000 | 86.200000 | 65.6 | 32.920000 | 0.012000 | 0.0 | 9.400000 |
3 | 1-1-11 | 38.740000 | 86.900000 | 64.8 | 32.600000 | 0.009000 | 0.0 | 9.300000 |
4 | 1-1-12 | 38.840000 | 87.900000 | 53.6 | 34.080000 | 0.009000 | 0.0 | 7.000000 |
5 | 1-1-13 | 41.440000 | 83.500000 | 53.5 | 36.550000 | 0.011000 | 4.5 | 8.000000 |
6 | 1-1-14 | 44.000000 | 78.000000 | 56.0 | 39.080000 | 0.005000 | 78.4 | 9.500000 |
7 | 1-1-15 | 46.070000 | 72.300000 | 55.9 | 41.250000 | 0.007000 | 188.1 | 10.400000 |
8 | 1-1-16 | 47.120000 | 69.300000 | 55.4 | 42.470000 | 0.007000 | 266.0 | 10.800000 |
9 | 1-1-17 | 47.660000 | 67.500000 | 55.5 | 43.230000 | 0.004000 | 302.9 | 10.700000 |
10 | 1-1-18 | 49.160000 | 66.900000 | 55.5 | 46.040000 | 0.004000 | 299.9 | 8.200000 |
11 | 1-1-19 | 48.300000 | 69.600000 | 56.7 | 45.450000 | 0.001000 | 255.8 | 7.600000 |
12 | 1-1-2 | 38.290000 | 82.900000 | 52.9 | 32.520000 | 0.006000 | 0.0 | 9.100000 |
13 | 1-1-20 | 46.320000 | 74.500000 | 57.3 | 43.460000 | 0.001000 | 177.2 | 7.300000 |
14 | 1-1-21 | 42.680000 | 79.600000 | 57.9 | 38.660000 | 0.002000 | 85.9 | 8.300000 |
15 | 1-1-22 | 41.830000 | 80.200000 | 58.2 | 36.940000 | 0.003000 | 0.0 | 9.400000 |
16 | 1-1-23 | 41.070000 | 80.200000 | 58.3 | 35.540000 | 0.002000 | 0.0 | 10.000000 |
17 | 1-1-3 | 38.340000 | 83.100000 | 54.9 | 32.670000 | 0.007000 | 0.0 | 9.200000 |
18 | 1-1-4 | 38.280000 | 83.300000 | 54.3 | 32.520000 | 0.008000 | 0.0 | 9.200000 |
19 | 1-1-5 | 38.272727 | 82.454545 | 49.0 | 32.472727 | 0.004545 | 0.0 | 9.181818 |
20 | 1-1-6 | 38.900000 | 85.500000 | 57.1 | 33.570000 | 0.005000 | 0.0 | 8.300000 |
21 | 1-1-7 | 39.080000 | 85.200000 | 62.8 | 33.470000 | 0.009000 | 0.0 | 9.000000 |
22 | 1-1-8 | 39.310000 | 85.100000 | 65.4 | 33.760000 | 0.009000 | 0.0 | 9.000000 |
23 | 1-1-9 | 39.190000 | 85.600000 | 66.4 | 33.610000 | 0.012000 | 0.0 | 9.300000 |
24 | 1-10-0 | 36.190000 | 73.000000 | 57.3 | 29.870000 | 0.000000 | 0.0 | 8.900000 |
... | ... | ... | ... | ... | ... | ... | ... |
8784 rows × 8 columns
# Do a left merge with the original dataset
new_df = pd.merge(left=df, right=g, on='hour', suffixes=('','_avg') )
new_df = new_df.set_index('Date')
new_df.head()
Temp | H_Pcnt | C_Pcnt | Temp_Feels | Precip | Radiation | Wind | hour | Temp_avg | H_Pcnt_avg | C_Pcnt_avg | Temp_Feels_avg | Precip_avg | Radiation_avg | Wind_avg | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Date | |||||||||||||||
2004-01-01 05:00:00 | 39.3 | 73 | 2 | 33.2 | 0.00 | 0 | 9 | 1-1-5 | 38.272727 | 82.454545 | 49 | 32.472727 | 0.004545 | 0 | 9.181818 |
2005-01-01 05:00:00 | 49.4 | 94 | 51 | 46.0 | 0.00 | 0 | 8 | 1-1-5 | 38.272727 | 82.454545 | 49 | 32.472727 | 0.004545 | 0 | 9.181818 |
2006-01-01 05:00:00 | 40.7 | 85 | 56 | 35.1 | 0.00 | 0 | 9 | 1-1-5 | 38.272727 | 82.454545 | 49 | 32.472727 | 0.004545 | 0 | 9.181818 |
2007-01-01 05:00:00 | 50.1 | 96 | 99 | 44.4 | 0.05 | 0 | 17 | 1-1-5 | 38.272727 | 82.454545 | 49 | 32.472727 | 0.004545 | 0 | 9.181818 |
2008-01-01 05:00:00 | 39.7 | 97 | 5 | 32.6 | 0.00 | 0 | 12 | 1-1-5 | 38.272727 | 82.454545 | 49 | 32.472727 | 0.004545 | 0 | 9.181818 |
5 rows × 15 columns
# I don't need the dates before 2009 so I remove them here.
# I kept them till now in order to calculate the averages
test = []
def drop_dates(indexes):
for index in indexes:
if index < datetime(2009,5,1):
test.append(index)
return test
trimmed = new_df.drop(drop_dates(new_df.index)).sort_index().drop('hour',axis=1)
#trimmed = trimmed.drop('date', 1)
print trimmed.head()
trimmed.Precip.plot()
Temp H_Pcnt C_Pcnt Temp_Feels Precip Radiation \ Date 2009-05-01 00:00:00 57.0 89 100 55.6 0 0 2009-05-01 01:00:00 56.5 88 92 54.5 0 0 2009-05-01 02:00:00 56.0 88 88 53.8 0 0 2009-05-01 03:00:00 56.8 86 88 54.5 0 0 2009-05-01 04:00:00 57.1 85 90 54.7 0 0 Wind Temp_avg H_Pcnt_avg C_Pcnt_avg Temp_Feels_avg \ Date 2009-05-01 00:00:00 7 59.32 73.5 75.2 59.23 2009-05-01 01:00:00 8 58.33 73.2 71.7 57.30 2009-05-01 02:00:00 9 56.90 74.4 67.9 55.53 2009-05-01 03:00:00 10 56.26 74.2 66.1 54.63 2009-05-01 04:00:00 10 55.46 74.1 64.6 53.57 Precip_avg Radiation_avg Wind_avg Date 2009-05-01 00:00:00 0 0 6.9 2009-05-01 01:00:00 0 0 7.5 2009-05-01 02:00:00 0 0 7.8 2009-05-01 03:00:00 0 0 8.5 2009-05-01 04:00:00 0 0 8.5 [5 rows x 14 columns]
<matplotlib.axes.AxesSubplot at 0x158a0b790>
#pd.write_csv("data/aggedWeather.csv")
trimmed.to_csv("aggedWeather.csv")
# print trimmed.columns
# trimmed.drop()
Index([u'Temp', u'H_Pcnt', u'C_Pcnt', u'Temp_Feels', u'Precip', u'Radiation', u'Wind', u'Temp_avg', u'H_Pcnt_avg', u'C_Pcnt_avg', u'Temp_Feels_avg', u'Precip_avg', u'Radiation_avg', u'Wind_avg'], dtype='object')
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-707-f146ea3104eb> in <module>() 1 print trimmed.columns ----> 2 trimmed.drop() TypeError: drop() takes at least 2 arguments (1 given)
# grouped = df.groupby("Date")
# #grouped['Temp'].agg([np.sum, np.mean, np.std])
# agged = grouped.agg({'Temp' : np.mean,
# 'Humidity_Pcnt' : np.mean,
# 'Clouds_Pcnt' : np.mean,
# 'Temp_Feels' : np.mean,
# 'Precip' : np.sum,
# 'Radiation' : np.mean,
# 'Wind' : np.mean})
# agged = np.round(agged,1)
# agged['Year'] = "holder"
# agged['Month'] = "holder"
# agged['Day'] = "holder"
# agged['Weekday'] = "holder"
# #slimmed = agged.drop(lambda x: "2004" is x)
# #slimmed
# dates = []
# def dropDates():
# for dateString in agged.index:
# date = parse(dateString)
# if date < parse("2009-05-01"):
# dates.append(dateString)
# else:
# agged.Year[dateString] = date.year
# agged.Month[dateString] = date.month
# agged.Weekday[dateString] = date.weekday()
# agged.Day[dateString] = date.day
# dropDates()
# agged = agged.drop(dates)
# agged.head()