In [701]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from dateutil.parser import parse
from datetime import datetime, date, time

In [712]:

# df = pd.read_csv("data/Data.csv")
df.Date = pd.to_datetime(df.Date,utc='GMT')
#df = df.set_index('GMT')
print df.head()
df.tail()

                 Date  Temp  H_Pcnt  C_Pcnt  Temp_Feels  Precip  Radiation  \
0 2004-01-01 05:00:00  39.3      73       2        33.2       0          0   
1 2004-01-01 06:00:00  40.3      71       9        34.6       0          0   
2 2004-01-01 07:00:00  40.7      74      13        34.9       0          0   
3 2004-01-01 08:00:00  41.2      77      19        35.1       0          0   
4 2004-01-01 09:00:00  40.5      81      18        34.2       0          0   

   Wind  
0     9  
1     8  
2     9  
3    10  
4    10  

[5 rows x 8 columns]

Out[712]:

	Date	Temp	H_Pcnt	C_Pcnt	Temp_Feels	Wind
87668	2014-01-01 01:00:00	30.1	61	0	22.9	7
87669	2014-01-01 02:00:00	29.3	61	0	22.8	6
87670	2014-01-01 03:00:00	28.7	61	1	23.6	5
87671	2014-01-01 04:00:00	28.3	60	1	24.8	3
87672	2014-01-01 05:00:00	28.2	59	1	25.3	3

5 rows × 8 columns

In [709]:

# create a column which is only the month, day and hour of day. 
# This is used to find the average for each respective hour later
df['hour'] = df['Date'].apply(lambda t: "%d-%d-%d" % (t.month, t.day,t.hour))
#find average for each hour... This makes sense for all the columns except precip
g = df.groupby('hour').mean().reset_index()
# g = df.groupby('hour').agg({'Temp' : np.mean,'H_Pcnt' : np.mean,'C_Pcnt' : np.mean,'Temp_Feels' : np.mean,'Precip' : np.sum,'Radiation' : np.mean,'Wind' : np.mean}).reset_index()
g

Out[709]:

	hour	Temp	H_Pcnt	C_Pcnt	Temp_Feels	Precip	Radiation	Wind
0	1-1-0	38.230000	82.000000	54.6	33.180000	0.003000	0.0	8.100000
1	1-1-1	38.260000	82.600000	51.8	32.700000	0.007000	0.0	8.700000
2	1-1-10	38.930000	86.200000	65.6	32.920000	0.012000	0.0	9.400000
3	1-1-11	38.740000	86.900000	64.8	32.600000	0.009000	0.0	9.300000
4	1-1-12	38.840000	87.900000	53.6	34.080000	0.009000	0.0	7.000000
5	1-1-13	41.440000	83.500000	53.5	36.550000	0.011000	4.5	8.000000
6	1-1-14	44.000000	78.000000	56.0	39.080000	0.005000	78.4	9.500000
7	1-1-15	46.070000	72.300000	55.9	41.250000	0.007000	188.1	10.400000
8	1-1-16	47.120000	69.300000	55.4	42.470000	0.007000	266.0	10.800000
9	1-1-17	47.660000	67.500000	55.5	43.230000	0.004000	302.9	10.700000
10	1-1-18	49.160000	66.900000	55.5	46.040000	0.004000	299.9	8.200000
11	1-1-19	48.300000	69.600000	56.7	45.450000	0.001000	255.8	7.600000
12	1-1-2	38.290000	82.900000	52.9	32.520000	0.006000	0.0	9.100000
13	1-1-20	46.320000	74.500000	57.3	43.460000	0.001000	177.2	7.300000
14	1-1-21	42.680000	79.600000	57.9	38.660000	0.002000	85.9	8.300000
15	1-1-22	41.830000	80.200000	58.2	36.940000	0.003000	0.0	9.400000
16	1-1-23	41.070000	80.200000	58.3	35.540000	0.002000	0.0	10.000000
17	1-1-3	38.340000	83.100000	54.9	32.670000	0.007000	0.0	9.200000
18	1-1-4	38.280000	83.300000	54.3	32.520000	0.008000	0.0	9.200000
19	1-1-5	38.272727	82.454545	49.0	32.472727	0.004545	0.0	9.181818
20	1-1-6	38.900000	85.500000	57.1	33.570000	0.005000	0.0	8.300000
21	1-1-7	39.080000	85.200000	62.8	33.470000	0.009000	0.0	9.000000
22	1-1-8	39.310000	85.100000	65.4	33.760000	0.009000	0.0	9.000000
23	1-1-9	39.190000	85.600000	66.4	33.610000	0.012000	0.0	9.300000
24	1-10-0	36.190000	73.000000	57.3	29.870000	0.000000	0.0	8.900000
	...	...	...	...	...	...	...	...

8784 rows × 8 columns

In [704]:

# Do a left merge with the original dataset
new_df = pd.merge(left=df, right=g, on='hour', suffixes=('','_avg') )
new_df = new_df.set_index('Date')
new_df.head()

Out[704]:

	Temp	H_Pcnt	C_Pcnt	Temp_Feels	Precip	Radiation	Wind	hour	Temp_avg	H_Pcnt_avg	C_Pcnt_avg	Temp_Feels_avg	Precip_avg	Radiation_avg	Wind_avg
Date
2004-01-01 05:00:00	39.3	73	2	33.2	0.00	0	9	1-1-5	38.272727	82.454545	49	32.472727	0.004545	0	9.181818
2005-01-01 05:00:00	49.4	94	51	46.0	0.00	0	8	1-1-5	38.272727	82.454545	49	32.472727	0.004545	0	9.181818
2006-01-01 05:00:00	40.7	85	56	35.1	0.00	0	9	1-1-5	38.272727	82.454545	49	32.472727	0.004545	0	9.181818
2007-01-01 05:00:00	50.1	96	99	44.4	0.05	0	17	1-1-5	38.272727	82.454545	49	32.472727	0.004545	0	9.181818
2008-01-01 05:00:00	39.7	97	5	32.6	0.00	0	12	1-1-5	38.272727	82.454545	49	32.472727	0.004545	0	9.181818

5 rows × 15 columns

In [705]:

# I don't need the dates before 2009 so I remove them here.
# I kept them till now in order to calculate the averages
test = []

def drop_dates(indexes):
    for index in indexes:
        if index < datetime(2009,5,1):
            test.append(index)
    return test
trimmed = new_df.drop(drop_dates(new_df.index)).sort_index().drop('hour',axis=1)
#trimmed = trimmed.drop('date', 1)
print trimmed.head()

trimmed.Precip.plot()

                     Temp  H_Pcnt  C_Pcnt  Temp_Feels  Precip  Radiation  \
Date                                                                       
2009-05-01 00:00:00  57.0      89     100        55.6       0          0   
2009-05-01 01:00:00  56.5      88      92        54.5       0          0   
2009-05-01 02:00:00  56.0      88      88        53.8       0          0   
2009-05-01 03:00:00  56.8      86      88        54.5       0          0   
2009-05-01 04:00:00  57.1      85      90        54.7       0          0   

                     Wind  Temp_avg  H_Pcnt_avg  C_Pcnt_avg  Temp_Feels_avg  \
Date                                                                          
2009-05-01 00:00:00     7     59.32        73.5        75.2           59.23   
2009-05-01 01:00:00     8     58.33        73.2        71.7           57.30   
2009-05-01 02:00:00     9     56.90        74.4        67.9           55.53   
2009-05-01 03:00:00    10     56.26        74.2        66.1           54.63   
2009-05-01 04:00:00    10     55.46        74.1        64.6           53.57   

                     Precip_avg  Radiation_avg  Wind_avg  
Date                                                      
2009-05-01 00:00:00           0              0       6.9  
2009-05-01 01:00:00           0              0       7.5  
2009-05-01 02:00:00           0              0       7.8  
2009-05-01 03:00:00           0              0       8.5  
2009-05-01 04:00:00           0              0       8.5  

[5 rows x 14 columns]

Out[705]:

<matplotlib.axes.AxesSubplot at 0x158a0b790>

In [706]:

#pd.write_csv("data/aggedWeather.csv")
trimmed.to_csv("aggedWeather.csv")

In [706]:

In [707]:

# print trimmed.columns
# trimmed.drop()

Index([u'Temp', u'H_Pcnt', u'C_Pcnt', u'Temp_Feels', u'Precip', u'Radiation', u'Wind', u'Temp_avg', u'H_Pcnt_avg', u'C_Pcnt_avg', u'Temp_Feels_avg', u'Precip_avg', u'Radiation_avg', u'Wind_avg'], dtype='object')

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-707-f146ea3104eb> in <module>()
      1 print trimmed.columns
----> 2 trimmed.drop()

TypeError: drop() takes at least 2 arguments (1 given)

In [ ]:

# grouped = df.groupby("Date")
# #grouped['Temp'].agg([np.sum, np.mean, np.std])
# agged = grouped.agg({'Temp' : np.mean,
#              'Humidity_Pcnt' : np.mean,
#              'Clouds_Pcnt' : np.mean,
#              'Temp_Feels' : np.mean,
#              'Precip' : np.sum,
#              'Radiation' : np.mean,
#              'Wind' : np.mean})

# agged = np.round(agged,1)
# agged['Year'] = "holder"
# agged['Month'] = "holder"
# agged['Day'] = "holder"
# agged['Weekday'] = "holder"

# #slimmed = agged.drop(lambda x: "2004" is x)
# #slimmed

# dates = []
# def dropDates():
#     for dateString in agged.index:
#         date = parse(dateString)
#         if date < parse("2009-05-01"): 
#             dates.append(dateString)
#         else:
#             agged.Year[dateString] = date.year
#             agged.Month[dateString] = date.month
#             agged.Weekday[dateString] = date.weekday()
#             agged.Day[dateString] = date.day
            
            



# dropDates()
# agged = agged.drop(dates)

# agged.head()