# data is NOAA's hourly precipitation amounts for PHL measured in hundredths of an inch
# see documentation at ftp://ftp.ncdc.noaa.gov/pub/data/cdo/documentation/PRECIP_HLY_documentation.pdf

import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt

df = pd.read_csv('168853.csv', index_col = 'DATE', date_parser = lambda x: datetime.strptime(x, '%Y%m%d %H:%M'))

# rainfall for June - exclude measurement types:
# a - start of accumulation
# { - start of deleted period
# } - end of deleted period
# [ - start of missing period
# ] - end of missing period
# M - missing data
dfj = df[df.index.map(lambda dt: dt.month == 6) & (df['Measurement Flag'].map(lambda x: not x in ['a','{','}','[',']','M']))]
dfj.HPCP = dfj.HPCP * 0.254 # convert to mm

# bucket by year
by_year = dfj.groupby(dfj.index.map(lambda dt: dt.year)).aggregate({'HPCP' : np.sum})

# show min and max June rainfall years, in mm
pd.concat([by_year.ix[by_year.idxmin(axis=0)], by_year.ix[by_year.idxmax(axis=0)]])


# polynomial fit
p3 = np.poly1d(np.polyfit(by_year.index, by_year.HPCP, 3))
fig = plt.figure(figsize=(15,6))
lines = plt.plot(by_year.index, by_year.HPCP, '.', by_year.index, p3(by_year.index), '-')