# data is NOAA's hourly precipitation amounts for PHL measured in hundredths of an inch # see documentation at ftp://ftp.ncdc.noaa.gov/pub/data/cdo/documentation/PRECIP_HLY_documentation.pdf import pandas as pd import numpy as np from datetime import datetime import matplotlib.pyplot as plt df = pd.read_csv('168853.csv', index_col = 'DATE', date_parser = lambda x: datetime.strptime(x, '%Y%m%d %H:%M')) # rainfall for June - exclude measurement types: # a - start of accumulation # { - start of deleted period # } - end of deleted period # [ - start of missing period # ] - end of missing period # M - missing data dfj = df[df.index.map(lambda dt: dt.month == 6) & (df['Measurement Flag'].map(lambda x: not x in ['a','{','}','[',']','M']))] dfj.HPCP = dfj.HPCP * 0.254 # convert to mm # bucket by year by_year = dfj.groupby(dfj.index.map(lambda dt: dt.year)).aggregate({'HPCP' : np.sum}) # show min and max June rainfall years, in mm pd.concat([by_year.ix[by_year.idxmin(axis=0)], by_year.ix[by_year.idxmax(axis=0)]]) # polynomial fit p3 = np.poly1d(np.polyfit(by_year.index, by_year.HPCP, 3)) fig = plt.figure(figsize=(15,6)) lines = plt.plot(by_year.index, by_year.HPCP, '.', by_year.index, p3(by_year.index), '-')