# -*- coding: utf-8 -*-
"""Exploration of 2012 bike-counter data, later combined with weather data.

This reads like a flattened interactive session: the bare expressions
(``bike_data[:5]``, ``counts_by_day`` ...) only display something when
evaluated interactively and are no-ops when run as a plain script.
"""
import matplotlib
import pandas as pd

# Display options to make tables wider and figures bigger.
pd.set_option('display.max_columns', 15)
# 'display.width' is the supported spelling of the deprecated
# 'display.line_width' option.
pd.set_option('display.width', 400)
# NOTE(review): 'display.mpl_style' was dropped from later pandas releases;
# confirm the pandas version this runs against still accepts it.
pd.set_option('display.mpl_style', 'default')
# Reach rcParams through the matplotlib module so the script does not depend
# on a pylab-style namespace having injected the bare name.
matplotlib.rcParams['figure.figsize'] = (14, 7)
# NOTE(review): 'normal' is passed as a font *family* here; verify that is
# intended (matplotlib may fall back to its default family).
font = {'family': 'normal', 'weight': 'bold', 'size': 22}
matplotlib.rc('font', **font)

# First look at the file using read_csv defaults; the result is replaced by
# the explicitly-configured read just below.
bike_data = pd.read_csv("./2012.csv")
bike_data[:5]

# Re-read with the file's encoding and separator, using the day-first 'Date'
# column as a parsed index.
bike_data = pd.read_csv("./2012.csv", encoding='latin1', sep=';',
                        index_col='Date', parse_dates=True, dayfirst=True)
# Drop every column that contains a missing value.
bike_data = bike_data.dropna(axis=1)
# Only use 3 of the columns so it all fits on the screen.
bike_data = bike_data[['Berri 1', u'Côte-Sainte-Catherine', 'Maisonneuve 1']]
bike_data[:5]

bike_data[:3]
bike_data.plot()
bike_data.median()
bike_data.median().plot(kind='bar')

# Column slice.
column_slice = bike_data[['Berri 1', 'Maisonneuve 1']]
# Row slice.
column_slice[:3]
column_slice.plot()

# Integer weekday of each index entry (pandas numbers Monday as 0).
bike_data['weekday'] = bike_data.index.weekday
bike_data.head()

# Total counts per weekday; .sum() is the direct spelling of
# aggregate(numpy.sum) and needs no numpy import.
counts_by_day = bike_data.groupby('weekday').sum()
counts_by_day
# Relabel the 0..6 group keys; assumes all seven weekdays occur in the data.
counts_by_day.index = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                       'Friday', 'Saturday', 'Sunday']
counts_by_day.plot()

bike_data['Berri 1'].plot()


def get_weather_data(year, station_id=5415):
    """Download and combine the twelve monthly weather CSVs for *year*.

    Args:
        year: Year substituted into the download URL.
        station_id: Station identifier substituted into the URL. Defaults
            to 5415 (airport station); 10761 is the McTavish station.

    Returns:
        A DataFrame indexed by the parsed 'Date/Time' column holding all
        months concatenated, with the 'Year', 'Day', 'Month', 'Time' and
        'Data Quality' columns removed and every column or row containing
        a missing value dropped.

    Performs one HTTP request per month through ``pd.read_csv``; any error
    raised by that call propagates to the caller.
    """
    url_template = (
        "http://climate.weather.gc.ca/climateData/bulkdata_e.html"
        "?format=csv&stationID={station_id}&Year={year}&Month={month}"
        "&timeframe=1&submit=Download+Data"
    )
    data_by_month = []
    for month in range(1, 13):
        url = url_template.format(
            station_id=station_id, year=year, month=month)
        # skiprows=16: presumably a metadata preamble ahead of the real
        # header row -- TODO confirm against a downloaded file.
        weather_data = pd.read_csv(
            url, skiprows=16, index_col='Date/Time', parse_dates=True,
        ).dropna(axis=1)
        # Strip the degree sign ('\xb0') from column names. A list is built
        # explicitly because map() only returns a list on Python 2.
        weather_data.columns = [
            name.replace('\xb0', '') for name in weather_data.columns
        ]
        weather_data = weather_data.drop(
            ['Year', 'Day', 'Month', 'Time', 'Data Quality'], axis=1)
        data_by_month.append(weather_data.dropna())
    # Concatenate, drop columns that are empty for the whole year, then drop
    # any row still containing a missing value.
    return pd.concat(data_by_month).dropna(axis=1, how='all').dropna()
# Join 2012 weather observations onto the bike counts and look for
# low-traffic summer days. Bare expressions below only display something
# when evaluated interactively.
weather_data = get_weather_data(2012)
weather_data[:5]

# Daily mean temperature, aligned onto bike_data by its date index.
# NOTE(review): resample(..., how=...) is the old-pandas spelling; later
# releases expect .resample('D').mean() -- confirm the target pandas version
# before changing it, since the two forms are not interchangeable.
bike_data['mean temp'] = weather_data['Temp (C)'].resample('D', how='mean')
bike_data.head()
bike_data[['Berri 1', 'mean temp']].plot(subplots=True)

# Fraction of each day's observations whose 'Weather' text contains 'Rain':
# the boolean match is turned into 0/1 and averaged per day.
bike_data['Rain'] = (
    weather_data['Weather']
    .str.contains('Rain')
    .map(int)
    .resample('D', how='mean')
)
bike_data[['Berri 1', 'Rain']].plot(subplots=True)

# Look at everything between May and September.
summertime_data = bike_data['2012-05-01':'2012-09-01']
summertime_data['Berri 1'][:5] < 2500

# Days in that window with fewer than 2500 counts at 'Berri 1'.
bad_days = summertime_data[summertime_data['Berri 1'] < 2500]
bad_days[['Berri 1', 'Rain', 'mean temp', 'weekday']]

julia = {'email': 'julia@jvns.ca', 'twitter': 'http://twitter.com/b0rk'}
# Single-argument print(...) emits the same text on Python 2 and Python 3,
# unlike the Python-2-only print statement.
print('Email: %s' % julia['email'])
print('Twitter: %s' % julia['twitter'])
print('Slides: http://bit.ly/pyconca-pandas')