from __future__ import division, print_function from IPython.display import HTML, Image, display as disp with open("../css/css.css", "r") as f: style = f.read() HTML(style) !./shortlog.sh ../../pandas disp(Image("../img/m.jpg")) disp(Image("../img/u.jpg")) # imports that I'll use throughout the talk import numpy as np import pandas as pd from pandas import DataFrame, Series, Index from numpy.random import randn, randint, rand, choice import matplotlib.pyplot as plt pd.options.display.max_rows = 10 try: from mpltools import style style.use('ggplot') except ImportError: pass # because of our bg color plt.rc('text', color='white') plt.rc('axes', labelcolor='white') plt.rc('xtick', color='white') plt.rc('ytick', color='white') %matplotlib inline df = pd.DataFrame(randn(10, 2), columns=list('ab')) df # if you have boto installed and have set up credentials # df = pd.read_csv('s3://nypug/tips.csv') df = pd.read_csv('https://s3.amazonaws.com/nyqpug/tips.csv') df df.dtypes # column access by name df['day'] # ⟵ that's a Series object # by attribute df.time df.time.value_counts() # multiple columns df[['tip', 'sex']] t = df.set_index('day') t.head() t.loc['Sun'] df.loc[df.day == 'Sun'] df.loc[:, 'smoker'] df.iloc[:, 3] # same as df.loc[:, 'smoker'] df['pct_tip'] = df.tip / df.total_bill df['avg_price'] = df.total_bill / df.size df.avg_price.hist(bins=20) del df['avg_price'] del df['pct_tip'] # multiple columns, multiple rows df.loc[[0, 2], ['sex', 'tip']] df.loc[:10, ['total_bill', 'tip']] # note this is inclusive # and with iloc df.iloc[:5] # exclusive endpoints # np.ptp is peak-to-peak difference, i.e., range df[['total_bill', 'tip', 'size']].apply(np.ptp) days = ['Sun', 'Mon', 'Tues', 'Wed', 'Thur', 'Fri', 'Sat'] df.day.map(days.index) df.sum() # whoa! + is defined for strings df.sum(numeric_only=True) df.count() / df.shape[0] # we don't have any nans df.var() df.mean() df.head() gb = df.groupby('sex') gb.mean() gb = df.groupby(['sex', 'smoker']) gb.std() # can pass multiple reducers to agg gb.agg(['mean', 'std', 'median']) def stdize(x): return (x - x.mean()) / x.std() df['tb_std'] = gb.total_bill.apply(stdize) df ax = df.tb_std.plot(kind='kde', lw=3) df.tb_std.hist(ax=ax, normed=True) ax.set_xlabel('$z$-score', fontsize=20) ax.set_title(r'Total Bill Stdized Across Sex$\times$ Smoker') ax.axis('tight') plt.gcf().tight_layout() # create some frames n = 2000 n2 = n // 2 visits = DataFrame({'page_visits_per_day': np.random.poisson(10, size=n), 'user_id': randint(9, size=n)}) likes = DataFrame({'likes_per_day': np.random.poisson(30, size=n2), 'user_id': randint(6, size=n2)}) visits likes merg = pd.merge(visits, likes) merg.sort('user_id') s = df.day s is_weekend = s.str.startswith('S') is_weekend correct = s[is_weekend].str.contains(r'^(?:Sat|Sun)$') correct.all() s.str.len() s.str[:2] n = 10000 idx = pd.date_range(start='today', periods=n, freq='D') idx s = Series(np.random.poisson(10, size=n), index=idx, name='login_count') s s.resample('W', how='sum') # multiple functions rs = s.resample('W', how=['mean', 'std', 'count']) rs mu = rs['mean'] fig, ax = plt.subplots(figsize=(12, 6)) ax.step(mu.index[:50], mu.iloc[:50], lw=3, where='post') fig.tight_layout() ax.set_xlabel('Time') ax.set_ylabel('Average Count / Week') ax.set_title('Count vs. Time') %%writefile tmp.csv a,b,c 1,d,3.0 4,e,6.28 2,f,4.4 df = pd.read_csv('tmp.csv') df df.dtypes df.to_csv('tmp.csv') %cat tmp.csv # saves the index created by default df = DataFrame(dict(a=randn(10), b=randint(10, size=10), c=choice(list('abc'), size=10))) df df.to_hdf('tmp.h5', 'df', format='table') pd.read_hdf('tmp.h5', 'df') !ptdump tmp.h5