from __future__ import division, print_function
from IPython.display import HTML, Image, display as disp
# Load the talk's custom stylesheet; the bare HTML(...) expression at the
# end of the cell is what actually renders it in the notebook.
with open("../css/css.css", "r") as css_file:
    style = css_file.read()
HTML(style)
!./shortlog.sh ../../pandas
2985 Wes McKinney 1149 jreback 765 y-p 607 Chang She 323 Phillip Cloud 315 Adam Klein 120 Jeffrey Tratner 109 Vytautas Jancauskas 89 Joris Van den Bossche 84 Andy Hayden
pandas fills the need for a Python library that is awesome at data munging. With pandas you don't need to go outside of Python to do data analysis. You can write a web app using django and have data dashboards that are backed by pandas.
pandas
is the library for tabular data you've always wished you had. It supports operations such as
groupby
merge
mean
, sum
, any
, all
, etc.)

pandas was designed to work with "medium" data, i.e., data that fit into memory.

disp(Image("../img/m.jpg"))
disp(Image("../img/u.jpg"))
pip install sh
urllib
et al. are less than desirable

# imports that I'll use throughout the talk
import numpy as np
import pandas as pd
from pandas import DataFrame, Series, Index
from numpy.random import randn, randint, rand, choice
import matplotlib.pyplot as plt
pd.options.display.max_rows = 10
try:
from mpltools import style
style.use('ggplot')
except ImportError:
pass
# because of our bg color
plt.rc('text', color='white')
plt.rc('axes', labelcolor='white')
plt.rc('xtick', color='white')
plt.rc('ytick', color='white')
%matplotlib inline
DataFrame
is the flagship data structure in pandas
df = pd.DataFrame(randn(10, 2), columns=list('ab'))
df
a | b | |
---|---|---|
0 | 1.525911 | -1.536062 |
1 | -0.773639 | 0.786538 |
2 | 0.673406 | 0.336026 |
3 | -0.778567 | 0.557999 |
4 | 0.932137 | -1.204426 |
5 | 0.139991 | 0.406078 |
6 | -0.687879 | -0.497632 |
7 | 0.271935 | -0.287440 |
8 | 0.240098 | 1.125080 |
9 | 0.235167 | -0.532541 |
# if you have boto installed and have set up credentials
# df = pd.read_csv('s3://nyqpug/tips.csv')
df = pd.read_csv('https://s3.amazonaws.com/nyqpug/tips.csv')
df
total_bill | tip | sex | smoker | day | time | size | |
---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
... | ... | ... | ... | ... | ... | ... | ... |
239 | 29.03 | 5.92 | Male | No | Sat | Dinner | 3 |
240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 |
241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 |
242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 |
243 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 |
244 rows × 7 columns
df.dtypes
total_bill float64 tip float64 sex object smoker object day object time object size int64 dtype: object
One of pandas' most useful and powerful features is its ability to slice and dice data in almost any way you can think of.

# column access by name
df['day'] # ⟵ that's a Series object
0 Sun 1 Sun 2 Sun ... 241 Sat 242 Sat 243 Thur Name: day, Length: 244, dtype: object
# by attribute
df.time
0 Dinner 1 Dinner 2 Dinner ... 241 Dinner 242 Dinner 243 Dinner Name: time, Length: 244, dtype: object
df.time.value_counts()
Dinner 176 Lunch 68 dtype: int64
# multiple columns
df[['tip', 'sex']]
tip | sex | |
---|---|---|
0 | 1.01 | Female |
1 | 1.66 | Male |
2 | 3.50 | Male |
3 | 3.31 | Male |
4 | 3.61 | Female |
... | ... | ... |
239 | 5.92 | Male |
240 | 2.00 | Female |
241 | 2.00 | Male |
242 | 1.75 | Male |
243 | 3.00 | Female |
244 rows × 2 columns
t = df.set_index('day')
t.head()
total_bill | tip | sex | smoker | time | size | |
---|---|---|---|---|---|---|
day | ||||||
Sun | 16.99 | 1.01 | Female | No | Dinner | 2 |
Sun | 10.34 | 1.66 | Male | No | Dinner | 3 |
Sun | 21.01 | 3.50 | Male | No | Dinner | 3 |
Sun | 23.68 | 3.31 | Male | No | Dinner | 2 |
Sun | 24.59 | 3.61 | Female | No | Dinner | 4 |
t.loc['Sun']
df.loc[df.day == 'Sun']
total_bill | tip | sex | smoker | day | time | size | |
---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
... | ... | ... | ... | ... | ... | ... | ... |
186 | 20.90 | 3.50 | Female | Yes | Sun | Dinner | 3 |
187 | 30.46 | 2.00 | Male | Yes | Sun | Dinner | 5 |
188 | 18.15 | 3.50 | Female | Yes | Sun | Dinner | 3 |
189 | 23.10 | 4.00 | Male | Yes | Sun | Dinner | 3 |
190 | 15.69 | 1.50 | Male | Yes | Sun | Dinner | 2 |
76 rows × 7 columns
df.loc[:, 'smoker']
0 No 1 No 2 No ... 241 Yes 242 No 243 No Name: smoker, Length: 244, dtype: object
df.iloc[:, 3] # same as df.loc[:, 'smoker']
0 No 1 No 2 No ... 241 Yes 242 No 243 No Name: smoker, Length: 244, dtype: object
# Tip as a fraction of the bill.
df['pct_tip'] = df.tip / df.total_bill
# Average price per person: use df['size'], NOT df.size. ``size`` is a
# built-in DataFrame attribute (total number of elements, rows * cols),
# so attribute access shadows the 'size' column and would divide every
# bill by a single scalar instead of by the party size.
df['avg_price'] = df.total_bill / df['size']
df.avg_price.hist(bins=20)
<matplotlib.axes._subplots.AxesSubplot at 0x4ae7a50>
del df['avg_price']
del df['pct_tip']
# multiple columns, multiple rows
df.loc[[0, 2], ['sex', 'tip']]
sex | tip | |
---|---|---|
0 | Female | 1.01 |
2 | Male | 3.50 |
df.loc[:10, ['total_bill', 'tip']] # note this is inclusive
total_bill | tip | |
---|---|---|
0 | 16.99 | 1.01 |
1 | 10.34 | 1.66 |
2 | 21.01 | 3.50 |
3 | 23.68 | 3.31 |
4 | 24.59 | 3.61 |
... | ... | ... |
6 | 8.77 | 2.00 |
7 | 26.88 | 3.12 |
8 | 15.04 | 1.96 |
9 | 14.78 | 3.23 |
10 | 10.27 | 1.71 |
11 rows × 2 columns
# and with iloc
df.iloc[:5] # exclusive endpoints
total_bill | tip | sex | smoker | day | time | size | |
---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
Apply a function over the columns of a DataFrame
# np.ptp is peak-to-peak difference, i.e., range
df[['total_bill', 'tip', 'size']].apply(np.ptp)
total_bill 47.74 tip 9.00 size 5.00 dtype: float64
# Map day names to their position in the week (Sun == 0).
days = ['Sun', 'Mon', 'Tues', 'Wed', 'Thur', 'Fri', 'Sat']
# A dict gives O(1) lookup per row instead of list.index's linear scan.
# (Unlike list.index, an unknown label maps to NaN rather than raising —
# all labels in this column are present, so results are identical here.)
day_codes = {day: i for i, day in enumerate(days)}
df.day.map(day_codes)
0 0 1 0 2 0 ... 241 6 242 6 243 4 Name: day, Length: 244, dtype: int64
For example, the DataFrame.sum()
method results in a Series
object and by default sums across the rows. If you want to sum across the columns pass axis=1
, similar to the numpy
convention
In general, reduction operations result in a Series
object.
Some example reduction functions:
mean
, median
, and mode
count
std
, var
NOTE: These ignore NaN
s
df.sum() # whoa! + is defined for strings
total_bill 4827.77 tip 731.58 sex FemaleMaleMaleMaleFemaleMaleMaleMaleMaleMaleMa... smoker NoNoNoNoNoNoNoNoNoNoNoNoNoNoNoNoNoNoNoNoNoNoNo... day SunSunSunSunSunSunSunSunSunSunSunSunSunSunSunS... time DinnerDinnerDinnerDinnerDinnerDinnerDinnerDinn... size 627 dtype: object
df.sum(numeric_only=True)
total_bill 4827.77 tip 731.58 size 627.00 dtype: float64
df.count() / df.shape[0] # we don't have any nans
total_bill 1 tip 1 sex 1 smoker 1 day 1 time 1 size 1 dtype: float64
df.var()
total_bill 79.252939 tip 1.914455 size 0.904591 dtype: float64
df.mean()
total_bill 19.785943 tip 2.998279 size 2.569672 dtype: float64
pandas' GroupBy functionality allows you to perform operations on subsets of a DataFrame and then combines the results for you at the end.
df.head()
total_bill | tip | sex | smoker | day | time | size | |
---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
gb = df.groupby('sex')
gb.mean()
total_bill | tip | size | |
---|---|---|---|
sex | |||
Female | 18.056897 | 2.833448 | 2.459770 |
Male | 20.744076 | 3.089618 | 2.630573 |
gb = df.groupby(['sex', 'smoker'])
gb.std()
total_bill | tip | size | ||
---|---|---|---|---|
sex | smoker | |||
Female | No | 7.286455 | 1.128425 | 1.073146 |
Yes | 9.189751 | 1.219916 | 0.613917 | |
Male | No | 8.726566 | 1.489559 | 0.989094 |
Yes | 9.911845 | 1.500120 | 0.892530 |
# can pass multiple reducers to agg
gb.agg(['mean', 'std', 'median'])
total_bill | tip | size | ||||||||
---|---|---|---|---|---|---|---|---|---|---|
mean | std | median | mean | std | median | mean | std | median | ||
sex | smoker | |||||||||
Female | No | 18.105185 | 7.286455 | 16.69 | 2.773519 | 1.128425 | 2.68 | 2.592593 | 1.073146 | 2 |
Yes | 17.977879 | 9.189751 | 16.27 | 2.931515 | 1.219916 | 2.88 | 2.242424 | 0.613917 | 2 | |
Male | No | 19.791237 | 8.726566 | 18.24 | 3.113402 | 1.489559 | 2.74 | 2.711340 | 0.989094 | 2 |
Yes | 22.284500 | 9.911845 | 20.39 | 3.051167 | 1.500120 | 3.00 | 2.500000 | 0.892530 | 2 |
apply

GroupBy.apply() takes a callable and calls it on each group.
def stdize(x):
    """Return *x* rescaled to z-scores: mean 0, unit (sample) std."""
    centered = x - x.mean()
    return centered / x.std()
df['tb_std'] = gb.total_bill.apply(stdize)
df
total_bill | tip | sex | smoker | day | time | size | tb_std | |
---|---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 | -0.153049 |
1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 | -1.083042 |
2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 | 0.139661 |
3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 | 0.445623 |
4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 | 0.889982 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
239 | 29.03 | 5.92 | Male | No | Sat | Dinner | 3 | 1.058694 |
240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 | 1.001346 |
241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 | 0.038893 |
242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 | -0.225889 |
243 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 | 0.092612 |
244 rows × 8 columns
# Overlay a KDE on a normalized histogram of the group-standardized bills.
ax = df.tb_std.plot(kind='kde', lw=3)
# NOTE(review): `normed` was deprecated and then removed in matplotlib 3.x;
# on a modern matplotlib this call must use `density=True` instead.
df.tb_std.hist(ax=ax, normed=True)
ax.set_xlabel('$z$-score', fontsize=20)
ax.set_title(r'Total Bill Stdized Across Sex$\times$ Smoker')
ax.axis('tight')
plt.gcf().tight_layout()
Often times you have data sets of different shapes with a common column that you want to join on
pandas
does this a couple of ways, but the main entry point is pandas.merge
# Build two toy activity tables that share a user_id key for the merge demo.
# visits draws user ids from 0-8, likes from 0-5, so the inner join only
# keeps the overlapping users.
n = 2000
half_n = n // 2
visits = DataFrame({
    'page_visits_per_day': np.random.poisson(10, size=n),
    'user_id': randint(9, size=n),
})
likes = DataFrame({
    'likes_per_day': np.random.poisson(30, size=half_n),
    'user_id': randint(6, size=half_n),
})
visits
page_visits_per_day | user_id | |
---|---|---|
0 | 6 | 4 |
1 | 12 | 1 |
2 | 13 | 5 |
3 | 7 | 1 |
4 | 6 | 7 |
... | ... | ... |
1995 | 7 | 1 |
1996 | 8 | 5 |
1997 | 9 | 3 |
1998 | 10 | 6 |
1999 | 8 | 4 |
2000 rows × 2 columns
likes
likes_per_day | user_id | |
---|---|---|
0 | 27 | 0 |
1 | 33 | 4 |
2 | 23 | 1 |
3 | 31 | 5 |
4 | 26 | 0 |
... | ... | ... |
995 | 28 | 5 |
996 | 30 | 3 |
997 | 33 | 4 |
998 | 29 | 0 |
999 | 27 | 5 |
1000 rows × 2 columns
# Inner join on the shared 'user_id' column (merge's default behavior).
merg = pd.merge(visits, likes)
# DataFrame.sort() was deprecated in pandas 0.17 and removed in 0.20;
# sort_values is the supported spelling and returns the same result here.
merg.sort_values('user_id')
page_visits_per_day | user_id | likes_per_day | |
---|---|---|---|
152516 | 15 | 0 | 22 |
174447 | 11 | 0 | 21 |
174446 | 11 | 0 | 25 |
174445 | 11 | 0 | 25 |
174444 | 11 | 0 | 26 |
... | ... | ... | ... |
85623 | 17 | 5 | 24 |
85624 | 17 | 5 | 35 |
85625 | 17 | 5 | 34 |
85645 | 17 | 5 | 25 |
73182 | 10 | 5 | 34 |
227153 rows × 3 columns
s = df.day
s
0 Sun 1 Sun 2 Sun ... 241 Sat 242 Sat 243 Thur Name: day, Length: 244, dtype: object
is_weekend = s.str.startswith('S')
is_weekend
0 True 1 True 2 True ... 241 True 242 True 243 False Name: day, Length: 244, dtype: bool
correct = s[is_weekend].str.contains(r'^(?:Sat|Sun)$')
correct.all()
True
s.str.len()
0 3 1 3 2 3 ... 241 3 242 3 243 4 Name: day, Length: 244, dtype: int64
s.str[:2]
0 Su 1 Su 2 Su ... 241 Sa 242 Sa 243 Th Name: day, Length: 244, dtype: object
n = 10000
idx = pd.date_range(start='today', periods=n, freq='D')
idx
<class 'pandas.tseries.index.DatetimeIndex'> [2014-05-28, ..., 2041-10-12] Length: 10000, Freq: D, Timezone: None
s = Series(np.random.poisson(10, size=n), index=idx, name='login_count')
s
2014-05-28 6 2014-05-29 12 2014-05-30 11 ... 2041-10-10 6 2041-10-11 14 2041-10-12 8 Freq: D, Name: login_count, Length: 10000
resample

s.resample('W', how='sum')
2014-06-01 46 2014-06-08 56 2014-06-15 74 ... 2041-09-29 66 2041-10-06 77 2041-10-13 59 Freq: W-SUN, Name: login_count, Length: 1429
# Aggregate each weekly bin with several reducers at once.
# The old how=[...] keyword was deprecated in pandas 0.18 and later
# removed; .resample(...).agg([...]) is the supported spelling and
# produces the same mean/std/count columns.
rs = s.resample('W').agg(['mean', 'std', 'count'])
rs
rs
mean | std | count | |
---|---|---|---|
2014-06-01 | 9.200000 | 2.387467 | 5 |
2014-06-08 | 8.000000 | 2.768875 | 7 |
2014-06-15 | 10.571429 | 4.755949 | 7 |
2014-06-22 | 10.857143 | 3.132016 | 7 |
2014-06-29 | 9.285714 | 2.927700 | 7 |
... | ... | ... | ... |
2041-09-15 | 8.285714 | 2.984085 | 7 |
2041-09-22 | 10.000000 | 2.000000 | 7 |
2041-09-29 | 9.428571 | 3.154739 | 7 |
2041-10-06 | 11.000000 | 2.828427 | 7 |
2041-10-13 | 9.833333 | 2.857738 | 6 |
1429 rows × 3 columns
mu = rs['mean']
fig, ax = plt.subplots(figsize=(12, 6))
ax.step(mu.index[:50], mu.iloc[:50], lw=3, where='post')
fig.tight_layout()
ax.set_xlabel('Time')
ax.set_ylabel('Average Count / Week')
ax.set_title('Count vs. Time')
<matplotlib.text.Text at 0x58f0550>
%%writefile tmp.csv
a,b,c
1,d,3.0
4,e,6.28
2,f,4.4
Overwriting tmp.csv
df = pd.read_csv('tmp.csv')
df
a | b | c | |
---|---|---|---|
0 | 1 | d | 3.00 |
1 | 4 | e | 6.28 |
2 | 2 | f | 4.40 |
df.dtypes
a int64 b object c float64 dtype: object
df.to_csv('tmp.csv')
%cat tmp.csv # saves the index created by default
,a,b,c 0,1,d,3.0 1,4,e,6.28 2,2,f,4.4
# A small mixed-dtype frame: float ('a'), int ('b'), and string ('c') columns.
df = DataFrame({'a': randn(10),
                'b': randint(10, size=10),
                'c': choice(list('abc'), size=10)})
df
a | b | c | |
---|---|---|---|
0 | 0.311531 | 2 | a |
1 | 0.466764 | 3 | b |
2 | -0.134696 | 9 | a |
3 | -1.227535 | 4 | c |
4 | -0.572367 | 8 | a |
5 | 0.330043 | 7 | a |
6 | -0.684184 | 4 | c |
7 | -1.097351 | 9 | c |
8 | -0.694465 | 1 | b |
9 | -1.376861 | 5 | b |
df.to_hdf('tmp.h5', 'df', format='table')
pd.read_hdf('tmp.h5', 'df')
a | b | c | |
---|---|---|---|
0 | 0.311531 | 2 | a |
1 | 0.466764 | 3 | b |
2 | -0.134696 | 9 | a |
3 | -1.227535 | 4 | c |
4 | -0.572367 | 8 | a |
5 | 0.330043 | 7 | a |
6 | -0.684184 | 4 | c |
7 | -1.097351 | 9 | c |
8 | -0.694465 | 1 | b |
9 | -1.376861 | 5 | b |
!ptdump tmp.h5
/ (RootGroup) '' /df (Group) '' /df/table (Table(10,)) ''