from __future__ import division, print_function
from IPython.display import HTML, display as disp, Audio
with open("../css/css.css", "r") as f:
    styles = f.read()
HTML(styles)
import os
import operator
import json
import tokenize
import re
from itertools import imap, ifilter, islice, cycle
from functools import partial
import sh
import requests
import matplotlib.pyplot as plt
import numpy as np
from numpy.random import randn, randint, rand, choice
import pandas as pd
from pandas import DataFrame, Series, Index
from pandas.compat import map, StringIO
import pandas.util.testing as tm
pd.options.display.max_rows = 10
pd.options.display.max_columns = 7
try:
    from mpltools import style
    style.use('ggplot')
except ImportError:
    pass
# because of our bg color
plt.rc('text', color='white')
plt.rc('axes', labelcolor='white')
plt.rc('xtick', color='white')
plt.rc('ytick', color='white')
%matplotlib inline
def insert_page(url):
    """Embed a webpage in the notebook"""
    disp(HTML('<iframe src=%r width=700 height=350></iframe>' % url))
def read_text(*args, **kwargs):
    """Simple text reader because I don't like typing ``with`` every time"""
    with open(*args, **kwargs) as f:
        return f.read()
def highlight(filename, style='fruity'):
    """Syntax highlight a file based on its extension"""
    from pygments import highlight as h
    from pygments.lexers import guess_lexer_for_filename
    from pygments.formatters import HtmlFormatter
    code = read_text(filename, mode='rt')
    formatter = HtmlFormatter(style=style)
    lexer = guess_lexer_for_filename(filename, code)
    disp(HTML('<style type="text/css">{0}</style>{1}'.format(
        formatter.get_style_defs('.highlight'),
        h(code, lexer, formatter))))
def gen_frames(n, size, f=randn):
    """Generate `n` frames of size `size` using the function `f`."""
    return (DataFrame(f(*sz)) for sz in [size] * n)
numexpr speedups
x, y, z, w = gen_frames(4, size=(int(1e6), 20))
def show_faster(num, denom):
    ratio = num / denom
    disp(HTML('numexpr is <b>%.2g</b>× as fast' % ratio))
def biggish():
    disp(HTML('<b>biggish</b>'))
    with tm.use_numexpr(True):
        Y = %timeit -r 1 -n 1 -o x + y + z + w ** 3
    with tm.use_numexpr(False):
        N = %timeit -r 1 -n 1 -o x + y + z + w ** 3
    show_faster(N.best, Y.best)
def smallish():
    disp(HTML('<b>smallish</b>'))
    with tm.use_numexpr(True):
        Y = %timeit -r 1 -n 1 -o x + y
    with tm.use_numexpr(False):
        N = %timeit -r 1 -n 1 -o x + y
    show_faster(N.best, Y.best)
biggish()
smallish()
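For biggish frames, pd.eval hands the whole expression to numexpr in one pass instead of evaluating it operator by operator; a quick sketch (not part of the original timings):
with tm.use_numexpr(True):
    %timeit -r 1 -n 1 pd.eval('x + y + z + w ** 3')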
insert_page("http://www.fdic.gov/bank/individual/failed/banklist.html")
url = '../data/banklist.html'
dfs = pd.read_html(url) # returns a list of all tables found on the page
assert len(dfs) == 1, "you're wrong about me"
df = dfs.pop()
# not sure where those extra columns are from ...
df
dat_url = 'tmp.html'
with open(dat_url, 'w') as f:
    DataFrame(randn(2, 2)).to_html(f, classes=['first'])
    f.write('\n\n')
    DataFrame(randn(2, 2)).to_html(f, classes=['second'])
highlight(dat_url)
df, = pd.read_html(dat_url, attrs={'class': 'first'}, index_col=0)
df
dfs = pd.read_html(dat_url, index_col=0)
for df in dfs:
    disp(df)
# not really a way to tell which table is which; ordered by appearance in HTML
top_url = 'http://www.tylervigen.com'
url = 'http://www.tylervigen.com/view_correlation?id=1703'
insert_page(top_url)
insert_page(url)
raw = requests.get(url).text
match = r'Divorce rate in Maine'
dfs = pd.read_html(raw, match=match, header=0, index_col=0)
dfs[-1]
# get rid of junk columns
df = dfs[-1].dropna(how='all', axis=(0, 1)).T
# better names
df.columns = ['mn_divorce_rate', 'per_capita_marg']
# rename generic index name to year
df = df.reset_index().rename(columns={'index': 'year'})
# make years integers
df = df.convert_objects(convert_numeric=True)
df
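convert_objects is deprecated on newer pandas; there, a rough equivalent of the call above would be (a sketch, assuming pandas >= 0.17):
df.apply(pd.to_numeric, errors='ignore')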
def blacken_legend_text(leg):
    for t in leg.get_texts():
        t.set_color('k')
fig, (ax, ax2) = plt.subplots(2, 1, figsize=(8, 6))
# maine divorces
ln = ax.plot(df.mn_divorce_rate.values, r'ro-', label='Divorce Rate / 1000 People')
ax.set_xticklabels(df.year)
ax.set_xlabel('Year')
ax.set_ylabel(ln[0].get_label())
# butter eating
axt = ax.twinx()
lt = axt.plot(df.per_capita_marg.values, r'bo-', label='Per Capita Lbs of Margarine')
axt.set_ylabel(lt[0].get_label())
# scatter plot
ax2.scatter(df.mn_divorce_rate.values, df.per_capita_marg.values, s=100)
ax2.set_xlabel('MN Divorce Rate')
ax2.set_ylabel('Margarine')
ax2.set_title(r'Divorce vs. Margarine, $r = %.2g$' % df.mn_divorce_rate.corr(df.per_capita_marg))
ax2.axis('tight')
# legend madness
lns = ln + lt
leg = ax.legend(lns, [l.get_label() for l in lns], loc=0)
blacken_legend_text(leg)
fig.tight_layout()
DataFrame.replace() with regular expressions
tips = pd.read_csv('s3://nyqpug/tips.csv')
# add some random lower cased versions of yes and no
nrows = len(tips)
tips.loc[(rand(nrows) > 0.5) & (tips.smoker == 'Yes'), 'smoker'] = 'yes'
tips.loc[(rand(nrows) > 0.5) & (tips.smoker == 'No'), 'smoker'] = 'no'
tips.smoker.value_counts().plot(kind='bar')
# sanity check
tips.smoker.value_counts()
repd = tips.replace(regex={'smoker': {'[yY]es': True, '[nN]o': False}})
repd
repd_all = tips.replace(regex={'[yY]es': True, '[nN]o': False})
repd_all
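Another way to get the same result without regular expressions (a sketch): normalize the case first, then map straight to booleans.
normed = tips.copy()
normed['smoker'] = normed.smoker.str.lower().map({'yes': True, 'no': False})
normed.smoker.value_counts()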
jsfile = 'data.json'
%%writefile $jsfile
{
    "name": ["Bob Jones", "Karen Smith"],
    "age": [28, 26],
    "gender": ["M", "F"]
}
pd.read_json(jsfile) # no problemo
# can also use keys as the rows instead of columns
pd.read_json(jsfile, orient='index')
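For reference, DataFrame.to_json produces both shapes, so either orientation round-trips (a small sketch):
people = pd.read_json(jsfile)
print(people.to_json())                # column-oriented, the default
print(people.to_json(orient='index'))  # row-oriented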
%%writefile $jsfile
{
    "region": {
        "Canada": {
            "name": "Bob Jones",
            "age": 28,
            "gender": "M"
        },
        "USA": {
            "name": "Karen Smith",
            "age": 26,
            "gender": "F"
        }
    }
}
disp(pd.read_json(jsfile, orient='records'))
disp(Audio(os.path.join(os.pardir, 'mp3', 'w.mp3'), autoplay=True))
# disp(Audio(os.path.join(os.pardir, 'mp3', 'c.mp3'), autoplay=True))
read_json (not so simple)
data = read_text(jsfile)
# avoid read_json entirely :)
# get transposed
df = DataFrame(json.loads(data)["region"])
df = df.T.convert_objects(convert_numeric=True)
df
df.dtypes
jq = sh.jq.bake('-M') # -M disables colorizing
rule = "(.region)" # this rule is essentially data["region"]
out = jq(rule, _in=data).stdout
res = pd.read_json(out, orient='index')
res
res.dtypes
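If the sh module isn't available, the same jq pipe can be driven with the standard library instead (a sketch):
import subprocess
p = subprocess.Popen(['jq', '-M', rule], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
out, _ = p.communicate(data)
pd.read_json(out, orient='index')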
%%writefile $jsfile
{
    "intervals": [
        {
            "pivots": "Jane Smith",
            "series": [
                {
                    "interval_id": 0,
                    "p_value": 1
                },
                {
                    "interval_id": 1,
                    "p_value": 1.1162791357932633e-8
                },
                {
                    "interval_id": 2,
                    "p_value": 0.0000028675012051504467
                }
            ]
        },
        {
            "pivots": "Bob Smith",
            "series": [
                {
                    "interval_id": 0,
                    "p_value": 1
                },
                {
                    "interval_id": 1,
                    "p_value": 1.1162791357932633e-8
                },
                {
                    "interval_id": 2,
                    "p_value": 0.0000028675012051504467
                }
            ]
        }
    ]
}
%%writefile rule.txt
[{pivots: .intervals[].pivots,
  interval_id: .intervals[].series[].interval_id,
  p_value: .intervals[].series[].p_value}] | unique
data = read_text(jsfile)
# check out http://stedolan.github.io/jq/manual for more details on these rules
rule = read_text('rule.txt')
out = jq(rule, _in=data).stdout
js = json.loads(out)
js[:2]
res = pd.read_json(out)
res
res.dtypes
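A pure-pandas alternative to the jq rule (a sketch, assuming json_normalize is available in pandas.io.json): hand it the record path and the metadata to carry down to each row.
from pandas.io.json import json_normalize
json_normalize(json.loads(data)['intervals'], 'series', ['pivots'])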
DataFrame.isin()
str.extract()
query/eval
msgpack IO
Google BigQuery IO
names = list(filter(None, read_text('names.txt').split('\n')))
names
df = DataFrame(dict(zip(['math', 'physics'],
                        [names[:5], names[-5:]])))
df
df.isin(['Brook', 'Bradley', 'Richie', 'Sarah'])
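isin also accepts a dict when each column needs its own membership check (a small sketch):
df.isin({'math': ['Brook', 'Sarah'], 'physics': ['Richie', 'Bradley']})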
str.extract()
!grep -P '^[a-zA-Z_]\w*$' /usr/share/dict/cracklib-small | head -10
def gen_filenames(n, pattern='%d_%s', dict_file='/usr/share/dict/words'):
    matches_id = partial(re.match, '^%s$' % tokenize.Name)
    interpolator = partial(operator.mod, pattern)
    with open(dict_file, 'rt') as f:
        only_valid_names = ifilter(matches_id, cycle(f))
        n_matches = islice(only_valid_names, 0, n)
        for el in imap(interpolator, enumerate(imap(str.strip, n_matches))):
            yield el
vids = Series(list(gen_filenames(30, pattern='%d_%s.mp4')))
vids
ext = vids.str.extract('(?P<num>\d+)_(?P<name>.+)')
ext
ext = ext.convert_objects(convert_numeric=True)
disp(ext.dtypes)
ext
query/eval
n = int(1e6)
df = DataFrame({'a': randint(10, size=n),
                'b': rand(n),
                'c': rand(n)})
df.head()
sub = df.query('1 <= a <= 5 and 0.1 < b < 0.4 and 0.5 <= c <= 0.95')
sub
qtime = %timeit -o df.query('1 <= a <= 5 and 0.1 < b < 0.4 and 0.5 <= c <= 0.95')
pytime = %timeit -o df.loc[(1 <= df.a) & (df.a <= 5) & (0.1 < df.b) & (df.b < 0.4) & (0.5 <= df.c) & (df.c <= 0.95)]
print('query is %.2gx faster than pure Python' % (pytime.best / qtime.best))
A, B, C, D = (DataFrame(randn(n, 40)) for _ in range(4))
qtime = %timeit -r 1 -n 1 -o pd.eval('A + B * 2 + C / D ** 3 * B / C + A ** 10 < A ** 5')
pytime = %timeit -r 1 -n 1 -o A + B * 2 + C / D ** 3 * B / C + A ** 10 < A ** 5
print('eval is %.2gx faster than pure Python' % (pytime.best / qtime.best))
a = rand()
df.query('a <= @a <= b')  # @a refers to the local variable a, not the column
MessagePack IO (to_msgpack/read_msgpack)
df.head(2).to_msgpack()
s = pd.to_msgpack(None,  # we want the raw bytes output so pass None
                  Series(randn(2)),
                  ['yep', 'a', 'list'],
                  randn(2),
                  {'a': 2, 'b': 3})
sio = StringIO(s)
pd.read_msgpack(sio)
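to_msgpack also writes straight to a path and read_msgpack reads it back; a tiny sketch (the file name here is made up):
df.head(2).to_msgpack('tmp.msgpack')
pd.read_msgpack('tmp.msgpack')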
highlight('query.sql')
query = read_text('query.sql')
df = pd.read_gbq(query, project_id='metal-lantern-572')
Notice the NaTs and NaNs. Those are where other repositories have valid pull request dates.
df = df.rename(columns=lambda x: x.replace('payload_pull_request_', ''))
df.dtypes
df
df['created_at'] = pd.to_datetime(df.created_at)
df
# set the index to the datetime column just created
df = df.set_index('created_at').sort_index()
df
s = df.additions
def remove_time(ax):
    replacer = lambda x: x.get_text().replace(' 00:00:00', '')
    ax.set_xticklabels(list(map(replacer, ax.get_xticklabels())))
r = s.resample('B', how='sum')
r.index.name = 'Pull Request Day'
ax = r.plot(kind='bar', figsize=(18, 5))
remove_time(ax)
ax.set_ylabel('Pull Request Additions per Business Day')
ax.get_figure().autofmt_xdate()
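The same additions series rolled up weekly instead of by business day, just to show the resampling knob (a sketch using the same old-style how= argument):
s.resample('W', how='sum').plot(figsize=(12, 4))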
Series now uses composition instead of inheriting from numpy.ndarray. Bravo!
MultiIndex slicing
nlargest/nsmallest
MultiIndex Slicing
def channel_index(name, nchannels):
    return list(zip(np.repeat(name, nchannels), range(nchannels)))
# simulate our EEG data set
fs = 256 # sampling rate
neeg = 8 # number of EEG channels
nex = 4 # number of auxiliary channels
nsensors = 2 # number of gsr channels
eeg_chan = channel_index('eeg', neeg)
ex_chan = channel_index('ex', nex)
sens_chan = channel_index('gsr', nsensors)
disp(eeg_chan)
disp(ex_chan)
disp(sens_chan)
columns = pd.MultiIndex.from_tuples(eeg_chan + ex_chan + sens_chan,
                                    names=['signal', 'channel'])
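As an aside (a sketch, assuming pandas >= 0.13.1), MultiIndex.from_product can build one signal's block of (signal, channel) pairs without the channel_index helper:
pd.MultiIndex.from_product([['eeg'], range(neeg)], names=['signal', 'channel'])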
# 10 seconds of fake data
df = pd.DataFrame(np.random.randn(fs * 10, len(columns)), columns=columns)
# add in some nans (e.g., a person moved around during these samples)
df.loc[rand(len(df)) < 0.20, 'eeg'] = np.nan
df.head()
# simulate a stimulus marker
df['stim'] = np.sort(randint(10, size=len(df)))
df
df.loc[:, np.s_[('ex', 'stim'), :]].head()
# the EX and STIM channels where EEG channels 0 and 1 are not null
row_idx = df.eeg[[0, 1]].notnull().all(axis=1)
col_idx = np.s_[('ex', 'stim'), :]
col_idx
res = df.loc[row_idx, col_idx]
res
# use np.s_ to construct slices
assert slice(None, 1000, 2) == np.s_[:1000:2] # Which would you prefer?
assert slice(900, None, -1) == np.s_[900::-1]
The expression np.s_[('ex', 'stim'), :] says: select 'ex' and 'stim' from the first level of the MultiIndex, and the : says give me all columns from the second level.
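pd.IndexSlice builds the same kind of selector and can read a bit more clearly (a sketch, assuming a pandas version that ships pd.IndexSlice):
idx = pd.IndexSlice
df.loc[row_idx, idx[('ex', 'stim'), :]]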
nlargest/nsmallest
If the number of elements you want is small relative to len(s), you'll see a performance improvement.
s = Series(randn(1000000), name='a')
a = %timeit -o s.nlargest(5) # 5 << 1,000,000
b = %timeit -o s.order(ascending=False).head()
print('nlargest is %.2gx faster than order + head' % (b.best / a.best))
a = %timeit -o s.nsmallest(5)
b = %timeit -o s.order().head()
print('nsmallest is %.2gx faster than order + head' % (b.best / a.best))