from __future__ import division, print_function
from IPython.display import HTML, display as disp, Audio
with open("../css/css.css", "r") as f:
    styles = f.read()
HTML(styles)
import os
import operator
import json
import tokenize
import re
from itertools import imap, ifilter, islice, cycle
from functools import partial
import sh
import requests
import matplotlib.pyplot as plt
import numpy as np
from numpy.random import randn, randint, rand, choice
import pandas as pd
from pandas import DataFrame, Series, Index
from pandas.compat import map, StringIO
import pandas.util.testing as tm
pd.options.display.max_rows = 10
pd.options.display.max_columns = 7
try:
    from mpltools import style
    style.use('ggplot')
except ImportError:
    pass
# because of our bg color
plt.rc('text', color='white')
plt.rc('axes', labelcolor='white')
plt.rc('xtick', color='white')
plt.rc('ytick', color='white')
%matplotlib inline
def insert_page(url):
    """Embed a webpage in the notebook"""
    disp(HTML('<iframe src=%r width=700 height=350></iframe>' % url))
def read_text(*args, **kwargs):
    """Simple text reader because I don't like typing ``with`` every time"""
    with open(*args, **kwargs) as f:
        return f.read()
def highlight(filename, style='fruity'):
    """Syntax highlight a file based on its extension"""
    from pygments import highlight as h
    from pygments.lexers import guess_lexer_for_filename
    from pygments.formatters import HtmlFormatter
    code = read_text(filename, mode='rt')
    formatter = HtmlFormatter(style=style)
    lexer = guess_lexer_for_filename(filename, code)
    disp(HTML('<style type="text/css">{0}</style>{1}'.format(
        formatter.get_style_defs('.highlight'),
        h(code, lexer, formatter))))
def gen_frames(n, size, f=randn):
    """Generate `n` frames of size `size` using the function `f`."""
    return (DataFrame(f(*sz)) for sz in [size] * n)
numexpr speedups
x, y, z, w = gen_frames(4, size=(int(1e6), 20))
def show_faster(num, denom):
    ratio = num / denom
    disp(HTML('numexpr is <b>%.2g</b>× as fast' % ratio))
def biggish():
    disp(HTML('<b>biggish</b>'))
    with tm.use_numexpr(True):
        Y = %timeit -r 1 -n 1 -o x + y + z + w ** 3
    with tm.use_numexpr(False):
        N = %timeit -r 1 -n 1 -o x + y + z + w ** 3
    show_faster(N.best, Y.best)
def smallish():
    disp(HTML('<b>smallish</b>'))
    with tm.use_numexpr(True):
        Y = %timeit -r 1 -n 1 -o x + y
    with tm.use_numexpr(False):
        N = %timeit -r 1 -n 1 -o x + y
    show_faster(N.best, Y.best)
biggish()
smallish()
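For biggish frames, pd.eval hands the whole expression to numexpr in one pass instead of evaluating it operator by operator; a quick sketch (not part of the original timings):
with tm.use_numexpr(True):
    %timeit -r 1 -n 1 pd.eval('x + y + z + w ** 3')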
insert_page("http://www.fdic.gov/bank/individual/failed/banklist.html")
url = '../data/banklist.html'
dfs = pd.read_html(url) # returns a list of all tables found on the page
assert len(dfs) == 1, "you're wrong about me"
df = dfs.pop()
# not sure where those extra columns are from ...
df
dat_url = 'tmp.html'
with open(dat_url, 'w') as f:
    DataFrame(randn(2, 2)).to_html(f, classes=['first'])
    f.write('\n\n')
    DataFrame(randn(2, 2)).to_html(f, classes=['second'])
highlight(dat_url)
df, = pd.read_html(dat_url, attrs={'class': 'first'}, index_col=0)
df
dfs = pd.read_html(dat_url, index_col=0)
for df in dfs:
    disp(df)
# not really a way to tell which table is which; ordered by appearance in HTML
top_url = 'http://www.tylervigen.com'
url = 'http://www.tylervigen.com/view_correlation?id=1703'
insert_page(top_url)
insert_page(url)
raw = requests.get(url).text
match = r'Divorce rate in Maine'
dfs = pd.read_html(raw, match=match, header=0, index_col=0)
dfs[-1]
# get rid of junk columns
df = dfs[-1].dropna(how='all', axis=(0, 1)).T
# better names
df.columns = ['mn_divorce_rate', 'per_capita_marg']
# rename generic index name to year
df = df.reset_index().rename(columns={'index': 'year'})
# make years integers
df = df.convert_objects(convert_numeric=True)
df
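convert_objects is deprecated on newer pandas; there, a rough equivalent of the call above would be (a sketch, assuming pandas >= 0.17):
df.apply(pd.to_numeric, errors='ignore')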
def blacken_legend_text(leg):
    for t in leg.get_texts():
        t.set_color('k')
fig, (ax, ax2) = plt.subplots(2, 1, figsize=(8, 6))
# maine divorces
ln = ax.plot(df.mn_divorce_rate.values, r'ro-', label='Divorce Rate / 1000 People')
ax.set_xticklabels(df.year)
ax.set_xlabel('Year')
ax.set_ylabel(ln[0].get_label())
# butter eating
axt = ax.twinx()
lt = axt.plot(df.per_capita_marg.values, r'bo-', label='Per Capita Lbs of Margarine')
axt.set_ylabel(lt[0].get_label())
# scatter plot
ax2.scatter(df.mn_divorce_rate.values, df.per_capita_marg.values, s=100)
ax2.set_xlabel('MN Divorce Rate')
ax2.set_ylabel('Margarine')
ax2.set_title(r'Divorce vs. Margarine, $r = %.2g$' % df.mn_divorce_rate.corr(df.per_capita_marg))
ax2.axis('tight')
# legend madness
lns = ln + lt
leg = ax.legend(lns, [l.get_label() for l in lns], loc=0)
blacken_legend_text(leg)
fig.tight_layout()
DataFrame.replace() with regular expressions
tips = pd.read_csv('s3://nyqpug/tips.csv')
# add some random lower cased versions of yes and no
nrows = len(tips)
tips.loc[(rand(nrows) > 0.5) & (tips.smoker == 'Yes'), 'smoker'] = 'yes'
tips.loc[(rand(nrows) > 0.5) & (tips.smoker == 'No'), 'smoker'] = 'no'
tips.smoker.value_counts().plot(kind='bar')
# sanity check
tips.smoker.value_counts()
repd = tips.replace(regex={'smoker': {'[yY]es': True, '[nN]o': False}})
repd
repd_all = tips.replace(regex={'[yY]es': True, '[nN]o': False})
repd_all
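Another way to get the same result without regular expressions (a sketch): normalize the case first, then map straight to booleans.
normed = tips.copy()
normed['smoker'] = normed.smoker.str.lower().map({'yes': True, 'no': False})
normed.smoker.value_counts()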
jsfile = 'data.json'
%%writefile $jsfile
{
    "name": ["Bob Jones", "Karen Smith"],
    "age": [28, 26],
    "gender": ["M", "F"]
}
pd.read_json(jsfile) # no problemo
# can also use keys as the rows instead of columns
pd.read_json(jsfile, orient='index')
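For reference, DataFrame.to_json produces both shapes, so either orientation round-trips (a small sketch):
people = pd.read_json(jsfile)
print(people.to_json())                # column-oriented, the default
print(people.to_json(orient='index'))  # row-oriented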
%%writefile $jsfile
{
    "region": {
        "Canada": {
            "name": "Bob Jones",
            "age": 28,
            "gender": "M"
        },
        "USA": {
            "name": "Karen Smith",
            "age": 26,
            "gender": "F"
        }
    }
}
disp(pd.read_json(jsfile, orient='records'))
disp(Audio(os.path.join(os.pardir, 'mp3', 'w.mp3'), autoplay=True))
# disp(Audio(os.path.join(os.pardir, 'mp3', 'c.mp3'), autoplay=True))
read_json (not so simple)
data = read_text(jsfile)
# avoid read_json entirely :)
# get transposed
df = DataFrame(json.loads(data)["region"])
df = df.T.convert_objects(convert_numeric=True)
df
df.dtypes
jq = sh.jq.bake('-M') # -M disables colorizing
rule = "(.region)" # this rule is essentially data["region"]
out = jq(rule, _in=data).stdout
res = pd.read_json(out, orient='index')
res
res.dtypes
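If the sh module isn't available, the same jq pipe can be driven with the standard library instead (a sketch):
import subprocess
p = subprocess.Popen(['jq', '-M', rule], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
out, _ = p.communicate(data)
pd.read_json(out, orient='index')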
%%writefile $jsfile
{
    "intervals": [
        {
            "pivots": "Jane Smith",
            "series": [
                {
                    "interval_id": 0,
                    "p_value": 1
                },
                {
                    "interval_id": 1,
                    "p_value": 1.1162791357932633e-8
                },
                {
                    "interval_id": 2,
                    "p_value": 0.0000028675012051504467
                }
            ]
        },
        {
            "pivots": "Bob Smith",
            "series": [
                {
                    "interval_id": 0,
                    "p_value": 1
                },
                {
                    "interval_id": 1,
                    "p_value": 1.1162791357932633e-8
                },
                {
                    "interval_id": 2,
                    "p_value": 0.0000028675012051504467
                }
            ]
        }
    ]
}
%%writefile rule.txt
[{pivots: .intervals[].pivots,
  interval_id: .intervals[].series[].interval_id,
  p_value: .intervals[].series[].p_value}] | unique
data = read_text(jsfile)
# check out http://stedolan.github.io/jq/manual for more details on these rules
rule = read_text('rule.txt')
out = jq(rule, _in=data).stdout
js = json.loads(out)
js[:2]
res = pd.read_json(out)
res
res.dtypes
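A pure-pandas alternative to the jq rule (a sketch, assuming json_normalize is available in pandas.io.json): hand it the record path and the metadata to carry down to each row.
from pandas.io.json import json_normalize
json_normalize(json.loads(data)['intervals'], 'series', ['pivots'])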
DataFrame.isin()
str.extract()
query/eval
msgpack IO
Google BigQuery IO
names = list(filter(None, read_text('names.txt').split('\n')))
names
df = DataFrame(dict(zip(['math', 'physics'],
                        [names[:5], names[-5:]])))
df
df.isin(['Brook', 'Bradley', 'Richie', 'Sarah'])
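isin also accepts a dict when each column needs its own membership check (a small sketch):
df.isin({'math': ['Brook', 'Sarah'], 'physics': ['Richie', 'Bradley']})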
str.extract()
!grep -P '^[a-zA-Z_]\w*$' /usr/share/dict/cracklib-small | head -10
def gen_filenames(n, pattern='%d_%s', dict_file='/usr/share/dict/words'):
    matches_id = partial(re.match, '^%s$' % tokenize.Name)
    interpolator = partial(operator.mod, pattern)
    with open(dict_file, 'rt') as f:
        only_valid_names = ifilter(matches_id, cycle(f))
        n_matches = islice(only_valid_names, 0, n)
        for el in imap(interpolator, enumerate(imap(str.strip, n_matches))):
            yield el
vids = Series(list(gen_filenames(30, pattern='%d_%s.mp4')))
vids
ext = vids.str.extract('(?P<num>\d+)_(?P<name>.+)')
ext
ext = ext.convert_objects(convert_numeric=True)
disp(ext.dtypes)
ext
query/eval
n = int(1e6)
df = DataFrame({'a': randint(10, size=n),
                'b': rand(n),
                'c': rand(n)})
df.head()
sub = df.query('1 <= a <= 5 and 0.1 < b < 0.4 and 0.5 <= c <= 0.95')
sub
qtime = %timeit -o df.query('1 <= a <= 5 and 0.1 < b < 0.4 and 0.5 <= c <= 0.95')
pytime = %timeit -o df.loc[(1 <= df.a) & (df.a <= 5) & (0.1 < df.b) & (df.b < 0.4) & (0.5 <= df.c) & (df.c <= 0.95)]
print('query is %.2gx faster than pure Python' % (pytime.best / qtime.best))
A, B, C, D = (DataFrame(randn(n, 40)) for _ in range(4))
qtime = %timeit -r 1 -n 1 -o pd.eval('A + B * 2 + C / D ** 3 * B / C + A ** 10 < A ** 5')
pytime = %timeit -r 1 -n 1 -o A + B * 2 + C / D ** 3 * B / C + A ** 10 < A ** 5
print('eval is %.2gx faster than pure Python' % (pytime.best / qtime.best))
a = rand()
df.query('a <= @a <= b')  # @a refers to the local variable a, not the column
MessagePack IO (to_msgpack/read_msgpack)
df.head(2).to_msgpack()
s = pd.to_msgpack(None,  # we want the raw bytes output so pass None
                  Series(randn(2)),
                  ['yep', 'a', 'list'],
                  randn(2),
                  {'a': 2, 'b': 3})
sio = StringIO(s)
pd.read_msgpack(sio)
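to_msgpack also writes straight to a path and read_msgpack reads it back; a tiny sketch (the file name here is made up):
df.head(2).to_msgpack('tmp.msgpack')
pd.read_msgpack('tmp.msgpack')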
highlight('query.sql')
query = read_text('query.sql')
df = pd.read_gbq(query, project_id='metal-lantern-572')
Notice the NaTs and NaNs. Those are where other repositories have valid pull request dates.
df = df.rename(columns=lambda x: x.replace('payload_pull_request_', ''))
df.dtypes
df
df['created_at'] = pd.to_datetime(df.created_at)
df
# set the index to the datetime column just created
df = df.set_index('created_at').sort_index()
df
s = df.additions
def remove_time(ax):
    replacer = lambda x: x.get_text().replace(' 00:00:00', '')
    ax.set_xticklabels(list(map(replacer, ax.get_xticklabels())))
r = s.resample('B', how='sum')
r.index.name = 'Pull Request Day'
ax = r.plot(kind='bar', figsize=(18, 5))
remove_time(ax)
ax.set_ylabel('Pull Request Additions per Business Day')
ax.get_figure().autofmt_xdate()
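The same additions series rolled up weekly instead of by business day, just to show the resampling knob (a sketch using the same old-style how= argument):
s.resample('W', how='sum').plot(figsize=(12, 4))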
Series now uses composition instead of inheriting from numpy.ndarray. Bravo!
MultiIndex slicing
nlargest/nsmallest
MultiIndex Slicing
def channel_index(name, nchannels):
    return list(zip(np.repeat(name, nchannels), range(nchannels)))
# simulate our EEG data set
fs = 256 # sampling rate
neeg = 8 # number of EEG channels
nex = 4 # number of auxiliary channels
nsensors = 2 # number of gsr channels
eeg_chan = channel_index('eeg', neeg)
ex_chan = channel_index('ex', nex)
sens_chan = channel_index('gsr', nsensors)
disp(eeg_chan)
disp(ex_chan)
disp(sens_chan)
columns = pd.MultiIndex.from_tuples(eeg_chan + ex_chan + sens_chan,
                                    names=['signal', 'channel'])
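As an aside (a sketch, assuming pandas >= 0.13.1), MultiIndex.from_product can build one signal's block of (signal, channel) pairs without the channel_index helper:
pd.MultiIndex.from_product([['eeg'], range(neeg)], names=['signal', 'channel'])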
# 10 seconds of fake data
df = pd.DataFrame(np.random.randn(fs * 10, len(columns)), columns=columns)
# add in some nans (e.g., a person moved around during these samples)
df.loc[rand(len(df)) < 0.20, 'eeg'] = np.nan
df.head()
# simulate a stimulus marker
df['stim'] = np.sort(randint(10, size=len(df)))
df
df.loc[:, np.s_[('ex', 'stim'), :]].head()
# the EX and STIM channels where EEG channels 0 and 1 are not null
row_idx = df.eeg[[0, 1]].notnull().all(axis=1)
col_idx = np.s_[('ex', 'stim'), :]
col_idx
res = df.loc[row_idx, col_idx]
res
# use np.s_ to construct slices
assert slice(None, 1000, 2) == np.s_[:1000:2] # Which would you prefer?
assert slice(900, None, -1) == np.s_[900::-1]
The expression np.s_[('ex', 'stim'), :] says: select 'ex' and 'stim' from the first level of the MultiIndex, and the : says give me all columns from the second level.
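pd.IndexSlice builds the same kind of selector and can read a bit more clearly (a sketch, assuming a pandas version that ships pd.IndexSlice):
idx = pd.IndexSlice
df.loc[row_idx, idx[('ex', 'stim'), :]]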
nlargest/nsmallest
If the number of elements you want is small relative to len(s), you'll see a performance improvement.
s = Series(randn(1000000), name='a')
a = %timeit -o s.nlargest(5) # 5 << 1,000,000
b = %timeit -o s.order(ascending=False).head()
print('nlargest is %.2gx faster than order + head' % (b.best / a.best))
a = %timeit -o s.nsmallest(5)
b = %timeit -o s.order().head()
print('nsmallest is %.2gx faster than order + head' % (b.best / a.best))