from __future__ import division, print_function

from IPython.display import HTML, Image, display as disp

# Read the slide deck's stylesheet from disk and hand the raw text to
# IPython's HTML wrapper. The bare HTML(...) expression is only rendered when
# it is the last statement of a notebook cell.
with open("../css/css.css", "r") as css_file:
    style = css_file.read()
HTML(style)

# IPython shell escape: run the local shortlog.sh script, passing it the
# ../../pandas path as its only argument.
!./shortlog.sh ../../pandas

# Render the two JPEGs from ../img inline, one per display call.
disp(Image("../img/m.jpg"))

disp(Image("../img/u.jpg"))

# imports that I'll use throughout the talk
import numpy as np
import pandas as pd
from pandas import DataFrame, Series, Index
from numpy.random import randn, randint, rand, choice
import matplotlib.pyplot as plt

# Cap the number of rows printed in a DataFrame/Series repr.
pd.set_option('display.max_rows', 10)

# Use the ggplot look from mpltools when that package is installed; without
# it, quietly keep matplotlib's default style. Note that this import rebinds
# the module-level name ``style``.
try:
    from mpltools import style
    style.use('ggplot')
except ImportError:
    pass


# because of our bg color
plt.rcParams.update({
    'text.color': 'white',
    'axes.labelcolor': 'white',
    'xtick.color': 'white',
    'ytick.color': 'white',
})

# IPython line magic: select matplotlib's inline backend so figures are
# embedded in the notebook output.
%matplotlib inline

# Warm-up frame: 10 rows of standard-normal draws in two columns, 'a' and 'b'.
df = pd.DataFrame(randn(10, 2), columns=['a', 'b'])
df

# if you have boto installed and have set up credentials
# df = pd.read_csv('s3://nyqpug/tips.csv')

# No credentials needed: fetch tips.csv from the bucket's public HTTPS
# endpoint. This rebinds ``df``, replacing the random frame above.
tips_url = 'https://s3.amazonaws.com/nyqpug/tips.csv'
df = pd.read_csv(tips_url)
df

# one dtype per column
df.dtypes

# column access by name
df['day']  # ⟵ that's a Series object

# by attribute
df.time

# how many rows carry each distinct value of the time column
df.time.value_counts()

# multiple columns
df[['tip', 'sex']]

# move the day column into the index so rows can be looked up by day label;
# set_index returns a new frame, ``df`` itself is left as it was
t = df.set_index('day')
t.head()

# label lookup on the index: every row whose index label is 'Sun'
t.loc['Sun']

# the same selection on the original frame, expressed as a boolean mask
df.loc[df.day == 'Sun']

# all rows, one column, selected by label
df.loc[:, 'smoker']

# all rows, one column, selected by integer position
df.iloc[:, 3]  # same as df.loc[:, 'smoker']

# Derived columns.
# ``df.size`` is the DataFrame attribute (total number of cells, a single
# int), not the column named 'size', so attribute access would divide every
# bill by the same constant. The column has to be fetched with brackets.
df['pct_tip'] = df.tip / df.total_bill
df['avg_price'] = df.total_bill / df['size']

df.avg_price.hist(bins=20)

# remove the scratch columns again so the frame is back to its loaded shape
del df['avg_price']
del df['pct_tip']

# multiple columns, multiple rows
# (.loc is label-based: rows labelled 0 and 2, columns 'sex' and 'tip')
df.loc[[0, 2], ['sex', 'tip']]

# a label slice keeps its end label, so the row labelled 10 is part of the result
df.loc[:10, ['total_bill', 'tip']]  # note this is inclusive

# and with iloc
# (.iloc is position-based and slices like a Python list: positions 0..4)
df.iloc[:5]  # exclusive endpoints

# np.ptp is peak-to-peak difference, i.e., range
df[['total_bill', 'tip', 'size']].apply(np.ptp)

days = ['Sun', 'Mon', 'Tues', 'Wed', 'Thur', 'Fri', 'Sat']

# list.index as the mapper: each day name is replaced by its position in
# ``days`` (a name that is not in the list raises ValueError)
df.day.map(days.index)

df.sum()  # whoa! + is defined for strings

df.sum(numeric_only=True)

# fraction of non-missing entries per column
df.count() / df.shape[0]  # we don't have any nans

# Variance and mean are not defined for the string columns. Older pandas
# silently dropped such columns; pandas >= 2.0 raises TypeError instead, so
# restrict the reduction to numeric columns explicitly, as done for sum().
df.var(numeric_only=True)

df.mean(numeric_only=True)

df.head()

# The frame also holds non-numeric columns (e.g. day), for which mean/std/
# median are undefined. Older pandas dropped those columns silently, newer
# pandas raises TypeError, so the numeric columns are selected explicitly.
# The result is the same table on every pandas version.
num_cols = ['total_bill', 'tip', 'size']

gb = df.groupby('sex')
gb[num_cols].mean()

# ``gb`` stays bound to the full two-key groupby so that later cells can
# still reach individual columns through it (e.g. gb.total_bill)
gb = df.groupby(['sex', 'smoker'])
gb[num_cols].std()

# can pass multiple reducers to agg
gb[num_cols].agg(['mean', 'std', 'median'])

def stdize(x):
    """Return the z-scores of ``x``: subtract its mean, divide by its std.

    ``x`` only needs to provide ``.mean()`` and ``.std()`` and support
    subtraction and division with their results.
    """
    centered = x - x.mean()
    return centered / x.std()
    
# Standardise total_bill within each (sex, smoker) group.
# transform() is used instead of apply(): it always returns a result indexed
# like ``df``, so the column assignment aligns row by row. apply() may prepend
# the group keys to the index on newer pandas, which breaks the assignment.
df['tb_std'] = gb.total_bill.transform(stdize)
df

# Kernel density estimate with a normalised histogram drawn on the same axes.
# ``density=True`` replaces the ``normed=True`` keyword that matplotlib removed.
ax = df.tb_std.plot(kind='kde', lw=3)
df.tb_std.hist(ax=ax, density=True)

ax.set_xlabel('$z$-score', fontsize=20)
ax.set_title(r'Total Bill Stdized Across Sex$\times$ Smoker')

ax.axis('tight')
plt.gcf().tight_layout()

# create some frames
# visits: n rows with user ids drawn from 0..8; likes: n2 rows with user ids
# drawn from 0..5
n = 2000
n2 = n // 2
visits = DataFrame({'page_visits_per_day': np.random.poisson(10, size=n),
                    'user_id': randint(9, size=n)})
likes = DataFrame({'likes_per_day': np.random.poisson(30, size=n2),
                   'user_id': randint(6, size=n2)})
visits

likes

# With no ``on=`` given, merge joins on the columns the two frames share
# (here user_id) and keeps only keys present in both.
merg = pd.merge(visits, likes)
# DataFrame.sort was removed from pandas; sort_values is its replacement.
merg.sort_values('user_id')

# work on the day column as a Series; the .str accessor applies string
# operations element-wise
s = df.day
s

# boolean mask: True where the day name starts with a capital 'S'
is_weekend = s.str.startswith('S')
is_weekend

# sanity check on the mask: every selected value must be exactly 'Sat' or
# 'Sun' (anchored pattern, non-capturing group)
correct = s[is_weekend].str.contains(r'^(?:Sat|Sun)$')
correct.all()

# length of each day name
s.str.len()

# .str also supports slicing: first two characters of each day name
s.str[:2]

# n consecutive daily timestamps starting from 'today'
n = 10000
idx = pd.date_range(start='today', periods=n, freq='D')
idx

# one Poisson-distributed count per day
s = Series(np.random.poisson(10, size=n), index=idx, name='login_count')
s

# Weekly totals. resample() returns a Resampler whose reduction is chosen by
# a method call; the old ``how=`` keyword was removed from pandas.
s.resample('W').sum()

# multiple functions
rs = s.resample('W').agg(['mean', 'std', 'count'])
rs

# step plot of the first 50 weekly means
mu = rs['mean']
fig, ax = plt.subplots(figsize=(12, 6))
ax.step(mu.index[:50], mu.iloc[:50], lw=3, where='post')
ax.set_xlabel('Time')
ax.set_ylabel('Average Count / Week')
ax.set_title('Count vs. Time')
# tight_layout() computes the layout once, at call time, so it has to run
# after the labels and title exist for them to be taken into account
fig.tight_layout()

%%writefile tmp.csv
a,b,c
1,d,3.0
4,e,6.28
2,f,4.4

# load tmp.csv from the working directory; this rebinds ``df``
df = pd.read_csv('tmp.csv')
df

# dtypes are inferred per column while parsing
df.dtypes

# overwrite the same file; to_csv is called without index=..., so the row
# index is written out as well
df.to_csv('tmp.csv')

# IPython magic: print the file to inspect what was written
%cat tmp.csv # saves the index created by default

df = DataFrame(dict(a=randn(10), b=randint(10, size=10),
                    c=choice(list('abc'), size=10)))
df

df.to_hdf('tmp.h5', 'df', format='table')

pd.read_hdf('tmp.h5', 'df')

!ptdump tmp.h5