#!/usr/bin/env python
# coding: utf-8

# In[1]:


import numpy as np
import pandas as pd
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_columns = 20
pd.options.display.max_rows = 20
pd.options.display.max_colwidth = 80
np.random.seed(12345)
import matplotlib.pyplot as plt
plt.rc("figure", figsize=(10, 6))
np.set_printoptions(precision=4, suppress=True)


# In[2]:


import numpy as np
import pandas as pd


# In[3]:


df = pd.DataFrame({"key1" : ["a", "a", None, "b", "b", "a", None],
                   "key2" : pd.Series([1, 2, 1, 2, 1, None, 1],
                                      dtype="Int64"),
                   "data1" : np.random.standard_normal(7),
                   "data2" : np.random.standard_normal(7)})
df


# In[4]:


grouped = df["data1"].groupby(df["key1"])
grouped


# In[5]:


grouped.mean()


# In[6]:


means = df["data1"].groupby([df["key1"], df["key2"]]).mean()
means


# In[7]:


means.unstack()


# In[8]:


states = np.array(["OH", "CA", "CA", "OH", "OH", "CA", "OH"])
years = [2005, 2005, 2006, 2005, 2006, 2005, 2006]
df["data1"].groupby([states, years]).mean()


# In[9]:


df.groupby("key1").mean()
df.groupby("key2").mean(numeric_only=True)
df.groupby(["key1", "key2"]).mean()


# In[10]:


df.groupby(["key1", "key2"]).size()


# In[11]:


df.groupby("key1", dropna=False).size()
df.groupby(["key1", "key2"], dropna=False).size()


# In[12]:


df.groupby("key1").count()


# In[13]:


for name, group in df.groupby("key1"):
    print(name)
    print(group)


# In[14]:


for (k1, k2), group in df.groupby(["key1", "key2"]):
    print((k1, k2))
    print(group)


# In[15]:


pieces = {name: group for name, group in df.groupby("key1")}
pieces["b"]


# In[16]:


grouped = df.groupby({"key1": "key", "key2": "key",
                      "data1": "data", "data2": "data"}, axis="columns")


# In[17]:


for group_key, group_values in grouped:
    print(group_key)
    print(group_values)


# In[18]:


df.groupby(["key1", "key2"])[["data2"]].mean()


# In[19]:


s_grouped = df.groupby(["key1", "key2"])["data2"]
s_grouped
s_grouped.mean()


# In[20]:


people = pd.DataFrame(np.random.standard_normal((5, 5)),
                      columns=["a", "b", "c", "d", "e"],
                      index=["Joe", "Steve", "Wanda", "Jill", "Trey"])
people.iloc[2:3, [1, 2]] = np.nan # Add a few NA values
people


# In[21]:


mapping = {"a": "red", "b": "red", "c": "blue",
           "d": "blue", "e": "red", "f" : "orange"}


# In[22]:


by_column = people.groupby(mapping, axis="columns")
by_column.sum()


# In[23]:


map_series = pd.Series(mapping)
map_series
people.groupby(map_series, axis="columns").count()


# In[24]:


people.groupby(len).sum()


# In[25]:


key_list = ["one", "one", "one", "two", "two"]
people.groupby([len, key_list]).min()


# In[26]:


columns = pd.MultiIndex.from_arrays([["US", "US", "US", "JP", "JP"],
                                    [1, 3, 5, 1, 3]],
                                    names=["cty", "tenor"])
hier_df = pd.DataFrame(np.random.standard_normal((4, 5)), columns=columns)
hier_df


# In[27]:


hier_df.groupby(level="cty", axis="columns").count()


# In[28]:


df
grouped = df.groupby("key1")
grouped["data1"].nsmallest(2)


# In[29]:


def peak_to_peak(arr):
    return arr.max() - arr.min()
grouped.agg(peak_to_peak)


# In[30]:


grouped.describe()


# In[31]:


tips = pd.read_csv("examples/tips.csv")
tips.head()


# In[32]:


tips["tip_pct"] = tips["tip"] / tips["total_bill"]
tips.head()


# In[33]:


grouped = tips.groupby(["day", "smoker"])


# In[34]:


grouped_pct = grouped["tip_pct"]
grouped_pct.agg("mean")


# In[35]:


grouped_pct.agg(["mean", "std", peak_to_peak])


# In[36]:


grouped_pct.agg([("average", "mean"), ("stdev", np.std)])


# In[37]:


functions = ["count", "mean", "max"]
result = grouped[["tip_pct", "total_bill"]].agg(functions)
result


# In[38]:


result["tip_pct"]


# In[39]:


ftuples = [("Average", "mean"), ("Variance", np.var)]
grouped[["tip_pct", "total_bill"]].agg(ftuples)


# In[40]:


grouped.agg({"tip" : np.max, "size" : "sum"})
grouped.agg({"tip_pct" : ["min", "max", "mean", "std"],
             "size" : "sum"})


# In[41]:


grouped = tips.groupby(["day", "smoker"], as_index=False)
grouped.mean(numeric_only=True)


# In[42]:


def top(df, n=5, column="tip_pct"):
    return df.sort_values(column, ascending=False)[:n]
top(tips, n=6)


# In[43]:


tips.groupby("smoker").apply(top)


# In[44]:


tips.groupby(["smoker", "day"]).apply(top, n=1, column="total_bill")


# In[45]:


result = tips.groupby("smoker")["tip_pct"].describe()
result
result.unstack("smoker")


# In[46]:


tips.groupby("smoker", group_keys=False).apply(top)


# In[47]:


frame = pd.DataFrame({"data1": np.random.standard_normal(1000),
                      "data2": np.random.standard_normal(1000)})
frame.head()
quartiles = pd.cut(frame["data1"], 4)
quartiles.head(10)


# In[48]:


def get_stats(group):
    return pd.DataFrame(
        {"min": group.min(), "max": group.max(),
        "count": group.count(), "mean": group.mean()}
    )

grouped = frame.groupby(quartiles)
grouped.apply(get_stats)


# In[49]:


grouped.agg(["min", "max", "count", "mean"])


# In[50]:


quartiles_samp = pd.qcut(frame["data1"], 4, labels=False)
quartiles_samp.head()
grouped = frame.groupby(quartiles_samp)
grouped.apply(get_stats)


# In[51]:


s = pd.Series(np.random.standard_normal(6))
s[::2] = np.nan
s
s.fillna(s.mean())


# In[52]:


states = ["Ohio", "New York", "Vermont", "Florida",
          "Oregon", "Nevada", "California", "Idaho"]
group_key = ["East", "East", "East", "East",
             "West", "West", "West", "West"]
data = pd.Series(np.random.standard_normal(8), index=states)
data


# In[53]:


data[["Vermont", "Nevada", "Idaho"]] = np.nan
data
data.groupby(group_key).size()
data.groupby(group_key).count()
data.groupby(group_key).mean()


# In[54]:


def fill_mean(group):
    return group.fillna(group.mean())

data.groupby(group_key).apply(fill_mean)


# In[55]:


fill_values = {"East": 0.5, "West": -1}
def fill_func(group):
    return group.fillna(fill_values[group.name])

data.groupby(group_key).apply(fill_func)


# In[56]:


suits = ["H", "S", "C", "D"]  # Hearts, Spades, Clubs, Diamonds
card_val = (list(range(1, 11)) + [10] * 3) * 4
base_names = ["A"] + list(range(2, 11)) + ["J", "K", "Q"]
cards = []
for suit in suits:
    cards.extend(str(num) + suit for num in base_names)

deck = pd.Series(card_val, index=cards)


# In[57]:


deck.head(13)


# In[58]:


def draw(deck, n=5):
    return deck.sample(n)
draw(deck)


# In[59]:


def get_suit(card):
    # last letter is suit
    return card[-1]

deck.groupby(get_suit).apply(draw, n=2)


# In[60]:


deck.groupby(get_suit, group_keys=False).apply(draw, n=2)


# In[61]:


df = pd.DataFrame({"category": ["a", "a", "a", "a",
                                "b", "b", "b", "b"],
                   "data": np.random.standard_normal(8),
                   "weights": np.random.uniform(size=8)})
df


# In[62]:


grouped = df.groupby("category")
def get_wavg(group):
    return np.average(group["data"], weights=group["weights"])

grouped.apply(get_wavg)


# In[63]:


close_px = pd.read_csv("examples/stock_px.csv", parse_dates=True,
                       index_col=0)
close_px.info()
close_px.tail(4)


# In[64]:


def spx_corr(group):
    return group.corrwith(group["SPX"])


# In[65]:


rets = close_px.pct_change().dropna()


# In[66]:


def get_year(x):
    return x.year

by_year = rets.groupby(get_year)
by_year.apply(spx_corr)


# In[67]:


def corr_aapl_msft(group):
    return group["AAPL"].corr(group["MSFT"])
by_year.apply(corr_aapl_msft)


# In[68]:


import statsmodels.api as sm
def regress(data, yvar=None, xvars=None):
    Y = data[yvar]
    X = data[xvars]
    X["intercept"] = 1.
    result = sm.OLS(Y, X).fit()
    return result.params


# In[69]:


by_year.apply(regress, yvar="AAPL", xvars=["SPX"])


# In[70]:


df = pd.DataFrame({'key': ['a', 'b', 'c'] * 4,
                   'value': np.arange(12.)})
df


# In[71]:


g = df.groupby('key')['value']
g.mean()


# In[72]:


def get_mean(group):
    return group.mean()
g.transform(get_mean)


# In[73]:


g.transform('mean')


# In[74]:


def times_two(group):
    return group * 2
g.transform(times_two)


# In[75]:


def get_ranks(group):
    return group.rank(ascending=False)
g.transform(get_ranks)


# In[76]:


def normalize(x):
    return (x - x.mean()) / x.std()


# In[77]:


g.transform(normalize)
g.apply(normalize)


# In[78]:


g.transform('mean')
normalized = (df['value'] - g.transform('mean')) / g.transform('std')
normalized


# In[79]:


tips.head()
tips.pivot_table(index=["day", "smoker"],
                 values=["size", "tip", "tip_pct", "total_bill"])


# In[80]:


tips.pivot_table(index=["time", "day"], columns="smoker",
                 values=["tip_pct", "size"])


# In[81]:


tips.pivot_table(index=["time", "day"], columns="smoker",
                 values=["tip_pct", "size"], margins=True)


# In[82]:


tips.pivot_table(index=["time", "smoker"], columns="day",
                 values="tip_pct", aggfunc=len, margins=True)


# In[83]:


tips.pivot_table(index=["time", "size", "smoker"], columns="day",
                 values="tip_pct", fill_value=0)


# In[84]:


from io import StringIO
data = """Sample  Nationality  Handedness
1   USA  Right-handed
2   Japan    Left-handed
3   USA  Right-handed
4   Japan    Right-handed
5   Japan    Left-handed
6   Japan    Right-handed
7   USA  Right-handed
8   USA  Left-handed
9   Japan    Right-handed
10  USA  Right-handed"""
data = pd.read_table(StringIO(data), sep="\s+")


# In[85]:


data


# In[86]:


pd.crosstab(data["Nationality"], data["Handedness"], margins=True)


# In[87]:


pd.crosstab([tips["time"], tips["day"]], tips["smoker"], margins=True)


# In[88]:


# In[89]:


pd.options.display.max_rows = PREVIOUS_MAX_ROWS