#!/usr/bin/env python
# coding: utf-8

# # Indexing
#
# Selecting subsets of data.

# This is a subset of the beer review data from beeradvocate.com, via [Stanford](https://snap.stanford.edu/data/web-RateBeer.html). It's strangely formatted:
#
# ```
# beer/name: Sausa Weizen
# beer/beerId: 47986
# beer/brewerId: 10325
# beer/ABV: 5.00
# beer/style: Hefeweizen
# review/appearance: 2.5
# review/aroma: 2
# review/palate: 1.5
# review/taste: 1.5
# review/overall: 1.5
# review/time: 1234817823
# review/profileName: stcules
# review/text: A lot of foam. But a lot. In the smell some banana, and then lactic and tart. Not a good start. Quite dark orange in color, with a lively carbonation (now visible, under the foam). Again tending to lactic sourness. Same for the taste. With some yeast and banana.
#
# beer/name: Red Moon
# beer/beerId: 48213
# beer/brewerId: 10325
# beer/ABV: 6.20
# ...
# ```
#
# The dataset was a bit too large to process all at once:
#
# ```bash
# $ wc -l beeradvocate.txt
# 22212596 beeradvocate.txt
# ```
#
# So I parsed it in chunks (`partition` and `partitionby` come from the `toolz` library):
#
# ```python
# import os
#
# import numpy as np
# import pandas as pd
# from toolz import partition, partitionby
#
#
# def format_review(review):
#     # each review is a block of "key: value" lines
#     return dict(map(lambda x: x.strip().split(": ", 1), review))
#
# def as_dataframe(reviews):
#     col_names = {
#         'beer/ABV': 'abv', 'beer/beerId': 'beer_id', 'beer/brewerId': 'brewer_id',
#         'beer/name': 'beer_name', 'beer/style': 'beer_style',
#         'review/appearance': 'review_appearance', 'review/aroma': 'review_aroma',
#         'review/overall': 'review_overall', 'review/palate': 'review_palate',
#         'review/profileName': 'profile_name', 'review/taste': 'review_taste',
#         'review/text': 'text', 'review/time': 'time'
#     }
#     df = pd.DataFrame(list(reviews))
#     numeric = ['abv', 'review_appearance', 'review_aroma',
#                'review_overall', 'review_palate', 'review_taste']
#     df = (df.rename(columns=col_names)
#             .replace('', np.nan))
#     df[numeric] = df[numeric].astype(float)
#     df['time'] = pd.to_datetime(df.time.astype(int), unit='s')
#     return df
#
# def main():
#     with open('beeradvocate.txt') as f:
#         reviews = filter(lambda x: x != ('\n',),
#                          partitionby(lambda x: x == '\n', f))
#         reviews = map(format_review, reviews)
#         reviews = partition(100000, reviews, pad=None)
#         reviews = filter(None, reviews)
#         os.makedirs('beer_reviews', exist_ok=True)
#
#         for i, subset in enumerate(reviews):
#             print(i, end='\r')
#             df = as_dataframe(subset)
#             df.to_csv('beer_reviews/review_%s.csv' % i, index=False)
# ```

# ## Aside: dask
#
# To select the subset we'll work with, about a tenth of the reviews, I used [`dask`](http://dask.readthedocs.org).
# All of those files wouldn't fit in memory at once, but dask can compute quantiles in chunks and aggregate them together:
#
# ```python
#
# In [1]: import dask.dataframe as dd
#
# In [2]: df = dd.read_csv('beer_reviews/*.csv', parse_dates=['time'])
#
# In [3]: cutoffs = df.time.quantiles([.5, .6])
#
# In [4]: %time cutoffs = cutoffs.compute()
# CPU times: user 20.7 s, sys: 8.37 s, total: 29.1 s
# Wall time: 28.2 s
#
# In [5]: %time subset = df[(df.time >= cutoffs[0]) & (df.time <= cutoffs[1])].compute()
# CPU times: user 20.9 s, sys: 7.68 s, total: 28.6 s
# Wall time: 27.5 s
#
# In [6]: subset.to_csv('../notebooks/data/beer_subset.csv', index=False)
# ```
#
# Just writing `cutoffs = df.time.quantiles([.5, .6])` doesn't actually do the computation; instead it builds a dask graph of the work to be done when the result is requested. If that sounds interesting, consider going to Matthew Rocklin's [dask tutorial](https://github.com/ContinuumIO/dask-tutorial/) during the next session.
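# For intuition, here's a minimal sketch of that deferred-execution pattern on a tiny synthetic frame (it assumes `dask` is installed; the data and names here are made up for illustration):
#
# ```python
# import dask.dataframe as dd
# import numpy as np
# import pandas as pd
#
# pdf = pd.DataFrame({'x': np.arange(10)})
# ddf = dd.from_pandas(pdf, npartitions=2)
#
# q = ddf.x.quantile(0.5)   # lazy: just records a task graph, no work yet
# result = q.compute()      # the actual computation happens here
# ```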
# Here's the graph for the quantile computation that produces `cutoffs`:

# In[38]:

from IPython import display

# In[41]:

display.Image('cutoffs.png')

# And the graph for `subset`:

# In[43]:

display.Image('subset.png')

# `subset` fits comfortably in memory, so we can sort it and write it out to CSV. That's what we'll work with for this notebook.

# In[4]:

from __future__ import print_function, division

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

pd.options.display.max_rows = 10

# In[5]:

df = pd.read_csv('data/beer_subset.csv.gz', parse_dates=['time'], compression='gzip')
review_cols = ['review_appearance', 'review_aroma', 'review_overall',
               'review_palate', 'review_taste']
df.head()

# ## Boolean indexing
#
# Like a WHERE clause in SQL. The indexer (or boolean mask) should be 1-dimensional and the same length as the thing being indexed.

# In[6]:

df.abv < 5

# In[7]:

df[df.abv < 5].head()

# Notice that we just used `[]` there. We can pass the boolean indexer in to `.loc` as well.

# In[8]:

df.loc[df.abv < 5, ['beer_style', 'review_overall']]

# Again, you can get complicated: combine masks with `&`, `|`, and `~` (note the parentheses around each condition).

# In[9]:

df[((df.abv < 5) & (df.time > pd.Timestamp('2009-06'))) | (df.review_overall >= 4.5)]

# ### Exercise: Find the IPAs
#
# Select just the rows where the `beer_style` contains IPA.
#
# Hint: look at the docstring for `df.beer_style.str.contains` (in IPython, `df.beer_style.str.contains?`).
# Use that to return a boolean array.

# In[10]:

# Hint: Look at the docstring for `df.beer_style.str.contains`
# Use that to return a boolean array
get_ipython().run_line_magic('pinfo', 'df.beer_style.str.contains')

# In[ ]:

get_ipython().run_line_magic('load', '-r 1:4 solutions_indexing.py')

# This is quite powerful. Any method that returns a boolean array is potentially an indexer.

# ## isin
#
# Useful for checking whether values are contained in a collection.

# In[11]:

brewer = df[['brewer_id', 'beer_id']]
brewer.head()

# In[12]:

brewer_ids = df.brewer_id.value_counts().index[:10]
beer_ids = df.beer_id.value_counts().index[:10]
brewer_ids

# In[13]:

df.brewer_id.isin(brewer_ids)

# `DataFrame.isin()` can take a dictionary mapping column names to collections.

# In[14]:

to_find = {
    'brewer_id': brewer_ids,
    'beer_id': beer_ids
}
brewer.isin(to_find)

# The result of `DataFrame.isin` is always the same shape as the input.
# Use `.any` or `.all` if you intend to index with the result.

# In[15]:

brewer[brewer.isin(to_find).all(axis='columns')]

# ### Exercise: Find a subset of beer styles
#
# Find the rows where the beer style is either 'American IPA' or 'Pilsner'.

# In[ ]:


# In[13]:

get_ipython().run_line_magic('load', '-r 5:7 solutions_indexing.py')

# ### Exercise: High Marks
#
# Select the rows where the scores of the 5 `review_cols` ('review_appearance', 'review_aroma', 'review_overall', 'review_palate', 'review_taste') are *all* at least 4.0.
#
# Hint: like NumPy arrays, DataFrames have `any` and `all` methods that check whether any or all of the values are True. These methods take an `axis` argument for the dimension to remove:
#
# - `0` or `'index'` removes (aggregates over) the vertical dimension
# - `1` or `'columns'` removes (aggregates over) the horizontal dimension

# In[16]:

review_cols

# In[ ]:

# your code goes here

# In[95]:

get_ipython().run_line_magic('load', '-r 9:20 solutions_indexing.py')

# ### Exercise: Pretty Good
#
# Select the rows where the *average* of the 5 `review_cols` ('review_appearance', 'review_aroma', 'review_overall', 'review_palate', 'review_taste') is at least 4.
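# To see how the `axis` argument behaves without giving the exercises away, here's a minimal sketch on a toy frame (`toy` and its columns are made up for illustration, not part of the beer data):

# In[ ]:

toy = pd.DataFrame({'a': [1, 4, 4], 'b': [4, 4, 2]})

(toy >= 4).all(axis='columns')  # per row: are both columns at least 4?
(toy >= 4).any(axis='index')    # per column: is any value at least 4?
toy.mean(axis='columns')        # per-row average, one value per row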
# In[ ]:


# In[97]:

get_ipython().run_line_magic('load', '-r 21:23 solutions_indexing.py')

# ## Hierarchical Indexing
#
# One of the most powerful, and most complicated, features of pandas.
# It lets you represent high-dimensional datasets in a two-dimensional table.

# In[17]:

reviews = df.set_index(['profile_name', 'beer_id', 'time'])
reviews.head()

# You'll almost always want to sort your MultiIndex.

# In[18]:

reviews = reviews.sort_index()
reviews.head()

# Internally, a MultiIndex is a collection of pairs of `levels` and `labels`, one pair for each level of the MultiIndex.

# In[19]:

reviews.index.levels[0]

# In[20]:

reviews.index.labels[0]

# ### get_level_values

# In[21]:

top_reviewers = (reviews.index.get_level_values('profile_name')
                        .value_counts()
                        .head(5).index)
top_reviewers

# In[22]:

reviews.loc[top_reviewers, :, :].head()

# The syntax is a bit trickier when you want to specify a row indexer *and* a column indexer: a bare `:` isn't valid Python inside a tuple, so spell it `slice(None)`, or use `pd.IndexSlice` for a more natural slicing syntax.

# In[23]:

reviews.loc[(top_reviewers, 111, slice(None)), ['beer_name', 'brewer_id']]

# In[24]:

reviews.loc[pd.IndexSlice[top_reviewers, 111, :], ['beer_name', 'brewer_id']]

# Be careful with duplicates in the indices.

# In[25]:

reviews.index.is_unique

# In[26]:

dupes = reviews.index.get_duplicates()
dupes

# In[27]:

reviews.loc[dupes]

# In[28]:

reviews[reviews.index.duplicated()]

# In[29]:

reviews = reviews[~reviews.index.duplicated()]
reviews.index.is_unique

# ### Exercise: Select the Top Beers
#
# Use `.loc` to select the `beer_name` and `beer_style` for the 10 most popular beers, as measured by number of reviews.
#
# - Hint: you need the value counts for the `beer_id` level (we did that [earlier](#get_level_values))

# In[ ]:

# %load -r 24:27 solutions_indexing.py
top_beers = reviews.index.get_level_values('beer_id').value_counts().head(10).index

reviews.loc[pd.IndexSlice[:, top_beers], ['beer_name', 'beer_style']]

# ## Pitfalls
#
# ### Chained indexing
#
# Assigning through two indexing operations in a row ("chained indexing") may operate on a copy of the data, so the assignment can silently fail to modify the original frame:

# In[31]:

bad = df.copy()

# In[32]:

bad.loc[df.beer_style.str.contains('IPA')]['beer_name'] = 'yummy'

# In[33]:

bad[df.beer_style.str.contains('IPA')]['beer_name']

# In[34]:

bad.loc[df.beer_style.str.contains('IPA'), 'beer_name']

# Do the row selection and the column selection in a single `.loc` instead:

# In[35]:

bad.loc[df.beer_style.str.contains('IPA'), 'beer_name'] = 'Tasty'
bad.loc[df.beer_style.str.contains('IPA'), 'beer_name']

# ## Recap
#
# - Boolean masks should be 1-dimensional and the same length as the thing being indexed
# - Sort your `MultiIndex`es
# - `isin` + `.any()` or `.all()` for comparing to collections
# - Avoid chained indexing; select and assign in a single `.loc`
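# To tie the recap together, here's a self-contained sketch of those points on a toy frame (synthetic data and made-up names, not the beer set):

# In[ ]:

toy = pd.DataFrame({'group': ['a', 'a', 'b', 'c'],
                    'item': [1, 2, 1, 3],
                    'score': [3.0, 4.5, 4.0, 2.5]})

# a boolean mask: 1-dimensional, same length as the frame
toy[toy.score >= 4.0]

# isin + .all() for row-wise membership against collections
mask = toy[['group', 'item']].isin({'group': ['a'], 'item': [1]})
toy[mask.all(axis='columns')]

# sort a MultiIndex before slicing it
indexed = toy.set_index(['group', 'item']).sort_index()
indexed.loc[pd.IndexSlice[['a', 'b'], 1], :]

# assign through a single .loc call, not chained indexing
toy.loc[toy.score >= 4.0, 'score'] = 5.0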