#!/usr/bin/env python
# coding: utf-8

# # Indexing
#
# Selecting subsets of data.

# This is a subset of the beer review data from beeradvocate.com, via [Stanford](https://snap.stanford.edu/data/web-RateBeer.html). It's strangely formatted:
#
# ```
# beer/name: Sausa Weizen
# beer/beerId: 47986
# beer/brewerId: 10325
# beer/ABV: 5.00
# beer/style: Hefeweizen
# review/appearance: 2.5
# review/aroma: 2
# review/palate: 1.5
# review/taste: 1.5
# review/overall: 1.5
# review/time: 1234817823
# review/profileName: stcules
# review/text: A lot of foam. But a lot. In the smell some banana, and then lactic and tart. Not a good start. Quite dark orange in color, with a lively carbonation (now visible, under the foam). Again tending to lactic sourness. Same for the taste. With some yeast and banana.
#
# beer/name: Red Moon
# beer/beerId: 48213
# beer/brewerId: 10325
# beer/ABV: 6.20
# ...
# ```
#
# The dataset was a bit too large to process all at once:
#
# ```bash
# $ wc -l beeradvocate.txt
# 22212596 beeradvocate.txt
# ```
#
# So I parsed it in chunks (`partition` and `partitionby` come from the `toolz` library):
#
# ```python
# import os
#
# import numpy as np
# import pandas as pd
# from toolz import partition, partitionby
#
#
# def format_review(review):
#     # each review is a block of "key: value" lines
#     return dict(map(lambda x: x.strip().split(": ", 1), review))
#
# def as_dataframe(reviews):
#     col_names = {
#         'beer/ABV': 'abv', 'beer/beerId': 'beer_id', 'beer/brewerId': 'brewer_id',
#         'beer/name': 'beer_name', 'beer/style': 'beer_style',
#         'review/appearance': 'review_appearance', 'review/aroma': 'review_aroma',
#         'review/overall': 'review_overall', 'review/palate': 'review_palate',
#         'review/profileName': 'profile_name', 'review/taste': 'review_taste',
#         'review/text': 'text', 'review/time': 'time'
#     }
#     df = pd.DataFrame(list(reviews))
#     numeric = ['abv', 'review_appearance', 'review_aroma',
#                'review_overall', 'review_palate', 'review_taste']
#     df = (df.rename(columns=col_names)
#             .replace('', np.nan))
#     df[numeric] = df[numeric].astype(float)
#     df['time'] = pd.to_datetime(df.time.astype(int), unit='s')
#     return df
#
# def main():
#     with open('beeradvocate.txt') as f:
#         reviews = filter(lambda x: x != ('\n',),
#                          partitionby(lambda x: x == '\n', f))
#         reviews = map(format_review, reviews)
#         reviews = partition(100000, reviews, pad=None)
#         reviews = filter(None, reviews)
#         os.makedirs('beer_reviews', exist_ok=True)
#
#         for i, subset in enumerate(reviews):
#             print(i, end='\r')
#             df = as_dataframe(subset)
#             df.to_csv('beer_reviews/review_%s.csv' % i, index=False)
# ```

# ## Aside: dask
#
# To select the subset we'll work with, about a tenth of the reviews, I used [`dask`](http://dask.readthedocs.org).
# All of those files wouldn't fit in memory at once, but dask can compute quantiles in chunks and aggregate them together:
#
# ```python
#
# In [1]: import dask.dataframe as dd
#
# In [2]: df = dd.read_csv('beer_reviews/*.csv', parse_dates=['time'])
#
# In [3]: cutoffs = df.time.quantiles([.5, .6])
#
# In [4]: %time cutoffs = cutoffs.compute()
# CPU times: user 20.7 s, sys: 8.37 s, total: 29.1 s
# Wall time: 28.2 s
#
# In [5]: %time subset = df[(df.time >= cutoffs[0]) & (df.time <= cutoffs[1])].compute()
# CPU times: user 20.9 s, sys: 7.68 s, total: 28.6 s
# Wall time: 27.5 s
#
# In [6]: subset.to_csv('../notebooks/data/beer_subset.csv', index=False)
# ```
#
# Just writing `cutoffs = df.time.quantiles([.5, .6])` doesn't actually do the computation; instead it builds a dask graph of the work to be done when the result is requested. If that sounds interesting, consider going to Matthew Rocklin's [dask tutorial](https://github.com/ContinuumIO/dask-tutorial/) during the next session.
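# For intuition, here's a minimal sketch of that deferred-execution pattern on a tiny synthetic frame (it assumes `dask` is installed; the data and names here are made up for illustration):
#
# ```python
# import dask.dataframe as dd
# import numpy as np
# import pandas as pd
#
# pdf = pd.DataFrame({'x': np.arange(10)})
# ddf = dd.from_pandas(pdf, npartitions=2)
#
# q = ddf.x.quantile(0.5)   # lazy: just records a task graph, no work yet
# result = q.compute()      # the actual computation happens here
# ```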
# Here's the graph for the quantile computation that produces `cutoffs`:

# In[38]:

from IPython import display

# In[41]:

display.Image('cutoffs.png')

# And the graph for `subset`:

# In[43]:

display.Image('subset.png')

# `subset` fits comfortably in memory, so we can sort it and write it out to CSV. That's what we'll work with for this notebook.

# In[4]:

from __future__ import print_function, division

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

pd.options.display.max_rows = 10

# In[5]:

df = pd.read_csv('data/beer_subset.csv.gz', parse_dates=['time'], compression='gzip')
review_cols = ['review_appearance', 'review_aroma', 'review_overall',
               'review_palate', 'review_taste']
df.head()

# ## Boolean indexing
#
# Like a WHERE clause in SQL. The indexer (or boolean mask) should be 1-dimensional and the same length as the thing being indexed.

# In[6]:

df.abv < 5

# In[7]:

df[df.abv < 5].head()

# Notice that we just used `[]` there. We can pass the boolean indexer in to `.loc` as well.

# In[8]:

df.loc[df.abv < 5, ['beer_style', 'review_overall']]

# Again, you can get complicated: combine masks with `&`, `|`, and `~` (note the parentheses around each condition).

# In[9]:

df[((df.abv < 5) & (df.time > pd.Timestamp('2009-06'))) | (df.review_overall >= 4.5)]

# ### Exercise: Find the IPAs
#
# Select just the rows where the `beer_style` contains IPA.
#
# Hint: look at the docstring for `df.beer_style.str.contains` (in IPython, `df.beer_style.str.contains?`).
# Use that to return a boolean array.

# In[10]:

# Hint: Look at the docstring for `df.beer_style.str.contains`
# Use that to return a boolean array
get_ipython().run_line_magic('pinfo', 'df.beer_style.str.contains')

# In[ ]:

get_ipython().run_line_magic('load', '-r 1:4 solutions_indexing.py')

# This is quite powerful. Any method that returns a boolean array is potentially an indexer.

# ## isin
#
# Useful for checking whether values are contained in a collection.

# In[11]:

brewer = df[['brewer_id', 'beer_id']]
brewer.head()

# In[12]:

brewer_ids = df.brewer_id.value_counts().index[:10]
beer_ids = df.beer_id.value_counts().index[:10]
brewer_ids

# In[13]:

df.brewer_id.isin(brewer_ids)

# `DataFrame.isin()` can take a dictionary mapping column names to collections.

# In[14]:

to_find = {
    'brewer_id': brewer_ids,
    'beer_id': beer_ids
}
brewer.isin(to_find)

# The result of `DataFrame.isin` is always the same shape as the input.
# Use `.any` or `.all` if you intend to index with the result.

# In[15]:

brewer[brewer.isin(to_find).all(axis='columns')]

# ### Exercise: Find a subset of beer styles
#
# Find the rows where the beer style is either 'American IPA' or 'Pilsner'.

# In[ ]:


# In[13]:

get_ipython().run_line_magic('load', '-r 5:7 solutions_indexing.py')

# ### Exercise: High Marks
#
# Select the rows where the scores of the 5 `review_cols` ('review_appearance', 'review_aroma', 'review_overall', 'review_palate', 'review_taste') are *all* at least 4.0.
#
# Hint: like NumPy arrays, DataFrames have `any` and `all` methods that check whether any or all of the values are True. These methods take an `axis` argument for the dimension to remove:
#
# - `0` or `'index'` removes (aggregates over) the vertical dimension
# - `1` or `'columns'` removes (aggregates over) the horizontal dimension

# In[16]:

review_cols

# In[ ]:

# your code goes here

# In[95]:

get_ipython().run_line_magic('load', '-r 9:20 solutions_indexing.py')

# ### Exercise: Pretty Good
#
# Select the rows where the *average* of the 5 `review_cols` ('review_appearance', 'review_aroma', 'review_overall', 'review_palate', 'review_taste') is at least 4.
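# To see how the `axis` argument behaves without giving the exercises away, here's a minimal sketch on a toy frame (`toy` and its columns are made up for illustration, not part of the beer data):

# In[ ]:

toy = pd.DataFrame({'a': [1, 4, 4], 'b': [4, 4, 2]})

(toy >= 4).all(axis='columns')  # per row: are both columns at least 4?
(toy >= 4).any(axis='index')    # per column: is any value at least 4?
toy.mean(axis='columns')        # per-row average, one value per row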
# In[ ]:


# In[97]:

get_ipython().run_line_magic('load', '-r 21:23 solutions_indexing.py')

# ## Hierarchical Indexing
#
# One of the most powerful, and most complicated, features of pandas.
# It lets you represent high-dimensional datasets in a two-dimensional table.

# In[17]:

reviews = df.set_index(['profile_name', 'beer_id', 'time'])
reviews.head()

# You'll almost always want to sort your MultiIndex.

# In[18]:

reviews = reviews.sort_index()
reviews.head()

# Internally, a MultiIndex is a collection of pairs of `levels` and `labels`, one pair for each level of the MultiIndex.

# In[19]:

reviews.index.levels[0]

# In[20]:

reviews.index.labels[0]

# ### get_level_values

# In[21]:

top_reviewers = (reviews.index.get_level_values('profile_name')
                        .value_counts()
                        .head(5).index)
top_reviewers

# In[22]:

reviews.loc[top_reviewers, :, :].head()

# The syntax is a bit trickier when you want to specify a row indexer *and* a column indexer: a bare `:` isn't valid Python inside a tuple, so spell it `slice(None)`, or use `pd.IndexSlice` for a more natural slicing syntax.

# In[23]:

reviews.loc[(top_reviewers, 111, slice(None)), ['beer_name', 'brewer_id']]

# In[24]:

reviews.loc[pd.IndexSlice[top_reviewers, 111, :], ['beer_name', 'brewer_id']]

# Be careful with duplicates in the indices.

# In[25]:

reviews.index.is_unique

# In[26]:

dupes = reviews.index.get_duplicates()
dupes

# In[27]:

reviews.loc[dupes]

# In[28]:

reviews[reviews.index.duplicated()]

# In[29]:

reviews = reviews[~reviews.index.duplicated()]
reviews.index.is_unique

# ### Exercise: Select the Top Beers
#
# Use `.loc` to select the `beer_name` and `beer_style` for the 10 most popular beers, as measured by number of reviews.
#
# - Hint: you need the value counts for the `beer_id` level (we did that [earlier](#get_level_values))

# In[ ]:

# %load -r 24:27 solutions_indexing.py
top_beers = reviews.index.get_level_values('beer_id').value_counts().head(10).index

reviews.loc[pd.IndexSlice[:, top_beers], ['beer_name', 'beer_style']]

# ## Pitfalls
#
# ### Chained indexing
#
# Assigning through two indexing operations in a row ("chained indexing") may operate on a copy of the data, so the assignment can silently fail to modify the original frame:

# In[31]:

bad = df.copy()

# In[32]:

bad.loc[df.beer_style.str.contains('IPA')]['beer_name'] = 'yummy'

# In[33]:

bad[df.beer_style.str.contains('IPA')]['beer_name']

# In[34]:

bad.loc[df.beer_style.str.contains('IPA'), 'beer_name']

# Do the row selection and the column selection in a single `.loc` instead:

# In[35]:

bad.loc[df.beer_style.str.contains('IPA'), 'beer_name'] = 'Tasty'
bad.loc[df.beer_style.str.contains('IPA'), 'beer_name']

# ## Recap
#
# - Boolean masks should be 1-dimensional and the same length as the thing being indexed
# - Sort your `MultiIndex`es
# - `isin` + `.any()` or `.all()` for comparing to collections
# - Avoid chained indexing; select and assign in a single `.loc`
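# To tie the recap together, here's a self-contained sketch of those points on a toy frame (synthetic data and made-up names, not the beer set):

# In[ ]:

toy = pd.DataFrame({'group': ['a', 'a', 'b', 'c'],
                    'item': [1, 2, 1, 3],
                    'score': [3.0, 4.5, 4.0, 2.5]})

# a boolean mask: 1-dimensional, same length as the frame
toy[toy.score >= 4.0]

# isin + .all() for row-wise membership against collections
mask = toy[['group', 'item']].isin({'group': ['a'], 'item': [1]})
toy[mask.all(axis='columns')]

# sort a MultiIndex before slicing it
indexed = toy.set_index(['group', 'item']).sort_index()
indexed.loc[pd.IndexSlice[['a', 'b'], 1], :]

# assign through a single .loc call, not chained indexing
toy.loc[toy.score >= 4.0, 'score'] = 5.0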