```python
# By convention
import pandas as pd
```

```python
%%time
dish_data = '/Users/libraries/Dropbox/NYPL_Data/2013_12_18_11_50_34_data/Dish.csv'
dish_df = pd.read_csv(dish_data, index_col='id')
```

```python
%%time
menu_item_data = '/Users/libraries/Dropbox/NYPL_Data/2013_12_18_11_50_34_data/MenuItem.csv'
item_df = pd.read_csv(menu_item_data, index_col='dish_id')
```

```python
dish_df[:5]
```

```python
item_df[:5]
```

```python
dish_df
```

```python
unique_values_t0 = len(dish_df.name.unique())
print unique_values_t0
```

```python
# Drop the 'description' column because it's empty and taking up screen real estate
del dish_df['description']
```

```python
def strip_and_lower(x):
    """
    Basic string normalization:
    1) strip leading and trailing whitespace
    2) convert to lowercase
    3) normalize internal whitespace (collapse extra spaces between tokens)
    """
    tokens = x.strip().lower().split()
    return ' '.join(tokens)
```

```python
# Apply the function to every value of 'name' using map() rather than,
# for instance, an explicit loop
dish_df['name_modified'] = dish_df['name'].map(strip_and_lower)
```

```python
dish_df[:5]
```

```python
len(dish_df.name_modified.unique())
```

```python
import re

def fingerprint(x):
    """
    A modified version of the fingerprint clustering algorithm implemented by
    OpenRefine. See https://github.com/OpenRefine/OpenRefine/wiki/Clustering-In-Depth

    This does not normalize to ASCII characters, since diacritics may be
    significant in this dataset.
    """
    # Split on non-word characters; filter() drops the empty strings
    alphanumeric_tokens = filter(None, re.split(r'\W', x))
    # Deduplicate the tokens, sort them, and join them into a single string
    return ''.join(sorted(set(alphanumeric_tokens)))
```

```python
dish_df['fingerprint'] = dish_df['name_modified'].map(fingerprint)
```

```python
dish_df[:5]
```

```python
clusters = dish_df.groupby('fingerprint')
```

```python
# Order the clusters by size
s = clusters.size()
s.order()
```

```python
s.value_counts()
```

```python
# Print the five largest clusters, excluding the very largest
# (the last item of the ordered series), since its fingerprint value is blank
for f in s.order()[-6:-1].index:
    print dish_df[dish_df.fingerprint == f].name
```

```python
testing_cluster = dish_df[dish_df.fingerprint == 'augratinpotatoes']
for item in testing_cluster.index.tolist():
    url = "http://menus.nypl.org/dishes/{0}".format(item)
    print url
```

```python
# Get a DataFrame consisting of all the rows that share a common fingerprint
result_df = dish_df[dish_df.fingerprint == 'frenchfriedpotatoes']
```

```python
# Select the times_appeared column (a Series) and sort it in descending order
by_appearance = result_df.times_appeared.order(ascending=False)
```

```python
# Use the index of the max value to look up the most common value for the name
print "Most common value: '{0}'".format(dish_df.name[by_appearance.idxmax()])
```

```python
result_df.times_appeared[:5]
```

```python
total_appearances = result_df.times_appeared.sum()
print "French fried potatoes appear {0} times in the menus digitized by NYPL.".format(total_appearances)
```
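The lookup we just did by hand generalizes to any cluster. Here is a minimal sketch of a helper that wraps those steps; the name `summarize_cluster` is an invention for illustration, not part of the original analysis, and it assumes `dish_df` already has the `fingerprint` column computed above:

```python
def summarize_cluster(df, fp):
    """Return (most common name, total appearances) for one fingerprint cluster.

    A convenience sketch: assumes df has 'fingerprint', 'name', and
    'times_appeared' columns, as dish_df does at this point.
    """
    cluster = df[df.fingerprint == fp]
    # idxmax() gives the index label of the most frequent variant
    canonical = cluster['name'][cluster.times_appeared.idxmax()]
    total = cluster.times_appeared.sum()
    return canonical, total

print summarize_cluster(dish_df, 'frenchfriedpotatoes')
```

Calling this with other fingerprints from the large clusters listed earlier is a quick way to spot-check what the canonical name of each cluster would be.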
```python
# It turns out we can't use min() as straightforwardly as max() to find the
# first appearance, because some dishes apparently have 0 for first_appeared.
# Exclude the zero values before taking the minimum.
aggregate_first_appeared = result_df.first_appeared[result_df.first_appeared > 0].min()
aggregate_last_appeared = result_df.last_appeared.max()
print "French fried potatoes first appeared on a menu in {0} and last appeared on a menu in {1}"\
    .format(aggregate_first_appeared, aggregate_last_appeared)
```

```python
# Load data from the other CSV files included in the NYPL's regular data dumps,
# because we'll need all of it
page_data = '/Users/libraries/Dropbox/NYPL_Data/2013_12_18_11_50_34_data/MenuPage.csv'
page_df = pd.read_csv(page_data, index_col='id')

# Dates are parsed manually below, so there's no need for parse_dates here
menu_data = '/Users/libraries/Dropbox/NYPL_Data/2013_12_18_11_50_34_data/Menu.csv'
menu_df = pd.read_csv(menu_data, index_col='id')
```

```python
# This should create a DataFrame from the Menu Items data set that contains
# only "rows" related to our example cluster of 'French fried potato' dishes
lookup_items_df = item_df[item_df.index.isin(result_df.index)]
```

```python
# The length of this DataFrame should be the same as the sum of occurrences
# we calculated above
try:
    assert len(lookup_items_df) == result_df.times_appeared.sum()
    print "Length of the DataFrame is consistent"
except AssertionError:
    print "Something's wrong"
```

```python
lookup_items_df.head(10)
```

```python
target_pages = lookup_items_df.menu_page_id.values
lookup_pages_df = page_df[page_df.index.isin(target_pages)]
```

```python
lookup_pages_df.head(10)
```

```python
target_menus = lookup_pages_df.menu_id.values
# Take a copy so we can add a column without modifying a view of menu_df
lookup_menus_df = menu_df[menu_df.index.isin(target_menus)].copy()
```

```python
lookup_menus_df.head(10)
```

```python
import dateutil.parser

def extract_year(x):
    try:
        parsed = dateutil.parser.parse(x)
        return parsed.year
    except (TypeError, ValueError):
        # Missing or unparseable dates become None
        return None
```

```python
lookup_menus_df['date_year'] = lookup_menus_df['date'].map(extract_year)
```

```python
%matplotlib inline
by_year = lookup_menus_df.groupby(lookup_menus_df['date_year'])
year_series = by_year.size()
year_series.plot(figsize=(20, 10))
```

```python
# Extract years in the complete DataFrame for menus
menu_df['date_year'] = menu_df['date'].map(extract_year)

# Group by year and calculate the number of menus held for each year
total_menu_by_year = menu_df.groupby(menu_df['date_year'])
menu_total_series = total_menu_by_year.size()
```

```python
# Normalize: for each year, divide the number of menus mentioning the dish
# by the total number of menus held for that year
normed = [year_series[date_year] / menu_total_series[date_year].astype(float)
          for date_year in year_series.index.tolist()]
pd.Series(normed, index=year_series.index).plot(figsize=(20, 10))
```
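As an aside, the list comprehension above can be replaced with plain Series arithmetic. This is an equivalent sketch rather than the approach used above: dividing one Series by another aligns the two on their year indexes, and `dropna()` discards the years where no menu mentions the dish.

```python
# Series division aligns on the shared year index; astype(float) avoids
# Python 2 integer division. Years present only in menu_total_series
# come out as NaN and are dropped.
normed_series = (year_series / menu_total_series.astype(float)).dropna()
normed_series.plot(figsize=(20, 10))
```

Letting pandas handle the index alignment avoids the explicit loop over `year_series.index` and keeps the result as a properly labeled Series from the start.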