```python
# By convention
import pandas as pd
```

```python
%%time
dish_data = '/Users/libraries/Dropbox/NYPL_Data/2013_12_18_11_50_34_data/Dish.csv'
dish_df = pd.read_csv(dish_data, index_col='id')
```

```python
%%time
menu_item_data = '/Users/libraries/Dropbox/NYPL_Data/2013_12_18_11_50_34_data/MenuItem.csv'
item_df = pd.read_csv(menu_item_data, index_col='dish_id')
```

```python
dish_df[:5]
```

```python
item_df[:5]
```

```python
dish_df
```

```python
unique_values_t0 = len(dish_df.name.unique())
print unique_values_t0
```

```python
# Drop the 'description' column because it's empty and taking up screen real estate
del dish_df['description']
```

```python
def strip_and_lower(x):
    """
    Basic string normalization:
    1) strip leading and trailing whitespace
    2) convert to lowercase
    3) normalize internal whitespace (collapse extra spaces between tokens)
    """
    tokens = x.strip().lower().split()
    return ' '.join(tokens)
```

```python
# Apply the function to every value of 'name' using map() rather than,
# for instance, an explicit loop
dish_df['name_modified'] = dish_df['name'].map(strip_and_lower)
```

```python
dish_df[:5]
```

```python
len(dish_df.name_modified.unique())
```

```python
import re

def fingerprint(x):
    """
    A modified version of the fingerprint clustering algorithm implemented by
    OpenRefine. See https://github.com/OpenRefine/OpenRefine/wiki/Clustering-In-Depth

    This does not normalize to ASCII characters, since diacritics may be
    significant in this dataset.
    """
    # Split on non-word characters; filter() drops the empty strings
    alphanumeric_tokens = filter(None, re.split(r'\W', x))
    # Deduplicate the tokens, sort them, and join them into a single string
    return ''.join(sorted(set(alphanumeric_tokens)))
```

```python
dish_df['fingerprint'] = dish_df['name_modified'].map(fingerprint)
```

```python
dish_df[:5]
```

```python
clusters = dish_df.groupby('fingerprint')
```

```python
# Order the clusters by size
s = clusters.size()
s.order()
```

```python
s.value_counts()
```

```python
# Print the five largest clusters, excluding the very largest
# (the last item of the ordered series), since its fingerprint value is blank
for f in s.order()[-6:-1].index:
    print dish_df[dish_df.fingerprint == f].name
```

```python
testing_cluster = dish_df[dish_df.fingerprint == 'augratinpotatoes']
for item in testing_cluster.index.tolist():
    url = "http://menus.nypl.org/dishes/{0}".format(item)
    print url
```

```python
# Get a DataFrame consisting of all the rows that share a common fingerprint
result_df = dish_df[dish_df.fingerprint == 'frenchfriedpotatoes']
```

```python
# Select the times_appeared column (a Series) and sort it in descending order
by_appearance = result_df.times_appeared.order(ascending=False)
```

```python
# Use the index of the max value to look up the most common value for the name
print "Most common value: '{0}'".format(dish_df.name[by_appearance.idxmax()])
```

```python
result_df.times_appeared[:5]
```

```python
total_appearances = result_df.times_appeared.sum()
print "French fried potatoes appear {0} times in the menus digitized by NYPL.".format(total_appearances)
```
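The lookup we just did by hand generalizes to any cluster. Here is a minimal sketch of a helper that wraps those steps; the name `summarize_cluster` is an invention for illustration, not part of the original analysis, and it assumes `dish_df` already has the `fingerprint` column computed above:

```python
def summarize_cluster(df, fp):
    """Return (most common name, total appearances) for one fingerprint cluster.

    A convenience sketch: assumes df has 'fingerprint', 'name', and
    'times_appeared' columns, as dish_df does at this point.
    """
    cluster = df[df.fingerprint == fp]
    # idxmax() gives the index label of the most frequent variant
    canonical = cluster['name'][cluster.times_appeared.idxmax()]
    total = cluster.times_appeared.sum()
    return canonical, total

print summarize_cluster(dish_df, 'frenchfriedpotatoes')
```

Calling this with other fingerprints from the large clusters listed earlier is a quick way to spot-check what the canonical name of each cluster would be.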
```python
# It turns out we can't use min() as straightforwardly as max() to find the
# first appearance, because some dishes apparently have 0 for first_appeared.
# Exclude the zero values before taking the minimum.
aggregate_first_appeared = result_df.first_appeared[result_df.first_appeared > 0].min()
aggregate_last_appeared = result_df.last_appeared.max()
print "French fried potatoes first appeared on a menu in {0} and last appeared on a menu in {1}"\
    .format(aggregate_first_appeared, aggregate_last_appeared)
```

```python
# Load data from the other CSV files included in the NYPL's regular data dumps,
# because we'll need all of it
page_data = '/Users/libraries/Dropbox/NYPL_Data/2013_12_18_11_50_34_data/MenuPage.csv'
page_df = pd.read_csv(page_data, index_col='id')

# Dates are parsed manually below, so there's no need for parse_dates here
menu_data = '/Users/libraries/Dropbox/NYPL_Data/2013_12_18_11_50_34_data/Menu.csv'
menu_df = pd.read_csv(menu_data, index_col='id')
```

```python
# This should create a DataFrame from the Menu Items data set that contains
# only "rows" related to our example cluster of 'French fried potato' dishes
lookup_items_df = item_df[item_df.index.isin(result_df.index)]
```

```python
# The length of this DataFrame should be the same as the sum of occurrences
# we calculated above
try:
    assert len(lookup_items_df) == result_df.times_appeared.sum()
    print "Length of the DataFrame is consistent"
except AssertionError:
    print "Something's wrong"
```

```python
lookup_items_df.head(10)
```

```python
target_pages = lookup_items_df.menu_page_id.values
lookup_pages_df = page_df[page_df.index.isin(target_pages)]
```

```python
lookup_pages_df.head(10)
```

```python
target_menus = lookup_pages_df.menu_id.values
# Take a copy so we can add a column without modifying a view of menu_df
lookup_menus_df = menu_df[menu_df.index.isin(target_menus)].copy()
```

```python
lookup_menus_df.head(10)
```

```python
import dateutil.parser

def extract_year(x):
    try:
        parsed = dateutil.parser.parse(x)
        return parsed.year
    except (TypeError, ValueError):
        # Missing or unparseable dates become None
        return None
```

```python
lookup_menus_df['date_year'] = lookup_menus_df['date'].map(extract_year)
```

```python
%matplotlib inline
by_year = lookup_menus_df.groupby(lookup_menus_df['date_year'])
year_series = by_year.size()
year_series.plot(figsize=(20, 10))
```

```python
# Extract years in the complete DataFrame for menus
menu_df['date_year'] = menu_df['date'].map(extract_year)

# Group by year and calculate the number of menus held for each year
total_menu_by_year = menu_df.groupby(menu_df['date_year'])
menu_total_series = total_menu_by_year.size()
```

```python
# Normalize: for each year, divide the number of menus mentioning the dish
# by the total number of menus held for that year
normed = [year_series[date_year] / menu_total_series[date_year].astype(float)
          for date_year in year_series.index.tolist()]
pd.Series(normed, index=year_series.index).plot(figsize=(20, 10))
```
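As an aside, the list comprehension above can be replaced with plain Series arithmetic. This is an equivalent sketch rather than the approach used above: dividing one Series by another aligns the two on their year indexes, and `dropna()` discards the years where no menu mentions the dish.

```python
# Series division aligns on the shared year index; astype(float) avoids
# Python 2 integer division. Years present only in menu_total_series
# come out as NaN and are dropped.
normed_series = (year_series / menu_total_series.astype(float)).dropna()
normed_series.plot(figsize=(20, 10))
```

Letting pandas handle the index alignment avoids the explicit loop over `year_series.index` and keeps the result as a properly labeled Series from the start.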