"""Walkthrough: relational joins in pandas and fuzzy name matching.

The script first demonstrates ``pd.merge`` on small hand-built tables, then
loads ``restaurants.csv`` (the code below uses its ``id``, ``name`` and
``cluster`` columns), builds every pair of distinct restaurants, scores each
pair by the Levenshtein distance / ratio of the two names, and plots
precision and recall of "same cluster" predictions for a range of thresholds.

Bare expressions such as ``Students`` or ``resto[:10]`` only display a value
when the code is run cell by cell in a notebook; in a plain script they are
no-ops.
"""
import Levenshtein as L
import pandas as pd

# --- Joining small tables ---------------------------------------------------
Students = pd.DataFrame({'student_id': [1, 2], 'name': ['Alice', 'Bob']})
Students

Grades = pd.DataFrame({
    'student_id': [1, 1, 2, 2],
    'class_id': [1, 2, 1, 3],
    'grade': ['A', 'C', 'B', 'B'],
})
Grades

pd.merge(Students, Grades, on='student_id')

Classes = pd.DataFrame({
    'class_id': [1, 2, 3],
    'title': ['Math', 'English', 'Spanish'],
})

pd.merge(pd.merge(Students, Grades, on='student_id'), Classes, on='class_id')

# --- Restaurant data --------------------------------------------------------
resto = pd.read_csv('restaurants.csv')
resto.info()
resto[:10]

# Ground truth: self-join on `cluster` pairs up rows that share a cluster.
clusters = pd.merge(resto, resto, on='cluster')
# Drop pairs of a row with itself ...
clusters = clusters[clusters.id_x != clusters.id_y]
clusters[:10]
# ... and keep each unordered pair only once.
clusters = clusters[clusters.id_x < clusters.id_y]
clusters[:10]

# Cross product of `resto` with itself, built by joining on a constant column.
resto['dummy'] = 0
prod = pd.merge(resto, resto, on='dummy')

# Clean up
del prod['dummy']
del resto['dummy']

# Show that prod is the size of "resto" squared:
# (formatted print call: same output under the Python 2 print statement and
# the Python 3 print function)
print('%d %d' % (len(prod), len(resto) ** 2))
prod[:10]

# Keep each unordered pair of distinct restaurants once.
prod = prod[prod.id_x < prod.id_y]
prod[:10]

# --- Edit distance between names --------------------------------------------
L.distance('Hello, World!', 'Hallo, World!')

# Iterate the two name columns in lockstep instead of DataFrame.apply(axis=1),
# which builds a Series object for every row and is much slower.
prod['distance'] = [
    L.distance(name_x, name_y)
    for name_x, name_y in zip(prod['name_x'], prod['name_y'])
]

# `%matplotlib inline` is an IPython magic, not Python syntax. Invoke it
# through the IPython API when running under IPython and skip it otherwise.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

import pylab


def _precision_recall(similar):
    """Score a set of candidate pairs against the ground-truth `clusters`.

    Args:
        similar: Subset of `prod` rows predicted to be the same restaurant;
            must have `cluster_x` and `cluster_y` columns.

    Returns:
        Tuple ``(precision, recall)`` of floats. Precision is the share of
        rows in `similar` whose two clusters are equal; recall is that same
        count divided by the number of rows in the module-level `clusters`
        frame. A metric whose denominator is zero is returned as NaN instead
        of raising ZeroDivisionError.
    """
    undefined = float('nan')
    correct = float((similar.cluster_x == similar.cluster_y).sum())
    precision = correct / len(similar) if len(similar) else undefined
    recall = correct / len(clusters) if len(clusters) else undefined
    return (precision, recall)


def accuracy(max_distance):
    """Return (precision, recall) when pairs with distance < max_distance match."""
    return _precision_recall(prod[prod.distance < max_distance])


thresholds = range(1, 11)
scores = [accuracy(t) for t in thresholds]
p = [score[0] for score in scores]
r = [score[1] for score in scores]

# Each chart gets its own figure so that, when run as a plain script, the
# charts are not drawn on top of each other on one set of axes.
pylab.figure()
pylab.plot(thresholds, p)
pylab.plot(thresholds, r)
pylab.legend(['precision', 'recall'], loc=2)

pylab.figure()
pylab.scatter(p, r)
# Top-right of chart is "good" because precision and recall are both high (close to 1)

# --- Similarity ratio between names -----------------------------------------
prod['ratio'] = [
    L.ratio(name_x, name_y)
    for name_x, name_y in zip(prod['name_x'], prod['name_y'])
]
prod[:10]


def accuracy_ratio(min_ratio):
    """Return (precision, recall) when pairs with ratio > min_ratio match."""
    return _precision_recall(prod[prod.ratio > min_ratio])


# Use the actual ratio thresholds (0.1 .. 0.9) so the x-axis of the line chart
# shows the values that were applied, not the integers 1 .. 9.
thresholds = [t / 10.0 for t in range(1, 10)]
scores = [accuracy_ratio(t) for t in thresholds]
p = [score[0] for score in scores]
r = [score[1] for score in scores]

pylab.figure()
pylab.plot(thresholds, p)
pylab.plot(thresholds, r)
pylab.legend(['precision', 'recall'], loc=2)

pylab.figure()
pylab.scatter(p, r)
# Ratio appears to do a little better than distance---it achieves higher precision at reasonable levels of recall (say >0.8)