import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
# %matplotlib inline
from wrangling import authors, stories, favourite_authors, favourite_stories, genres, categories
I need to calculate a similarity measure between two authors/users.
Since I care about story preference, I'll use the Jaccard Index on favourite story sets.
def jaccard_index(set1, set2):
    """Return the Jaccard index |A & B| / |A | B| of two collections.

    Parameters
    ----------
    set1, set2 : iterable of hashable items
        The collections to compare.  Any iterable is accepted; each one is
        converted to a ``set`` first, so duplicates are ignored.

    Returns
    -------
    float
        A value in ``[0.0, 1.0]``.  Two empty collections have no defined
        ratio, so ``0.0`` is returned instead of dividing by zero.
    """
    first, second = set(set1), set(set2)
    u_count = len(first | second)
    if u_count == 0:
        # Both inputs are empty: avoid ZeroDivisionError.
        return 0.0
    i_count = len(first & second)
    # float() keeps true division even under Python 2 semantics.
    return i_count / float(u_count)
Comparing each author's stories (both written and favourited) with my favourite stories
def author_similarity(author, my_stories):
    """Return the Jaccard similarity between an author's stories and mine.

    The author's story set is the union of the stories they wrote (rows of
    ``stories`` whose "author" column equals the author's id) and the stories
    they favourited (looked up in ``favourite_stories``).

    Parameters
    ----------
    author : pandas.Series
        One row of ``authors`` as passed by ``DataFrame.apply(axis=1)``;
        ``author.name`` is that row's index label (the author id).
    my_stories : iterable
        Story ids to compare against.

    Returns
    -------
    float
        The Jaccard index of the two story sets.
    """
    author_id = author.name
    author_stories = set(stories[stories["author"] == author_id].index)
    if author_id in favourite_stories:
        # `.loc` is the label-based replacement for the removed `.ix`.
        # NOTE(review): assumes the lookup yields a collection of story ids
        # (not a single string id) -- confirm against `wrangling`.
        author_favourites = set(favourite_stories.loc[author_id])
    else:
        # Author has no recorded favourites.
        author_favourites = set()
    all_stories = author_stories.union(author_favourites)
    return jaccard_index(all_stories, set(my_stories))
# Ids of my favourite stories; they are the reference set every author is
# compared against.
my_fav_stories = [
    "8096183",  # Harry Potter and the Natural 20
    "9794740",  # Pokemon: The Origin of Species
    "9311012",  # Lighting up the Dark
    "5782108",  # Harry Potter and the Methods of Rationality
    "7354757",  # The Game of Champions
    "5193644",  # Time Braid
    "3695087",  # Larceny, Lechery and Luna Lovegood
    "9669819",  # The Two Year Emperor
]

# Score every author row by how much their story set overlaps with mine.
authors["similarity"] = authors.apply(
    author_similarity, axis=1, args=(my_fav_stories,)
)

# Show the five most similar authors.  `DataFrame.sort` was removed from
# pandas; `sort_values` is its replacement.
authors.sort_values("similarity", ascending=False)[:5]
name | similarity | |
---|---|---|
author | ||
4976703 | alexanderwales | 0.384615 |
5118664 | daystar721 | 0.333333 |
3989854 | Sir Poley | 0.222222 |
4767519 | Scientist's Thesis | 0.187500 |
3344060 | Velorien | 0.166667 |
Scoring every story by the average similarity of the authors connected to it (its writer plus everyone who favourited it)
# sim_total: the sum of the similarity of every author who has favourited
# this story + the writer's similarity.  Start with the writer's similarity.
# `.loc` is the label-based replacement for the removed `.ix`; `.values`
# drops the author index so the assignment is positional, row for row.
stories["sim_total"] = authors.loc[stories["author"], "similarity"].values

# sim_count: the total number of times this story has been favourited
# + written (1).
stories["sim_count"] = 1

for author_id, author in authors.iterrows():
    author_favs = favourite_stories.get(author_id)
    if author_favs is None:
        # No favourites recorded for this author: nothing to add.
        continue
    # NOTE(review): assumes every favourited story id is present in
    # `stories.index`; recent pandas raises KeyError from `.loc` otherwise.
    stories.loc[author_favs, "sim_total"] += author["similarity"]
    stories.loc[author_favs, "sim_count"] += 1

# Mean similarity of the authors associated with each story.
stories["sim_score"] = stories["sim_total"].div(stories["sim_count"])

# Top five stories by mean similarity (`sort_values` replaces the removed
# `DataFrame.sort`).
stories.sort_values("sim_score", ascending=False)[:5][["title", "sim_total", "sim_count", "sim_score"]]
title | sim_total | sim_count | sim_score | |
---|---|---|---|---|
story | ||||
10327510 | A Bluer Shade of White | 0.384615 | 1 | 0.384615 |
10023949 | Harry Potter and the Philosopher\'s Zombie | 0.717949 | 2 | 0.358974 |
9676374 | Daystar\'s Remix of Rationality | 0.333333 | 1 | 0.333333 |
9794740 | Pokemon: The Origin of Species | 0.967949 | 4 | 0.241987 |
9658524 | Branches on the Tree of Time | 0.469361 | 2 | 0.234681 |
# Top five stories by summed author similarity (`sort_values` replaces the
# removed `DataFrame.sort`).
stories.sort_values("sim_total", ascending=False)[:5][["title", "sim_total", "sim_count", "sim_score"]]
title | sim_total | sim_count | sim_score | |
---|---|---|---|---|
story | ||||
8096183 | Harry Potter and the Natural 20 | 1.573355 | 18 | 0.087409 |
5782108 | Harry Potter and the Methods of Rationality | 1.486540 | 18 | 0.082586 |
9794740 | Pokemon: The Origin of Species | 0.967949 | 4 | 0.241987 |
10023949 | Harry Potter and the Philosopher\'s Zombie | 0.717949 | 2 | 0.358974 |
10360716 | The Metropolitan Man | 0.705458 | 5 | 0.141092 |
# Top five stories by number of associated authors, i.e. the writer plus
# everyone who favourited it (`sort_values` replaces the removed
# `DataFrame.sort`).
stories.sort_values("sim_count", ascending=False)[:5][["title", "sim_total", "sim_count", "sim_score"]]
title | sim_total | sim_count | sim_score | |
---|---|---|---|---|
story | ||||
8096183 | Harry Potter and the Natural 20 | 1.573355 | 18 | 0.087409 |
5782108 | Harry Potter and the Methods of Rationality | 1.486540 | 18 | 0.082586 |
2731239 | Team 8 | 0.193825 | 14 | 0.013845 |
5193644 | Time Braid | 0.683962 | 13 | 0.052612 |
5409165 | It\'s For a Good Cause, I Swear! | 0.058134 | 11 | 0.005285 |