# Metafiction¶

## Data Analysis¶

In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
# %matplotlib inline
In [2]:
from wrangling import authors, stories, favourite_authors, favourite_stories, genres, categories

## Similarity¶

I need to calculate a similarity measure between two authors/users.

Since I care about story preference, I'll use the Jaccard Index on favourite story sets.

In [3]:
def jaccard_index(set1, set2):
    """Return the Jaccard index |set1 & set2| / |set1 | set2| of two sets.

    Parameters
    ----------
    set1 : set
        First collection; must provide ``intersection`` and ``union``.
    set2 : iterable
        Second collection, passed to ``set1.intersection`` / ``set1.union``.

    Returns
    -------
    float or int
        A value between 0 and 1. When both inputs are empty the union is
        empty and the integer ``0`` is returned instead of dividing by zero.
    """
    shared = set1.intersection(set2)
    combined = set1.union(set2)
    if len(combined) == 0:
        # Two empty sets: define the similarity as zero.
        return 0
    return len(shared) / float(len(combined))

## Scoring Authors¶

Comparing each author's stories (both written and favourited) with my own favourites

In [4]:
def author_similarity(author, my_stories):
    """Score how closely one author's stories overlap with ``my_stories``.

    The author's story set is the union of the stories they wrote (rows of
    ``stories`` whose "author" column equals the author id) and the stories
    they favourited (their entry in ``favourite_stories``, if any). The score
    is the Jaccard index of that set against ``my_stories``.

    Parameters
    ----------
    author : pandas.Series
        A row of ``authors`` as handed over by ``DataFrame.apply(axis=1)``.
        Only ``author.name`` is read, i.e. the row's index label, which is
        used here as the author id (not the "name" column).
    my_stories : iterable
        Story ids to compare against.

    Returns
    -------
    float or int
        Jaccard index between 0 and 1.
    """
    author_id = author.name

    # Stories written by this author.
    written = set(stories[stories["author"] == author_id].index)

    # Stories favourited by this author; authors without an entry get an empty set.
    if author_id in favourite_stories:
        # `.loc` replaces the `.ix` indexer, which no longer exists in pandas >= 1.0.
        # NOTE(review): if this lookup can return a single scalar id (one
        # favourite only), set() would split it into characters -- confirm
        # the shape of favourite_stories in wrangling.
        favourited = set(favourite_stories.loc[author_id])
    else:
        favourited = set()

    return jaccard_index(written.union(favourited), set(my_stories))
In [5]:
# Ids of my own favourite stories; each author is scored against this list.
my_fav_stories = [
    "8096183",  # Harry Potter and the Natural 20
    "9794740",  # Pokemon: The Origin of Species
    "9311012",  # Lighting up the Dark
    "5782108",  # Harry Potter and the Methods of Rationality
    "7354757",  # The Game of Champions
    "5193644",  # Time Braid
    "3695087",  # Larceny, Lechery and Luna Lovegood
    "9669819",  # The Two Year Emperor
]
In [6]:
# Score every author row against my favourites and store the result as a new column.
authors["similarity"] = authors.apply(
    lambda author_row: author_similarity(author_row, my_fav_stories),
    axis=1,
)
In [7]:
# The five authors whose story sets are most similar to mine.
# `sort_values` replaces `DataFrame.sort`, which current pandas no longer provides.
authors.sort_values("similarity", ascending=False).head(5)
Out[7]:
name similarity
author
4976703 alexanderwales 0.384615
5118664 daystar721 0.333333
3989854 Sir Poley 0.222222
4767519 Scientist's Thesis 0.187500
3344060 Velorien 0.166667

## Scoring Stories¶

Scoring each story by the average similarity of its writer and of every author who has favourited it

In [8]:
# sim_total: the writer's similarity plus the similarity of every author who
# has favourited the story. Start with the writer's share.
# `reindex` replaces the removed `.ix` indexer and leaves NaN for a writer that
# is missing from `authors`.
# NOTE(review): assumes the `authors` index holds unique author ids -- confirm.
stories["sim_total"] = authors["similarity"].reindex(stories["author"]).values

# sim_count: number of contributions to sim_total (1 for the writer, plus one
# per author who favourited the story).
stories["sim_count"] = 1

for author_id, author_row in authors.iterrows():
    author_favs = favourite_stories.get(author_id)
    if author_favs is None:
        # No favourites recorded for this author, so nothing to add.
        continue
    # NOTE(review): assumes every favourited id is present in stories.index -- confirm.
    stories.loc[author_favs, "sim_total"] += author_row["similarity"]
    stories.loc[author_favs, "sim_count"] += 1

# sim_score: mean similarity across the writer and all favouriting authors.
stories["sim_score"] = stories["sim_total"].div(stories["sim_count"])

### Stories by average score¶

In [9]:
# Five stories with the highest average similarity.
# `sort_values` replaces `DataFrame.sort`, which current pandas no longer provides.
stories.sort_values("sim_score", ascending=False).head(5)[["title", "sim_total", "sim_count", "sim_score"]]
Out[9]:
title sim_total sim_count sim_score
story
10327510 A Bluer Shade of White 0.384615 1 0.384615
10023949 Harry Potter and the Philosopher\'s Zombie 0.717949 2 0.358974
9676374 Daystar\'s Remix of Rationality 0.333333 1 0.333333
9794740 Pokemon: The Origin of Species 0.967949 4 0.241987
9658524 Branches on the Tree of Time 0.469361 2 0.234681

### Stories by total similarity¶

In [10]:
# Five stories with the highest summed similarity.
# `sort_values` replaces `DataFrame.sort`, which current pandas no longer provides.
stories.sort_values("sim_total", ascending=False).head(5)[["title", "sim_total", "sim_count", "sim_score"]]
Out[10]:
title sim_total sim_count sim_score
story
8096183 Harry Potter and the Natural 20 1.573355 18 0.087409
5782108 Harry Potter and the Methods of Rationality 1.486540 18 0.082586
9794740 Pokemon: The Origin of Species 0.967949 4 0.241987
10023949 Harry Potter and the Philosopher\'s Zombie 0.717949 2 0.358974
10360716 The Metropolitan Man 0.705458 5 0.141092

### Stories by times favourited¶

In [11]:
# Five stories with the most contributions (writer plus favourites).
# `sort_values` replaces `DataFrame.sort`, which current pandas no longer provides.
stories.sort_values("sim_count", ascending=False).head(5)[["title", "sim_total", "sim_count", "sim_score"]]
Out[11]:
title sim_total sim_count sim_score
story
8096183 Harry Potter and the Natural 20 1.573355 18 0.087409
5782108 Harry Potter and the Methods of Rationality 1.486540 18 0.082586
2731239 Team 8 0.193825 14 0.013845
5193644 Time Braid 0.683962 13 0.052612
5409165 It\'s For a Good Cause, I Swear! 0.058134 11 0.005285