import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
# %matplotlib inline
import json
# Load the dataset: every line of "metafiction.dat" is parsed as one JSON
# document.  The context manager closes the file handle as soon as all
# lines have been read (the bare open() call left it open).
with open("metafiction.dat") as data_file:
    metafiction = [json.loads(line) for line in data_file]
len(metafiction)
100
# Seed the author list with one {"author", "name"} entry per loaded record,
# taken from the record's own "id" and "name" fields.
author_list = [
    {"author": entry["id"], "name": entry["name"]}
    for entry in metafiction
]
len(author_list)
100
# Add every author referenced by a record: the author of each favourite
# story, then each favourite author.  These entries carry only an "author"
# key (no "name"), and nothing is de-duplicated at this point.
for record in metafiction:
    author_list.extend(
        {"author": story["author"]} for story in record["favourite-stories"]
    )
    author_list.extend(
        {"author": author} for author in record["favourite-authors"]
    )
len(author_list)
10694
# One row per distinct author id, indexed by that id.  drop_duplicates keeps
# the first occurrence of each "author" value.
authors = (
    DataFrame(author_list)
    .drop_duplicates(["author"])
    .set_index(["author"])
)
len(authors)
5170
# Preview the first row by position.  `.ix` was removed in pandas 1.0;
# `.iloc` is the explicit positional indexer.
authors.iloc[[0]]
name | |
---|---|
author | |
5111102 | EagleJarl |
# Collect every story dict mentioned by a record: the stories the record's
# author wrote, followed by the stories they favourited.  Duplicates are
# kept here.
story_list = []
for record in metafiction:
    story_list.extend(record["author-stories"])
    story_list.extend(record["favourite-stories"])
len(story_list)
11052
# Build the story table: one row per distinct story, indexed by story id.
stories = DataFrame(story_list)
# The positional renames below assume alphabetically ordered columns, which
# is what pandas produced for a list of dicts before 0.25.  Newer pandas
# keeps the dicts' key order instead, so sort explicitly to make the
# positions deterministic.
# NOTE(review): the original key names at positions 3-5 and 9 are not
# visible in this file (position 9 is presumably "id") -- confirm, then
# prefer a name-based stories.rename(columns={...}).
stories = stories[sorted(stories.columns)]
## rename columns
# Work on a list copy: writing into `stories.columns.values` mutates the
# Index's backing array in place.
columns = stories.columns.tolist()
columns[3] = u"is_complete"
columns[4] = u"submitted"
columns[5] = u"updated"
columns[9] = u"story"
stories.columns = columns
# Keep the first occurrence of each story id, then index by it.
stories.drop_duplicates(["story"], inplace=True)
stories.set_index("story", inplace=True)
# pandas 2.x rejects the unit-less "datetime64" dtype; spell out the unit.
stories["submitted"] = stories["submitted"].astype("datetime64[ns]")
stories["updated"] = stories["updated"].astype("datetime64[ns]")
len(stories)
9089
# Preview the first story row by position.  `.ix` was removed in pandas 1.0;
# `.iloc` is the explicit positional indexer.
stories.iloc[[0]]
author | categories | chapters | is_complete | submitted | updated | favourites | follows | genres | language | rating | reviews | title | word-count | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
story | ||||||||||||||
9669819 | 5111102 | [Dungeons and Dragons] | 76 | False | 2013-09-08 11:03:42 | 2014-12-06 17:56:42 | 425 | 483 | [Adventure, Fantasy] | English | T | 773 | The Two Year Emperor | 309723 |
# Flatten the per-record favourites into two edge lists, each entry keyed by
# the id of the record that holds the favourite:
#   favourite_author_list: {"author": <record id>, "favourite_author": <author>}
#   favourite_story_list:  {"author": <record id>, "favourite_story": <story id>}
favourite_author_list = []
favourite_story_list = []
for record in metafiction:
    for author in record["favourite-authors"]:
        favourite_author_list.append({"author": record["id"],
                                      "favourite_author": author})
    for story in record["favourite-stories"]:
        favourite_story_list.append({"author": record["id"],
                                     "favourite_story": story["id"]})
(len(favourite_author_list), len(favourite_story_list))
(1211, 9383)
# Turn each edge list into a Series indexed by the favouriting author's id;
# the index is not unique, since one author may appear in many entries.
favourite_authors = (
    DataFrame(favourite_author_list).set_index("author")["favourite_author"]
)
favourite_stories = (
    DataFrame(favourite_story_list).set_index("author")["favourite_story"]
)
# Preview the first entry by position.  `.ix` was removed in pandas 1.0;
# `.iloc` is the explicit positional indexer.
favourite_authors.iloc[[0]]
author 5111102 4976703 Name: favourite_author, dtype: object
# Preview the first entry by position.  `.ix` was removed in pandas 1.0;
# `.iloc` is the explicit positional indexer.
favourite_stories.iloc[[0]]
author 5111102 8096183 Name: favourite_story, dtype: object
# Build 0/1 indicator matrices (one row per story, one column per label) for
# the list-valued "genres" and "categories" columns of `stories`.
# set().union(...) also works when `stories` is empty, where the unbound
# set.union(*[]) call raises a TypeError.
genre_list = sorted(set().union(*stories["genres"]))
genres = DataFrame(data=np.zeros((len(stories), len(genre_list))),
                   columns=genre_list, index=stories.index)
category_list = sorted(set().union(*stories["categories"]))
categories = DataFrame(data=np.zeros((len(stories), len(category_list))),
                       columns=category_list, index=stories.index)
# Flag each story's labels.  `.ix` was removed in pandas 1.0, so use the
# label-based `.loc`; walking the columns with zip avoids a per-story lookup
# back into `stories`.
for story, story_genres, story_categories in zip(
        stories.index, stories["genres"], stories["categories"]):
    genres.loc[story, story_genres] = 1
    categories.loc[story, story_categories] = 1
# Preview the first story's genre flags by position.  `.ix` was removed in
# pandas 1.0; `.iloc` is the explicit positional indexer.
genres.iloc[[0]]
Adventure | Angst | Comfort | Crime | Drama | Family | Fantasy | Friendship | Horror | Humor | ... | Mystery | Parody | Poetry | Romance | Sci-Fi | Spiritual | Supernatural | Suspense | Tragedy | Western | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
story | |||||||||||||||||||||
9669819 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 rows × 21 columns
# Preview the first story's category flags by position.  `.ix` was removed
# in pandas 1.0; `.iloc` is the explicit positional indexer.
categories.iloc[[0]]
.hack/SIGN | 10th Kingdom | 1984 | 24 | 30 Rock | A Certain Scientific Railgun/とある科学の超電磁砲 | A song of Ice and Fire | A-Team | Addams Family | Advance Wars | ... | Yami no Matsuei | Young Justice | Young Wizards | Yu Yu Hakusho | Yu-Gi-Oh | Zatch Bell | Zoids | iCarly | the X-Men | xxxHOLiC | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
story | |||||||||||||||||||||
9669819 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 rows × 541 columns