Reproducing the results of "The Three Types Of Adam Sandler Movies" on FiveThirtyEight: http://fivethirtyeight.com/datalab/the-three-types-of-adam-sandler-movies/
import sys
sys.version_info
sys.version_info(major=3, minor=4, micro=3, releaselevel='final', serial=0)
import requests
requests.__version__
'2.6.2'
import bs4
from bs4 import BeautifulSoup
bs4.__version__
'4.3.2'
import numpy as np
np.__version__
'1.9.2'
import pandas as pd
pd.__version__
'0.16.0'
from sklearn.cluster import KMeans
import sklearn
sklearn.__version__
'0.16.1'
import bokeh.plotting as plt
from bokeh.models import HoverTool
plt.output_notebook()
import bokeh
bokeh.__version__
The original article is based on Rotten Tomatoes for ratings and Opus Data for the Box Office Gross. The second one is behind a paywall, so it was replaced with the Box Office data available on Rotten Tomatoes.
We get the HTML from Rotten Tomatoes and pass it to Beautiful Soup so we can extract the content we want. We then used SelectorGadget to find the CSS selector of the table, which we pass to pandas to get back a nice DataFrame.
def get_soup(url, timeout=30):
    """Download *url* and return its HTML parsed into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` may block indefinitely on an unresponsive host.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the ``html5lib`` parser.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails, times out, or the server answers with a
        4xx/5xx status code.
    """
    r = requests.get(url, timeout=timeout)
    # Fail here on an HTTP error instead of parsing an error page and
    # hitting an unrelated IndexError later when a selector finds nothing.
    r.raise_for_status()
    return BeautifulSoup(r.text, 'html5lib')
# Fetch the Rotten Tomatoes page whose filmography table we want to scrape.
rotten_sandler_url = 'http://www.rottentomatoes.com/celebrity/adam_sandler/'
soup = get_soup(rotten_sandler_url)

# Take the first element matched by the CSS selector and serialise it back
# to an HTML string, which is what pandas.read_html is given here.
matching_tables = soup.select('#filmography_box table:first-child')
films_table = str(matching_tables[0])

# read_html returns a list of DataFrames; the first one is kept.
parsed_tables = pd.read_html(films_table)
rotten = parsed_tables[0]
rotten.head()
RATING | TITLE | CREDIT | BOX OFFICE | YEAR | |
---|---|---|---|---|---|
0 | NaN | Hello Ghost | Actor Producer | -- | 2015 |
1 | 9% | The Cobbler | Max Simkin | -- | 2015 |
2 | NaN | Pixels | Producer Screenwriter Sam Brenner | -- | 2015 |
3 | NaN | Candy Land | Actor | -- | 2015 |
4 | 6% | Paul Blart: Mall Cop 2 | Producer | $43.2M | 2015 |
We convert the "Rating" and "Box Office" columns to numeric values with some simple transformations removing some text characters and also replacing empty values with numpy.nan
.
# Ratings look like "9%": drop the percent sign and store them as floats.
rotten['RATING'] = rotten['RATING'].str.replace('%', '').astype(float)

# Box office values look like "$43.2M", with "--" standing for "no value".
# Remove the "$" and "M" characters and turn every "-" into "0", so "--"
# becomes "00" and parses as 0.0.
box_office = rotten['BOX OFFICE']
for old, new in (('$', ''), ('M', ''), ('-', '0')):
    box_office = box_office.str.replace(old, new)
rotten['BOX OFFICE'] = box_office.astype(float)

# A gross of 0 is the placeholder produced above; mark it as missing.
rotten.loc[rotten['BOX OFFICE'] == 0, ['BOX OFFICE']] = np.nan
rotten.head()
RATING | TITLE | CREDIT | BOX OFFICE | YEAR | |
---|---|---|---|---|---|
0 | NaN | Hello Ghost | Actor Producer | NaN | 2015 |
1 | 9 | The Cobbler | Max Simkin | NaN | 2015 |
2 | NaN | Pixels | Producer Screenwriter Sam Brenner | NaN | 2015 |
3 | NaN | Candy Land | Actor | NaN | 2015 |
4 | 6 | Paul Blart: Mall Cop 2 | Producer | 43.2 | 2015 |
# Use the movie title as the row index instead of the default integer index.
rotten = rotten.set_index('TITLE')
Finally, we save the dataset.
# Persist the cleaned table so the plotting cells can reload it from disk.
rotten.to_csv('rotten.csv')
This is the original chart for comparison
from IPython.display import Image
Image(url='https://espnfivethirtyeight.files.wordpress.com/2015/04/hickey-datalab-sandler.png?w=610&h=634')
We load the saved data into a DataFrame and plot it using bokeh, which gives some nice interactive features that the original chart does not have; this makes it easier to explore the different movies.
# Reload the saved table with its first column as the index, and keep only
# the rows that have no missing value in any column.
rotten = pd.read_csv('rotten.csv', index_col=0).dropna()
len(rotten)
37
rotten.index
Index(['Paul Blart: Mall Cop 2', 'Blended', 'Top Five', 'Grown Ups 2', 'That's My Boy', 'Hotel Transylvania', 'Here Comes the Boom', 'Jack and Jill', 'Zookeeper', 'Just Go with It', 'Bucky Larson: Born to Be a Star', 'Grown Ups', 'Funny People', 'Paul Blart: Mall Cop', 'You Don't Mess With the Zohan', 'The House Bunny', 'Bedtime Stories', 'Strange Wilderness', 'I Now Pronounce You Chuck & Larry', 'Reign Over Me', 'Grandma's Boy', 'Click', 'The Benchwarmers', 'Deuce Bigalow: European Gigolo', 'The Longest Yard', 'Spanglish', '50 First Dates', 'Dickie Roberts: Former Child Star', 'Anger Management', 'The Hot Chick', 'Mr. Deeds', 'Adam Sandler's Eight Crazy Nights', 'The Master of Disguise', 'Punch-Drunk Love', 'Joe Dirt', 'The Animal', 'Little Nicky'], dtype='object')
# Columns that the hover tooltips refer to by name ("@rating", "@gross",
# "@movie").
source = plt.ColumnDataSource(data={
    'rating': rotten.RATING,
    'gross': rotten['BOX OFFICE'],
    'movie': rotten.index,
})

# Rating on the x axis (fixed to the 0-100 range), box office gross on y.
p = plt.figure(
    tools='reset,save,hover',
    x_range=[0, 100],
    title='',
    x_axis_label="Rotten Tomatoes rating",
    y_axis_label="Box Office Gross",
)
p.scatter(rotten.RATING, rotten['BOX OFFICE'], size=10, source=source)

# Configure what is shown when hovering over a point.
hover = p.select(dict(type=HoverTool))
hover.tooltips = [
    ("Movie", "@movie"),
    ("Rating", "@rating"),
    ("Box Office Gross", "@gross"),
]

plt.show(p)
The article also mentioned some simple clustering on the dataset. We can reproduce that with scikit-learn.
# Feature matrix: one row per movie, columns are (rating, box office gross).
X = rotten[['RATING', 'BOX OFFICE']].values
# Three clusters, as in the article being reproduced. The estimator is seeded
# because colours are later assigned by cluster label: with an unseeded
# KMeans the labels can be permuted (or the partition can change) from one
# run to the next, so the same group would not keep the same colour.
clf = KMeans(n_clusters=3, random_state=0)
clf.fit(X)
KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=3, n_init=10, n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001, verbose=0)
# Cluster label for every row of X, in the same order as the rows.
clusters = clf.predict(X)
clusters
array([0, 0, 2, 1, 0, 1, 0, 0, 0, 1, 0, 1, 2, 1, 1, 0, 1, 0, 1, 2, 0, 1, 0, 0, 1, 2, 1, 0, 1, 0, 1, 0, 0, 2, 0, 0, 0], dtype=int32)
# One colour name per movie: start from the labels as strings, then overwrite
# each label with the colour chosen for that cluster.
colors = clusters.astype(str)
for label, color_name in enumerate(('green', 'red', 'gold')):
    colors[clusters == label] = color_name

# Columns that the hover tooltips refer to by name.
source = plt.ColumnDataSource(data={
    'rating': rotten.RATING,
    'gross': rotten['BOX OFFICE'],
    'movie': rotten.index,
})

# Same scatter plot as before, with each point coloured by its cluster.
p = plt.figure(
    tools='reset,save,hover',
    x_range=[0, 100],
    title='',
    x_axis_label="Rotten Tomatoes rating",
    y_axis_label="Box Office Gross",
)
p.scatter(rotten.RATING, rotten['BOX OFFICE'], size=10, source=source,
          color=colors)

# Configure what is shown when hovering over a point.
hover = p.select(dict(type=HoverTool))
hover.tooltips = [
    ("Movie", "@movie"),
    ("Rating", "@rating"),
    ("Box Office Gross", "@gross"),
]

plt.show(p)
We can see a result similar to the one in the original article. As mentioned before, there are some differences in the Box Office Gross, so the result is not exactly the same.
What happens if we use IMDB ratings instead of Rotten Tomatoes?
We apply a similar procedure for getting the data from IMDB with a basic crawler.
# Fetch the IMDB page that lists the filmography.
imdb_sandler_url = 'http://www.imdb.com/name/nm0001191/'
soup = get_soup(imdb_sandler_url)
# Select the <a> tags nested in <b> inside the <div> that immediately follows
# the element with id "filmo-head-actor"; each one links to a title page.
a_tags = soup.select('div#filmo-head-actor + div b a')
a_tags[:5]
[<a href="/title/tt2479478/?ref_=nm_flmg_act_1">The Ridiculous 6</a>, <a href="/title/tt2510894/?ref_=nm_flmg_act_2">Hotel Transylvania 2</a>, <a href="/title/tt2120120/?ref_=nm_flmg_act_3">Pixels</a>, <a href="/title/tt3203616/?ref_=nm_flmg_act_4">The Cobbler</a>, <a href="/title/tt3179568/?ref_=nm_flmg_act_5">Men, Women & Children</a>]
# Visit every linked title page and record its rating, keyed by the link text.
movies = {}
for a_tag in a_tags:
    movie_name = a_tag.text
    movie_url = 'http://www.imdb.com' + a_tag['href']
    soup = get_soup(movie_url)
    rating = soup.select('.star-box-giga-star')
    # Only pages with exactly one rating element are recorded; all others
    # are skipped.
    if len(rating) != 1:
        continue
    movies[movie_name] = float(rating[0].text)

# One row per movie title, with a single column named "rating".
ratings = pd.DataFrame.from_dict(movies, orient='index')
ratings.columns = ['rating']
ratings.head()
rating | |
---|---|
The Hot Chick | 5.5 |
The Animal | 4.8 |
Deuce Bigalow: Male Gigolo | 5.7 |
Happy Gilmore | 7.0 |
Eight Crazy Nights | 5.4 |
len(ratings)
53
# Name the index so the saved CSV gets a "Title" header for its first column.
ratings.index.name = 'Title'
ratings.to_csv('imdb-ratings.csv')
IMDB also provides the Box Office Gross information from Box Office Mojo.
# Fetch the Box Office Mojo chart page.
box_sandler_url = 'http://www.boxofficemojo.com/people/chart/?view=Actor&id=adamsandler.htm'
soup = get_soup(box_sandler_url)

# Take the first <table> that directly follows a <br> and serialise it back
# to an HTML string for pandas.
candidate_tables = soup.select('br + table')
box_gross_table = str(candidate_tables[0])

# Parse it using the first row as the header; read_html returns a list of
# DataFrames and the first one is kept.
parsed_tables = pd.read_html(box_gross_table, header=0)
gross = parsed_tables[0]
gross.head()
Date | Title (click to view) | Studio | Lifetime Gross / Theaters | Opening / Theaters | Rank | Unnamed: 6 | Unnamed: 7 | |
---|---|---|---|---|---|---|---|---|
0 | 10/1/14 | Men, Women & Children | Par. | $705,908 | 608 | $48,024 | 17 | 30 |
1 | 5/23/14 | Blended | WB | $46,294,610 | 3555 | $14,284,031 | 3555 | 18 |
2 | 7/12/13 | Grown Ups 2 | Sony | $133,668,525 | 3491 | $41,508,572 | 3491 | 8 |
3 | 9/28/12 | Hotel Transylvania(Voice) | Sony | $148,313,048 | 3375 | $42,522,194 | 3349 | 5 |
4 | 6/15/12 | That's My Boy | Sony | $36,931,089 | 3030 | $13,453,714 | 3030 | 22 |
# Keep only the three remaining columns (renamed just below); every other
# scraped column is discarded in a single call.
gross = gross.drop(
    ['Unnamed: 6', 'Unnamed: 7', 'Opening / Theaters', 'Rank', 'Studio'],
    axis=1,
)
gross.columns = ['Date', 'Title', 'Gross']
gross = gross.set_index('Title')

# Values look like "$46,294,610". Strip "$" and "," with two literal
# single-character replacements instead of the pattern r'[$,]': whether
# str.replace interprets a multi-character pattern as a regular expression
# depends on the pandas version, and when it is taken literally nothing is
# stripped and the integer conversion fails.
gross['Gross'] = (
    gross['Gross'].str.replace('$', '').str.replace(',', '').astype(int)
)
gross.head()
Date | Gross | |
---|---|---|
Title | ||
Men, Women & Children | 10/1/14 | 705908 |
Blended | 5/23/14 | 46294610 |
Grown Ups 2 | 7/12/13 | 133668525 |
Hotel Transylvania(Voice) | 9/28/12 | 148313048 |
That's My Boy | 6/15/12 | 36931089 |
# Persist the cleaned gross table so it can be reloaded from disk.
gross.to_csv('imdb-gross.csv')
Load both datasets and plot the same values
# Reload both saved tables, each indexed by its first column.
ratings = pd.read_csv('imdb-ratings.csv', index_col=0)
gross = pd.read_csv('imdb-gross.csv', index_col=0)

# Express the gross in millions.
gross['Gross'] = gross['Gross'] / 1e6
len(ratings)
53
len(gross)
37
# Two titles are spelled differently in the gross table than the spelling
# used for the ratings join, so relabel them before joining on the index.
# rename() replaces the deprecated `.ix` copy-then-drop sequence, keeps the
# rows in their original position and leaves the column dtypes untouched.
# Titles that are not present in the index are simply left as they are.
gross = gross.rename(index={
    'Just Go With It': 'Just Go with It',
    'I Now Pronounce You Chuck and Larry': 'I Now Pronounce You Chuck & Larry',
})

# Left join on the title index: every row of `gross` is kept, and `rating`
# is NaN where no rating with the same title exists.
imdb = gross.join(ratings)
len(imdb), len(imdb.dropna())
(37, 33)
# Keep only the movies that have no missing value in any column.
imdb = imdb.dropna()

# Columns that the hover tooltips refer to by name.
source = plt.ColumnDataSource(
    data=dict(
        rating=imdb.rating,
        gross=imdb.Gross,
        movie=imdb.index,
    )
)

# The x axis shows the IMDB rating (range fixed to 0-10); the label was
# previously copied from the Rotten Tomatoes plot and named the wrong source.
p = plt.figure(tools='reset,save,hover', x_range=[0, 10], title='',
               x_axis_label="IMDB rating", y_axis_label="Box Office Gross")
p.scatter(imdb.rating, imdb.Gross, size=10, source=source)

# Configure what is shown when hovering over a point.
hover = p.select(dict(type=HoverTool))
hover.tooltips = [
    ("Movie", "@movie"),
    ("Rating", "@rating"),
    ("Box Office Gross", "@gross"),
]
plt.show(p)
Interesting: the result is very different. You can see two clusters in this case: the movies that grossed more than 100M and those that grossed less than 100M.
# Feature matrix: one row per movie, columns are (rating, gross in millions).
# NOTE(review): the features are not scaled. The rating axis is plotted on a
# 0-10 range while the gross reaches values above 100, so the gross probably
# dominates the distance used by k-means - confirm whether that is intended.
X = imdb[['rating', 'Gross']].values
# Two clusters. The estimator is seeded because colours are later assigned by
# cluster label: with an unseeded KMeans the labels can swap between runs, so
# the same group would not keep the same colour.
clf = KMeans(n_clusters=2, random_state=0)
clf.fit(X)
KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=2, n_init=10, n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001, verbose=0)
# Cluster label for every row of X, in the same order as the rows.
clusters = clf.predict(X)
clusters
array([1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0], dtype=int32)
# One colour name per movie: start from the labels as strings, then overwrite
# each label with the colour chosen for that cluster.
colors = clusters.astype(str)
colors[clusters == 0] = 'green'
colors[clusters == 1] = 'red'

# Columns that the hover tooltips refer to by name.
source = plt.ColumnDataSource(
    data=dict(
        rating=imdb.rating,
        gross=imdb.Gross,
        movie=imdb.index,
    )
)

# The x axis shows the IMDB rating (range fixed to 0-10); the label was
# previously copied from the Rotten Tomatoes plot and named the wrong source.
p = plt.figure(tools='reset,save,hover', x_range=[0, 10], title='',
               x_axis_label="IMDB rating", y_axis_label="Box Office Gross")
p.scatter(imdb.rating, imdb.Gross, size=10, source=source, color=colors)

# Configure what is shown when hovering over a point.
hover = p.select(dict(type=HoverTool))
hover.tooltips = [
    ("Movie", "@movie"),
    ("Rating", "@rating"),
    ("Box Office Gross", "@gross"),
]
plt.show(p)