#!/usr/bin/env python
# coding: utf-8

# Reproducing the results of "The Three Types Of Adam Sandler Movies" on FiveThirtyEight: http://fivethirtyeight.com/datalab/the-three-types-of-adam-sandler-movies/

# ## Versions

# In[1]:

import sys
sys.version_info

# In[2]:

import requests
requests.__version__

# In[3]:

import bs4
from bs4 import BeautifulSoup
bs4.__version__

# In[4]:

import numpy as np
np.__version__

# In[5]:

import pandas as pd
pd.__version__

# In[6]:

from sklearn.cluster import KMeans
import sklearn
sklearn.__version__

# In[7]:

import bokeh.plotting as plt
from bokeh.models import HoverTool
plt.output_notebook()

import bokeh
bokeh.__version__

# ## Getting Data

# The original article is based on [Rotten Tomatoes](http://www.rottentomatoes.com/celebrity/adam_sandler/) for ratings and [Opus Data](http://www.opusdata.com) for the Box Office Gross. The second one is behind a paywall, so it was replaced with the Box Office data available on Rotten Tomatoes.

# We get the HTML from Rotten Tomatoes and pass it to Beautiful Soup so we can extract the content we want.

# Then we used [selector gadget](http://selectorgadget.com/) to get the CSS selector of the table, which we can pass to pandas to get back a nice DataFrame.

# In[8]:

def get_soup(url, timeout=30):
    """Download ``url`` and return its body parsed as a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds passed to ``requests.get`` as its ``timeout``; without one
        a stalled server would block the script indefinitely.

    Returns
    -------
    bs4.BeautifulSoup
        The response text parsed with the ``html5lib`` parser.
    """
    r = requests.get(url, timeout=timeout)
    return BeautifulSoup(r.text, 'html5lib')

# In[9]:

rotten_sandler_url = 'http://www.rottentomatoes.com/celebrity/adam_sandler/'

# In[10]:

soup = get_soup(rotten_sandler_url)

# In[11]:

# Serialize the first table inside the filmography box so pandas can parse it.
films_table = str(soup.select('#filmography_box table:first-child')[0])

# In[12]:

rotten = pd.read_html(films_table)[0]

# In[13]:

rotten.head()

# We convert the "Rating" and "Box Office" columns to numeric values with some simple transformations, removing some text characters and also replacing empty values with `numpy.nan`.
# In[14]:

# Strip the percent sign so the rating can be stored as a float.
rotten['RATING'] = rotten['RATING'].str.replace('%', '', regex=False).astype(float)

# In[15]:

# '$' is a regex metacharacter, so every replacement is declared literal
# instead of relying on the version-dependent default of ``regex``.
box_office = (
    rotten['BOX OFFICE']
    .str.replace('$', '', regex=False)
    .str.replace('M', '', regex=False)
    .str.replace('-', '0', regex=False)
)
rotten['BOX OFFICE'] = box_office.astype(float)

# In[16]:

# A gross of 0 stands for "no figure available"; mark it as missing.
rotten.loc[rotten['BOX OFFICE'] == 0, 'BOX OFFICE'] = np.nan

# In[17]:

rotten.head()

# In[18]:

rotten = rotten.set_index('TITLE')

# We finally save the dataset.

# In[19]:

rotten.to_csv('rotten.csv')

# ## Chart

# This is the original chart for comparison

# In[20]:

from IPython.display import Image

# In[21]:

Image(url='https://espnfivethirtyeight.files.wordpress.com/2015/04/hickey-datalab-sandler.png?w=610&h=634')

# We load the saved data into a DataFrame and plot it using bokeh, which gives some nice interactive features that the original chart does not have; this makes it easier to explore the different movies.

# In[22]:

rotten = pd.read_csv('rotten.csv', index_col=0)

# In[23]:

rotten = rotten.dropna()

# In[24]:

len(rotten)

# In[25]:

rotten.index

# In[26]:

source = plt.ColumnDataSource(
    data=dict(
        rating=rotten['RATING'],
        gross=rotten['BOX OFFICE'],
        movie=rotten.index,
    )
)

p = plt.figure(tools='reset,save,hover', x_range=[0, 100],
               title='',
               x_axis_label="Rotten Tomatoes rating",
               y_axis_label="Box Office Gross")

# When a source is supplied, the glyph coordinates must name columns of that
# source rather than being passed as separate sequences.
p.scatter('rating', 'gross', size=10, source=source)

hover = p.select(dict(type=HoverTool))
hover.tooltips = [
    ("Movie", "@movie"),
    ("Rating", "@rating"),
    ("Box Office Gross", "@gross"),
]

plt.show(p)

# ## Clusters

# The article also mentioned some simple clustering on the dataset. We can reproduce that with scikit-learn.
# In[27]:

X = rotten[['RATING', 'BOX OFFICE']].values

# In[28]:

# Pin the seed and the number of initialisations so the cluster labels, and
# therefore the colours assigned below, are the same on every run.
clf = KMeans(n_clusters=3, n_init=10, random_state=0)

# In[29]:

clf.fit(X)

# In[30]:

clusters = clf.predict(X)
clusters

# In[31]:

# Cluster label i is drawn with palette[i].
palette = ['green', 'red', 'gold']
colors = [palette[label] for label in clusters]

# In[32]:

source = plt.ColumnDataSource(
    data=dict(
        rating=rotten['RATING'],
        gross=rotten['BOX OFFICE'],
        movie=rotten.index,
        cluster_color=colors,
    )
)

p = plt.figure(tools='reset,save,hover', x_range=[0, 100],
               title='',
               x_axis_label="Rotten Tomatoes rating",
               y_axis_label="Box Office Gross")

# When a source is supplied, coordinates and per-point colours must name
# columns of that source rather than being passed as separate sequences.
p.scatter('rating', 'gross', size=10, source=source, color='cluster_color')

hover = p.select(dict(type=HoverTool))
hover.tooltips = [
    ("Movie", "@movie"),
    ("Rating", "@rating"),
    ("Box Office Gross", "@gross"),
]

plt.show(p)

# We can see a result similar to the one in the original article. As mentioned before, there are some differences in the Box Office Gross, so the result is not exactly the same.

# ## IMDB

# What happens if we use IMDB ratings instead of Rotten Tomatoes?

# ### IMDB: Ratings

# We apply a similar procedure for getting the data from IMDB with a basic crawler.

# In[33]:

imdb_sandler_url = 'http://www.imdb.com/name/nm0001191/'

# In[34]:

soup = get_soup(imdb_sandler_url)

# In[35]:

a_tags = soup.select('div#filmo-head-actor + div b a')

# In[36]:

a_tags[:5]

# In[37]:

# Visit every linked title page and keep the rating when the page shows
# exactly one rating element; titles without one are skipped.
movies = {}
for a_tag in a_tags:
    movie_name = a_tag.text
    movie_url = 'http://www.imdb.com' + a_tag['href']
    # Use a separate name so the filmography ``soup`` is not overwritten.
    movie_soup = get_soup(movie_url)
    rating = movie_soup.select('.star-box-giga-star')
    if len(rating) == 1:
        movies[movie_name] = float(rating[0].text)

# In[38]:

ratings = pd.DataFrame.from_dict(movies, orient='index')
ratings.columns = ['rating']

# In[39]:

ratings.head()

# In[40]:

len(ratings)

# In[41]:

ratings.index.name = 'Title'

# In[42]:

ratings.to_csv('imdb-ratings.csv')

# ### IMDB: Box Office Mojo

# IMDB also provides the Box Office Gross information from [Box Office Mojo](http://www.boxofficemojo.com).
# In[43]:

box_sandler_url = 'http://www.boxofficemojo.com/people/chart/?view=Actor&id=adamsandler.htm'

# In[44]:

soup = get_soup(box_sandler_url)

# In[45]:

box_gross_table = str(soup.select('br + table')[0])

# In[46]:

gross = pd.read_html(box_gross_table, header=0)[0]

# In[47]:

gross.head()

# In[48]:

# Discard the columns that are not used in the analysis.
gross = gross.drop(
    ['Unnamed: 6', 'Unnamed: 7', 'Opening / Theaters', 'Rank', 'Studio'],
    axis=1,
)

# In[49]:

gross.columns = ['Date', 'Title', 'Gross']

# In[50]:

gross = gross.set_index('Title')

# In[51]:

# The pattern is a character class, so it has to be applied as a regex;
# otherwise '$' and ',' stay in place and the integer conversion fails.
gross['Gross'] = gross['Gross'].str.replace(r'[$,]', '', regex=True).astype(int)

# In[52]:

gross.head()

# In[53]:

gross.to_csv('imdb-gross.csv')

# ### IMDB: Analysis

# Load both datasets and plot the same values

# In[54]:

ratings = pd.read_csv('imdb-ratings.csv', index_col=0)

# In[55]:

gross = pd.read_csv('imdb-gross.csv', index_col=0)

# In[56]:

# Express the gross in millions.
gross['Gross'] = gross['Gross'] / 1e6

# In[57]:

len(ratings)

# In[58]:

len(gross)

# In[59]:

# Align the Box Office Mojo spelling of these titles with the spelling used
# in the ratings data so the join below can match them.
gross = gross.rename(index={'Just Go With It': 'Just Go with It'})

# In[60]:

gross = gross.rename(
    index={'I Now Pronounce You Chuck and Larry': 'I Now Pronounce You Chuck & Larry'}
)

# In[61]:

imdb = gross.join(ratings)

# In[62]:

len(imdb), len(imdb.dropna())

# In[63]:

imdb = imdb.dropna()

# In[64]:

source = plt.ColumnDataSource(
    data=dict(
        rating=imdb['rating'],
        gross=imdb['Gross'],
        movie=imdb.index,
    )
)

# The x axis shows IMDB ratings here (range 0-10), not Rotten Tomatoes scores.
p = plt.figure(tools='reset,save,hover', x_range=[0, 10],
               title='',
               x_axis_label="IMDB rating",
               y_axis_label="Box Office Gross")

# When a source is supplied, the glyph coordinates must name columns of that
# source rather than being passed as separate sequences.
p.scatter('rating', 'gross', size=10, source=source)

hover = p.select(dict(type=HoverTool))
hover.tooltips = [
    ("Movie", "@movie"),
    ("Rating", "@rating"),
    ("Box Office Gross", "@gross"),
]

plt.show(p)

# Interesting, the result is very different: you can see two clusters in this case, the greater than 100M and
# less than 100M movies.

# In[65]:

X = imdb[['rating', 'Gross']].values

# In[66]:

# Pin the seed and the number of initialisations so the cluster labels, and
# therefore the colours assigned below, are the same on every run.
clf = KMeans(n_clusters=2, n_init=10, random_state=0)

# In[67]:

clf.fit(X)

# In[68]:

clusters = clf.predict(X)
clusters

# In[69]:

# Cluster label i is drawn with palette[i].
palette = ['green', 'red']
colors = [palette[label] for label in clusters]

# In[70]:

source = plt.ColumnDataSource(
    data=dict(
        rating=imdb['rating'],
        gross=imdb['Gross'],
        movie=imdb.index,
        cluster_color=colors,
    )
)

# The x axis shows IMDB ratings here (range 0-10), not Rotten Tomatoes scores.
p = plt.figure(tools='reset,save,hover', x_range=[0, 10],
               title='',
               x_axis_label="IMDB rating",
               y_axis_label="Box Office Gross")

# When a source is supplied, coordinates and per-point colours must name
# columns of that source rather than being passed as separate sequences.
p.scatter('rating', 'gross', size=10, source=source, color='cluster_color')

hover = p.select(dict(type=HoverTool))
hover.tooltips = [
    ("Movie", "@movie"),
    ("Rating", "@rating"),
    ("Box Office Gross", "@gross"),
]

plt.show(p)

# In[ ]: