#!/usr/bin/env python
# coding: utf-8

# Reproducing the results of "The Three Types Of Adam Sandler Movies" on FiveThirtyEight: http://fivethirtyeight.com/datalab/the-three-types-of-adam-sandler-movies/

# ## Versions

# In[1]:


# Each cell below imports one dependency and ends with a bare expression
# holding its version. In a notebook that value is echoed as the cell output;
# when this file runs as a plain script the expression has no visible effect.
import sys
sys.version_info


# In[2]:


import requests
requests.__version__


# In[3]:


import bs4
from bs4 import BeautifulSoup
bs4.__version__


# In[4]:


import numpy as np
np.__version__


# In[5]:


import pandas as pd
pd.__version__


# In[6]:


from sklearn.cluster import KMeans

import sklearn
sklearn.__version__


# In[7]:


# NOTE: `plt` is bokeh.plotting in this file, not matplotlib.pyplot.
import bokeh.plotting as plt
from bokeh.models import HoverTool
# Bokeh call that directs plot output to the notebook.
plt.output_notebook()

import bokeh
bokeh.__version__


# ## Getting Data

# The original article uses [Rotten Tomatoes](http://www.rottentomatoes.com/celebrity/adam_sandler/) for ratings and [Opus Data](http://www.opusdata.com) for the Box Office Gross. The latter is behind a paywall, so it was replaced with the Box Office data available on Rotten Tomatoes.

# We get the HTML from Rotten Tomatoes and pass it to Beautiful Soup so we can extract the content we want.
# Then we use [SelectorGadget](http://selectorgadget.com/) to find the CSS selector of the table, which we pass to pandas to get back a nice DataFrame.

# In[8]:


def get_soup(url, timeout=30):
    """Download *url* and parse the response body with Beautiful Soup.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Number of seconds to wait for the server; forwarded unchanged to
        ``requests.get``. Defaults to 30.

    Returns
    -------
    bs4.BeautifulSoup
        The response text parsed with the ``html5lib`` parser.
    """
    # requests waits indefinitely unless a timeout is given, so a single
    # stalled server would otherwise hang the whole scrape.
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, 'html5lib')


# In[9]:


# Rotten Tomatoes page that is scraped for the filmography table.
rotten_sandler_url = 'http://www.rottentomatoes.com/celebrity/adam_sandler/'


# In[10]:


soup = get_soup(rotten_sandler_url)


# In[11]:


# select() returns a list of matching elements; only the first one is kept
# and turned back into an HTML string.
filmography_matches = soup.select('#filmography_box table:first-child')
films_table = str(filmography_matches[0])


# In[12]:


# read_html() returns a list of DataFrames; the first entry is used.
parsed_tables = pd.read_html(films_table)
rotten = parsed_tables[0]


# In[13]:


rotten.head()


# We convert the "Rating" and "Box Office" columns to numeric values with some simple transformations removing some text characters and also replacing empty values with `numpy.nan`.

# In[14]:


# Strip the percent sign so the rating column can be converted to float.
rating_text = rotten.RATING.str.replace('%', '')
rotten.RATING = rating_text.astype(float)


# In[15]:


# Remove the '$' and 'M' markers and turn '-' into '0' so that every
# remaining value can be converted to float.
box_office_text = rotten['BOX OFFICE']
for marker, substitute in (('$', ''), ('M', ''), ('-', '0')):
    box_office_text = box_office_text.str.replace(marker, substitute)
rotten['BOX OFFICE'] = box_office_text.astype(float)


# In[16]:


# A gross of exactly 0 is stored as NaN instead.
zero_gross = rotten['BOX OFFICE'] == 0
rotten.loc[zero_gross, ['BOX OFFICE']] = np.nan


# In[17]:


rotten.head()


# In[18]:


rotten = rotten.set_index('TITLE')


# We finally save the dataset.

# In[19]:


rotten.to_csv('rotten.csv')


# ## Chart

# This is the original chart for comparison

# In[20]:


from IPython.display import Image


# In[21]:


# Address of the chart image shown for comparison.
original_chart_url = 'https://espnfivethirtyeight.files.wordpress.com/2015/04/hickey-datalab-sandler.png?w=610&h=634'
Image(url=original_chart_url)


# We load the saved data into a DataFrame and plot it with Bokeh, which adds some nice interactive features that the original chart does not have; this makes it easier to explore the different movies.

# In[22]:


# Read the saved table back; the first CSV column is used as the index.
rotten = pd.read_csv('rotten.csv', index_col=0)


# In[23]:


# Discard every row that has at least one missing value.
rotten = rotten.dropna(axis=0, how='any')


# In[24]:


# Number of rows left after dropping incomplete ones.
len(rotten.index)


# In[25]:


rotten.index


# In[26]:


# The keys of this data source are the column names used by both the glyph
# below and the hover tooltips ("@rating", "@gross", "@movie").
source = plt.ColumnDataSource(
    data=dict(
        rating=rotten.RATING,
        gross=rotten['BOX OFFICE'],
        movie=rotten.index,
    )
)

p = plt.figure(tools='reset,save,hover', x_range=[0, 100], title='',
               x_axis_label="Rotten Tomatoes rating", y_axis_label="Box Office Gross")
# When a `source` is supplied, Bokeh expects the coordinates to be given as
# column names of that source rather than as separate sequences, so the
# glyph refers to 'rating' and 'gross' instead of the raw Series.
p.scatter('rating', 'gross', size=10, source=source)

# The 'hover' entry in `tools` created a HoverTool; look it up to set what
# it displays for each point.
hover = p.select(dict(type=HoverTool))

hover.tooltips = [
    ("Movie", "@movie"),
    ("Rating", "@rating"),
    ("Box Office Gross", "@gross"),
]

plt.show(p)


# ## Clusters

# The article also mentions some simple clustering on the dataset. We can reproduce that with scikit-learn.

# In[27]:


# Two-column array (rating, box office) with one row per movie.
feature_columns = ['RATING', 'BOX OFFICE']
X = rotten[feature_columns].values


# In[28]:


clf = KMeans(n_clusters=3)


# In[29]:


clf.fit(X)


# In[30]:


# Cluster label assigned to each row of X.
clusters = clf.predict(X)
clusters


# In[31]:


# Start from the labels rendered as strings, then overwrite each label with
# the colour chosen for it (label 0 -> green, 1 -> red, 2 -> gold).
colors = clusters.astype(str)
for label, color_name in enumerate(('green', 'red', 'gold')):
    colors[clusters == label] = color_name


# In[32]:


# The keys of this data source are the column names used by the glyph and
# the hover tooltips. The per-movie cluster colour is stored as a column
# too, so that every per-point value comes from the same source.
source = plt.ColumnDataSource(
    data=dict(
        rating=rotten.RATING,
        gross=rotten['BOX OFFICE'],
        movie=rotten.index,
        cluster_color=list(colors),
    )
)

p = plt.figure(tools='reset,save,hover', x_range=[0, 100], title='',
               x_axis_label="Rotten Tomatoes rating", y_axis_label="Box Office Gross")
# When a `source` is supplied, Bokeh expects per-point values (coordinates
# and colours alike) to be given as column names of that source rather than
# as separate sequences.
p.scatter('rating', 'gross', size=10, source=source, color='cluster_color')

# The 'hover' entry in `tools` created a HoverTool; look it up to set what
# it displays for each point.
hover = p.select(dict(type=HoverTool))

hover.tooltips = [
    ("Movie", "@movie"),
    ("Rating", "@rating"),
    ("Box Office Gross", "@gross"),
]

plt.show(p)


# We can see a result similar to the one in the original article. As mentioned before, there are some differences in the Box Office Gross, so the result is not exactly the same.

# ## IMDB

# What happens if we use IMDB ratings instead of Rotten Tomatoes?

# ### IMDB: Ratings

# We apply a similar procedure for getting the data from IMDB with a basic crawler.

# In[33]:


# IMDB page that lists the filmography to crawl.
imdb_sandler_url = 'http://www.imdb.com/name/nm0001191/'


# In[34]:


soup = get_soup(imdb_sandler_url)


# In[35]:


# Anchor tags selected from the block that follows the "actor" header.
a_tags = soup.select('div#filmo-head-actor + div b a')


# In[36]:


a_tags[:5]


# In[37]:


# Visit every linked page and keep its rating, keyed by the link text.
# Pages where the rating selector does not match exactly once are skipped.
movies = {}
for link in a_tags:
    movie_page = get_soup('http://www.imdb.com' + link['href'])
    star_boxes = movie_page.select('.star-box-giga-star')
    if len(star_boxes) != 1:
        continue
    movies[link.text] = float(star_boxes[0].text)


# In[38]:


# One row per movie title, with a single column holding the rating.
ratings = pd.DataFrame.from_dict(movies, orient='index')
ratings.columns = ['rating']


# In[39]:


ratings.head()


# In[40]:


len(ratings.index)


# In[41]:


ratings.index.name = 'Title'


# In[42]:


ratings.to_csv('imdb-ratings.csv')


# ### IMDB: Box Office Mojo

# IMDB also provides the Box Office Gross information from [Box Office Mojo](http://www.boxofficemojo.com).

# In[43]:


# Box Office Mojo chart page that is scraped for the gross figures.
box_sandler_url = 'http://www.boxofficemojo.com/people/chart/?view=Actor&id=adamsandler.htm'


# In[44]:


soup = get_soup(box_sandler_url)


# In[45]:


# select() returns a list of matches; the first <table> that directly
# follows a <br> is kept and turned back into an HTML string.
box_gross_table = str(soup.select('br + table')[0])


# In[46]:


# header=0 makes the first table row the column names.
gross = pd.read_html(box_gross_table, header=0)[0]


# In[47]:


gross.head()


# In[48]:


# Remove the columns that are not needed, in a single call.
unused_columns = ['Unnamed: 6', 'Unnamed: 7', 'Opening / Theaters', 'Rank', 'Studio']
gross.drop(unused_columns, axis=1, inplace=True)


# In[49]:


# Give the three remaining columns short names.
gross.columns = ['Date', 'Title', 'Gross']


# In[50]:


gross.set_index('Title', inplace=True)


# In[51]:


# `[$,]` is a regular-expression character class (any '$' or ','), so regex
# matching is requested explicitly: since pandas 2.0 `str.replace` treats the
# pattern as a literal string by default, which would leave the values
# untouched and make the integer conversion fail.
gross.Gross = gross.Gross.str.replace(r'[$,]', '', regex=True).astype(int)


# In[52]:


gross.head()


# In[53]:


gross.to_csv('imdb-gross.csv')


# ### IMDB: Analysis

# Load both datasets and plot the same values

# In[54]:


# Reload both saved tables; the first CSV column (the title) is the index.
ratings = pd.read_csv('imdb-ratings.csv', index_col=0)


# In[55]:


gross = pd.read_csv('imdb-gross.csv', index_col=0)


# In[56]:


# Express the gross in millions.
gross.Gross = gross.Gross / 1e6


# In[57]:


len(ratings)


# In[58]:


len(gross)


# In[59]:


# Two titles are spelled differently in the two tables. Copy each affected
# row under the spelling used in `ratings` and drop the old label so the join
# below can match it. `.loc` replaces the `.ix` indexer, which was removed
# from pandas in 1.0; a missing title still raises KeyError.
title_fixes = (
    ('Just Go With It', 'Just Go with It'),
    ('I Now Pronounce You Chuck and Larry', 'I Now Pronounce You Chuck & Larry'),
)
for gross_title, ratings_title in title_fixes:
    gross.loc[ratings_title] = gross.loc[gross_title]
    gross = gross.drop(gross_title)


# In[61]:


# Index-aligned join: keeps every row of `gross` and adds its rating.
imdb = gross.join(ratings)


# In[62]:


# Row count before and after discarding rows with missing values.
len(imdb), len(imdb.dropna())


# In[63]:


imdb = imdb.dropna()


# In[64]:


# The keys of this data source are the column names used by both the glyph
# below and the hover tooltips ("@rating", "@gross", "@movie").
source = plt.ColumnDataSource(
    data=dict(
        rating=imdb.rating,
        gross=imdb.Gross,
        movie=imdb.index,
    )
)

# The x axis shows the IMDB rating here (range 0-10), so it is labelled as
# such instead of reusing the Rotten Tomatoes label from the earlier charts.
p = plt.figure(tools='reset,save,hover', x_range=[0, 10], title='',
               x_axis_label="IMDB rating", y_axis_label="Box Office Gross")
# When a `source` is supplied, Bokeh expects the coordinates to be given as
# column names of that source rather than as separate sequences.
p.scatter('rating', 'gross', size=10, source=source)

# The 'hover' entry in `tools` created a HoverTool; look it up to set what
# it displays for each point.
hover = p.select(dict(type=HoverTool))
hover.tooltips = [
    ("Movie", "@movie"),
    ("Rating", "@rating"),
    ("Box Office Gross", "@gross"),
]

plt.show(p)


# Interesting: the result is very different. You can see two clusters in this case: the movies that grossed more than 100M and those that grossed less than 100M.

# In[65]:


# Two-column array (rating, gross) with one row per movie.
feature_columns = ['rating', 'Gross']
X = imdb[feature_columns].values


# In[66]:


clf = KMeans(n_clusters=2)


# In[67]:


clf.fit(X)


# In[68]:


# Cluster label assigned to each row of X.
clusters = clf.predict(X)
clusters


# In[69]:


# Start from the labels rendered as strings, then overwrite each label with
# the colour chosen for it (label 0 -> green, 1 -> red).
colors = clusters.astype(str)
for label, color_name in enumerate(('green', 'red')):
    colors[clusters == label] = color_name


# In[70]:


# The keys of this data source are the column names used by the glyph and
# the hover tooltips. The per-movie cluster colour is stored as a column
# too, so that every per-point value comes from the same source.
source = plt.ColumnDataSource(
    data=dict(
        rating=imdb.rating,
        gross=imdb.Gross,
        movie=imdb.index,
        cluster_color=list(colors),
    )
)

# The x axis shows the IMDB rating here (range 0-10), so it is labelled as
# such instead of reusing the Rotten Tomatoes label from the earlier charts.
p = plt.figure(tools='reset,save,hover', x_range=[0, 10], title='',
               x_axis_label="IMDB rating", y_axis_label="Box Office Gross")
# When a `source` is supplied, Bokeh expects per-point values (coordinates
# and colours alike) to be given as column names of that source rather than
# as separate sequences.
p.scatter('rating', 'gross', size=10, source=source, color='cluster_color')

# The 'hover' entry in `tools` created a HoverTool; look it up to set what
# it displays for each point.
hover = p.select(dict(type=HoverTool))
hover.tooltips = [
    ("Movie", "@movie"),
    ("Rating", "@rating"),
    ("Box Office Gross", "@gross"),
]

plt.show(p)


# In[ ]: