# %load get_rotten.py
__author__ = 'ju'
import json
import pandas as pd
import numpy as np
import requests
from urllib2 import Request, urlopen, URLError
from bokeh.plotting import figure, output_file, show, ColumnDataSource
from bokeh.models import HoverTool, BoxSelectTool
import bokeh
# Route Bokeh output to the notebook as well. The call goes through the
# package because `output_notebook` is not among the names pulled in by the
# `from bokeh.plotting import ...` line, so the bare name raised NameError
# whenever this file ran outside a session that had already defined it.
bokeh.plotting.output_notebook()
# df = pd.read_csv('IMDB_7000.txt',sep='\t')
# df = df[df['Votes']>=50000]
# print df
#
# #print '%'.join(['The','Shawshank'])
# imdb = []
# rt = []
# rated = []
# year = []
# genre = []
#
# for i in df.index:
# url = 'http://www.omdbapi.com/?t='+'%25'.join(df.loc[i,'Title'].split(' '))+'&y='+str(int(df.loc[i,'Year']))+'&tomatoes=true'
# result = json.load(urlopen(url))
# print df.loc[i,'Title']
# if 'imdbRating' in result:
# imdb.append(result['imdbRating'])
# else:
# imdb.append(np.nan)
# if 'tomatoMeter' in result:
# rt.append(result['tomatoMeter'])
# else: rt.append(np.nan)
# if 'Year' in result:
# year.append(result['Year'])
# else: year.append(np.nan)
# if 'Genre' in result:
# genre.append(result['Genre'])
# else: genre.append(np.nan)
# if 'Rated' in result:
# rated.append(result['Rated'])
# else: rated.append(np.nan)
# df['Year'] = pd.Series(year,index=df.index)
# df['Genre'] = pd.Series(genre,index=df.index)
# df['Rated'] = pd.Series(rated,index=df.index)
# df['IMDB Rating'] = pd.Series(imdb,index=df.index)
# df['Rotten Tomatoes'] = pd.Series(rt, index=df.index)
#
# df=df.dropna()
# print df
# df.to_csv('IMDB_RT_some.txt',sep='\t',index=False)
# Load the tab-separated IMDB / Rotten Tomatoes table (the file the
# commented-out scraping step above writes with `df.to_csv`).
df = pd.read_csv('IMDB_RT_some.txt', sep='\t')
#df=df.drop('Unnamed: 0',axis=1)
# Drop every row that has a missing value in any column.
df = df.dropna()

# Flatten the 'Genre' column into one list of genre names: each cell is split
# on ', '.  `extend` appends in place; the previous `allgen = allgen + ...`
# rebuilt the whole list on every row.
allgen = []
for genre_cell in df['Genre']:
    allgen.extend(genre_cell.split(', '))
#print set(allgen)
#print set(df['Rated'])
# Write the rendered plot to a standalone HTML file.
output_file("toolbar.html")

# Data source covering every title at once.  Nothing below plots from it (the
# only scatter call that used it is commented out), but the name is kept in
# case other cells still refer to it.
source = ColumnDataSource(
    data=dict(
        x=df['Rank'].tolist(),
        y=df['Rotten Tomatoes'].tolist(),
        desc=df['Title'].tolist(),
    )
)

# Hover tool shared by all scatter layers.  The '@name' fields are looked up
# in the data source of whichever glyph is under the cursor, so each
# per-rating source must provide 'desc', 'ra' and 'rt' columns.
# (An earlier HoverTool assigned to this same name was overwritten before it
# was ever used and has been removed.)
hover = HoverTool(
    tooltips=[
        #("index", "$index"),
        ("Title", "@desc"),
        ('IMDB Rating', '@ra'),
        ('Rotten Tomatoes', '@rt')
        #("(x,y)", "($x, $y)"),
    ]
)
def _rating_source(frame):
    """Build the ColumnDataSource behind one rating group's scatter layer.

    ``x`` / ``y`` hold the plotted coordinates ('Rank' and 'Rotten Tomatoes').
    ``ra`` / ``rt`` / ``desc`` are the columns the hover tooltips reference:
    the rank as a string, the Rotten Tomatoes value truncated to an integer
    string, and the title.
    """
    ranks = frame['Rank'].tolist()
    scores = frame['Rotten Tomatoes'].tolist()
    return ColumnDataSource(
        data=dict(
            x=ranks,
            y=scores,
            ra=[str(value) for value in ranks],
            rt=[str(int(value)) for value in scores],
            desc=frame['Title'].tolist(),
        )
    )


# One sub-frame and one data source per rating category.
r = df[df['Rated'] == 'R']
rsource = _rating_source(r)

pg13 = df[df['Rated'] == 'PG-13']
pg13source = _rating_source(pg13)

pg = df[df['Rated'] == 'PG']
pgsource = _rating_source(pg)

g = df[df['Rated'] == 'G']
gsource = _rating_source(g)

nc17 = df[df['Rated'] == 'NC-17']
nc17source = _rating_source(nc17)

# Every remaining rating label listed here is lumped into a single group.
other = df[df['Rated'].isin(['GP', 'M', 'NOT RATED', 'PASSED', 'X', 'UNRATED', 'APPROVED'])]
othersource = _rating_source(other)

# NOTE(review): the x axis and the hover tooltip are labelled 'IMDB Rating',
# but the values plotted come from the 'Rank' column - confirm that is the
# intended column.
p = figure(title='IMDB vs. RT', tools=[hover], x_axis_label='IMDB Rating', y_axis_label='Rotten Tomatoes Rating')
#p.scatter(df['Rank'].tolist(), df['Rotten Tomatoes'], size=5, source=source)

# Layers are added in this order so draw order and legend order are unchanged.
# Coordinates are referenced by column name: handing literal lists to a glyph
# method together with `source=` is not accepted by newer Bokeh releases.
_layers = (
    (nc17source, 'orange', 'NC-17'),
    (rsource, 'red', 'R'),
    (pg13source, 'blue', 'PG-13'),
    (pgsource, 'green', 'PG'),
    (gsource, '#32cd32', 'G'),
    (othersource, 'grey', 'other'),
)
for _layer_source, _layer_color, _layer_label in _layers:
    p.scatter('x', 'y', size=5, source=_layer_source, color=_layer_color, legend=_layer_label)

# NOTE(review): this is the legend-placement spelling of the Bokeh version the
# script was written for; later releases are believed to use
# `p.legend.location` instead - confirm before upgrading.
p.legend.orientation = 'bottom_right'
show(p)