#!/usr/bin/env python
# coding: utf-8
"""Scatter-plot the 'Rank' column against Rotten Tomatoes scores with Bokeh.

Reads the tab-separated file ``IMDB_RT_some.txt``, drops incomplete rows,
and draws one scatter series per content-rating group (NC-17, R, PG-13, PG,
G, other).  Hovering a point shows the title and both scores.  The plot is
written to ``toolbar.html`` and, because ``output_notebook()`` is also
active, displayed inline when run inside a notebook.

Requires Bokeh >= 1.4 (for the ``legend_label`` glyph keyword).
"""

# In[7]:

# %load get_rotten.py
__author__ = 'ju'

import json

import numpy as np
import pandas as pd
import requests

try:  # Python 2
    from urllib2 import Request, urlopen, URLError
except ImportError:  # Python 3: same names live in urllib.request / urllib.error
    from urllib.error import URLError
    from urllib.request import Request, urlopen

import bokeh
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import (
    ColumnDataSource,
    figure,
    output_file,
    output_notebook,
    show,
)

output_notebook()

# ---------------------------------------------------------------------------
# One-off scraping step that produced 'IMDB_RT_some.txt' (kept for reference;
# it is Python 2 code and is intentionally left disabled).
# ---------------------------------------------------------------------------
# df = pd.read_csv('IMDB_7000.txt',sep='\t')
# df = df[df['Votes']>=50000]
# print df
#
# #print '%'.join(['The','Shawshank'])
# imdb = []
# rt = []
# rated = []
# year = []
# genre = []
#
# for i in df.index:
#     url = 'http://www.omdbapi.com/?t='+'%25'.join(df.loc[i,'Title'].split(' '))+'&y='+str(int(df.loc[i,'Year']))+'&tomatoes=true'
#     result = json.load(urlopen(url))
#     print df.loc[i,'Title']
#     if 'imdbRating' in result:
#         imdb.append(result['imdbRating'])
#     else:
#         imdb.append(np.nan)
#     if 'tomatoMeter' in result:
#         rt.append(result['tomatoMeter'])
#     else: rt.append(np.nan)
#     if 'Year' in result:
#         year.append(result['Year'])
#     else: year.append(np.nan)
#     if 'Genre' in result:
#         genre.append(result['Genre'])
#     else: genre.append(np.nan)
#     if 'Rated' in result:
#         rated.append(result['Rated'])
#     else: rated.append(np.nan)
# df['Year'] = pd.Series(year,index=df.index)
# df['Genre'] = pd.Series(genre,index=df.index)
# df['Rated'] = pd.Series(rated,index=df.index)
# df['IMDB Rating'] = pd.Series(imdb,index=df.index)
# df['Rotten Tomatoes'] = pd.Series(rt, index=df.index)
#
# df=df.dropna()
# print df
# df.to_csv('IMDB_RT_some.txt',sep='\t',index=False)

df = pd.read_csv('IMDB_RT_some.txt', sep='\t')
# df = df.drop('Unnamed: 0', axis=1)
df = df.dropna()

# Flat list of every genre tag; 'Genre' cells hold ', '-separated tags.
allgen = [tag for genres in df['Genre'] for tag in genres.split(', ')]
# print set(allgen)
# print set(df['Rated'])

output_file("toolbar.html")

# Ratings that are lumped together into the grey 'other' series.
OTHER_RATINGS = ['GP', 'M', 'NOT RATED', 'PASSED', 'X', 'UNRATED', 'APPROVED']

# (legend label, values of the 'Rated' column, marker colour), in draw order.
RATING_GROUPS = [
    ('NC-17', ['NC-17'], 'orange'),
    ('R', ['R'], 'red'),
    ('PG-13', ['PG-13'], 'blue'),
    ('PG', ['PG'], 'green'),
    ('G', ['G'], '#32cd32'),
    ('other', OTHER_RATINGS, 'grey'),
]


def _rating_source(frame):
    """Return a ColumnDataSource holding plot and tooltip data for *frame*.

    Columns:
        x, y  -- numeric 'Rank' and 'Rotten Tomatoes' values used as the
                 glyph coordinates.
        ra    -- 'Rank' rendered as text for the hover tooltip.
        rt    -- 'Rotten Tomatoes' truncated to an integer and rendered as
                 text for the hover tooltip.
        desc  -- 'Title', shown in the hover tooltip.
    """
    rank = frame['Rank'].tolist()
    tomato = frame['Rotten Tomatoes'].tolist()
    return ColumnDataSource(
        data=dict(
            x=rank,
            y=tomato,
            ra=[str(value) for value in rank],
            rt=[str(int(value)) for value in tomato],
            desc=frame['Title'].tolist(),
        )
    )


hover = HoverTool(
    tooltips=[
        # ("index", "$index"),
        ("Title", "@desc"),
        ('IMDB Rating', '@ra'),
        ('Rotten Tomatoes', '@rt'),
        # ("(x,y)", "($x, $y)"),
    ]
)

# NOTE(review): the 'Rank' column is plotted and labelled as the IMDB rating;
# presumably it holds the rating value rather than a list position -- confirm
# against the data file.
p = figure(
    title='IMDB vs. RT',
    tools=[hover],
    x_axis_label='IMDB Rating',
    y_axis_label='Rotten Tomatoes Rating',
)

for label, ratings, colour in RATING_GROUPS:
    subset = df[df['Rated'].isin(ratings)]
    # Coordinates are referenced by column name: Bokeh does not allow mixing
    # raw sequences with an explicit ``source``.
    p.scatter(
        'x',
        'y',
        size=5,
        source=_rating_source(subset),
        color=colour,
        legend_label=label,
    )

# 'bottom_right' is a legend *location*; ``orientation`` only accepts
# 'horizontal' / 'vertical'.
p.legend.location = 'bottom_right'

show(p)


# In[8]:

from IPython.display import HTML


# In[9]:

HTML(''' The raw code for this IPython notebook is by default hidden for easier reading. To toggle on/off the raw code, click here.''')


# In[ ]: