%matplotlib inline
import gc
import json, re
import unicodedata
import random, math
from pattern import web
from HTMLParser import HTMLParser
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 30)

# set some nicer defaults for matplotlib
from matplotlib import rcParams

api_key = 'sdat77wpkdnuyvb9bqhz2y5v'  # Nick's api key for rotten tomatoes

# these colors come from colorbrewer2.org. Each is an RGB triplet
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
                (0.4, 0.4, 0.4)]

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.grid'] = False
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'none'

def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Minimize chartjunk by stripping out unnecessary plot borders and axis ticks

    The top/right/left/bottom keywords toggle whether the corresponding plot
    border is drawn
    """
    ax = axes or plt.gca()
    ax.spines['top'].set_visible(top)
    ax.spines['right'].set_visible(right)
    ax.spines['left'].set_visible(left)
    ax.spines['bottom'].set_visible(bottom)

    # turn off all ticks
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')

    # now re-enable visibles
    if top:
        ax.xaxis.tick_top()
    if bottom:
        ax.xaxis.tick_bottom()
    if left:
        ax.yaxis.tick_left()
    if right:
        ax.yaxis.tick_right()

def histogram_style():
    remove_border(left=False)
    plt.grid(False)
    plt.grid(axis='y', color='w', linestyle='-', lw=1)
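# Quick illustrative smoke test of the styling helpers above. The toy data is
# made up and is not part of the analysis; it just confirms that the rcParams,
# remove_border, and histogram_style settings behave as intended.
plt.hist(np.random.randn(1000), bins=30, color=dark2_colors[0])
histogram_style()
plt.show()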
"""
EXAMPLE OF USAGE:

allDataEver = get_all_years()
allDataEver = repair_dict(allDataEver)
print allDataEver[2013]['Best Cinematography']
print get_by_award_all_years('Best Picture', allDataEver)
print get_by_year(1952)

# official oscar titles
# note Best Picture instead of Best Achievement in Picture etc.
# note that not all oscars exist for every year, i.e. best black and white
oscarArray = ['Best Picture', 'Best Actor in a Leading Role',
              'Best Actress in a Leading Role', 'Best Actor in a Supporting Role',
              'Best Actress in a Supporting Role', 'Best Director',
              'Best Assistant Director',
              'Best Writing, Screenplay Written Directly for the Screen',
              'Best Writing, Screenplay Based on Material from Another Medium',
              'Best Cinematography', 'Best Art Direction-Set Decoration',
              'Best Costume Design', 'Best Sound', 'Best Film Editing',
              'Best Sound Editing', 'Best Effects, Visual Effects', 'Best Makeup',
              'Best Music, Original Song', 'Best Music, Original Score',
              'Best Short Film, Animated', 'Best Short Film, Live Action',
              'Best Documentary, Short Subjects', 'Best Documentary, Features',
              'Best Foreign Language Film']
"""

# HELPER FUNCTIONS
from pattern.web import URL, DOM, plaintext, strip_between
from pattern.web import NODE, TEXT, COMMENT, ELEMENT, DOCUMENT

# get rid of accents etc.
def fixWeirdChars(phrase):
    # map the common accented-vowel hex entities onto a plain 'e';
    # decode any other hex entity to its raw character
    HexEncode = ['E9', 'E1', 'E0', 'B7']
    matchObj = re.search(r'&#x..', phrase)
    if matchObj:
        accentedHex = matchObj.group()
        HexPart = accentedHex.split('x')[1]
        if HexPart in HexEncode:
            phrase = re.sub(accentedHex + ';', 'e', phrase)
        else:
            Num = int("0x" + HexPart, 16)
            phrase = re.sub(accentedHex + ';', chr(Num), phrase)
        # recurse until no hex entities remain
        return fixWeirdChars(phrase)
    return phrase

def clean_unicode(s):
    if isinstance(s, unicode):
        # drop any raw non-ASCII characters before casting to str
        s = s.encode('ascii', 'ignore')
    return fixWeirdChars(str(s))

# repair award names so they match the official oscar titles
def repair_dict(dictionary):
    renames = {
        'Best Motion Picture of the Year': 'Best Picture',
        'Best Performance by an Actor in a Leading Role': 'Best Actor in a Leading Role',
        'Best Performance by an Actress in a Leading Role': 'Best Actress in a Leading Role',
        'Best Performance by an Actress in a Supporting Role': 'Best Actress in a Supporting Role',
        'Best Performance by an Actor in a Supporting Role': 'Best Actor in a Supporting Role',
        'Best Writing, Original Screenplay': 'Best Writing, Screenplay Written Directly for the Screen',
        'Best Writing, Adapted Screenplay': 'Best Writing, Screenplay Based on Material from Another Medium',
        'Best Foreign Language Film of the Year': 'Best Foreign Language Film',
        'Best Achievement in Directing': 'Best Director',
    }
    for year in dictionary:
        # iterate over a copy of the keys since we rename in place
        for award in dictionary[year].keys():
            if award in renames:
                dictionary[year][renames[award]] = dictionary[year].pop(award)
            elif 'Best Achievement' in award:
                newAwardName = award
                if 'Mixing' in award:
                    newAwardName = re.sub(' Mixing', '', newAwardName)
                if ' and Hairstyling' in award:
                    newAwardName = re.sub(' and Hairstyling', '', newAwardName)
                if 'Music Written for Motion Pictures' in award:
                    newAwardName = re.sub(' Written for Motion Pictures', '', newAwardName)
                newAwardName = re.sub(' Achievement in', '', newAwardName)
                if 'Visual Effects' in newAwardName:
                    # add comma
                    newAwardName = 'Best Effects, Visual Effects'
                dictionary[year][newAwardName] = dictionary[year].pop(award)
    return dictionary
# END OF HELPER FUNCTIONS
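# Illustrative check of the entity cleanup above: an HTML hex entity for an
# accented vowel should collapse to plain ASCII.
print clean_unicode('Am&#xE9;lie')  # expected: Amelie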
"""
Gets the movie titles for all Oscars for a given year, i.e. get_by_year(2012)

Function
--------
get_by_year

Input
    int year

Outputs a dict of the format:
    'Award':
        'nominees': ['a', 'b', 'c', 'd']
        'winner': 'f'
"""
def get_by_year(year):
    url = URL("http://www.imdb.com/event/ev0000003/" + str(year))
    dom = DOM(url.download(cached=True))
    dictAll = {}
    awards = dom.by_class('award')
    awardTitles = awards[0].by_tag('h2')
    awardList = [award.content for award in awardTitles]
    prize = awards[0].by_tag('blockquote')
    for index, title in enumerate(prize[1:25]):
        winner = title.by_tag('strong')[0].by_tag('a')[0].content
        winner_id = str(title.by_tag('strong')[0].by_tag('a')[0].attrs['href'][-8:-1])
        nomineeList = []
        for each in title.by_tag('strong')[1:]:
            name = each.by_tag('a')[0].content
            id = str(each.by_tag('a')[0].attrs['href'][-8:-1])
            nomineeList.append((clean_unicode(name), id))
        winnersAndNominees = {}
        winnersAndNominees['winner'] = (clean_unicode(winner), winner_id)
        winnersAndNominees['nominees'] = nomineeList
        dictAll[awardList[index]] = winnersAndNominees
    return dictAll

"""
Gets the movie titles for all Oscars for all years between 2003 and 2013.

Function
--------
get_all_years()

Outputs a dict of dicts of the format:
    'Year':
        'Award':
            'nominees': ['a', 'b', 'c', 'd']
            'winner': 'f'
"""
def get_all_years():
    hugeDict = {}  # lol
    for year in range(2003, 2014):
        hugeDict[year] = get_by_year(str(year))
    return hugeDict

'''
Gets all data for every year for a specific oscar

Function
------
get_by_award_all_years(oscar, dictionary)

Outputs a dict of the format:
    'Year':
        'nominees': ['a', 'b', 'c', 'd']
        'winner': 'f'
'''
def get_by_award_all_years(oscar, dictionary):
    yearDict = {}
    for year in dictionary:
        if oscar in dictionary[year].keys():
            yearDict[year] = dictionary[year][oscar]
    return yearDict

# scrape and save the oscar data (slow; only needs to run once)
# oscars = get_all_years()
# oscars = repair_dict(oscars)
# f = open("data/oscars.json", "w+")
# json.dump(oscars, f)
# f.close()

# load an existing dictionary
f = open("data/oscars.json", "rb")
oscars = json.load(f)
f.close()
print oscars

movie_awards = [u'Best Documentary, Features', u'Best Writing, Screenplay Written Directly for the Screen',
                u'Best Foreign Language Film', u'Best Short Film, Live Action', u'Best Music, Original Song',
                u'Best Music, Original Score', u'Best Art Direction-Set Decoration', u'Best Costume Design',
                u'Best Short Film, Animated', u'Best Makeup', u'Best Film Editing', u'Best Animated Feature',
                u'Best Writing, Screenplay Based on Material from Another Medium', u'Best Sound Editing',
                u'Best Cinematography', u'Best Documentary, Short Subjects', u'Best Sound', u'Best Director',
                u'Best Picture', u'Best Visual Effects']

# http://stackoverflow.com/questions/9708374/python-how-to-find-the-string-with-most-matches-in-a-list-of-strings
def find_correct_award(awards, keystr):
    keywords = re.split(' |-', keystr.lower())
    def matches(text):
        return sum(word in text.lower() for word in keywords)
    return max(awards, key=matches)

# map every award in each year onto its canonical name from movie_awards
for year in oscars:
    for award in movie_awards:
        if award not in oscars[year].keys():
            old = find_correct_award(oscars[year].keys(), award)
            oscars[year][award] = oscars[year].pop(old)

print oscars['2003']['Best Cinematography']
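# Illustrative example of the fuzzy matching above: the canonical name shares
# the most keywords with IMDB's older title, so that title wins.
print find_correct_award(['Best Achievement in Cinematography', 'Best Picture'],
                         'Best Cinematography')
# expected: Best Achievement in Cinematography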
def get_top_movies_by_year(yr, num_movies=200):
    num_pages = int((num_movies - 1) / 100) + 1
    missing_rtid = 0
    missing_imdbid = 0
    alldata = []
    for pagenum in range(1, num_pages + 1):
        url = ('http://boxofficemojo.com/yearly/chart/?page=' + str(pagenum) +
               '&view=releasedate&view2=domestic&yr=' + str(yr) + '&p=.htm')
        page_xml = requests.get(url).text
        dom = web.Element(page_xml)
        for table in dom.by_tag('table'):
            cellpadding = table.attributes['cellpadding']
            if cellpadding == '5':  # isolates the table with movie data
                for row in table.by_tag('tr'):
                    cols = row.by_tag('td')
                    # catches movie rows
                    try:
                        rank = int(web.strip_tags(cols[0]))
                        if rank <= num_movies:  # checks to make sure we're in the right spot!
                            movie_title = web.strip_tags(cols[1]).strip('\t')
                            movie_title = unicodedata.normalize('NFKD', movie_title).encode('ascii', 'ignore')  # removes accents
                            studio = web.strip_tags(cols[2]).strip('\t')
                            # the try/excepts below catch a NONE instead of a
                            # string that is a candidate for integer casting
                            try:
                                total_gross = int(web.strip_tags(cols[3]).strip('$').replace(',', ''))
                            except:
                                total_gross = None
                            try:
                                num_theaters_total = int(web.strip_tags(cols[4]).replace(',', ''))
                            except:
                                num_theaters_total = None
                            try:
                                opening_revenue = int(web.strip_tags(cols[5]).strip('$').replace(',', ''))
                            except:
                                opening_revenue = None
                            try:
                                num_theaters_opening = int(web.strip_tags(cols[6]).replace(',', ''))
                            except:
                                num_theaters_opening = None
                            date_open = web.strip_tags(cols[7]).strip('\t')
                            date_close = web.strip_tags(cols[8]).strip('\t')
                            # using rotten tomatoes to get rtid/imdbid
                            # (params= handles url-encoding of the title)
                            url2 = 'http://api.rottentomatoes.com/api/public/v1.0/movies.json'
                            options = {'q': movie_title, 'page_limit': 10, 'apikey': api_key}
                            info_data = requests.get(url2, params=options).text
                            info_json = json.loads(info_data)
                            movie_count = 0
                            rtid = ''
                            imdbid = ''
                            try:
                                for movie in info_json['movies']:
                                    if movie['year'] == yr:
                                        movie_count += 1
                                        rtid = movie['id']
                                        try:
                                            imdbid = movie['alternate_ids']['imdb']
                                        except:
                                            imdbid = ''
                            except:
                                pass
                            if movie_count != 1:  # case where more than one movie met the description
                                rtid = ''
                                imdbid = ''
                            # second attempt to obtain imdb ids using the omdbapi;
                            # also try the two previous years for late releases
                            if imdbid == '':
                                for candidate_yr in (yr, yr - 1, yr - 2):
                                    try:
                                        omdb_link = "http://www.omdbapi.com/?t=%s&y=%s" % (movie_title, candidate_yr)
                                        omdb_text = web.URL(omdb_link).download(cached=True)
                                        imdbid = json.loads(omdb_text)['imdbID'].replace("tt", "")
                                        break
                                    except:
                                        imdbid = ''
                                if imdbid == '':
                                    missing_imdbid += 1
                            if rtid == '':
                                missing_rtid += 1
                            data_row = [rank, yr, movie_title, imdbid, rtid, studio,
                                        total_gross, num_theaters_total, opening_revenue,
                                        num_theaters_opening, date_open, date_close]
                            alldata.append(data_row)
                    except ValueError:
                        pass
    result = pd.DataFrame(alldata, columns=['rank', 'year', 'movie_title', 'imdbid', 'rtid', 'studio',
                                            'total_gross', 'num_theaters_total', 'opening_revenue',
                                            'num_theaters_opening', 'date_open', 'date_close'])
    if missing_imdbid > 0:
        print 'unable to find an imdbid for ' + str(missing_imdbid) + ' movies of ' + str(num_movies) + ' in ' + str(yr)
    if missing_rtid > 0:
        print 'unable to find a rtid for ' + str(missing_rtid) + ' movies of ' + str(num_movies) + ' in ' + str(yr)
    return result

# Saves all data
# alldata = get_top_movies_by_year(2003, 200)
# for yr in range(2004, 2013):
#     num_movies = 200
#     result = get_top_movies_by_year(yr, num_movies)
#     alldata = alldata.append(result, ignore_index=True)
#     print yr
# filename = './data/alldata_' + str(num_movies) + '.csv'
# alldata.to_csv(filename, index=False)

# Saves just title and id data in a separate file
# narrowed_data = pd.DataFrame(alldata, columns=['year', 'rank', 'movie_title', 'imdbid', 'rtid'])
# filename = './data/iddata_' + str(num_movies) + '.csv'
# narrowed_data.to_csv(filename, index=False)
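# Illustrative spot-check of the scraper (kept commented: it needs network
# access and a live Rotten Tomatoes api key):
# get_top_movies_by_year(2012, num_movies=10).head()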
bomj_df = pd.read_csv('data/completedata.csv')

def test_for_nones(df, column_name):
    # boolean mask that is True where the (string-valued) column is usable
    result = []
    for item in df[column_name]:
        result.append(item != 'None' and item != 'NaN')
    return result

# keep only rows with a usable imdbid (stored with 7 characters)
bomj_df = bomj_df[test_for_nones(bomj_df, 'imdbid')]
# removes rtid, which is calculated later
bomj_df = bomj_df[['rank', 'year', 'movie_title', 'imdbid', 'studio', 'total_gross',
                   'num_theaters_total', 'opening_revenue', 'num_theaters_opening',
                   'date_open', 'date_close']]

# ensures proper formatting of a column (currently only used for ints;
# the original string branch is left out since its callers are commented below)
def formatting_verification(column, fun):
    result = []
    for val in column:
        if fun == 'int':
            if val == 'None':
                result.append(None)
            else:
                result.append(int(val))
    return result

bomj_df['imdbid'] = bomj_df['imdbid'].map(lambda id: str(id).zfill(7))
# bomj_df['rank'] = formatting_verification(bomj_df['rank'].values, 'int')
# bomj_df['year'] = formatting_verification(bomj_df['year'].values, 'int')
# bomj_df['total_gross'] = formatting_verification(bomj_df['total_gross'].values, 'int')
# bomj_df['num_theaters_total'] = formatting_verification(bomj_df['num_theaters_total'].values, 'int')
# bomj_df['opening_revenue'] = formatting_verification(bomj_df['opening_revenue'].values, 'int')
# bomj_df['num_theaters_opening'] = formatting_verification(bomj_df['num_theaters_opening'].values, 'int')
# bomj_df['date_open'] = pd.to_datetime(bomj_df['date_open'])
# bomj_df['date_close'] = pd.to_datetime(bomj_df['date_close'])

imdb_as_index = bomj_df.set_index('imdbid')
imdb_as_index.head()

# how many oscar nominees/winners does the Box Office Mojo top-200 capture?
data_by_year = imdb_as_index.groupby('year').groups
performance = {}
total_nominees_present = 0
total_nominees_absent = 0
total_winners_present = 0
total_winners_absent = 0
for (yr, yr_list) in oscars.iteritems():
    yr = int(yr)
    if yr > 2003:
        yr = yr - 1  # the ceremony honors films released the previous year
        movieids = data_by_year[yr]
        nominee_ids = []
        winner_ids = []
        nominees_present = 0
        nominees_absent = 0
        winners_present = 0
        winners_absent = 0
        for (award, award_list) in yr_list.iteritems():
            winner_ids.append(award_list['winner'][1])
            for nominee in award_list['nominees']:
                nominee_ids.append(nominee[1])
        for nid in nominee_ids:
            is_movie_saved = False
            for top200id in movieids:
                try:
                    corrected_id = str(int(top200id)).zfill(7)
                    if corrected_id == nid:
                        is_movie_saved = True
                except:
                    pass
            if is_movie_saved:
                nominees_present += 1
            else:
                nominees_absent += 1
        for wid in winner_ids:
            is_movie_saved = False
            for top200id in movieids:
                try:
                    corrected_id = str(int(top200id)).zfill(7)
                    if corrected_id == wid:
                        is_movie_saved = True
                except:
                    pass
            if is_movie_saved:
                winners_present += 1
            else:
                winners_absent += 1
        total_nominees_present += nominees_present
        total_nominees_absent += nominees_absent
        total_winners_present += winners_present
        total_winners_absent += winners_absent
        performance[yr] = [winners_present, winners_absent, nominees_present, nominees_absent]
        # print str(yr) + ' nominee performance: ' + str(float(nominees_present) / (nominees_present + nominees_absent))
        # print str(yr) + ' winner performance: ' + str(float(winners_present) / (winners_present + winners_absent))
        # print
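# Optional tabular view of the per-year coverage computed above (illustrative;
# 'coverage' is a name introduced here and not used elsewhere):
coverage = pd.DataFrame(performance,
                        index=['winners_present', 'winners_absent',
                               'nominees_present', 'nominees_absent']).T
print coverage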
print 'total nominee performance: ' + str(float(total_nominees_present) / (total_nominees_present + total_nominees_absent))
print 'total winner performance: ' + str(float(total_winners_present) / (total_winners_present + total_winners_absent))

# the schema we will use for review data:
print "For Example:"
pd.DataFrame({"critic": [''], "norm_score": [''], "quote": [''], "id": [''], "title": [''],
              "source": [''], "overall_score": [''], "year": [''], "date": ['']}).head()

# write the functions to scrape the data
'''
Get Individual Review From Rotten Tomatoes
'''
def get_rtomatoes_movie_reviews(imdb):
    # api_key = 't97y5fkhfbfu9wjhse7v8ayy'
    # api_key = '8kr5nqyrshjyp4y69w8rzzgu'
    api_key = 'sdat77wpkdnuyvb9bqhz2y5v'
    # pad the imdb id to 7 characters
    imdb = imdb.zfill(7)
    url = 'http://api.rottentomatoes.com/api/public/v1.0/movie_alias.json'
    options = {'type': 'imdb', 'id': imdb, 'apikey': api_key}
    rotIds = json.loads(requests.get(url, params=options).text)
    if 'error' in rotIds:  # "Could not find a movie with the specified id"
        return None
    if 'id' not in rotIds:
        return None
    title = rotIds['title']
    year = rotIds['year']
    overall_score = rotIds['ratings']['critics_score']
    rotId = rotIds['id']
    url2 = 'http://api.rottentomatoes.com/api/public/v1.0/movies/%s/reviews.json' % rotId
    options2 = {'review_type': 'top_critic', 'page_limit': 50, 'page': 1, 'apikey': api_key}
    data = requests.get(url2, params=options2).text
    data = json.loads(data)  # load a json string into a collection of lists and dicts
    if 'total' not in data or 'reviews' not in data:
        return None
    if str(data['total']) == '0':
        return None
    ids = []
    titles = []
    years = []
    overall_scores = []
    revs = data['reviews']
    critics = []
    quotes = []
    pubs = []
    freshness = []
    dates = []
    for each in revs:
        ids.append(imdb)
        titles.append(title)
        years.append(year)
        overall_scores.append(float(overall_score) / 100.0)
        critics.append(each['critic'])
        quotes.append(each['quote'])
        pubs.append(each['publication'])
        if each['freshness'] == 'fresh':
            freshness.append(1)
        else:
            freshness.append(0)
        dates.append(each['date'])
    d = {'critic': critics, 'norm_score': freshness, 'quote': quotes, 'id': ids,
         'title': titles, 'source': pubs, 'overall_score': overall_scores,
         'date': dates, 'year': years}
    return pd.DataFrame(d)

def get_all_rtomatoes_movies(movieList):
    all_imdbs = movieList['imdbid']
    imdbs = [each for each in all_imdbs if str(each) != 'None']
    newDF = pd.DataFrame()
    for each in imdbs:
        individualDF = get_rtomatoes_movie_reviews(each)
        if individualDF is not None:
            newDF = newDF.append(individualDF, ignore_index=True)
    filename = './data/rotten_tomatoes_data.csv'
    newDF.to_csv(filename, index=False)
    return newDF

# actually call the functions and save in a file
# alldata = pd.read_csv('./revised_id_data.csv')
# rot_tom_data = get_all_rtomatoes_movies(alldata)

# once we have the stored csv file, the above calls are unnecessary and we can just do:
rotten_df = pd.read_csv('data/rotten_tomatoes_data.csv')

def filter_tomatoes(dataF):
    # drop reviews whose quote is missing
    for ind, each in enumerate(dataF['quote']):
        if str(each) == 'nan':
            dataF['quote'][ind] = 'NoReview'
    dataF = dataF[dataF.quote != 'NoReview']
    return dataF

rotten_df = filter_tomatoes(rotten_df)
rotten_df.head()
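# Equivalent one-liner for the filter above (illustrative; kept commented so
# the helper above remains the canonical path):
# rotten_df = rotten_df.dropna(subset=['quote'])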
# write the functions to scrape the data
# actually call the functions and save in a file
"""
Function
--------
get_imdb_movie_reviews: retrieves the movie reviews for a given movie on imdb

Parameters
----------
id: imdb movie ID
title: title of the movie (why scrape it when we already have it?)
year: year that the movie was made

Returns
-------
a list of review dicts of the form listed above
"""
def get_imdb_movie_reviews(id, title, year):
    score_max = 10.0
    link = "http://www.imdb.com/title/tt%0.7d/" % id
    url = web.URL(link)
    dom = web.DOM(url.download(cached=True))
    overall = float(dom.by_class("titlePageSprite star-box-giga-star")[0].content.strip()) / score_max
    # try to get year directly from page; this isn't present in every entry
    try:
        year = dom('span.itemprop[itemprop=name]')[0].next.next.by_tag('a')[0].content
        year = int(year)
    except:
        pass
    rc = dom.by_attr(itemprop="reviewCount")[0].content.split(" ")[0].replace(",", "")
    revlink = link + 'reviews?count=%s&start=0' % rc  # get at most 20 reviews
    url = web.URL(revlink)
    dom = web.DOM(url.download(cached=True))
    parser = HTMLParser()
    lst = []
    hrs = dom.by_id('tn15main').by_tag('hr')
    for hr in hrs:
        div = hr.next.next
        try:
            score = float(div.by_tag("img")[1].attrs["alt"].split("/")[0]) / score_max
            date = div.by_tag("small")[2].content
        except:
            continue
        user = div.by_tag("a")[1].content
        p = div.next.next
        # unescape entities and convert HTML line breaks into newlines
        review = parser.unescape(p.content.replace("<br/>", "\n"))
        lst.append(dict(critic=user, norm_score=score, quote=review,
                        id=id, title=title, source="IMDB",
                        overall_score=overall, year=year, date=date))
    return lst
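# Illustrative single-movie check (kept commented: it scrapes imdb over the
# network; tt0468569 is The Dark Knight):
# revs = get_imdb_movie_reviews(468569, 'The Dark Knight', 2008)
# print len(revs)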
","\n")) lst.append(dict(critic=user,norm_score=score,quote=review, id=id,title=title,source="IMDB",overall_score=overall,year=year,date=date)) return lst def get_all_imdb_movies(): fname = 'revised_id_data.csv' df = [] result = pd.read_csv('data/%s' % fname) f = open('errors.txt','w+') year = 2003 for index,row in result.iterrows(): try: row['imdbid'] = int(row['imdbid']) except: continue newyr = row['year'] if(int(newyr) != year and df != []): # filter_quotes(df) fo = open('imdbrev%d.json' % year,'w+') json.dump(df,fo) fo.close() year = int(newyr) df = [] gc.collect() try: df.append(get_imdb_movie_reviews(row['imdbid'],row['movie_title'],row['year'])) print row['movie_title'], year except: print >> f, "ERROR: %s" % row['movie_title'] f.close() if df: fo = open('imdbrev%d.json' % year,'w+') json.dump(df,fo) fo.close() # create the json files for all of the imdb reviews # get_all_imdb_movies() def filter_quotes(df): for index,row in df.iterrows(): text = row['quote'] text = re.sub(r"\\u[0-9a-zAZ]{4}", "", text) text = re.sub("\\\[^\'|\"]", "", text) text = text.replace('\n','') text = unicodedata.normalize('NFKC',unicode(text)).encode('ascii','ignore') df.ix[index, 'quote'] = text """ Here we construct the dataframe for all of the imdb reviews. We then filter it, and save it to a csv. """ """ imdb_f_list = ['imdbrev2003.json', 'imdbrev2004.json', 'imdbrev2005.json', 'imdbrev2006.json', 'imdbrev2007.json', 'imdbrev2008.json', 'imdbrev2009.json', 'imdbrev2010.json', 'imdbrev2011.json', 'imdbrev2012.json'] imdb_df = pd.DataFrame({"critic":[],"norm_score":[],"quote":[],"id":[], "title":[],"source":[],"overall_score":[],"year":[],"date":[]}) for f in imdb_f_list: fp = open('data/' + f,'rb') j = json.load(fp) fp.close() for m in j: if m != []: imdb_df = imdb_df.append(m) gc.collect() filter_quotes(imdb_df) imdb_df.to_csv('data/imdb_reviews.csv',index=False) """ pass # read saved dataframe imdb_df = pd.read_csv('data/imdb_reviews.csv') imdb_df.head() # now it's time to add the data """ Function -------- add_oscar_data: Adds data for being an oscar winner or nominee in place in the passed dataframe Parameters ---------- df: Dataframe award_name: Name of the oscar award If None, then creates general columns for winning any or being nominated in any """ movie_awards = [u'Best Documentary, Features',u'Best Writing, Screenplay Written Directly for the Screen', u'Best Foreign Language Film',u'Best Short Film, Live Action',u'Best Music, Original Song', u'Best Music, Original Score',u'Best Art Direction-Set Decoration',u'Best Costume Design', u'Best Short Film, Animated',u'Best Makeup',u'Best Film Editing',u'Best Animated Feature', u'Best Writing, Screenplay Based on Material from Another Medium',u'Best Sound Editing', u'Best Cinematography',u'Best Documentary, Short Subjects',u'Best Sound',u'Best Director', u'Best Picture',u'Best Visual Effects'] def add_oscar_data(df,imdbid_title='id',award_name=None): awards = movie_awards if award_name: awards = [award_name] win = award_name.replace(' ','_') + '_winner' nom = award_name.replace(' ','_') + '_nom' else: win = 'oscar_winner' nom = 'oscar_nom' df[win] = 0 df[nom] = 0 for year in oscars: for award in movie_awards: try: df[win][df[imdbid_title] == oscars[year][award]['winner'][1]] = 1 df[nom][df[imdbid_title] == oscars[year][award]['winner'][1]] = 1 for n in oscars[year][award]['nominees']: df[nom][df[imdbid_title] == n[1]] = 1 # skip awards that don't exist for a year except: pass add_oscar_data(bomj_df,'imdbid') bomj_df.head() def 
def plot_str_hist(df, column_name, xlabel='', ylabel='', title='', xticksize=7):
    total_movies = float(len(df))
    data_pts = []
    counts = []
    data = []
    tickloc = 0.5
    barloc = 0
    barlocs = []
    ticklocs = []
    grouped_data = df.groupby(column_name).groups
    for key, val in grouped_data.items():
        data.append((key, len(val)))
        ticklocs.append(tickloc)
        barlocs.append(barloc)
        tickloc += 1
        barloc += 1
    data.sort(key=lambda x: x[1])
    for key, val in data:
        data_pts.append(key)
        counts.append(val / total_movies)
    plt.bar(barlocs, height=counts, width=1, color='r', edgecolor="white")
    plt.xticks(rotation=90, size=xticksize)
    plt.xticks(ticklocs, data_pts)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    histogram_style()
    plt.show()

plot_str_hist(bomj_df, 'studio', xlabel='Studio', ylabel='Frequency',
              title='Studio v. Count for All Movies')
plot_str_hist(bomj_df[bomj_df.oscar_winner == 1], 'studio', xlabel='Studio', ylabel='Frequency',
              title='Studio v. Count for Winners', xticksize=10)
plot_str_hist(bomj_df[bomj_df.oscar_nom == 1], 'studio', xlabel='Studio', ylabel='Frequency',
              title='Studio v. Count for Nominees', xticksize=10)

def plot_quant_data(bomj_df, varname, cr):
    # draw the same normalized histogram for all movies, winners, and nominees
    subsets = [(bomj_df, 'all movies'),
               (bomj_df[bomj_df.oscar_winner == 1], 'winners'),
               (bomj_df[bomj_df.oscar_nom == 1], 'nominees')]
    for df, label in subsets:
        cleaned_data = []
        for pt in df[varname].values:
            try:
                cleaned_data.append(int(pt))
            except:
                pass
        plt.hist(cleaned_data, 20, color=cr, normed=True)
        plt.xlabel(varname)
        plt.ylabel('Frequency')
        plt.title(varname + ' v. count for ' + label)
        histogram_style()
        plt.show()
plot_quant_data(bomj_df, 'num_theaters_total', 'b')
plot_quant_data(bomj_df, 'opening_revenue', 'b')
plot_quant_data(bomj_df, 'num_theaters_opening', 'b')

print "Number of reviews: %i" % len(imdb_df)
print "Number of critics: %i" % imdb_df.critic.unique().size
print "Number of movies: %i" % imdb_df.title.unique().size

imdb_df.groupby('critic').title.count().hist(log=True, bins=range(20), edgecolor='white')
plt.xlabel("Number of reviews per critic")
plt.ylabel("N")
histogram_style()

plt.plot(imdb_df.groupby('year').critic.count().index, imdb_df.groupby('year').critic.count())
plt.xlabel('Year')
plt.ylabel('Number of Reviews')
plt.show()

# a few reviews are attached to pre-2003 releases; inspect them
imdb_df[imdb_df['year'] < 2003].groupby('year').title.unique()

# let's remove the entries for years < 2003
imdb_df = imdb_df[imdb_df['year'] >= 2003]

# Finally, let's try plotting the same graph as above
plt.plot(imdb_df.groupby('year').critic.count().index, imdb_df.groupby('year').critic.count())
plt.xlabel('Year')
plt.xticks(range(2002, 2013))
plt.ylabel('Number of Reviews')
plt.show()

print "average movie rating: %0.2f / 10" % (imdb_df.groupby('title')['overall_score'].unique().mean() * 10)
print "average user critic rating: %0.2f / 10" % (imdb_df.groupby('critic')['norm_score'].mean().mean() * 10)

imdb_df.groupby('critic').norm_score.mean().hist(bins=10, edgecolor='w', lw=1)
plt.xlabel("Average rating per critic")
plt.ylabel("N")
histogram_style()

# first let's convert the date column to an actual date object
imdb_df['date'] = pd.to_datetime(imdb_df['date'])

# keep only reviews written before Jan 1 of the year after release
def less_than_date(row):
    return row['date'] < pd.to_datetime('January 1 ' + str(row['year'] + 1))

imdb_df = imdb_df[imdb_df.apply(less_than_date, axis=1)]

print "Number of reviews: %i" % len(imdb_df)
print "Number of critics: %i" % imdb_df.critic.unique().size
print "Number of movies: %i" % imdb_df.title.unique().size

print "Number of reviews: %i" % len(rotten_df)
print "Number of critics: %i" % rotten_df.critic.unique().size
print "Number of movies: %i" % rotten_df.title.unique().size

rotten_df.groupby('critic').title.count().hist(log=True, bins=range(20), edgecolor='white')
plt.xlabel("Number of reviews per critic")
plt.ylabel("N")
histogram_style()

plt.plot(rotten_df.groupby('year').critic.count().index, rotten_df.groupby('year').critic.count())
plt.xlabel('Year')
plt.ylabel('Number of Reviews')
plt.show()

print "average movie rating: %0.2f / 10" % (rotten_df.groupby('title')['overall_score'].unique().mean() * 10)
print "average user critic rating: %0.2f / 10" % (rotten_df.groupby('critic')['norm_score'].mean().mean() * 10)

rotten_df.groupby('critic').norm_score.mean().hist(bins=10, edgecolor='w', lw=1)
plt.xlabel("Average rating per critic")
plt.ylabel("N")
histogram_style()

# apply the same pre-ceremony date cutoff used for the imdb reviews
rotten_df['date'] = pd.to_datetime(rotten_df['date'])
rotten_df = rotten_df[rotten_df.apply(less_than_date, axis=1)]
rotten_df.head()

print "Number of reviews: %i" % len(rotten_df)
print "Number of critics: %i" % rotten_df.critic.unique().size
print "Number of movies: %i" % rotten_df.title.unique().size
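# Illustrative check of the cutoff above on hypothetical rows: a 2005 release
# reviewed in March 2006 falls after Jan 1 2006 and is dropped.
print less_than_date({'date': pd.to_datetime('2006-03-01'), 'year': 2005})  # False
print less_than_date({'date': pd.to_datetime('2005-06-15'), 'year': 2005})  # True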
imdb_df['id'] = imdb_df['id'].map(lambda id: str(id).zfill(7))
add_oscar_data(imdb_df)
imdb_df.head()

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

"""
Function
--------
make_xyz

Build a bag-of-words training set for the review data

Parameters
-----------
df : Pandas DataFrame
    The review data for a dataset

vectorizer : CountVectorizer object (optional)
    A CountVectorizer object to use. If None,
    then create and fit a new CountVectorizer.
    Otherwise, re-fit the provided CountVectorizer
    using the review data

Returns
-------
X : numpy array (dims: nreview, nwords)
    Bag-of-words representation for each review.
Y : numpy array (dims: nreview)
    1/0 array. 1 = won an oscar, 0 = didn't win an oscar
Z : numpy array (dims: nreview)
    1/0 array. 1 = nominated, 0 = not nominated

Examples
--------
X, Y, Z = make_xyz(imdb_df)
"""
def make_xyz(df, vectorizer=None):
    if vectorizer is None:
        vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(df.quote)
    X = X.tocsc()  # some versions of sklearn return COO format
    gc.collect()
    # return plain arrays so positional indexing (e.g. in KFold) is safe
    return (X,
            df['oscar_winner'].astype(np.int).values,
            df['oscar_nom'].astype(np.int).values)

from random import sample

# group imdb_df by movie
grp_imdb = imdb_df.groupby('id')
# reset imdb_df for appending new info
imdb_df = pd.DataFrame()
# iterate through groups, randomly keep ~10% of each movie's reviews
# (to keep the bag-of-words matrix manageable), append to imdb_df
for name, group in grp_imdb:
    rows = sample(group.index, len(group.index) / 10)
    group = group.ix[rows]
    imdb_df = imdb_df.append(group)

# finally construct X, Y, Z
X, Y, Z = make_xyz(imdb_df)

rotten_df['id'] = rotten_df['id'].map(lambda id: str(id).zfill(7))
add_oscar_data(rotten_df)
rotten_df.head()

rottenX, rottenY, rottenZ = make_xyz(rotten_df)
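# Caveat (illustrative sketch): make_xyz always re-fits its CountVectorizer on
# whatever dataframe it receives, so separately built matrices do not share a
# vocabulary. To score genuinely new text with a fitted model, keep the fitted
# vectorizer and call transform, not fit_transform. The quote below is made up.
vec = CountVectorizer()
x_all = vec.fit_transform(imdb_df.quote)
x_new = vec.transform(['A sweeping, beautifully shot epic.'])  # hypothetical review
print x_new.shape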
print "Accuracy on train data: %0.2f%%" % (100 * clf.score(xtest, ytest)) # Your code here. Print the accuracy on the test and training dataset training_accuracy = clf.score(xtrain, ytrain) test_accuracy = clf.score(xtest, ytest) print "Accuracy on training data: %0.2f" % (training_accuracy) print "Accuracy on test data: %0.2f" % (test_accuracy) """ Function -------- calibration_plot Builds a plot like the one above, from a classifier and review data Inputs ------- clf : Classifier object A MultinomialNB classifier X : (Nexample, Nfeature) array The bag-of-words data Y : (Nexample) integer array 1 if a review is Fresh """ #your code here def calibration_plot(clf, xtest, ytest): prob = clf.predict_proba(xtest)[:, 1] outcome = ytest data = pd.DataFrame(dict(prob=prob, outcome=outcome)) #group outcomes into bins of similar probability bins = np.linspace(0, 1, 20) cuts = pd.cut(prob, bins) binwidth = bins[1] - bins[0] #freshness ratio and number of examples in each bin cal = data.groupby(cuts).outcome.agg(['mean', 'count']) cal['pmid'] = (bins[:-1] + bins[1:]) / 2 cal['sig'] = np.sqrt(cal.pmid * (1 - cal.pmid) / cal['count']) #the calibration plot ax = plt.subplot2grid((3, 1), (0, 0), rowspan=2) p = plt.errorbar(cal.pmid, cal['mean'], cal['sig']) plt.plot(cal.pmid, cal.pmid, linestyle='--', lw=1, color='k') plt.ylabel("Empirical P(Fresh)") remove_border(ax) #the distribution of P(fresh) ax = plt.subplot2grid((3, 1), (2, 0), sharex=ax) plt.bar(left=cal.pmid - binwidth / 2, height=cal['count'], width=.95 * (bins[1] - bins[0]), fc=p[0].get_color()) plt.xlabel("Predicted P(Fresh)") remove_border() plt.ylabel("Number") calibration_plot(clf, xtest, ytest) """ Function -------- log_likelihood Compute the log likelihood of a dataset according to a bayesian classifier. 
"""
Function
--------
log_likelihood

Compute the log likelihood of a dataset according to a bayesian classifier.
The Log Likelihood is defined by

L = Sum_fresh(logP(fresh)) + Sum_rotten(logP(rotten))

Where Sum_fresh indicates a sum over all fresh reviews,
and Sum_rotten indicates a sum over rotten reviews

Parameters
----------
clf : Bayesian classifier
x : (nexample, nfeature) array
    The input data
y : (nexample) integer array
    Whether each review is Fresh
"""
def log_likelihood(clf, x, y):
    prob = clf.predict_log_proba(x)
    rotten = y == 0
    fresh = ~rotten
    return prob[rotten, 0].sum() + prob[fresh, 1].sum()

from sklearn.cross_validation import KFold

def cv_score(clf, x, y, score_func):
    """
    Uses 5-fold cross validation to estimate a score of a classifier

    Inputs
    ------
    clf : Classifier object
    x : Input feature vector
    y : Input class labels
    score_func : Function like log_likelihood, that takes (clf, x, y) as input,
                 and returns a score

    Returns
    -------
    The average score obtained by randomly splitting (x, y) into training and
    test sets, fitting on the training set, and evaluating score_func on the
    test set

    Examples
    --------
    cv_score(clf, x, y, log_likelihood)
    """
    result = 0
    nfold = 5
    for train, test in KFold(y.size, nfold):  # split data into train/test groups, 5 times
        clf.fit(x[train], y[train])  # fit
        result += score_func(clf, x[test], y[test])  # evaluate score function on held-out data
    return result / nfold  # average

# the grid of parameters to search over
alphas = [0, .1, 1, 5, 10, 50]
min_dfs = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]

# Find the best value for alpha and min_df, and the best classifier
best_alpha = None
best_min_df = None
max_loglike = -np.inf

for alpha in alphas:
    for min_df in min_dfs:
        vectorizer = CountVectorizer(min_df=min_df)
        X, Y, Z = make_xyz(imdb_df, vectorizer)
        clf = MultinomialNB(alpha=alpha)
        loglike = cv_score(clf, X, Y, log_likelihood)
        if loglike > max_loglike:
            max_loglike = loglike
            best_alpha, best_min_df = alpha, min_df

print "alpha: %f" % best_alpha
print "min_df: %f" % best_min_df

# refit with the best parameters and check calibration
vectorizer = CountVectorizer(min_df=best_min_df)
X, Y, Z = make_xyz(imdb_df, vectorizer)
xtrain, xtest, ytrain, ytest = train_test_split(X, Y)
clf = MultinomialNB(alpha=best_alpha).fit(xtrain, ytrain)
calibration_plot(clf, xtest, ytest)

# Print the accuracy on the test and training dataset
training_accuracy = clf.score(xtrain, ytrain)
test_accuracy = clf.score(xtest, ytest)
print "Accuracy on training data: %0.2f" % training_accuracy
print "Accuracy on test data: %0.2f" % test_accuracy

# which words push a review hardest toward each class?
words = np.array(vectorizer.get_feature_names())
x = np.eye(xtest.shape[1])
probs = clf.predict_log_proba(x)[:, 0]
ind = np.argsort(probs)

good_words = words[ind[:10]]
bad_words = words[ind[-10:]]
good_prob = probs[ind[:10]]
bad_prob = probs[ind[-10:]]

print "Good words\t     P(fresh | word)"
for w, p in zip(good_words, good_prob):
    print "%20s" % w, "%0.2f" % (1 - np.exp(p))
print "Bad words\t     P(fresh | word)"
for w, p in zip(bad_words, bad_prob):
    print "%20s" % w, "%0.2f" % (1 - np.exp(p))

# look at the most confidently mis-predicted quotes
x, y, z = make_xyz(imdb_df, vectorizer)
prob = clf.predict_proba(x)[:, 0]
predict = clf.predict(x)

bad_rotten = np.argsort(prob[y == 0])[:5]
bad_fresh = np.argsort(prob[y == 1])[-5:]

print "Mis-predicted Rotten quotes"
print '---------------------------'
for row in bad_rotten:
    print imdb_df[y == 0].quote.irow(row)
    print

print "Mis-predicted Fresh quotes"
print '--------------------------'
for row in bad_fresh:
    print imdb_df[y == 1].quote.irow(row)
    print

clf.predict_proba(vectorizer.transform(['This movie is not remarkable, touching, or superb in any way']))
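# For readability, pair those probabilities with their class labels
# (clf.classes_ gives the column order of predict_proba):
probs = clf.predict_proba(vectorizer.transform(
    ['This movie is not remarkable, touching, or superb in any way']))
print zip(clf.classes_, probs[0])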