%matplotlib inline

import gc
import json, re
import unicodedata
import random, math
from pattern import web
from HTMLParser import HTMLParser
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 30)

# set some nicer defaults for matplotlib
from matplotlib import rcParams

api_key = 'sdat77wpkdnuyvb9bqhz2y5v' #nicks api key for rotten tomatoes

#these colors come from colorbrewer2.org. Each is an RGB triplet
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
                (0.4, 0.4, 0.4)]

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.grid'] = False
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'none'

def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Minimize chartjunk by stripping out unnecesary plot borders and axis ticks
    
    The top/right/left/bottom keywords toggle whether the corresponding plot border is drawn
    """
    ax = axes or plt.gca()
    ax.spines['top'].set_visible(top)
    ax.spines['right'].set_visible(right)
    ax.spines['left'].set_visible(left)
    ax.spines['bottom'].set_visible(bottom)
    
    #turn off all ticks
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')
    
    #now re-enable visibles
    if top:
        ax.xaxis.tick_top()
    if bottom:
        ax.xaxis.tick_bottom()
    if left:
        ax.yaxis.tick_left()
    if right:
        ax.yaxis.tick_right()
        
def histogram_style():
    remove_border(left=False)
    plt.grid(False)
    plt.grid(axis='y', color='w', linestyle='-', lw=1)

"""
EXAMPLE OF USAGE:
allDataEver = get_all_years()
allDataEver = repair_dict(oscarArray, allDataEver)
print allDataEver[2013]['Best Cinematography']
print get_by_award_all_years('Best Picture', allDataEver) 
print get_by_year(1952)

#official oscar titles
#note Best Picture instead of Best Achievement in Picture etc. 
#note that not all oscars exist for every year, i.e. best black and white 

oscarArray = ['Best Picture', 'Best Actor in a Leading Role', 
'Best Actress in a Leading Role', 'Best Actor in a Supporting Role', 
'Best Actress in a Supporting Role', 'Best Actor in a Supporting Role', 
'Best Director', 'Best Assistant Director', 'Best Writing, Screenplay Written Directly for the Screen', 
'Best Writing, Screenplay Based on Material from Another Medium', 'Best Cinematography', 
'Best Art Direction-Set Decoration', 'Best Costume Design', 'Best Sound', 'Best Film Editing', 
'Best Sound Editing','Best Effects, Visual Effects', 'Best Makeup', 'Best Music, Original Song', 
'Best Music, Original Score', 'Best Short Film, Animated', 'Best Short Film, Live Action', 'Best Documentary, Short Subjects', 
'Best Documentary, Features', 'Best Foreign Language Film']
"""

# HELPER FUNCTIONS

from pattern.web import URL, DOM, plaintext, strip_between
from pattern.web import NODE, TEXT, COMMENT, ELEMENT, DOCUMENT

# get rid of accents etc.
def fixWeirdChars(phrase):
    HexEncode = ['E9','E1','E0','B7']
    matchObj = re.search(r'&#x..', phrase)
    if matchObj:
        accentedHex = matchObj.group()
        HexPart = accentedHex.split('x')[1]
        if(HexPart in HexEncode):
            phrase = re.sub((accentedHex+';'), 'e', phrase)
        else:
            Hexy = "0x" + HexPart
            Num = int(Hexy, 16)
            Char = chr(Num)
            phrase = re.sub((accentedHex+';'), Char, phrase)

        matchAgain = re.search(r'&#x..', phrase)
        return fixWeirdChars(phrase)
    return phrase


def clean_unicode(s):
	s = str(s)
	s.encode('ascii','ignore')
	s = fixWeirdChars(s)
 	return str(s)

# repair dictionary names
def repair_dict(dictionary):
	for year in dictionary:
		for award in dictionary[year]:
			if(award == 'Best Motion Picture of the Year'):
				dictionary[year]['Best Picture'] = dictionary[year]['Best Motion Picture of the Year']
				del dictionary[year]['Best Motion Picture of the Year']

			if(award == 'Best Performance by an Actor in a Leading Role'):
				dictionary[year]['Best Actor in a Leading Role'] = dictionary[year]['Best Performance by an Actor in a Leading Role']
				del dictionary[year]['Best Performance by an Actor in a Leading Role']

			if(award == 'Best Performance by an Actress in a Leading Role'):
				dictionary[year]['Best Actress in a Leading Role'] = dictionary[year]['Best Performance by an Actress in a Leading Role']
				del dictionary[year]['Best Performance by an Actress in a Leading Role']

			if(award == 'Best Performance by an Actress in a Supporting Role'):
				dictionary[year]['Best Actress in a Supporting Role'] = dictionary[year]['Best Performance by an Actress in a Supporting Role']
				del dictionary[year]['Best Performance by an Actress in a Supporting Role']

			if(award == 'Best Performance by an Actor in a Supporting Role'):
				dictionary[year]['Best Actor in a Supporting Role'] = dictionary[year]['Best Performance by an Actor in a Supporting Role']
				del dictionary[year]['Best Performance by an Actor in a Supporting Role']

			if(award == 'Best Writing, Original Screenplay'):
				dictionary[year]['Best Writing, Screenplay Written Directly for the Screen'] = dictionary[year]['Best Writing, Original Screenplay']
				del dictionary[year]['Best Writing, Original Screenplay']

			if(award == 'Best Writing, Adapted Screenplay'):
				dictionary[year]['Best Writing, Screenplay Based on Material from Another Medium'] = dictionary[year]['Best Writing, Adapted Screenplay']
				del dictionary[year]['Best Writing, Adapted Screenplay']

			if(award == 'Best Foreign Language Film of the Year'):
				newAwardName = award
				newAwardName = re.sub(' of the Year', '', newAwardName)
				dictionary[year][newAwardName] = dictionary[year][award]
				del dictionary[year][award]

			# check to see if it includes best achievement
			if('Best Achievement' in award):
				if(award == 'Best Achievement in Directing'):
					dictionary[year]['Best Director'] = dictionary[year]['Best Achievement in Directing']
					del dictionary[year]['Best Achievement in Directing']
				else:
					newAwardName = award
					if('Mixing' in award):
						newAwardName = re.sub(' Mixing', '', newAwardName)

					if(' and Hairstyling' in award):
						newAwardName = re.sub(' and Hairstyling', '', newAwardName)

					if('Music Written for Motion Pictures, Original Song' or 'Music Written for Motion Pictures, Original Score' in award):
						newAwardName = re.sub(' Written for Motion Pictures', '', newAwardName)


					newAwardName = re.sub(' Achievement in', '', newAwardName)

					if 'Visual Effects' in newAwardName:
						#add comma
						newAwardName = 'Best Effects, Visual Effects' 

					dictionary[year][newAwardName] = dictionary[year][award]
					del dictionary[year][award]
					
	return dictionary

# END OF HELPER FUNCTIONS


"""
Gets the movie titles for all Oscars for a given year. i.e. get_by_year(2012)
Function
--------
get_by_year

Input int year

Outputs a dict of the format:
'Award':
	'nominees': ['a', 'b', 'c', 'd'] 
	'winner': 'f'

"""
def get_by_year(year):

    url = URL("http://www.imdb.com/event/ev0000003/" + str(year))
    dom = DOM(url.download(cached=True))
    
    dictAll = {}
    
    awards = dom.by_class('award')
    awardTitles = awards[0].by_tag('h2')
    awardList = []
    for award in awardTitles:
        awardList.append(award.content)

    prize = awards[0].by_tag('blockquote')
    for index, title in enumerate(prize[1:25]):
        winner = title.by_tag('strong')[0].by_tag('a')[0].content
        winner_id = str(title.by_tag('strong')[0].by_tag('a')[0].attrs['href'][-8:-1])
        
        nomineeList = []
        for each in title.by_tag('strong')[1::]:
            name = each.by_tag('a')[0].content
            id = str(each.by_tag('a')[0].attrs['href'][-8:-1])
            nomineeList.append((clean_unicode(name),id))
            
        winnersAndNominees = {}
        winnersAndNominees['winner'] = (clean_unicode(winner),winner_id)
        winnersAndNominees['nominees'] = nomineeList
        dictAll[awardList[index]] =  winnersAndNominees
    return dictAll


"""
Gets the movie titles for all Oscars for all years between 1935 and 2013.
Function
--------
get_all_years()


Outputs a dict of dicts the format:
'Year':
	'Award':
		'nominees': ['a', 'b', 'c', 'd'] 
		'winner': 'f'
"""

def get_all_years():
	hugeDict = {}  #lol
	for year in range(2003, 2014):
		hugeDict[year] = get_by_year(str(year))
	return hugeDict


'''
Gets all data for every year for a specific oscar
Function
------
get_by_award_all_years(oscar, dictionary)

Outputs a dict of the format
'Year':
	'nominees': ['a', 'b', 'c', 'd'] 
	'winner': 'f'
        
'''
def get_by_award_all_years(oscar, dictionary):
	yearDict = {}
	for year in dictionary:
		if oscar in dictionary[year].keys():
			yearDict[year] = dictionary[year][oscar]
	return yearDict

# oscars = get_all_years();
# oscars = repair_dict(oscars)
# f = open("data/oscars.json","w+")
# json.dump(oscars,f)
# f.close()

# load an existing dictionary
f = open("data/oscars.json","rb")
oscars = json.load(f)
f.close()
print oscars

movie_awards = [u'Best Documentary, Features',u'Best Writing, Screenplay Written Directly for the Screen',
                u'Best Foreign Language Film',u'Best Short Film, Live Action',u'Best Music, Original Song',
                u'Best Music, Original Score',u'Best Art Direction-Set Decoration',u'Best Costume Design',
                u'Best Short Film, Animated',u'Best Makeup',u'Best Film Editing',u'Best Animated Feature',
                u'Best Writing, Screenplay Based on Material from Another Medium',u'Best Sound Editing',
                u'Best Cinematography',u'Best Documentary, Short Subjects',u'Best Sound',u'Best Director',
                u'Best Picture',u'Best Visual Effects']

# http://stackoverflow.com/questions/9708374/python-how-to-find-the-string-with-most-matches-in-a-list-of-strings
def find_correct_award(awards,keystr):
    keywords = re.split(' |-',keystr.lower())
    def matches(text):
        return sum(word in text.lower() for word in keywords)
    return max(awards,key=matches) 

for year in oscars:
    for award in movie_awards:
        if award not in oscars[year].keys():
            old = find_correct_award(oscars[year].keys(),award)
            oscars[year][award] = oscars[year].pop(old)

print oscars['2003']['Best Cinematography']

def get_top_movies_by_year(yr,num_movies=200):
    num_pages = int((num_movies-1)/100) + 1
    missing_rtid = 0
    missing_imdbid = 0
    alldata = []
    for pagenum in range(1,num_pages+1):
        url = 'http://boxofficemojo.com/yearly/chart/?page=' + str(pagenum) + '&view=releasedate&view2=domestic&yr=' + str(yr) + '&p=.htm'
        page_xml = requests.get(url).text
        dom = web.Element(page_xml)
        for table in dom.by_tag('table'):
            cellpadding = (table.attributes['cellpadding'])    
            if (cellpadding == '5'):  # isolates table with movie data
                for row in table.by_tag('tr'):
                    cols = row.by_tag('td')
                    #catches movie rows
                    try:
                        rank = int(web.strip_tags(cols[0]))
                        if (rank<=num_movies): #checks to make sure in right spot!
                            movie_title = web.strip_tags(cols[1]).strip('\t')
                            movie_title = unicodedata.normalize('NFKD', movie_title).encode('ascii', 'ignore') #removes accents
                            studio = web.strip_tags(cols[2]).strip('\t')
                            
                            # the below try/except mechanisms catch errors when there is a NONE instead of a string which is a candidate for integer casting
                            try:
                                total_gross = int(((web.strip_tags(cols[3])).strip('$')).replace(',', ''))
                            except:
                                total_gross = None
                            try:
                                num_theaters_total = int((web.strip_tags(cols[4])).replace(',', ''))
                            except:
                                num_theaters_total = None
                            try:
                                opening_revenue = int(((web.strip_tags(cols[5])).strip('$')).replace(',', ''))
                            except:
                                opening_revenue = None
                            try:   
                                num_theaters_opening = int((web.strip_tags(cols[6])).replace(',', ''))
                            except:
                                num_theaters_opening = None
                            
                            date_open = web.strip_tags(cols[7]).strip('\t')
                            date_close = web.strip_tags(cols[8]).strip('\t')
                            
                            #using rotten tomatoes to get rtid/imdbid
                            url2 = 'http://api.rottentomatoes.com/api/public/v1.0/movies.json?' 'q=' + movie_title + ' &page_limit=10' + '&apikey=' + api_key
                            info_data = requests.get(url2).text
                            info_json = json.loads(info_data)
                            movie_count = 0
                            rtid=''
                            imdbid=''
                            try:
                                for movie in info_json['movies']:
                                    if movie['year']==yr:
                                        movie_count+=1
                                        rtid=movie['id']
                                        try:
                                            imdbid=(movie['alternate_ids'])['imdb']
                                        except:
                                            imdbid=''
                            except:
                                pass
                            if movie_count!=1: #case where more than one movie met description
                                rtid = ''
                                imdbid = ''
                            
                            # second attempt to obtain imdb ids using the omdbapi
                            if imdbid=='':
                                try:
                                    omdb_link = "http://www.omdbapi.com/?t=%s&y=%s" % (movie_title,yr)
                                    omdb_text = web.URL(omdb_link).download(cached=True)
                                    imdbid = json.loads(omdb_text)['imdbID'].replace("tt","")
                                except:
                                    try:
                                        omdb_link = "http://www.omdbapi.com/?t=%s&y=%s" % (movie_title,(yr-1))
                                        omdb_text = web.URL(omdb_link).download(cached=True)
                                        imdbid = json.loads(omdb_text)['imdbID'].replace("tt","")
                                    except:
                                        try:
                                            omdb_link = "http://www.omdbapi.com/?t=%s&y=%s" % (movie_title,(yr-2))
                                            omdb_text = web.URL(omdb_link).download(cached=True)
                                            imdbid = json.loads(omdb_text)['imdbID'].replace("tt","")
                                        except:
                                            imdbid = ''
                                            missing_imdbid += 1
                            if rtid=='':
                                missing_rtid += 1
                            
                            data_row = [rank,yr,movie_title,imdbid,rtid,studio,total_gross,num_theaters_total,opening_revenue,num_theaters_opening,date_open,date_close]
                            alldata.append(data_row)
                    except ValueError:
                        pass
                    
    result = pd.DataFrame(alldata,columns=['rank','year','movie_title','imdbid','rtid','studio','total_gross','num_theaters_total','opening_revenue','num_theaters_opening','date_open','date_close'])
    if (missing_imdbid>0):
        print 'unable to find a imdbid for ' + str(missing_imdbid) + ' movies of ' + str(num_movies) + ' in ' + str(yr)
    if (missing_rtid>0):
        print 'unable to find a rtid for ' + str(missing_rtid) + ' movies of ' + str(num_movies) + ' in ' + str(yr)
    return result

# Saves all data
# alldata=get_top_movies_by_year(2003,200)
# for yr in range(2004,2013):
#     num_movies=200
#     result=get_top_movies_by_year(yr,num_movies)
#     alldata.append(result,ignore_index=True)
#     print yr
# filename='./data/alldata_' + str(num_movies) + '.csv'
# alldata.to_csv(filename, index=False)

# Saves just title and id data in a separate file
# narrowed_data=pd.DataFrame(alldata, columns=['year','rank','movie_title','imdbid','rtid'])
# filename='./data/iddata_' + str(num_movies) + '.csv'
# narrowed_data.to_csv(filename, index=False)

bomj_df = pd.read_csv('data/completedata.csv')

def test_for_nones(df,column_name):
    result = []
    col = df[column_name]
    for item in col:
        check_bool = False
        if (item != 'None'):
            if item!='NaN':
                check_bool = True
        result.append(check_bool)
    return result

# stores imdbid with 7 characters
bomj_df=bomj_df[test_for_nones(bomj_df,'imdbid')]
# removes rtid, which is calculated later
bomj_df=bomj_df[['rank','year','movie_title','imdbid','studio','total_gross','num_theaters_total','opening_revenue','num_theaters_opening','date_open','date_close']]

import numpy as np

# ensures proper formatting of all columns
def formatting_verification(column,fun):
    result = []
    for val in column:
        if fun=='int':
            if ((val == 'None')):
                result.append(None)
            else:
                print val
                result.append(int(val))

#         elif fun=='str':
#             if ((val is None)):
#                 result.append()
#             else:
#                 try:
#                     if math.isnan(val):
#                         result.append()
#                     else:
#                         result.append(str(val))
#                 except:
#                     result.append(str(val))
#     return result  


bomj_df['imdbid'] = bomj_df['imdbid'].map(lambda id: str(id).zfill(7))

# bomj_df['rank'] = formatting_verification(bomj_df['rank'].values,'int')
# bomj_df['year'] = formatting_verification(bomj_df['year'].values,'int')
# bomj_df['studio'] = formatting_verification(bomj_df['studio'].values,'str')
# bomj_df['total_gross'] = formatting_verification(bomj_df['total_gross'].values,'int')
#bomj_df['num_theaters_total'] = formatting_verification(bomj_df['num_theaters_total'].values,'int')
# bomj_df['opening_revenue'] = formatting_verification(bomj_df['opening_revenue'].values,'int')
# bomj_df['num_theaters_opening'] = formatting_verification(bomj_df['num_theaters_opening'].values,'int')
# bomj_df['date_open'] = pd.to_datetime(bomj_df['date_open'])
# bomj_df['date_close'] = pd.to_datetime(bomj_df['date_close'])
#bomj_df[['num_theaters_total']] = bomj_df[['num_theaters_total']].astype(int)
#bomj_df['num_theaters_total'] = formatting_verification(bomj_df['num_theaters_total'].values,'int')

imdb_as_index = bomj_df.set_index('imdbid')
imdb_as_index.head()
data_by_year = imdb_as_index.groupby('year').groups
performance = {}
total_nominees_present = 0
total_nominees_absent = 0
total_winners_present = 0
total_winners_absent = 0
for (yr,yr_list) in oscars.iteritems():
    yr = int(yr)
    if yr>2003:
        yr=yr-1
        movieids=data_by_year[yr]
        nominee_ids=[]
        winner_ids=[]
        
        nominees_present = 0
        nominees_absent = 0
        winners_present = 0
        winners_absent = 0
        
        for (award,award_list) in yr_list.iteritems():
            winner_ids.append(award_list['winner'][1])
            for nominee in award_list['nominees']:
                nominee_ids.append(nominee[1])
        for nid in nominee_ids:
            is_movie_saved = False
            for top200id in movieids:
                try:
                    corrected_id=str(int(top200id)).zfill(7)
                    if corrected_id==nid:
                        is_movie_saved = True
                except:
                    pass
            if is_movie_saved:
                nominees_present += 1
            else:
                nominees_absent += 1
        for wid in winner_ids:
            is_movie_saved = False
            for top200id in movieids:
                try:
                    corrected_id=str(int(top200id)).zfill(7)
                    if corrected_id==wid:
                        is_movie_saved = True
                except:
                    pass
            if is_movie_saved:
                winners_present += 1
            else:
                winners_absent += 1    
        total_nominees_present += nominees_present
        total_nominees_absent += nominees_absent
        total_winners_present += winners_present
        total_winners_absent += winners_absent
        performance[yr]=[winners_present,winners_absent,nominees_present,nominees_absent]
        #print str(yr) + ' nominee performance: ' + str(float(nominees_present)/(nominees_present+nominees_absent))
        #print str(yr) + ' winner performance: ' + str(float(winners_present)/(winners_present+winners_absent))
        #print

print 'total nominee performance: ' + str(float(total_nominees_present)/(total_nominees_present+total_nominees_absent))
print 'total winner performance: ' + str(float(total_winners_present)/(total_winners_present+total_winners_absent))

print "For Example:"
pd.DataFrame({"critic":[''],"norm_score":[''],"quote":[''],"id":[''],"title":[''],"source":[''],"overall_score":[''],"year":[''],"date":['']}).head()

# write the functions to scrape the data

'''
Get Individual Review From Rotten Tomatoes

'''

def get_rtomatoes_movie_reviews(imdb):
	#api_key = 't97y5fkhfbfu9wjhse7v8ayy' 
	#api_key = '8kr5nqyrshjyp4y69w8rzzgu'
	api_key = 'sdat77wpkdnuyvb9bqhz2y5v'
	if(len(imdb)<7):
		extra = 7 - len(imdb)
		prefix = ""
		for i in range(0, extra):
			prefix = prefix + '0'
			imdb = prefix + imdb

	url = 'http://api.rottentomatoes.com/api/public/v1.0/movie_alias.json' 
	options = {'type': 'imdb','id': imdb, 'apikey': api_key}
	rotIds = requests.get(url, params=options).text
	rotIds = json.loads(rotIds)

	if('"error":"Could not find a movie with the specified id"' in rotIds):
		return None
	if('id' not in rotIds):
		return None

	title = rotIds['title']
	year = rotIds['year']
	overall_score = rotIds['ratings']['critics_score']
	rotId = rotIds['id']
	
	url2 = 'http://api.rottentomatoes.com/api/public/v1.0/movies/%s/reviews.json' % rotId
	options2 = {'review_type': 'top_critic', 'page_limit': 50, 'page': 1, 'apikey': api_key}
	data = requests.get(url2, params=options2).text
	data = json.loads(data)  # load a json string into a collection of lists and dicts
	if('total' not in data or 'reviews' not in data):
		return None
	if str(data['total']) == '0':
		return None

	ids = []
	titles = []
	years = []
	overall_scores = []
	
	revs = data['reviews'] 
	critics = []
	quotes = []
	pubs = []
	freshness = []
	dates = []

	for each in revs:
		ids.append(imdb)
		titles.append(title)
		years.append(year)
		overall_scores.append(float(overall_score)/100.0)

		critics.append(each['critic'])
		quotes.append(each['quote'])
		pubs.append(each['publication'])
		if(each['freshness'] == 'fresh'):
			freshness.append(1)
		else:
			freshness.append(0)
		dates.append(each['date']) 
	
	d = {'critic' : critics, 'norm_score' : freshness, 'quote' : quotes, 'id' : ids, 'title' : titles, 'source' : pubs, 'overall_score' : overall_scores, 'date' : dates, 'year': years} 
	return pd.DataFrame(d)

def get_all_rtomatoes_movies(movieList):
	all_imdbs = alldata['imdbid']
	imdbs = []
	for each in all_imdbs:
		if str(each) != 'None':
			imdbs.append(each)
	#print imdbs
	#for each in imdbs:

	newDF = pd.DataFrame()
	for each in imdbs:
		individualDF = get_rtomatoes_movie_reviews(each)
		if(individualDF):
			newDF = newDF.append(individualDF, ignore_index=True)


	filename='./data/rotten_tomatoes_data.csv'
	newDF.to_csv(filename, index=False)
	return newDF

# actually call the functions and save in a file

#alldata = pd.read_csv('./revised_id_data.csv')     
#rot_tom_data = get_all_rtomatoes_movies(alldata)

#once we have the stored csv file not necessary to make the above calls, and can just do:
rotten_df = pd.read_csv('data/rotten_tomatoes_data.csv')

def filter_tomatoes(dataF):
    for ind, each in enumerate(dataF['quote']):
        if(str(each) == 'nan'):
            dataF['quote'][ind] = 'NoReview'
    dataF = dataF[dataF.quote != 'NoReview']
    return dataF

rotten_df = filter_tomatoes(rotten_df)
rotten_df.head()


# write the functions to scrape the data

# actually call the functions and save in a file

"""
Function
--------
get_imdb_movie_reviews:
    retrieves the movie reviews for a given movie on imdb
Parameters
----------
id:
    imdb movie ID
title:
    title of the movie (why scrape it when we already have it?)
year:
    year that the movie was made
Returns
-------
returns a dataframe containing all of the movie revies of the form
listed above
"""
def get_imdb_movie_reviews(id,title,year):
    score_max = 10.0
    link = "http://www.imdb.com/title/tt%0.7d/" % id
    url = web.URL(link)
    dom = web.DOM(url.download(cached=True))
    overall = float(dom.by_class("titlePageSprite star-box-giga-star")[0].content.strip()) / score_max
    # try to get year directly from page; this isn't present in every entry
    try:
        year = dom('span.itemprop[itemprop=name]')[0].next.next.by_tag('a')[0].content
        year = int(year)
    except:
        pass
    rc = dom.by_attr(itemprop="reviewCount")[0].content.split(" ")[0].replace(",","")
    revlink = link + 'reviews?count=%s&start=0' % rc # get at most 20 reviews
    url = web.URL(revlink)
    dom = web.DOM(url.download(cached=True))
    parser = HTMLParser()
    lst = []
    hrs = dom.by_id('tn15main').by_tag('hr')
    for hr in hrs:
        div = hr.next.next
        try:
            score = float(div.by_tag("img")[1].attrs["alt"].split("/")[0]) / score_max
            date = div.by_tag("small")[2].content
        except:
            continue
        user = div.by_tag("a")[1].content
        p = div.next.next
        review = parser.unescape(p.content.replace("<br />","\n"))
        lst.append(dict(critic=user,norm_score=score,quote=review,
                        id=id,title=title,source="IMDB",overall_score=overall,year=year,date=date))
    return lst

def get_all_imdb_movies():
    fname = 'revised_id_data.csv'
    df = []
    result = pd.read_csv('data/%s' % fname)
    f = open('errors.txt','w+')
    year = 2003
    for index,row in result.iterrows():
        try: 
            row['imdbid'] = int(row['imdbid'])
        except:
            continue
        newyr = row['year']
        if(int(newyr) != year and df != []):
            # filter_quotes(df)
            fo = open('imdbrev%d.json' % year,'w+')
            json.dump(df,fo)
            fo.close()
            year = int(newyr)
            df = []
            gc.collect()
        try:
            df.append(get_imdb_movie_reviews(row['imdbid'],row['movie_title'],row['year']))
            print row['movie_title'], year
        except:
            print >> f, "ERROR: %s" % row['movie_title']
    f.close()
    if df:
        fo = open('imdbrev%d.json' % year,'w+')
        json.dump(df,fo)
        fo.close()

# create the json files for all of the imdb reviews
# get_all_imdb_movies()

def filter_quotes(df):
    for index,row in df.iterrows():
        text = row['quote']
        text = re.sub(r"\\u[0-9a-zAZ]{4}", "", text)
        text = re.sub("\\\[^\'|\"]", "", text)
        text = text.replace('\n','')
        text = unicodedata.normalize('NFKC',unicode(text)).encode('ascii','ignore')
        df.ix[index, 'quote'] = text

"""
Here we construct the dataframe for all of the imdb reviews.
We then filter it, and save it to a csv.
"""
"""
imdb_f_list = ['imdbrev2003.json', 'imdbrev2004.json', 'imdbrev2005.json', 
               'imdbrev2006.json', 'imdbrev2007.json', 'imdbrev2008.json', 
               'imdbrev2009.json', 'imdbrev2010.json', 'imdbrev2011.json', 'imdbrev2012.json'] 


imdb_df = pd.DataFrame({"critic":[],"norm_score":[],"quote":[],"id":[],
               "title":[],"source":[],"overall_score":[],"year":[],"date":[]})

for f in imdb_f_list:
    fp = open('data/' + f,'rb')
    j = json.load(fp)
    fp.close()
    for m in j:
        if m != []:
            imdb_df = imdb_df.append(m)
        gc.collect()
        
filter_quotes(imdb_df)
imdb_df.to_csv('data/imdb_reviews.csv',index=False)
"""
pass

# read saved dataframe
imdb_df = pd.read_csv('data/imdb_reviews.csv')
imdb_df.head()

# now it's time to add the data
"""
Function
--------
add_oscar_data:
    Adds data for being an oscar winner or nominee in place in the passed dataframe

Parameters
----------
    df: Dataframe
    award_name: Name of the oscar award
        If None, then creates general columns for winning any or being nominated in any
"""

movie_awards = [u'Best Documentary, Features',u'Best Writing, Screenplay Written Directly for the Screen',
                u'Best Foreign Language Film',u'Best Short Film, Live Action',u'Best Music, Original Song',
                u'Best Music, Original Score',u'Best Art Direction-Set Decoration',u'Best Costume Design',
                u'Best Short Film, Animated',u'Best Makeup',u'Best Film Editing',u'Best Animated Feature',
                u'Best Writing, Screenplay Based on Material from Another Medium',u'Best Sound Editing',
                u'Best Cinematography',u'Best Documentary, Short Subjects',u'Best Sound',u'Best Director',
                u'Best Picture',u'Best Visual Effects']

def add_oscar_data(df,imdbid_title='id',award_name=None):
    awards = movie_awards
    if award_name:
        awards = [award_name]
        win = award_name.replace(' ','_') + '_winner'
        nom = award_name.replace(' ','_') + '_nom'
    else:
        win = 'oscar_winner'
        nom = 'oscar_nom'
    df[win] = 0
    df[nom] = 0
    for year in oscars:
        for award in movie_awards:
            try:
                df[win][df[imdbid_title] == oscars[year][award]['winner'][1]] = 1
                df[nom][df[imdbid_title] == oscars[year][award]['winner'][1]] = 1
                for n in oscars[year][award]['nominees']:
                    df[nom][df[imdbid_title] == n[1]] = 1
            # skip awards that don't exist for a year
            except:
                pass

add_oscar_data(bomj_df,'imdbid')
bomj_df.head()

def plot_str_hist(df,column_name,xlabel='',ylabel='',title='',xticksize=7):
    total_movies = float(len(df))
    data_pts=[]
    counts=[]
    data=[]
    tickloc=0.5
    barloc=0
    barlocs=[]
    ticklocs=[]
    grouped_data=(df.groupby(column_name)).groups
    for key,val in grouped_data.items():
        data.append((key,len(val)))
        ticklocs.append(tickloc)
        barlocs.append(barloc)
        tickloc+=1
        barloc+=1
    data.sort(key=lambda x:x[1])
    for key,val in data:
        data_pts.append(key)
        counts.append(val/total_movies)
    plt.bar(barlocs, height=counts, width=1,color='r',edgecolor="white")
    plt.xticks(rotation=90,size=xticksize)
    plt.xticks(ticklocs,data_pts)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    histogram_style()
    plt.show()

plot_str_hist(bomj_df,'studio',xlabel='Studio',ylabel='Frequency',title='Studio v. Count for All Movies')
plot_str_hist(bomj_df[bomj_df.oscar_winner == 1],'studio',xlabel='Studio',ylabel='Frequency',title='Studio v. Count for Winners',xticksize=10)
plot_str_hist(bomj_df[bomj_df.oscar_nom == 1],'studio',xlabel='Studio',ylabel='Frequency',title='Studio v. Count for Nominees',xticksize=10)

def plot_quant_data(bomj_df,varname,cr):
    
    data =  (bomj_df)[varname].values
    cleaned_data = []
    for pt in data:
        try:
            cleaned_data.append(int(pt))
        except:
            pass
    plt.hist(cleaned_data,20,color=cr,normed=True)
    plt.xlabel(varname)
    plt.ylabel('Frequency')
    plt.title(varname + ' v. count for all movies')
    histogram_style()
    plt.show()
    
    win_data = (bomj_df[bomj_df.oscar_winner == 1])[varname].values
    cleaned_win_data = []
    for pt in win_data:
        try:
            cleaned_win_data.append(int(pt))
        except:
            pass   
    plt.hist(cleaned_win_data,20,color=cr,normed=True)
    plt.xlabel(varname)
    plt.ylabel('Frequency')
    plt.title(varname + ' v. count for winners')
    histogram_style()
    plt.show()

    nom_data = (bomj_df[bomj_df.oscar_nom == 1])[varname].values
    cleaned_nom_data = []
    for pt in nom_data:
        try:
            cleaned_nom_data.append(int(pt))
        except:
            pass   
    plt.hist(cleaned_nom_data,20,color=cr,normed=True)
    plt.xlabel(varname)
    plt.ylabel('Frequency')
    plt.title(varname + ' v. count for nominees')
    histogram_style()
    plt.show()
plot_quant_data(bomj_df,'num_theaters_total','b')
plot_quant_data(bomj_df,'opening_revenue','b')
plot_quant_data(bomj_df,'num_theaters_opening','b')

print "Number of reviews: %i" % len(imdb_df)
print "Number of critics: %i" % imdb_df.critic.unique().size
print "Number of movies:  %i" % imdb_df.title.unique().size

imdb_df.groupby('critic').title.count().hist(log=True, bins=range(20), edgecolor='white')
plt.xlabel("Number of reviews per critic")
plt.ylabel("N")
histogram_style()

plt.plot(imdb_df.groupby('year').critic.count().index,imdb_df.groupby('year').critic.count())
plt.xlabel('Year')
plt.ylabel('Number of Reviews')
plt.show()

imdb_df[imdb_df['year'] < 2003].groupby('year').title.unique()

# let's remove the entries for years < 2003
imdb_df = imdb_df[imdb_df['year'] >= 2003]

# Finally, let's try plotting the same graph as above
plt.plot(imdb_df.groupby('year').critic.count().index,imdb_df.groupby('year').critic.count())
plt.xlabel('Year')
plt.xticks(range(2002,2013))
plt.ylabel('Number of Reviews')
plt.show()

print "average movie rating: %0.2f / 10" % (imdb_df.groupby('title')['overall_score'].unique().mean() * 10)
print "average user critic rating: %0.2f / 10" % (imdb_df.groupby('critic')['norm_score'].mean().mean() * 10) 

#Your code here
imdb_df.groupby('critic').norm_score.mean().hist(bins=10, edgecolor='w', lw=1)
plt.xlabel("Average rating per critic")
plt.ylabel("N")
histogram_style()

# first let's convert the date column to an actual date object
imdb_df['date'] = pd.to_datetime(imdb_df['date'])
def less_than_date(row):
    return row['date'] < pd.to_datetime('January 1 ' + str(row['year'] + 1))
imdb_df = imdb_df[imdb_df.apply(less_than_date,axis=1)]

print "Number of reviews: %i" % len(imdb_df)
print "Number of critics: %i" % imdb_df.critic.unique().size
print "Number of movies:  %i" % imdb_df.title.unique().size

print "Number of reviews: %i" % len(rotten_df)
print "Number of critics: %i" % rotten_df.critic.unique().size
print "Number of movies:  %i" % rotten_df.title.unique().size

rotten_df.groupby('critic').title.count().hist(log=True, bins=range(20), edgecolor='white')
plt.xlabel("Number of reviews per critic")
plt.ylabel("N")
histogram_style()

plt.plot(rotten_df.groupby('year').critic.count().index,rotten_df.groupby('year').critic.count())
plt.xlabel('Year')
plt.ylabel('Number of Reviews')
plt.show()

print "average movie rating: %0.2f / 10" % (rotten_df.groupby('title')['overall_score'].unique().mean() * 10)
print "average user critic rating: %0.2f / 10" % (rotten_df.groupby('critic')['norm_score'].mean().mean() * 10) 

#Your code here
rotten_df.groupby('critic').norm_score.mean().hist(bins=10, edgecolor='w', lw=1)
plt.xlabel("Average rating per critic")
plt.ylabel("N")
histogram_style()

#def less_than_date(row):
#   return row['date'] < pd.to_datetime('January 1 ' + str(row['year'] + 1))

rotten_df['date'] = pd.to_datetime(rotten_df['date'])
rotten_df = rotten_df[rotten_df.apply(less_than_date,axis=1)]
rotten_df.head()

print "Number of reviews: %i" % len(rotten_df)
print "Number of critics: %i" % rotten_df.critic.unique().size
print "Number of movies:  %i" % rotten_df.title.unique().size

imdb_df['id'] = imdb_df['id'].map(lambda id: str(id).zfill(7))

add_oscar_data(imdb_df)
imdb_df.head()

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

"""
Function
--------
make_xyz

Build a bag-of-words training set for the review data

Parameters
-----------
df : Pandas DataFrame
    The review for a dataset
    
vectorizer : CountVectorizer object (optional)
    A CountVectorizer object to use. If None,
    then create and fit a new CountVectorizer.
    Otherwise, re-fit the provided CountVectorizer
    using the critics data
    
Returns
-------
X : numpy array (dims: nreview, nwords)
    Bag-of-words representation for each review.
Y : numpy array (dims: nreview)
    1/0 array. 1 = won an oscar, 0 = didn't win an oscar
Z : numpy array (dims: nreview)
    1/0 array. 1 = nominated, 0 = not nominated

Examples
--------
X, Y, Z = make_xyz(imdb_id)
"""
def make_xyz(df, vectorizer=None): 
    if vectorizer is None:
        vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(df.quote)
    X = X.tocsc()  # some versions of sklearn return COO format
    gc.collect()
    return X, df['oscar_winner'].astype(np.int), df['oscar_nom'].astype(np.int)

from random import sample

# group imdb_df by movie
grp_imdb = imdb_df.groupby('id')
# reset imdb_df for appending new info
imdb_df = pd.DataFrame()

# iterate through groups, randomly remove reviews, append to imdb_df
for name,group in grp_imdb:
    rows = sample(group.index,len(group.index) / 10)
    group = group.ix[rows]
    imdb_df = imdb_df.append(group)

# finally construct X,Y,Z
X, Y, Z = make_xyz(imdb_df)

rotten_df['id'] = rotten_df['id'].map(lambda id: str(id).zfill(7))
add_oscar_data(rotten_df)
rotten_df.head()

rottenX, rottenY, rottenZ = make_xyz(rotten_df)

"""
Function
--------
split_test_train_years:
    randomly splits a movie dataframe by years into training and testing groups

Parameters
----------
df:
    Dataframe to be split
yr_lst:
    a list of the years on which to split

Returns
-------
test_yrs, train_years:
    A tuple of the split dataframe
"""

def split_test_train_years(df,yr_lst):
    test_yrs = random.sample(yr_lst, len(yr_lst)/2)
    train_yrs = list(set(yr_lst) - set(test_yrs))
    print 'Test years: ' + str(test_yrs)
    print 'Train years: ' + str(train_yrs)
    test_mask = [x in test_yrs for x in df.year]
    train_mask = [x in train_yrs for x in df.year]
    return df[test_mask], df[train_mask]

bomj_testdf,bomj_traindf = split_test_train_years(bomj_df,range(2003,2013))

bomj_testdf.head()

from sklearn.naive_bayes import MultinomialNB

def none_checker_composite(df,list_of_cols):
    num_cols = len(list_of_cols)
    newdf=df[test_for_nones(df,list_of_cols[1])]
    if num_cols>1:
        for i in range(1,num_cols):
            newdf=newdf[test_for_nones(newdf,list_of_cols[i])]
    return newdf

def isolate_data(df,cols_to_include,classifier_column='oscar_winner'):
    X = df[cols_to_include].values
    Y = df[classifier_column].values  
    X_new=[]
    Y_new=[]
    for i in range(0,len(Y)):
        try:
            X_new.append((int(X[i,0]),int(X[i,1]),int(X[i,2])))
            Y_new.append(int(Y[i]))
        except:
            pass
    return X_new, Y_new

bomj_testdf=none_checker_composite(bomj_testdf,['total_gross','num_theaters_total','opening_revenue','num_theaters_opening'])
bomj_traindf=none_checker_composite(bomj_traindf,['total_gross','num_theaters_total','opening_revenue','num_theaters_opening'])
(X_test, Y_test) = isolate_data(bomj_testdf,['total_gross','num_theaters_total','opening_revenue','num_theaters_opening'],'oscar_winner')
(X_train, Y_train) = isolate_data(bomj_traindf,['total_gross','num_theaters_total','opening_revenue','num_theaters_opening'],'oscar_winner')


print "Accuracy on train data: %0.2f%%" % (100 * clf.score(X_train, Y_train))
print "Accuracy on test data: %0.2f%%" % (100 * clf.score(X_test, Y_test))

#Your code here
xtrain, xtest, ytrain, ytest = train_test_split(X, Y)
clf = MultinomialNB().fit(xtrain, ytrain)

print "Accuracy on train data: %0.2f%%" % (100 * clf.score(xtest, ytest))

# Your code here. Print the accuracy on the test and training dataset
training_accuracy = clf.score(xtrain, ytrain)
test_accuracy = clf.score(xtest, ytest)

print "Accuracy on training data: %0.2f" % (training_accuracy)
print "Accuracy on test data:     %0.2f" % (test_accuracy)

"""
Function
--------
calibration_plot

Builds a plot like the one above, from a classifier and review data

Inputs
-------
clf : Classifier object
    A MultinomialNB classifier
X : (Nexample, Nfeature) array
    The bag-of-words data
Y : (Nexample) integer array
    1 if a review is Fresh
"""    
#your code here

def calibration_plot(clf, xtest, ytest):
    prob = clf.predict_proba(xtest)[:, 1]
    outcome = ytest
    data = pd.DataFrame(dict(prob=prob, outcome=outcome))

    #group outcomes into bins of similar probability
    bins = np.linspace(0, 1, 20)
    cuts = pd.cut(prob, bins)
    binwidth = bins[1] - bins[0]
    
    #freshness ratio and number of examples in each bin
    cal = data.groupby(cuts).outcome.agg(['mean', 'count'])
    cal['pmid'] = (bins[:-1] + bins[1:]) / 2
    cal['sig'] = np.sqrt(cal.pmid * (1 - cal.pmid) / cal['count'])
        
    #the calibration plot
    ax = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
    p = plt.errorbar(cal.pmid, cal['mean'], cal['sig'])
    plt.plot(cal.pmid, cal.pmid, linestyle='--', lw=1, color='k')
    plt.ylabel("Empirical P(Fresh)")
    remove_border(ax)
    
    #the distribution of P(fresh)
    ax = plt.subplot2grid((3, 1), (2, 0), sharex=ax)
    
    plt.bar(left=cal.pmid - binwidth / 2, height=cal['count'],
            width=.95 * (bins[1] - bins[0]),
            fc=p[0].get_color())
    
    plt.xlabel("Predicted P(Fresh)")
    remove_border()
    plt.ylabel("Number")

calibration_plot(clf, xtest, ytest)

"""
Function
--------
log_likelihood

Compute the log likelihood of a dataset according to 
a bayesian classifier. 
The Log Likelihood is defined by

L = Sum_fresh(logP(fresh)) + Sum_rotten(logP(rotten))

Where Sum_fresh indicates a sum over all fresh reviews, 
and Sum_rotten indicates a sum over rotten reviews
    
Parameters
----------
clf : Bayesian classifier
x : (nexample, nfeature) array
    The input data
y : (nexample) integer array
    Whether each review is Fresh
"""
#your code here

def log_likelihood(clf, x, y):
    prob = clf.predict_log_proba(x)
    rotten = y == 0
    fresh = ~rotten
    return prob[rotten, 0].sum() + prob[fresh, 1].sum()

from sklearn.cross_validation import KFold

def cv_score(clf, x, y, score_func):
    """
    Uses 5-fold cross validation to estimate a score of a classifier
    
    Inputs
    ------
    clf : Classifier object
    x : Input feature vector
    y : Input class labels
    score_func : Function like log_likelihood, that takes (clf, x, y) as input,
                 and returns a score
                 
    Returns
    -------
    The average score obtained by randomly splitting (x, y) into training and 
    test sets, fitting on the training set, and evaluating score_func on the test set
    
    Examples
    cv_score(clf, x, y, log_likelihood)
    """
    result = 0
    nfold = 5
    for train, test in KFold(y.size, nfold): # split data into train/test groups, 5 times
        clf.fit(x[train], y[train]) # fit
        result += score_func(clf, x[test], y[test]) # evaluate score function on held-out data
    return result / nfold # average

#the grid of parameters to search over
alphas = [0, .1, 1, 5, 10, 50]
min_dfs = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]

#Find the best value for alpha and min_df, and the best classifier
best_alpha = None
best_min_df = None
max_loglike = -np.inf

for alpha in alphas:
    for min_df in min_dfs:         
        vectorizer = CountVectorizer(min_df = min_df)       
        X, Y = make_xy(critics, vectorizer)
        
        #your code here
        clf = MultinomialNB(alpha=alpha)
        loglike = cv_score(clf, X, Y, log_likelihood)

        if loglike > max_loglike:
            max_loglike = loglike
            best_alpha, best_min_df = alpha, min_df

print "alpha: %f" % best_alpha
print "min_df: %f" % best_min_df

#Your code here

vectorizer = CountVectorizer(min_df=best_min_df)
X, Y = make_xy(critics, vectorizer)
xtrain, xtest, ytrain, ytest = train_test_split(X, Y)

clf = MultinomialNB(alpha=best_alpha).fit(xtrain, ytrain)

calibration_plot(clf, xtest, ytest)

# Your code here. Print the accuracy on the test and training dataset
training_accuracy = clf.score(xtrain, ytrain)
test_accuracy = clf.score(xtest, ytest)

print "Accuracy on training data: %0.2f" % (training_accuracy)
print "Accuracy on test data:     %0.2f" % (test_accuracy)

## Your code here

words = np.array(vectorizer.get_feature_names())

x = np.eye(xtest.shape[1])
probs = clf.predict_log_proba(x)[:, 0]
ind = np.argsort(probs)

good_words = words[ind[:10]]
bad_words = words[ind[-10:]]

good_prob = probs[ind[:10]]
bad_prob = probs[ind[-10:]]

print "Good words\t     P(fresh | word)"
for w, p in zip(good_words, good_prob):
    print "%20s" % w, "%0.2f" % (1 - np.exp(p))
    
print "Bad words\t     P(fresh | word)"
for w, p in zip(bad_words, bad_prob):
    print "%20s" % w, "%0.2f" % (1 - np.exp(p))

#Your code here
x, y = make_xy(critics, vectorizer)

prob = clf.predict_proba(x)[:, 0]
predict = clf.predict(x)

bad_rotten = np.argsort(prob[y == 0])[:5]
bad_fresh = np.argsort(prob[y == 1])[-5:]

print "Mis-predicted Rotten quotes"
print '---------------------------'
for row in bad_rotten:
    print critics[y == 0].quote.irow(row)
    print

print "Mis-predicted Fresh quotes"
print '--------------------------'
for row in bad_fresh:
    print critics[y == 1].quote.irow(row)
    print

clf.predict_proba(vectorizer.transform(['This movie is not remarkable, touching, or superb in any way']))