import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Notebook-only setup: render figures inline when running under IPython/Jupyter.
# Plain Python has no get_ipython(), so the magic is skipped there.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

# Dump the data into a frame and label it so that if we come back to it later
# we'll know what's what.
# The file has no header row, so we provide the column names ourselves.
# It's also clear that the fields are tab-separated.
names = ['imdbID', 'Title', 'year', 'score', 'votes', 'runtime', 'genres']
data = pd.read_csv('imdb_top_10000.txt', delimiter='\t', names=names).dropna()
print("Number of rows: %i. Number of columns: %i" % (data.shape[0], data.shape[1]))

data.head()  # show the first 5 rows (displayed when run as a notebook cell)

# Fix the title: drop the last 7 characters of every title.
# NOTE(review): presumably this strips a trailing " (YYYY)" suffix -- confirm
# against the data file.
data['Title'] = data['Title'].str[:-7]
data.year

# Fix the runtime:
# split each runtime string at the first space and keep the leading token as a
# float (e.g. "142 mins." -> 142.0).
data['runtime'] = data['runtime'].str.split(' ').str[0].astype(float)

# Splitting up the genres.
# Split every '|'-separated genre string once and reuse the result below,
# instead of re-splitting each row for every genre.
genre_lists = data['genres'].str.split('|')

# Determine the unique genres, in sorted order.
genresset = sorted(set().union(*genre_lists))

# Make a boolean column for each genre: True when the movie has that genre.
for genre in genresset:
    data[genre] = [genre in movie_genres for movie_genres in genre_lists]

data.iloc[1:10, 4:23]

# Check the global summary statistics to see if they make sense:
data[['score', 'runtime', 'year', 'votes']].describe()

# Remove any zero runtimes.
zero_runtime = data['runtime'] == 0
print(zero_runtime.sum())

# Probably best to flag those bad data as NaN. Use a single .loc assignment:
# the chained form `data.runtime[mask] = ...` may write to a temporary copy and
# leave `data` unchanged.
data.loc[zero_runtime, 'runtime'] = np.nan

data[['score', 'runtime', 'year', 'votes']].describe()

# Exercise: do a histogram of all variables together.

# More movies in recent years, but not *very* recent movies
# (they haven't had time to receive lots of votes yet?)
# Histogram of release years, one bin per year.
fig, ax1 = plt.subplots(1, 1)
ax1.hist(data.year, bins=np.arange(1950, 2013), color='#eeeeee')
ax1.set_title('Movie Releases by Year')
ax1.set_xlabel('Release Year')

# Update the matplotlib configuration parameters:
# plt.rcParams.update({ 'font.family': 'monospace'})

# Histogram of IMDB ratings.
fig, ax1 = plt.subplots(1, 1)
ax1.hist(data.score, bins=20, color="#eeeeee")
ax1.set_title('Movie Rating Frequency')
ax1.set_xlabel("IMDB rating")

# Histogram of run times. Open a fresh figure first: without it, a
# straight-through run draws this histogram on top of the rating histogram
# above and overwrites its title.
plt.figure()
plt.hist(data.runtime.dropna(), bins=50, color='#cccccc')
plt.xlabel("Movie Run Time")
plt.ylabel("Number of Movies")
plt.title('Number of Movies per Run Times')

# Hmm, more bad, recent movies. Real, or a selection bias?
plt.figure(figsize=(10, 5))
plt.scatter(data.year, data.score, lw=0, alpha=.05)
plt.xlabel("Year")
plt.ylabel("Score")
plt.title("Rating over time", fontsize=15)

# Votes against rating, with the vote counts on a log axis.
plt.figure(figsize=(10, 5))
plt.scatter(data.votes, data.score, lw=0, alpha=.1)
plt.xlabel("Number of Votes")
plt.ylabel("IMDB Rating")
plt.title("Votes vs. Ratings", fontsize='x-large')
plt.xscale('log')

### Who are those outliers?
# The bare expressions below are displayed when run as notebook cells.
summary_columns = ['Title', 'year', 'score', 'votes', 'genres']

# Which movies have lots of votes but a low score?
# More than 70000 votes, score below 4.
data[(data.votes > 7e4) & (data.score < 4)][summary_columns]

# The lowest rated movies.
data[data.score == data.score.min()][summary_columns]

# Well-rated recent movies: score above 7 and released after 2010.
data[(data.score > 7) & (data.year > 2010)][summary_columns]

# etc.
# temp
type(data.year)

# By Decade
# Make a separate table with the decades.
# Floor division is required here: with true division (Python 3, or a float
# year column) `(year / 10) * 10` just gives the year back.
decade = (data.year // 10) * 10

# Work on an explicit copy so that adding a column does not write into a slice
# of `data`.
tyd = data[['Title', 'year']].copy()
tyd['decade'] = decade
tyd.head()

# We can group the score by decade:
scores_by_decade = data.groupby(decade).score
scores_by_decade.head(10)

# Mean score for all movies in each decade.
decade_mean = scores_by_decade.mean()
print(decade_mean)

# Decade averages on their own. A new figure is opened so the line does not
# land on whatever figure happened to be current.
plt.figure()
plt.plot(decade_mean.index, decade_mean.values, 'o-',
         color='r', lw=1, label='Decade Average')
#plt.scatter(data.year, data.score, alpha=.04, lw=0)
plt.xlabel("Year")
plt.ylabel("Score")
plt.title("Average Score per Decade")
#plt.legend(frameon=False)  # add a legend

# Decade averages drawn over the individual movie scores.
plt.figure()
plt.plot(decade_mean.index, decade_mean.values, 'o-',
         color='r', lw=1, label='Decade Average')
plt.scatter(data.year, data.score, alpha=.04, lw=0)
plt.xlabel("Year")
plt.ylabel("Score")
plt.title("Movie Scores over Time")
plt.legend(frameon=False)  # add a legend

len(genresset)

# Compare across genres.
# Each comparison uses a 4x6 grid of axes; axes.ravel() flattens the grid so
# zip() can pair one axis with one genre (zip stops at the shorter of the two).
# `density=True` replaces the `normed=True` keyword, which newer matplotlib
# releases no longer accept; each histogram is scaled to unit area.

# Release year per genre.
fig, axes = plt.subplots(nrows=4, ncols=6, figsize=(12, 8), tight_layout=True)
bins = np.arange(1950, 2013, 3)
for ax, genre in zip(axes.ravel(), genresset):
    # Movies of this genre by year, in red.
    ax.hist(data[data[genre]].year, bins=bins, histtype='stepfilled',
            density=True, color='r', alpha=.2, ec='none')
    # All movies by year -- the identical grey background in every panel.
    ax.hist(data.year, bins=bins, histtype='stepfilled', ec='none',
            density=True, zorder=0, color='#cccccc')
    ax.set_xticks(np.arange(1950, 2013, 30))
    ax.set_yticks([])
    ax.annotate(genre, xy=(1955, 3e-2), fontsize=14)
    ax.set_xlabel('Year')

# Runtime per genre.
fig, axes = plt.subplots(nrows=4, ncols=6, figsize=(12, 8), tight_layout=True)
bins = np.arange(30, 240, 10)
for ax, genre in zip(axes.ravel(), genresset):
    ax.hist(data[data[genre]].runtime, bins=bins, histtype='stepfilled',
            color='r', ec='none', alpha=.3, density=True)
    ax.hist(data.runtime, bins=bins, density=True, histtype='stepfilled',
            ec='none', color='#cccccc', zorder=0)
    ax.set_xticks(np.arange(30, 240, 60))
    ax.set_yticks([])
    ax.set_xlabel("Runtime [min]")
    #remove_border(ax, left=False)
    ax.annotate(genre, xy=(230, .02), ha='right', fontsize=12)

# Score per genre.
fig, axes = plt.subplots(nrows=4, ncols=6, figsize=(12, 8), tight_layout=True)
bins = np.arange(0, 10, .5)
for ax, genre in zip(axes.ravel(), genresset):
    ax.hist(data[data[genre]].score, bins=bins, histtype='stepfilled',
            color='r', ec='none', alpha=.3, density=True)
    ax.hist(data.score, bins=bins, density=True, histtype='stepfilled',
            ec='none', color='#cccccc', zorder=0)
    ax.set_yticks([])
    ax.set_xlabel("Score")
    #remove_border(ax, left=False)
    ax.set_ylim(0, .4)
    ax.annotate(genre, xy=(.8, .2), ha='left', fontsize=10)