A voyage through evaluating preferential factors in cinema using public IMDB data
This notebook is adapted from Chris Beaumont's "Rubric for Data Wrangling and Exploration"
%matplotlib inline
# from pylab import *
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Load the IMDB dump into a labelled DataFrame so the columns are still
# self-explanatory when we come back to them later.
# The file has no header row, so the column names are supplied here.
# It is also clear that the fields are tab-separated.
names = ['imdbID', 'Title', 'year', 'score', 'votes', 'runtime', 'genres']
data = (
    pd.read_csv('imdb_top_10000.txt', delimiter='\t', names=names)
    .dropna()  # discard rows with any missing field
)
n_rows, n_cols = data.shape
print("Number of rows: %i. Number of columns: %i" % (n_rows, n_cols))
data.head()  # show the first 5 rows
Number of rows: 9999. Number of columns: 7
imdbID | Title | year | score | votes | runtime | genres | |
---|---|---|---|---|---|---|---|
0 | tt0111161 | The Shawshank Redemption (1994) | 1994 | 9.2 | 619479 | 142 mins. | Crime|Drama |
1 | tt0110912 | Pulp Fiction (1994) | 1994 | 9.0 | 490065 | 154 mins. | Crime|Thriller |
2 | tt0137523 | Fight Club (1999) | 1999 | 8.8 | 458173 | 139 mins. | Drama|Mystery|Thriller |
3 | tt0133093 | The Matrix (1999) | 1999 | 8.7 | 448114 | 136 mins. | Action|Adventure|Sci-Fi |
4 | tt1375666 | Inception (2010) | 2010 | 8.9 | 385149 | 148 mins. | Action|Adventure|Sci-Fi|Thriller |
We need to adjust the runtime and genres columns so we can use them. We may as well remove the superfluous year from the title, too.
You must love to munge, or work with someone who loves to munge, to get anywhere in data science. Understanding that strings behave like lists (they can be indexed and sliced) is helpful.
# Fix the title
# Every title ends with a 7-character " (YYYY)" suffix; slice it off for the
# whole column at once using the vectorised string accessor.
data['Title'] = data['Title'].str[:-7]
data['year']
0 1994 1 1994 2 1999 3 1999 4 2010 5 1994 6 1999 7 2009 8 1993 9 1980 10 2005 11 1995 12 1991 13 1997 14 1995 ... 9985 1970 9986 1955 9987 1990 9988 1995 9989 1991 9990 2005 9991 1971 9992 2004 9993 2000 9994 1998 9995 2007 9996 2005 9997 1966 9998 1999 9999 2001 Name: year, Length: 9999, dtype: int64
# Fix the runtime
# Each raw value looks like "142 mins."; keep the text before the first
# space and convert it to a float.
clean_runtime = []
for raw_runtime in data.runtime:
    minutes_text = raw_runtime.partition(' ')[0]
    clean_runtime.append(float(minutes_text))
data['runtime'] = clean_runtime
# Splitting up the genres
# Split each pipe-separated genre string once and reuse the pieces below.
genre_lists = [entry.split('|') for entry in data.genres]

# Determine the unique genres, in alphabetical order.
genresset = sorted(set(tag for tags in genre_lists for tag in tags))

# Make one boolean column per genre: True where the movie carries that genre.
for genre in genresset:
    data[genre] = [genre in tags for tags in genre_lists]

data.iloc[1:10, 4:23]
votes | runtime | genres | Action | Adult | Adventure | Animation | Biography | Comedy | Crime | Drama | Family | Fantasy | Film-Noir | History | Horror | Music | Musical | Mystery | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 490065 | 154 | Crime|Thriller | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False |
2 | 458173 | 139 | Drama|Mystery|Thriller | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | True |
3 | 448114 | 136 | Action|Adventure|Sci-Fi | True | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False |
4 | 385149 | 148 | Action|Adventure|Sci-Fi|Thriller | True | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False |
5 | 368994 | 142 | Comedy|Drama|Romance | False | False | False | False | False | True | False | True | False | False | False | False | False | False | False | False |
6 | 338332 | 122 | Drama | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False |
7 | 336855 | 162 | Action|Adventure|Fantasy|Sci-Fi | True | False | True | False | False | False | False | False | False | True | False | False | False | False | False | False |
8 | 325888 | 195 | Biography|Drama|History|War | False | False | False | False | True | False | False | True | False | False | False | True | False | False | False | False |
9 | 320105 | 124 | Action|Adventure|Family|Sci-Fi | True | False | True | False | False | False | False | False | True | False | False | False | False | False | False | False |
# Sanity check: do the global summary statistics of the numeric columns
# make sense?
data.loc[:, ['score', 'runtime', 'year', 'votes']].describe()
score | runtime | year | votes | |
---|---|---|---|---|
count | 9999.000000 | 9999.000000 | 9999.000000 | 9999.000000 |
mean | 6.385989 | 103.580358 | 1993.471447 | 16605.462946 |
std | 1.189965 | 26.629310 | 14.830049 | 34564.883945 |
min | 1.500000 | 0.000000 | 1950.000000 | 1356.000000 |
25% | 5.700000 | 93.000000 | 1986.000000 | 2334.500000 |
50% | 6.600000 | 102.000000 | 1998.000000 | 4981.000000 |
75% | 7.200000 | 115.000000 | 2005.000000 | 15278.500000 |
max | 9.200000 | 450.000000 | 2011.000000 | 619479.000000 |
# Remove any zero runtimes
# A runtime of 0 minutes is presumably a placeholder for missing data rather
# than a real measurement.
zero_runtime = data.runtime == 0
print(zero_runtime.sum())  # number of affected rows
# Flag the bad values as NaN. The assignment goes through a single .loc call:
# the chained form `data.runtime[mask] = ...` triggers SettingWithCopyWarning
# and is not guaranteed to write back into `data`.
data.loc[zero_runtime, 'runtime'] = np.nan
282
# Re-run the summary statistics of the numeric columns after flagging the
# zero runtimes.
data.loc[:, ['score', 'runtime', 'year', 'votes']].describe()
score | runtime | year | votes | |
---|---|---|---|---|
count | 9999.000000 | 9717.000000 | 9999.000000 | 9999.000000 |
mean | 6.385989 | 106.586395 | 1993.471447 | 16605.462946 |
std | 1.189965 | 20.230330 | 14.830049 | 34564.883945 |
min | 1.500000 | 45.000000 | 1950.000000 | 1356.000000 |
25% | 5.700000 | 93.000000 | 1986.000000 | 2334.500000 |
50% | 6.600000 | 103.000000 | 1998.000000 | 4981.000000 |
75% | 7.200000 | 115.000000 | 2005.000000 | 15278.500000 |
max | 9.200000 | 450.000000 | 2011.000000 | 619479.000000 |
# Exercise: do a histogram of all variables together.
# Observation: there are more movies in recent years, but not in the *very*
# recent ones (perhaps they haven't had time to receive lots of votes yet?).
year_edges = np.arange(1950, 2013)  # one-year-wide bins
fig, ax1 = plt.subplots(1, 1)
ax1.hist(data.year, bins=year_edges, color='#eeeeee')
ax1.set_title('Movie Releases by Year')
ax1.set_xlabel('Release Year')
<matplotlib.text.Text at 0x10795d8d0>
# Optional: update the matplotlib configuration parameters first, e.g.
# plt.rcParams.update({ 'font.family': 'monospace'})

# Distribution of IMDB ratings in 20 equal-width bins.
fig, ax1 = plt.subplots(1, 1)
ax1.hist(data['score'], bins=20, color="#eeeeee")
ax1.set_title('Movie Rating Frequency')
ax1.set_xlabel("IMDB rating")
<matplotlib.text.Text at 0x107126310>
# Distribution of run times; NaN entries are dropped before plotting.
valid_runtimes = data['runtime'].dropna()
plt.hist(valid_runtimes, bins=50, color='#cccccc')
plt.ylabel("Number of Movies")
plt.xlabel("Movie Run Time")
plt.title('Number of Movies per Run Times')
<matplotlib.text.Text at 0x1084eb290>
# Score against release year. There appear to be more bad movies in recent
# years -- is that real, or a selection bias?
plt.figure(figsize=(10, 5))
# Heavy transparency and no marker outline, so dense regions show up darker.
plt.scatter(data['year'], data['score'], lw=0, alpha=.05)
plt.ylabel("Score")
plt.xlabel("Year")
plt.title("Rating over time", fontsize='15')
<matplotlib.text.Text at 0x10875d8d0>
# Rating against number of votes.
plt.figure(figsize=(10, 5))
plt.scatter(data['votes'], data['score'], lw=0, alpha=.1)
plt.ylabel("IMDB Rating")
plt.xlabel("Number of Votes")
plt.title("Votes vs. Ratings", fontsize='x-large')
# Logarithmic x axis for the vote counts.
plt.xscale('log')
### Who are those outliers?
# Which movie has lots of votes but a score between 3 and 4?
# Filter: more than 70,000 votes and a score below 4.
many_votes = data.votes > 7e4
low_score = data.score < 4
data.loc[many_votes & low_score, ['Title', 'year', 'score', 'votes', 'genres']]
Title | year | score | votes | genres | |
---|---|---|---|---|---|
334 | Batman & Robin | 1997 | 3.5 | 91875 | Action|Crime|Fantasy|Sci-Fi |
# The lowest rated movies: every row whose score equals the overall minimum.
lowest_score = data.score.min()
data.loc[data.score == lowest_score, ['Title', 'year', 'score', 'votes', 'genres']]
Title | year | score | votes | genres | |
---|---|---|---|---|---|
1982 | Manos: The Hands of Fate | 1966 | 1.5 | 20927 | Horror |
2793 | Superbabies: Baby Geniuses 2 | 2004 | 1.5 | 13196 | Comedy|Family |
3746 | Daniel the Wizard | 2004 | 1.5 | 8271 | Comedy|Crime|Family|Fantasy|Horror |
5158 | Ben & Arthur | 2002 | 1.5 | 4675 | Drama|Romance |
5993 | Night Train to Mundo Fine | 1966 | 1.5 | 3542 | Action|Adventure|Crime|War |
6257 | Monster a-Go Go | 1965 | 1.5 | 3255 | Sci-Fi|Horror |
6726 | Dream Well | 2009 | 1.5 | 2848 | Comedy|Romance|Sport |
# Well-rated recent movies: score above 7 and released after 2010.
# (This is not the overall "highest rated" list.)
well_rated = data.score > 7
recent = data.year > 2010
data.loc[well_rated & recent, ['Title', 'year', 'score', 'votes', 'genres']]
Title | year | score | votes | genres | |
---|---|---|---|---|---|
460 | X-Men: First Class | 2011 | 8.0 | 72810 | Action|Adventure|Drama|Sci-Fi |
609 | Harry Potter and the Deathly Hallows: Part 2 | 2011 | 8.5 | 60422 | Adventure|Drama|Fantasy|Mystery |
624 | Thor | 2011 | 7.4 | 58330 | Action|Adventure|Fantasy |
733 | Source Code | 2011 | 7.7 | 53097 | Mystery|Sci-Fi|Thriller |
766 | Limitless | 2011 | 7.3 | 49143 | Mystery|Sci-Fi|Thriller |
905 | The Adjustment Bureau | 2011 | 7.1 | 44011 | Romance|Sci-Fi|Thriller |
940 | Fast Five | 2011 | 7.6 | 42983 | Action|Crime|Drama|Thriller |
1149 | Paul | 2011 | 7.2 | 35837 | Adventure|Comedy|Sci-Fi |
1301 | Rango | 2011 | 7.5 | 31795 | Animation|Adventure|Comedy|Family|Western |
1335 | Super 8 | 2011 | 7.7 | 31297 | Mystery|Sci-Fi|Thriller |
1703 | Hanna | 2011 | 7.1 | 24357 | Action|Crime|Mystery|Thriller |
1845 | Rio | 2011 | 7.1 | 22671 | Animation|Adventure|Comedy|Family|Musical |
1996 | The Lincoln Lawyer | 2011 | 7.4 | 20445 | Crime|Drama|Thriller |
2130 | Kung Fu Panda 2 | 2011 | 7.8 | 19102 | Animation|Action|Adventure|Comedy|Family |
2213 | Bridesmaids | 2011 | 7.6 | 17826 | Comedy |
2791 | The Tree of Life | 2011 | 7.9 | 13205 | Drama |
2923 | A Separation | 2011 | 8.6 | 11954 | Drama |
3390 | Horrible Bosses | 2011 | 7.6 | 9798 | Comedy|Crime |
3590 | Midnight in Paris | 2011 | 8.1 | 8852 | Comedy|Fantasy|Romance |
3791 | Kill the Irishman | 2011 | 7.1 | 8144 | Biography|Crime|Thriller |
4060 | Captain America: The First Avenger | 2011 | 7.9 | 7191 | Action|Adventure|Sci-Fi|Thriller |
6008 | Delhi Belly | 2011 | 8.2 | 3509 | Comedy|Crime|Drama |
6395 | Melancholia | 2011 | 7.9 | 3135 | Drama|Sci-Fi |
6842 | Win Win | 2011 | 7.7 | 2756 | Comedy|Drama |
6946 | Kaybedenler Kul\xfcb\xfc | 2011 | 7.8 | 2680 | Comedy|Drama |
7141 | Jane Eyre | 2011 | 7.8 | 2559 | Drama|Romance |
7857 | Love Likes Coincidences | 2011 | 7.4 | 2136 | Drama|Romance |
8113 | Attack the Block | 2011 | 7.4 | 2017 | Action|Comedy|Sci-Fi |
8173 | Zindagi Na Milegi Dobara | 2011 | 8.3 | 1982 | Drama|Romance |
9049 | Eyyvah eyvah 2 | 2011 | 7.3 | 1651 | Comedy |
9145 | No One Killed Jessica | 2011 | 7.2 | 1616 | Crime|Drama|Thriller |
9197 | Winnie the Pooh | 2011 | 7.6 | 1598 | Animation|Family |
9860 | The Turin Horse | 2011 | 8.5 | 1389 | Drama |
9966 | Pyaar Ka Punchnama | 2011 | 8.0 | 1359 | Comedy|Drama|Romance |
#etc.
# Temporary scratch check: what type is a single DataFrame column?
type(data['year'])
pandas.core.series.Series
Are there natural subsets we'd like to compare?
# By Decade
# Map each release year to the first year of its decade (1994 -> 1990).
# Floor division (//) is used explicitly: plain `/` only truncates under
# Python 2 integer division and would give 1994.0 under true division.
decade = (data.year // 10) * 10

# Make a separate table with the decades. Copy the column subset first so
# that adding a column does not write into a slice of `data`
# (which would trigger SettingWithCopyWarning).
tyd = data[['Title', 'year']].copy()
tyd['decade'] = decade
tyd.head()
Title | year | decade | |
---|---|---|---|
0 | The Shawshank Redemption (1994) | 1994 | 1990 |
1 | Pulp Fiction (1994) | 1994 | 1990 |
2 | Fight Club (1999) | 1999 | 1990 |
3 | The Matrix (1999) | 1999 | 1990 |
4 | Inception (2010) | 2010 | 2010 |
# Group the scores by decade and show the first 10 scores of each group.
scores_by_decade = data.groupby(decade)['score']
scores_by_decade.head(10)
year 1950 143 8.9 177 8.7 212 8.8 247 8.6 249 8.5 417 8.4 536 8.2 539 8.4 556 8.7 561 8.4 1960 85 9.0 92 8.4 98 8.6 128 8.7 321 8.8 ... 2000 23 8.4 24 8.5 25 8.9 27 8.8 28 8.8 2010 4 8.9 100 8.0 117 8.3 122 8.6 145 8.0 178 8.1 257 8.3 277 7.1 301 6.6 346 7.6 Length: 70, dtype: float64
# Mean score over all movies in each decade.
decade_mean = data['score'].groupby(decade).mean()
print(decade_mean)
year 1950 7.244522 1960 7.062367 1970 6.842297 1980 6.248693 1990 6.199316 2000 6.277858 2010 6.344552 Name: score, dtype: float64
# Plot the per-decade mean score as a thin red line with circle markers.
plt.plot(
    decade_mean.index,
    decade_mean.values,
    marker='o',
    linestyle='-',
    color='r',
    linewidth=1,
    label='Decade Average',
)
# Optional overlay of the individual movies:
#plt.scatter(data.year, data.score, alpha=.04, lw=0)
# Optional legend:
#plt.legend(frameon=False)
plt.ylabel("Score")
plt.xlabel("Year")
plt.title("Average Score per Decade")
<matplotlib.text.Text at 0x1070462d0>
# Per-decade mean score (red line with circle markers) drawn together with
# a faint scatter of every individual movie.
plt.plot(
    decade_mean.index,
    decade_mean.values,
    marker='o',
    linestyle='-',
    color='r',
    linewidth=1,
    label='Decade Average',
)
plt.scatter(data['year'], data['score'], alpha=.04, lw=0)
plt.ylabel("Score")
plt.xlabel("Year")
plt.title("Movie Scores over Time")
plt.legend(frameon=False)  # add a legend without a surrounding box
<matplotlib.legend.Legend at 0x1075a4150>
# Number of distinct genres; the genre grids below use one subplot per genre.
len(genresset)
# axes.ravel() flattens the 2-D array of subplot axes into a 1-D sequence,
# so it can be zipped with the genre list (one axis per genre).
24
# Compare across genres
# Create a 4x6 grid of plots: one panel for each of the 24 genres.
fig, axes = plt.subplots(nrows=4, ncols=6, figsize=(12, 8), tight_layout=True)
bins = np.arange(1950, 2013, 3)  # 3-year-wide bins shared by every panel
for ax, genre in zip(axes.ravel(), genresset):  # each of the 24 axes gets a genre
    # Release years of the movies in this genre, in translucent red.
    # normed=True rescales the histogram to unit area, so genres with very
    # different movie counts can be compared by shape.
    # NOTE(review): newer matplotlib releases spell this option `density`.
    ax.hist(data.loc[data[genre] == 1, 'year'], bins=bins, histtype='stepfilled',
            normed=True, color='r', alpha=.2, ec='none')
    # Release years of all movies, in grey, drawn underneath (zorder=0);
    # this is the identical reference background in every panel.
    ax.hist(data.year, bins=bins, histtype='stepfilled', ec='None',
            normed=True, zorder=0, color='#cccccc')
    ax.xaxis.set_ticks(np.arange(1950, 2013, 30))
    ax.set_yticks([])  # absolute heights are not meaningful here; hide them
    ax.annotate(genre, xy=(1955, 3e-2), fontsize=14)
    ax.set_xlabel('Year')
# Run-time distribution per genre on a 4x6 grid, one panel per genre.
fig, axes = plt.subplots(nrows=4, ncols=6, figsize=(12, 8), tight_layout=True)
bins = np.arange(30, 240, 10)  # 10-minute-wide bins
for ax, genre in zip(axes.ravel(), genresset):
    genre_runtimes = data.loc[data[genre] == 1, 'runtime']
    # This genre's run times, in translucent red.
    ax.hist(genre_runtimes, bins=bins, normed=True, histtype='stepfilled',
            color='r', edgecolor='none', alpha=.3)
    # All movies' run times, in grey, drawn underneath as a shared reference.
    ax.hist(data['runtime'], bins=bins, normed=True, histtype='stepfilled',
            color='#cccccc', edgecolor='none', zorder=0)
    ax.set_yticks([])
    ax.set_xticks(np.arange(30, 240, 60))
    ax.set_xlabel("Runtime [min]")
    #remove_border(ax, left=False)
    ax.annotate(genre, xy=(230, .02), ha='right', fontsize=12)
# Score distribution per genre on a 4x6 grid, one panel per genre.
fig, axes = plt.subplots(nrows=4, ncols=6, figsize=(12, 8), tight_layout=True)
bins = np.arange(0, 10, .5)  # half-point-wide bins
for ax, genre in zip(axes.ravel(), genresset):
    genre_scores = data.loc[data[genre] == 1, 'score']
    # This genre's scores, in translucent red.
    ax.hist(genre_scores, bins=bins, normed=True, histtype='stepfilled',
            color='r', edgecolor='none', alpha=.3)
    # All movies' scores, in grey, drawn underneath as a shared reference.
    ax.hist(data['score'], bins=bins, normed=True, histtype='stepfilled',
            color='#cccccc', edgecolor='none', zorder=0)
    ax.set_yticks([])
    ax.set_xlabel("Score")
    #remove_border(ax, left=False)
    # Same vertical range in every panel.
    ax.set_ylim(0, .4)
    ax.annotate(genre, xy=(.8, .2), ha='left', fontsize=10)