Data is available at http://bit.ly/cs109_imdb.
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
#let pandas print wide tables in full: a display width of 500 and up to 100 columns
for option, value in [('display.width', 500), ('display.max_columns', 100)]:
    pd.set_option(option, value)
def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Reduce chartjunk by hiding unneeded plot borders and axis ticks.

    axes: the axes to modify; when omitted (or falsy) the current axes
          returned by plt.gca() are used.
    top/right/left/bottom: whether the corresponding plot border is drawn.
          Ticks are switched back on only for the sides that are drawn.
    """
    ax = axes if axes else plt.gca()

    #show or hide each border according to its keyword
    drawn = (('top', top), ('right', right), ('left', left), ('bottom', bottom))
    for side, visible in drawn:
        ax.spines[side].set_visible(visible)

    #start from a clean slate: no ticks on either axis
    for axis in (ax.yaxis, ax.xaxis):
        axis.set_ticks_position('none')

    #then restore the ticks on the sides that are drawn (x axis first)
    restorers = (
        (top, 'xaxis', 'tick_top'),
        (bottom, 'xaxis', 'tick_bottom'),
        (left, 'yaxis', 'tick_left'),
        (right, 'yaxis', 'tick_right'),
    )
    for enabled, axis_name, method_name in restorers:
        if enabled:
            getattr(getattr(ax, axis_name), method_name)()
!head imdb_top_10000.txt
tt0111161 The Shawshank Redemption (1994) 1994 9.2 619479 142 mins. Crime|Drama tt0110912 Pulp Fiction (1994) 1994 9.0 490065 154 mins. Crime|Thriller tt0137523 Fight Club (1999) 1999 8.8 458173 139 mins. Drama|Mystery|Thriller tt0133093 The Matrix (1999) 1999 8.7 448114 136 mins. Action|Adventure|Sci-Fi tt1375666 Inception (2010) 2010 8.9 385149 148 mins. Action|Adventure|Sci-Fi|Thriller tt0109830 Forrest Gump (1994) 1994 8.7 368994 142 mins. Comedy|Drama|Romance tt0169547 American Beauty (1999) 1999 8.6 338332 122 mins. Drama tt0499549 Avatar (2009) 2009 8.1 336855 162 mins. Action|Adventure|Fantasy|Sci-Fi tt0108052 Schindler's List (1993) 1993 8.9 325888 195 mins. Biography|Drama|History|War tt0080684 Star Wars: Episode V - The Empire Strikes Back (1980) 1980 8.8 320105 124 mins. Action|Adventure|Family|Sci-Fi
#the file has no header row, so the column labels are supplied here
names = ['imdbID', 'title', 'year', 'rating', 'votes', 'runtime', 'genres']
#tab-separated input; rows with any missing field are dropped
data = pd.read_csv('imdb_top_10000.txt', delimiter='\t', names=names).dropna()
#a parenthesised single-argument print runs on Python 2 and Python 3 alike
print("Number of rows: %i" % data.shape[0])
data.head()
Number of rows: 9999
imdbID | title | year | rating | votes | runtime | genres | |
---|---|---|---|---|---|---|---|
0 | tt0111161 | The Shawshank Redemption (1994) | 1994 | 9.2 | 619479 | 142 mins. | Crime|Drama |
1 | tt0110912 | Pulp Fiction (1994) | 1994 | 9.0 | 490065 | 154 mins. | Crime|Thriller |
2 | tt0137523 | Fight Club (1999) | 1999 | 8.8 | 458173 | 139 mins. | Drama|Mystery|Thriller |
3 | tt0133093 | The Matrix (1999) | 1999 | 8.7 | 448114 | 136 mins. | Action|Adventure|Sci-Fi |
4 | tt1375666 | Inception (2010) | 2010 | 8.9 | 385149 | 148 mins. | Action|Adventure|Sci-Fi|Thriller |
#worked example: pull the number of minutes out of one raw runtime string
dirty = '142 mins.'
#the text before the first space is the number
clean = int(dirty.split(' ')[0])
#can also do:
#clean = int(dirty[0:-6])
#a parenthesised single-argument print runs on Python 2 and Python 3 alike
print(clean)
142
Applying this for all the rows:
#keep only the text before the first space of every runtime ("142 mins." -> 142.0)
clean_runtime = [float(text.partition(' ')[0]) for text in data.runtime]
data['runtime'] = clean_runtime # or data.runtime = clean_runtime
data.head()
imdbID | title | year | rating | votes | runtime | genres | |
---|---|---|---|---|---|---|---|
0 | tt0111161 | The Shawshank Redemption (1994) | 1994 | 9.2 | 619479 | 142 | Crime|Drama |
1 | tt0110912 | Pulp Fiction (1994) | 1994 | 9.0 | 490065 | 154 | Crime|Thriller |
2 | tt0137523 | Fight Club (1999) | 1999 | 8.8 | 458173 | 139 | Drama|Mystery|Thriller |
3 | tt0133093 | The Matrix (1999) | 1999 | 8.7 | 448114 | 136 | Action|Adventure|Sci-Fi |
4 | tt1375666 | Inception (2010) | 2010 | 8.9 | 385149 | 148 | Action|Adventure|Sci-Fi|Thriller |
We can split up the genres column into many columns (one for each genre) and assign a boolean value to each
#determine the unique genres: a set comprehension collapses the duplicates
#across all movies, and sorted() turns the result into an ordered list
genres = sorted({name for entry in data.genres for name in entry.split('|')})
#make a column for each genre
#assign a boolean value to these columns for each movie in the db
#Action column gets filled for all movies, followed by Adult and so on
#split every genres string once up front instead of once per genre column
genre_lists = [m.split('|') for m in data.genres]
for genre in genres:
    data[genre] = [genre in movie_genres for movie_genres in genre_lists]
data.head()
imdbID | title | year | rating | votes | runtime | genres | Action | Adult | Adventure | Animation | Biography | Comedy | Crime | Drama | Family | Fantasy | Film-Noir | History | Horror | Music | Musical | Mystery | News | Reality-TV | Romance | Sci-Fi | Sport | Thriller | War | Western | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | tt0111161 | The Shawshank Redemption (1994) | 1994 | 9.2 | 619479 | 142 | Crime|Drama | False | False | False | False | False | False | True | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
1 | tt0110912 | Pulp Fiction (1994) | 1994 | 9.0 | 490065 | 154 | Crime|Thriller | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False |
2 | tt0137523 | Fight Club (1999) | 1999 | 8.8 | 458173 | 139 | Drama|Mystery|Thriller | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | True | False | False | False | False | False | True | False | False |
3 | tt0133093 | The Matrix (1999) | 1999 | 8.7 | 448114 | 136 | Action|Adventure|Sci-Fi | True | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False |
4 | tt1375666 | Inception (2010) | 2010 | 8.9 | 385149 | 148 | Action|Adventure|Sci-Fi|Thriller | True | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | True | False | False |
#drop the last 7 characters of every title, i.e. a suffix such as " (1994)"
data['title'] = [full_title[:-7] for full_title in data.title]
data.head()
imdbID | title | year | rating | votes | runtime | genres | Action | Adult | Adventure | Animation | Biography | Comedy | Crime | Drama | Family | Fantasy | Film-Noir | History | Horror | Music | Musical | Mystery | News | Reality-TV | Romance | Sci-Fi | Sport | Thriller | War | Western | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | tt0111161 | The Shawshank Redemption | 1994 | 9.2 | 619479 | 142 | Crime|Drama | False | False | False | False | False | False | True | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
1 | tt0110912 | Pulp Fiction | 1994 | 9.0 | 490065 | 154 | Crime|Thriller | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False |
2 | tt0137523 | Fight Club | 1999 | 8.8 | 458173 | 139 | Drama|Mystery|Thriller | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | True | False | False | False | False | False | True | False | False |
3 | tt0133093 | The Matrix | 1999 | 8.7 | 448114 | 136 | Action|Adventure|Sci-Fi | True | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False |
4 | tt1375666 | Inception | 2010 | 8.9 | 385149 | 148 | Action|Adventure|Sci-Fi|Thriller | True | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | True | False | False |
Now that we have the data in the right form, we can move on to exploring some properties of this data.
We need to look at what questions we want to answer. Some possibilities are:
-> What are the most popular genres?
-> Is there any correlation between rating and number of votes?
-> How does the rating vary by year, genre?
-> What are the best movies year-wise?
We'll now go about finding answers to these questions.
We can get a rough overview of the data by calling the describe method.
#summary statistics for the numeric columns only
numeric_columns = ['year', 'rating', 'votes', 'runtime']
data[numeric_columns].describe()
year | rating | votes | runtime | |
---|---|---|---|---|
count | 9999.000000 | 9999.000000 | 9999.000000 | 9999.000000 |
mean | 1993.471447 | 6.385989 | 16605.462946 | 103.580358 |
std | 14.830049 | 1.189965 | 34564.883945 | 26.629310 |
min | 1950.000000 | 1.500000 | 1356.000000 | 0.000000 |
25% | 1986.000000 | 5.700000 | 2334.500000 | 93.000000 |
50% | 1998.000000 | 6.600000 | 4981.000000 | 102.000000 |
75% | 2005.000000 | 7.200000 | 15278.500000 | 115.000000 |
max | 2011.000000 | 9.200000 | 619479.000000 | 450.000000 |
We can see some basic properties of the attributes. Note that some of the values don't make sense: for example, the minimum runtime is 0.0, which says something about the quality of the mined data. So we may want to count the records that have this problem and flag them, so that those values aren't used when calculating the mean, standard deviation, etc.
#take the subset of the df having runtime=0 and find the number of records in that subset
zero_runtime = data.runtime == 0
print(len(data[zero_runtime]))
#Flagging the bad values
#one .loc assignment writes into data itself; the chained form
#data.runtime[mask] = ... may write to a temporary copy (SettingWithCopyWarning)
data.loc[zero_runtime, 'runtime'] = np.nan
282
/home/priyanka/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
Now we get
#summary statistics of the runtime column on its own
data.runtime.describe()
count 9717.000000 mean 106.586395 std 20.230330 min 45.000000 25% 93.000000 50% 103.000000 75% 115.000000 max 450.000000 Name: runtime, dtype: float64
We can make use of matplotlib for this purpose.
#You can find the max and min ranges for relevant axes. For example
#the Series methods are used instead of the Python builtins, and the
#parenthesised single-argument print runs on Python 2 and Python 3 alike
print(data.year.max())
print(data.year.min())
2011 1950
#histogram of the number of movies per release year
#np.arange(1950, 2015) gives one bin edge per year: 1950, 1951, ..., 2014
year_edges = np.arange(1950, 2015)
plt.hist(data.year, bins=year_edges, color='#cccccc')
plt.xlabel('Release Year')
remove_border()
#histogram of the ratings, split into 20 bins
plt.hist(data['rating'], color='#cccccc', bins=20)
plt.xlabel('Rating')
remove_border()
#histogram of the runtimes in 50 bins; missing runtimes are left out
known_runtime = data.runtime[data.runtime.notnull()]
plt.hist(known_runtime, bins=50, color='#cccccc')
plt.xlabel('Runtime')
remove_border()
# We know the x axis goes till 450 because
# (Series.max() skips the NaN placeholders; the builtin max() compares
# element by element, so its result can depend on where the NaNs sit)
data.runtime.max()
450.0
#scatter plots are a handy way to look for correlations
#borderless, very transparent black markers so that dense regions show up darker
marker_style = dict(lw=0, alpha=0.08, color='k')
plt.scatter(data.year, data.rating, **marker_style)
plt.xlabel("Year")
plt.ylabel("IMDB Rating")
remove_border()
This seems to suggest that earlier movies had better ratings compared to later ones.
Is this a selection bias?
One possible explanation could be that IMDB came into existence only in 2005 and people are more likely to have seen and rated good movies from the 50's to 80's.
#rating plotted against the number of votes
plt.scatter(data.votes, data.rating, alpha=0.08, color='k')
plt.xlabel('Votes')
#the y values are data.rating, so label the axis accordingly
plt.ylabel('Rating')
remove_border()
Clearly we need to change our x-axis in this case.
We can choose a log scale for the purpose.
#rating plotted against the number of votes, with a logarithmic x axis
plt.scatter(data.votes, data.rating, lw=0, alpha=0.2, color='k')
plt.xlabel('Votes')
#the y values are data.rating, so label the axis accordingly
plt.ylabel('Rating')
plt.xscale('log')
remove_border()
We tend to think that movies with a lot of votes are more popular and hence have a higher rating. But in this plot we do see some outliers which have a lot of votes but a very poor rating. So we can infer that these movies were probably strongly disliked by the viewers. Let's pull out these movies from the data.
#movies with more than 90000 votes but a rating below 5
many_votes = data.votes > 90000
low_rating = data.rating < 5
data[many_votes & low_rating][['title', 'year', 'rating', 'votes', 'genres']]
title | year | rating | votes | genres | |
---|---|---|---|---|---|
317 | New Moon | 2009 | 4.5 | 90457 | Adventure|Drama|Fantasy|Romance |
334 | Batman & Robin | 1997 | 3.5 | 91875 | Action|Crime|Fantasy|Sci-Fi |
# The lowest rated movies (every movie tied at the minimum rating)
lowest_rating = data.rating.min()
data[data.rating == lowest_rating][['title', 'year', 'rating', 'votes', 'genres']]
# or data[data.rating==min(data.rating)][['title','year','rating','votes','genres']]
title | year | rating | votes | genres | |
---|---|---|---|---|---|
1982 | Manos: The Hands of Fate | 1966 | 1.5 | 20927 | Horror |
2793 | Superbabies: Baby Geniuses 2 | 2004 | 1.5 | 13196 | Comedy|Family |
3746 | Daniel the Wizard | 2004 | 1.5 | 8271 | Comedy|Crime|Family|Fantasy|Horror |
5158 | Ben & Arthur | 2002 | 1.5 | 4675 | Drama|Romance |
5993 | Night Train to Mundo Fine | 1966 | 1.5 | 3542 | Action|Adventure|Crime|War |
6257 | Monster a-Go Go | 1965 | 1.5 | 3255 | Sci-Fi|Horror |
6726 | Dream Well | 2009 | 1.5 | 2848 | Comedy|Romance|Sport |
# The highest rated movies (every movie tied at the maximum rating)
is_top_rated = data.rating == data.rating.max()
data.loc[is_top_rated, ['title', 'year', 'rating', 'votes', 'genres']]
title | year | rating | votes | genres | |
---|---|---|---|---|---|
0 | The Shawshank Redemption | 1994 | 9.2 | 619479 | Crime|Drama |
26 | The Godfather | 1972 | 9.2 | 474189 | Crime|Drama |
# The top 10 highest rated movies
# DataFrame.sort does not exist in newer pandas releases, which provide
# sort_values instead; use sort_values when it is available and fall back
# to sort on older installations
if hasattr(data, 'sort_values'):
    by_rating = data.sort_values('rating', ascending=False)
else:
    by_rating = data.sort('rating', ascending=False)
by_rating[['title', 'year', 'rating', 'votes', 'genres']].head(10)
title | year | rating | votes | genres | |
---|---|---|---|---|---|
0 | The Shawshank Redemption | 1994 | 9.2 | 619479 | Crime|Drama |
26 | The Godfather | 1972 | 9.2 | 474189 | Crime|Drama |
3386 | Outrageous Class | 1975 | 9.0 | 9823 | Comedy|Drama |
37 | The Godfather: Part II | 1974 | 9.0 | 291169 | Crime|Drama |
85 | The Good, the Bad and the Ugly | 1966 | 9.0 | 195238 | Western |
1 | Pulp Fiction | 1994 | 9.0 | 490065 | Crime|Thriller |
25 | The Dark Knight | 2008 | 8.9 | 555122 | Action|Crime|Drama|Thriller |
143 | 12 Angry Men | 1957 | 8.9 | 148155 | Drama|Mystery |
44 | One Flew Over the Cuckoo's Nest | 1975 | 8.9 | 255503 | Drama |
4 | Inception | 2010 | 8.9 | 385149 | Action|Adventure|Sci-Fi|Thriller |
Notice that record 3386 appears at number 3 on this list, although it's featured as the 3386th best movie in IMDB.
This is because its vote count is low.
What genres are more frequent?
#the Action column holds booleans, so its sum is the number of Action movies
data['Action'].sum()
1891
#the same count for every genre at once
#axis=0 (the default) adds up each column down its rows
data[genres].sum(axis=0)
Action 1891 Adult 9 Adventure 1313 Animation 314 Biography 394 Comedy 3922 Crime 1867 Drama 5697 Family 754 Fantasy 916 Film-Noir 40 History 358 Horror 1215 Music 371 Musical 260 Mystery 1009 News 1 Reality-TV 1 Romance 2441 Sci-Fi 897 Sport 288 Thriller 2832 War 512 Western 235 dtype: int64
#number of movies per genre, largest first, labelled with the genre name
genre_totals = data[genres].sum()
#argsort gives the positions in ascending order of count; [::-1] reverses them
order = np.argsort(genre_totals.values)[::-1]
genre_count = genre_totals.values[order] #counts sorted in descending order
#reuse the same ordering for the index so that each count keeps its genre name
pd.DataFrame({'Genre Count': genre_count}, index=genre_totals.index[order])
Genre Count | |
---|---|
0 | 5697 |
1 | 3922 |
2 | 2832 |
3 | 2441 |
4 | 1891 |
5 | 1867 |
6 | 1313 |
7 | 1215 |
8 | 1009 |
9 | 916 |
10 | 897 |
11 | 754 |
12 | 512 |
13 | 394 |
14 | 371 |
15 | 358 |
16 | 314 |
17 | 288 |
18 | 260 |
19 | 235 |
20 | 40 |
21 | 9 |
22 | 1 |
23 | 1 |
Also note the difference between the following two statements.
#the original text column named 'genres' (pipe-separated genre names)
data['genres'].head()
0 Crime|Drama 1 Crime|Thriller 2 Drama|Mystery|Thriller 3 Action|Adventure|Sci-Fi 4 Action|Adventure|Sci-Fi|Thriller Name: genres, dtype: object
#here genres is the sorted list of unique genre names built earlier,
#so this selects one boolean column per genre
data[genres].head(5)
Action | Adult | Adventure | Animation | Biography | Comedy | Crime | Drama | Family | Fantasy | Film-Noir | History | Horror | Music | Musical | Mystery | News | Reality-TV | Romance | Sci-Fi | Sport | Thriller | War | Western | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | False | False | False | False | False | False | True | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
1 | False | False | False | False | False | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False |
2 | False | False | False | False | False | False | False | True | False | False | False | False | False | False | False | True | False | False | False | False | False | True | False | False |
3 | True | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | False | False | False |
4 | True | False | True | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True | False | True | False | False |
How many genres does a movie have on average?
#axis=1 sums across the genre columns, giving one count per movie
genre_count = data[genres].sum(axis=1)
#interpolate the mean with %; a comma here would print the raw format
#string followed by the unformatted number
print("On an average, a movie has %0.2f genres" % genre_count.mean())
genre_count.describe()
On an average, a movie has %0.2f genres 2.75397539754
count 9999.000000 mean 2.753975 std 1.168910 min 1.000000 25% 2.000000 50% 3.000000 75% 3.000000 max 8.000000 dtype: float64
Let's split up movies by decade
#floor each year to the start of its decade, e.g. 1994 -> 1990
decade = (data.year // 10) * 10
#copy the two columns so that adding 'decade' modifies an independent frame
#(assigning into a plain column selection raises SettingWithCopyWarning)
dec = data[['title', 'year']].copy()
dec['decade'] = decade
dec.head()
/home/priyanka/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy app.launch_new_instance()
title | year | decade | |
---|---|---|---|
0 | The Shawshank Redemption | 1994 | 1990 |
1 | Pulp Fiction | 1994 | 1990 |
2 | Fight Club | 1999 | 1990 |
3 | The Matrix | 1999 | 1990 |
4 | Inception | 2010 | 2010 |
#use groupby to get group properties
#gather mean rating for each decade
decade_mean = data.groupby(decade).rating.mean()
decade_mean.name = 'Decade Mean'
#a parenthesised single-argument print runs on Python 2 and Python 3 alike
print(decade_mean)
year 1950 7.244522 1960 7.062367 1970 6.842297 1980 6.248693 1990 6.199316 2000 6.277858 2010 6.344552 Name: Decade Mean, dtype: float64
#plot the decade values against their corresponding mean rating
plt.plot(decade_mean.index, decade_mean.values, 'o-', color='r', lw=2, label='Decade Average')
#every individual movie as a faint background cloud
plt.scatter(data.year, data.rating, alpha=0.04, lw=0, color='k')
plt.xlabel('Year')
plt.ylabel('Rating')
#frameon must be the boolean False; the string 'False' is non-empty and
#therefore truthy, which leaves the legend frame switched on
plt.legend(frameon=False)
remove_border()
#we can go one step further, and compute the scatter within each decade as well
#standard deviation is a measure of how much the members of a group differ from the mean value for the group
grouped_scores = data.groupby(decade).rating
mean = grouped_scores.mean() #each decade has a mean value
std = grouped_scores.std() # each decade has a std dev value
#plot from the mean computed just above, so this cell only relies on data and decade
plt.plot(mean.index, mean.values, 'o-',
         color='r', lw=3, label='Decade Average')
#shaded band of one standard deviation on either side of the decade mean
plt.fill_between(mean.index, (mean + std).values,
                 (mean - std).values, color='r', alpha=.2)
plt.scatter(data.year, data.rating, alpha=.04, lw=0, color='k')
plt.xlabel("Year")
plt.ylabel("Score")
plt.legend(frameon=False)
remove_border()
You can also iterate over a GroupBy object. Each iteration yields two variables: one of the distinct values of the group key, and the subset of the dataframe where the key equals that value. To find the highest-rated movie of each year:
#one iteration per release year; subset holds the movies released that year
for year, subset in data.groupby(data.year):
    top_rated = subset.rating == subset.rating.max()
    #when several movies tie for the top rating, the first one listed is shown
    #a single formatted string keeps the print valid on Python 2 and Python 3
    print("%s %s" % (year, subset.title[top_rated].values[0]))
1950 Sunset Blvd. 1951 Strangers on a Train 1952 Singin' in the Rain 1953 The Wages of Fear 1954 Seven Samurai 1955 Diabolique 1956 The Killing 1957 12 Angry Men 1958 Vertigo 1959 North by Northwest 1960 Psycho 1961 Yojimbo 1962 To Kill a Mockingbird 1963 The Great Escape 1964 Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb 1965 For a Few Dollars More 1966 The Good, the Bad and the Ugly 1967 Cool Hand Luke 1968 Once Upon a Time in the West 1969 Butch Cassidy and the Sundance Kid 1970 Patton 1971 A Clockwork Orange 1972 The Godfather 1973 The Sting 1974 The Godfather: Part II 1975 Outrageous Class 1976 Tosun Pasa 1977 Star Wars: Episode IV - A New Hope 1978 The Girl with the Red Scarf 1979 Apocalypse Now 1980 Star Wars: Episode V - The Empire Strikes Back 1981 Raiders of the Lost Ark 1982 The Marathon Family 1983 Star Wars: Episode VI - Return of the Jedi 1984 Balkan Spy 1985 The Broken Landlord 1986 Aliens 1987 Mr. Muhsin 1988 Cinema Paradiso 1989 Indiana Jones and the Last Crusade 1990 Goodfellas 1991 The Silence of the Lambs 1992 Reservoir Dogs 1993 Schindler's List 1994 The Shawshank Redemption 1995 The Usual Suspects 1996 Fargo 1997 Life Is Beautiful 1998 American History X 1999 Fight Club 2000 Memento 2001 The Lord of the Rings: The Fellowship of the Ring 2002 City of God 2003 The Lord of the Rings: The Return of the King 2004 Eternal Sunshine of the Spotless Mind 2005 My Father and My Son 2006 The Departed 2007 Like Stars on Earth 2008 The Dark Knight 2009 Inglourious Basterds 2010 Inception 2011 A Separation
Let's split up the movies by genre, and look at how their release year/runtime/IMDB score vary. The distribution for all movies is shown as a grey background.
This isn't a standard groupby, so we can't use the groupby method here. A manual loop is needed