#!/usr/bin/env python
# coding: utf-8

# # IMDb Data Wrangling and Exploration
# ## Task: Use the IMDb top 10,000 movies data for cleaning and exploration
# Data is available at http://bit.ly/cs109_imdb.

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# tell pandas to display wide tables as pretty HTML tables
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)


def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """
    Minimize chartjunk by stripping out unnecessary plot borders and axis ticks.

    The top/right/left/bottom keywords toggle whether the corresponding plot
    border (spine) is drawn.  When `axes` is None the current axes is used.
    """
    # explicit None check: a passed-in Axes should never be truth-tested
    ax = axes if axes is not None else plt.gca()

    # toggle each spine according to its keyword
    for side, shown in (('top', top), ('right', right),
                        ('left', left), ('bottom', bottom)):
        ax.spines[side].set_visible(shown)

    # turn off all ticks ...
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')

    # ... then re-enable them only on the visible sides
    if top:
        ax.xaxis.tick_top()
    if bottom:
        ax.xaxis.tick_bottom()
    if left:
        ax.yaxis.tick_left()
    if right:
        ax.yaxis.tick_right()


# In[2]:

get_ipython().system('head imdb_top_10000.txt')

# ## 1. Build a dataframe

# In[3]:

names = ['imdbID', 'title', 'year', 'rating', 'votes', 'runtime', 'genres']
data = pd.read_csv('imdb_top_10000.txt', delimiter='\t', names=names).dropna()
# print() call works on both Python 2 and 3 (the original print statement
# is a SyntaxError under Python 3)
print("Number of rows: %i" % data.shape[0])
data.head()

# ## 2. Clean the dataframe
#
# ### a. Fixing runtime
# remove unnecessary details (mins) and convert to float value

# In[4]:

dirty = '142 mins.'
clean = int(dirty.split(' ')[0])
# can also do:
# clean = int(dirty[0:-6])
print(clean)

# Applying this for all the rows:

# In[5]:

clean_runtime = [float(r.split(' ')[0]) for r in data.runtime]
data.runtime = clean_runtime
# or data['runtime'] = clean_runtime
data.head()

# ### b. Fixing genres
# We can split up the genres column into many columns (one for each genre)
# and assign a boolean value to each.

# In[6]:

# determine the unique genres; use a set for this
genres = set()
for row_genres in data.genres:
    # BUG FIX: the original wrote `genres.update(m for m in m.split('|'))`,
    # shadowing the loop variable with a pointless generator; update()
    # accepts the split list directly
    genres.update(row_genres.split('|'))
genres = sorted(genres)

# make a column for each genre and assign a boolean value to these columns
# for each movie in the db (the Action column gets filled for all movies,
# followed by Adult, and so on)
for genre in genres:
    data[genre] = [genre in movie_genres.split('|')
                   for movie_genres in data.genres]
data.head()

# ### c. Removing year from the title

# In[7]:

# every title ends with a " (YYYY)" suffix -- exactly 7 characters
data.title = [t[0:-7] for t in data.title]
data.head()

# Now that we have the data in the right form, we can move on to exploring
# some properties of this data.
#
# ## 3. Explore global properties
#
# We need to look at what questions we want to answer. Some possibilities are:
# - What are the most popular genres?
# - Is there any correlation between a movie's rating and its number of votes?
# - How does the rating vary by year and by genre?
# - What are the best movies year-wise?
#
# We'll now go about finding answers to these questions.
#
# ### Call describe on relevant columns
# We can get a rough overview of the data by calling the describe method.

# In[8]:

data[['year', 'rating', 'votes', 'runtime']].describe()

# We can see some basic properties of the attributes. One may note that some
# properties don't make sense. For example, the min runtime is 0.0, which
# says something about the quality of the mined data. So we may want to see
# the number of records showing that behavior and flag them so that those
# values aren't taken into account when calculating mean, std dev, etc.

# In[9]:

# take the subset of the df having runtime == 0 and count its records
print(len(data[data.runtime == 0]))
# Flag the bad values as missing.
# BUG FIX: the original chained assignment `data.runtime[...] = np.nan`
# writes through an intermediate object and is not guaranteed to modify
# `data` (SettingWithCopyWarning); .loc assigns in place reliably.
data.loc[data.runtime == 0, 'runtime'] = np.nan

# Now we get

# In[10]:

data['runtime'].describe()

# ### Make some basic plots
# We can make use of matplotlib for this purpose.

# In[11]:

# You can find the max and min ranges for relevant axes. For example
print(max(data.year))
print(min(data.year))

# In[12]:

# plotting a histogram of the number of movies per year
# (np.arange returns evenly spaced values within a given interval)
plt.hist(data.year, bins=np.arange(1950, 2015), color='#cccccc')
plt.xlabel('Release Year')
remove_border()

# Useful links
# http://docs.scipy.org/doc/numpy/reference/generated/numpy.arange.html

# In[13]:

# Distribution of the IMDB ratings.
plt.hist(data.rating, color='#cccccc', bins=20)
plt.xlabel('Rating')
remove_border()

# In[14]:

# Distribution of runtimes; the flagged NaN values are dropped first.
valid_runtimes = data.runtime.dropna()
plt.hist(valid_runtimes, color='#cccccc', bins=50)
plt.xlabel('Runtime')
remove_border()

# In[15]:

# We know the x axis goes till 450 because max(data.runtime)

# In[16]:

# we can use scatter plots to represent correlations
plt.scatter(data.year, data.rating, color='k', alpha=0.08, lw=0)
plt.xlabel("Year")
plt.ylabel("IMDB Rating")
remove_border()

# This seems to suggest that earlier movies had better ratings compared to
# later ones.
# Is this a selection bias?
# One possible explanation is selection bias in the ratings: people are more
# likely to have sought out, watched, and rated only the well-regarded
# movies from the 50's to 80's. (NOTE(review): the original comment claimed
# IMDB "came into existence only in 2005", which is not accurate -- the site
# dates back to the early 90's -- so the claim has been softened here.)

# In[17]:

plt.scatter(data.votes, data.rating, alpha=0.08, color='k')
plt.xlabel('Votes')
# BUG FIX: the y axis shows the rating, not the year
plt.ylabel('Rating')
remove_border()

# Clearly we need to change our x-axis in this case.
# We can choose a log scale for the purpose.

# In[18]:

plt.scatter(data.votes, data.rating, lw=0, alpha=0.2, color='k')
plt.xlabel('Votes')
# BUG FIX: the y axis shows the rating, not the year
plt.ylabel('Rating')
plt.xscale('log')
remove_border()

# What we tend to think is that movies with a lot of votes tend to be more
# popular and hence have a higher rating. But in this plot, we do see some
# outliers which have a lot of votes but a very poor rating. So we can infer
# that these movies were probably highly hated by the viewers. Let's pull
# out these movies from the data.

# In[19]:

# low rating movies with lots of votes
data[(data.votes > 90000) & (data.rating < 5)][['title', 'year', 'rating', 'votes', 'genres']]

# In[20]:

# The lowest rated movies (Series.min(), consistent with .max() below)
data[data.rating == data.rating.min()][['title', 'year', 'rating', 'votes', 'genres']]

# In[21]:

# The highest rated movies
data[data.rating == data.rating.max()][['title', 'year', 'rating', 'votes', 'genres']]

# In[22]:

# The top 10 highest rated movies.
# BUG FIX: DataFrame.sort() was deprecated and then removed (pandas >= 0.20);
# sort_values() is the supported equivalent.
data.sort_values('rating', ascending=False)[['title', 'year', 'rating', 'votes', 'genres']].head(10)

# Notice that record 3386 appears number 3 on this list, although it's
# featured as the 3386th best movie in IMDB.
# This is because its vote count is low.

# ### Run aggregation functions like sum over several rows or columns
# What genres are more frequent?

# In[23]:

data.Action.sum()

# In[24]:

# we can do the above for all genres; sum() sums over rows by default
data[genres].sum()

# In[25]:

# Sort the per-genre counts in DESCENDING order (the original comment said
# "ascending", which was wrong for the [::-1] reversal it performed).
# Using Series.sort_values instead of np.sort keeps the index, so the genre
# names come along -- which resolves the original
# "need to find a way to print the genre name :(" TODO.
genre_count = data[genres].sum().sort_values(ascending=False)
pd.DataFrame({'Genre Count': genre_count})

# Also note the difference between the following two statements.

# In[26]:

data.genres.head()

# In[27]:

data[genres].head()  # works because genres is the unique set we created

# How many genres does a movie have on an average?

# In[28]:

# axis=1 sums over columns instead
genre_count = data[genres].sum(axis=1)
# BUG FIX: the original used a comma before genre_count.mean(), printing a
# tuple and never interpolating the value into the %0.2f placeholder
print("On an average, a movie has %0.2f genres" % genre_count.mean())
genre_count.describe()

# ## 4. Explore Group Properties
# Let's split up movies by decade

# In[29]:

decade = (data.year // 10) * 10
# take an explicit copy so adding the 'decade' column doesn't write into a
# view of `data` (avoids SettingWithCopyWarning)
dec = data[['title', 'year']].copy()
dec['decade'] = decade
dec.head()

# In[30]:

# use groupby to get group properties: gather the mean rating for each decade
decade_mean = data.groupby(decade).rating.mean()
decade_mean.name = 'Decade Mean'
print(decade_mean)

# In[31]:

# plot the decade values against their corresponding mean rating
plt.plot(decade_mean.index, decade_mean.values, 'o-', color='r', lw=2,
         label='Decade Average')
plt.scatter(data.year, data.rating, alpha=0.04, lw=0, color='k')
plt.xlabel('Year')
plt.ylabel('Rating')
# BUG FIX: frameon expects a bool; the string 'False' is truthy, so the
# original drew the legend frame anyway
plt.legend(frameon=False)
remove_border()

# In[32]:

# we can go one step further and compute the scatter in each decade as well;
# the standard deviation measures how much the members of a group differ
# from the mean value for the group
grouped_scores = data.groupby(decade).rating
mean = grouped_scores.mean()  # each decade has a mean value
std = grouped_scores.std()    # each decade has a std dev value

plt.plot(decade_mean.index, decade_mean.values, 'o-', color='r', lw=3,
         label='Decade Average')
plt.fill_between(decade_mean.index,
                 (decade_mean + std).values, (decade_mean - std).values,
                 color='r', alpha=.2)
plt.scatter(data.year, data.rating, alpha=.04, lw=0, color='k')
plt.xlabel("Year")
plt.ylabel("Score")
plt.legend(frameon=False)
remove_border()

# You can also iterate over a GroupBy object. Each iteration yields two
# variables: one of the distinct values of the group key, and the subset of
# the dataframe where the key equals that value. To find the most popular
# movie each year:

# In[33]:

for year, subset in data.groupby(data.year):
    print(year, subset[subset.rating == subset.rating.max()].title.values[0])

# ## Small multiples
# Let's split up the movies by genre, and look at how their release
# year/runtime/IMDB score vary. The distribution for all movies is shown as
# a grey background.
#
# This isn't a standard groupby, so we can't use the groupby method here.
# A manual loop is needed

# In[ ]: