%matplotlib inline
# from pylab import *

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Load the raw data into a frame and label the columns so that, if we come
# back to it later, we'll know what's what.
# The file has no header row, so the column names are supplied explicitly.
# It's also clear that the fields are tab-separated.
names = ['imdbID', 'Title', 'year', 'score', 'votes', 'runtime', 'genres']
# dropna() discards every row that has a missing value in any column.
data = pd.read_csv('imdb_top_10000.txt', delimiter='\t', names=names).dropna()
# print(...) with one parenthesised argument behaves the same on Python 2 and
# Python 3, and matches the print() call used further down in this file.
print("Number of rows: %i. Number of columns: %i" % (data.shape[0], data.shape[1]))
data.head()  # notebook display: the first 5 rows

# Fix the title: strip the last seven characters from every title
# (presumably a trailing " (YYYY)" release-year suffix -- confirm against the raw file).
data['Title'] = data['Title'].map(lambda title: title[:-7])


# Notebook display of the release-year column.
data['year']


# Fix the runtime
# Every runtime entry is a string: keep the text in front of the first space
# and convert that piece to a float.
clean_runtime = []
for raw in data['runtime']:
    clean_runtime.append(float(raw.partition(' ')[0]))

data['runtime'] = clean_runtime


# Splitting up the genres
# Each entry of data.genres is a '|'-separated string. Split every entry once,
# then collect the distinct genre names in sorted order.
genre_lists = [entry.split('|') for entry in data['genres']]
genresset = sorted({name for parts in genre_lists for name in parts})

# Add one boolean column per genre: True where the movie lists that genre.
for genre in genresset:
    data[genre] = [genre in parts for parts in genre_lists]


# Notebook display: rows 1-9 and columns 4-22, selected by position.
data.iloc[1:10, 4:23]

# Check the global summary statistics to see if they make sense:
data[['score', 'runtime', 'year', 'votes']].describe()

# Count the movies whose runtime is zero.
# print(...) with one argument works on both Python 2 and Python 3.
print(len(data[data.runtime == 0]))

# A zero runtime is bad data, so flag it as NaN.
# Assign through a single .loc call: the chained form
# data.runtime[mask] = ... may write to a temporary copy and triggers
# pandas' SettingWithCopyWarning.
data.loc[data.runtime == 0, 'runtime'] = np.nan

# Summary statistics again, now that the zero runtimes are NaN.
data[['score', 'runtime', 'year', 'votes']].describe()


#Exercise: do a histogram of all variables together

# Histogram of release years, one bin per year from 1950 up to 2012.
# More movies in recent years, but not *very* recent movies
# (they haven't had time to receive lots of votes yet?)
fig, ax1 = plt.subplots(nrows=1, ncols=1)
ax1.hist(data['year'], bins=np.arange(1950, 2013), color='#eeeeee')
ax1.set(title='Movie Releases by Year', xlabel='Release Year')

# Update the matplotlib configuration parameters:
# plt.rcParams.update({ 'font.family': 'monospace'})


# Histogram of IMDB ratings in 20 bins.
fig, ax1 = plt.subplots(nrows=1, ncols=1)
ax1.hist(data['score'], bins=20, color="#eeeeee")
ax1.set(title='Movie Rating Frequency', xlabel="IMDB rating")


# Histogram of movie runtimes in 50 bins; NaN runtimes are dropped first.
# Open a new figure explicitly: without it, pyplot draws onto whatever axes
# are current, which is only a fresh figure when each notebook cell gets its
# own (as with the inline backend).
plt.figure()
plt.hist(data.runtime.dropna(), bins=50, color='#cccccc')
plt.xlabel("Movie Run Time")
plt.ylabel("Number of Movies")
plt.title('Number of Movies per Run Times')

# Score against release year, drawn with very transparent markers.
# Hmm, more bad, recent movies. Real, or a selection bias?
plt.figure(figsize=(10, 5))
plt.scatter(data['year'], data['score'], lw=0, alpha=0.05)
plt.title("Rating over time", fontsize='15')
plt.xlabel("Year")
plt.ylabel("Score")


# Score against number of votes, with the vote axis on a log scale.
plt.figure(figsize=(10, 5))
plt.scatter(data['votes'], data['score'], lw=0, alpha=0.1)
plt.xscale('log')
plt.title("Votes vs. Ratings", fontsize='x-large')
plt.xlabel("Number of Votes")
plt.ylabel("IMDB Rating")

### Who are those outliers?
# Which movies have lots of votes but a low score?
# More than 70000 votes, score below 4.
data.loc[(data['votes'] > 7e4) & (data['score'] < 4),
         ['Title', 'year', 'score', 'votes', 'genres']]

# The lowest rated movies: every row whose score equals the minimum score.
data.loc[data['score'] == data['score'].min(),
         ['Title', 'year', 'score', 'votes', 'genres']]

# Well-rated recent movies: score above 7 and released after 2010.
data.loc[(data['score'] > 7) & (data['year'] > 2010),
         ['Title', 'year', 'score', 'votes', 'genres']]


#etc.

# Temporary check: the type of the year column.
type(data['year'])


# By Decade
# Round every release year down to the first year of its decade
# (e.g. 1994 -> 1990). Floor division (//) is required here: with "/",
# Python 3 performs true division, so (year / 10) * 10 just gives back the
# year and no grouping by decade happens.
decade = (data.year // 10) * 10

# Make a separate table with the decades.
# Copy the two-column slice first so the new column is added to an
# independent frame instead of a possible view of `data`
# (this avoids pandas' SettingWithCopyWarning).
tyd = data[['Title', 'year']].copy()
tyd['decade'] = decade

tyd.head()

# We can group the score by decade; head(10) keeps the first 10 scores of each group.
data.groupby(decade).score.head(10)


# Mean score for all movies in each decade
decade_mean = data.groupby(decade).score.mean()
print(decade_mean)

# Open a new figure explicitly so the line is not drawn onto whatever axes
# happen to be current (e.g. the log-scaled votes scatter plot) when the
# code is not run one notebook cell at a time.
plt.figure()
plt.plot(decade_mean.index, decade_mean.values, 'o-',
        color='r', lw=1, label='Decade Average')
#plt.scatter(data.year, data.score, alpha=.04, lw=0)
plt.xlabel("Year")
plt.ylabel("Score")
plt.title("Average Score per Decade")
#plt.legend(frameon=False) # add a legend


# Decade averages (red line) on top of a faint scatter of every movie's score.
# Open a new figure explicitly so this plot does not pile onto the axes of
# the previous plot when the code is not run one notebook cell at a time.
plt.figure()
plt.plot(decade_mean.index, decade_mean.values, 'o-',
        color='r', lw=1, label='Decade Average')
plt.scatter(data.year, data.score, alpha=.04, lw=0)
plt.xlabel("Year")
plt.ylabel("Score")
plt.title("Movie Scores over Time")
plt.legend(frameon=False) # add a legend


len(genresset)

# Compare across genres
# Create a 4x6 grid of plots. axes.ravel() flattens the 2-D grid of axes into
# a 1-D sequence so it can be paired with the genre names.
fig, axes = plt.subplots(nrows=4, ncols=6, figsize=(12, 8), tight_layout=True)
bins = np.arange(1950, 2013, 3)

# One subplot per genre; zip stops at the shorter of the two sequences.
for ax, genre in zip(axes.ravel(), genresset):

    # Release years of this genre's movies, in red. density=True scales the
    # histogram to unit area so its shape can be compared with the background
    # no matter how many movies the genre has. (`density` replaces the
    # `normed` keyword, which matplotlib removed in 3.1.)
    ax.hist(data[data[genre] == 1].year, bins=bins, histtype='stepfilled', density=True, color='r', alpha=.2, ec='none')

    # The same histogram for all movies, in grey and behind the genre
    # histogram (zorder=0) -- an identical background in every subplot.
    ax.hist(data.year, bins=bins, histtype='stepfilled', ec='None', density=True, zorder=0, color='#cccccc')

    # NOTE: a stray `ax.hist()` call with no data used to sit here; hist()
    # requires the data argument, so it raised TypeError and was removed.

    ax.xaxis.set_ticks(np.arange(1950, 2013, 30))
    ax.set_yticks([])
    ax.annotate(genre, xy=(1955, 3e-2), fontsize=14)

    ax.set_xlabel('Year')


# Runtime distribution of each genre (red) against all movies (grey),
# one subplot per genre in a 4x6 grid.
fig, axes = plt.subplots(nrows=4, ncols=6, figsize=(12, 8), tight_layout=True)

bins = np.arange(30, 240, 10)

# NaN runtimes are dropped before plotting, as in the overall runtime
# histogram elsewhere in this file. The all-movies series does not depend on
# the genre, so it is computed once outside the loop.
all_runtimes = data.runtime.dropna()

for ax, genre in zip(axes.ravel(), genresset):
    # density=True scales each histogram to unit area (`density` replaces the
    # `normed` keyword, which matplotlib removed in 3.1).
    ax.hist(data[data[genre] == 1].runtime.dropna(),
            bins=bins, histtype='stepfilled', color='r', ec='none', alpha=.3, density=True)

    # Background: all movies, drawn behind the genre histogram (zorder=0).
    ax.hist(all_runtimes, bins=bins, density=True,
            histtype='stepfilled', ec='none', color='#cccccc',
            zorder=0)

    ax.set_xticks(np.arange(30, 240, 60))
    ax.set_yticks([])
    ax.set_xlabel("Runtime [min]")
    #remove_border(ax, left=False)
    ax.annotate(genre, xy=(230, .02), ha='right', fontsize=12)

# Score distribution of each genre (red) against all movies (grey),
# one subplot per genre in a 4x6 grid.
fig, axes = plt.subplots(nrows=4, ncols=6, figsize=(12, 8), tight_layout=True)

bins = np.arange(0, 10, .5)

for ax, genre in zip(axes.ravel(), genresset):
    # density=True scales each histogram to unit area (`density` replaces the
    # `normed` keyword, which matplotlib removed in 3.1).
    ax.hist(data[data[genre] == 1].score,
            bins=bins, histtype='stepfilled', color='r', ec='none', alpha=.3, density=True)

    # Background: all movies, drawn behind the genre histogram (zorder=0).
    ax.hist(data.score, bins=bins, density=True,
            histtype='stepfilled', ec='none', color='#cccccc',
            zorder=0)

    ax.set_yticks([])
    ax.set_xlabel("Score")
    #remove_border(ax, left=False)
    # Same y-range in every subplot so the panels are directly comparable.
    ax.set_ylim(0, .4)
    ax.annotate(genre, xy=(.8, .2), ha='left', fontsize=10)