from IPython.display import YouTubeVideo, Image, HTML
# Build an embedded YouTube player for video id '0Q14rHLvMco'.
# NOTE(review): as a bare expression this is only rendered when IPython/Jupyter
# displays it as the last value of a notebook cell.
YouTubeVideo('0Q14rHLvMco')

import pandas as pd
%matplotlib inline

# Load the cleaned dataset from disk; every later step works off this frame.
we_have_to_go_back = pd.read_csv('./data/LOST_clean.csv')
# Single-argument print(...) emits the same "Total rows: N" text under both
# Python 2 and Python 3 (the old print statement is a SyntaxError on 3).
print("Total rows: %d" % len(we_have_to_go_back))
# Peek at the first rows to eyeball the columns.
we_have_to_go_back.head()

# How many rows there are for each distinct value of the 'type' column.
we_have_to_go_back['type'].value_counts()

# Keep only the rows whose 'type' is one of the three screen formats of interest.
screen_types = ['TV Series', 'Movie', 'TV Movie']
is_screen_role = we_have_to_go_back['type'].isin(screen_types)
big_and_small_screen = we_have_to_go_back[is_screen_role]

# Collapse rows that repeat the same (title, score) pair so each pair is
# counted once, then look at the distribution of the remaining scores.
unique_title_scores = big_and_small_screen.drop_duplicates(['title', 'score'])['score']
unique_title_scores.hist(bins=16)   # 16-bin histogram of the scores
unique_title_scores.describe()      # summary statistics of the same scores

# Five highest-scoring rows (duplicates are not removed here).
# DataFrame.sort() no longer exists in pandas; sort_values() is its replacement.
big_and_small_screen.sort_values('score', ascending=False).head(5)

# Build an embedded YouTube player for video id 'arMtFxv7jlw'.
# NOTE(review): as a bare expression this is only rendered when IPython/Jupyter
# displays it as the last value of a notebook cell.
YouTubeVideo('arMtFxv7jlw')

# For every actor, pick the row holding that actor's highest score.
# idxmax() yields index *labels*, so the label-based .loc indexer is the right
# lookup (the old .ix indexer has been removed from pandas).
big_and_small_screen.loc[big_and_small_screen.groupby('actor')['score'].idxmax()]

# side note: not happy with this code... there must be a better way.

# Work on an explicit copy: the frame was produced by boolean filtering, and
# adding a column to such a slice triggers pandas' SettingWithCopyWarning.
big_and_small_screen = big_and_small_screen.copy()
# Flag rows whose start_year is later than 2004 (the notebook's "post LOST" cutoff).
big_and_small_screen['post_lost'] = big_and_small_screen['start_year'] > 2004

# One row per actor, with the number of rows before/after the cutoff as columns
# ('start_year', False) and ('start_year', True). The builtin len counts the
# rows of each group, so no numpy import is needed for the aggregation.
before_and_after = pd.pivot_table(
    big_and_small_screen,
    index=['actor'],
    columns=['post_lost'],
    values=['start_year'],
    aggfunc=len
).reset_index()

# Address the two count columns by their full MultiIndex key rather than by
# chained ['start_year'][True] lookups.
roles_after_cutoff = before_and_after[('start_year', True)]
roles_before_cutoff = before_and_after[('start_year', False)]
# An actor missing either count yields NaN here, which compares as False.
before_and_after['more_after_lost'] = roles_after_cutoff - roles_before_cutoff > 0
before_and_after

# Rows not flagged as post-LOST (post_lost is a boolean column, so ~ negates it).
pre_LOST_roles = big_and_small_screen[~big_and_small_screen['post_lost']]
# Number of rows per (actor, type) pair, as a flat three-column frame.
actor_type_counts = (pre_LOST_roles
                     .groupby(['actor', 'type'])
                     .size()
                     .reset_index(name='occurrences'))
# For each actor, the (actor, type) row with the largest count. idxmax() returns
# index labels, hence .loc (the old .ix indexer has been removed from pandas).
actor_type_counts.loc[actor_type_counts.groupby('actor')['occurrences'].idxmax()]

# Tally how many actors have each 'type' as their most frequent one.
# .loc replaces the removed .ix indexer; idxmax() returns index labels.
actor_type_counts.loc[actor_type_counts.groupby('actor')['occurrences'].idxmax()]['type'].value_counts()

# Rows flagged as post-LOST (post_lost is a boolean column, usable directly as a mask).
post_LOST_roles = big_and_small_screen[big_and_small_screen['post_lost']]
# Number of rows per (actor, type) pair, as a flat three-column frame.
actor_type_counts = (post_LOST_roles
                     .groupby(['actor', 'type'])
                     .size()
                     .reset_index(name='occurrences'))
# For each actor, the (actor, type) row with the largest count. idxmax() returns
# index labels, hence .loc (the old .ix indexer has been removed from pandas).
actor_type_counts.loc[actor_type_counts.groupby('actor')['occurrences'].idxmax()]