"""Exploratory analysis of the acting roles listed in ./data/LOST_clean.csv.

Notebook-style script: bare expressions (``df.head()``, ``value_counts()``,
...) are kept on their own lines so they still render when the code is run
cell-by-cell in IPython/Jupyter.  Columns read from the CSV by this script:
``type``, ``title``, ``score``, ``actor`` and ``start_year``.
"""
import numpy as np
import pandas as pd
from IPython.display import YouTubeVideo, Image, HTML

try:
    # Equivalent of the notebook magic ``%matplotlib inline``, which is not
    # valid syntax in a plain .py file.
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    # ``get_ipython`` only exists inside an IPython session; outside of one
    # there is nothing to configure.
    pass


def _most_common_type_per_actor(roles):
    """Tally roles per (actor, type) and pick each actor's most frequent type.

    Parameters
    ----------
    roles : pandas.DataFrame
        Rows with at least the columns ``actor`` and ``type``.

    Returns
    -------
    (counts, top) : tuple of pandas.DataFrame
        ``counts`` has one row per (actor, type) pair with the columns
        ``actor``, ``type`` and ``occurrences``.  ``top`` is the subset of
        ``counts`` holding, for every actor, the row with the largest
        ``occurrences`` value.
    """
    counts = (roles.groupby(['actor', 'type'])
                   .size()
                   .reset_index(name='occurrences'))
    # idxmax yields index *labels*, so label-based .loc is the right indexer
    # (the removed .ix indexer was used here originally).
    top = counts.loc[counts.groupby('actor')['occurrences'].idxmax()]
    return counts, top


YouTubeVideo('0Q14rHLvMco')

we_have_to_go_back = pd.read_csv('./data/LOST_clean.csv')
# Single pre-formatted string: prints identically on Python 2 and Python 3.
print("Total rows: %d" % len(we_have_to_go_back))
we_have_to_go_back.head()

we_have_to_go_back['type'].value_counts()

# Keep only series, movies and TV movies.  The explicit .copy() makes this an
# independent frame, so adding the 'post_lost' column further down does not
# write into a slice of the original (SettingWithCopyWarning).
screen_types = ['TV Series', 'Movie', 'TV Movie']
big_and_small_screen = we_have_to_go_back[
    we_have_to_go_back['type'].isin(screen_types)
].copy()

# Score distribution, counting each (title, score) pair only once.
unique_title_scores = big_and_small_screen.drop_duplicates(['title', 'score'])['score']
unique_title_scores.hist(bins=16)
unique_title_scores.describe()

# Five highest-scored rows (DataFrame.sort was replaced by sort_values).
big_and_small_screen.sort_values('score', ascending=False).head(5)

YouTubeVideo('arMtFxv7jlw')

# Highest-scored row for every actor.
big_and_small_screen.loc[big_and_small_screen.groupby('actor')['score'].idxmax()]

# side note: not happy with this code... there must be a better way.
# Flag every role whose start_year is later than 2004.
big_and_small_screen['post_lost'] = big_and_small_screen['start_year'] > 2004

# Number of roles per actor, split by the post_lost flag.  fill_value=0 makes
# an actor with no roles on one side of the cutoff count as 0 there instead
# of NaN; a NaN would turn the comparison below into False regardless of the
# other count.
before_and_after = pd.pivot_table(
    big_and_small_screen,
    columns=['post_lost'],
    values=['start_year'],
    index=['actor'],
    aggfunc=np.size,
    fill_value=0,
).reset_index()
role_counts = before_and_after['start_year']
before_and_after['more_after_lost'] = (role_counts[True] - role_counts[False] > 0)
before_and_after

# Most frequent role type per actor among the roles not flagged post_lost.
pre_LOST_roles = big_and_small_screen[~big_and_small_screen['post_lost']]
actor_type_counts, pre_LOST_top_types = _most_common_type_per_actor(pre_LOST_roles)
pre_LOST_top_types

pre_LOST_top_types['type'].value_counts()

# Same tally for the roles flagged post_lost.  actor_type_counts is rebound
# here, as in the original notebook.
post_LOST_roles = big_and_small_screen[big_and_small_screen['post_lost']]
actor_type_counts, post_LOST_top_types = _most_common_type_per_actor(post_LOST_roles)
post_LOST_top_types