import networkx as nx
import gzip
from collections import Counter
import re
G=nx.Graph()
movies={}
with gzip.open('imdb_data/imdb_s.csv.gz') as f:
for line in f.readlines():
if ',' not in line:
print line
continue
movie,actor = line.rstrip().split(',')
if movie not in movies: movies[movie]=[]
movies[movie].append(actor)
len(movies)
58642
movies.keys()[:10]
['Him_gok_2001', 'Jack_Napier_Show_2__The_2002', 'Ms__Goldman_2004', 'Me_Luv_U_Long_Time_2002', 'Waltz_2004', 'All_the_Stage_Is_a_World_2005', 'Before_the_Bell_Rang_2003', 'Ticking_Man__The_2001', '_Groupe_flag__2002', 'Making_of__Double_Vision___The_2002']
movies['Him_gok_2001']
['Chang__Ken__II_', 'Chan__Moses', 'Fong__Alex__I_', 'Ha__Shiu_Sing', 'Lam__Lap_Sam', 'Lee__Wai_sheung', 'Lok__Dat_Wai', 'Summer__Danny', 'Wan__Tin_chiu', 'Wong__Ken__I_']
for movie,actors in movies.iteritems():
#add link for each pair of actors in a given movie
[G.add_edge(actors[i],actors[j])for i in range(len(actors)) for j in range(i+1,len(actors))]
nx.number_connected_components(G)
3077
comps=nx.connected_components(G)
len(comps[0])
182323
lhist=Counter([len(cc) for cc in comps])
sorted([size for size in lhist])[-10:]
[30, 32, 33, 39, 42, 50, 52, 54, 99, 182323]
nx.is_connected(G)
False
actorlist = Counter([a for m in movies for a in movies[m]])
actorlist.most_common(10)
[('Stone__Lee__II_', 490), ('Steele__Lexington', 354), ('Lawrence__Joel__II_', 346), ('Davis__Mark__V_', 335), ('Holmes__Steve', 318), ('Pete__Mr_', 308), ('Stone__Evan__I_', 294), ('Marcus__Mr_', 287), ('Cannon__Chris__III_', 285), ('Wood__Mark__IV_', 278)]
len(actorlist)
198571
comps[0][:10]
['Shelley__Harry_Anthony', 'Hyson__Matt', 'Sz_les__L_szl_', 'Kermizian__Brian', 'Prak__David', 'Green__David_Gordon', 'Jolicoeur__Clermont', 'Rist__Robbie', 'Stein__Seymour', 'Jurin__Jem']
ccG=nx.connected_component_subgraphs(G)
Gc=ccG[0]
len(Gc.nodes())
182323
Gc.edges()[:10]
[('Shelley__Harry_Anthony', 'Winsett__Jerry'), ('Shelley__Harry_Anthony', 'Waugh__Scott'), ('Shelley__Harry_Anthony', 'Gilbert__Lance'), ('Shelley__Harry_Anthony', 'Lee__Will_Yun'), ('Shelley__Harry_Anthony', 'Harrington__Adam__II_'), ('Shelley__Harry_Anthony', 'Lavin__Jacob'), ('Shelley__Harry_Anthony', 'James__Jesse__VII_'), ('Shelley__Harry_Anthony', 'Henderson__Martin__I_'), ('Shelley__Harry_Anthony', 'Kahn__Joseph'), ('Shelley__Harry_Anthony', 'Ashker__John')]
#nx.average_shortest_path_length(Gc)
dc=nx.degree_centrality(ccG[0])
dc.values()[:10]
[0.00014808964359759106, 0.0015796228650409715, 0.0005210561533989316, 0.00017002885005649345, 0.00019196805651539584, 4.387841291780476e-05, 0.0003729665098013405, 0.0003236032952688101, 0.0001316352387534143, 2.193920645890238e-05]
years=Counter([title[-4:] for title in movies.keys()])
years.most_common()
[('2003', 11989), ('2004', 11963), ('2002', 10886), ('2001', 10219), ('2000', 9584), ('2005', 3700), ('2006', 285), ('2007', 15), ('2008', 1)]
a2005=Counter([actor for movie,actors in movies.iteritems() if movie.endswith('2005') for actor in actors])
a2005.most_common()[:10]
[('Andersen__Kim_S_nderholm', 14), ('Trejo__Danny', 13), ('Freeman__Morgan__I_', 11), ('Talkington__Jonas', 11), ('Madsen__Michael__I_', 11), ('Busey__Gary', 10), ('Williams__Robin__I_', 10), ('Bennett__Jeff__I_', 10), ('Astin__Sean', 10), ('Depp__Johnny', 10)]
len(a2005)
21785
Gs=nx.Graph()
for movie,actors in movies.iteritems():
if movie.endswith('2005') and re.match(r'[AT]',movie):
#add link for each pair of actors in a given movie
[Gs.add_edge(actors[i],actors[j])for i in range(len(actors)) for j in range(i+1,len(actors))]
len(Gs.nodes()),len(Gs.edges())
(3117, 33373)
cGs=nx.connected_component_subgraphs(Gs)
[len(g) for g in cGs[:10]]
[655, 295, 87, 79, 58, 48, 44, 44, 38, 36]
myG=cGs[0]
len(myG.nodes())
655
mdc=nx.degree_centrality(myG)
mdc.values()[:10]
[0.03669724770642202, 0.019877675840978593, 0.010703363914373088, 0.012232415902140673, 0.09480122324159021, 0.01834862385321101, 0.013761467889908258, 0.053516819571865444, 0.11009174311926606, 0.04434250764525994]
myG['Depp__Johnny']
{'Affleck__Ben': {}, 'Bush__George': {}, 'Caan__James': {}, 'Cage__Nicolas': {}, 'Caviezel__James': {}, 'Cheadle__Don': {}, 'Chesney__Kenny': {}, 'Clapton__Eric': {}, 'Clinton__Bill': {}, 'Clooney__George': {}, 'Damon__Matt': {}, 'DeVito__Danny': {}, 'De_Niro__Robert': {}, 'DiCaprio__Leonardo': {}, 'Douglas__Michael__I_': {}, 'Downey_Jr___Robert': {}, 'Eastwood__Clint': {}, 'Foxx__Jamie': {}, 'Freeman__Morgan__I_': {}, 'Garcia__Andy__I_': {}, 'Grammer__Kelsey': {}, 'Grant__Hugh__I_': {}, 'Groban__Josh': {}, 'John__Elton': {}, 'Jones__Quincy': {}, 'Kravitz__Lenny': {}, 'Lauer__Matt': {}, 'Leno__Jay': {}, 'Lowe__Rob__I_': {}, 'Maroon_5': {}, 'Matthews__Christopher__II_': {}, 'Mayer__John__VII_': {}, 'McCormack__Eric': {}, 'McDonough__Neal': {}, 'Nelly__III_': {}, 'O_Reilly__Bill__I_': {}, 'Pitt__Brad': {}, 'Robbins__Tim__I_': {}, 'Romano__Ray__I_': {}, 'Seacrest__Ryan': {}, 'Selleck__Tom': {}, 'Sinise__Gary': {}, 'Spacey__Kevin': {}, 'Spade__David': {}, 'Tarantino__Quentin': {}, 'Waters__Roger': {}, 'Williams__Brian__III_': {}, 'Willis__Bruce__I_': {}, 'Wilson__Brian__I_': {}, 'Wonder__Stevie': {}, 'Woods__James__I_': {}, 'Wright__Bob__VI_': {}, 'Wyle__Noah': {}}
len(nx.degree_histogram(myG))
89
print nx.info(myG)
Name: Type: Graph Number of nodes: 655 Number of edges: 10795 Average degree: 32.9618
nx.density(myG)
0.05040035483343838
degrees={n:len(myG[n]) for n in nx.nodes(myG)}
degs=Counter([a for x in myG for a in myG[x]])
degs.most_common()[:10]
[('Willis__Bruce__I_', 88), ('Lowe__Rob__I_', 85), ('Armstrong__Curtis', 79), ('Lopez__Nick__III_', 78), ('Gainey__M_C_', 78), ('Gore__Chris', 73), ('Dello_Stritto__Frank_J_', 72), ('Schodowski__Chuck', 72), ('Monks__Joseph', 72), ('Tippett__Phil', 72)]
len(myG['Willis__Bruce__I_'])
88
[a for a in myG if a.startswith('Willis')]
['Willis__Bruce__I_']
bc=nx.betweenness_centrality(myG)
len(bc)
655
sorted(bc.items(),key=lambda x:x[1])[-10:]
[('Marquette__Chris', 0.1894104368920672), ('Lowe__Rob__I_', 0.19293178351933277), ('Leguizamo__John', 0.19461694086572898), ('Armstrong__Curtis', 0.19862689726550245), ('Fishburne__Laurence', 0.21505074204682226), ('Rule__Ja', 0.23025423318706215), ('Kay_ru__Artel', 0.23325887107726748), ('Willis__Bruce__I_', 0.23352854308429788), ('Stanton__Harry_Dean', 0.23797052106407604), ('Gainey__M_C_', 0.3448796661843011)]
nx.draw_networkx(myG)
alist= ('Lee__Christopher__I_', 'Depp__Johnny', 'Willis__Bruce__I_','Schwarzenegger__Arnold')
def keep(movie):
for actor in alist:
if actor in movies[movie]: return True
return False
#'Walken__Christopher', 'Stiller__Ben','Hoffman__Dustin',,'Smith__Will__I_','Cruise__Tom','Travolta__John','Ferrell__Will'
len([movie for movie in movies if keep(movie)])
179
Gs=nx.Graph()
for movie,actors in movies.iteritems():
if keep(movie):
#add link for each pair of actors in a given movie
[Gs.add_edge(actors[i],actors[j])for i in range(len(actors)) for j in range(i+1,len(actors))]
nx.is_connected(Gs)
True
print nx.info(Gs)
Name: Type: Graph Number of nodes: 3170 Number of edges: 109058 Average degree: 68.8063
hub_ego=nx.ego_graph(Gs,'Willis__Bruce__I_')
pos=nx.spring_layout(hub_ego)
nx.draw(hub_ego,pos,node_color='b',node_size=50,with_labels=False)
# Draw ego as large and red
nx.draw_networkx_nodes(hub_ego,pos,nodelist=[largest_hub],node_size=300,node_color='r')
figure(figsize=(20,20))
nx.draw_networkx(hub_ego)
bc=nx.betweenness_centrality(Gs)
len(bc)
3170
sorted(bc.items(),key=lambda x:x[1])[-10:]
[('Brosnan__Pierce', 0.013185193060307594), ('Douglas__Michael__I_', 0.0132091694260583), ('Bates__Alan', 0.013351189575697819), ('Astin__Sean', 0.014019042898398188), ('Bloom__Orlando', 0.020403500131657353), ('Jackson__Samuel_L_', 0.03274249267938933), ('Willis__Bruce__I_', 0.14873139095079452), ('Lee__Christopher__I_', 0.15610768423787588), ('Depp__Johnny', 0.2065163401410608), ('Schwarzenegger__Arnold', 0.2130729583642646)]
for a in alist: print a,len(Gs[a])
Lee__Christopher__I_ 758 Depp__Johnny 969 Willis__Bruce__I_ 806 Schwarzenegger__Arnold 1326
for a in alist: print nx.eccentricity(Gs,a)
3 2 3 3
[(a,b,nx.shortest_path(Gs,a,b)) for a in alist for b in alist if a!=b]
[('Lee__Christopher__I_', 'Depp__Johnny', ['Lee__Christopher__I_', 'Depp__Johnny']), ('Lee__Christopher__I_', 'Willis__Bruce__I_', ['Lee__Christopher__I_', 'Depp__Johnny', 'Willis__Bruce__I_']), ('Lee__Christopher__I_', 'Schwarzenegger__Arnold', ['Lee__Christopher__I_', 'Christensen__Hayden', 'Schwarzenegger__Arnold']), ('Depp__Johnny', 'Lee__Christopher__I_', ['Depp__Johnny', 'Lee__Christopher__I_']), ('Depp__Johnny', 'Willis__Bruce__I_', ['Depp__Johnny', 'Willis__Bruce__I_']), ('Depp__Johnny', 'Schwarzenegger__Arnold', ['Depp__Johnny', 'Schwarzenegger__Arnold']), ('Willis__Bruce__I_', 'Lee__Christopher__I_', ['Willis__Bruce__I_', 'Depp__Johnny', 'Lee__Christopher__I_']), ('Willis__Bruce__I_', 'Depp__Johnny', ['Willis__Bruce__I_', 'Depp__Johnny']), ('Willis__Bruce__I_', 'Schwarzenegger__Arnold', ['Willis__Bruce__I_', 'Schwarzenegger__Arnold']), ('Schwarzenegger__Arnold', 'Lee__Christopher__I_', ['Schwarzenegger__Arnold', 'Christensen__Hayden', 'Lee__Christopher__I_']), ('Schwarzenegger__Arnold', 'Depp__Johnny', ['Schwarzenegger__Arnold', 'Depp__Johnny']), ('Schwarzenegger__Arnold', 'Willis__Bruce__I_', ['Schwarzenegger__Arnold', 'Willis__Bruce__I_'])]
nx.eccentricity(Gs,'Christensen__Hayden')
3
len(Gs['Christensen__Hayden'])
150
degs=Counter([a for x in Gs for a in myG[x]])
nx.eccentricity(Gs,'Grint__Rupert')
3
nx.diameter(Gs)
4
links=[('A','B'),('A','C'),('A','D'),('A','E'),('B','C'),
('B','F'),('C','F'),('D','G'),('D','H'),('E','H'),
('F','I'),('G','I'),('G','J'),('H','J'),
('I','K'),('J','K')]
H=nx.Graph()
H.add_edges_from(links)
nx.draw_networkx(H)
nx.shortest_path(H,'A','K')
['A', 'D', 'G', 'I', 'K']
print nx.shortest_path_length(H)
{'A': {'A': 0, 'C': 1, 'B': 1, 'E': 1, 'D': 1, 'G': 2, 'F': 2, 'I': 3, 'H': 2, 'K': 4, 'J': 3}, 'C': {'A': 1, 'C': 0, 'B': 1, 'E': 2, 'D': 2, 'G': 3, 'F': 1, 'I': 2, 'H': 3, 'K': 3, 'J': 4}, 'B': {'A': 1, 'C': 1, 'B': 0, 'E': 2, 'D': 2, 'G': 3, 'F': 1, 'I': 2, 'H': 3, 'K': 3, 'J': 4}, 'E': {'A': 1, 'C': 2, 'B': 2, 'E': 0, 'D': 2, 'G': 3, 'F': 3, 'I': 4, 'H': 1, 'K': 3, 'J': 2}, 'D': {'A': 1, 'C': 2, 'B': 2, 'E': 2, 'D': 0, 'G': 1, 'F': 3, 'I': 2, 'H': 1, 'K': 3, 'J': 2}, 'G': {'A': 2, 'C': 3, 'B': 3, 'E': 3, 'D': 1, 'G': 0, 'F': 2, 'I': 1, 'H': 2, 'K': 2, 'J': 1}, 'F': {'A': 2, 'C': 1, 'B': 1, 'E': 3, 'D': 3, 'G': 2, 'F': 0, 'I': 1, 'H': 4, 'K': 2, 'J': 3}, 'I': {'A': 3, 'C': 2, 'B': 2, 'E': 4, 'D': 2, 'G': 1, 'F': 1, 'I': 0, 'H': 3, 'K': 1, 'J': 2}, 'H': {'A': 2, 'C': 3, 'B': 3, 'E': 1, 'D': 1, 'G': 2, 'F': 4, 'I': 3, 'H': 0, 'K': 2, 'J': 1}, 'K': {'A': 4, 'C': 3, 'B': 3, 'E': 3, 'D': 3, 'G': 2, 'F': 2, 'I': 1, 'H': 2, 'K': 0, 'J': 1}, 'J': {'A': 3, 'C': 4, 'B': 4, 'E': 2, 'D': 2, 'G': 1, 'F': 3, 'I': 2, 'H': 1, 'K': 1, 'J': 0}}
nx.average_shortest_path_length(H)
2.1636363636363636
nx.single_source_shortest_path_length(H,'A')
{'A': 0, 'B': 1, 'C': 1, 'D': 1, 'E': 1, 'F': 2, 'G': 2, 'H': 2, 'I': 3, 'J': 3, 'K': 4}
nx.single_source_shortest_path(H,'A')
{'A': ['A'], 'B': ['A', 'B'], 'C': ['A', 'C'], 'D': ['A', 'D'], 'E': ['A', 'E'], 'F': ['A', 'C', 'F'], 'G': ['A', 'D', 'G'], 'H': ['A', 'E', 'H'], 'I': ['A', 'D', 'G', 'I'], 'J': ['A', 'E', 'H', 'J'], 'K': ['A', 'D', 'G', 'I', 'K']}
mean(nx.single_source_shortest_path_length(H,'A').values())
1.8181818181818181
sorted([(node, mean(nx.single_source_shortest_path_length(H,node).values()))
for node in H],
key=lambda x:x[1])
[('D', 1.7272727272727273), ('A', 1.8181818181818181), ('G', 1.8181818181818181), ('I', 1.9090909090909092), ('C', 2.0), ('B', 2.0), ('F', 2.0), ('H', 2.0), ('E', 2.0909090909090908), ('J', 2.0909090909090908), ('K', 2.1818181818181817)]
sorted([(n,b) for n,b in nx.betweenness_centrality(H).items()],
key=lambda x:x[1],reverse=True)
[('A', 0.2615873015873016), ('I', 0.1976719576719577), ('D', 0.1791534391534392), ('F', 0.15851851851851853), ('G', 0.15005291005291005), ('H', 0.14), ('J', 0.1285714285714286), ('E', 0.05968253968253969), ('C', 0.050793650793650794), ('B', 0.050793650793650794), ('K', 0.045396825396825394)]