"""Exploratory analysis of an actor co-appearance network.

Reads ``movie,actor`` rows from ``imdb_data/imdb_s.csv.gz``, links every pair
of actors that share a movie, and inspects the resulting graph.

Bare expressions (``len(movies)`` and the like) are kept from the original
session: they display a value in an interactive shell and have no effect when
the file is run as a plain script.
"""
import gzip
import re
from collections import Counter

import networkx as nx

G = nx.Graph()
movies = {}  # movie title -> list of actors, in file order

with gzip.open('imdb_data/imdb_s.csv.gz') as f:
    # Iterate the file lazily instead of materialising it with readlines().
    for raw in f:
        # gzip yields bytes on Python 3 and str on Python 2; work on text.
        # NOTE(review): assumes the file is UTF-8 (or plain ASCII) -- confirm.
        line = raw if isinstance(raw, str) else raw.decode('utf-8')
        if ',' not in line:
            # Not a "movie,actor" row: show it and move on.
            print(line)
            continue
        movie, actor = line.rstrip().split(',')
        movies.setdefault(movie, []).append(actor)

len(movies)
list(movies)[:10]
movies['Him_gok_2001']

# Add a link for each pair of actors in a given movie.
for actors in movies.values():
    for i, first in enumerate(actors):
        for second in actors[i + 1:]:
            G.add_edge(first, second)

nx.number_connected_components(G)
# connected_components() is a generator on current NetworkX releases, so it
# can neither be indexed nor iterated twice.  Materialise it once, largest
# component first, so that comps[0] is the giant component.
comps = sorted(nx.connected_components(G), key=len, reverse=True)
len(comps[0])
lhist = Counter(len(cc) for cc in comps)  # component size -> how many
sorted(lhist)[-10:]
nx.is_connected(G)

# Number of movie credits per actor.
actorlist = Counter(a for cast in movies.values() for a in cast)
actorlist.most_common(10)
len(actorlist)
list(comps[0])[:10]

# connected_component_subgraphs() no longer exists in current NetworkX;
# build the per-component graphs explicitly (same order as ``comps``).
ccG = [G.subgraph(cc).copy() for cc in comps]
Gc = ccG[0]
len(Gc.nodes())
list(Gc.edges())[:10]
# nx.average_shortest_path_length(Gc)  # disabled in the original
dc = nx.degree_centrality(ccG[0])
list(dc.values())[:10]

# Movie titles end in a four-character suffix that is used as the year.
years = Counter(title[-4:] for title in movies)
years.most_common()
a2005 = Counter(
    actor
    for movie, actors in movies.items()
    if movie.endswith('2005')
    for actor in actors
)
a2005.most_common(10)
len(a2005)

# Smaller network: only 2005 movies whose title starts with 'A' or 'T'.
Gs = nx.Graph()
for movie, actors in movies.items():
    if movie.endswith('2005') and re.match(r'[AT]', movie):
        # Add a link for each pair of actors in a given movie.
        for i, first in enumerate(actors):
            for second in actors[i + 1:]:
                Gs.add_edge(first, second)

len(Gs.nodes()), len(Gs.edges())
cGs = [
    Gs.subgraph(cc).copy()
    for cc in sorted(nx.connected_components(Gs), key=len, reverse=True)
]
[len(g) for g in cGs[:10]]

# Largest connected component of the 2005 network.
myG = cGs[0]
len(myG.nodes())
mdc = nx.degree_centrality(myG)
list(mdc.values())[:10]
myG['Depp__Johnny']
len(nx.degree_histogram(myG))
# NOTE(review): nx.info() was dropped in NetworkX 3.0; use print(myG) there.
print(nx.info(myG))
nx.density(myG)

degrees = {n: len(myG[n]) for n in nx.nodes(myG)}
# Each node is counted once per incident edge, i.e. by its degree.
degs = Counter(a for x in myG for a in myG[x])
degs.most_common(10)
len(myG['Willis__Bruce__I_'])
[a for a in myG if a.startswith('Willis')]

bc = nx.betweenness_centrality(myG)
len(bc)
# Ten nodes of myG with the highest betweenness centrality (ascending).
sorted(bc.items(), key=lambda item: item[1])[-10:]
nx.draw_networkx(myG)

# Actors whose movies seed the second network.
# Commented-out candidates from the original: 'Walken__Christopher',
# 'Stiller__Ben', 'Hoffman__Dustin', 'Smith__Will__I_', 'Cruise__Tom',
# 'Travolta__John', 'Ferrell__Will'
alist = (
    'Lee__Christopher__I_',
    'Depp__Johnny',
    'Willis__Bruce__I_',
    'Schwarzenegger__Arnold',
)


def keep(movie):
    """Return True if any actor listed in ``alist`` appears in ``movie``.

    ``movie`` must be a key of the module-level ``movies`` mapping; an
    unknown title raises ``KeyError``.
    """
    cast = movies[movie]
    return any(actor in cast for actor in alist)


def _mean(values):
    """Return the arithmetic mean of an iterable of numbers as a float."""
    values = list(values)
    return sum(values) / float(len(values))


# Number of movies featuring at least one of the seed actors.
sum(1 for movie in movies if keep(movie))

# Rebuild Gs from only those movies.
Gs = nx.Graph()
for movie, actors in movies.items():
    if not keep(movie):
        continue
    # Add a link for each pair of actors in a given movie.
    for i, first in enumerate(actors):
        for second in actors[i + 1:]:
            Gs.add_edge(first, second)

nx.is_connected(Gs)
# NOTE(review): nx.info() was dropped in NetworkX 3.0; use print(Gs) there.
print(nx.info(Gs))

# Ego network of one hub.  The original referenced an undefined name
# ``largest_hub`` when highlighting the ego node; bind it to the actor the
# ego graph is actually built around.
largest_hub = 'Willis__Bruce__I_'
hub_ego = nx.ego_graph(Gs, largest_hub)
pos = nx.spring_layout(hub_ego)
nx.draw(hub_ego, pos, node_color='b', node_size=50, with_labels=False)
# Draw ego as large and red
nx.draw_networkx_nodes(hub_ego, pos, nodelist=[largest_hub],
                       node_size=300, node_color='r')
# NOTE(review): ``figure`` is not defined in this file; presumably it comes
# from a pylab-style interactive namespace (e.g. ``%pylab``) -- confirm, or
# import it from matplotlib.pyplot.
figure(figsize=(20, 20))
nx.draw_networkx(hub_ego)

bc = nx.betweenness_centrality(Gs)
len(bc)
sorted(bc.items(), key=lambda item: item[1])[-10:]

# Degree and eccentricity of every seed actor.
for a in alist:
    print('%s %d' % (a, len(Gs[a])))
for a in alist:
    print(nx.eccentricity(Gs, a))
[(a, b, nx.shortest_path(Gs, a, b)) for a in alist for b in alist if a != b]

nx.eccentricity(Gs, 'Christensen__Hayden')
len(Gs['Christensen__Hayden'])
# Neighbours must come from Gs itself; the original indexed the unrelated
# graph myG with Gs's nodes.
degs = Counter(a for x in Gs for a in Gs[x])
nx.eccentricity(Gs, 'Grint__Rupert')
nx.diameter(Gs)

# Small hand-made graph for checking the path and centrality measures.
links = [
    ('A', 'B'), ('A', 'C'), ('A', 'D'), ('A', 'E'), ('B', 'C'),
    ('B', 'F'), ('C', 'F'), ('D', 'G'), ('D', 'H'), ('E', 'H'),
    ('F', 'I'), ('G', 'I'), ('G', 'J'), ('H', 'J'),
    ('I', 'K'), ('J', 'K'),
]
H = nx.Graph()
H.add_edges_from(links)
nx.draw_networkx(H)
nx.shortest_path(H, 'A', 'K')
# dict() accepts both the dict-of-dicts returned by old NetworkX and the
# (node, lengths) pairs yielded by newer releases.
print(dict(nx.shortest_path_length(H)))
nx.average_shortest_path_length(H)
nx.single_source_shortest_path_length(H, 'A')
nx.single_source_shortest_path(H, 'A')

# Mean distance from 'A' to every node (its own distance 0 included).
_mean(nx.single_source_shortest_path_length(H, 'A').values())
# Nodes ordered by mean distance to all nodes, closest first.
sorted(
    [(node, _mean(nx.single_source_shortest_path_length(H, node).values()))
     for node in H],
    key=lambda pair: pair[1],
)
# Nodes ordered by betweenness centrality, highest first.
sorted(nx.betweenness_centrality(H).items(),
       key=lambda pair: pair[1], reverse=True)