%run "libraries.ipynb"
from networkx.algorithms import bipartite
sns.set(style="whitegrid")
# load the list of page titles, one per line
pages = codecs.open("data/pagenames.txt", "r", "utf-8-sig").readlines()
pages = map(lambda x: x.strip(), pages)
pages_editors_graph = nx.Graph()

def get_editors_set(page):
    revisions = json.load(codecs.open("data/revisions/%s.json" % (page), "r", "utf-8-sig"))
    revisions = pd.DataFrame(revisions)
    # keep registered editors only (anonymous edits carry userid == 0)
    editors = revisions[revisions["userid"] != 0]["user"].tolist()
    editors = set(editors)
    # revisions per user, plus each user's first and last revision timestamp
    # ("amin"/"amax" are the column names pandas gives np.min/np.max)
    revisions_count = revisions.groupby("user").groups
    revisions_range = revisions.groupby("user").agg({ "timestamp": [ np.min, np.max ] })
    # drop users the groupby did not keep
    # (e.g. "Four-dimensional space" has a NaN user -- worth re-checking)
    editors = [ e for e in editors if e in revisions_count ]
    # extra information to store on the page-editor edge
    def info(x):
        info = {"revisions": len(revisions_count[x]),
                "first revision": revisions_range.ix[x]["timestamp"]["amin"],
                "last revision": revisions_range.ix[x]["timestamp"]["amax"]
                }
        return info
    editors = map(lambda x: (x, info(x)), editors)
    return editors
for p in pages:
    title = "p:%s" % (p)
    e = get_editors_set(p)
    # aggregate the editors' stats into page-level attributes
    node_info = {"revisions": int(sum([ x[1]["revisions"] for x in e ])),
                 "first revision": str(np.min([ pd.to_datetime(x[1]["first revision"]) for x in e ])),
                 "last revision": str(np.max([ pd.to_datetime(x[1]["last revision"]) for x in e ]))
                 }
    pages_editors_graph.add_node(title, type="page", attr_dict=node_info)
    # link each editor to the page; the edge carries that editor's revision stats
    for editor in e:
        editor_label = "u:%s" % (editor[0])
        info = editor[1]
        pages_editors_graph.add_node(editor_label, type="user")
        pages_editors_graph.add_edge(editor_label, title, attr_dict=info)
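A quick sanity check (a sketch added here, not part of the original run): the "p:"/"u:" prefixes keep page and user labels disjoint, so the resulting graph should be bipartite.

assert bipartite.is_bipartite(pages_editors_graph)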
# add extra information about editors: totals across every page they edited
editors = [ n for n in pages_editors_graph.nodes(data=True) if n[1]["type"] == "user" ]
for editor in editors:
    user = editor[0]
    pages_editors_graph.node[user]["revisions"] = int(np.sum([ pages_editors_graph[user][edge]["revisions"] for edge in pages_editors_graph[user] ]))
    pages_editors_graph.node[user]["first revision"] = str(np.min([ pd.to_datetime(pages_editors_graph[user][edge]["first revision"]) for edge in pages_editors_graph[user] ]))
    pages_editors_graph.node[user]["last revision"] = str(np.max([ pd.to_datetime(pages_editors_graph[user][edge]["last revision"]) for edge in pages_editors_graph[user] ]))
pages_nodes = [ x[0] for x in pages_editors_graph.nodes(data=True) if x[1]["type"] == "page" ]
users_nodes = [ x[0] for x in pages_editors_graph.nodes(data=True) if x[1]["type"] == "user" ]
print "page nodes: %s" % (len(pages_nodes))
print "user nodes: %s" % (len(users_nodes))
page nodes: 303
user nodes: 15857
print "nodes: %s" % (len(pages_editors_graph.nodes()))
print "edges: %s" % (len(pages_editors_graph.edges()))
nodes: 16160
edges: 39923
nx.write_gexf(pages_editors_graph, "data/pages-editors.gexf", encoding='utf-8')
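(A side note, as a minimal sketch: the exported file can be read back with nx.read_gexf, so later sessions could start from disk instead of rebuilding the graph.)

# pages_editors_graph = nx.read_gexf("data/pages-editors.gexf")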
def reduce_bipartite(G, select, weight="weight"):
    # project the bipartite graph onto the nodes of the given type;
    # the edge weight counts the neighbours the two endpoints share
    selected = [ x[0] for x in G.nodes(data=True) if x[1]["type"] == select ]
    results = bipartite.projected_graph(G, selected)
    for u in results.nodes():
        for v in results[u].keys():
            w = len(set(G[u]) & set(G[v]))
            results[u][v][weight] = w
    return results
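For reference, a sketch assuming a networkx version that ships it: bipartite.weighted_projected_graph computes the same shared-neighbour weights in one call, storing them under "weight" rather than a custom key.

# page_graph_alt = bipartite.weighted_projected_graph(pages_editors_graph, pages_nodes)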
page_graph = reduce_bipartite(pages_editors_graph, "page", "coeditors")
print "nodes: %s" % (len(page_graph.nodes()))
print "edges: %s" % (len(page_graph.edges()))
nodes: 303
edges: 44688
nx.write_gexf(page_graph, "data/pages-linked-by-coeditors.gexf", encoding='utf-8')
# the user-side projection links editors who co-edited a page; it is far denser,
# so it stays commented out (the counts below are from an earlier run)
# editors_graph = reduce_bipartite(pages_editors_graph, "user", "pages")
# print "nodes: %s" % (len(editors_graph.nodes()))
# print "edges: %s" % (len(editors_graph.edges()))
nodes: 15857
edges: 7489562
# nx.write_gexf(editors_graph, "data/wikipedia-geometry/editors-linked-by-pages.gexf")
network_df = pd.DataFrame(index=pages)
network_df.head()
| |
|---|
| 2D computer graphics |
| 2D geometric model |
| 3D computer graphics |
| 3D projection |
| 3-sphere |
centrality = nx.degree_centrality(page_graph)
closeness = nx.closeness_centrality(page_graph)
betweenness = nx.betweenness_centrality(page_graph, weight="coeditors")
current_flow_closeness = nx.current_flow_closeness_centrality(page_graph, weight="coeditors")
current_flow_betweenness = nx.current_flow_betweenness_centrality(page_graph, weight="coeditors")
#eigenvector = nx.eigenvector_centrality(page_graph)
eigenvector = nx.eigenvector_centrality_numpy(page_graph, weight="coeditors")
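One caveat worth flagging (a sketch added here, not part of the original run): networkx's shortest-path betweenness treats the weight as a distance, so a high co-editor count makes two pages look far apart; inverting the weight is the usual workaround. The "inverse coeditors" attribute name is hypothetical.

for u, v, d in page_graph.edges(data=True):
    d["inverse coeditors"] = 1.0 / d["coeditors"]  # more co-editors => shorter distance
# betweenness_inv = nx.betweenness_centrality(page_graph, weight="inverse coeditors")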
for index in network_df.index:
    t = "p:%s" % (index)
    network_df.ix[index, "centrality"] = centrality[t]
    network_df.ix[index, "closeness"] = closeness[t]
    network_df.ix[index, "betweenness"] = betweenness[t]
    network_df.ix[index, "current flow closeness"] = current_flow_closeness[t]
    network_df.ix[index, "current flow betweenness"] = current_flow_betweenness[t]
    network_df.ix[index, "eigenvector"] = eigenvector[t]
network_df.head()
| | centrality | closeness | betweenness | current flow closeness | current flow betweenness | eigenvector |
|---|---|---|---|---|---|---|
| 2D computer graphics | 0.970199 | 0.971061 | 0.000603 | 3.284879 | 0.002896 | 0.035669 |
| 2D geometric model | 0.986755 | 0.986928 | 0.001794 | 2.618806 | 0.002197 | 0.019031 |
| 3D computer graphics | 0.983444 | 0.983713 | 0.000119 | 3.974827 | 0.004239 | 0.058397 |
| 3D projection | 0.993377 | 0.993421 | 0.000015 | 3.882411 | 0.004432 | 0.045345 |
| 3-sphere | 0.990066 | 0.990164 | 0.000000 | 4.029130 | 0.004879 | 0.050949 |
def get_exclusive_editors(title):
    nb = pages_editors_graph["p:%s" % (title)]
    # NB: "> 1" keeps editors who also edited at least one *other* page;
    # strictly exclusive editors would need "== 1" -- worth re-checking
    result = [ n for n in nb.keys() if len(pages_editors_graph[n]) > 1 ]
    return len(result)
network_df["exclusive editors"] = map(get_exclusive_editors, network_df.index)
network_df["exclusive editors"].head()
2D computer graphics     103
2D geometric model        27
3D computer graphics     150
3D projection             77
3-sphere                 102
Name: exclusive editors, dtype: int64
network_df.to_csv("data/pages-linked-by-coeditors.stats.csv", encoding="UTF-8")