%run "libraries.ipynb"
from networkx.algorithms import bipartite
sns.set(style="whitegrid")
# load the list of page titles, one per line
pages = codecs.open("data/pagenames.txt", "r", "utf-8-sig").readlines()
pages = map(lambda x: x.strip(), pages)
pages_editors_graph = nx.Graph()

def get_editors_set(page):
    revisions = json.load(codecs.open("data/revisions/%s.json" % (page), "r", "utf-8-sig"))
    revisions = pd.DataFrame(revisions)
    # keep registered editors only (anonymous edits carry userid == 0)
    editors = revisions[revisions["userid"] != 0]["user"].tolist()
    editors = set(editors)
    # revisions per user, plus each user's first and last revision timestamp
    # ("amin"/"amax" are the column names pandas gives np.min/np.max)
    revisions_count = revisions.groupby("user").groups
    revisions_range = revisions.groupby("user").agg({ "timestamp": [ np.min, np.max ] })
    # drop users the groupby did not keep
    # (e.g. "Four-dimensional space" has a NaN user -- worth re-checking)
    editors = [ e for e in editors if e in revisions_count ]
    # extra information to store on the page-editor edge
    def info(x):
        info = {"revisions": len(revisions_count[x]),
                "first revision": revisions_range.ix[x]["timestamp"]["amin"],
                "last revision": revisions_range.ix[x]["timestamp"]["amax"]
                }
        return info
    editors = map(lambda x: (x, info(x)), editors)
    return editors
for p in pages:
    title = "p:%s" % (p)
    e = get_editors_set(p)
    # aggregate the editors' stats into page-level attributes
    node_info = {"revisions": int(sum([ x[1]["revisions"] for x in e ])),
                 "first revision": str(np.min([ pd.to_datetime(x[1]["first revision"]) for x in e ])),
                 "last revision": str(np.max([ pd.to_datetime(x[1]["last revision"]) for x in e ]))
                 }
    pages_editors_graph.add_node(title, type="page", attr_dict=node_info)
    # link each editor to the page; the edge carries that editor's revision stats
    for editor in e:
        editor_label = "u:%s" % (editor[0])
        info = editor[1]
        pages_editors_graph.add_node(editor_label, type="user")
        pages_editors_graph.add_edge(editor_label, title, attr_dict=info)
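A quick sanity check (a sketch added here, not part of the original run): the "p:"/"u:" prefixes keep page and user labels disjoint, so the resulting graph should be bipartite.

assert bipartite.is_bipartite(pages_editors_graph)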
# add extra information about editors: totals across every page they edited
editors = [ n for n in pages_editors_graph.nodes(data=True) if n[1]["type"] == "user" ]
for editor in editors:
    user = editor[0]
    pages_editors_graph.node[user]["revisions"] = int(np.sum([ pages_editors_graph[user][edge]["revisions"] for edge in pages_editors_graph[user] ]))
    pages_editors_graph.node[user]["first revision"] = str(np.min([ pd.to_datetime(pages_editors_graph[user][edge]["first revision"]) for edge in pages_editors_graph[user] ]))
    pages_editors_graph.node[user]["last revision"] = str(np.max([ pd.to_datetime(pages_editors_graph[user][edge]["last revision"]) for edge in pages_editors_graph[user] ]))
pages_nodes = [ x[0] for x in pages_editors_graph.nodes(data=True) if x[1]["type"] == "page" ]
users_nodes = [ x[0] for x in pages_editors_graph.nodes(data=True) if x[1]["type"] == "user" ]
print "page nodes: %s" % (len(pages_nodes))
print "user nodes: %s" % (len(users_nodes))
page nodes: 303
user nodes: 15857
print "nodes: %s" % (len(pages_editors_graph.nodes()))
print "edges: %s" % (len(pages_editors_graph.edges()))
nodes: 16160
edges: 39923
nx.write_gexf(pages_editors_graph, "data/pages-editors.gexf", encoding='utf-8')
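(A side note, as a minimal sketch: the exported file can be read back with nx.read_gexf, so later sessions could start from disk instead of rebuilding the graph.)

# pages_editors_graph = nx.read_gexf("data/pages-editors.gexf")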
def reduce_bipartite(G, select, weight="weight"):
    # project the bipartite graph onto the nodes of the given type;
    # the edge weight counts the neighbours the two endpoints share
    selected = [ x[0] for x in G.nodes(data=True) if x[1]["type"] == select ]
    results = bipartite.projected_graph(G, selected)
    for u in results.nodes():
        for v in results[u].keys():
            w = len(set(G[u]) & set(G[v]))
            results[u][v][weight] = w
    return results
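For reference, a sketch assuming a networkx version that ships it: bipartite.weighted_projected_graph computes the same shared-neighbour weights in one call, storing them under "weight" rather than a custom key.

# page_graph_alt = bipartite.weighted_projected_graph(pages_editors_graph, pages_nodes)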
page_graph = reduce_bipartite(pages_editors_graph, "page", "coeditors")
print "nodes: %s" % (len(page_graph.nodes()))
print "edges: %s" % (len(page_graph.edges()))
nodes: 303
edges: 44688
nx.write_gexf(page_graph, "data/pages-linked-by-coeditors.gexf", encoding='utf-8')
# the user-side projection links editors who co-edited a page; it is far denser,
# so it stays commented out (the counts below are from an earlier run)
# editors_graph = reduce_bipartite(pages_editors_graph, "user", "pages")
# print "nodes: %s" % (len(editors_graph.nodes()))
# print "edges: %s" % (len(editors_graph.edges()))
nodes: 15857
edges: 7489562
# nx.write_gexf(editors_graph, "data/wikipedia-geometry/editors-linked-by-pages.gexf")
network_df = pd.DataFrame(index=pages)
network_df.head()
| |
|---|
| 2D computer graphics |
| 2D geometric model |
| 3D computer graphics |
| 3D projection |
| 3-sphere |
centrality = nx.degree_centrality(page_graph)
closeness = nx.closeness_centrality(page_graph)
betweenness = nx.betweenness_centrality(page_graph, weight="coeditors")
current_flow_closeness = nx.current_flow_closeness_centrality(page_graph, weight="coeditors")
current_flow_betweenness = nx.current_flow_betweenness_centrality(page_graph, weight="coeditors")
#eigenvector = nx.eigenvector_centrality(page_graph)
eigenvector = nx.eigenvector_centrality_numpy(page_graph, weight="coeditors")
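One caveat worth flagging (a sketch added here, not part of the original run): networkx's shortest-path betweenness treats the weight as a distance, so a high co-editor count makes two pages look far apart; inverting the weight is the usual workaround. The "inverse coeditors" attribute name is hypothetical.

for u, v, d in page_graph.edges(data=True):
    d["inverse coeditors"] = 1.0 / d["coeditors"]  # more co-editors => shorter distance
# betweenness_inv = nx.betweenness_centrality(page_graph, weight="inverse coeditors")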
for index in network_df.index:
    t = "p:%s" % (index)
    network_df.ix[index, "centrality"] = centrality[t]
    network_df.ix[index, "closeness"] = closeness[t]
    network_df.ix[index, "betweenness"] = betweenness[t]
    network_df.ix[index, "current flow closeness"] = current_flow_closeness[t]
    network_df.ix[index, "current flow betweenness"] = current_flow_betweenness[t]
    network_df.ix[index, "eigenvector"] = eigenvector[t]
network_df.head()
| | centrality | closeness | betweenness | current flow closeness | current flow betweenness | eigenvector |
|---|---|---|---|---|---|---|
| 2D computer graphics | 0.970199 | 0.971061 | 0.000603 | 3.284879 | 0.002896 | 0.035669 |
| 2D geometric model | 0.986755 | 0.986928 | 0.001794 | 2.618806 | 0.002197 | 0.019031 |
| 3D computer graphics | 0.983444 | 0.983713 | 0.000119 | 3.974827 | 0.004239 | 0.058397 |
| 3D projection | 0.993377 | 0.993421 | 0.000015 | 3.882411 | 0.004432 | 0.045345 |
| 3-sphere | 0.990066 | 0.990164 | 0.000000 | 4.029130 | 0.004879 | 0.050949 |
def get_exclusive_editors(title):
    nb = pages_editors_graph["p:%s" % (title)]
    # NB: "> 1" keeps editors who also edited at least one *other* page;
    # strictly exclusive editors would need "== 1" -- worth re-checking
    result = [ n for n in nb.keys() if len(pages_editors_graph[n]) > 1 ]
    return len(result)
network_df["exclusive editors"] = map(get_exclusive_editors, network_df.index)
network_df["exclusive editors"].head()
2D computer graphics     103
2D geometric model        27
3D computer graphics     150
3D projection             77
3-sphere                 102
Name: exclusive editors, dtype: int64
network_df.to_csv("data/pages-linked-by-coeditors.stats.csv", encoding="UTF-8")