import networkx as nx
import numpy as np
import urllib.request
# Fetch co-author network from http://snap.stanford.edu/data/ca-GrQc.html
urllib.request.urlretrieve("http://snap.stanford.edu/data/ca-GrQc.txt.gz", "ca-GrQc.txt.gz")
graph = nx.read_edgelist('ca-GrQc.txt.gz')
# How many nodes in graph?
graph.order()
5242
The one with the 500th most co-authors
degrees = dict(graph.degree())
author = sorted(degrees.items(),
key=lambda x: x[1],
reverse=True)[500][0]
print('selected author %s with degree %d' %
(author, graph.degree()[author]))
selected author 13813 with degree 13
help(nx.subgraph)
Help on function subgraph in module networkx.classes.function: subgraph(G, nbunch) Returns the subgraph induced on nodes in nbunch. Parameters ---------- G : graph A NetworkX graph nbunch : list, iterable A container of nodes that will be iterated through once (thus it should be an iterator or be iterable). Each element of the container should be a valid node type: any hashable type except None. If nbunch is None, return all edges data in the graph. Nodes in nbunch that are not in the graph will be (quietly) ignored. Notes ----- subgraph(G) calls G.subgraph()
def get_subgraph(graph, nodes, n=100):
""" Get the subgraph consisting of a list node
and their neighbors, plus their neighbors'
neighbors, up to $n$ total nodes"""
neighbors = set()
for ni in nodes:
neighbors |= set(graph.neighbors(ni))
# plot at least the target node and his neighbors.
result = set(nodes) | neighbors
# add "friends of friends" up to n total nodes.
for x in neighbors:
# how many more nodes can we add?
maxsize = n - len(result)
toadd = set(graph.neighbors(x)) - result
result.update(list(toadd)[:maxsize])
if len(result) > n:
break
return graph.subgraph(result)
subgraph = get_subgraph(graph, [author], n=30)
print('subgraph has %d nodes' % len(subgraph.nodes()))
subgraph has 30 nodes
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
%matplotlib inline
def plot_subgraph(subgraph, target_nodes):
"""
Plot this subgraph of nodes, coloring
the specified list of target_nodes in red.
"""
nodes = list(subgraph.nodes())
colors = ['b'] * len(nodes)
for n in target_nodes:
idx = nodes.index(n)
colors[idx] = 'r'
sizes = [800] * len(nodes)
sizes[idx] = 1000
plt.figure(figsize=(10,10))
plt.axis('off')
nx.draw_networkx(subgraph, nodelist=nodes, with_labels=True,
width=.5, node_color=colors,
node_size=sizes, alpha=.5)
plot_subgraph(subgraph, [author])
Who should we recommend that author 13813 collaborate with next?
Treat link prediction as a ranking problem.
Let's look at a few possible ranking functions:
1.) Shortest Path: s(X,Y)= length of shortest path from X to Y.
Advantages? Disadvantages?
def rank_by_shortest_path(graph, node):
"""
Score each potential edge from target node X to another
node Y by the length of the shortest path from X to Y.
Params:
graph: networkx Graph
node: target node to recommend an edge for
Returns:
List of (id, score) tuples for edges from node to id.
"""
paths = nx.shortest_path_length(graph, node)
return sorted(paths.items(), key=lambda x: x[1])
shortest_paths = rank_by_shortest_path(graph, author)
print('top predicted edges:')
shortest_paths[:20]
top predicted edges:
[('13813', 0), ('17559', 1), ('19206', 1), ('409', 1), ('2474', 1), ('4241', 1), ('6746', 1), ('8680', 1), ('10476', 1), ('16568', 1), ('19204', 1), ('20373', 1), ('21771', 1), ('24620', 1), ('1674', 2), ('3323', 2), ('9184', 2), ('19657', 2), ('21646', 2), ('21943', 2)]
[s for s in shortest_paths if s[1] == 2][:10]
# Many shortest paths of length 2!
# No way to choose among them!
[('1674', 2), ('3323', 2), ('9184', 2), ('19657', 2), ('21646', 2), ('21943', 2), ('23204', 2), ('25034', 2), ('18344', 2), ('639', 2)]
len([s for s in shortest_paths if s[1] == 2])
57
Idea: If X and Y have many co-authors who have co-authored, then X and Y are more likely to co-author.
2.) Common Neighbors: |N(X)∩N(Y)|
from collections import Counter
def rank_by_common_neighbors(graph, node):
"""
Score each potential edge from target node X to another
node Y by the number of neighbors in common.
Params:
graph: networkx Graph
node: target node to recommend an edge for
Returns:
List of (id, score) tuples for edges from node to id.
"""
neighbors = set(graph.neighbors(node))
scores = []
for n in graph.nodes():
neighbors2 = set(graph.neighbors(n))
scores.append((n, len(neighbors & neighbors2)))
return sorted(scores, key=lambda x: x[1], reverse=True)
common_neighbors = rank_by_common_neighbors(graph, author)
nonzero_scores = [x[1] for x in common_neighbors if x[1] > 0]
print('Histogram of number of nodes with common neighbors:\n', \
Counter(nonzero_scores).most_common())
# plot scores
plt.figure()
plt.plot(sorted(nonzero_scores))
plt.xlabel('rank')
plt.ylabel('score')
plt.show()
print('top predicted edges:')
common_neighbors[:20]
Histogram of number of nodes with common neighbors: [(1, 58), (2, 9), (4, 2), (13, 1), (5, 1)]
top predicted edges:
[('13813', 13), ('23204', 5), ('17559', 4), ('19204', 4), ('25034', 2), ('21646', 2), ('8680', 2), ('9184', 2), ('20373', 2), ('409', 2), ('4241', 2), ('6746', 2), ('20344', 2), ('10235', 1), ('19297', 1), ('1727', 1), ('21705', 1), ('24924', 1), ('14746', 1), ('10055', 1)]
pair = set([author, '23204'])
plot_subgraph(get_subgraph(graph, pair, n=30), pair)
Idea: If X and Y have many co-authors who have co-authored, then X and Y are more likely to co-author.
3.) Jaccard coefficient: |N(X)∩N(Y)||N(X)∪N(Y)|
For a randomly selected node Z∈N(X)∪N(Y), how likely is it that Z is in both N(X) and N(Y)?
def rank_by_jaccard(graph, node):
neighbors = set(graph.neighbors(node))
scores = []
for n in graph.nodes():
neighbors2 = set(graph.neighbors(n))
scores.append((n, len(neighbors & neighbors2) /
len(neighbors | neighbors2)))
return sorted(scores, key=lambda x: x[1], reverse=True)
common_jaccard = rank_by_jaccard(graph, author)
common_jaccard[:20]
[('13813', 1.0), ('23204', 0.25), ('19204', 0.2), ('17559', 0.18181818181818182), ('409', 0.14285714285714285), ('6746', 0.14285714285714285), ('20344', 0.14285714285714285), ('8680', 0.125), ('21646', 0.1), ('12852', 0.07692307692307693), ('16333', 0.07692307692307693), ('18344', 0.07692307692307693), ('19940', 0.07142857142857142), ('3633', 0.07142857142857142), ('4241', 0.06896551724137931), ('21705', 0.06666666666666667), ('25034', 0.06666666666666667), ('19206', 0.06666666666666667), ('2474', 0.06666666666666667), ('24620', 0.06666666666666667)]
plt.plot(sorted([x[1] for x in common_jaccard if x[1] > 0]))
[<matplotlib.lines.Line2D at 0x11e8f4cc0>]
# Print the top 10 jaccard scores, and
# compare with common neighbors scores.
import pandas as pd
def compare_scores(scores_list, names, n):
"""
Find top n scores for each method and compare.
"""
ids = set()
for scores in scores_list:
ids.update([x[0] for x in scores[:n]])
print('%d shared ids in top %d' % (len(ids), n))
results = []
# find rank and score for each id by each scoring metric.
for id_ in ids:
result = {'id': id_}
for scores, name in zip(scores_list, names):
for rank, (id2, score) in enumerate(scores):
if id2 == id_:
result[name + '_rank'] = rank
result[name + '_score'] = score
break
results.append(result)
headers = ['id']
for name in names:
headers.append(name + '_rank')
headers.append(name + '_score')
return pd.DataFrame(results, columns=headers)
df = compare_scores([common_jaccard, common_neighbors],
['jaccard', 'common neighbors'],
20)
df.sort_values('jaccard_rank').head(10)
28 shared ids in top 20
id | jaccard_rank | jaccard_score | common neighbors_rank | common neighbors_score | |
---|---|---|---|---|---|
15 | 13813 | 0 | 1.000000 | 0 | 13 |
21 | 23204 | 1 | 0.250000 | 1 | 5 |
5 | 19204 | 2 | 0.200000 | 3 | 4 |
1 | 17559 | 3 | 0.181818 | 2 | 4 |
2 | 409 | 4 | 0.142857 | 9 | 2 |
26 | 6746 | 5 | 0.142857 | 11 | 2 |
6 | 20344 | 6 | 0.142857 | 12 | 2 |
19 | 8680 | 7 | 0.125000 | 6 | 2 |
23 | 21646 | 8 | 0.100000 | 5 | 2 |
24 | 12852 | 9 | 0.076923 | 39 | 1 |
# Look at one discrepancy:
# common neighbor ranks 21646 > 20344 (actually a tie; score=2)
# jaccard ranks 20344 > 8680 (.143 > .1)
print('13813 shares %d friends with 21646, %d friends with 20344' %
(len(set(graph.neighbors(author)) & set(graph.neighbors('21646'))),
len(set(graph.neighbors(author)) & set(graph.neighbors('20344')))))
print('21646 has %d total friends' % len(list(graph.neighbors('21646'))))
print('20344 has %d total friends' % len(list(graph.neighbors('20344'))))
13813 shares 2 friends with 21646, 2 friends with 20344 21646 has 9 total friends 20344 has 3 total friends
pair = set([author, '20344'])
plot_subgraph(get_subgraph(graph, pair, n=30), pair)
pair = set([author, '21646'])
plot_subgraph(get_subgraph(graph, pair, n=30), pair)
** Let's look at the friends shared by 13813 and 20344 **
set(graph.neighbors(author)) & set(graph.neighbors('20344'))
{'19204', '8680'}
list(graph.neighbors('19204'))
['17559', '13813', '23134', '20344', '409', '2001', '3633', '6746', '7935', '8680', '23204']
list(graph.neighbors('8680'))
['17559', '13813', '20344', '19204', '23204']
print('19204 has %d neighbors; 8680 has %d neighbors' %
(len(list(graph.neighbors('19204'))),
len(list(graph.neighbors('8680')))))
19204 has 11 neighbors; 8680 has 5 neighbors
** Which shared friend provides stronger evidence of a connection between 13813 and 20344? **
5.) Adamic / Adar:
(e.g., the fact that we both follow CNN is less interesting than that we both follow the Chicago Tribune)
∑z∈N(X)∩N(Y)1log(|N(z)|)import math
def rank_by_adar(graph, node):
neighbors = set(graph.neighbors(node))
scores = []
for n in graph.nodes():
neighbors2 = set(graph.neighbors(n))
score = sum(1/math.log10(len(list(graph.neighbors(o))))
for o in neighbors & neighbors2)
scores.append((n, score))
return sorted(scores, key=lambda x: x[1], reverse=True)
adar = rank_by_adar(graph, author)
adar[:20]
[('13813', 19.78535802138797), ('19204', 6.520194824154786), ('23204', 6.099882396691225), ('17559', 5.202170679188602), ('409', 3.056155842078512), ('6746', 3.056155842078512), ('20373', 2.993614991792008), ('4241', 2.8442587098986194), ('20344', 2.3909291258625203), ('10235', 2.095903274289385), ('16568', 2.095903274289385), ('18344', 2.095903274289385), ('8680', 1.8579642852917506), ('24620', 1.660964047443681), ('5385', 1.660964047443681), ('24589', 1.660964047443681), ('11284', 1.660964047443681), ('4875', 1.660964047443681), ('25034', 1.6130499965393197), ('21646', 1.6130499965393197)]
df = compare_scores([adar, common_jaccard, common_neighbors],
['adar', 'jaccard', 'common neighbors'],
20)
df.sort_values('adar_rank').head(12)
33 shared ids in top 20
id | adar_rank | adar_score | jaccard_rank | jaccard_score | common neighbors_rank | common neighbors_score | |
---|---|---|---|---|---|---|---|
19 | 13813 | 0 | 19.785358 | 0 | 1.000000 | 0 | 13 |
7 | 19204 | 1 | 6.520195 | 2 | 0.200000 | 3 | 4 |
26 | 23204 | 2 | 6.099882 | 1 | 0.250000 | 1 | 5 |
1 | 17559 | 3 | 5.202171 | 3 | 0.181818 | 2 | 4 |
3 | 409 | 4 | 3.056156 | 4 | 0.142857 | 9 | 2 |
31 | 6746 | 5 | 3.056156 | 5 | 0.142857 | 11 | 2 |
11 | 20373 | 6 | 2.993615 | 44 | 0.055556 | 8 | 2 |
6 | 4241 | 7 | 2.844259 | 14 | 0.068966 | 10 | 2 |
8 | 20344 | 8 | 2.390929 | 6 | 0.142857 | 12 | 2 |
22 | 10235 | 9 | 2.095903 | 61 | 0.040000 | 13 | 1 |
18 | 16568 | 10 | 2.095903 | 31 | 0.062500 | 28 | 1 |
12 | 18344 | 11 | 2.095903 | 11 | 0.076923 | 69 | 1 |
6.) Preferential attachment: |N(X)|×|N(Y)|
What will this do?
Idea: two nodes are similar if they have similar neighbors
7.) SimRank: s(X,Y)=γ∑A∈N(X)∑B∈N(Y)s(A,B)|N(X)|⋅|N(Y)|
γ∈[0,1] is a tuning parameter.
Solve iteratively:
Advantage?
# Evaluate Adar using train/test
import random
random.seed(123)
def sample_edges(graph, node, pct=.5):
""" Randomly remove some edges for node.
Return:
the resulting graph G'
the list of friends of node whose edges were removed.
"""
edges = list(graph.edges([node]))
# Sample edges to remove.
to_remove = random.sample(edges, int(len(edges) * pct))
# Create the list of friends whose edges we have removed.
friends = []
for x in to_remove:
if x[0] != node:
friends.append(x[0])
else:
friends.append(x[1])
print('removing %d edges' % len(to_remove))
# Copy G into G' and remove the edges.
graph_cp = graph.copy()
graph_cp.remove_edges_from(to_remove)
return graph_cp, friends
def filter_existing_edges(prediction, train_graph, author):
"""
Filter recommended edges to those that don't currently exist in the graph.
"""
return [p for p in prediction if p[0] not in train_graph.neighbors(author) and p[0] != author]
train_graph, friends = sample_edges(graph, author, pct=.5)
prediction = rank_by_adar(train_graph, author)
prediction = filter_existing_edges(prediction, train_graph, author)[:6]
print('adar rank=\n' + '\n'.join(str(x) for x in prediction))
print('true neighbors=', friends)
n_correct = len(set(friends) & set([x[0] for x in prediction]))
print('Adar finds %d/%d for accuracy of %.3f' %
(n_correct, len(friends),
(n_correct / len(friends))))
removing 6 edges adar rank= ('10235', 2.095903274289385) ('20373', 2.095903274289385) ('18344', 2.095903274289385) ('4241', 1.660964047443681) ('5385', 1.660964047443681) ('24589', 1.660964047443681) true neighbors= ['17559', '4241', '19206', '8680', '21771', '20373'] Adar finds 2/6 for accuracy of 0.333
Another evaluation metric: Mean Reciprocal Rank
def mrr(prediction, truth):
# dict from node id to rank (starting at 1)
node2pos = {n[0]:i+1 for i, n in enumerate(prediction)}
ranks = [node2pos[n] for n in truth]
print(list(zip(truth, ranks)))
return np.mean([1/r for r in ranks])
prediction = rank_by_adar(train_graph, author)
prediction = filter_existing_edges(prediction, train_graph, author)
mrr(prediction, friends)
[('17559', 9), ('4241', 4), ('19206', 994), ('8680', 10), ('21771', 996), ('20373', 2)]
0.16052019389877867
pair = set([author, '19206'])
plot_subgraph(get_subgraph(train_graph, pair, n=30), pair)
prediction = rank_by_jaccard(train_graph, author)
prediction = filter_existing_edges(prediction, train_graph, author)
mrr(prediction, friends)
[('17559', 10), ('4241', 14), ('19206', 994), ('8680', 6), ('21771', 996), ('20373', 16)]
0.06710088172946649
pair = set([author, '21771'])
plot_subgraph(get_subgraph(train_graph, pair, n=30), pair)
#
from IPython.core.display import HTML
HTML(open('../custom.css').read())