# Parameters for the synthetic community-graph generator.
# Number of nodes in the graph
num_nodes = 5000
# Number of intra-community edges each node creates upon arrival
num_intra_edges = 2
# Probability of generating an intra-community edge uniformly at random
# (otherwise the endpoint is chosen by preferential attachment)
rand_intra_edge_prob = 0.02
# Maximum number of inter-community edges each node creates upon arrival
num_inter_edges = 2
# Probability of generating each inter-community edge
inter_edge_prob = 0.02
# Parameters for Pitman-Yor process used to assign communities:
# d is the discount parameter (0 <= d < 1), theta the strength parameter (theta > -d)
d = 0.4
theta = 75
import sys
from graph_tool.all import *
from pylab import *
import numpy as np
import numpy.random as nprd
from IPython.display import clear_output
nprd.seed(42)                          # global RNG stream: edge generation
community_rand = nprd.RandomState(50)  # separate RNG stream: community assignment
graph = Graph(directed = False)        # undirected graph (graph-tool)
vertex_community_map = graph.new_vertex_property("int")  # vertex -> ground-truth community id
num_community = 0                      # number of communities created so far
nodes_in_community = []                # per community: list of member node indices
edge_ends_in_community = []            # per community: multiset of intra-edge endpoints
                                       # (sampling from it implements preferential attachment)
# Generate the graph: nodes arrive one at a time; each new node is assigned a
# community via a Pitman-Yor-style process, then wires intra-community edges
# (mostly via preferential attachment) and occasional inter-community edges.
for n in range(num_nodes): # n is the index of current node, and also the number of existing nodes
    clear_output()
    print n + 1, "/", num_nodes
    vertex = graph.add_vertex()
    assert(n == graph.vertex_index[vertex])
    # Assign community.
    # New community with probability (theta + d*K) / (n + theta): the
    # Pitman-Yor new-table probability for the (n+1)-th customer.
    if community_rand.rand() <= (theta + d * num_community) / (n + theta):
        # create a new community
        community = num_community
        num_community += 1
        nodes_in_community.append([n])
        edge_ends_in_community.append([])
    else:
        # choose an existing community
        # by giving equal weight to the decision each previous node made
        # NOTE(review): this picks community k with probability proportional to
        # its size n_k, not the exact Pitman-Yor weight (n_k - d); presumably an
        # intentional approximation -- confirm.
        community = vertex_community_map[graph.vertex(community_rand.randint(n))]
        nodes_in_community[community].append(n)
    vertex_community_map[vertex] = community
    # Create intra-community edges
    community_size = len(nodes_in_community[community]); assert(community_size > 0)
    if community_size == 1:
        # First node in community, no one to connect to
        pass
    elif community_size == 2:
        # Second node in community, must connect to first one
        first_node = nodes_in_community[community][0]
        for i in range(num_intra_edges):
            e = graph.add_edge(n, first_node)
        # Record both endpoints so later arrivals can preferentially attach to them
        edge_ends_in_community[community].extend([n] * num_intra_edges)
        edge_ends_in_community[community].extend([first_node] * num_intra_edges)
    else:
        l = len(edge_ends_in_community[community])
        for i in range(num_intra_edges):
            if rand() < rand_intra_edge_prob: # Picking a node within the community uniformly at random with probability rand_intra_edge_prob
                # community_size - 1 excludes the current node n itself,
                # which is the last element of the member list
                selected_index = nprd.randint(community_size - 1)
                selected = nodes_in_community[community][selected_index]
            else: # Otherwise, do preferential attachment
                # Sample an endpoint of an existing intra-community edge: a node is
                # chosen with probability proportional to its intra-community degree
                selected_index = nprd.randint(l)
                selected = edge_ends_in_community[community][selected_index]
            e = graph.add_edge(n, selected)
            edge_ends_in_community[community].extend([n, selected])
    # Create inter-community edges
    # Candidate endpoints: all existing nodes outside the current community
    outside_nodes = np.setdiff1d(np.arange(n), nodes_in_community[community])
    def create_inter_edge():
        # Connect n to a uniformly random node outside its community; both
        # endpoints are recorded for preferential attachment in their communities
        selected_index = nprd.randint(len(outside_nodes))
        selected = outside_nodes[selected_index]
        e = graph.add_edge(n, selected)
        edge_ends_in_community[community].append(n)
        edge_ends_in_community[vertex_community_map[graph.vertex(selected)]].append(selected)
    if len(outside_nodes) > 0:
        if community_size == 1:
            # The founder of a new community always gets one edge to the rest of the graph
            create_inter_edge()
        else:
            for i in range(num_inter_edges):
                if rand() < inter_edge_prob:
                    create_inter_edge()
print "Graph generated."
5000 / 5000 Graph generated.
The Pitman-Yor Process is specified as follows:
$$P(c_n=k)=\frac{n_k-d}{n-1+\theta} \quad (1 \leq k \leq K)$$$$P(c_n=K+1)=\frac{\theta+d \cdot K}{n-1+\theta}$$where $c_n$ is the table that the $n$th customer sits at; the existing tables are numbered $k=1,\dots,K$; table $k$ has $n_k$ customers.
$0 \leq d < 1$ is the discount parameter, $\theta > -d$ is the strength parameter.
# Report the global clustering coefficient (transitivity) and its standard deviation
print "Global clustering coefficient and standard deviation: "
print global_clustering(graph)
Global clustering coefficient and standard deviation: (0.11773113668848266, 0.004855790389072158)
print graph_tool.community.modularity(graph, vertex_community_map)
0.889661848309
def plot_basics(data, data_inst, fig, units):
    """Plot one column of the distribution-comparison figure for `data`.

    Draws three stacked panels in column `data_inst` of `fig` (the grid is
    n_graphs x n_data subplots; both counts are read from module globals):
      1. empirical PDF with linearly- and logarithmically-binned estimates,
         plus an inset histogram of the raw data;
      2. power-law vs. exponential fits overlaid on the data PDF;
      3. power-law vs. log-normal fits overlaid on the data PDF.

    data      -- 1-D numpy array of observations (zeros are dropped for PDFs)
    data_inst -- 1-based column index; column 1 also gets the row annotations
    fig       -- matplotlib figure to draw into
    units     -- x-axis label for the bottom panel
    """
    from powerlaw import plot_pdf, Fit, pdf
    annotate_coord = (-.5, 0.4)
    # Panel 1: empirical PDF (linear bins thin red, log bins thick blue)
    ax1 = fig.add_subplot(n_graphs,n_data,data_inst)
    plot_pdf(data[data>0], ax=ax1, linear_bins=True, color='r', linewidth=.5)
    x, y = pdf(data, linear_bins=True)
    ind = y>0         # keep only non-empty bins
    y = y[ind]
    x = x[:-1]        # pdf() returns bin edges: drop the last edge to align with y
    x = x[ind]
    ax1.scatter(x, y, color='r', s=.5)
    plot_pdf(data[data>0], ax=ax1, color='b', linewidth=2)
    from pylab import setp
    setp( ax1.get_xticklabels(), visible=False)
    ax1.set_yticks(ax1.get_yticks()[::2])  # thin out the y ticks
    locs,labels = yticks()
    if data_inst==1:
        ax1.annotate("PDF &\nData Points", annotate_coord, xycoords="axes fraction", fontsize=10)
    # Inset histogram of the raw data.
    # NOTE(review): mpl_toolkits.axes_grid and hist(normed=...) are deprecated in
    # newer matplotlib (axes_grid1 / density=) -- kept as-is for this environment.
    from mpl_toolkits.axes_grid.inset_locator import inset_axes
    ax1in = inset_axes(ax1, width = "30%", height = "30%", loc=3)
    ax1in.hist(data, normed=True, color='b')
    ax1in.set_xticks([])
    ax1in.set_yticks([])
    # Panel 2: power-law (green) vs. exponential (dashed red) fits
    ax2 = fig.add_subplot(n_graphs,n_data,n_data+data_inst, sharex=ax1, sharey=ax1)
    fit = Fit(data, discrete=True)
    fit.power_law.plot_pdf(ax=ax2, color='g', linewidth = 2)
    fit.exponential.plot_pdf(ax=ax2, linestyle='--', color='r', linewidth = 2)
    fit.plot_pdf(ax=ax2, color='b', linewidth=2)
    setp( ax2.get_xticklabels(), visible=False)
    ax2.set_yticks(ax2.get_yticks()[::2])
    ax2.set_xlim(ax1.get_xlim())
    if data_inst==1:
        ax2.annotate("Power-law &\nExponential", annotate_coord, xycoords="axes fraction", fontsize=10)
    # Panel 3: power-law (green) vs. log-normal (dashed red) fits
    ax3 = fig.add_subplot(n_graphs,n_data,n_data*2+data_inst, sharex=ax1, sharey=ax1)
    fit.power_law.plot_pdf(ax=ax3, color='g', linewidth = 2)
    fit.lognormal.plot_pdf(ax=ax3, linestyle='--', color='r', linewidth = 2)
    fit.plot_pdf(ax=ax3, color='b', linewidth=2)
    ax3.set_ylim(ax2.get_ylim())
    ax3.set_yticks(ax3.get_yticks()[::2])
    ax3.set_xlim(ax1.get_xlim())
    if data_inst==1:
        ax3.annotate("Power-law &\nLog-normal", annotate_coord, xycoords="axes fraction", fontsize=10)
    ax3.set_xlabel(units)
# Fit discrete power laws to the community-size and degree distributions and
# compare them against exponential and log-normal alternatives.
import powerlaw
community_sizes = np.array([len(c) for c in nodes_in_community])
community_fit = powerlaw.Fit(community_sizes, discrete=True)
degrees = np.array([v.out_degree() for v in graph.vertices()])
degree_fit = powerlaw.Fit(degrees, discrete=True)
clear_output()
print "Community Size Distribution"
print "----------------------------------------"
print "Number of communities:\t", len(community_sizes)
print "Power law parameters:\t", "alpha=", community_fit.power_law.alpha, "; sigma=", community_fit.power_law.sigma
# distribution_compare returns (loglikelihood ratio R, p-value); R > 0 favors the power law
print "Comparing to exponential:\t", community_fit.distribution_compare('power_law', 'exponential')
print "Comparing to log-normal:\t", community_fit.distribution_compare('power_law', 'lognormal')
print
print "Degree Distribution"
print "----------------------------------------"
print "Power law parameters:\t", "alpha=", degree_fit.power_law.alpha, "; sigma=", degree_fit.power_law.sigma
print "Comparing to exponential:\t", degree_fit.distribution_compare('power_law', 'exponential')
print "Comparing to log-normal:\t", degree_fit.distribution_compare('power_law', 'lognormal')
print
print
# Figure layout consumed by plot_basics via globals: 4 rows x 2 columns
n_data = 2
n_graphs = 4
f = figure(figsize=(10,15))
plot_basics(community_sizes, 1, f, 'Community Size')
plot_basics(degrees, 2, f, 'Degree')
Community Size Distribution ---------------------------------------- Number of communities: 811 Power law parameters: alpha= 2.26432809608 ; sigma= 0.0900796470622 Comparing to exponential: (11.62515047665813, 0.15365083113030445) Comparing to log-normal: (-4.355554952279749, 0.046621573334856725) Degree Distribution ---------------------------------------- Power law parameters: alpha= 2.60154597327 ; sigma= 0.0314634037506 Comparing to exponential: (95.349869616324114, 0.00069530208031327704) Comparing to log-normal: (-39.957868865284446, 2.2560161810007961e-09) Calculating best minimal value for power law fit Calculating best minimal value for power law fit
# Color each intra-community edge by its community id; inter-community edges
# keep the property's default value 0.
# NOTE(review): 0 is also a valid community id, so inter-community edges are
# indistinguishable from community 0's intra edges in the plot -- confirm intended.
edge_community_map = graph.new_edge_property("int")
for e in graph.edges():
    c1 = vertex_community_map[e.source()]
    c2 = vertex_community_map[e.target()]
    if c1 == c2:
        edge_community_map[e] = c1
# Degree is used to determine the size of vertex on plot
deg = graph.degree_property_map("out")
deg.a = 1.5*(sqrt(deg.a) * 0.5 + 0.4)
# Draw the graph colored by ground-truth community and save to PNG
graph_draw(graph,
           vertex_size = deg,
           vertex_color=vertex_community_map, vertex_fill_color=vertex_community_map,
           edge_color = edge_community_map, edge_pen_width=0.2,
           output_size=(800, 800), output="rose_graph.png")
from IPython.core.display import Image
Image(filename='rose_graph.png')
For details:
(Note that the results of this algorithm appear to be quite unstable from run to run; setting a random seed does not make them reproducible.)
# Stochastic-blockmodel community detection by minimizing description length.
# (Results vary run to run; graph-tool's internal RNG is not seeded here.)
vertex_block_map, min_dl, b_cache = minimize_blockmodel_dl(graph)
print "Number of detected communities:", len(unique(vertex_block_map.get_array()))
print "Minimum description length:", min_dl
Number of detected communities: 34 Minimum description length: -0.902014471564
print "Newman's modularity:", graph_tool.community.modularity(graph, vertex_block_map)
Newman's modularity: 0.801219781965
from sklearn import metrics
community_labels = [vertex_community_map[v] for v in graph.vertices()]
block_labels = [vertex_block_map[v] for v in graph.vertices()]
random_labels = [nprd.randint(num_community) for v in graph.vertices()]
print "Adjusted Rand index:\t", metrics.adjusted_rand_score(community_labels, block_labels), "(", "balseline:", metrics.adjusted_rand_score(community_labels, random_labels), ")"
print "Adjusted Mutual Information:\t", metrics.adjusted_mutual_info_score(community_labels, block_labels), "(", "balseline:", metrics.adjusted_mutual_info_score(community_labels, random_labels), ")"
Adjusted Rand index: 0.248678734658 ( balseline: 0.00020345520586 ) Adjusted Mutual Information: 0.454991559779 ( balseline: 0.00067834321406 )
# Color intra-block edges by block id (inter-block edges keep the default 0)
# and draw the graph colored by the detected blockmodel partition.
edge_block_map = graph.new_edge_property("int")
for e in graph.edges():
    c1 = vertex_block_map[e.source()]
    c2 = vertex_block_map[e.target()]
    if c1 == c2:
        edge_block_map[e] = c1
graph_draw(graph,
           vertex_size = deg,
           vertex_color=vertex_block_map, vertex_fill_color=vertex_block_map,
           edge_color = edge_block_map, edge_pen_width=0.2,
           output_size=(800, 800), output="rose_graph_block.png")
Image(filename='rose_graph_block.png')
# Potts-model (spin-glass) community detection: ask for as many communities
# as there are ground-truth communities with more than 2 members.
num_community_to_detect = len([s for s in community_sizes if s > 2])
print "Trying to detect ", num_community_to_detect, " communities..."
# 20000 presumably is the number of spin-update iterations -- TODO confirm against graph-tool docs
vertex_spin_map = community_structure(graph, 20000, num_community_to_detect)
print "Done."
Trying to detect 402 communities... Done.
print "Newman's modularity:", graph_tool.community.modularity(graph, vertex_spin_map)
Newman's modularity: 0.913561319909
from sklearn import metrics
community_labels = [vertex_spin_map[v] for v in graph.vertices()]
spin_labels = [vertex_block_map[v] for v in graph.vertices()]
random_labels = [nprd.randint(num_community) for v in graph.vertices()]
print "Adjusted Rand index:\t", metrics.adjusted_rand_score(community_labels, spin_labels), "(", "balseline:", metrics.adjusted_rand_score(community_labels, random_labels), ")"
print "Adjusted Mutual Information:\t", metrics.adjusted_mutual_info_score(community_labels, spin_labels), "(", "balseline:", metrics.adjusted_mutual_info_score(community_labels, random_labels), ")"
Adjusted Rand index: 0.250715435259 ( balseline: 4.70750289877e-06 ) Adjusted Mutual Information: 0.476804167707 ( balseline: 0.0021941470908 )
# Color intra-community edges of the Potts partition (inter edges keep the
# default 0) and draw the graph colored by that partition.
edge_spin_map = graph.new_edge_property("int")
for e in graph.edges():
    c1 = vertex_spin_map[e.source()]
    c2 = vertex_spin_map[e.target()]
    if c1 == c2:
        edge_spin_map[e] = c1
graph_draw(graph,
           vertex_size = deg,
           vertex_color=vertex_spin_map, vertex_fill_color=vertex_spin_map,
           edge_color = edge_spin_map, edge_pen_width=0.2,
           output_size=(800, 800), output="rose_graph_potts.png")
Image(filename='rose_graph_potts.png')