# Parameters for the synthetic community-graph generator.
# Number of nodes in the graph
num_nodes = 5000
# Number of intra-community edges each node creates upon arrival
num_intra_edges = 2
# Probability of generating an intra-community edge uniformly at random
# (otherwise the endpoint is chosen by preferential attachment)
rand_intra_edge_prob = 0.02
# Maximum number of inter-community edges each node creates upon arrival
num_inter_edges = 2
# Probability of generating each inter-community edge
inter_edge_prob = 0.02
# Parameters for Pitman-Yor process used to assign communities:
# d is the discount parameter (0 <= d < 1), theta the strength parameter (theta > -d)
d = 0.4
theta = 75
import sys
from graph_tool.all import *
from pylab import *
import numpy as np
import numpy.random as nprd
from IPython.display import clear_output
nprd.seed(42)                          # global RNG stream: edge generation
community_rand = nprd.RandomState(50)  # separate RNG stream: community assignment
graph = Graph(directed = False)        # undirected graph (graph-tool)
vertex_community_map = graph.new_vertex_property("int")  # vertex -> ground-truth community id
num_community = 0                      # number of communities created so far
nodes_in_community = []                # per community: list of member node indices
edge_ends_in_community = []            # per community: multiset of intra-edge endpoints
                                       # (sampling from it implements preferential attachment)
# Generate the graph: nodes arrive one at a time; each new node is assigned a
# community via a Pitman-Yor-style process, then wires intra-community edges
# (mostly via preferential attachment) and occasional inter-community edges.
for n in range(num_nodes): # n is the index of current node, and also the number of existing nodes
    clear_output()
    print n + 1, "/", num_nodes
    vertex = graph.add_vertex()
    assert(n == graph.vertex_index[vertex])
    # Assign community.
    # New community with probability (theta + d*K) / (n + theta): the
    # Pitman-Yor new-table probability for the (n+1)-th customer.
    if community_rand.rand() <= (theta + d * num_community) / (n + theta):
        # create a new community
        community = num_community
        num_community += 1
        nodes_in_community.append([n])
        edge_ends_in_community.append([])
    else:
        # choose an existing community
        # by giving equal weight to the decision each previous node made
        # NOTE(review): this picks community k with probability proportional to
        # its size n_k, not the exact Pitman-Yor weight (n_k - d); presumably an
        # intentional approximation -- confirm.
        community = vertex_community_map[graph.vertex(community_rand.randint(n))]
        nodes_in_community[community].append(n)
    vertex_community_map[vertex] = community
    # Create intra-community edges
    community_size = len(nodes_in_community[community]); assert(community_size > 0)
    if community_size == 1:
        # First node in community, no one to connect to
        pass
    elif community_size == 2:
        # Second node in community, must connect to first one
        first_node = nodes_in_community[community][0]
        for i in range(num_intra_edges):
            e = graph.add_edge(n, first_node)
        # Record both endpoints so later arrivals can preferentially attach to them
        edge_ends_in_community[community].extend([n] * num_intra_edges)
        edge_ends_in_community[community].extend([first_node] * num_intra_edges)
    else:
        l = len(edge_ends_in_community[community])
        for i in range(num_intra_edges):
            if rand() < rand_intra_edge_prob: # Picking a node within the community uniformly at random with probability rand_intra_edge_prob
                # community_size - 1 excludes the current node n itself,
                # which is the last element of the member list
                selected_index = nprd.randint(community_size - 1)
                selected = nodes_in_community[community][selected_index]
            else: # Otherwise, do preferential attachment
                # Sample an endpoint of an existing intra-community edge: a node is
                # chosen with probability proportional to its intra-community degree
                selected_index = nprd.randint(l)
                selected = edge_ends_in_community[community][selected_index]
            e = graph.add_edge(n, selected)
            edge_ends_in_community[community].extend([n, selected])
    # Create inter-community edges
    # Candidate endpoints: all existing nodes outside the current community
    outside_nodes = np.setdiff1d(np.arange(n), nodes_in_community[community])
    def create_inter_edge():
        # Connect n to a uniformly random node outside its community; both
        # endpoints are recorded for preferential attachment in their communities
        selected_index = nprd.randint(len(outside_nodes))
        selected = outside_nodes[selected_index]
        e = graph.add_edge(n, selected)
        edge_ends_in_community[community].append(n)
        edge_ends_in_community[vertex_community_map[graph.vertex(selected)]].append(selected)
    if len(outside_nodes) > 0:
        if community_size == 1:
            # The founder of a new community always gets one edge to the rest of the graph
            create_inter_edge()
        else:
            for i in range(num_inter_edges):
                if rand() < inter_edge_prob:
                    create_inter_edge()
print "Graph generated."
5000 / 5000 Graph generated.
The Pitman-Yor Process is specified as follows:
$$P(c_n=k)=\frac{n_k-d}{n-1+\theta} \quad (1 \leq k \leq K)$$$$P(c_n=K+1)=\frac{\theta+d \cdot K}{n-1+\theta}$$where $c_n$ is the table that the $n$th customer sits at; the existing tables are numbered $k=1,\dots,K$; table $k$ has $n_k$ customers.
$0 \leq d < 1$ is the discount parameter, $\theta > -d$ is the strength parameter.
# Report the global clustering coefficient (transitivity) and its standard deviation
print "Global clustering coefficient and standard deviation: "
print global_clustering(graph)
Global clustering coefficient and standard deviation: (0.11773113668848266, 0.004855790389072158)
print graph_tool.community.modularity(graph, vertex_community_map)
0.889661848309
def plot_basics(data, data_inst, fig, units):
    """Plot one column of the distribution-comparison figure for `data`.

    Draws three stacked panels in column `data_inst` of `fig` (the grid is
    n_graphs x n_data subplots; both counts are read from module globals):
      1. empirical PDF with linearly- and logarithmically-binned estimates,
         plus an inset histogram of the raw data;
      2. power-law vs. exponential fits overlaid on the data PDF;
      3. power-law vs. log-normal fits overlaid on the data PDF.

    data      -- 1-D numpy array of observations (zeros are dropped for PDFs)
    data_inst -- 1-based column index; column 1 also gets the row annotations
    fig       -- matplotlib figure to draw into
    units     -- x-axis label for the bottom panel
    """
    from powerlaw import plot_pdf, Fit, pdf
    annotate_coord = (-.5, 0.4)
    # Panel 1: empirical PDF (linear bins thin red, log bins thick blue)
    ax1 = fig.add_subplot(n_graphs,n_data,data_inst)
    plot_pdf(data[data>0], ax=ax1, linear_bins=True, color='r', linewidth=.5)
    x, y = pdf(data, linear_bins=True)
    ind = y>0         # keep only non-empty bins
    y = y[ind]
    x = x[:-1]        # pdf() returns bin edges: drop the last edge to align with y
    x = x[ind]
    ax1.scatter(x, y, color='r', s=.5)
    plot_pdf(data[data>0], ax=ax1, color='b', linewidth=2)
    from pylab import setp
    setp( ax1.get_xticklabels(), visible=False)
    ax1.set_yticks(ax1.get_yticks()[::2])  # thin out the y ticks
    locs,labels = yticks()
    if data_inst==1:
        ax1.annotate("PDF &\nData Points", annotate_coord, xycoords="axes fraction", fontsize=10)
    # Inset histogram of the raw data.
    # NOTE(review): mpl_toolkits.axes_grid and hist(normed=...) are deprecated in
    # newer matplotlib (axes_grid1 / density=) -- kept as-is for this environment.
    from mpl_toolkits.axes_grid.inset_locator import inset_axes
    ax1in = inset_axes(ax1, width = "30%", height = "30%", loc=3)
    ax1in.hist(data, normed=True, color='b')
    ax1in.set_xticks([])
    ax1in.set_yticks([])
    # Panel 2: power-law (green) vs. exponential (dashed red) fits
    ax2 = fig.add_subplot(n_graphs,n_data,n_data+data_inst, sharex=ax1, sharey=ax1)
    fit = Fit(data, discrete=True)
    fit.power_law.plot_pdf(ax=ax2, color='g', linewidth = 2)
    fit.exponential.plot_pdf(ax=ax2, linestyle='--', color='r', linewidth = 2)
    fit.plot_pdf(ax=ax2, color='b', linewidth=2)
    setp( ax2.get_xticklabels(), visible=False)
    ax2.set_yticks(ax2.get_yticks()[::2])
    ax2.set_xlim(ax1.get_xlim())
    if data_inst==1:
        ax2.annotate("Power-law &\nExponential", annotate_coord, xycoords="axes fraction", fontsize=10)
    # Panel 3: power-law (green) vs. log-normal (dashed red) fits
    ax3 = fig.add_subplot(n_graphs,n_data,n_data*2+data_inst, sharex=ax1, sharey=ax1)
    fit.power_law.plot_pdf(ax=ax3, color='g', linewidth = 2)
    fit.lognormal.plot_pdf(ax=ax3, linestyle='--', color='r', linewidth = 2)
    fit.plot_pdf(ax=ax3, color='b', linewidth=2)
    ax3.set_ylim(ax2.get_ylim())
    ax3.set_yticks(ax3.get_yticks()[::2])
    ax3.set_xlim(ax1.get_xlim())
    if data_inst==1:
        ax3.annotate("Power-law &\nLog-normal", annotate_coord, xycoords="axes fraction", fontsize=10)
    ax3.set_xlabel(units)
# Fit discrete power laws to the community-size and degree distributions and
# compare them against exponential and log-normal alternatives.
import powerlaw
community_sizes = np.array([len(c) for c in nodes_in_community])
community_fit = powerlaw.Fit(community_sizes, discrete=True)
degrees = np.array([v.out_degree() for v in graph.vertices()])
degree_fit = powerlaw.Fit(degrees, discrete=True)
clear_output()
print "Community Size Distribution"
print "----------------------------------------"
print "Number of communities:\t", len(community_sizes)
print "Power law parameters:\t", "alpha=", community_fit.power_law.alpha, "; sigma=", community_fit.power_law.sigma
# distribution_compare returns (loglikelihood ratio R, p-value); R > 0 favors the power law
print "Comparing to exponential:\t", community_fit.distribution_compare('power_law', 'exponential')
print "Comparing to log-normal:\t", community_fit.distribution_compare('power_law', 'lognormal')
print
print "Degree Distribution"
print "----------------------------------------"
print "Power law parameters:\t", "alpha=", degree_fit.power_law.alpha, "; sigma=", degree_fit.power_law.sigma
print "Comparing to exponential:\t", degree_fit.distribution_compare('power_law', 'exponential')
print "Comparing to log-normal:\t", degree_fit.distribution_compare('power_law', 'lognormal')
print
print
# Figure layout consumed by plot_basics via globals: 4 rows x 2 columns
n_data = 2
n_graphs = 4
f = figure(figsize=(10,15))
plot_basics(community_sizes, 1, f, 'Community Size')
plot_basics(degrees, 2, f, 'Degree')
Community Size Distribution ---------------------------------------- Number of communities: 811 Power law parameters: alpha= 2.26432809608 ; sigma= 0.0900796470622 Comparing to exponential: (11.62515047665813, 0.15365083113030445) Comparing to log-normal: (-4.355554952279749, 0.046621573334856725) Degree Distribution ---------------------------------------- Power law parameters: alpha= 2.60154597327 ; sigma= 0.0314634037506 Comparing to exponential: (95.349869616324114, 0.00069530208031327704) Comparing to log-normal: (-39.957868865284446, 2.2560161810007961e-09) Calculating best minimal value for power law fit Calculating best minimal value for power law fit
# Color each intra-community edge by its community id; inter-community edges
# keep the property's default value 0.
# NOTE(review): 0 is also a valid community id, so inter-community edges are
# indistinguishable from community 0's intra edges in the plot -- confirm intended.
edge_community_map = graph.new_edge_property("int")
for e in graph.edges():
    c1 = vertex_community_map[e.source()]
    c2 = vertex_community_map[e.target()]
    if c1 == c2:
        edge_community_map[e] = c1
# Degree is used to determine the size of vertex on plot
deg = graph.degree_property_map("out")
deg.a = 1.5*(sqrt(deg.a) * 0.5 + 0.4)
# Draw the graph colored by ground-truth community and save to PNG
graph_draw(graph,
           vertex_size = deg,
           vertex_color=vertex_community_map, vertex_fill_color=vertex_community_map,
           edge_color = edge_community_map, edge_pen_width=0.2,
           output_size=(800, 800), output="rose_graph.png")
from IPython.core.display import Image
Image(filename='rose_graph.png')
For details:
(Note that the results of this algorithm appear to be quite unstable from run to run; setting a random seed does not make them reproducible.)
# Stochastic-blockmodel community detection by minimizing description length.
# (Results vary run to run; graph-tool's internal RNG is not seeded here.)
vertex_block_map, min_dl, b_cache = minimize_blockmodel_dl(graph)
print "Number of detected communities:", len(unique(vertex_block_map.get_array()))
print "Minimum description length:", min_dl
Number of detected communities: 34 Minimum description length: -0.902014471564
print "Newman's modularity:", graph_tool.community.modularity(graph, vertex_block_map)
Newman's modularity: 0.801219781965
from sklearn import metrics
community_labels = [vertex_community_map[v] for v in graph.vertices()]
block_labels = [vertex_block_map[v] for v in graph.vertices()]
random_labels = [nprd.randint(num_community) for v in graph.vertices()]
print "Adjusted Rand index:\t", metrics.adjusted_rand_score(community_labels, block_labels), "(", "balseline:", metrics.adjusted_rand_score(community_labels, random_labels), ")"
print "Adjusted Mutual Information:\t", metrics.adjusted_mutual_info_score(community_labels, block_labels), "(", "balseline:", metrics.adjusted_mutual_info_score(community_labels, random_labels), ")"
Adjusted Rand index: 0.248678734658 ( balseline: 0.00020345520586 ) Adjusted Mutual Information: 0.454991559779 ( balseline: 0.00067834321406 )
# Color intra-block edges by block id (inter-block edges keep the default 0)
# and draw the graph colored by the detected blockmodel partition.
edge_block_map = graph.new_edge_property("int")
for e in graph.edges():
    c1 = vertex_block_map[e.source()]
    c2 = vertex_block_map[e.target()]
    if c1 == c2:
        edge_block_map[e] = c1
graph_draw(graph,
           vertex_size = deg,
           vertex_color=vertex_block_map, vertex_fill_color=vertex_block_map,
           edge_color = edge_block_map, edge_pen_width=0.2,
           output_size=(800, 800), output="rose_graph_block.png")
Image(filename='rose_graph_block.png')
# Potts-model (spin-glass) community detection: ask for as many communities
# as there are ground-truth communities with more than 2 members.
num_community_to_detect = len([s for s in community_sizes if s > 2])
print "Trying to detect ", num_community_to_detect, " communities..."
# 20000 presumably is the number of spin-update iterations -- TODO confirm against graph-tool docs
vertex_spin_map = community_structure(graph, 20000, num_community_to_detect)
print "Done."
Trying to detect 402 communities... Done.
print "Newman's modularity:", graph_tool.community.modularity(graph, vertex_spin_map)
Newman's modularity: 0.913561319909
from sklearn import metrics
community_labels = [vertex_spin_map[v] for v in graph.vertices()]
spin_labels = [vertex_block_map[v] for v in graph.vertices()]
random_labels = [nprd.randint(num_community) for v in graph.vertices()]
print "Adjusted Rand index:\t", metrics.adjusted_rand_score(community_labels, spin_labels), "(", "balseline:", metrics.adjusted_rand_score(community_labels, random_labels), ")"
print "Adjusted Mutual Information:\t", metrics.adjusted_mutual_info_score(community_labels, spin_labels), "(", "balseline:", metrics.adjusted_mutual_info_score(community_labels, random_labels), ")"
Adjusted Rand index: 0.250715435259 ( balseline: 4.70750289877e-06 ) Adjusted Mutual Information: 0.476804167707 ( balseline: 0.0021941470908 )
# Color intra-community edges of the Potts partition (inter edges keep the
# default 0) and draw the graph colored by that partition.
edge_spin_map = graph.new_edge_property("int")
for e in graph.edges():
    c1 = vertex_spin_map[e.source()]
    c2 = vertex_spin_map[e.target()]
    if c1 == c2:
        edge_spin_map[e] = c1
graph_draw(graph,
           vertex_size = deg,
           vertex_color=vertex_spin_map, vertex_fill_color=vertex_spin_map,
           edge_color = edge_spin_map, edge_pen_width=0.2,
           output_size=(800, 800), output="rose_graph_potts.png")
Image(filename='rose_graph_potts.png')