Consider a graph represented with adjacency matrix $A$ where its nodes . Here, $A$ is a $d\times d$ matrix where $a_{ij}$ represents the association between node $i$ and $j$, e.g. the existance of an edge. And $U$ is a $d \times k$ matrix, where $u_{ik}$ denotes the memberships of node $i$ in the $k^{th}$ cluster of $U$, for example it is $1$ if node $i$ belongs to cluster $k$ in $U$ and $0$ otherwise. With this representation, a simple matrix outer product and transepose can give us interesting information. More specifically, two matrices of $$ M= (UU^T)_{d\times d}\quad \quad N = (U^TU)_{k \times k}$$ denote respectively the co-memberships of pairs of nodes in clusters of $U$, and overlaps of pairs of clusters in $U$. So that $M_{ij}$ denotes in how many clusters node $i$ and $j$ appeared together. And $N_{ij}$ denotes how many nodes clusters $i$ and $j$ have in common.
We can also compute $A A^T$ or $A^TA$, which compute the number of common (in/out-)neighbors for every pair of nodes in A: a.k.a. co-citaion and biblographic coupling matrices in case of directed graphs. For undirected graphs, $A$ is symmetric and $AA^T = A^TA = A^2$.
We use these to define network clustering distances. But first lets visualize a clustered network for having illustrated examples.
import math
import logging
import numpy as np
import numpy.linalg as LA
import networkx as nx
import igraph as ig
import networkx.linalg.graphmatrix as gm
from sympy.printing import latex
from sympy.matrices import *
import sympy as sy
import matplotlib
import matplotlib.pyplot as plt
from pylab import *
FORMAT = "[%(lineno)s:%(funcName)20s()]\n %(message)s"
logging.basicConfig(format=FORMAT, level=logging.ERROR)
init_printing()
%matplotlib inline
def printlatex(A):
print latex(Matrix(A),mode='inline')
def draw_clustered_graph(G,U, pos, draw_edge_wights = False, draw_graph = True):
"""
Visualizes the given clustering of a graph.
Parameters
----------
G: A networkx graph
U: numpy matrix
A nxk matrix representing a clustering, where n is the number of data points and k is the number of clusters in U,
so that U_{ik} shows if the ith data-point belongs to the kth cluster in U.
pos: A Python dictionary (optional)
A dictionary of positions for graph nodes keyed by node
Returns
-------
None
Examples
-------
>>> import numpy as np
>>> import networkx as nx
>>> from pylab import *
>>> np.matrix( [[0,1,1,0,0,0,0,0,0,0],
[1,0,0,1,0,0,0,0,0,0],
[1,0,0,1,0,0,0,1,0,0],
[0,1,1,0,1,1,0,0,0,0],
[0,0,0,1,0,1,1,0,0,0],
[0,0,0,1,1,0,1,0,0,0],
[0,0,0,0,1,1,0,0,0,0],
[0,0,1,0,0,0,0,0,1,1],
[0,0,0,0,0,0,0,1,0,1],
[0,0,0,0,0,0,0,1,1,0]])
>>> G = nx.from_numpy_matrix(A)
>>> U = np.matrix([[1,0],[1,0],[1,0],[0,1],[0,1],[0,1],[0,1],[1,0],[1,0],[1,0]])
>>> figure(figsize=(10, 4))
>>> draw_clustered_graph(G,U, nx.spring_layout(G))
>>> show()
"""
color_list = ['b','g','r','m','y','c']
color_list =['#9999FF', '#FF9999', '#66FF99','#FF99FF', '#FFFF99', '#E0E0E0','#FF9933','#FF3333','#33FF99','#33FFFF','#3333FF']
if pos==None:
pos = nx.spring_layout(G)#, nlist, dim, scale)spring_layout(G)
print '----- positions to repeat the layout:: \n ',pos
maxW = 0.0
for (i,j, w) in G.edges(data=True):
maxW = max(maxW,w['weight'] )
if U is not None:
UG = nx.from_numpy_matrix( U.dot(U.T))
maxU = U.max()
# draw clusters
t =6.2
for c in range(U.shape[1]):
nsizes = [ int(t**4 * U[i,c]/maxU) for i in range(U.shape[0])]
esizes = [ ((U[i,c]*U[j,c])*t**2.0/maxU**2) for (i,j, w) in UG.edges(data=True)]
try:
nx.draw(UG, pos = pos, node_size=nsizes, style ='solid' ,\
#labels ='.',\
node_color = color_list[c],edge_color =color_list[c],\
linewidths=0, width= esizes, alpha =.5)#, alpha =1
except:
pass
# draw graph
# if draw_graph:
try:
f= lambda w: np.log(w/maxW +1)*w*3/maxW +1 if draw_graph else 0
nx.draw(G, pos = pos, node_color = 'w', alpha =1 , linewidths=1, width= [f(w['weight']) for (i,j, w) in G.edges(data=True)])
except:
pass
if draw_edge_wights:
edge_lbls = dict([((u,v,),d['weight']) for u,v,d in G.edges(data=True)])
nx.draw_networkx_edge_labels(G, pos = pos, label_pos=0.5 ,edge_labels =edge_lbls)
return pos
A = np.array([[0,1,1,0,0,0,0,0,0,0],
[1,0,0,1,0,0,0,0,0,0],
[1,0,0,1,0,0,0,1,0,0],
[0,1,1,0,1,1,0,0,0,0],
[0,0,0,1,0,1,1,0,0,0],
[0,0,0,1,1,0,1,0,0,0],
[0,0,0,0,1,1,0,0,0,0],
[0,0,1,0,0,0,0,0,1,1],
[0,0,0,0,0,0,0,1,0,1],
[0,0,0,0,0,0,0,1,1,0]])
G = nx.from_numpy_matrix(A)
U = np.array([[1,0],[1,0],[1,0],[0,1],[0,1],[0,1],[0,1],[1,0],[1,0],[1,0]])
figure(figsize=(4, 2))
fix_pos = draw_clustered_graph(G,U, nx.spring_layout(G))
show()
Consider two alternative clustering of $d$ data-points, called $U$ and $V$. Where, $U$ is a $d \times k$ matrix, where $u_{ik}$ denotes the memberships of node $i$ in the $k^{th}$ cluster of $U$, for example it is $1$ if node $i$ belongs to cluster $k$ in $U$ and $0$ otherwise. With this representation, a simple matrix outer product and transepose can give us interesting information. More specifically, two matrices of $ M= (UU^T)_{d\times d}$ and $ N = (U^TU)_{k \times k}$ denote respectively the co-memberships of pairs of nodes in clusters of $U$, and overlaps of pairs of clusters in $U$. In other words, $M_{ij}$ denotes in how many clusters node $i$ and $j$ appeared together; and $N_{ij}$ denotes how many nodes clusters $i$ and $j$ have in common.
Now for computing the overlaps between two different clustering $U$ and $V$, we can simply compute: $N= (U^TV)_{k \times r} $ which we write $N$ for short. The original rand index, is then defined in terms of this contingency matrix. More specifically assume $\varphi(x)= x(x-1)/2$, then consider these four terms: $$\sum_{ij} \varphi(n_{ij}) = sum(\varphi(N)) = \mathbb{1}_{1\times k} \times \varphi(N) \times \mathbb{1}_{r \times 1} ,\; and \quad \sum_{i} \varphi(\sum_{j} n_{ij}) = sum(\varphi(sum(N,1))) = \mathbb{1}_{1\times k} \times \varphi(N \times \mathbb{1}_{r \times 1} )$$ $$\sum_{j} \varphi(\sum_{i} n_{ij}) = sum(\varphi(sum(N,0))) = \varphi(\mathbb{1}_{1\times k} \times N) \times \mathbb{1}_{r \times 1} ,\; and \quad \varphi(\sum_{ij} n_{ij}) = \varphi(sum(N)) = \varphi(\mathbb{1}_{1\times k} \times N \times \mathbb{1}_{r \times 1}) $$ where $\mathbb{1}$ is a identity vector with appropiate shape, written afterward simply as $\mathbb{1}$. The rand index which is originally defined as: $RI =1 + [2\sum_{i=1}^k\sum_{j=1}^r n_{ij}(n_{ij}-1) - \sum_{i=1}^k n_{i.}(n_{i.}-1)- \sum_{j=1}^r n_{.j}(n_{.j}-1))]/[n(n-1)]$; can be re-written as: $RI =1 + [\mathbb{1} \varphi(N) \mathbb{1} - \mathbb{1} \varphi(N \mathbb{1} ) ]/[\varphi(\mathbb{1} N \mathbb{1}) ] + [\mathbb{1} \varphi(N) \mathbb{1} - \varphi(\mathbb{1} N) \mathbb{1} ]/[\varphi(\mathbb{1} N \mathbb{1}) ] $. We can similarly derive the adjusted for chance version of rand index. The generalized forms are: $$ \mathcal{G}_{\varphi} = \frac{\mathbf{1} \varphi(N) \mathbf{1}^T - \mathbf{1} \varphi(N \mathbf{1}^T ) } {\varphi(\mathbf{1} N \mathbf{1}^T) } + \frac{\mathbf{1} \varphi(N) \mathbf{1}^T - \varphi(\mathbf{1} N) \mathbf{1}^T } {\varphi(\mathbf{1} N \mathbf{1}^T) } $$$$ A\mathcal{G}_{\varphi} =\frac{\mathbf{1} \varphi(N) \mathbf{1}^T - E}{M-E} ,\quad M = \frac{ \mathbf{1} \varphi(N \mathbf{1}^T ) + \varphi(\mathbf{1} N) \mathbf{1}^T }{2 } ,\; E = \frac{ (\mathbf{1} \varphi(N \mathbf{1}^T )) ( \varphi(\mathbf{1} N ) \mathbf{1}^T )}{\varphi(\mathbf{1} N \mathbf{1}^T) } $$
def agreement_from_overlaps(U,V, f = lambda x: x*(x-1)*0.5, exact=False):
"""
Computes the agreement between two disjoint clusterings based on the overlap of their clusters.
Parameters
----------
U: numpy matrix
A nxk matrix representing a clustering,
where n is the number of data points and k is the number of clusters in U,
so that U_{ik} shows if the ith data-point belongs to the kth cluster in U.
V: numpy matrix
A nxr matrix representing a clustering similar to U
f: function
A Python function (non-linear) to apply on the overlap sizes to compute the divergence,
f = lambda x: x*(x-1)*0.5 (default) will result in Rand Index and Adjusted Rand Index,
f = lambda x: 0 if x==0 else (x * math.log(x)) derives normalized VI.
Returns
-------
G: float
agreement between input clusterings from
$$\GAM_{\varphi} = \frac{\mathbf{1} \varphi(N) \mathbf{1}^T - \mathbf{1} \varphi(N \mathbf{1}^T ) } {\varphi(\mathbf{1} N \mathbf{1}^T) } + \frac{\mathbf{1} \varphi(N) \mathbf{1}^T - \varphi(\mathbf{1} N) \mathbf{1}^T } {\varphi(\mathbf{1} N \mathbf{1}^T) } $$
AG: float
agreement between input clusterings from
$$\AG_{\varphi} =\frac{\mathbf{1} \varphi(N) \mathbf{1}^T - E}{M-E} ,\quad M = \frac{ \mathbf{1} \varphi(N \mathbf{1}^T ) + \varphi(\mathbf{1} N) \mathbf{1}^T }{2 } ,\; E = \frac{ (\mathbf{1} \varphi(N \mathbf{1}^T )) ( \varphi(\mathbf{1} N ) \mathbf{1}^T )}{\varphi(\mathbf{1} N \mathbf{1}^T) } $$
"""
return 1- distance_from_overlaps(U, V, f, exact)
def distance_from_overlaps(U,V, f = lambda x: x*(x-1)*0.5, exact=False):
f = np.vectorize(f, otypes=[np.float])
N = np.dot(U.T,V) # the overlaps, a.k.a contingency table
b2 = np.ones((N.shape[1],1),dtype='float')
b1 = np.ones((1, N.shape[0]),dtype='float')
m1=b1.dot(N)
m2=N.dot(b2)
m = b1.dot(N).dot(b2)
s1 = (b1.dot(f(m2)))
s2 = (f(m1).dot(b2))
n = f(m)
I = (b1.dot(f(N)).dot(b2))
# or
# s1 = f(N.sum(axis=1)).sum()
# s2 = f(N.sum(axis=0)).sum()
# n = f(N.sum())
# I = f(N).sum()
G = float ((s1+s2- 2* I )/( n ) )
if exact:
E = (b1.dot(f(m2).dot(f(m1))/f(m))).dot(b2)
else:
E = (b1.dot(f(m2.dot(m1)/m))).dot(b2)
AG = float ((s1+s2- 2* I )/( (s1+s2) - 2 *E ))
return np.array([G, AG])
V = np.array([[1,0],[1,0],[1,0],[0,1],[0,1],[0,1],[0,1],[1,0],[1,0],[1,0]])
U = np.array([[1,0,0],[1,0,0],[1,0,0],[1,0,0],[0,1,0],[0,1,0],[0,1,0],[0,0,1],[0,0,1],[0,0,1]])
figure(figsize=(9, 9))
nr = 2
nf = 2
subplot(nr,nf,1)#.set_title('$U$')
draw_clustered_graph(G,V, fix_pos)
subplot(nr,nf,2)#.set_title('$V1$')
draw_clustered_graph(G,U, fix_pos)
print 'RI = %0.3f, ARI = %0.3f'% tuple(agreement_from_overlaps(U,V, exact=True))
print 'VI = %0.3f, NMI (_sum, VM in sklearn) = %0.3f'% tuple(agreement_from_overlaps(U,V, f = lambda x: 0 if x==0 else x*log(x), exact=False))
print 'RI\' = %0.3f, ARI\' = %0.3f'% tuple(agreement_from_overlaps(U,V,f = lambda x: x**2, exact=False))
RI = 0.667, ARI = 0.312 VI = 0.624, NMI (_sum, VM in sklearn) = 0.509 RI' = 0.700, ARI' = 0.408
def convert_cluster_to_labels(U):
return np.nonzero(U)[1]
def sklearn_measures(U, V):
# http://scikit-learn.org/stable/modules/classes.html#clustering-metrics
import sklearn.metrics.cluster as sym
U_labels = np.nonzero(U)[1]
V_labels = np.nonzero(V)[1]
# print U_labels, V_labels
print 'From sklean:: ARI = %0.3f'% sym.adjusted_rand_score(U_labels, V_labels),
print 'NMI = %0.3f'% sym.normalized_mutual_info_score(U_labels, V_labels),
print 'AMI = %0.3f'% sym.adjusted_mutual_info_score(U_labels, V_labels),
print 'VM = %0.3f'% sym.v_measure_score(U_labels, V_labels)
sklearn_measures(U,V)
From sklean:: ARI = 0.312 NMI = 0.523 AMI = 0.327 VM = 0.509
We can also derive NMI directly from our matrix representation
def nmi(U,V):
N = np.dot(U.T,V) # the overlaps, a.k.a contingency table
b2 = np.ones((N.shape[1],1),dtype='float')
b1 = np.ones((1, N.shape[0]),dtype='float')
n = b1.dot(N).dot(b2)
nv= b1.dot(N)
nu= N.dot(b2)
f = np.vectorize(lambda x: x*math.log(x) if x >0 else 0, otypes=[np.float])
Hu = -(b1.dot(f(nu/n)))
Hv = -(f(nv/n).dot(b2))
Huv = -(b1.dot(f(N/n)).dot(b2))
Iuv = Hu+Hv-Huv
vi = float((2*Huv-(Hu+Hv) )/math.log(n))
nmi_sum = float(2*Iuv/(Hu+Hv)) # VM in sklearn
nmi_sqrt = float(Iuv/math.sqrt(Hu*Hv)) # NMI in sklearn
# 1-VI to make it into a agreement measure
return np.array([1-vi, nmi_sum, nmi_sqrt])
print '1-VI = %0.3f, NMI_sum = %0.3f, NMI_sqrt = %0.3f'% tuple(nmi(U,V))
1-VI = 0.624, NMI_sum = 0.509, NMI_sqrt = 0.523
The two square matrices of $UU^T$ and $VV^T$ show the co-membership of pair of nodes in clusterings $U$ and $V$ respectively. Therefore the difference of these two will give us the disagreement between $U$ and $V$, i.e. $\Delta = UU^T - VV^T $. We can show that the $RI$ and $ARI$, can be derived from $\Delta$ as: $$(A)RI = 1 - \frac{|| UU^T - VV^T ||^2_F}{MAX}\quad where$$ $$ MAX_{RI}= max(max(UU^T),max(VV^T)) n^2 \;\quad and \quad\; MAX_{ARI}=||UU^T||^2_F+||VV^T||^2_F-2 |UU^T||VV^T| / (n^2)$$
def agreement_from_co_memberships(U, V, same_nodes = True):
return 1-distance_from_co_memberships(U, V, same_nodes)
def distance_from_co_memberships(U, V, same_nodes = True):
"""
Computes the agreement between two clusterings based on the co_memberships of data points in their clusters.
Parameters
----------
U: Matrix
A nxk matrix representing a clustering,
where upper is the number of data points and k is the number of clusters in U,
so that U_{ik} shows if the ith data-point belongs to the kth cluster in U.
V: Matrix
A nxr matrix representing a clustering similar to U
same_nodes: Boolean
determines whether the formula should be exact same as the original RI formula or the approximate forms
Returns
-------
G: float
agreement between input clusterings from
$$\GAM_{\varphi} = 1 - \frac{|| UU^T - VV^T ||}{MAX}$$
where MAX is $$MAX= \mathfrak{m} upper^2$$ and $$ \mathfrak{m} = max(max(UU^T),max(VV^T))$$
AG: float
agreement between input clusterings from same formula as G but where MAX is
$$MAX=||UU^T||+||VV^T||-2 ||UU^T||||VV^T|| / (\mathfrak{m} upper^2)$$.
"""
CU = U.dot(U.T).astype('float')
CV = V.dot(V.T).astype('float')
#print CU
sf = lambda A: np.multiply(A, A).sum() #squared frob norm
norm = sf
if not same_nodes:
np.fill_diagonal(CU,0)
np.fill_diagonal(CV,0)
pairCount = CU.shape[0]**2 - (0 if same_nodes else CU.shape[0])
sUV = norm( CU-CV )
sU = norm(CU)
sV = norm(CV)
m = max(np.max(CU),np.max(CV))
GI = sUV /(pairCount*m**2)
AGI = sUV / ((sU+sV) - 2*(np.sum(CU)*np.sum(CV))/(pairCount))
return np.array([GI, AGI])
print 'RI = %0.3f, ARI = %0.3f'% tuple(agreement_from_co_memberships(U,V, False))
print 'RI\' = %0.3f, ARI\' = %0.3f'% tuple(agreement_from_co_memberships(U,V, True))
RI = 0.667, ARI = 0.312 RI' = 0.700, ARI' = 0.408
Collins and Dent (1988) proposed the Omega index as a generalization of the (adjusted) rand index for overlapping clusters with crisp memberships. The Omega Index($\omega$) derives from our formulation if we define $\Delta = [UU^T == VV^T]$. Since in $\omega$, each pair of datapoints is considered as an agreement if they appear in exactly the same number of clusters in the two clusterings, i.e. $\Delta_{ij} =1$ if $(UU^T)_{ij} == (VV^T)_{ij}$ and zero otherwise.
def agreement_from_omega(U, V, same_nodes = False):
CU = U.dot(U.T).astype('float')
CV = V.dot(V.T).astype('float')
if not same_nodes:
np.fill_diagonal(CU,0)
np.fill_diagonal(CV,0)
pairCount = CU.shape[0]**2 - (0 if same_nodes else CU.shape[0])
pair_counts = np.zeros((int(np.max(CU))+1,int(np.max(CV))+1))
for i in range(0, int(np.max(CU)+1)):
for j in range(0, int(np.max(CV)+1)):
tmp = (CU==i)*(CV==j)
tmp = np.tril(tmp) if same_nodes else np.tril(tmp,-1)
pair_counts[i,j] = np.sum(tmp)
w= np.array( CU==CV, dtype='float')
w = np.sum(w)- w.trace()
w = w*1.0/ pairCount
tU = np.histogram(CU, bins=np.max(CU)+1)[0]
tV = np.histogram(CV, bins=np.max(CV)+1)[0]
tU[0]= tU[0] - (0 if same_nodes else CU.shape[0])
tV[0]= tV[0] - (0 if same_nodes else CU.shape[0])
m = min(tU.shape[0], tV.shape[0])
ew = [tU[j]*tV[j] for j in range(0 ,m)]
ew = np.sum(ew)*1.0 / (pairCount**2)
aw = (w-ew)/(1-ew)
return np.array([w, aw])
print 'w = %0.3f, Aw = %0.3f'% tuple(agreement_from_omega(U,V, False))
w = 0.667, Aw = 0.312
def agreement_from_matrix_norms(U,V, norm = lambda A: math.sqrt(np.multiply(A, A).sum()) ):
return 1-distance_from_matrix_norms(U, V, norm)
def distance_from_matrix_norms(U,V, norm = lambda A: math.sqrt(np.multiply(A, A).sum()) ):
# L21_norm = lambda A: np.sqrt(np.multiply(A, A).sum(axis=1)).sum()
# frobenius_norm = lambda A: math.sqrt(np.multiply(A, A).sum())
# log_norm = lambda A: np.multiply(A, np.log(A.clip(1)).sum(axis=1)).sum()
CU = U.dot(U.T).astype('float')
CV = V.dot(V.T).astype('float')
return np.array([norm(CU-CV)/(norm(CU)+norm(CV)) ])
print 'I_norm = %0.3f'% tuple(agreement_from_matrix_norms(U,V))
I_norm = 0.580
def distance_alternative_forms(U,V):
return 1-agreement_alternative_forms(U, V)
def agreement_alternative_forms(U, V):
CU = U.dot(U.T).astype('float')
CV = V.dot(V.T).astype('float')
CU2 = CU.dot(CU)
CV2 = CV.dot(CV)
Q = CU.dot(CV)
# Cauchy-Schwarz inequality ==> alt1<1
alt1 = float(Q.trace()) / math.sqrt( (CU2).trace() * (CV2).trace() )
alt2 = float(Q.trace() / (CU.trace() * CV.trace()))
return np.array([alt1, alt2])
print 'I_sqrt(tr) = %0.3f, I_tr = %0.3f'% tuple(agreement_alternative_forms(U,V))
I_sqrt(tr) = 0.666, I_tr = 0.280
def get_all_overlapping_agreement_variations(U, V):
dists= get_all_overlapping_distance_variations(U,V)
return [dists[0], 1-dists[1]]
def get_all_overlapping_distance_variations(U, V):
res= ["RI","ARI","RI'","ARI'", "nF", "sqtr","tr" ] ,\
np.hstack((distance_from_co_memberships(U,V, False),\
distance_from_co_memberships(U,V),\
distance_from_matrix_norms(U,V) ,\
distance_alternative_forms(U, V)
))
return res
from tabulate import tabulate
Res = get_all_overlapping_agreement_variations(U, V)
headrs = np.hstack( ([" - "], Res[0]))
vals = [ tuple(["(V,U)"]+Res[1].tolist()) ]
vals = np.asarray(vals, dtype=dict(names = headrs, formats=["a18"]+["float32"]*len(Res[1]) ))
print tabulate(vals, headers = "keys", floatfmt=".3f")
- RI ARI RI' ARI' nF sqtr tr ----- ----- ----- ----- ------ ----- ------ ----- (V,U) 0.667 0.312 0.700 0.408 0.580 0.666 0.280
All the agreement measures presented so far only consider memberships of datapoints, and ignore any relations between them. Here we define structure dependent clustering distances which incorporate the underlying structure of the graph.
def get_incident_matrix(n, edges, directed =False):
N = np.zeros((n, len(edges)), dtype =np.float)
for j,e in enumerate(edges):
w = 1 if len(e)<3 else np.sqrt(e[2])
N[e[0]][j] = w
N[e[1]][j] = w * (-1 if directed else 1)
return N
def get_incident_matrix_from_graph(G, directed =False):
#N = gm.incidence_matrix(G)
edges = [(i,j, w['weight']) for (i,j, w) in G.edges(data=True)]
return get_incident_matrix(G.number_of_nodes(), edges,directed)
def get_incident_matrix_from_adjecency(A, directed =False):
edges = np.nonzero(A if directed else np.tril(A))
weighted_edges = np.vstack((edges, A[edges]))
return get_incident_matrix(A.shape[0], np.transpose(weighted_edges), directed)
def distance_with_structure(G,U):
N = get_incident_matrix_from_graph(G)
return get_all_distance_variations(N, U)
def agreement_with_structure(G,U):
dists= distance_with_structure(G,U)
return [dists[0], 1-dists[1]]
def agreement_based_on_structure(G, U, V, method = "trans"):
dists= distance_based_on_structure(G, U, V, method)
return [dists[0], 1-dists[1]]
def distance_based_on_structure(G, U, V, method = "trans"):
# A = gm.adj_matrix(G)
N = get_incident_matrix_from_graph(G)
# avg dis
if method=="trans":
return get_all_distance_variations(N.T.dot(U), N.T.dot(V))
else:
names, d = get_all_distance_variations(U,V)
d1 = get_all_distance_variations(U,N)[1]
d2 = get_all_distance_variations(V,N)[1]
if method == "sum":
alpha =0.5
return [d1[0], alpha*d+ (1-alpha)*np.absolute(d1-d2) ]
elif method == "prod":
return [d1[0], 1./3*(np.sqrt(d*d1)+np.sqrt(d*d2)+np.sqrt(d1*d2))]
return float( (U.dot(U.T).dot(N.dot(N.T)).dot(V.dot(V.T))).trace() /\
math.sqrt( (U.dot(U.T).dot(N.dot(N.T)).dot(U.dot(U.T))).trace() *\
(V.dot(V.T).dot(N.dot(N.T)).dot(V.dot(V.T))).trace() )\
)
def transformation():
figure(figsize=(10, 5))
A = np.zeros((9,9),dtype='float')
edges = [(0,1),(0,5),(0,6),
(1,0),(1,2),(1,5),
(2,1),(2,3),(2,5),
(3,2),(3,5),(3,4),
(4,3),(4,5),
(5,0),(5,1),(5,2),(5,3),(5,4),(5,6),(5,8),
(6,0),(6,5),(6,7),(6,8),
(7,6),(7,8),
(8,6),(8,7),(8,5),
]
for e in edges:
A[e]=1
fix_pos = {0: np.array([ 0.20385061-0.1, 0.23977043-0.1]),
1: np.array([ 0.43693314, 0. ]),
2: np.array([ 0.75900616, 0.01462725]),
3: np.array([ 0.97478493, 0.24577521-0.1]),
4: np.array([ 1. , 0.53455287-0.1-0.1]),
5: np.array([ 0.56256428+0.05, 0.38383768 +0.1]),
6: np.array([ 0.16250501-0.1, 0.59711573]),
7: np.array([ 0. , 0.96007752]),
8: np.array([ 0.32060991, 0.8106598 ])}
edges = transpose(np.nonzero(np.tril(A)))
edg_pos ={}
for j,e in enumerate(edges):
edg_pos[j]= 0.5*(fix_pos[e[0]]+fix_pos[e[1]])
# print edg_pos
U = np.array([[1,0],[1,0],[1,0],[1,0],[1,0],[1,0],[0,1],[0,1],[0,1]])
V1 = np.array([[0,1],[1,0],[1,0],[1,0],[1,0],[1,0],[0,1],[0,1],[0,1]])
V2 = np.array([[1,0],[1,0],[1,0],[1,0],[1,0],[0,1],[0,1],[0,1],[0,1]])
N = get_incident_matrix(A.shape[0],edges)
G=nx.from_numpy_matrix(A)#=N.dot(N.T)
LG=nx.from_numpy_matrix(N.T.dot(N)) #Line graph
T = N.T.dot(U)
TRA = T.dot(T.T)
# nx.write_dot(LG,'graph.dot')
subplot(2,4,1)
draw_clustered_graph(G ,U, pos= fix_pos,draw_graph = True)
subplot(2,4,2)
draw_clustered_graph(G ,V1, pos= fix_pos,draw_graph = True)
subplot(2,4,3)
draw_clustered_graph(G ,V2, pos= fix_pos,draw_graph = True)
subplot(2,4,6)
draw_clustered_graph(nx.from_numpy_matrix((N.T.dot(U)).dot(U.T.dot(N))) ,N.T.dot(U), pos= edg_pos,draw_graph = False)
subplot(2,4,7)
draw_clustered_graph(nx.from_numpy_matrix((N.T.dot(V1)).dot(V1.T.dot(N))) ,N.T.dot(V1), pos= edg_pos,draw_graph = False)
subplot(2,4,8)
draw_clustered_graph(nx.from_numpy_matrix((N.T.dot(V2)).dot(V2.T.dot(N))) ,N.T.dot(V2), pos= edg_pos,draw_graph = False)
show()
transformation()
def experiment(A,V,U,U2=None, fix_pos=None):
G = nx.from_numpy_matrix(A)
plotTable = False
nr = 2 if plotTable else 1
nf = 2 #numFigs
if not U2==None:
nf =3
if fix_pos==None:
fix_pos = nx.spring_layout(G)#, nlist, dim, scale)spring_layout(G)
print '----- positions to repeat the layout:: \n ',fix_pos
figure(figsize=(9, 4))
subplot(nr,nf,1)#.set_title('$U$')
draw_clustered_graph(G,V, fix_pos)
subplot(nr,nf,2)#.set_title('$V1$')
draw_clustered_graph(G,U, fix_pos)
if not U2 == None:
subplot(nr,nf,3)#.set_title('$V2$')
draw_clustered_graph(G,U2, fix_pos)
show()
Res = get_all_agreement_variations(V, U)
headrs = np.hstack( ([" - "], Res[0]))
vals = [ tuple(["(V,U_1)"]+Res[1].tolist()) ]
if U2 is not None:
vals = vals + [ tuple(["(V,U_2)"]+get_all_agreement_variations(V, U2)[1].tolist()) ]
vals = vals + [ tuple(["(N,V)"]+agreement_with_structure(G, V)[1].tolist()) ]
vals = vals + [ tuple(["(N,U_1)"]+agreement_with_structure(G, U)[1].tolist()) ]
if not U2==None:
vals = vals + [ tuple(["(N,U_2)"]+agreement_with_structure(G, U2)[1].tolist()) ]
for m in ["trans","sum"]:
vals = vals + [ tuple(["(V,U_1|G)"+m[0]]+agreement_based_on_structure(G, V,U, method=m)[1].tolist()) ]
if not U2==None:
vals = vals + [ tuple(["(V,U_2|G)"+m[0]]+agreement_based_on_structure(G, V,U2, method=m)[1].tolist()) ]
vals = np.asarray(vals, dtype=dict(names = headrs, formats=["a18"]+["float32"]*len(Res[1]) ))
print tabulate(vals, headers = "keys", tablefmt="grid", floatfmt=".3f")
if plotTable:
args = { "tablefmt":"plain", "floatfmt":".3f", "numalign":"right"}
ax = plt.subplot2grid((2,nf), (1,0), colspan=nf)
plt.axis('off')
plt.text(0,0, tabulate(vals, headers = "keys", **args))#,backgroundcolor = 'y',alpha=1)
show()
def basic_example():
A = np.array([[0,1,1,0,0,0,0,0,0,0],
[1,0,0,1,0,0,0,0,0,0],
[1,0,0,1,0,0,0,1,0,0],
[0,1,1,0,1,1,0,0,0,0],
[0,0,0,1,0,1,1,0,0,0],
[0,0,0,1,1,0,1,0,0,0],
[0,0,0,0,1,1,0,0,0,0],
[0,0,1,0,0,0,0,0,1,1],
[0,0,0,0,0,0,0,1,0,1],
[0,0,0,0,0,0,0,1,1,0]])
fix_pos_GraphExample = {0: array([ 0.05, .35]),
1: array([ 0.02, 0.62]),
2: array([ 0.25, 0.4]),
3: array([ 0.25, 0.73]),
4: array([ .5, 0.9]),
5: array([ 0, 0.9]),
6: array([ 0.25, 1 ]),
7: array([ 0.25, 0.1]),
8: array([ 0, 0. ]),
9: array([ .5, 0])}
return A, fix_pos_GraphExample
A, fix_pos_GraphExample = basic_example()
V = np.array([[1,0],[1,0],[1,0],[0,1],[0,1],[0,1],[0,1],[1,0],[1,0],[1,0]])
U = np.array([[1,0,0],[1,0,0],[1,0,0],[1,0,0],[0,1,0],[0,1,0],[0,1,0],[0,0,1],[0,0,1],[0,0,1]])
experiment(A,U,V, fix_pos=fix_pos_GraphExample)
+------------+-------+-------+-------+--------+-------+--------+-------+ | - | RI | ARI | RI' | ARI' | nF | sqtr | tr | +============+=======+=======+=======+========+=======+========+=======+ | (V,U_1) | 0.667 | 0.312 | 0.700 | 0.408 | 0.580 | 0.666 | 0.280 | +------------+-------+-------+-------+--------+-------+--------+-------+ | (N,V) | 0.889 | 0.723 | 0.975 | 0.586 | 0.598 | 0.797 | 0.177 | +------------+-------+-------+-------+--------+-------+--------+-------+ | (N,U_1) | 0.733 | 0.451 | 0.966 | 0.437 | 0.571 | 0.672 | 0.185 | +------------+-------+-------+-------+--------+-------+--------+-------+ | (V,U_1|G)t | 0.816 | 0.455 | 0.822 | 0.501 | 0.643 | 0.768 | 0.322 | +------------+-------+-------+-------+--------+-------+--------+-------+ | (V,U_1|G)s | 0.756 | 0.520 | 0.846 | 0.629 | 0.776 | 0.771 | 0.636 | +------------+-------+-------+-------+--------+-------+--------+-------+
A, fix_pos_GraphExample = basic_example()
U = np.array([[1,0,0],[1,0,0],[1,0,0],[1,1,0],[0,1,0],[0,1,0],[0,1,0],[0,0,1],[0,0,1],[0,0,1]])
V = np.array([[1,0,0],[1,0,0],[1,0,0],[.6,.4,0],[0,1,0],[0,1,0],[0,1,0],[0,0,1],[0,0,1],[0,0,1]])
V2 = np.array([[2,0,0],[2,0,0],[1,0,0],[1,1,0],[0,2,0],[0,2,0],[0,3,0],[0,0,2],[0,0,2],[0,0,2]])
experiment(A,U,V,V2, fix_pos=fix_pos_GraphExample)
+------------+-------+-------+-------+--------+-------+--------+-------+ | - | RI | ARI | RI' | ARI' | nF | sqtr | tr | +============+=======+=======+=======+========+=======+========+=======+ | (V,U_1) | 0.965 | 0.911 | 0.987 | 0.884 | 0.809 | 0.942 | 0.325 | +------------+-------+-------+-------+--------+-------+--------+-------+ | (V,U_2) | 0.935 | 0.379 | 0.958 | 0.328 | 0.397 | 0.881 | 0.314 | +------------+-------+-------+-------+--------+-------+--------+-------+ | (N,V) | 0.911 | 0.793 | 0.979 | 0.664 | 0.651 | 0.832 | 0.189 | +------------+-------+-------+-------+--------+-------+--------+-------+ | (N,U_1) | 0.921 | 0.786 | 0.975 | 0.570 | 0.588 | 0.808 | 0.178 | +------------+-------+-------+-------+--------+-------+--------+-------+ | (N,U_2) | 0.928 | 0.317 | 0.962 | 0.411 | 0.479 | 0.757 | 0.172 | +------------+-------+-------+-------+--------+-------+--------+-------+ | (V,U_1|G)t | 0.969 | 0.852 | 0.966 | 0.850 | 0.799 | 0.953 | 0.351 | +------------+-------+-------+-------+--------+-------+--------+-------+ | (V,U_2|G)t | 0.951 | 0.371 | 0.939 | 0.341 | 0.440 | 0.904 | 0.347 | +------------+-------+-------+-------+--------+-------+--------+-------+ | (V,U_1|G)s | 0.978 | 0.952 | 0.991 | 0.895 | 0.873 | 0.959 | 0.657 | +------------+-------+-------+-------+--------+-------+--------+-------+ | (V,U_2|G)s | 0.959 | 0.451 | 0.970 | 0.537 | 0.612 | 0.903 | 0.648 | +------------+-------+-------+-------+--------+-------+--------+-------+
A = np.zeros((11,11),dtype='float')
for e in [(0,1),(0,3),(0,4),
(1,2),(1,3),(1,4),
(2,3),
(3,4),
(5,6),(5,7),
(6,7),
(7,8),
(8,9),(8,10),
(9,10)
]:
A[e]=A[(e[1],e[0])]=1
fix_pos_GraphExample = {0: array([ 0.5/2 , 0.95]),
1: array([ 1./2 , 0.59]),
2: array([ 0.81/2, 0. ]),
3: array([ 0.19/2, 0. ]),
4: array([ 0. , 0.59]),
5: array([ 0.74, 1. ]),
6: array([ 1, 1. ]),
7: array([ 0.88, 0.69]),
8: array([ 0.88, 0.35]),
9: array([ 0.74, 0]),
10: array([ 1, 0])}
np.set_printoptions(precision=3,suppress=True)
U = np.array([[1,0,0],[1,0,0],[1,0,0],[1,0,0],[1,0,0],\
[0,1,0],[0,1,0],[0,1,0],[0,0,1],[0,0,1],[0,0,1]])
V1 = np.array([[1,0],[1,0],[1,0],[1,0],[1,0],\
[0,1],[0,1],[0,1],[0,1],[0,1],[0,1]])
V2 = np.array([[1,0,0],[0,0,1],[0,0,1],[1,0,0],[1,0,0],\
[0,1,0],[0,1,0],[0,1,0],[0,0,1],[0,0,1],[0,0,1]])
sklearn_measures(U, V1)
sklearn_measures(U, V2)
print '---', nmi(U,V1)
print '---', nmi(U,V2)
experiment(A,U,V1,V2, fix_pos=fix_pos_GraphExample)
From sklean:: ARI = 0.660 NMI = 0.804 AMI = 0.600 VM = 0.785 From sklean:: ARI = 0.471 NMI = 0.713 AMI = 0.621 VM = 0.713 --- [ 0.842 0.785 0.804] --- [ 0.745 0.713 0.713]
+------------+-------+-------+-------+--------+-------+--------+-------+ | - | RI | ARI | RI' | ARI' | nF | sqtr | tr | +============+=======+=======+=======+========+=======+========+=======+ | (V,U_1) | 0.836 | 0.660 | 0.851 | 0.703 | 0.705 | 0.840 | 0.355 | +------------+-------+-------+-------+--------+-------+--------+-------+ | (V,U_2) | 0.782 | 0.471 | 0.802 | 0.567 | 0.626 | 0.721 | 0.256 | +------------+-------+-------+-------+--------+-------+--------+-------+ | (N,V) | 0.945 | 0.865 | 0.977 | 0.620 | 0.615 | 0.814 | 0.176 | +------------+-------+-------+-------+--------+-------+--------+-------+ | (N,U_1) | 0.818 | 0.621 | 0.970 | 0.502 | 0.589 | 0.707 | 0.182 | +------------+-------+-------+-------+--------+-------+--------+-------+ | (N,U_2) | 0.800 | 0.506 | 0.968 | 0.485 | 0.552 | 0.702 | 0.152 | +------------+-------+-------+-------+--------+-------+--------+-------+ | (V,U_1|G)t | 0.900 | 0.790 | 0.906 | 0.806 | 0.768 | 0.902 | 0.407 | +------------+-------+-------+-------+--------+-------+--------+-------+ | (V,U_2|G)t | 0.857 | 0.564 | 0.862 | 0.607 | 0.667 | 0.798 | 0.305 | +------------+-------+-------+-------+--------+-------+--------+-------+ | (V,U_1|G)s | 0.855 | 0.708 | 0.922 | 0.793 | 0.839 | 0.866 | 0.675 | +------------+-------+-------+-------+--------+-------+--------+-------+ | (V,U_2|G)s | 0.818 | 0.556 | 0.897 | 0.716 | 0.782 | 0.804 | 0.616 | +------------+-------+-------+-------+--------+-------+--------+-------+
A = np.zeros((9,9),dtype='float')
for e in [(0,1),(0,5),(0,6),
(1,0),(1,2),(1,5),
(2,1),(2,3),(2,5),
(3,2),(3,5),(3,4),
(4,3),(4,5),
(5,0),(5,1),(5,2),(5,3),(5,4),(5,6),(5,8),
(6,0),(6,5),(6,7),(6,8),
(7,6),(7,8),
(8,6),(8,7),(8,5),
]:
A[e]=1
fix_pos_GraphExample = {0: np.array([ 0.20385061-0.1, 0.23977043-0.1]),
1: np.array([ 0.43693314, 0. ]),
2: np.array([ 0.75900616, 0.01462725]),
3: np.array([ 0.97478493, 0.24577521-0.1]),
4: np.array([ 1. , 0.53455287-0.1-0.1]),
5: np.array([ 0.56256428+0.05, 0.38383768 +0.1]),
6: np.array([ 0.16250501-0.1, 0.59711573]),
7: np.array([ 0. , 0.96007752]),
8: np.array([ 0.32060991, 0.8106598 ])}
U = np.array([[1,0],[1,0],[1,0],[1,0],[1,0],[1,0],[0,1],[0,1],[0,1]])
V1 = np.array([[0,1],[1,0],[1,0],[1,0],[1,0],[1,0],[0,1],[0,1],[0,1]])
V2 = np.array([[1,0],[1,0],[1,0],[1,0],[1,0],[0,1],[0,1],[0,1],[0,1]])
experiment(A,U,V1,V2, fix_pos=fix_pos_GraphExample)
+------------+-------+-------+-------+--------+-------+--------+-------+ | - | RI | ARI | RI' | ARI' | nF | sqtr | tr | +============+=======+=======+=======+========+=======+========+=======+ | (V,U_1) | 0.778 | 0.556 | 0.802 | 0.604 | 0.695 | 0.815 | 0.432 | +------------+-------+-------+-------+--------+-------+--------+-------+ | (V,U_2) | 0.778 | 0.556 | 0.802 | 0.604 | 0.695 | 0.815 | 0.432 | +------------+-------+-------+-------+--------+-------+--------+-------+ | (N,V) | 0.750 | 0.500 | 0.979 | 0.327 | 0.512 | 0.662 | 0.200 | +------------+-------+-------+-------+--------+-------+--------+-------+ | (N,U_1) | 0.750 | 0.491 | 0.979 | 0.337 | 0.503 | 0.668 | 0.193 | +------------+-------+-------+-------+--------+-------+--------+-------+ | (N,U_2) | 0.639 | 0.264 | 0.977 | 0.275 | 0.481 | 0.616 | 0.178 | +------------+-------+-------+-------+--------+-------+--------+-------+ | (V,U_1|G)t | 0.926 | 0.744 | 0.928 | 0.752 | 0.799 | 0.923 | 0.527 | +------------+-------+-------+-------+--------+-------+--------+-------+ | (V,U_2|G)t | 0.857 | 0.417 | 0.859 | 0.435 | 0.708 | 0.844 | 0.480 | +------------+-------+-------+-------+--------+-------+--------+-------+ | (V,U_1|G)s | 0.889 | 0.773 | 0.901 | 0.797 | 0.843 | 0.904 | 0.712 | +------------+-------+-------+-------+--------+-------+--------+-------+ | (V,U_2|G)s | 0.833 | 0.660 | 0.900 | 0.776 | 0.832 | 0.885 | 0.705 | +------------+-------+-------+-------+--------+-------+--------+-------+
A = np.zeros((5,5),dtype='float')
for e in [(0,1),(0,4),
(1,0),(1,2),
(2,1),(2,3),
(3,2),(3,4),
(4,3),(4,0),
]:
A[e]=1
fix_pos_GraphExample = {0: array([ 0.49682607, 0.95151713]),
1: array([ 1. , 0.59183855]),
2: array([ 8.12052312e-01, 3.19389103e-05]),
3: array([ 0.19258776, 0. ]),
4: array([ 0. , 0.58628989])}
U = np.array([[1,0,0],[0,1,0],[0,1,1],[1,1,1],[1,0,1]])
V1 = np.array([[1,0,0],[0,1,0],[0,0,1],[1,0,0],[1,0,0]])
V2 = np.array([[1,0,0],[0,1,0],[0,0,1],[1,0,1],[1,0,0]])
print 'V_1', agreement_from_omega(U, V1)
print 'V_2', agreement_from_omega(U, V2)
experiment(A,U,V1,V2, fix_pos=fix_pos_GraphExample)
V_1 [ 0.5 0.219] V_2 [ 0.5 0.194]
+------------+-------+-------+-------+--------+-------+--------+-------+ | - | RI | ARI | RI' | ARI' | nF | sqtr | tr | +============+=======+=======+=======+========+=======+========+=======+ | (V,U_1) | 0.800 | 0.245 | 0.902 | 0.318 | 0.532 | 0.764 | 0.378 | +------------+-------+-------+-------+--------+-------+--------+-------+ | (V,U_2) | 0.875 | 0.490 | 0.942 | 0.577 | 0.663 | 0.894 | 0.444 | +------------+-------+-------+-------+--------+-------+--------+-------+ | (N,V) | 0.850 | 0.333 | 0.933 | 0.528 | 0.682 | 0.816 | 0.333 | +------------+-------+-------+-------+--------+-------+--------+-------+ | (N,U_1) | 0.600 | 0.200 | 0.870 | 0.444 | 0.590 | 0.771 | 0.280 | +------------+-------+-------+-------+--------+-------+--------+-------+ | (N,U_2) | 0.700 | 0.400 | 0.900 | 0.576 | 0.666 | 0.822 | 0.300 | +------------+-------+-------+-------+--------+-------+--------+-------+ | (V,U_1|G)t | 0.856 | 0.186 | 0.868 | 0.211 | 0.536 | 0.860 | 0.538 | +------------+-------+-------+-------+--------+-------+--------+-------+ | (V,U_2|G)t | 0.913 | 0.427 | 0.924 | 0.483 | 0.672 | 0.961 | 0.581 | +------------+-------+-------+-------+--------+-------+--------+-------+ | (V,U_1|G)s | 0.775 | 0.556 | 0.919 | 0.617 | 0.720 | 0.859 | 0.662 | +------------+-------+-------+-------+--------+-------+--------+-------+ | (V,U_2|G)s | 0.863 | 0.712 | 0.954 | 0.765 | 0.824 | 0.945 | 0.706 | +------------+-------+-------+-------+--------+-------+--------+-------+