import numpy as np
from sklearn import metrics
import pandas as pd
import seaborn as sns
# NOTE(review): later cells call `pylab` (plot/legend/savefig) but it is never
# imported here -- presumably this notebook ran under the `%pylab` magic.
# Confirm, or add `from matplotlib import pylab` before running as a script.
sns.set_context("paper", font_scale=2)
sns.set_style("whitegrid")
def merge_clusters(assign_vect, a, b):
    """
    Return a copy of the assignment vector in which every point
    labelled with cluster id b has been relabelled as cluster id a.
    The input vector is not modified.
    """
    merged = np.asarray(assign_vect).copy()
    merged[merged == b] = a
    return merged
def distribute_clusters(assign_vect, b):
    """
    Take all the data points in cluster b and evenly distribute them
    (round-robin, in sorted order of the surviving cluster ids) among
    the remaining clusters.

    Returns a new array; the input is not modified.
    Raises ValueError if b is absent or is the only cluster present
    (previously the only-cluster case died with a ZeroDivisionError).
    """
    x = np.array(assign_vect)
    ids = np.unique(assign_vect).tolist()
    ids.remove(b)  # raises ValueError if b is not present, as before
    if not ids:
        raise ValueError("cannot distribute cluster %r: no other clusters" % b)
    # positions of the points currently in cluster b, in index order
    targets = np.flatnonzero(x == b)
    # np.resize tiles `ids` cyclically, i.e. ids[ei % len(ids)] for each target
    x[targets] = np.resize(ids, targets.size)
    return x
def split_clusters(assignvect, b):
    """
    Split cluster b into two (nearly) evenly-sized clusters whose new
    ids are max+1 and max+2, where max is the largest existing id:
    the 1st, 3rd, ... members of b go to max+1, the 2nd, 4th, ...
    to max+2.  Returns a new array; the input is not modified.
    """
    x = np.array(assignvect)
    members = np.flatnonzero(x == b)
    if members.size:
        # hoisted out of the per-element loop: np.max is loop-invariant
        base = np.max(assignvect) + 1
        x[members] = base + (np.arange(members.size) % 2)
    return x
#print distribute_clusters([0, 1, 2, 0, 1, 2], 1)
#assert == np.array([0, 0, 2, 0, 2, 2])
# Quick sanity check of split_clusters; print() works in both py2 and py3.
print(split_clusters([0, 1, 0, 1, 0, 1], 1))
# expected output: [0 2 0 3 0 2]
# Measure how each clustering metric (ARI, AMI, homogeneity, completeness)
# degrades as progressively more ground-truth clusters are modified by each
# operation (split / distribute / merge-into-cluster-0).
CLUSTER_N = 20
N = 1000
truth = np.arange(N) % CLUSTER_N  # N points spread evenly over CLUSTER_N clusters
outdata = []
for merge_n in range(CLUSTER_N - 1):
    for operation, name in [(split_clusters, 'split'),
                            (distribute_clusters, 'distribute'),
                            (lambda x, y: merge_clusters(x, 0, y), 'merge')]:
        new_array = np.array(truth)
        # apply the operation cumulatively to clusters 1..merge_n
        for i in range(1, merge_n + 1):
            new_array = operation(new_array, i)
        ari = metrics.adjusted_rand_score(truth, new_array)
        ami = metrics.adjusted_mutual_info_score(truth, new_array)
        homogeneity = metrics.homogeneity_score(truth, new_array)
        completeness = metrics.completeness_score(truth, new_array)
        outdata.append({'name': name,
                        'modify_n': merge_n,
                        'ari': ari,
                        'ami': ami,
                        'homogeneity': homogeneity,
                        'completeness': completeness})
df = pd.DataFrame(outdata)
df.head()
# Notebook output of df.head() (kept for reference, was pasted in as bare text):
#      ari | completeness | homogeneity | modify_n | name
# 0 | 1.000000 | 1.000000 | 1.000000 | 0 | split
# 1 | 1.000000 | 1.000000 | 1.000000 | 0 | distribute
# 2 | 1.000000 | 1.000000 | 1.000000 | 0 | merge
# 3 | 0.986423 | 0.988563 | 1.000000 | 1 | split
# 4 | 0.922947 | 0.950296 | 0.934012 | 1 | distribute
# Plot ARI degradation, one line per modification type.
# NOTE(review): `pylab` is never imported in this file -- presumably supplied
# by the notebook `%pylab` magic; confirm before running as a plain script.
for gi, g in df.groupby('name'):
    pylab.plot(g['modify_n'], g['ari'], label=gi, linewidth=3)
pylab.legend(loc='lower left')
pylab.ylabel('Adjusted Rand Index')
pylab.xlabel('groups modified')
pylab.grid()
pylab.savefig('ari.pdf')
#sns.factorplot(df, 'ari', )
# Drop AMI so the melted frame keeps one facet per remaining metric.
if 'ami' in df:
    del df['ami']
dfm = pd.melt(df, id_vars=['name', 'modify_n'])
# NOTE(review): sns.factorplot with positional data was removed in newer
# seaborn (renamed catplot); this call assumes an old seaborn version.
sns.factorplot("modify_n", "value", "name", dfm, col='variable', kind='point')
sns.despine()
#sns.factorplot(df, 'ari', )
# Same faceted plot as above, but keep the grid handle so the axes can be
# tweaked before saving.  (The `if` guards the repeated del across cells.)
if 'ami' in df:
    del df['ami']
dfm = pd.melt(df, id_vars=['name', 'modify_n'])
g = sns.factorplot("modify_n", "value", "name", dfm, col='variable', kind='point')
sns.despine()
#pylab.gca().set_xticklabels(['1'] + [''] *18 + ['18'] )
pylab.yticks([0, 1])
pylab.ylim(0, 1.1)
# ridiculous hack: blank every x tick label except the two endpoints
l = [' '] * 19
l[0] = '1'
l[-1] = '19'
g.set_xticklabels(l)
g.set_titles("{col_name}")  # ["Adjusted Rand Index", "Completeness", "Homogeneity"]
g.set_xlabels("Number of modified types")
pylab.savefig("cluster_metric_comparison.pdf")