We test different possibilities of filtering the correlation matrix. This allows us to build a graph and draw on the graph-theory literature to extract valuable information. A short introduction to this topic is given on Wikipedia.
The Dependency matrix is the weighted adjacency matrix, representing the fully connected network. Different algorithms can be applied to filter the fully connected network to obtain the most meaningful information, such as using a threshold approach, or different pruning algorithms.
A widely used method to construct an informative sub-graph of a complete network is the Minimum Spanning Tree (MST).
Another informative sub-graph, which retains more information (in comparison to the MST) is the Planar Maximally Filtered Graph (PMFG) which we also compute here. Both methods are based on hierarchical clustering and the resulting sub-graphs include all the $N$ nodes in the network whose edges represent the most relevant association correlations. The MST sub-graph contains $(N-1)$ edges with no loops while the PMFG sub-graph contains $3(N-2)$ edges.
A short timeline of the literature in this field can be found in this blog entry.
%load_ext autoreload
%autoreload 2
%matplotlib inline
import pandas as pd
import pandas.io.data as web
import datetime
import pandas.io.pytables
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from itertools import permutations
from collections import OrderedDict
from pprint import pprint
import networkx as nx
import viz
import scipy.cluster.hierarchy as sch
# Open the precomputed rolling correlation matrices of the DAX30 stocks.
ds = viz.CorrelationMatrixSource('dax30.h5', 'corr2')
ds.time_axis()
# Correlation matrix for a single day (2008-09-15, the Lehman Brothers bankruptcy).
df = ds.get_at( datetime.date(2008,9,15))
# Static per-company metadata (company name, sector, ticker), keyed by symbol.
with pd.io.pytables.get_store('dax30.h5') as store:
    dax_info = store.select('dax')
dax_info.set_index('Symbol', inplace=True)
# Heat-map of the raw correlation matrix, labelled with DAX ticker symbols.
plt.figure(figsize=(10,10))
gs = gridspec.GridSpec(3, 1)
ax_mat = plt.subplot(gs[1:, -2:])
plt.imshow(df.values, interpolation='nearest', axes=ax_mat)
ax_mat.set_yticks(np.arange(len(df.index)))
ax_mat.set_yticklabels(dax_info.loc[df.index]['Ticker symbol'].tolist())
# Look up ticker labels for the automatically chosen x tick positions
# (Python 2: map() returns a list, so it can be sliced directly).
ax_mat.set_xticklabels(dax_info.loc[df.index[map(int, ax_mat.get_xticks())[:-1]]]['Ticker symbol'].tolist())
ax_mat.grid(False)
# Fix: savefig's keyword is 'bbox_inches' (as used for dendrogram.pdf below);
# 'bbox' was not a valid argument.
plt.savefig('correlation.pdf', bbox_inches='tight')
# Hierarchically cluster the correlation matrix and show the dendrogram next
# to the matrix permuted into the clustered (leaf) order, so correlated
# names sit next to each other.
plt.figure(figsize=(10,10))
gs = gridspec.GridSpec(3, 3)
ax_dendrogram = plt.subplot(gs[1:,:1])
Y = sch.linkage(df.values, method='centroid')
Z = sch.dendrogram(Y, orientation='right', no_plot=False, ax=ax_dendrogram)
# Leaf order is reversed so the matrix rows run top-to-bottom.
index = list(reversed(Z['leaves']))
ax_mat = plt.subplot(gs[1:, -2:])
D = df.values[index,:]
D = D[:,index]
cax = plt.imshow(D, interpolation='nearest', axes=ax_mat)
ax_mat.set_xticks([])
ax_mat.set_yticks([])
# Fix: `index` holds integer positions, so use positional .iloc explicitly
# instead of the deprecated, label/position-ambiguous .ix.
# NOTE(review): labels assume the reversed leaf order matches the
# dendrogram's own tick order — confirm visually.
ax_dendrogram.set_yticklabels(dax_info.iloc[index]['Ticker symbol'].tolist())
plt.savefig('dendrogram.pdf', bbox_inches='tight')
We can also render the correlation matrix interactively using Bokeh.
from bokeh.plotting import *
from bokeh.objects import HoverTool, ColumnDataSource
def to_data_source(df):
    """Flatten the correlation matrix into a bokeh ColumnDataSource.

    Rows/columns are first reordered by hierarchical clustering so that
    correlated names appear next to each other.  Returns the data source
    (one record per matrix cell: x name, y name, value, color) and the
    ordered list of names.
    """
    linkage = sch.linkage(df.values, method='centroid')
    tree = sch.dendrogram(linkage, orientation='right', no_plot=True)
    all_names = df.columns.tolist()
    names = [all_names[leaf] for leaf in tree['leaves']]
    color_of = colorer()
    xnames, ynames, values, colors = [], [], [], []
    for row_name in names:
        row_values = df.loc[row_name, names].tolist()
        xnames += [row_name] * len(names)
        ynames += names
        values += row_values
        colors += [color_of(v) for v in row_values]
    source = ColumnDataSource(
        data=dict(
            xname=xnames,
            yname=ynames,
            colors=colors,
            values=values,
        )
    )
    return source, names
def colorer(number_colors=21):
    """Return a function mapping a correlation value in [-1, 1] to an
    "rgb(r, g, b)" string taken from a diverging green-white-blue palette.

    `number_colors` sets the palette resolution (an odd count keeps white
    exactly at correlation 0).
    """
    # np.asarray: depending on the seaborn version, blend_palette returns
    # either an (n, 4) ndarray or a plain list of RGB tuples; force an array
    # so the * 255 below scales elementwise instead of repeating a list.
    basis = np.asarray(sns.blend_palette(["seagreen", "ghostwhite", "#4168B7"], number_colors))
    # row[:3] tolerates both RGB and RGBA palette entries.
    palette = ["rgb(%d, %d, %d)" % tuple(row[:3]) for row in np.round(basis * 255)]
    def color_of(value):
        # Linear map [-1, 1] -> [0, number_colors - 1]; clip guards against
        # tiny floating-point excursions outside [-1, 1].
        i = np.clip(np.round((value + 1.) * (number_colors - 1) * 0.5), 0, number_colors - 1)
        return palette[int(i)]
    return color_of
def corrplot(source, names):
    """Render the correlation matrix as a bokeh rect-glyph heat map.

    Uses bokeh's old implicit plotting API: rect()/grid()/axis()/curplot()
    all operate on a module-global "current plot".  `source` and `names`
    are the output of to_data_source(); a hover tool shows the pair of
    names and the correlation value under the cursor.  Returns the current
    bokeh plot object.
    """
    # One square per matrix cell; y axis reversed so the diagonal runs
    # top-left to bottom-right like a conventional matrix.
    rect('xname', 'yname', 0.9, 0.9, source=source,
         x_range=names, y_range=list(reversed(names)),
         color='colors', line_color=None,
         tools="resize,hover,previewsave", title="Correlation matrix",
         plot_width=500, plot_height=500)
    # Strip grid/axis chrome so only the colored cells remain.
    grid().grid_line_color = None
    axis().axis_line_color = None
    axis().major_tick_line_color = None
    axis().major_label_text_font_size = "7pt"
    axis().major_label_standoff = 0
    xaxis().location = "top"
    xaxis().major_label_orientation = np.pi/3
    # Fish the hover tool out of the current plot and attach the tooltips.
    hover = [t for t in curplot().tools if isinstance(t, HoverTool)][0]
    hover.tooltips = OrderedDict([
        ('names', '@yname, @xname'),
        ('value', '@values')
    ])
    return curplot()
# Interactive bokeh version of the correlation matrix, for an earlier date.
df = ds.get_at( datetime.date(2003,1,6))
source, names = to_data_source(df)
bk = corrplot(source, names)
show()
The goal is to reduce the redundancy in the correlation matrix and filter it, creating a graph. One of the criteria for this is to compute the Minimum Spanning Tree.
Minimum spanning trees are connected sub-graphs that span all $N$ nodes with the $N-1$ edges of minimal total weight and contain no cycles.
This kind of tree is particularly useful for representing complex networks, filtering the information about the correlations between all nodes and presenting it in a planar graph. It is constructed using Kruskal's algorithm. We use the implementation from the NetworkX package.
The algorithm requires a transformation of the correlation matrix $[C_{ij}]$ into a distance matrix $[ d_{ij}]$
$$d_{ij}= \sqrt{2(1-C_{ij})}$$def construct_mst(df, threshold=0.01):
# threshold = 0.05 / len(names):
g = nx.Graph()
names = df.columns.unique()
df_distance = np.sqrt( 2 * np.clip( 1. - df, 0., 2.) )
for i, n in enumerate(names):
for j, m in enumerate(names):
if j >= i: break
val = df_distance.loc[n,m]
# Bonferroni correction: TODO check implementation
if np.abs(val) > threshold:
g.add_edge(n, m, weight=val)
return nx.minimum_spanning_tree(g)
Here is an example
# MST of the currently loaded correlation matrix.
mst = construct_mst(df)
def add_info(g, key, pds):
    """Set node attribute `key` on graph `g` from the pandas Series `pds`.

    Only entries whose index label is a node of `g` are used; the byte-string
    values are decoded from UTF-8 before being attached.
    """
    attributes = {symbol: text.decode('utf8')
                  for symbol, text in pds.iteritems()
                  if g.has_node(symbol)}
    nx.set_node_attributes(g, key, attributes)
# Attach sector and company-name attributes used by the graph layouts.
add_info(mst, 'group', dax_info['Prime Standard industry group'])
add_info(mst, 'label', dax_info.Company)
# fig =plt.figure()
# nx.draw_graphviz(mst, node_color='steelblue')
# import mpld3
# fig_d3 = mpld3.display(fig)
# display(fig_d3)
# Static graphviz rendering of the MST, labelled with full company names.
nx.draw_graphviz(mst, font_size=9, font_color='black', node_color='steelblue', labels=dict(map( lambda x: (x[0], x[1].decode('utf8')), filter(lambda i: mst.has_node(i[0]), dax_info.Company.iteritems()))))
# Fix: savefig's keyword is 'bbox_inches', not 'bbox'.
plt.savefig('mst.pdf', bbox_inches='tight')
from graph import widgetforcelayout
from IPython.display import display
from IPython.html import widgets
def construct_dateslider(ds, callback=None):
    """Build an IPython date-slider widget over the data source's time axis.

    Moving the slider updates a text readout with the selected date and
    invokes `callback(date)` (if given).  Uses the pre-IPython-3 widget API
    (IntSliderWidget, LatexWidget, ContainerWidget).  Returns the container
    holding slider + readout.
    """
    _time = pd.TimeSeries(ds.time_axis()) #.reindex()
    # NOTE(review): min=100 presumably skips the warm-up window of the
    # rolling correlation — confirm against how 'corr2' was computed.
    int_range = widgets.IntSliderWidget(min=100, max=_time.last_valid_index(), value=100, readout=False)
    int_range.set_css({'width': '900'})
    text_area = widgets.LatexWidget()
    def get_current_date():
        # The slider value is an integer position into the time axis.
        i = int_range.value
        val = _time.ix[i]
        return val
    def date_to_string(datevalue):
        return "%s/%d/%d" % (datevalue.year, datevalue.month, datevalue.day)
    def f():
        # Fires on any trait change; only act when the date actually changed.
        new_date = get_current_date()
        new_val = date_to_string(new_date)
        if new_val == text_area.value:
            return
        text_area.value = new_val
        if callback: callback(new_date)
    text_area.value = date_to_string(get_current_date())
    int_range.on_trait_change(f)
    date_container = widgets.ContainerWidget(children=[int_range, text_area])
    # date_container.set_css('border', '3px dotted grey')
    return date_container
# Combined interactive view: d3 force-layout MST next to the bokeh
# correlation matrix, both driven by a date slider.
widgetforcelayout.publish_js()
figure()
source, names = to_data_source(df)
bk = corrplot(source, names)
graph_widget = widgetforcelayout.GraphWidget(height=500, width=600, link_distance=10, charge=-330,
                                             color_domain=dax_info['Prime Standard industry group'].unique().tolist())
bokeh_widget= widgets.HTMLWidget()
bokeh_widget.value = notebook_div(curplot())
corr_container = widgets.ContainerWidget(children=[graph_widget, bokeh_widget])
# corr_container.set_css('border', '3px dotted grey')
import time
def update_matrix_mst(datevalue):
    """Slider callback: recompute matrix plot and MST for the chosen date."""
    _df = ds.get_at(datevalue)
    _source, _names = to_data_source(_df)
    bk = corrplot(_source, _names)
    # NOTE(review): the short sleeps presumably give the notebook front end
    # time to process the widget updates — confirm they are still needed.
    time.sleep(0.01)
    bokeh_widget.value = notebook_div(curplot())
    _mst = construct_mst(_df)
    add_info(_mst, 'group', dax_info['Prime Standard industry group'])
    add_info(_mst, 'label', dax_info.Company)
    time.sleep(0.01)
    graph_widget.set_graph(_mst)
date_container = construct_dateslider(ds, callback=update_matrix_mst)
date_container.msg_throttle = 1
mst_title = widgets.HTMLWidget()
mst_title.value='<h3>Minimum Spanning Tree</h3>'
container = widgets.ContainerWidget( children=[mst_title, corr_container, date_container] )
# display(mst_title)
display(container)
# Lay the children out horizontally instead of the default vertical box.
date_container.remove_class('vbox')
date_container.add_class('hbox')
corr_container.remove_class('vbox')
corr_container.add_class('hbox')
# NOTE(review): 'mst_stats' is only created in a later cell — running the
# notebook top-to-bottom raises the NameError captured below.  Move this
# cell after the mst_stats computation (or re-run it afterwards).
mst_stats.plot(figsize=(16,3))
plt.ylabel('Diameter MST')
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-25-eddb02651a3a> in <module>() 48 corr_container.add_class('hbox') 49 ---> 50 mst_stats.plot(figsize=(16,3)) 51 plt.ylabel('Diameter MST') NameError: name 'mst_stats' is not defined
%%time
# initialize DataFrame
all_t = ds.time_axis()
mst_stats = pd.DataFrame(columns=['diameter'], index=all_t)
# iterate: one MST diameter per day over the whole history (~5 min runtime,
# see the captured %%time output below)
for t, df in ds.iterate_time():
    try:
        g = construct_mst(df)
        mst_stats.loc[t] = nx.algorithms.diameter(g)# #, nx.algorithms.degree_centrality}
    except ValueError as e:
        # NOTE(review): presumably a day's matrix can be unusable (NaNs?) —
        # confirm which step raises ValueError
        print "skipping ", t
CPU times: user 4min 51s, sys: 4.64 s, total: 4min 56s Wall time: 5min
# Time series of the MST diameter over the full history.
mst_stats.plot(figsize=(16,5))
plt.ylabel('Diameter MST')
# Fix: savefig's keyword is 'bbox_inches', not 'bbox'.
plt.savefig('diameter.pdf', bbox_inches='tight')
We use the planarity library in order to decide whether a graph is planar.
The benefits of the PMFG are discussed in length in
Song W-M, Di Matteo T, Aste T (2012) Hierarchical Information Clustering by Means of Topologically Embedded Graphs. PLoS ONE 7(3): e31929. doi:10.1371/journal.pone.0031929
It basically offers a good trade-off between edge reduction and information retention.
from graph.algos import construct_pmfg
import planarity
def construct_pmfg(df_corr_matrix):
    """Planar Maximally Filtered Graph (PMFG) of a correlation matrix.

    Node-pair distances d = sqrt(2 (1 - C)) are visited from smallest to
    largest; each edge is kept only if the graph stays planar (checked with
    the planarity library), which yields 3(N-2) edges for N nodes.  Node ids
    are the integer positions of the matrix columns.
    """
    distances = np.sqrt( 2 * np.clip( 1. - df_corr_matrix, 0., 2.) ).values
    rows, cols = np.triu_indices(distances.shape[0], 1)
    # Rank the upper-triangular pairs by increasing distance.
    order = np.argsort(distances[(rows, cols)])
    graph = nx.Graph()
    for k in order:
        u, v = rows[k], cols[k]
        weight = distances[u, v]
        if weight > 0: # remove perfect correlation because of diagonal FIXME
            graph.add_edge(u, v, {'weight': float(weight)})
            # Greedy planarity filter: undo the insertion if it breaks planarity.
            if not planarity.is_planar(graph):
                graph.remove_edge(u, v)
    return graph
def add_info_pmfg(g, key, prop_name, pds):
    """Attach column `prop_name` of DataFrame `pds` as node attribute `key`.

    PMFG node ids are integer row positions into `pds`, so rows are matched
    by position, not by index label.

    Bug fix: the original wrote the attributes onto the global `pmfg`
    instead of the `g` argument, so it only worked for that one graph.
    """
    idx = pds.columns.tolist().index(prop_name)
    new_attr = {}
    for pos, (label, row) in enumerate(pds.iterrows()):
        if g.has_node(pos):
            # positional access into the row (idx is a column position)
            new_attr[pos] = row.iloc[idx]
    nx.set_node_attributes(g, key, new_attr)
# Build the PMFG for the current day and show it in the d3 force layout.
pmfg = construct_pmfg(df)
graph_widget = widgetforcelayout.GraphWidget(height=600, width=800, link_distance=25, charge=-1000)
# ['Ticker symbol']
# Labels (company names) and groups (sectors) drive the widget's rendering.
add_info_pmfg(pmfg, 'label', 'Company', dax_info)
add_info_pmfg(pmfg, 'group', 'Prime Standard industry group', dax_info)
# pmfg.nodes(data=True)
display(graph_widget)
time.sleep(0.1)
graph_widget.set_graph(pmfg)
# Eigenvector centrality per stock, ascending; node ids are positions into
# dax_info, so map them back to the ticker symbols (output below).
map( lambda v: (dax_info.index[v[0]], v[1]), sorted( nx.algorithms.eigenvector_centrality(pmfg).items(), key=lambda x: x[1]))
[('ADS.DE', 0.049043173025961624), ('CBK.DE', 0.07019794395970734), ('IFX.DE', 0.07186337740740774), ('VOW3.DE', 0.08554373485017783), ('EOAN.DE', 0.10413019491713557), ('FME.DE', 0.10784496408951262), ('MUV2.DE', 0.1139542680363763), ('SDF.DE', 0.11971420931895896), ('CON.DE', 0.12302900048657926), ('FRE.DE', 0.13328050841143393), ('DAI.DE', 0.14569266802572808), ('SAP.DE', 0.1474766606016511), ('TKA.DE', 0.14892101365301563), ('DPW.DE', 0.15111231708957262), ('MRK.DE', 0.15220769706448323), ('RWE.DE', 0.1603673026450104), ('DTE.DE', 0.16762914071751536), ('SIE.DE', 0.18328526142226578), ('LXS.DE', 0.18405957253478808), ('HEN3.DE', 0.1913498868609257), ('LIN.DE', 0.1926191444625879), ('LHA.DE', 0.19471659145804932), ('BMW.DE', 0.23798889741484666), ('BAYN.DE', 0.27052421338966726), ('BAS.DE', 0.2719406620623792), ('BEI.DE', 0.27619143818898234), ('HEI.DE', 0.27747033359470147), ('ALV.DE', 0.4431407715191102)]
nx.algorithms.eigenvector_centrality?
def mean_measure(g, f_m, **d):
    """Average of a per-node graph measure.

    `f_m(g, **d)` must return a dict mapping node -> value (as the networkx
    centrality functions do); the mean of the values is returned.
    (Was a named lambda; PEP 8 prefers `def` for named functions.)
    """
    return np.mean(list(f_m(g, **d).values()))

def aggregated_centrality(g):
    """Graph-level summary: mean (betweenness, closeness, eigenvector)
    centrality, using edge weights as distances where supported."""
    return ( mean_measure( g, nx.algorithms.betweenness_centrality, weight='weight'),
             mean_measure( g, nx.algorithms.closeness_centrality, distance='weight'),
             mean_measure( g, nx.algorithms.eigenvector_centrality) )
# Mean (betweenness, closeness, eigenvector) centrality of today's PMFG.
aggregated_centrality(pmfg)
(0.046499796499796499, 0.47681349305560661, 0.17054624811459035)
%%time
# initialize DataFrame
all_t = ds.time_axis()
pmfg_stats = pd.DataFrame(columns=['betweenness_centrality', 'closeness_centrality', 'eigenvector_centrality'], index=all_t)
# iterate: aggregated PMFG centralities per day over the whole history
# (~5 min runtime, see the captured %%time output below)
for t, df in ds.iterate_time():
    try:
        g = construct_pmfg(df)
        pmfg_stats.loc[t] = aggregated_centrality(g)# #, nx.algorithms.degree_centrality}
    except ValueError as e:
        # NOTE(review): presumably a day's matrix can be unusable (NaNs?) —
        # confirm which step raises ValueError
        print "skipping ", t
CPU times: user 4min 51s, sys: 4.97 s, total: 4min 56s Wall time: 5min 16s
# Inspect the PMFG centrality time series and plot each measure separately.
pmfg_stats
pmfg_stats.plot(figsize=(16,5), subplots=True)
array([<matplotlib.axes.AxesSubplot object at 0x7f803c0c1250>, <matplotlib.axes.AxesSubplot object at 0x7f803c108d10>, <matplotlib.axes.AxesSubplot object at 0x7f803c15c7d0>], dtype=object)
# Duplicate of the previous cell (re-executed; see the second output below).
pmfg_stats
pmfg_stats.plot(figsize=(16,5), subplots=True)
# plt.ylabel('Diameter MST')
array([<matplotlib.axes.AxesSubplot object at 0x11ed52d90>, <matplotlib.axes.AxesSubplot object at 0x11e225c50>, <matplotlib.axes.AxesSubplot object at 0x11ec60b10>], dtype=object)