%load_ext autoreload %autoreload 2 %matplotlib inline import pandas as pd import pandas.io.data as web import datetime import pandas.io.pytables import numpy as np import matplotlib as mpl import matplotlib.pyplot as plt import matplotlib.gridspec as gridspec import seaborn as sns from itertools import permutations from collections import OrderedDict from pprint import pprint import networkx as nx import viz import scipy.cluster.hierarchy as sch ds = viz.CorrelationMatrixSource('dax30.h5', 'corr2') ds.time_axis() df = ds.get_at( datetime.date(2008,9,15)) with pd.io.pytables.get_store('dax30.h5') as store: dax_info = store.select('dax') dax_info.set_index('Symbol', inplace=True) plt.figure(figsize=(10,10)) gs = gridspec.GridSpec(3, 1) ax_mat = plt.subplot(gs[1:, -2:]) plt.imshow(df.values, interpolation='nearest', axes=ax_mat) ax_mat.set_yticks(np.arange(len(df.index))) ax_mat.set_yticklabels(dax_info.loc[df.index]['Ticker symbol'].tolist()) ax_mat.set_xticklabels(dax_info.loc[df.index[map(int, ax_mat.get_xticks())[:-1]]]['Ticker symbol'].tolist()) ax_mat.grid(False) plt.savefig('correlation.pdf', bbox='tight') plt.figure(figsize=(10,10)) gs = gridspec.GridSpec(3, 3) ax_dendrogram = plt.subplot(gs[1:,:1]) Y = sch.linkage(df.values, method='centroid') Z = sch.dendrogram(Y, orientation='right', no_plot=False, ax=ax_dendrogram) index = list(reversed(Z['leaves'])) ax_mat = plt.subplot(gs[1:, -2:]) D = df.values[index,:] D = D[:,index] cax = plt.imshow(D, interpolation='nearest', axes=ax_mat) ax_mat.set_xticks([]) ax_mat.set_yticks([]) ax_dendrogram.set_yticklabels(dax_info.ix[index]['Ticker symbol'].tolist()) plt.savefig('dendrogram.pdf', bbox_inches='tight') from bokeh.plotting import * from bokeh.objects import HoverTool, ColumnDataSource output_notebook() def to_data_source(df): Y = sch.linkage(df.values, method='centroid') Z = sch.dendrogram(Y, orientation='right', no_plot=True) index = Z['leaves'] _names = df.columns.tolist() names = [_names[i] for i in index] # if i in range(5)] color_of = colorer() # xnames, ynames = zip( *((c,v) for c,v in permutations(names, r=2))) xnames = [] ynames = [] values = [] colors = [] for n in names: xnames.extend([n] * len(names)) ynames.extend(names) v = df.loc[n, names].tolist() values.extend(v) colors.extend(map(color_of, v)) source = ColumnDataSource( data=dict( xname = xnames, yname = ynames, colors= colors, values= values, ) ) return source, names def colorer(number_colors = 21): basis = sns.blend_palette(["seagreen", "ghostwhite", "#4168B7"], number_colors) palette = ["rgb(%d, %d, %d)" % (r,g,b) for r,g,b, a in np.round(basis * 255)] def color_of(value): i = np.round((value + 1.) * (number_colors -1) * 0.5) return palette[int(i)] return color_of def corrplot(source, names): rect('xname', 'yname', 0.9, 0.9, source=source, x_range=names, y_range=list(reversed(names)), color='colors', line_color=None, tools="resize,hover,previewsave", title="Correlation matrix", plot_width=500, plot_height=500) grid().grid_line_color = None axis().axis_line_color = None axis().major_tick_line_color = None axis().major_label_text_font_size = "7pt" axis().major_label_standoff = 0 xaxis().location = "top" xaxis().major_label_orientation = np.pi/3 hover = [t for t in curplot().tools if isinstance(t, HoverTool)][0] hover.tooltips = OrderedDict([ ('names', '@yname, @xname'), ('value', '@values') ]) return curplot() df = ds.get_at( datetime.date(2003,1,6)) source, names = to_data_source(df) bk = corrplot(source, names) show() def construct_mst(df, threshold=0.01): # threshold = 0.05 / len(names): g = nx.Graph() names = df.columns.unique() df_distance = np.sqrt( 2 * np.clip( 1. - df, 0., 2.) ) for i, n in enumerate(names): for j, m in enumerate(names): if j >= i: break val = df_distance.loc[n,m] # Bonferroni correction: TODO check implementation if np.abs(val) > threshold: g.add_edge(n, m, weight=val) return nx.minimum_spanning_tree(g) mst = construct_mst(df) def add_info(g, key, pds): new_attributes = map( lambda x: (x[0], x[1].decode('utf8')), filter(lambda i: g.has_node(i[0]), pds.iteritems())) nx.set_node_attributes(g, key, dict(new_attributes)) add_info(mst, 'group', dax_info['Prime Standard industry group']) add_info(mst, 'label', dax_info.Company) # fig =plt.figure() # nx.draw_graphviz(mst, node_color='steelblue') # import mpld3 # fig_d3 = mpld3.display(fig) # display(fig_d3) nx.draw_graphviz(mst, font_size=9, font_color='black', node_color='steelblue', labels=dict(map( lambda x: (x[0], x[1].decode('utf8')), filter(lambda i: mst.has_node(i[0]), dax_info.Company.iteritems())))) plt.savefig('mst.pdf', bbox='tight') from graph import widgetforcelayout from IPython.display import display from IPython.html import widgets def construct_dateslider(ds, callback=None): _time = pd.TimeSeries(ds.time_axis()) #.reindex() int_range = widgets.IntSliderWidget(min=100, max=_time.last_valid_index(), value=100, readout=False) int_range.set_css({'width': '900'}) text_area = widgets.LatexWidget() def get_current_date(): i = int_range.value val = _time.ix[i] return val def date_to_string(datevalue): return "%s/%d/%d" % (datevalue.year, datevalue.month, datevalue.day) def f(): new_date = get_current_date() new_val = date_to_string(new_date) if new_val == text_area.value: return text_area.value = new_val if callback: callback(new_date) text_area.value = date_to_string(get_current_date()) int_range.on_trait_change(f) date_container = widgets.ContainerWidget(children=[int_range, text_area]) # date_container.set_css('border', '3px dotted grey') return date_container widgetforcelayout.publish_js() figure() source, names = to_data_source(df) bk = corrplot(source, names) graph_widget = widgetforcelayout.GraphWidget(height=500, width=600, link_distance=10, charge=-330, color_domain=dax_info['Prime Standard industry group'].unique().tolist()) bokeh_widget= widgets.HTMLWidget() bokeh_widget.value = notebook_div(curplot()) corr_container = widgets.ContainerWidget(children=[graph_widget, bokeh_widget]) # corr_container.set_css('border', '3px dotted grey') import time def update_matrix_mst(datevalue): _df = ds.get_at(datevalue) _source, _names = to_data_source(_df) bk = corrplot(_source, _names) time.sleep(0.01) bokeh_widget.value = notebook_div(curplot()) _mst = construct_mst(_df) add_info(_mst, 'group', dax_info['Prime Standard industry group']) add_info(_mst, 'label', dax_info.Company) time.sleep(0.01) graph_widget.set_graph(_mst) date_container = construct_dateslider(ds, callback=update_matrix_mst) date_container.msg_throttle = 1 mst_title = widgets.HTMLWidget() mst_title.value='

Minimum Spanning Tree

' container = widgets.ContainerWidget( children=[mst_title, corr_container, date_container] ) # display(mst_title) display(container) date_container.remove_class('vbox') date_container.add_class('hbox') corr_container.remove_class('vbox') corr_container.add_class('hbox') mst_stats.plot(figsize=(16,3)) plt.ylabel('Diameter MST') %%time # initialize DataFrame all_t = ds.time_axis() mst_stats = pd.DataFrame(columns=['diameter'], index=all_t) # iterate for t, df in ds.iterate_time(): try: g = construct_mst(df) mst_stats.loc[t] = nx.algorithms.diameter(g)# #, nx.algorithms.degree_centrality} except ValueError as e: print "skipping ", t mst_stats.plot(figsize=(16,5)) plt.ylabel('Diameter MST') plt.savefig('diameter.pdf', bbox='tight') from graph.algos import construct_pmfg import planarity def construct_pmfg(df_corr_matrix): df_distance = np.sqrt( 2 * np.clip( 1. - df_corr_matrix, 0., 2.) ) dist_matrix = df_distance.values index_upper_triangular = np.triu_indices(dist_matrix.shape[0],1) isort = np.argsort( dist_matrix[index_upper_triangular] ) G = nx.Graph() for k in xrange(0,len(isort)): u = index_upper_triangular[0][isort[k]] v = index_upper_triangular[1][isort[k]] if dist_matrix[u,v] > 0: # remove perfect correlation because of diagonal FIXME G.add_edge(u, v, {'weight': float(dist_matrix[u,v])}) if not planarity.is_planar(G): G.remove_edge(u,v) return G def add_info_pmfg(g, key, prop_name, pds): idx = pds.columns.tolist().index(prop_name) new_attr = map( lambda r: (r[0], r[1][1][idx]), filter( lambda r: g.has_node(r[0]), enumerate(pds.iterrows()))) nx.set_node_attributes(pmfg, key, dict(new_attr)) pmfg = construct_pmfg(df) graph_widget = widgetforcelayout.GraphWidget(height=600, width=800, link_distance=25, charge=-1000) # ['Ticker symbol'] add_info_pmfg(pmfg, 'label', 'Company', dax_info) add_info_pmfg(pmfg, 'group', 'Prime Standard industry group', dax_info) # pmfg.nodes(data=True) display(graph_widget) time.sleep(0.1) graph_widget.set_graph(pmfg) map( lambda v: (dax_info.index[v[0]], v[1]), sorted( nx.algorithms.eigenvector_centrality(pmfg).items(), key=lambda x: x[1])) nx.algorithms.eigenvector_centrality? mean_measure = lambda g, f_m, **d: np.mean(f_m(g, **d).values()) aggregated_centrality = lambda g: ( mean_measure( g, nx.algorithms.betweenness_centrality, weight='weight'), mean_measure( g, nx.algorithms.closeness_centrality, distance='weight'), mean_measure( g, nx.algorithms.eigenvector_centrality) ) aggregated_centrality(pmfg) %%time # initialize DataFrame all_t = ds.time_axis() pmfg_stats = pd.DataFrame(columns=['betweenness_centrality', 'closeness_centrality', 'eigenvector_centrality'], index=all_t) # iterate for t, df in ds.iterate_time(): try: g = construct_pmfg(df) pmfg_stats.loc[t] = aggregated_centrality(g)# #, nx.algorithms.degree_centrality} except ValueError as e: print "skipping ", t pmfg_stats pmfg_stats.plot(figsize=(16,5), subplots=True) pmfg_stats pmfg_stats.plot(figsize=(16,5), subplots=True) # plt.ylabel('Diameter MST')