import numpy as np import matplotlib import matplotlib.pyplot as plt import pandas as pd %matplotlib inline matplotlib.rcParams['savefig.dpi'] = 300 df = pd.read_csv('../data/allen_brain_atlas.csv') data = np.array(df.ix[:, :-2]) data = data - data.mean(0) corr = np.dot(data.T, data)/data.shape[0] evals, evecs = np.linalg.eig(corr) pcs = np.dot(data, evecs[:, :]) fig, axes = plt.subplots(5,5, sharex='col', sharey='row', figsize=(10,10)) for i in range(5): for j in range(5): ax = axes[i,j] if i==j: ax.set_axis_off() else: ax.plot(pcs[::10,i], pcs[::10,j], '.', ms=5) from sklearn import cluster W = evecs.dot(np.diag(1./np.sqrt(evals))).dot(evecs.T) whitened = np.dot(W, data.T) C_W = np.dot(whitened, whitened.T)/whitened.shape[1] plt.imshow(C_W, interpolation='nearest') centers, labels, _ = cluster.k_means(whitened.T, 4) plt.scatter(pcs[:,0], pcs[:,1], c=labels, cmap='hsv') plt.xlabel("PC1") plt.ylabel("PC2") ax = plt.gca() plt.text(0.95, 0.05, 'Data source: Allen Brain Institute', transform=ax.transAxes, va='bottom', ha='right') from matplotlib import colors, cm def get_colors(labels, cmap): norm_labels = colors.Normalize()(labels) cmap_func = cm.get_cmap(cmap) class_colors = cmap_func(norm_labels) return class_colors class_colors = get_colors(np.unique(labels), 'hsv') from lxml import etree with file('../data/brain_atlas.svg') as fid: svg = etree.fromstring(fid.read()) with file('../data/ontology.xml') as fid: ontology = etree.fromstring(fid.read()) def color_svg(svg, ontology, data_ids, labels, colors): show = [] unique_data_ids = np.unique(data_ids) paths = svg.xpath('//*[@structure_id]') ulabels = np.unique(labels) class_members = [data_ids[labels==l] for l in ulabels] n_members = np.array([len(c) for c in class_members]) for p in paths: sid = p.get('structure_id') el = ontology.xpath('//structure[id={}]'.format(sid))[0] descendants = [] for i in unique_data_ids: if i==sid or el.xpath('descendant::structure[id={}]'.format(i)): descendants.append(i) if descendants: descendants = np.array(descendants) n_probes = np.array([np.sum(np.in1d(cl, descendants)) for cl in class_members]) frac = n_probes*1./n_members dominating = np.argmax(frac) r, g, b = (np.array(colors[dominating])*255).astype(int)[:3] p.set('style', 'fill: rgb({},{},{}); stroke: none'.format(r,g,b)) else: p.set('style', 'fill: rgb(255,255,255); stroke: none') return svg from IPython.display import SVG new_svg = color_svg(svg, ontology, df.structure_id, labels, class_colors) SVG(etree.tostring(new_svg))