This notebook exists to take the graphs from all the other notebooks and process them into tikz files for plotting in the report. This involves loading the numpy files which have been saved and plotting the graphs again. The default settings at the top of this notebook will be propagated to all the saved graphs, so it's very important that these are chosen appropriately.
It can take a long time to produce the results being graphed in each of the notebooks. To recreate the results would involve running, in some cases, the entire notebook again. Therefore, to make it easy to change annotations or formatting of the plots, it is better to save the results in those notebooks and plot the graphs here. However, we would still like to see the results in those notebooks straight after running the analysis, so the graphs will be plotted there as well.
Of course, it would still save time to save the graphs once in the notebook in which the results are generated as pgf. Unfortunately, many of the results were saved before it was clear what the best way to plot the results would be so they have been saved to be plotted in this notebook. Also the backend for notebooks running pylab (which is all of them) can't be changed while the kernel is running. In new notebooks it would be preferable to save both the data and the final plots to pgf but it's not clear whether that will be possible and it may be easier to simply do all graph processing in this notebook.
Starting with just the default values from the parallel ML tutorial:
import matplotlib
# Set backend to pgf
matplotlib.use('pgf')
import matplotlib.pyplot as plt
import numpy as np
# Some nice default configuration for plots
# Every figure defaults to 6 x 4.5 inches with the axes grid switched on.
plt.rcParams['figure.figsize'] = 6, 4.5
plt.rcParams['axes.grid'] = True
# Make gray the default colormap.
plt.gray()
#testing custom preamble from here: http://matplotlib.org/users/pgf.html
# rc settings for the pgf backend; they are applied globally by the update()
# call below, so every figure saved later in this notebook inherits them.
# NOTE(review): newer matplotlib releases appear to expect "pgf.preamble" as a
# single string rather than a list - confirm against the installed version.
pgf_with_custom_preamble = {
    "font.family": "serif", # use serif/main font for text elements
    "text.usetex": True, # use inline math for ticks
    "pgf.rcfonts": False, # don't setup fonts from rc parameters
    "pgf.preamble": [
        r"\usepackage{units}", # load additional packages
        r"\usepackage{metalogo}",
        r"\usepackage{unicode-math}", # unicode math setup
        r"\setmathfont{xits-math.otf}",
        r"\setmainfont{DejaVu Serif}", # serif font via preamble
    ],
    # same size as the plt.rcParams setting above
    "figure.figsize":(6,4.5)
}
matplotlib.rcParams.update(pgf_with_custom_preamble)
%matplotlib inline
cd ../../plots/bayes
/home/gavin/Documents/MRes/plots/bayes
import os
# Directory that receives the figures for the report.  The relative path is
# resolved against the working directory at the moment this line runs, so it
# must be executed after changing into the plots directory.
imagedir = os.path.abspath("../../opencast-bio/report/images/")
The graphs from the Classifier Training notebook fitting a logistic regression model to the DIP training data.
Loading the saved data file and plotting with a sample of the same code:
# Load the saved learning-curve results.  'arr_0' is indexed with an empty
# tuple to unwrap the single object stored in it (used below as a mapping from
# training-set size to scores).  The archive is opened in a with-block, as in
# the later cells, so the file handle is closed once the data has been read.
# NOTE(review): the stored object is presumably pickled; recent NumPy versions
# then need np.load(..., allow_pickle=True) - only enable that for trusted files.
with np.load("lrlc.npz") as lrlc:
    mean_scores = lrlc['arr_0'][()]
Defining the function to plot the learning curve:
def plotlearningcurve(mean_scores):
    """Plot train and test scores against training-set size.

    mean_scores maps each training-set size to a sequence read as
    [0] train score, [1] train spread, [2] test score, [3] test spread.
    Each curve is drawn with a shaded band of +/- 2 times its spread entry.
    A new figure is created and left as the current pyplot figure, so the
    caller can save it afterwards.

    Raises ValueError if mean_scores is empty.
    """
    if not mean_scores:
        # Fail with a clear message instead of an opaque max() error later on.
        raise ValueError("mean_scores is empty: no learning curve to plot")

    # Imported only once there is something to draw.  pyplot provides the same
    # plotting functions as the pylab namespace used previously.
    import matplotlib.pyplot as plt

    # now the mean_scores dictionary contains everything required to build the plot
    trainsizes = sorted(mean_scores.keys())
    mean_train = np.array([mean_scores[size][0] for size in trainsizes])
    mean_test = np.array([mean_scores[size][2] for size in trainsizes])
    train_confidence = np.array([mean_scores[size][1] * 2 for size in trainsizes])
    test_confidence = np.array([mean_scores[size][3] * 2 for size in trainsizes])

    plt.figure()

    # plot the training scores
    plt.fill_between(trainsizes, mean_train - train_confidence,
                     mean_train + train_confidence, color='b', alpha=.2)
    # The colour is given once, via the keyword; the old 'o-k' format string
    # also asked for black and was silently overridden.
    plt.plot(trainsizes, mean_train, 'o-', color='b', label='Train score')

    # plot the test scores
    plt.fill_between(trainsizes, mean_test - test_confidence,
                     mean_test + test_confidence, color='g', alpha=.2)
    plt.plot(trainsizes, mean_test, 'o-', color='g', label='Test score')

    # extra annotation
    plt.xlabel('Training set size')
    plt.ylabel('Score')
    plt.xlim(0, max(trainsizes))
    plt.ylim((None, 1.0))  # The best possible score is 1.0
    plt.legend(loc='best')
    # NOTE(review): 'Main' is presumably meant to read 'Mean'; the text is kept
    # verbatim so existing report figures do not change - confirm before editing.
    plt.title('Main train and test scores +/- 2 standard errors')
# Draw the learning curve from the loaded scores and save the current figure.
# Unlike the later figures, this one is written to the working directory
# rather than to imagedir.
plotlearningcurve(mean_scores)
plt.savefig("lrlc.pgf", format='pgf')
The graphs created in either of the data visualisation notebooks must be saved to pgf to be included in the report.
import pandas as pd
from pandas.tools.plotting import andrews_curves
/usr/lib/python2.7/site-packages/pandas/io/excel.py:626: UserWarning: Installed openpyxl is not supported at this time. Use >=1.6.1 and <2.0.0. .format(openpyxl_compat.start_ver, openpyxl_compat.stop_ver))
# Use the pandas-provided matplotlib style for the plots below.
# NOTE(review): this option appears to have been removed from later pandas
# releases - confirm against the installed version.
pd.options.display.mpl_style = 'default'

# The same two figures are produced for both saved feature matrices:
#   <prefix>.andrews.curves.png  - Andrews curves coloured by training label
#   <prefix>.parallel.lines.png  - one line per sample across feature indices
# Only PNG is written; the pgf/tikz export of these figures stays disabled.
for prefix, npzname in (("out", "parrallel.coordinates.plot.oop.npz"),
                        ("in", "parrallel.coordinates.plot.ip.npz")):
    # Arrays are read inside the with-block so the archive is closed afterwards.
    with np.load(npzname) as fnpz:
        X = fnpz['arr_0']
        y = fnpz['arr_1']

    # Scale every feature column by its largest absolute value; the small
    # offset avoids dividing by zero for all-zero columns.
    maxes = np.amax(abs(X), axis=0) + 1e-14
    normalised = X / maxes
    plotdata = pd.DataFrame(normalised)
    plotdata['training labels'] = y

    # Each plot gets its own figure so nothing is drawn over a previous plot.
    plt.figure()
    andrews_curves(plotdata, 'training labels')
    plt.savefig(os.path.join(imagedir, prefix + ".andrews.curves.png"), format='png')

    plt.figure()
    for row, label in zip(normalised, y):
        # labels above 0.5 are drawn in strong green, the rest in faint red
        if label > 0.5:
            plt.plot(row, color='green', alpha=0.5)
        else:
            plt.plot(row, color='red', alpha=0.05)
    plt.savefig(os.path.join(imagedir, prefix + ".parallel.lines.png"), format='png')
The low dimensional plots (the PCA and t-SNE projections) are saved in the following section.
t-SNE should not have any ticks on the axes as the values are meaningless.
# PCA projection, "oop" data set.  The first `nones` rows are plotted as
# interactions and all remaining rows as non-interactions.
# nzeros is assigned alongside nones but not used by the plotting code.
nones, nzeros = 1000, 1000
with np.load("pca.oop.npz") as npz:
    X_pca = npz['arr_0']
plt.figure()  # own figure, so nothing is drawn over an earlier plot
ones = plt.scatter(X_pca[:nones, 0], X_pca[:nones, 1], c='red', alpha=0.2)
zeros = plt.scatter(X_pca[nones:, 0], X_pca[nones:, 1], c='blue', marker="x", alpha=0.2)
l = plt.legend((ones, zeros), ("interactions", "non-interactions"), loc=0)
plt.savefig(os.path.join(imagedir, "out.pca.png"), format='png')

# PCA projection, "ip" data set; the non-interactions are drawn fainter here.
# Only PNG is written; the pgf/tikz export of this figure stays disabled.
nones, nzeros = 1000, 600000
with np.load("pca.ip.npz") as npz:
    X_pca = npz['arr_0']
plt.figure()
ones = plt.scatter(X_pca[:nones, 0], X_pca[:nones, 1], c='red', alpha=0.2)
zeros = plt.scatter(X_pca[nones:, 0], X_pca[nones:, 1], c='blue', marker="x", alpha=0.05)
l = plt.legend((ones, zeros), ("interactions", "non-interactions"), loc=3)
plt.savefig(os.path.join(imagedir, "in.pca.png"), format='png')
# t-SNE embedding, "oop" data set.  The first `nones` rows are plotted as
# interactions and all remaining rows as non-interactions.
# nzeros is assigned alongside nones but not used by the plotting code.
# Only PNG is written; the pgf/tikz export of these figures stays disabled.
nones, nzeros = 100, 100
with np.load("tdsne.oop.npz") as npz:
    X_tsne = npz['arr_0']
plt.figure()  # own figure, so nothing is drawn over an earlier plot
ones = plt.scatter(X_tsne[:nones, 0], X_tsne[:nones, 1], c='red')
zeros = plt.scatter(X_tsne[nones:, 0], X_tsne[nones:, 1], c='blue', marker="x")
l = plt.legend((ones, zeros), ("interactions", "non-interactions"), loc=0)
# Hide the tick labels.  Booleans are used because the strings 'on'/'off' are
# not reliably interpreted as flags by matplotlib.
plt.tick_params(labelleft=False, labelbottom=False)
plt.savefig(os.path.join(imagedir, "out.tsne.png"), format='png')

# t-SNE embedding, "ip" data set; the non-interactions are drawn fainter.
nones, nzeros = 10, 6000
with np.load("tdsne.ip.npz") as npz:
    X_tsne = npz['arr_0']
plt.figure()
ones = plt.scatter(X_tsne[:nones, 0], X_tsne[:nones, 1], c='red', alpha=0.6)
zeros = plt.scatter(X_tsne[nones:, 0], X_tsne[nones:, 1], c='blue', marker="x", alpha=0.1)
l = plt.legend((ones, zeros), ("interactions", "non-interactions"), loc=0)
plt.tick_params(labelleft=False, labelbottom=False)
plt.savefig(os.path.join(imagedir, "in.tsne.png"), format='png')
A graph produced when training using DIP or HIPPIE demonstrating the poor feature importances produced - relying on only the features derived from interaction databases.
# Figure size in inches (width, height) shared by the small report figures
# produced from here on.
dimensions = (4, 2.4)

# Random forest feature importances saved under the hippie directory.  The
# archive is opened in a with-block, as in the later cells, so it is closed
# once the array has been read.
with np.load("../hippie/random.forest.importances.npz") as npz:
    rfimportances = npz['arr_0']
fig = plt.figure()
fig.set_size_inches(*dimensions)
plt.plot(rfimportances)
plt.xlabel("Feature index")
plt.ylabel("Importance")
plt.savefig(os.path.join(imagedir, "unbalanced.weighting.tikz"), format='pgf')
These are the weightings on features actually produced when training the classifier in the final iteration.
# Logistic regression coefficients, plotted against feature index.  Archives
# are opened in with-blocks, as in the later cells, so they are closed once
# the arrays have been read.
with np.load("logistic.regression.coef.npz") as npz:
    logrefweights = npz['arr_0']
fig = plt.figure()
fig.set_size_inches(*dimensions)
plt.plot(logrefweights)
plt.xlabel("Feature index")
plt.ylabel("Coefficient")
plt.savefig(os.path.join(imagedir, "logreg.weights.tikz"), format='pgf')

# Random forest feature importances, plotted the same way.
with np.load("random.forest.importances.npz") as npz:
    rfimportances = npz["arr_0"]
fig = plt.figure()
fig.set_size_inches(*dimensions)
plt.plot(rfimportances)
plt.xlabel("Feature index")
plt.ylabel("Importance")
plt.savefig(os.path.join(imagedir, "rf.importances.tikz"), format='pgf')
The ROC curves produced by the logistic regression and random forest classifiers.
def plotroc(fpr, tpr, name):
    """Draw a ROC curve and save it as pgf to imagedir/name.

    fpr and tpr are the false and true positive rates to plot against each
    other.  A dashed black diagonal from (0, 0) to (1, 1) is added and both
    axes are limited to [0, 1].  The figure size comes from the module-level
    `dimensions`.  Returns None.
    """
    target = os.path.join(imagedir, name)

    roc_figure = plt.figure()
    roc_figure.set_size_inches(*dimensions)
    plt.clf()

    # the classifier's curve, then the diagonal reference line
    plt.plot(fpr, tpr)
    plt.plot([0, 1], [0, 1], 'k--')

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')

    plt.savefig(target, format='pgf')
# Logistic regression ROC: false/true positive rates are stored as separate
# arrays.  Each archive is now opened once and closed after reading, instead
# of being opened separately for every array.
with np.load("logistic.regression.roc.npz") as roc:
    fpr = roc['arr_0']
    tpr = roc['arr_1']
plotroc(fpr, tpr, "logreg.roc.tikz")

# Random forest ROC: both rates are stored in 'arr_0', as entry 0 (fpr) and
# entry 1 (tpr).
with np.load("random.forest.roc.npz") as roc:
    rf_roc = roc['arr_0']
fpr = rf_roc[0]
tpr = rf_roc[1]
plotroc(fpr, tpr, "rf.roc.tikz")
The precision-recall curves for the logistic regression and random forest classifiers.
def drawprecisionrecall(precision, recall, name):
    """Draw a precision-recall curve and save it as pgf to imagedir/name.

    Recall goes on the x axis (limited to [0, 1]) and precision on the y axis
    (limited to [0, 1.05]).  The figure size comes from the module-level
    `dimensions`.  Returns None.
    """
    target = os.path.join(imagedir, name)

    pr_figure = plt.figure()
    pr_figure.set_size_inches(*dimensions)
    plt.clf()

    plt.plot(recall, precision)

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')

    plt.savefig(target, format='pgf')
# Precision-recall curves: each archive stores precision as 'arr_0' and recall
# as 'arr_1'.  Logistic regression is drawn first, then the random forest.
for npzname, outname in (("lr_precisionrecall.npz", "logreg.pr.tikz"),
                         ("random.forest.precisionrecall.npz", "rf.pr.tikz")):
    with np.load(npzname) as data:
        precision = data['arr_0']
        recall = data['arr_1']
    drawprecisionrecall(precision, recall, outname)
The following is a histogram of weightings produced by the Bayesian weighting algorithm.
# Histogram (50 bins) of the saved posterior weightings, at the shared small
# figure size, written to the report image directory as pgf.
with np.load("postweightings.npz") as nf:
    weights = nf['arr_0']

fig = plt.figure()
fig.set_size_inches(dimensions[0], dimensions[1])
# the return value is bound so the notebook does not echo the bin arrays
h = plt.hist(weights, bins=50)
plt.ylabel("frequency")
plt.xlabel("posterior probability")
plt.savefig(os.path.join(imagedir, "bayes.weights.dist.tikz"), format='pgf')
These are the graphs plotted using networkx to compare two communities by eye.
# Member lists of the two communities to compare ('arr_0' and 'arr_1').
with np.load("nx2933.npz") as nf:
    uw29, w33 = list(nf['arr_0']), list(nf['arr_1'])

# Edge list read as strings, with the first loaded row discarded
# (presumably a header line - confirm against the file).
interactions = np.loadtxt("../../HBP/testdata/edgelist_update_weighted.txt", dtype=str)[1:]
import networkx as nx
def plotcommunities(com1, com2, fname, title):
    """Draw the interactions among two communities and save the figure.

    com1 and com2 are lists of node names.  Every row of the module-level
    `interactions` table (node, node, weight) whose two nodes both belong to
    com1 or com2 becomes an edge.  com1 is laid out on a circle; the members
    of com2 that are not in com1 go on a second circle shifted 1.5 to the
    right.  Edges are drawn in red, with opacity rising in 10 steps from the
    smallest weight present (transparent) up to a weight of 1.0 (opaque).
    The figure is saved to imagedir/fname, its format taken from the file
    extension.

    Edge weights are assumed not to exceed 1.0; larger weights are drawn
    fully opaque.

    Raises ValueError if no interaction connects members of the communities.
    """
    # Build the membership set once instead of twice per edge-list row.
    members = set(com1) | set(com2)
    G = nx.Graph()
    for row in interactions:
        if row[0] in members and row[1] in members:
            G.add_edge(row[0], row[1], weight=float(row[2]))

    weighted = [(u, v, d['weight']) for (u, v, d) in G.edges(data=True)]
    if not weighted:
        raise ValueError("no interactions found between the given communities")

    fig = plt.figure()
    fig.set_size_inches(4, 3)
    plt.title(title, size=12)

    # Group the edges into opacity bins.  Bin i covers
    # [thresholds[i], thresholds[i + 1]), so an edge sitting exactly on a
    # boundary - including weight 1.0 - is kept.  The previous strict
    # comparisons on both sides silently dropped such edges.
    n_bins = 10
    lim = min(w for (_, _, w) in weighted)
    thresholds = np.linspace(lim, 1.0, n_bins)
    binned = {}
    for u, v, w in weighted:
        idx = int(np.searchsorted(thresholds, w, side='right')) - 1
        idx = min(max(idx, 0), n_bins - 1)
        binned.setdefault(idx, []).append((u, v))

    # Two circular layouts side by side.  The merge is done with plain dict
    # assignment so that it works on both Python 2 and Python 3.
    pos = dict(nx.circular_layout(com1))
    for node, xy in nx.circular_layout(set(com2) - set(com1)).items():
        pos[node] = np.array([xy[0] + 1.5, xy[1]])

    nx.draw_networkx_nodes(G, pos, node_size=20, alpha=0.5)
    for idx in sorted(binned):
        # idx / (n_bins - 1) equals (threshold - lim) / (1 - lim), without the
        # division by zero that occurred when every weight is 1.0.
        nx.draw_networkx_edges(G, pos, edgelist=binned[idx],
                               alpha=idx / float(n_bins - 1), edge_color='r')
    nx.draw_networkx_labels(G, pos, font_size=5, font_family='sans-serif')

    # Booleans rather than 'off' strings, which are not reliably treated as flags.
    plt.tick_params(labelleft=False, labelbottom=False)
    plt.savefig(os.path.join(imagedir, fname))
# Communities 29 (unweighted) and 33 (weighted): one figure per ordering, so
# each community in turn takes the left-hand circle.
for first, second, outname, heading in (
        (uw29, w33, "nx2933.pgf", "Unweighted community 29 interactions"),
        (w33, uw29, "nx3329.pgf", "Weighted community 33 interactions")):
    plotcommunities(first, second, outname, heading)

# Same again for communities 64 (unweighted) and 44 (weighted).
with np.load("nx6444.npz") as nf:
    uw64 = list(nf['arr_0'])
    w44 = list(nf['arr_1'])
for first, second, outname, heading in (
        (uw64, w44, "nx6444.pgf", "Unweighted community 64 interactions"),
        (w44, uw64, "nx4464.pgf", "Weighted community 44 interactions")):
    plotcommunities(first, second, outname, heading)