import numpy as np
from eden.io.gspan import load

# Materialize both gspan files as lists so their lengths can be taken below.
pos_graphs = list(load('data/bursi.pos.gspan'))
neg_graphs = list(load('data/bursi.neg.gspan'))

# Positives come first, so the label vector follows the same ordering:
# +1 for every positive graph, then -1 for every negative graph.
graphs = pos_graphs + neg_graphs
y = np.array([1] * len(pos_graphs) + [-1] * len(neg_graphs))
EDeN exports a `vectorize` function that converts a list of input graphs
into a data matrix.
The output is a SciPy Compressed Sparse Row (CSR) matrix.
%%time
from eden.graph import vectorize

# Turn the list of graphs into a sparse feature matrix (one row per graph).
X = vectorize(graphs, complexity=2)

# print() as a function works on both Python 2 and Python 3 when given a single
# argument; floor division keeps the average an integer on both versions
# instead of becoming a float under Python 3 true division.
print('Instances: %d Features: %d with an avg of %d features per instance' % (
    X.shape[0], X.shape[1], X.getnnz() // X.shape[0]))
Instances: 4337 Features: 65537 with an avg of 184 features per instance CPU times: user 10.4 s, sys: 2.44 s, total: 12.9 s Wall time: 12.1 s
Several predictive algorithms from the scikit-learn library can process data in CSR format.
%%time
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score

# Linear model trained with SGD; class weights are balanced to compensate
# for unequal numbers of positive and negative graphs.
predictor = SGDClassifier(
    average=True,
    class_weight='balanced',
    shuffle=True,
    n_jobs=-1,
)

# 10-fold cross-validation scored by area under the ROC curve.
scores = cross_val_score(predictor, X, y, cv=10, scoring='roc_auc')

# Report the mean and standard deviation across the folds.
print('AUC ROC: %.4f +- %.4f' % (scores.mean(), scores.std()))
AUC ROC: 0.9007 +- 0.0153 CPU times: user 920 ms, sys: 84.5 ms, total: 1 s Wall time: 1.17 s