"""Compare ordinal logistic, multiclass logistic and ridge regression.

The script loads the Boston housing data, rounds the targets so they can be
treated as ordered classes, evaluates the three models on random
train/test splits, prints the mean absolute error of each model and saves a
horizontal bar chart of the results to ``bars_ordinal.png``.
"""
from __future__ import print_function

import matplotlib.pyplot as plt
import numpy as np
from sklearn import cross_validation, datasets, linear_model, metrics

from minirank import ordinal_logistic_fit, ordinal_logistic_predict

DOC = """
================================================================================
    Compare the prediction accuracy of different models on the boston dataset
================================================================================
"""
print(DOC)

boston = datasets.load_boston()
X, y = boston.data, np.round(boston.target)
# NOTE(review): this subtracts the scalar mean of the whole matrix, not the
# per-feature mean -- kept as in the original; confirm this is intended.
X -= X.mean()
# Shift the rounded targets so that the smallest label is 0.
y -= y.min()

# Order the samples by increasing target value.
idx = np.argsort(y)
X = X[idx]
y = y[idx]

cv = cross_validation.ShuffleSplit(
    y.size, n_iter=50, test_size=.1, random_state=0)

# The set of labels is loop-invariant, so compute it once.
all_classes = np.unique(y)

score_logistic = []
score_ordinal_logistic = []
score_ridge = []

for i, (train, test) in enumerate(cv):
    # We need the train set to contain every class.  ``array_equal`` also
    # copes with arrays of different length, where an elementwise ``==``
    # would warn or raise depending on the NumPy version.
    if not np.array_equal(np.unique(y[train]), all_classes):
        continue

    train = np.sort(train)
    test = np.sort(test)

    # Ordinal logistic regression (minirank).
    w, theta = ordinal_logistic_fit(X[train], y[train])
    # NOTE(review): the full label vector ``y`` is passed here exactly as in
    # the original call; confirm against minirank's signature.
    pred = ordinal_logistic_predict(w, theta, X[test], y)
    s = metrics.mean_absolute_error(y[test], pred)
    print('ERROR (ORDINAL) fold %s: %s' % (i + 1, s))
    score_ordinal_logistic.append(s)

    # Plain multiclass logistic regression.
    clf = linear_model.LogisticRegression(C=1.)
    clf.fit(X[train], y[train])
    pred = clf.predict(X[test])
    s = metrics.mean_absolute_error(y[test], pred)
    print('ERROR (LOGISTIC) fold %s: %s' % (i + 1, s))
    score_logistic.append(s)

    # Ridge regression; predictions are rounded to the nearest label.
    clf = linear_model.Ridge(alpha=1.)
    clf.fit(X[train], y[train])
    pred = np.round(clf.predict(X[test]))
    s = metrics.mean_absolute_error(y[test], pred)
    # Fixed label: this fold report previously said LOGISTIC (copy-paste).
    print('ERROR (RIDGE) fold %s: %s' % (i + 1, s))
    score_ridge.append(s)

print()
print('MEAN ABSOLUTE ERROR (ORDINAL LOGISTIC):    %s'
      % np.mean(score_ordinal_logistic))
print('MEAN ABSOLUTE ERROR (LOGISTIC REGRESSION): %s'
      % np.mean(score_logistic))
print('MEAN ABSOLUTE ERROR (RIDGE REGRESSION):    %s'
      % np.mean(score_ridge))

all_scores = (score_ordinal_logistic, score_logistic, score_ridge)
pos = np.arange(3) + 1.1  # the bar centers on the y axis
# Build real lists: under Python 3 ``map`` returns a one-shot iterator,
# which is not a valid sequence argument for ``barh``.
val = [np.mean(scores) for scores in all_scores]
xerr = [np.std(scores) for scores in all_scores]

plt.barh(pos, val, xerr=xerr, align='center', alpha=.5, ecolor='black')
plt.yticks(
    pos,
    ('Ordinal Logistic', 'Multiclass Logistic', 'Linear Regression'),
    rotation=45)
plt.grid(True)
plt.xlabel('Mean Absolute Error (lower is better)', fontsize='x-large')
plt.tight_layout()
plt.savefig('bars_ordinal.png')
plt.show()