# Code to regenerate the input files for the results in the paper:
#
#   http://www.fabiangieseke.de/pdfs/neucom2013_draft.pdf
#   (see Table 2, results for the real-sim data set)
#
# See also:
#   http://www.fabiangieseke.de/pdfs/icpram2012.pdf
#
import sys
from time import time
from pprint import pprint

import numpy as np
import scipy
import scipy.sparse as sp
import joblib
import io
import os.path

import sklearn
import sklearn.svm
import sklearn.datasets
import sklearn.metrics
import sklearn.cross_validation
from sklearn.externals.six import u, b

import warnings
warnings.filterwarnings('ignore')

%pylab inline

# Load the real-sim data set (SVMlight format).
X, y = sklearn.datasets.load_svmlight_file('real-sim')
X.shape

from collections import Counter
Counter(y)

# 50/50 stratified split into a training half and a held-out test half.
splits = sklearn.cross_validation.StratifiedShuffleSplit(y, n_iter=1, test_size=0.50)
train_indices, test_indices = next(iter(splits))
instance_ids = np.arange(y.size)
X_train = X[train_indices]
train_ids = instance_ids[train_indices]
X_test = X[test_indices]
test_ids = instance_ids[test_indices]
train_labels = y[train_indices]
test_labels = y[test_indices]

# Fully supervised baseline on the complete training half.
svm = sklearn.svm.LinearSVC(penalty='l2', C=1, dual=False)
svm.fit(X_train, train_labels)
accuracy = sklearn.metrics.accuracy_score(test_labels, svm.predict(X_test))
print 100.0 * accuracy
#ncv = 10
#print sklearn.cross_validation.cross_val_score(svm, X_train, train_labels, cv=10).sum()/ncv
#print sklearn.cross_validation.cross_val_score(svm, X_test, test_labels, cv=10).sum()/ncv
print X_train.shape
print X_test.shape


def print_svmlight_infiles(L, y_l, U, y_u, HO, y_ho):
    # SVMlight transductive format: labeled and unlabeled rows are stacked
    # into one training file, with the unlabeled rows given label 0.
    # Separate files are written for evaluating on L, U, and the held-out set.
    A = scipy.sparse.vstack((L, U))
    unk_l = y_u * 0
    a_l = np.hstack((y_l, unk_l))
    print A.shape, a_l.shape
    numL = L.shape[0]
    training_file = 'svmlight.train.%d' % numL
    sklearn.datasets.dump_svmlight_file(A, a_l, training_file, zero_based=False)
    testL_file = 'svmlight.testL.%d' % numL
    sklearn.datasets.dump_svmlight_file(L, y_l, testL_file, zero_based=False)
    testU_file = 'svmlight.testU.%d' % numL
    sklearn.datasets.dump_svmlight_file(U, y_u, testU_file, zero_based=False)
    testHO_file = 'svmlight.testHO.%d' % numL
    sklearn.datasets.dump_svmlight_file(HO, y_ho, testHO_file, zero_based=False)


def dump_svmlin(X, y, fX, fy):
    # SVMlin two-file format: fX gets one "index:value ..." row per example
    # (1-based feature indices), fy gets the matching label per row
    # (label 0 marks an unlabeled example). Sparse input is assumed to be CSR.
    X_value_pattern = u("%d:%.16g")
    is_sp = int(hasattr(X, "tocsr"))
    fy.write(''.join(["%d\n" % label for label in y]))
    for i in range(X.shape[0]):
        if is_sp:
            span = slice(X.indptr[i], X.indptr[i + 1])
            row = zip(X.indices[span], X.data[span])
        else:
            nz = X[i] != 0
            row = zip(np.where(nz)[0], X[i, nz])
        s = " ".join(X_value_pattern % (j + 1, x) for j, x in row)
        fX.write(("%s\n" % s).encode('ascii'))
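
# Quick sanity check for dump_svmlin (a minimal sketch, not part of the
# original pipeline): a tiny dense matrix should produce one "index:value"
# row per example in the examples stream and one label per row in the
# labels stream. Uses only numpy and the io module imported above.
def _check_dump_svmlin():
    X_demo = np.array([[0.0, 1.5], [2.0, 0.0]])
    y_demo = np.array([1, 0])  # second row plays the role of an unlabeled point
    fX, fy = io.BytesIO(), io.BytesIO()
    dump_svmlin(X_demo, y_demo, fX, fy)
    print repr(fX.getvalue())  # expected: '2:1.5\n1:2\n'
    print repr(fy.getvalue())  # expected: '1\n0\n'

#_check_dump_svmlin()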
def print_svmlin_infiles(L, y_l, U, y_u, HO, y_ho):
    # Same layout as the SVMlight files, but in SVMlin's paired
    # examples/labels format (unlabeled rows get label 0).
    print "print_svmlin_infiles..."
    A = scipy.sparse.vstack((L, U)).tocsr()  # vstack may return COO; dump_svmlin needs CSR
    unk_l = y_u * 0
    a_l = np.hstack((y_l, unk_l))
    print A.shape, a_l.shape
    numL = L.shape[0]

    examples_file = open('svmlin.train.examples.%d' % numL, "wb")
    labels_file = open('svmlin.train.labels.%d' % numL, "wb")
    dump_svmlin(A, a_l, examples_file, labels_file)
    examples_file.close()
    labels_file.close()

    examples_file = open('svmlin.testL.examples.%d' % numL, "wb")
    labels_file = open('svmlin.testL.labels.%d' % numL, "wb")
    dump_svmlin(L, y_l, examples_file, labels_file)
    examples_file.close()
    labels_file.close()

    examples_file = open('svmlin.testU.examples.%d' % numL, "wb")
    labels_file = open('svmlin.testU.labels.%d' % numL, "wb")
    dump_svmlin(U, y_u, examples_file, labels_file)
    examples_file.close()
    labels_file.close()

    examples_file = open('svmlin.testHO.examples.%d' % numL, "wb")
    labels_file = open('svmlin.testHO.labels.%d' % numL, "wb")
    dump_svmlin(HO, y_ho, examples_file, labels_file)
    examples_file.close()
    labels_file.close()


def print_universvm_infiles(L, y_l, U, y_u, HO, y_ho):
    # Same as print_svmlight_infiles, but the unlabeled rows are encoded
    # with label -3, the convention used here for UniverSVM.
    A = scipy.sparse.vstack((L, U))
    unk_l = y_u * 0 - 3
    a_l = np.hstack((y_l, unk_l))
    print A.shape, a_l.shape
    numL = L.shape[0]
    training_file = 'universvm.train.%d' % numL
    sklearn.datasets.dump_svmlight_file(A, a_l, training_file, zero_based=False)
    testL_file = 'universvm.testL.%d' % numL
    sklearn.datasets.dump_svmlight_file(L, y_l, testL_file, zero_based=False)
    testU_file = 'universvm.testU.%d' % numL
    sklearn.datasets.dump_svmlight_file(U, y_u, testU_file, zero_based=False)
    testHO_file = 'universvm.testHO.%d' % numL
    sklearn.datasets.dump_svmlight_file(HO, y_ho, testHO_file, zero_based=False)


# Fraction of the training half that is kept labeled; the remaining
# (1 - U_size) fraction becomes the unlabeled set U.
#split_sizes = [0.0025, 0.005, 0.01, 0.04, 0.08]
split_sizes = [0.08]
for U_size in split_sizes:
    splits = sklearn.cross_validation.StratifiedShuffleSplit(train_labels, n_iter=1, test_size=1 - U_size)
    labeled_indices, unlabeled_indices = next(iter(splits))
    L = X_train[labeled_indices]
    L_ids = train_ids[labeled_indices]   # map back to the original instance ids
    U = X_train[unlabeled_indices]
    U_ids = train_ids[unlabeled_indices]
    y_l = train_labels[labeled_indices]
    y_u = train_labels[unlabeled_indices]
    HO = X_test
    y_ho = test_labels
    print "making..."
    #print_universvm_infiles(L, y_l, U, y_u, X_test, test_labels)
    print_svmlin_infiles(L, y_l, U, y_u, X_test, test_labels)
    print_svmlight_infiles(L, y_l, U, y_u, HO, y_ho)

    # Supervised baseline trained on the small labeled subset only,
    # evaluated on the held-out half.
    svm_small = sklearn.svm.LinearSVC(C=10, fit_intercept=False)
    svm_small.fit(L, y_l)
    y_p = svm_small.predict(HO)
    score = svm_small.score(HO, y_ho)
    # print np.mean((y_ho+1)/2), np.mean((y_p+1)/2)
    yy = y_p * y_ho
    acc = float(np.where(yy > 0)[0].shape[0]) / y_ho.shape[0]  # accuracy by hand, as a cross-check
    # print acc
    print L.shape, y_l.shape, X_test.shape, test_labels.shape
    print "baseline HO accuracy for", U_size, " l =", L.shape[0], " ", score * 100.0
    print "\n"
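
# Optional round-trip check (a minimal sketch, assuming the files written in
# the last loop iteration above are still on disk): reload the dumped
# SVMlight training file and inspect the label counts; label 0 should mark
# the unlabeled block, +1/-1 the labeled points.
numL = L.shape[0]
X_chk, y_chk = sklearn.datasets.load_svmlight_file('svmlight.train.%d' % numL)
print X_chk.shape, Counter(y_chk)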