"""Convert one fold of the MSLR-WEB10K dataset from svmlight text to .npz.

Loads the train / vali / test splits of fold 1 from ``~/data/MSLR-WEB10K``
and stores them together in a single compressed NumPy archive.
"""
import numpy as np
from os.path import expanduser, join
from sklearn.datasets import load_svmlight_file
# NOTE(review): newer scikit-learn releases no longer ship
# ``sklearn.externals.joblib`` and newer joblib spells ``cachedir`` as
# ``location`` -- confirm against the installed versions before upgrading.
from sklearn.externals import joblib

# On-disk cache (in the current directory) so the slow text parsing in
# ``load_fold`` only happens once per distinct set of arguments.
memory = joblib.Memory(cachedir='.', mmap_mode='r')


@memory.cache
def load_fold(dataset, subset, fold_idx=1, dtype=np.float32):
    """Load one subset of one fold of a dataset stored in svmlight format.

    The file read is ``~/data/<dataset>/Fold<fold_idx>/<subset>.txt``.
    Results are cached on disk through ``memory.cache``.

    Parameters
    ----------
    dataset : str
        Name of the dataset directory under ``~/data``.
    subset : str
        Base name of the file to load, without the ``.txt`` extension
        (this script uses ``'train'``, ``'vali'`` and ``'test'``).
    fold_idx : int, default 1
        Number used to build the ``Fold%d`` directory name.
    dtype : numpy dtype, default np.float32
        Passed through to ``load_svmlight_file``.

    Returns
    -------
    X : ndarray
        Feature matrix, converted to a dense array with ``toarray()``.
    y : ndarray
        Target values as returned by ``load_svmlight_file``.
    qid : ndarray
        Query ids as returned by ``load_svmlight_file(query_id=True)``.
    """
    DATA_FOLDER = expanduser('~/data')
    filepath = join(DATA_FOLDER, dataset, 'Fold%d' % fold_idx,
                    subset + '.txt')
    X, y, qid = load_svmlight_file(filepath, dtype=dtype, query_id=True)
    # Densify so the arrays can be written directly by np.savez_compressed.
    return X.toarray(), y, qid


X_train, y_train, qid_train = load_fold('MSLR-WEB10K', 'train', fold_idx=1)
X_vali, y_vali, qid_vali = load_fold('MSLR-WEB10K', 'vali', fold_idx=1)
X_test, y_test, qid_test = load_fold('MSLR-WEB10K', 'test', fold_idx=1)

# %%time
# (IPython cell magic from the original notebook; it is not valid syntax in a
# plain Python file, so it is kept here only as a comment.)
np.savez_compressed(
    expanduser('~/data/MSLR-WEB10K/mslr_web10k_fold1.npz'),
    X_train=X_train, y_train=y_train, qid_train=qid_train,
    X_vali=X_vali, y_vali=y_vali, qid_vali=qid_vali,
    X_test=X_test, y_test=y_test, qid_test=qid_test,
)