%matplotlib inline
from os.path import expanduser, join

import matplotlib.pyplot as plt
import numpy as np
from IPython.parallel import Client
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.externals import joblib
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

from pyrallel.ensemble import EnsembleGrower
from pyrallel.ensemble import sub_ensemble
lb_view = Client().load_balanced_view()
len(lb_view)
4
This is a NumPy array version of Fold1 of the MSLR-WEB10K dataset.
%%time
data = np.load(expanduser('~/data/MSLR-WEB10K/mslr-web10k_fold1.npz'))
X_train, y_train, qid_train = data['X_train'], data['y_train'], data['qid_train']
X_vali, y_vali, qid_vali = data['X_vali'], data['y_vali'], data['qid_vali']
X_test, y_test, qid_test = data['X_test'], data['y_test'], data['qid_test']
CPU times: user 8.98 s, sys: 1.51 s, total: 10.5 s Wall time: 12.2 s
Total size in megabytes, total number of search results and number of queries:
(X_train.nbytes + X_vali.nbytes + X_test.nbytes) / 1e6
652.904448
len(X_train) + len(X_vali) + len(X_test)
1200192
len(np.unique(qid_train)) + len(np.unique(qid_vali)) + len(np.unique(qid_test))
10000
Concatenate the training and validation sets as a big development set.
X_dev = np.vstack([X_train, X_vali])
y_dev = np.concatenate([y_train, y_vali])
qid_dev = np.concatenate([qid_train, qid_vali])
X_dev.shape
(958671, 136)
X_dev.dtype
dtype('float32')
unique_qid_train = np.unique(qid_train)
len(unique_qid_train)
6000
Extract a subset of 500 queries to speed up learning when prototyping.
rng = np.random.RandomState(0)
qid_mask = rng.permutation(len(unique_qid_train))[:500]
subset_mask = np.in1d(qid_train, unique_qid_train[qid_mask])
X_train_small = X_train[subset_mask]
y_train_small = y_train[subset_mask]
qid_train_small = qid_train[subset_mask]
X_train_small.shape
(62244, 136)
Sanity check:
len(np.unique(qid_train_small))
500
def dcg(relevances, rank=10):
    """Discounted cumulative gain truncated at `rank` (DCG@rank).

    Gains are the raw relevance grades; the item at 0-based position i
    is discounted by log2(i + 2), so the top result is undiscounted.
    Returns 0. for an empty input.
    """
    top = np.asarray(relevances)[:rank]
    if top.size == 0:
        return 0.
    # positions 2..k+1 inside the log reproduce log2(arange(k) + 2)
    positions = np.arange(2, top.size + 2)
    return np.sum(top / np.log2(positions))
def ndcg(relevances, rank=10):
    """Normalized discounted cumulative gain at `rank` (NDCG@rank).

    The score is DCG@rank divided by the DCG@rank of the ideal
    (descending-relevance) ordering. Returns 0. when the ideal ordering
    itself has zero gain (e.g. all relevances are zero).
    """
    ideal = dcg(sorted(relevances, reverse=True), rank)
    if not ideal:
        return 0.
    return dcg(relevances, rank) / ideal
ndcg([2, 4, 0, 1, 1, 0, 0], rank=5)
0.86253003992915656
ndcg([0, 0, 0, 1, 1, 2, 4], rank=5)
0.13201850690866795
ndcg([0, 0, 0, 1, 1, 2, 4], rank=3)
0.0
ndcg([4, 2, 1, 1, 0, 0, 0], rank=5)
1.0
def mean_ndcg(y_true, y_pred, query_ids, rank=10):
    """Average NDCG@rank over queries.

    Parameters
    ----------
    y_true : array-like of true relevance grades.
    y_pred : array-like of predicted scores, same length as y_true.
    query_ids : array-like of query ids; samples of the same query are
        assumed to be contiguous (e.g. arrays sorted by query id).
    rank : truncation rank passed to each per-query NDCG.

    Returns the mean of the per-query NDCG scores, or 0.0 for empty
    input (the original per-sample loop raised IndexError on empty).
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    query_ids = np.asarray(query_ids)
    if query_ids.size == 0:
        return 0.0
    # Vectorized detection of where each contiguous run of equal query
    # ids starts, instead of a per-sample Python loop.
    starts = np.flatnonzero(query_ids[1:] != query_ids[:-1]) + 1
    bounds = np.concatenate([[0], starts, [query_ids.size]])
    ndcg_scores = []
    for lo, hi in zip(bounds[:-1], bounds[1:]):
        # relevances ordered by decreasing predicted score for this query
        ranked_relevances = y_true[lo:hi][np.argsort(y_pred[lo:hi])[::-1]]
        ndcg_scores.append(ndcg(ranked_relevances, rank=rank))
    return np.mean(ndcg_scores)
mean_ndcg([4, 3, 1, 4, 3], [4, 0, 1, 4, 2], [0, 0, 0, 2, 2], rank=10)
0.9795191506818377
grower = EnsembleGrower(lb_view, ExtraTreesRegressor(n_estimators=1))
grower.launch(X_dev, y_dev, n_estimators=500,
folder="web10k", dump_models=False)
Progress: 00% (000/500), elapsed: 0.954s
grower
Progress: 100% (500/500), elapsed: 2310.782s
#grower.wait()
etr = grower.aggregate_model()
print("Number of trees: {}".format(len(etr.estimators_)))
Number of trees: 500
%time y_test_etr = etr.predict(X_test)
CPU times: user 1min 15s, sys: 0 ns, total: 1min 15s Wall time: 1min 14s
print("NDCG_5 score: {:.3f}".format(
mean_ndcg(y_test, y_test_etr, qid_test, rank=5)))
print("NDCG_10 score: {:.3f}".format(
mean_ndcg(y_test, y_test_etr, qid_test, rank=10)))
print("R2 score: {:.3f}".format(r2_score(y_test, y_test_etr)))
NDCG_5 score: 0.516 NDCG_10 score: 0.522 R2 score: 0.184
%%time
max_n_trees = len(etr.estimators_)
n_trees = np.logspace(0, np.log10(max_n_trees), 5).astype(int)
scores = []
for j, n in enumerate(n_trees):
y_predicted = sub_ensemble(etr, n).predict(X_test)
scores.append(mean_ndcg(y_test, y_predicted, qid_test, rank=10))
CPU times: user 1min 36s, sys: 0 ns, total: 1min 36s Wall time: 1min 34s
# Plot NDCG@10 as a function of the ensemble size.
plt.plot(n_trees, scores)
plt.xlabel("Number of trees")
plt.ylabel("Average NDCG@10")  # fixed label typo: was "NDC@10"
_ = plt.title("Impact of the number of trees")
%time y_train_small_etr = etr.predict(X_train_small)
CPU times: user 20.2 s, sys: 0 ns, total: 20.2 s Wall time: 19.8 s
print("NDCG_5 score: {:.3f}".format(
mean_ndcg(y_train_small, y_train_small_etr, qid_train_small, rank=5)))
print("NDCG_10 score: {:.3f}".format(
mean_ndcg(y_train_small, y_train_small_etr, qid_train_small, rank=10)))
print("R2 score: {:.3f}".format(r2_score(y_train_small, y_train_small_etr)))
NDCG_5 score: 0.964 NDCG_10 score: 0.964 R2 score: 0.999
%time lr = LinearRegression().fit(X_dev, y_dev)
CPU times: user 25.1 s, sys: 2.35 s, total: 27.4 s Wall time: 22 s
%time y_test_lr = lr.predict(X_test)
CPU times: user 163 ms, sys: 402 ms, total: 565 ms Wall time: 1.02 s
print("NDCG_5 score: {:.3f}".format(
mean_ndcg(y_test, y_test_lr, qid_test, rank=5)))
print("NDCG_10 score: {:.3f}".format(
mean_ndcg(y_test, y_test_lr, qid_test, rank=10)))
print("R2 score: {:.3f}".format(r2_score(y_test, y_test_lr)))
NDCG_5 score: 0.433 NDCG_10 score: 0.450 R2 score: 0.127
Evaluate overfitting by comparing with training set:
y_train_small_lr = lr.predict(X_train_small)
print("NDCG_5 score: {:.3f}".format(
mean_ndcg(y_train_small, y_train_small_lr, qid_train_small, rank=5)))
print("NDCG_10 score: {:.3f}".format(
mean_ndcg(y_train_small, y_train_small_lr, qid_train_small, rank=10)))
print("R2 score: {:.3f}".format(r2_score(y_train_small, y_train_small_lr)))
NDCG_5 score: 0.415 NDCG_10 score: 0.433 R2 score: 0.131
Interestingly enough, a slight overfitting of the training set from a regression standpoint (higher r2 score) does not seem to cause overfitting from a ranking standpoint. This would have to be checked with cross-validation though.
subset = np.random.permutation(y_test.shape[0])[:10000]
plt.title('Extra Trees predictions')
plt.scatter(y_test[subset], y_test_etr[subset], alpha=0.1, s=100)
plt.xlabel('True relevance')
plt.ylabel('Predicted relevance')
plt.ylim(-2, 5)
plt.xlim(-2, 5)
(-2, 5)
plt.title('Linear Regression predictions')
plt.scatter(y_test[subset], y_test_lr[subset], alpha=0.1, s=100)
plt.xlabel('True relevance')
plt.ylabel('Predicted relevance')
plt.ylim(-2, 5)
plt.xlim(-2, 5)
(-2, 5)
plt.hist(y_test, bins=5, alpha=.3, color='b', label='True relevance')
plt.hist(y_test_etr, bins=5, alpha=.3, color='g', label='ET predicted relevance')
plt.legend(loc='best')
<matplotlib.legend.Legend at 0x98b5c950>
For each query, count the number of results with relevance grade 0, 1, 2, 3 or 4.
# Per-query distribution of relevance grades for the first 5 test queries.
unique_qids_test = np.unique(qid_test)
for qid in unique_qids_test[:5]:
    # relevance grades (not query ids) for this query, as ints for bincount;
    # np.int was deprecated/removed from NumPy, use the builtin int instead
    grades = y_test[qid_test == qid].astype(int)
    print(np.bincount(grades, minlength=5))
[45 54 31 8 0] [59 25 8 2 0] [52 20 9 2 3] [97 45 2 2 2] [32 56 24 7 4]
# Measure how training time and tree size scale with the training set size
# for a single extremely-randomized tree.
etr_one = ExtraTreesRegressor(n_estimators=1)
from time import perf_counter  # time.clock was removed in Python 3.8
# np.int was removed from NumPy; the builtin int is the correct dtype here
training_sizes = np.logspace(0, np.log10(len(X_dev)), 9).astype(int)
durations = []
decision_nodes = []
for n_samples in training_sizes:
    tic = perf_counter()
    etr_one.fit(X_dev[:n_samples], y_dev[:n_samples])
    d = perf_counter() - tic
    print("Duration for 1 tree on {} samples: {:.3f}s".format(
        n_samples, d))
    durations.append(d)
    tree = etr_one.estimators_[0].tree_
    # internal (non-leaf) nodes have a positive left-child index;
    # leaves store -1
    decision_nodes.append(np.sum(tree.children_left > 0))
Duration for 1 tree on 1 samples: 0.001s Duration for 1 tree on 5 samples: 0.002s Duration for 1 tree on 31 samples: 0.002s Duration for 1 tree on 175 samples: 0.005s Duration for 1 tree on 979 samples: 0.026s Duration for 1 tree on 5477 samples: 0.238s Duration for 1 tree on 30637 samples: 1.762s Duration for 1 tree on 171380 samples: 13.977s Duration for 1 tree on 958670 samples: 118.329s
plt.loglog(training_sizes, durations, 'o-')
plt.xlabel("# of training samples")
plt.ylabel("training time (s)")
_ = plt.title("Impact of training set size on training time")
plt.loglog(training_sizes, decision_nodes, 'o-')
plt.xlabel("# of training samples")
plt.ylabel("# of decision nodes")
_ = plt.title("Impact of training set size on model size")
diff = np.abs(lr.predict(X_dev) - y_dev)
normalized_diff = diff / diff.max()
plt.plot(sorted(normalized_diff))
plt.title("Sorted")
[<matplotlib.lines.Line2D at 0x107729710>]
n_samples = len(y_dev)
rng = np.random.RandomState(42)
sample_mask = normalized_diff > 1.64 * rng.rand(n_samples)
np.sum(sample_mask)
62226
X_sampled = X_dev[sample_mask]
y_sampled = y_dev[sample_mask]
sample_weight = normalized_diff[sample_mask]
plt.plot(sorted(sample_weight))
_ = plt.title("Sorted normalized diff in the resampled dataset")
%time gbr_sampled = GradientBoostingRegressor().fit(X_sampled, y_sampled)
CPU times: user 10min 16s, sys: 1.23 s, total: 10min 17s Wall time: 10min 18s
%time y_test_gbr_sampled = gbr.predict(X_test)
CPU times: user 1.7 s, sys: 11 ms, total: 1.71 s Wall time: 1.71 s
mean_ndcg(y_test, y_test_gbr_sampled, qid_test, rank=5)
0.47394128281542253
etr_10_sampled = ExtraTreesRegressor(n_estimators=100, n_jobs=2)
%time _ = etr_10_sampled.fit(X_sampled, y_sampled, sample_weight=sample_weight)
CPU times: user 592 ms, sys: 1.24 s, total: 1.83 s Wall time: 5min 45s
%time y_test_etr_10_sampled = etr_10_sampled.predict(X_test)
CPU times: user 482 ms, sys: 637 ms, total: 1.12 s Wall time: 7.4 s
mean_ndcg(y_test, y_test_etr_10_sampled, qid_test, rank=5)
0.44812412919740691
len(X_train_small), len(np.unique(qid_train_small))
(62244, 500)
%time gbr_small = GradientBoostingRegressor().fit(X_train_small, y_train_small)
CPU times: user 9min 17s, sys: 815 ms, total: 9min 17s Wall time: 9min 18s
%time y_test_gbr_small = gbr_small.predict(X_test)
CPU times: user 1.59 s, sys: 2.56 ms, total: 1.59 s Wall time: 1.59 s
mean_ndcg(y_test, y_test_gbr_small, qid_test, rank=5)
0.50078362434826951