%matplotlib inline
from os.path import expanduser, join

import matplotlib.pyplot as plt
import numpy as np
from IPython.parallel import Client
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.externals import joblib
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

from pyrallel.ensemble import EnsembleGrower
from pyrallel.ensemble import sub_ensemble
lb_view = Client().load_balanced_view()
len(lb_view)
4
This is a NumPy array version of Fold1 of the MSLR-WEB10K dataset.
%%time
data = np.load(expanduser('~/data/MSLR-WEB10K/mslr-web10k_fold1.npz'))
X_train, y_train, qid_train = data['X_train'], data['y_train'], data['qid_train']
X_vali, y_vali, qid_vali = data['X_vali'], data['y_vali'], data['qid_vali']
X_test, y_test, qid_test = data['X_test'], data['y_test'], data['qid_test']
CPU times: user 8.98 s, sys: 1.51 s, total: 10.5 s Wall time: 12.2 s
Total size in megabytes, total number of search results and number of queries:
(X_train.nbytes + X_vali.nbytes + X_test.nbytes) / 1e6
652.904448
len(X_train) + len(X_vali) + len(X_test)
1200192
len(np.unique(qid_train)) + len(np.unique(qid_vali)) + len(np.unique(qid_test))
10000
Concatenate the training and validation sets as a big development set.
X_dev = np.vstack([X_train, X_vali])
y_dev = np.concatenate([y_train, y_vali])
qid_dev = np.concatenate([qid_train, qid_vali])
X_dev.shape
(958671, 136)
X_dev.dtype
dtype('float32')
unique_qid_train = np.unique(qid_train)
len(unique_qid_train)
6000
Extract a subset of 500 queries to speed up learning when prototyping.
rng = np.random.RandomState(0)
qid_mask = rng.permutation(len(unique_qid_train))[:500]
subset_mask = np.in1d(qid_train, unique_qid_train[qid_mask])
X_train_small = X_train[subset_mask]
y_train_small = y_train[subset_mask]
qid_train_small = qid_train[subset_mask]
X_train_small.shape
(62244, 136)
Sanity check:
len(np.unique(qid_train_small))
500
def dcg(relevances, rank=10):
    """Discounted cumulative gain truncated at `rank` (DCG@rank).

    Gains are the raw relevance grades; the item at 0-based position i
    is discounted by log2(i + 2), so the top result is undiscounted.
    Returns 0. for an empty input.
    """
    top = np.asarray(relevances)[:rank]
    if top.size == 0:
        return 0.
    # positions 2..k+1 inside the log reproduce log2(arange(k) + 2)
    positions = np.arange(2, top.size + 2)
    return np.sum(top / np.log2(positions))
def ndcg(relevances, rank=10):
    """Normalized discounted cumulative gain at `rank` (NDCG@rank).

    The score is DCG@rank divided by the DCG@rank of the ideal
    (descending-relevance) ordering. Returns 0. when the ideal ordering
    itself has zero gain (e.g. all relevances are zero).
    """
    ideal = dcg(sorted(relevances, reverse=True), rank)
    if not ideal:
        return 0.
    return dcg(relevances, rank) / ideal
ndcg([2, 4, 0, 1, 1, 0, 0], rank=5)
0.86253003992915656
ndcg([0, 0, 0, 1, 1, 2, 4], rank=5)
0.13201850690866795
ndcg([0, 0, 0, 1, 1, 2, 4], rank=3)
0.0
ndcg([4, 2, 1, 1, 0, 0, 0], rank=5)
1.0
def mean_ndcg(y_true, y_pred, query_ids, rank=10):
    """Average NDCG@rank over queries.

    Parameters
    ----------
    y_true : array-like of true relevance grades.
    y_pred : array-like of predicted scores, same length as y_true.
    query_ids : array-like of query ids; samples of the same query are
        assumed to be contiguous (e.g. arrays sorted by query id).
    rank : truncation rank passed to each per-query NDCG.

    Returns the mean of the per-query NDCG scores, or 0.0 for empty
    input (the original per-sample loop raised IndexError on empty).
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    query_ids = np.asarray(query_ids)
    if query_ids.size == 0:
        return 0.0
    # Vectorized detection of where each contiguous run of equal query
    # ids starts, instead of a per-sample Python loop.
    starts = np.flatnonzero(query_ids[1:] != query_ids[:-1]) + 1
    bounds = np.concatenate([[0], starts, [query_ids.size]])
    ndcg_scores = []
    for lo, hi in zip(bounds[:-1], bounds[1:]):
        # relevances ordered by decreasing predicted score for this query
        ranked_relevances = y_true[lo:hi][np.argsort(y_pred[lo:hi])[::-1]]
        ndcg_scores.append(ndcg(ranked_relevances, rank=rank))
    return np.mean(ndcg_scores)
mean_ndcg([4, 3, 1, 4, 3], [4, 0, 1, 4, 2], [0, 0, 0, 2, 2], rank=10)
0.9795191506818377
grower = EnsembleGrower(lb_view, ExtraTreesRegressor(n_estimators=1))
grower.launch(X_dev, y_dev, n_estimators=500,
folder="web10k", dump_models=False)
Progress: 00% (000/500), elapsed: 0.954s
grower
Progress: 100% (500/500), elapsed: 2310.782s
#grower.wait()
etr = grower.aggregate_model()
print("Number of trees: {}".format(len(etr.estimators_)))
Number of trees: 500
%time y_test_etr = etr.predict(X_test)
CPU times: user 1min 15s, sys: 0 ns, total: 1min 15s Wall time: 1min 14s
print("NDCG_5 score: {:.3f}".format(
mean_ndcg(y_test, y_test_etr, qid_test, rank=5)))
print("NDCG_10 score: {:.3f}".format(
mean_ndcg(y_test, y_test_etr, qid_test, rank=10)))
print("R2 score: {:.3f}".format(r2_score(y_test, y_test_etr)))
NDCG_5 score: 0.516 NDCG_10 score: 0.522 R2 score: 0.184
%%time
max_n_trees = len(etr.estimators_)
n_trees = np.logspace(0, np.log10(max_n_trees), 5).astype(int)
scores = []
for j, n in enumerate(n_trees):
y_predicted = sub_ensemble(etr, n).predict(X_test)
scores.append(mean_ndcg(y_test, y_predicted, qid_test, rank=10))
CPU times: user 1min 36s, sys: 0 ns, total: 1min 36s Wall time: 1min 34s
# Plot NDCG@10 as a function of the ensemble size.
plt.plot(n_trees, scores)
plt.xlabel("Number of trees")
plt.ylabel("Average NDCG@10")  # fixed label typo: was "NDC@10"
_ = plt.title("Impact of the number of trees")
%time y_train_small_etr = etr.predict(X_train_small)
CPU times: user 20.2 s, sys: 0 ns, total: 20.2 s Wall time: 19.8 s
print("NDCG_5 score: {:.3f}".format(
mean_ndcg(y_train_small, y_train_small_etr, qid_train_small, rank=5)))
print("NDCG_10 score: {:.3f}".format(
mean_ndcg(y_train_small, y_train_small_etr, qid_train_small, rank=10)))
print("R2 score: {:.3f}".format(r2_score(y_train_small, y_train_small_etr)))
NDCG_5 score: 0.964 NDCG_10 score: 0.964 R2 score: 0.999
%time lr = LinearRegression().fit(X_dev, y_dev)
CPU times: user 25.1 s, sys: 2.35 s, total: 27.4 s Wall time: 22 s
%time y_test_lr = lr.predict(X_test)
CPU times: user 163 ms, sys: 402 ms, total: 565 ms Wall time: 1.02 s
print("NDCG_5 score: {:.3f}".format(
mean_ndcg(y_test, y_test_lr, qid_test, rank=5)))
print("NDCG_10 score: {:.3f}".format(
mean_ndcg(y_test, y_test_lr, qid_test, rank=10)))
print("R2 score: {:.3f}".format(r2_score(y_test, y_test_lr)))
NDCG_5 score: 0.433 NDCG_10 score: 0.450 R2 score: 0.127
Evaluate overfitting by comparing with training set:
y_train_small_lr = lr.predict(X_train_small)
print("NDCG_5 score: {:.3f}".format(
mean_ndcg(y_train_small, y_train_small_lr, qid_train_small, rank=5)))
print("NDCG_10 score: {:.3f}".format(
mean_ndcg(y_train_small, y_train_small_lr, qid_train_small, rank=10)))
print("R2 score: {:.3f}".format(r2_score(y_train_small, y_train_small_lr)))
NDCG_5 score: 0.415 NDCG_10 score: 0.433 R2 score: 0.131
Interestingly enough, a slight overfitting of the training set from a regression standpoint (higher r2 score) does not seem to cause overfitting from a ranking standpoint. This would have to be checked with cross-validation though.
subset = np.random.permutation(y_test.shape[0])[:10000]
plt.title('Extra Trees predictions')
plt.scatter(y_test[subset], y_test_etr[subset], alpha=0.1, s=100)
plt.xlabel('True relevance')
plt.ylabel('Predicted relevance')
plt.ylim(-2, 5)
plt.xlim(-2, 5)
(-2, 5)
plt.title('Linear Regression predictions')
plt.scatter(y_test[subset], y_test_lr[subset], alpha=0.1, s=100)
plt.xlabel('True relevance')
plt.ylabel('Predicted relevance')
plt.ylim(-2, 5)
plt.xlim(-2, 5)
(-2, 5)
plt.hist(y_test, bins=5, alpha=.3, color='b', label='True relevance')
plt.hist(y_test_etr, bins=5, alpha=.3, color='g', label='ET predicted relevance')
plt.legend(loc='best')
<matplotlib.legend.Legend at 0x98b5c950>
For each query, count the number of results with relevance grade 0, 1, 2, 3 or 4.
# Per-query distribution of relevance grades for the first 5 test queries.
unique_qids_test = np.unique(qid_test)
for qid in unique_qids_test[:5]:
    # relevance grades (not query ids) for this query, as ints for bincount;
    # np.int was deprecated/removed from NumPy, use the builtin int instead
    grades = y_test[qid_test == qid].astype(int)
    print(np.bincount(grades, minlength=5))
[45 54 31 8 0] [59 25 8 2 0] [52 20 9 2 3] [97 45 2 2 2] [32 56 24 7 4]
# Measure how training time and tree size scale with the training set size
# for a single extremely-randomized tree.
etr_one = ExtraTreesRegressor(n_estimators=1)
from time import perf_counter  # time.clock was removed in Python 3.8
# np.int was removed from NumPy; the builtin int is the correct dtype here
training_sizes = np.logspace(0, np.log10(len(X_dev)), 9).astype(int)
durations = []
decision_nodes = []
for n_samples in training_sizes:
    tic = perf_counter()
    etr_one.fit(X_dev[:n_samples], y_dev[:n_samples])
    d = perf_counter() - tic
    print("Duration for 1 tree on {} samples: {:.3f}s".format(
        n_samples, d))
    durations.append(d)
    tree = etr_one.estimators_[0].tree_
    # internal (non-leaf) nodes have a positive left-child index;
    # leaves store -1
    decision_nodes.append(np.sum(tree.children_left > 0))
Duration for 1 tree on 1 samples: 0.001s Duration for 1 tree on 5 samples: 0.002s Duration for 1 tree on 31 samples: 0.002s Duration for 1 tree on 175 samples: 0.005s Duration for 1 tree on 979 samples: 0.026s Duration for 1 tree on 5477 samples: 0.238s Duration for 1 tree on 30637 samples: 1.762s Duration for 1 tree on 171380 samples: 13.977s Duration for 1 tree on 958670 samples: 118.329s
plt.loglog(training_sizes, durations, 'o-')
plt.xlabel("# of training samples")
plt.ylabel("training time (s)")
_ = plt.title("Impact of training set size on training time")
plt.loglog(training_sizes, decision_nodes, 'o-')
plt.xlabel("# of training samples")
plt.ylabel("# of decision nodes")
_ = plt.title("Impact of training set size on model size")
diff = np.abs(lr.predict(X_dev) - y_dev)
normalized_diff = diff / diff.max()
plt.plot(sorted(normalized_diff))
plt.title("Sorted")
[<matplotlib.lines.Line2D at 0x107729710>]
n_samples = len(y_dev)
rng = np.random.RandomState(42)
sample_mask = normalized_diff > 1.64 * rng.rand(n_samples)
np.sum(sample_mask)
62226
X_sampled = X_dev[sample_mask]
y_sampled = y_dev[sample_mask]
sample_weight = normalized_diff[sample_mask]
plt.plot(sorted(sample_weight))
_ = plt.title("Sorted normalized diff in the resampled dataset")
%time gbr_sampled = GradientBoostingRegressor().fit(X_sampled, y_sampled)
CPU times: user 10min 16s, sys: 1.23 s, total: 10min 17s Wall time: 10min 18s
%time y_test_gbr_sampled = gbr.predict(X_test)
CPU times: user 1.7 s, sys: 11 ms, total: 1.71 s Wall time: 1.71 s
mean_ndcg(y_test, y_test_gbr_sampled, qid_test, rank=5)
0.47394128281542253
etr_10_sampled = ExtraTreesRegressor(n_estimators=100, n_jobs=2)
%time _ = etr_10_sampled.fit(X_sampled, y_sampled, sample_weight=sample_weight)
CPU times: user 592 ms, sys: 1.24 s, total: 1.83 s Wall time: 5min 45s
%time y_test_etr_10_sampled = etr_10_sampled.predict(X_test)
CPU times: user 482 ms, sys: 637 ms, total: 1.12 s Wall time: 7.4 s
mean_ndcg(y_test, y_test_etr_10_sampled, qid_test, rank=5)
0.44812412919740691
len(X_train_small), len(np.unique(qid_train_small))
(62244, 500)
%time gbr_small = GradientBoostingRegressor().fit(X_train_small, y_train_small)
CPU times: user 9min 17s, sys: 815 ms, total: 9min 17s Wall time: 9min 18s
%time y_test_gbr_small = gbr_small.predict(X_test)
CPU times: user 1.59 s, sys: 2.56 ms, total: 1.59 s Wall time: 1.59 s
mean_ndcg(y_test, y_test_gbr_small, qid_test, rank=5)
0.50078362434826951