The purpose of this notebook is to explore the possibility to fit conditiona quantile estimators using classifiers on binned target values with interpolation at predict-time.
from scipy.interpolate import interp1d
from sklearn.base import BaseEstimator, RegressorMixin, clone
from sklearn.utils.validation import check_is_fitted
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.utils.validation import check_consistent_length
from sklearn.utils import check_random_state
import numpy as np
class BinnedQuantileRegressor(BaseEstimator, RegressorMixin):
def __init__(
self,
estimator=None,
n_bins=100,
quantile=0.5,
binning_strategy="quantile",
interpolation_knots="edges",
interpolation_kind="linear",
random_state=None,
):
self.n_bins = n_bins
self.estimator = estimator
self.quantile = quantile
self.binning_strategy = binning_strategy
self.interpolation_knots = interpolation_knots
self.interpolation_kind = interpolation_kind
self.random_state = random_state
def fit(self, X, y):
assert self.interpolation_knots in ("edges", "midpoints")
# Lightweight input validation: most of the input validation will be
# handled by the sub estimators.
random_state = check_random_state(self.random_state)
check_consistent_length(X, y)
self.target_binner_ = KBinsDiscretizer(
n_bins=self.n_bins,
strategy=self.binning_strategy,
subsample=200_000,
encode="ordinal",
random_state=random_state,
)
y_binned = self.target_binner_.fit_transform(
np.asarray(y).reshape(-1, 1)
).ravel().astype(np.int32)
# Fit the multiclass classifier to predict the binned targets from the
# training set.
if self.estimator is None:
estimator = RandomForestClassifier(random_state=random_state)
else:
estimator = clone(self.estimator)
self.estimator_ = estimator.fit(X, y_binned)
return self
def predict_quantiles(self, X, quantiles):
check_is_fitted(self, "estimator_")
edges = self.target_binner_.bin_edges_[0]
n_bins = edges.shape[0] - 1
expected_shape = (X.shape[0], n_bins)
y_proba_raw = self.estimator_.predict_proba(X)
# Some might stay empty on the training set, in particular with the
# uniform binning strategy. Typically, classifiers do not learn to
# predict an explicit 0 probability for unobserved classes so we have
# to post process their output:
if y_proba_raw.shape != expected_shape:
y_proba = np.zeros(shape=expected_shape)
y_proba[:, self.estimator_.classes_] = y_proba_raw
else:
y_proba = y_proba_raw
# Build the mapper for inverse CDF mapping, from cumulated
# probabilities to continuous prediction.
if self.interpolation_knots == "edges":
y_cdf = np.zeros(shape=(X.shape[0], edges.shape[0]))
y_cdf[:, 1:] = np.cumsum(y_proba, axis=1)
return np.asarray(
[
interp1d(
y_cdf_i,
edges,
kind=self.interpolation_kind,
)(quantiles)
for y_cdf_i in y_cdf
]
)
else:
midpoints = (edges[1:] + edges[:-1]) / 2
y_cdf = np.cumsum(y_proba, axis=1)
return np.asarray(
[
interp1d(
y_cdf_i,
midpoints,
kind=self.interpolation_kind,
bounds_error=False,
fill_value=(midpoints[0], midpoints[-1]),
)(quantiles)
for y_cdf_i in y_cdf
]
)
def predict(self, X):
return self.predict_quantiles(X, self.quantile).ravel()
Let's define a toy 1D dataset with heteroscedastic noise to compare different base classifiers:
import matplotlib.pyplot as plt
rng = np.random.RandomState(42)
x = rng.uniform(-1, 1, size=10_000)
X = x.reshape(-1, 1)
y = np.sin(x * np.pi) + 1
y /= y.max()
# Add heteroscedastic noise to make conditional quantiles interesting:
y += rng.normal(0, y / 5, size=y.shape)
_ = plt.plot(x, y, "o", alpha=0.1)
Let's also define some reusable helper functions to compare the models:
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_pinball_loss
from functools import partial
def fit_and_plot(model, X, y, display_bin_edges=False):
_ = plt.plot(X.ravel(), y, "o", alpha=0.1, markeredgewidth=0)
X_test = np.linspace(X.min(), X.max(), 1000).reshape(-1, 1)
q_pred = model.fit(X, y).predict(X_test)
plt.plot(X_test, q_pred, label="q=0.95")
if display_bin_edges:
# Plot horizontal lines for each of the bin edges:
bin_edges = model.target_binner_.bin_edges_[0]
plt.hlines(
bin_edges,
X_test.min(),
X_test.max(),
color="k",
linestyles="-",
linewidths=0.2,
alpha=0.2,
)
plt.legend()
plt.show()
def evaluate(model, X, y, quantile=0.95):
neg_q95_pinball_loss = make_scorer(
partial(mean_pinball_loss, alpha=0.95),
greater_is_better=False,
)
cv_results = cross_validate(
model,
X,
y,
cv=5,
scoring=neg_q95_pinball_loss,
return_train_score=True,
n_jobs=-1,
)
train_scores = -cv_results["train_score"]
val_scores = -cv_results["test_score"]
fit_time = cv_results["fit_time"]
print(
f"Train pinball (q={quantile:.2f}): {train_scores.mean():.5f} +/- {train_scores.std():.5f}\n"
f"Val pinball (q={quantile:.2f}): {val_scores.mean():.5f} +/- {val_scores.std():.5f}\n"
f"Fit time: {fit_time.mean():.4f} +/- {fit_time.std():.4f} s"
)
This classifier can naturally handle a large number of classes (target value bins). It's a strong baseline as it is fast to fit and yields strong pinball loss values.
Let's use it to explore the impact of the number of bins.
base_classifier = RandomForestClassifier(
random_state=42,
n_estimators=30,
min_samples_leaf=50,
n_jobs=-1,
)
binned_quantile_reg = BinnedQuantileRegressor(
estimator=base_classifier,
n_bins=10, # small value!
quantile=0.95,
random_state=42,
)
evaluate(binned_quantile_reg, X, y)
fit_and_plot(binned_quantile_reg, X, y, display_bin_edges=True)
Train pinball (q=0.95): 0.01307 +/- 0.00021 Val pinball (q=0.95): 0.01322 +/- 0.00026 Fit time: 0.1335 +/- 0.0142 s
Note that the interpolation makes it possible to predict values between the edges of the bins. This is not possible with the classifier alone. However we can see steps, so the model is probably underfitting because of the small number of bins. Let's increase it:
binned_quantile_reg.set_params(n_bins=100)
evaluate(binned_quantile_reg, X, y)
fit_and_plot(binned_quantile_reg, X, y, display_bin_edges=True)
Train pinball (q=0.95): 0.00986 +/- 0.00002 Val pinball (q=0.95): 0.01033 +/- 0.00005 Fit time: 0.1899 +/- 0.0198 s
We can increase it a lot more. Note that the resulting model is slower but not overfitting more than the previous one:
binned_quantile_reg.set_params(n_bins=1000)
evaluate(binned_quantile_reg, X, y)
fit_and_plot(binned_quantile_reg, X, y, display_bin_edges=False)
Train pinball (q=0.95): 0.00981 +/- 0.00002 Val pinball (q=0.95): 0.01026 +/- 0.00006 Fit time: 1.5030 +/- 0.0188 s
For the rest of the notebook, we will keep the number of bins to 100:
n_bins = 100
_ = binned_quantile_reg.set_params(n_bins=n_bins)
Much slower (100x) on CPU but better slightly better results. This is the only method that yields a smooth (and non-oscillating) prediction function:
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import SplineTransformer
base_classifier = make_pipeline(
SplineTransformer(n_knots=5, degree=3),
MLPClassifier(
random_state=42,
hidden_layer_sizes=[100, 100],
solver="adam",
max_iter=300,
),
)
binned_quantile_reg.set_params(estimator=base_classifier)
evaluate(binned_quantile_reg, X, y)
fit_and_plot(binned_quantile_reg, X, y, display_bin_edges=True)
Train pinball (q=0.95): 0.01014 +/- 0.00003 Val pinball (q=0.95): 0.01025 +/- 0.00011 Fit time: 11.1916 +/- 0.8659 s
Using the XGBoost classifier as a base estimator gives good enough results
but it typically slower and not better than RF with tuned min_samples_leaf
.
The story might be different on datasets with a very large number of samples.
Probably requires more expensive hyperparameter tuning to shine.
from sympy import N
from xgboost import XGBClassifier
base_classifier = XGBClassifier(
tree_method="hist",
multi_strategy="multi_output_tree",
random_state=42,
max_leaves=10,
n_estimators=100,
learning_rate=0.3,
)
binned_quantile_reg.set_params(estimator=base_classifier)
evaluate(binned_quantile_reg, X, y)
fit_and_plot(binned_quantile_reg, X, y, display_bin_edges=True)
Train pinball (q=0.95): 0.01003 +/- 0.00004 Val pinball (q=0.95): 0.01032 +/- 0.00008 Fit time: 2.3744 +/- 0.0435 s
XXX There is a problem with this one. It does not work at all I have not found the cause yet, even when tweaking the hparams:
from sklearn.ensemble import HistGradientBoostingClassifier
base_classifier = HistGradientBoostingClassifier(
random_state=42,
max_leaf_nodes=10,
early_stopping=True,
n_iter_no_change=3,
max_iter=1_000,
learning_rate=0.3,
)
binned_quantile_reg.set_params(estimator=base_classifier)
evaluate(binned_quantile_reg, X, y)
fit_and_plot(binned_quantile_reg, X, y, display_bin_edges=True)
Train pinball (q=0.95): 0.01625 +/- 0.00127 Val pinball (q=0.95): 0.01722 +/- 0.00211 Fit time: 0.2409 +/- 0.0206 s
from sklearn.ensemble import HistGradientBoostingRegressor
quantile_regressor = HistGradientBoostingRegressor(
random_state=42,
loss="quantile",
quantile=0.95,
max_leaf_nodes=10,
early_stopping=True,
n_iter_no_change=3,
max_iter=1_000,
learning_rate=0.3,
)
evaluate(quantile_regressor, X, y)
fit_and_plot(quantile_regressor, X, y)
Train pinball (q=0.95): 0.00971 +/- 0.00005 Val pinball (q=0.95): 0.01034 +/- 0.00015 Fit time: 0.0853 +/- 0.0144 s
from xgboost import XGBRegressor
quantile_regressor = XGBRegressor(
random_state=42,
tree_method="hist",
objective="reg:quantileerror",
quantile_alpha=0.95,
max_leaves=10,
n_estimators=100,
learning_rate=0.3,
)
evaluate(quantile_regressor, X, y)
fit_and_plot(quantile_regressor, X, y)
Train pinball (q=0.95): 0.00967 +/- 0.00005 Val pinball (q=0.95): 0.01032 +/- 0.00011 Fit time: 0.1745 +/- 0.0240 s