import pandas as pd
AMES_HOUSING_CSV = "https://www.openml.org/data/get_csv/20649135/file2ed11cebe25.arff"
df = pd.read_csv(AMES_HOUSING_CSV)
df
MS_SubClass | MS_Zoning | Lot_Frontage | Lot_Area | Street | Alley | Lot_Shape | Land_Contour | Utilities | Lot_Config | ... | Fence | Misc_Feature | Misc_Val | Mo_Sold | Year_Sold | Sale_Type | Sale_Condition | Sale_Price | Longitude | Latitude | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | One_Story_1946_and_Newer_All_Styles | Residential_Low_Density | 141 | 31770 | Pave | No_Alley_Access | Slightly_Irregular | Lvl | AllPub | Corner | ... | No_Fence | None | 0 | 5 | 2010 | WD | Normal | 215000 | -93.619754 | 42.054035 |
1 | One_Story_1946_and_Newer_All_Styles | Residential_High_Density | 80 | 11622 | Pave | No_Alley_Access | Regular | Lvl | AllPub | Inside | ... | Minimum_Privacy | None | 0 | 6 | 2010 | WD | Normal | 105000 | -93.619756 | 42.053014 |
2 | One_Story_1946_and_Newer_All_Styles | Residential_Low_Density | 81 | 14267 | Pave | No_Alley_Access | Slightly_Irregular | Lvl | AllPub | Corner | ... | No_Fence | Gar2 | 12500 | 6 | 2010 | WD | Normal | 172000 | -93.619387 | 42.052659 |
3 | One_Story_1946_and_Newer_All_Styles | Residential_Low_Density | 93 | 11160 | Pave | No_Alley_Access | Regular | Lvl | AllPub | Corner | ... | No_Fence | None | 0 | 4 | 2010 | WD | Normal | 244000 | -93.617320 | 42.051245 |
4 | Two_Story_1946_and_Newer | Residential_Low_Density | 74 | 13830 | Pave | No_Alley_Access | Slightly_Irregular | Lvl | AllPub | Inside | ... | Minimum_Privacy | None | 0 | 3 | 2010 | WD | Normal | 189900 | -93.638933 | 42.060899 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2925 | Split_or_Multilevel | Residential_Low_Density | 37 | 7937 | Pave | No_Alley_Access | Slightly_Irregular | Lvl | AllPub | CulDSac | ... | Good_Privacy | None | 0 | 3 | 2006 | WD | Normal | 142500 | -93.604776 | 41.988964 |
2926 | One_Story_1946_and_Newer_All_Styles | Residential_Low_Density | 0 | 8885 | Pave | No_Alley_Access | Slightly_Irregular | Low | AllPub | Inside | ... | Minimum_Privacy | None | 0 | 6 | 2006 | WD | Normal | 131000 | -93.602680 | 41.988314 |
2927 | Split_Foyer | Residential_Low_Density | 62 | 10441 | Pave | No_Alley_Access | Regular | Lvl | AllPub | Inside | ... | Minimum_Privacy | Shed | 700 | 7 | 2006 | WD | Normal | 132000 | -93.606847 | 41.986510 |
2928 | One_Story_1946_and_Newer_All_Styles | Residential_Low_Density | 77 | 10010 | Pave | No_Alley_Access | Regular | Lvl | AllPub | Inside | ... | No_Fence | None | 0 | 4 | 2006 | WD | Normal | 170000 | -93.600190 | 41.990921 |
2929 | Two_Story_1946_and_Newer | Residential_Low_Density | 74 | 9627 | Pave | No_Alley_Access | Regular | Lvl | AllPub | Inside | ... | No_Fence | None | 0 | 11 | 2006 | WD | Normal | 188000 | -93.599996 | 41.989265 |
2930 rows × 81 columns
import folium
from folium.plugins import MarkerCluster
center = df[["Latitude", "Longitude"]].mean().values.tolist()
ames_map = folium.Map(location=center, zoom_start=13)
c = MarkerCluster(options={"maxClusterRadius": 40}).add_to(ames_map)
def make_tooltip(record):
tooltip = f'<div>Sale_Price: ${record["Sale_Price"] / 1e3:.1f}k</div>'
tooltip += f'<div>Gr_Liv_Area: {record["Gr_Liv_Area"]:.1f}</div>'
tooltip += f'<div>Year_Built: {record["Year_Built"]:d}</div>'
return tooltip
for i, record in df.iterrows():
marker = folium.Marker((record["Latitude"], record["Longitude"]),
tooltip=make_tooltip(record))
marker.add_to(c)
ames_map
df.plot(x="Gr_Liv_Area", y="Sale_Price", kind="scatter",
figsize=(12, 6), alpha=0.1);
from sklearn.model_selection import train_test_split
X = df.drop(columns=["Sale_Price"])
y = df["Sale_Price"]
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=500, random_state=0)
from sklearn.linear_model import LinearRegression
x_train = X_train["Gr_Liv_Area"].values.reshape(-1, 1)
x_test = X_test["Gr_Liv_Area"].values.reshape(-1, 1)
lr = LinearRegression().fit(x_train, y_train)
r2_score = lr.score(x_test, y_test)
print(f"R2 score (test): {r2_score:.3f}")
R2 score (test): 0.542
y_pred = lr.predict(X_test["Gr_Liv_Area"].values.reshape(-1, 1))
import matplotlib.pyplot as plt
def plot_predictions(y_test, y_pred):
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(y_pred, y_test, alpha=0.3)
ax.plot([1e4, 5e5], [1e4, 5e5], linestyle="--")
ax.set_xlim(1e4, 5e5); ax.set_ylim(1e4, 5e5)
ax.set_xlabel("Predicted price ($)")
ax.set_ylabel("Actual price ($)")
plot_predictions(y_test, y_pred)
import numpy as np
def mean_absolute_percent_error(y_true, y_pred):
diffs = np.abs(y_true - y_pred)
scales = np.abs(y_true) + np.finfo(np.float64).eps
return np.mean(diffs / scales) * 100
mape = mean_absolute_percent_error(y_test, y_pred)
print(f"MAPE (test): {mape:.3f}")
MAPE (test): 23.166
X_train.head()
MS_SubClass | MS_Zoning | Lot_Frontage | Lot_Area | Street | Alley | Lot_Shape | Land_Contour | Utilities | Lot_Config | ... | Pool_QC | Fence | Misc_Feature | Misc_Val | Mo_Sold | Year_Sold | Sale_Type | Sale_Condition | Longitude | Latitude | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
483 | Two_Story_1946_and_Newer | Residential_Low_Density | 0 | 8795 | Pave | No_Alley_Access | Slightly_Irregular | Lvl | AllPub | Inside | ... | No_Pool | No_Fence | None | 0 | 4 | 2009 | WD | Normal | -93.642813 | 42.058590 |
2585 | One_Story_1946_and_Newer_All_Styles | Residential_Low_Density | 75 | 10170 | Pave | No_Alley_Access | Regular | Lvl | AllPub | Corner | ... | No_Pool | No_Fence | None | 0 | 6 | 2006 | WD | Normal | -93.617069 | 42.038357 |
2288 | Two_Story_PUD_1946_and_Newer | Residential_Medium_Density | 21 | 2001 | Pave | No_Alley_Access | Regular | Lvl | AllPub | Inside | ... | No_Pool | No_Fence | None | 0 | 1 | 2007 | WD | Normal | -93.601507 | 41.991709 |
141 | One_Story_1946_and_Newer_All_Styles | Residential_Low_Density | 70 | 10552 | Pave | No_Alley_Access | Slightly_Irregular | Lvl | AllPub | Inside | ... | No_Pool | No_Fence | None | 0 | 4 | 2010 | WD | Normal | -93.617704 | 42.043084 |
2041 | Two_Family_conversion_All_Styles_and_Ages | Residential_Medium_Density | 60 | 10120 | Pave | No_Alley_Access | Slightly_Irregular | Bnk | AllPub | Inside | ... | No_Pool | Minimum_Privacy | None | 0 | 1 | 2007 | WD | Normal | -93.622025 | 42.025893 |
5 rows × 80 columns
def caterogical_columns(df):
return df.columns[df.dtypes == object]
len(caterogical_columns(df))
46
def numeric_columns(df):
return df.columns[df.dtypes != object]
len(numeric_columns(df))
35
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
categories = [df[c].unique() for c in caterogical_columns(df)]
ord_encoder = OrdinalEncoder(categories=categories)
preprocessor = ColumnTransformer([
("categorical", ord_encoder, caterogical_columns),
("numeric", "passthrough", numeric_columns),
])
hgb = HistGradientBoostingRegressor(
max_leaf_nodes=3,
learning_rate=0.5,
early_stopping=True,
n_iter_no_change=10,
max_bins=5,
max_iter=1000,
random_state=0,
)
model = Pipeline([
("preprocessor", preprocessor),
("regressor", hgb),
])
%%time
_ = model.fit(X_train, y_train)
CPU times: user 4.65 s, sys: 13.1 ms, total: 4.67 s Wall time: 1.81 s
model[-1].n_iter_
129
r2_score_train = model.score(X_train, y_train)
print(f"r2 score (train): {r2_score_train:.3f}")
r2 score (train): 0.926
test_r2_score = model.score(X_test, y_test)
print(f"r2 score (test): {test_r2_score:.3f}")
r2 score (test): 0.897
y_pred = model.predict(X_test)
mape = mean_absolute_percent_error(y_test, y_pred)
print(f"MAPE: {mape:.1f}%")
MAPE: 9.7%
y_pred = model.predict(X_test)
plot_predictions(y_test, y_pred)
from sklearn.inspection import permutation_importance
pi = permutation_importance(model, X_test, y_test, n_repeats=5,
random_state=42, n_jobs=2)
sorted_idx = pi.importances_mean.argsort()
most_important_idx = [
i for i in sorted_idx
if pi.importances_mean[i] - 4 * pi.importances_std[i] > 0
]
most_important_names = df.columns[most_important_idx]
fig, ax = plt.subplots(figsize=(12, 8))
ax.boxplot(pi.importances[most_important_idx].T,
vert=False, labels=most_important_names)
ax.set_title("Permutation Importances (test set)");
feature_subset = most_important_names.tolist()
if "Latitude" not in feature_subset:
feature_subset += ["Latitude"]
if "Longitude" not in feature_subset:
feature_subset += ["Longitude"]
len(numeric_columns(df[feature_subset]))
8
len(caterogical_columns(df[feature_subset]))
8
X_train[feature_subset]
Roof_Matl | Exterior_2nd | Bsmt_Qual | Wood_Deck_SF | Condition_1 | MS_Zoning | Land_Contour | Longitude | Full_Bath | Kitchen_Qual | Total_Bsmt_SF | Garage_Cars | Overall_Qual | Gr_Liv_Area | Year_Built | Latitude | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
483 | CompShg | VinylSd | Good | 224 | Norm | Residential_Low_Density | Lvl | -93.642813 | 2 | Good | 952 | 2 | Good | 2256 | 2000 | 42.058590 |
2585 | CompShg | Wd Sdng | Typical | 0 | Norm | Residential_Low_Density | Lvl | -93.617069 | 1 | Good | 216 | 2 | Above_Average | 1575 | 1951 | 42.038357 |
2288 | CompShg | CmentBd | Typical | 0 | Norm | Residential_Medium_Density | Lvl | -93.601507 | 1 | Typical | 546 | 1 | Below_Average | 1092 | 1970 | 41.991709 |
141 | CompShg | BrkFace | Typical | 0 | Norm | Residential_Low_Density | Lvl | -93.617704 | 1 | Good | 1398 | 2 | Average | 1700 | 1959 | 42.043084 |
2041 | CompShg | Wd Sdng | Typical | 0 | Feedr | Residential_Medium_Density | Bnk | -93.622025 | 1 | Typical | 925 | 1 | Good | 1889 | 1910 | 42.025893 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
763 | CompShg | Plywood | Good | 120 | Norm | Residential_Low_Density | Lvl | -93.658173 | 3 | Typical | 1200 | 0 | Average | 1200 | 1987 | 42.028098 |
835 | CompShg | VinylSd | Good | 168 | Norm | Residential_Low_Density | Low | -93.691883 | 1 | Good | 1040 | 2 | Average | 1040 | 1996 | 42.021171 |
1653 | CompShg | HdBoard | Typical | 0 | Norm | Residential_Low_Density | Lvl | -93.638554 | 2 | Typical | 539 | 2 | Good | 1725 | 1979 | 42.052353 |
2607 | CompShg | VinylSd | Typical | 0 | Norm | Residential_Low_Density | Lvl | -93.608984 | 1 | Typical | 1024 | 2 | Average | 1086 | 1966 | 42.040811 |
2732 | CompShg | MetalSd | Typical | 237 | Norm | Residential_Low_Density | Lvl | -93.666035 | 1 | Typical | 967 | 2 | Below_Average | 1350 | 1954 | 42.026530 |
2430 rows × 16 columns
%%time
categories = [df[c].unique()
for c in caterogical_columns(df[feature_subset])]
ord_encoder = OrdinalEncoder(categories=categories)
preprocessor = ColumnTransformer([
("categorical", ord_encoder, caterogical_columns),
("numeric", "passthrough", numeric_columns),
])
hgb = HistGradientBoostingRegressor(
max_leaf_nodes=16,
learning_rate=0.1,
min_samples_leaf=5,
early_stopping=True,
n_iter_no_change=5,
max_iter=1000,
random_state=0,
)
reduced_model = Pipeline([
("preprocessor", preprocessor),
("regressor", hgb),
])
_= reduced_model.fit(X_train[feature_subset], y_train)
CPU times: user 1.32 s, sys: 31.7 ms, total: 1.36 s Wall time: 434 ms
%%time
_ = reduced_model.fit(X_train[feature_subset], y_train)
CPU times: user 967 ms, sys: 26.5 ms, total: 994 ms Wall time: 278 ms
reduced_model[-1].n_iter_
80
r2_score_train = reduced_model.score(X_train[feature_subset], y_train)
print(f"r2 score (train): {r2_score_train:.3f}")
r2 score (train): 0.949
test_r2_score = reduced_model.score(X_test[feature_subset], y_test)
print(f"r2 score (test): {test_r2_score:.3f}")
r2 score (test): 0.903
y_pred = reduced_model.predict(X_test[feature_subset])
mape = mean_absolute_percent_error(y_test, y_pred)
print(f"MAPE: {mape:.1f}%")
MAPE: 10.0%
plot_predictions(y_test, y_pred)
from sklearn.inspection import permutation_importance
pi = permutation_importance(reduced_model, X_test[feature_subset], y_test, n_repeats=10,
random_state=42, n_jobs=2)
sorted_idx = pi.importances_mean.argsort()
sorted_names = np.array(feature_subset)[sorted_idx]
fig, ax = plt.subplots(figsize=(12, 8))
ax.boxplot(pi.importances[sorted_idx].T,
vert=False, labels=sorted_names)
ax.set_title("Permutation Importances (test set)");
# %pip install -q git+https://github.com/slundberg/shap
import shap
shap.initjs()
explainer = shap.TreeExplainer(reduced_model[-1])
X_test_encoded = reduced_model[0].transform(X_test[feature_subset])
shap_values = explainer.shap_values(X_test_encoded)
feature_names = caterogical_columns(X_test[feature_subset]).tolist()
feature_names += numeric_columns(X_test[feature_subset]).tolist()
X_test_encoded = pd.DataFrame(X_test_encoded, columns=feature_names)
shap.summary_plot(shap_values, X_test_encoded, plot_type="bar")
Setting feature_perturbation = "tree_path_dependent" because no background data was given.
shap.summary_plot(shap_values, X_test_encoded, plot_size=(14, 7))
sample_idx = 0
print("True sale price:", y_test.iloc[sample_idx])
shap.force_plot(explainer.expected_value, shap_values[sample_idx, :],
X_test_encoded.iloc[sample_idx, :])
True sale price: 220000
from sklearn.inspection import plot_partial_dependence
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_title("Partial Dependence")
plot_partial_dependence(reduced_model, X_test[feature_subset],
["Year_Built"], grid_resolution=20, ax=ax);
fig, ax = plt.subplots(figsize=(10, 5))
plot_partial_dependence(reduced_model, X_test[feature_subset], ["Gr_Liv_Area"],
grid_resolution=20, ax=ax);
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_title("Partial Dependence")
plot_partial_dependence(reduced_model, X[feature_subset], [["Gr_Liv_Area", "Year_Built"]],
grid_resolution=20, contour_kw={"alpha": 0.8}, ax=ax);
fig, ax = plt.subplots(figsize=(12, 12))
ax.set_title("Partial Dependence")
plot_partial_dependence(reduced_model, X[feature_subset],
[["Longitude", "Latitude"]],
percentiles=(0., 1.),
grid_resolution=20, contour_kw={"alpha": 0.8}, ax=ax)
ax = fig.gca()
ax.set_xlim(X["Longitude"].min(), X["Longitude"].max())
ax.set_ylim(X["Latitude"].min(), X["Latitude"].max())
ax.set_aspect("equal")
ax.scatter(X["Longitude"], X["Latitude"]);
HistGradientBoostingRegressor()
HistGradientBoostingRegressor(early_stopping='auto', l2_regularization=0.0, learning_rate=0.1, loss='least_squares', max_bins=255, max_depth=None, max_iter=100, max_leaf_nodes=31, min_samples_leaf=20, n_iter_no_change=10, random_state=None, scoring='loss', tol=1e-07, validation_fraction=0.1, verbose=0, warm_start=False)
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
categories = [df[c].unique()
for c in caterogical_columns(df)]
ord_encoder = OrdinalEncoder(categories=categories)
preprocessor = ColumnTransformer([
("categorical", ord_encoder, caterogical_columns),
("numeric", "passthrough", numeric_columns),
])
hgb = HistGradientBoostingRegressor(
early_stopping=True,
n_iter_no_change=10,
max_iter=1000,
)
model = Pipeline([
("preprocessor", preprocessor),
("regressor", hgb),
])
params = {
"regressor__learning_rate": np.logspace(-3, 0, 10),
"regressor__max_leaf_nodes": [2, 3, 4, 5, 6, 8, 16, 32, 64],
"regressor__max_bins": [3, 5, 10, 30, 50, 100, 255],
"regressor__min_samples_leaf": [1, 2, 5, 10, 20, 50, 100],
}
search = RandomizedSearchCV(model, params, n_iter=200, cv=3,
n_jobs=4, verbose=1)
# _ = search.fit(X_train, y_train)
# cv_results = pd.DataFrame(search.cv_results_)
# cv_results = cv_results.sort_values("mean_test_score", ascending=False)
# cv_results.to_json("ames_gbrt_search_results.json")
cv_results = pd.read_json("ames_gbrt_search_results.json")
def rename_param(column_name):
if "__" in column_name:
return column_name.rsplit("__", 1)[1]
return column_name
cv_results.rename(rename_param, axis=1).head(5)
mean_fit_time | std_fit_time | mean_score_time | std_score_time | min_samples_leaf | max_leaf_nodes | max_bins | learning_rate | params | split0_test_score | split1_test_score | split2_test_score | mean_test_score | std_test_score | rank_test_score | safe_test_score | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
41 | 1.150603 | 0.125701 | 0.047464 | 0.016420 | 10 | 16 | 255 | 0.046416 | {'regressor__min_samples_leaf': 10, 'regressor... | 0.884028 | 0.894805 | 0.915227 | 0.898020 | 0.012938 | 1 | 0.885082 |
71 | 1.278917 | 0.243353 | 0.052712 | 0.020205 | 2 | 16 | 255 | 0.100000 | {'regressor__min_samples_leaf': 2, 'regressor_... | 0.882843 | 0.891829 | 0.919236 | 0.897969 | 0.015479 | 2 | 0.882490 |
75 | 0.658366 | 0.108563 | 0.053916 | 0.023610 | 5 | 16 | 10 | 0.100000 | {'regressor__min_samples_leaf': 5, 'regressor_... | 0.885774 | 0.892744 | 0.910826 | 0.896448 | 0.010558 | 3 | 0.885891 |
168 | 0.761223 | 0.226863 | 0.048083 | 0.013343 | 10 | 8 | 30 | 0.046416 | {'regressor__min_samples_leaf': 10, 'regressor... | 0.887633 | 0.885017 | 0.916373 | 0.896341 | 0.014205 | 4 | 0.882136 |
78 | 0.767932 | 0.128119 | 0.034624 | 0.003252 | 5 | 16 | 255 | 0.100000 | {'regressor__min_samples_leaf': 5, 'regressor_... | 0.885211 | 0.892230 | 0.911465 | 0.896302 | 0.011098 | 5 | 0.885204 |
import plotly.express as px
fig = px.parallel_coordinates(
cv_results.rename(rename_param, axis=1).apply({
"learning_rate": np.log10,
"max_leaf_nodes": np.log2,
"max_bins": np.log2,
"mean_test_score": lambda x: x,
}),
color="mean_test_score",
color_continuous_scale=px.colors.diverging.Portland,
)
fig.show()
Let's zoom on the top performing models by using the query
methods of the dataframe. Note that the axis have a narrower range now:
fig = px.parallel_coordinates(
cv_results.rename(rename_param, axis=1).apply({
"learning_rate": np.log10,
"max_leaf_nodes": np.log2,
"max_bins": np.log2,
"mean_test_score": lambda x: x,
}).query("mean_test_score > 0.88"),
color="mean_test_score",
color_continuous_scale=px.colors.diverging.Portland,
)
fig.show()
Let's check that the inner CV scores still approximately reflect the true generatlization score measured on held out data even when we select the best model from hundred of possible candidates via random search:
best_search_result = cv_results.nlargest(n=1, columns=["mean_test_score"]).iloc[0]
print(f'R2 score of best candidate (inner CV): {best_search_result["mean_test_score"]:.3f}'
f' (+/-{3 * best_search_result["std_test_score"]:.3f})')
R2 score of best candidate (inner CV): 0.898 (+/-0.039)
model.set_params(**best_search_result["params"])
model.fit(X_train, y_train)
held_out_score = model.score(X_test, y_test)
print(f'R2 score of best candidate on held-out data: {held_out_score:.3f}')
R2 score of best candidate on held-out data: 0.893
cv_results["safe_test_score"] = cv_results["mean_test_score"] - cv_results["std_test_score"]
import plotly.express as px
import plotly.offline as pyo
pyo.init_notebook_mode()
param_names = [c for c in cv_results.columns
if c.startswith("param_")]
fig = px.scatter(cv_results, x="mean_score_time", y="safe_test_score",
hover_data=param_names)
fig.show()