House price prediction¶

Loading the Ames Housing Price dataset¶

In [2]:

import pandas as pd

In [3]:

AMES_HOUSING_CSV = "https://www.openml.org/data/get_csv/20649135/file2ed11cebe25.arff"
df = pd.read_csv(AMES_HOUSING_CSV)
df

Out[3]:

	MS_SubClass	MS_Zoning	Lot_Frontage	Lot_Area	Street	Alley	Lot_Shape	Land_Contour	Utilities	Lot_Config	...	Fence	Misc_Feature	Misc_Val	Mo_Sold	Year_Sold	Sale_Type	Sale_Condition	Sale_Price	Longitude	Latitude
0	One_Story_1946_and_Newer_All_Styles	Residential_Low_Density	141	31770	Pave	No_Alley_Access	Slightly_Irregular	Lvl	AllPub	Corner	...	No_Fence	None	0	5	2010	WD	Normal	215000	-93.619754	42.054035
1	One_Story_1946_and_Newer_All_Styles	Residential_High_Density	80	11622	Pave	No_Alley_Access	Regular	Lvl	AllPub	Inside	...	Minimum_Privacy	None	0	6	2010	WD	Normal	105000	-93.619756	42.053014
2	One_Story_1946_and_Newer_All_Styles	Residential_Low_Density	81	14267	Pave	No_Alley_Access	Slightly_Irregular	Lvl	AllPub	Corner	...	No_Fence	Gar2	12500	6	2010	WD	Normal	172000	-93.619387	42.052659
3	One_Story_1946_and_Newer_All_Styles	Residential_Low_Density	93	11160	Pave	No_Alley_Access	Regular	Lvl	AllPub	Corner	...	No_Fence	None	0	4	2010	WD	Normal	244000	-93.617320	42.051245
4	Two_Story_1946_and_Newer	Residential_Low_Density	74	13830	Pave	No_Alley_Access	Slightly_Irregular	Lvl	AllPub	Inside	...	Minimum_Privacy	None	0	3	2010	WD	Normal	189900	-93.638933	42.060899
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2925	Split_or_Multilevel	Residential_Low_Density	37	7937	Pave	No_Alley_Access	Slightly_Irregular	Lvl	AllPub	CulDSac	...	Good_Privacy	None	0	3	2006	WD	Normal	142500	-93.604776	41.988964
2926	One_Story_1946_and_Newer_All_Styles	Residential_Low_Density	0	8885	Pave	No_Alley_Access	Slightly_Irregular	Low	AllPub	Inside	...	Minimum_Privacy	None	0	6	2006	WD	Normal	131000	-93.602680	41.988314
2927	Split_Foyer	Residential_Low_Density	62	10441	Pave	No_Alley_Access	Regular	Lvl	AllPub	Inside	...	Minimum_Privacy	Shed	700	7	2006	WD	Normal	132000	-93.606847	41.986510
2928	One_Story_1946_and_Newer_All_Styles	Residential_Low_Density	77	10010	Pave	No_Alley_Access	Regular	Lvl	AllPub	Inside	...	No_Fence	None	0	4	2006	WD	Normal	170000	-93.600190	41.990921
2929	Two_Story_1946_and_Newer	Residential_Low_Density	74	9627	Pave	No_Alley_Access	Regular	Lvl	AllPub	Inside	...	No_Fence	None	0	11	2006	WD	Normal	188000	-93.599996	41.989265

2930 rows × 81 columns

Some basic data visualization¶

In [4]:

import folium
from folium.plugins import MarkerCluster


center = df[["Latitude", "Longitude"]].mean().values.tolist()
ames_map = folium.Map(location=center, zoom_start=13)
c = MarkerCluster(options={"maxClusterRadius": 40}).add_to(ames_map)

def make_tooltip(record):
    tooltip = f'<div>Sale_Price: ${record["Sale_Price"] / 1e3:.1f}k</div>'
    tooltip += f'<div>Gr_Liv_Area: {record["Gr_Liv_Area"]:.1f}</div>'
    tooltip += f'<div>Year_Built: {record["Year_Built"]:d}</div>'
    return tooltip

for i, record in df.iterrows():
    marker = folium.Marker((record["Latitude"], record["Longitude"]),
                           tooltip=make_tooltip(record))
    marker.add_to(c)

ames_map

Out[4]:

In [5]:

df.plot(x="Gr_Liv_Area", y="Sale_Price", kind="scatter",
        figsize=(12, 6), alpha=0.1);

A baseline univariate linear model¶

In [6]:

from sklearn.model_selection import train_test_split


X = df.drop(columns=["Sale_Price"])
y = df["Sale_Price"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=500, random_state=0)

In [7]:

from sklearn.linear_model import LinearRegression

x_train = X_train["Gr_Liv_Area"].values.reshape(-1, 1)
x_test = X_test["Gr_Liv_Area"].values.reshape(-1, 1)

lr = LinearRegression().fit(x_train, y_train)
r2_score = lr.score(x_test, y_test)

print(f"R2 score (test): {r2_score:.3f}")

R2 score (test): 0.542

In [8]:

y_pred = lr.predict(X_test["Gr_Liv_Area"].values.reshape(-1, 1))

In [9]:

import matplotlib.pyplot as plt


def plot_predictions(y_test, y_pred):
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.scatter(y_pred, y_test, alpha=0.3)
    ax.plot([1e4, 5e5], [1e4, 5e5], linestyle="--")
    ax.set_xlim(1e4, 5e5); ax.set_ylim(1e4, 5e5)
    ax.set_xlabel("Predicted price ($)")
    ax.set_ylabel("Actual price ($)")


plot_predictions(y_test, y_pred)

In [10]:

import numpy as np

def mean_absolute_percent_error(y_true, y_pred):
    diffs = np.abs(y_true - y_pred)
    scales = np.abs(y_true) + np.finfo(np.float64).eps
    return np.mean(diffs / scales) * 100

mape = mean_absolute_percent_error(y_test, y_pred)

print(f"MAPE (test): {mape:.3f}")

MAPE (test): 23.166

Fitting a non linear, multi-variate model¶

In [11]:

X_train.head()

Out[11]:

	MS_SubClass	MS_Zoning	Lot_Frontage	Lot_Area	Street	Alley	Lot_Shape	Land_Contour	Utilities	Lot_Config	...	Pool_QC	Fence	Misc_Feature	Mo_Sold	Year_Sold	Sale_Type	Sale_Condition	Longitude	Latitude
483	Two_Story_1946_and_Newer	Residential_Low_Density	0	8795	Pave	No_Alley_Access	Slightly_Irregular	Lvl	AllPub	Inside	...	No_Pool	No_Fence	None	4	2009	WD	Normal	-93.642813	42.058590
2585	One_Story_1946_and_Newer_All_Styles	Residential_Low_Density	75	10170	Pave	No_Alley_Access	Regular	Lvl	AllPub	Corner	...	No_Pool	No_Fence	None	6	2006	WD	Normal	-93.617069	42.038357
2288	Two_Story_PUD_1946_and_Newer	Residential_Medium_Density	21	2001	Pave	No_Alley_Access	Regular	Lvl	AllPub	Inside	...	No_Pool	No_Fence	None	1	2007	WD	Normal	-93.601507	41.991709
141	One_Story_1946_and_Newer_All_Styles	Residential_Low_Density	70	10552	Pave	No_Alley_Access	Slightly_Irregular	Lvl	AllPub	Inside	...	No_Pool	No_Fence	None	4	2010	WD	Normal	-93.617704	42.043084
2041	Two_Family_conversion_All_Styles_and_Ages	Residential_Medium_Density	60	10120	Pave	No_Alley_Access	Slightly_Irregular	Bnk	AllPub	Inside	...	No_Pool	Minimum_Privacy	None	1	2007	WD	Normal	-93.622025	42.025893

5 rows × 80 columns

In [12]:

def caterogical_columns(df):
    return df.columns[df.dtypes == object]

len(caterogical_columns(df))

Out[12]:

In [13]:

def numeric_columns(df):
    return df.columns[df.dtypes != object]


len(numeric_columns(df))

Out[13]:

In [14]:

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor


categories = [df[c].unique() for c in caterogical_columns(df)]
ord_encoder =  OrdinalEncoder(categories=categories)

preprocessor = ColumnTransformer([
    ("categorical", ord_encoder, caterogical_columns),
    ("numeric", "passthrough", numeric_columns),
])

hgb = HistGradientBoostingRegressor(
    max_leaf_nodes=3,
    learning_rate=0.5,
    early_stopping=True,
    n_iter_no_change=10,
    max_bins=5,
    max_iter=1000,
    random_state=0,
)

model = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", hgb),
])

In [15]:

%%time
_ = model.fit(X_train, y_train)

CPU times: user 4.65 s, sys: 13.1 ms, total: 4.67 s
Wall time: 1.81 s

In [16]:

model[-1].n_iter_

Out[16]:

In [17]:

r2_score_train = model.score(X_train, y_train)
print(f"r2 score (train): {r2_score_train:.3f}")

r2 score (train): 0.926

In [18]:

test_r2_score = model.score(X_test, y_test)
print(f"r2 score (test): {test_r2_score:.3f}")

r2 score (test): 0.897

In [19]:

y_pred = model.predict(X_test)
mape = mean_absolute_percent_error(y_test, y_pred)
print(f"MAPE: {mape:.1f}%")

MAPE: 9.7%

In [20]:

y_pred = model.predict(X_test)
plot_predictions(y_test, y_pred)

Selecting the most important variables¶

In [21]:

from sklearn.inspection import permutation_importance


pi = permutation_importance(model, X_test, y_test, n_repeats=5,
                            random_state=42, n_jobs=2)

In [22]:

sorted_idx = pi.importances_mean.argsort()
most_important_idx = [
    i for i in sorted_idx
    if pi.importances_mean[i] - 4 * pi.importances_std[i] > 0
]
most_important_names = df.columns[most_important_idx]
fig, ax = plt.subplots(figsize=(12, 8))
ax.boxplot(pi.importances[most_important_idx].T,
           vert=False, labels=most_important_names)
ax.set_title("Permutation Importances (test set)");

Retraining a simpler model on the most important features only¶

In [23]:

feature_subset = most_important_names.tolist()
if "Latitude" not in feature_subset:
    feature_subset += ["Latitude"]
if "Longitude" not in feature_subset:
    feature_subset += ["Longitude"]

In [24]:

len(numeric_columns(df[feature_subset]))

Out[24]:

In [25]:

len(caterogical_columns(df[feature_subset]))

Out[25]:

In [26]:

X_train[feature_subset]

Out[26]:

	Roof_Matl	Exterior_2nd	Bsmt_Qual	Wood_Deck_SF	Condition_1	MS_Zoning	Land_Contour	Longitude	Full_Bath	Kitchen_Qual	Total_Bsmt_SF	Garage_Cars	Overall_Qual	Gr_Liv_Area	Year_Built	Latitude
483	CompShg	VinylSd	Good	224	Norm	Residential_Low_Density	Lvl	-93.642813	2	Good	952	2	Good	2256	2000	42.058590
2585	CompShg	Wd Sdng	Typical	0	Norm	Residential_Low_Density	Lvl	-93.617069	1	Good	216	2	Above_Average	1575	1951	42.038357
2288	CompShg	CmentBd	Typical	0	Norm	Residential_Medium_Density	Lvl	-93.601507	1	Typical	546	1	Below_Average	1092	1970	41.991709
141	CompShg	BrkFace	Typical	0	Norm	Residential_Low_Density	Lvl	-93.617704	1	Good	1398	2	Average	1700	1959	42.043084
2041	CompShg	Wd Sdng	Typical	0	Feedr	Residential_Medium_Density	Bnk	-93.622025	1	Typical	925	1	Good	1889	1910	42.025893
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
763	CompShg	Plywood	Good	120	Norm	Residential_Low_Density	Lvl	-93.658173	3	Typical	1200	0	Average	1200	1987	42.028098
835	CompShg	VinylSd	Good	168	Norm	Residential_Low_Density	Low	-93.691883	1	Good	1040	2	Average	1040	1996	42.021171
1653	CompShg	HdBoard	Typical	0	Norm	Residential_Low_Density	Lvl	-93.638554	2	Typical	539	2	Good	1725	1979	42.052353
2607	CompShg	VinylSd	Typical	0	Norm	Residential_Low_Density	Lvl	-93.608984	1	Typical	1024	2	Average	1086	1966	42.040811
2732	CompShg	MetalSd	Typical	237	Norm	Residential_Low_Density	Lvl	-93.666035	1	Typical	967	2	Below_Average	1350	1954	42.026530

2430 rows × 16 columns

In [27]:

%%time

categories = [df[c].unique()
              for c in caterogical_columns(df[feature_subset])]
ord_encoder =  OrdinalEncoder(categories=categories)

preprocessor = ColumnTransformer([
    ("categorical", ord_encoder, caterogical_columns),
    ("numeric", "passthrough", numeric_columns),
])
hgb = HistGradientBoostingRegressor(
    max_leaf_nodes=16,
    learning_rate=0.1,
    min_samples_leaf=5,
    early_stopping=True,
    n_iter_no_change=5,
    max_iter=1000,
    random_state=0,
)
reduced_model = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", hgb),
])

_= reduced_model.fit(X_train[feature_subset], y_train)

CPU times: user 1.32 s, sys: 31.7 ms, total: 1.36 s
Wall time: 434 ms

In [28]:

%%time
_ = reduced_model.fit(X_train[feature_subset], y_train)

CPU times: user 967 ms, sys: 26.5 ms, total: 994 ms
Wall time: 278 ms

In [29]:

reduced_model[-1].n_iter_

Out[29]:

In [30]:

r2_score_train = reduced_model.score(X_train[feature_subset], y_train)
print(f"r2 score (train): {r2_score_train:.3f}")

r2 score (train): 0.949

In [31]:

test_r2_score = reduced_model.score(X_test[feature_subset], y_test)
print(f"r2 score (test): {test_r2_score:.3f}")

r2 score (test): 0.903

In [32]:

y_pred = reduced_model.predict(X_test[feature_subset])
mape = mean_absolute_percent_error(y_test, y_pred)
print(f"MAPE: {mape:.1f}%")

MAPE: 10.0%

In [33]:

plot_predictions(y_test, y_pred)

Model inspection¶

In [34]:

from sklearn.inspection import permutation_importance


pi = permutation_importance(reduced_model, X_test[feature_subset], y_test, n_repeats=10,
                            random_state=42, n_jobs=2)

sorted_idx = pi.importances_mean.argsort()
sorted_names = np.array(feature_subset)[sorted_idx]
fig, ax = plt.subplots(figsize=(12, 8))
ax.boxplot(pi.importances[sorted_idx].T,
           vert=False, labels=sorted_names)
ax.set_title("Permutation Importances (test set)");

In [35]:

# %pip install -q git+https://github.com/slundberg/shap

In [36]:

import shap
shap.initjs()

explainer = shap.TreeExplainer(reduced_model[-1])
X_test_encoded = reduced_model[0].transform(X_test[feature_subset])
shap_values = explainer.shap_values(X_test_encoded)
feature_names = caterogical_columns(X_test[feature_subset]).tolist()
feature_names += numeric_columns(X_test[feature_subset]).tolist()
X_test_encoded = pd.DataFrame(X_test_encoded, columns=feature_names)
shap.summary_plot(shap_values, X_test_encoded, plot_type="bar")

Setting feature_perturbation = "tree_path_dependent" because no background data was given.

In [37]:

shap.summary_plot(shap_values, X_test_encoded, plot_size=(14, 7))

In [38]:

sample_idx = 0
print("True sale price:", y_test.iloc[sample_idx])
shap.force_plot(explainer.expected_value, shap_values[sample_idx, :],
                X_test_encoded.iloc[sample_idx, :])

True sale price: 220000

Out[38]:

Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.

In [39]:

from sklearn.inspection import plot_partial_dependence

fig, ax = plt.subplots(figsize=(10, 5))
ax.set_title("Partial Dependence")
plot_partial_dependence(reduced_model, X_test[feature_subset],
                        ["Year_Built"], grid_resolution=20, ax=ax);

In [40]:

fig, ax = plt.subplots(figsize=(10, 5))
plot_partial_dependence(reduced_model, X_test[feature_subset], ["Gr_Liv_Area"],
                        grid_resolution=20, ax=ax);

In [41]:

fig, ax = plt.subplots(figsize=(8, 8))
ax.set_title("Partial Dependence")
plot_partial_dependence(reduced_model, X[feature_subset], [["Gr_Liv_Area", "Year_Built"]],
                        grid_resolution=20, contour_kw={"alpha": 0.8}, ax=ax);

In [42]:

fig, ax = plt.subplots(figsize=(12, 12))
ax.set_title("Partial Dependence")
plot_partial_dependence(reduced_model, X[feature_subset],
                        [["Longitude", "Latitude"]],
                        percentiles=(0., 1.),
                        grid_resolution=20, contour_kw={"alpha": 0.8}, ax=ax)
ax = fig.gca()
ax.set_xlim(X["Longitude"].min(), X["Longitude"].max())
ax.set_ylim(X["Latitude"].min(), X["Latitude"].max())
ax.set_aspect("equal")
ax.scatter(X["Longitude"], X["Latitude"]);

Model selection: hyperparameter tuning¶

In [43]:

HistGradientBoostingRegressor()

Out[43]:

HistGradientBoostingRegressor(early_stopping='auto', l2_regularization=0.0,
                              learning_rate=0.1, loss='least_squares',
                              max_bins=255, max_depth=None, max_iter=100,
                              max_leaf_nodes=31, min_samples_leaf=20,
                              n_iter_no_change=10, random_state=None,
                              scoring='loss', tol=1e-07,
                              validation_fraction=0.1, verbose=0,
                              warm_start=False)

In [44]:

from sklearn.model_selection import RandomizedSearchCV
import numpy as np

categories = [df[c].unique()
              for c in caterogical_columns(df)]
ord_encoder =  OrdinalEncoder(categories=categories)

preprocessor = ColumnTransformer([
    ("categorical", ord_encoder, caterogical_columns),
    ("numeric", "passthrough", numeric_columns),
])
hgb = HistGradientBoostingRegressor(
    early_stopping=True,
    n_iter_no_change=10,
    max_iter=1000,
)
model = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", hgb),
])


params = {
    "regressor__learning_rate": np.logspace(-3, 0, 10),
    "regressor__max_leaf_nodes": [2, 3, 4, 5, 6, 8, 16, 32, 64],
    "regressor__max_bins": [3, 5, 10, 30, 50, 100, 255],
    "regressor__min_samples_leaf": [1, 2, 5, 10, 20, 50, 100],
}
search = RandomizedSearchCV(model, params, n_iter=200, cv=3,
                            n_jobs=4, verbose=1)

In [45]:

# _ = search.fit(X_train, y_train)

In [46]:

# cv_results = pd.DataFrame(search.cv_results_)
# cv_results = cv_results.sort_values("mean_test_score", ascending=False)
# cv_results.to_json("ames_gbrt_search_results.json")

In [47]:

cv_results = pd.read_json("ames_gbrt_search_results.json")

In [48]:

def rename_param(column_name):
    if "__" in column_name:
        return column_name.rsplit("__", 1)[1]
    return column_name

In [49]:

cv_results.rename(rename_param, axis=1).head(5)

Out[49]:

	mean_fit_time	std_fit_time	mean_score_time	std_score_time	min_samples_leaf	max_leaf_nodes	max_bins	learning_rate	params	split0_test_score	split1_test_score	split2_test_score	mean_test_score	std_test_score	rank_test_score	safe_test_score
41	1.150603	0.125701	0.047464	0.016420	10	16	255	0.046416	{'regressor__min_samples_leaf': 10, 'regressor...	0.884028	0.894805	0.915227	0.898020	0.012938	1	0.885082
71	1.278917	0.243353	0.052712	0.020205	2	16	255	0.100000	{'regressor__min_samples_leaf': 2, 'regressor_...	0.882843	0.891829	0.919236	0.897969	0.015479	2	0.882490
75	0.658366	0.108563	0.053916	0.023610	5	16	10	0.100000	{'regressor__min_samples_leaf': 5, 'regressor_...	0.885774	0.892744	0.910826	0.896448	0.010558	3	0.885891
168	0.761223	0.226863	0.048083	0.013343	10	8	30	0.046416	{'regressor__min_samples_leaf': 10, 'regressor...	0.887633	0.885017	0.916373	0.896341	0.014205	4	0.882136
78	0.767932	0.128119	0.034624	0.003252	5	16	255	0.100000	{'regressor__min_samples_leaf': 5, 'regressor_...	0.885211	0.892230	0.911465	0.896302	0.011098	5	0.885204

Interactions between hyperparameters and generalization¶

In [73]:

import plotly.express as px


fig = px.parallel_coordinates(
    cv_results.rename(rename_param, axis=1).apply({
        "learning_rate": np.log10,
        "max_leaf_nodes": np.log2,
        "max_bins": np.log2,
        "mean_test_score": lambda x: x,
    }),
    color="mean_test_score",
    color_continuous_scale=px.colors.diverging.Portland,
)
fig.show()

Let's zoom on the top performing models by using the query methods of the dataframe. Note that the axis have a narrower range now:

In [74]:

fig = px.parallel_coordinates(
    cv_results.rename(rename_param, axis=1).apply({
        "learning_rate": np.log10,
        "max_leaf_nodes": np.log2,
        "max_bins": np.log2,
        "mean_test_score": lambda x: x,
    }).query("mean_test_score > 0.88"),
    color="mean_test_score",
    color_continuous_scale=px.colors.diverging.Portland,
)
fig.show()

Let's check that the inner CV scores still approximately reflect the true generatlization score measured on held out data even when we select the best model from hundred of possible candidates via random search:

In [96]:

best_search_result = cv_results.nlargest(n=1, columns=["mean_test_score"]).iloc[0]
print(f'R2 score of best candidate (inner CV): {best_search_result["mean_test_score"]:.3f}'
      f' (+/-{3 * best_search_result["std_test_score"]:.3f})')

R2 score of best candidate (inner CV): 0.898 (+/-0.039)

In [97]:

model.set_params(**best_search_result["params"])
model.fit(X_train, y_train)
held_out_score = model.score(X_test, y_test)
print(f'R2 score of best candidate on held-out data: {held_out_score:.3f}')

R2 score of best candidate on held-out data: 0.893

Exploration of the accuracy / prediction latency trade-off¶

In [52]:

cv_results["safe_test_score"] = cv_results["mean_test_score"] - cv_results["std_test_score"]

In [53]:

import plotly.express as px
import plotly.offline as pyo
pyo.init_notebook_mode()


param_names = [c for c in cv_results.columns
               if c.startswith("param_")]
fig = px.scatter(cv_results, x="mean_score_time", y="safe_test_score",
                 hover_data=param_names)

fig.show()

In [ ]: