Appendix L — House Price prediction model exploration

Development of a scikit-learn model able to predict the house price indicator from the explanatory variables capturing different scenarios.

import geopandas as gpd
import numpy as np
import pandas as pd

import contextily
import palettable.matplotlib as palmpl
import matplotlib.pyplot as plt
import mapclassify
import libpysal

from utils import legendgram

from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_predict, GridSearchCV
data_folder = "/Users/martin/Library/CloudStorage/OneDrive-SharedLibraries-TheAlanTuringInstitute/Daniel Arribas-Bel - demoland_data"

Load the data

data = gpd.read_parquet(f"{data_folder}/processed/interpolated/all_oa.parquet")

Keep only the explanatory variables (drop identifiers, geometry, and the outcome indicators).

exvars = data.drop(
    columns=[
        "geo_code",
        "geometry",
        "air_quality_index",
        "house_price_index",
        "jobs_accessibility_index",
        "greenspace_accessibility_index",
    ]
)
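
A quick sanity check that none of the outcome indicators remains among the predictors (shape shown for reference):

# verify that no outcome indicator leaked into the predictor set
assert not {
    "house_price_index",
    "air_quality_index",
    "jobs_accessibility_index",
    "greenspace_accessibility_index",
} & set(exvars.columns)
exvars.shape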

Specify grid search parameters

parameters = {
    "learning_rate": (0.01, 0.05, 0.1, 0.2, 0.5, 1),
    "max_iter": [100, 200, 500],
    "max_bins": (64, 128, 255),
}
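
With the default 5-fold cross-validation this grid amounts to 6 × 3 × 3 = 54 candidates and 270 fits; a quick way to confirm the size of the search:

from sklearn.model_selection import ParameterGrid

n_candidates = len(ParameterGrid(parameters))  # 6 * 3 * 3 = 54
n_candidates, n_candidates * 5  # candidates, total fits with 5-fold CV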

Initialise GridSearchCV with the Histogram-based Gradient Boosting Regression Tree.

regressor = HistGradientBoostingRegressor(
    random_state=0,
)
est = GridSearchCV(regressor, parameters, verbose=1)

Create a mask to filter out observations with a missing house price index.

mask = data.house_price_index.notna()
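
A quick count of how many output areas have an observed house price index:

# OAs with / without an observed house price index
mask.sum(), (~mask).sum()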

Fit the data.

%%time
est.fit(exvars[mask], data.house_price_index[mask])
Fitting 5 folds for each of 54 candidates, totalling 270 fits
CPU times: user 8min 18s, sys: 13min 18s, total: 21min 37s
Wall time: 13min 49s
GridSearchCV(estimator=HistGradientBoostingRegressor(random_state=0),
             param_grid={'learning_rate': (0.01, 0.05, 0.1, 0.2, 0.5, 1),
                         'max_bins': (64, 128, 255),
                         'max_iter': [100, 200, 500]},
             verbose=1)
est.best_estimator_
HistGradientBoostingRegressor(learning_rate=0.05, max_iter=500, random_state=0)

Check the cross-validated score of HistGradientBoostingRegressor without any pre-processing as a baseline.

est.best_score_
0.25446450463994796
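The best_score_ attribute is the mean cross-validated R² of the best parameter combination; the full grid results can be inspected as a DataFrame if needed:

# grid-search results ranked by mean test R²
cv_results = pd.DataFrame(est.cv_results_).sort_values("rank_test_score")
cv_results[
    ["param_learning_rate", "param_max_iter", "param_max_bins", "mean_test_score"]
].head()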
y_pred = cross_val_predict(
    est.best_estimator_, exvars[mask], data.house_price_index[mask], cv=5
)

Get the prediction and residuals.

pred = pd.Series(y_pred, index=data[mask].index)
residuals = data.house_price_index[mask] - pred

Check the error

mean_squared_error(data.house_price_index[mask], pred)
248704.12186985405
data.house_price_index[mask].describe()
count    3513.000000
mean     1591.847350
std       583.630015
min       300.549746
25%      1152.094294
50%      1509.777596
75%      1975.173781
max      6925.256593
Name: house_price_index, dtype: float64

Given the mean value is ~1,600, an MSE of ~248,700 is not great: it corresponds to a typical error of around 500, roughly 30% of the mean.
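
Putting the error on the same scale as the target:

rmse = np.sqrt(mean_squared_error(data.house_price_index[mask], pred))
rmse, rmse / data.house_price_index[mask].mean()  # ~499 and ~0.31 of the mean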

fig, ax = plt.subplots(figsize=(8, 8))
plt.scatter(data.house_price_index[mask], pred, s=0.25)
plt.xlabel("Y test")
plt.ylabel("Y pred")
Text(0, 0.5, 'Y pred')

Plot results

from shapely.geometry import box

bds = data.total_bounds
extent = gpd.GeoSeries(
    [box((bds[0] - 7000), bds[1], bds[2] + 7000, bds[3])], crs=data.crs
).to_crs(3857)

Plot prediction using the same bins as real values.

f, ax = plt.subplots(figsize=(18, 12))
extent.plot(ax=ax, alpha=0)
bins = mapclassify.EqualInterval(
    data["house_price_index"].fillna(method="pad").values, k=20
).bins

data.assign(pred=pred).to_crs(3857).plot(
    "pred",
    scheme="userdefined",
    classification_kwds={"bins": bins},
    ax=ax,
    alpha=0.9,
    cmap="viridis",
)
legendgram(
    f,
    ax,
    pred,
    bins,
    pal=palmpl.Viridis_10,
    legend_size=(0.35, 0.15),  # legend size in fractions of the axis
    loc="lower left",  # matplotlib-style legend locations
    clip=(
        0,
        data["house_price_index"].max(),
    ),  # clip the displayed range of the histogram
)
ax.set_axis_off()
contextily.add_basemap(
    ax=ax, source=contextily.providers.CartoDB.PositronNoLabels, attribution=""
)
contextily.add_basemap(
    ax=ax,
    source=contextily.providers.Stamen.TonerLines,
    alpha=0.4,
    attribution="(C) CARTO, Map tiles by Stamen Design, CC BY 3.0 -- Map data (C) OpenStreetMap contributors",
)
# plt.savefig(f"{data_folder}/outputs/figures/air_quality_index.png", dpi=150, bbox_inches="tight")

Plot residuals

f, ax = plt.subplots(figsize=(18, 12))
extent.plot(ax=ax, alpha=0)
data.assign(res=residuals).to_crs(3857).plot(
    "res", ax=ax, alpha=0.9, cmap="RdBu", vmin=-2000, vmax=2000, legend=True
)
ax.set_axis_off()
contextily.add_basemap(
    ax=ax, source=contextily.providers.CartoDB.PositronNoLabels, attribution=""
)
contextily.add_basemap(
    ax=ax,
    source=contextily.providers.Stamen.TonerLines,
    alpha=0.4,
    attribution="(C) CARTO, Map tiles by Stamen Design, CC BY 3.0 -- Map data (C) OpenStreetMap contributors",
)
# plt.savefig(f"{data_folder}/outputs/figures/air_quality_index.png", dpi=150, bbox_inches="tight")

residuals.plot.hist(bins=25)

The results are not great. Given that the mean house price index is about 1,600 per square metre, the model can be quite far off, especially in some locations.

L.1 Using spatial lag

We can add a lag into the mix. Let’s start with the Queen weights.

queen = libpysal.weights.Queen.from_dataframe(data)
queen.transform = "R"
/Users/martin/mambaforge/envs/demoland/lib/python3.11/site-packages/libpysal/weights/weights.py:172: UserWarning: The weights matrix is not fully connected: 
 There are 3 disconnected components.
  warnings.warn(message)
for col in exvars.columns.copy():
    exvars[f"{col}_lag"] = libpysal.weights.spatial_lag.lag_spatial(queen, exvars[col])
exvars.head()
population_estimate A, B, D, E. Agriculture, energy and water C. Manufacturing F. Construction G, I. Distribution, hotels and restaurants H, J. Transport and communication K, L, M, N. Financial, real estate, professional and administrative activities O,P,Q. Public administration, education and health R, S, T, U. Other Land cover [Discontinuous urban fabric] ... sdsAre_lag sisBpM_lag misCel_lag ltcRea_lag ldeAre_lag lseCCo_lag lseERI_lag lteOri_lag lteWNB_lag lieWCe_lag
0 315 0.547024 3.146641 5.436732 4.839081 2.065436 4.283496 6.922660 1.606140 1.000000 ... 30280.039796 0.064310 28.550796 51.060993 197839.799834 0.409687 0.881904 28.787310 0.014056 0.013885
1 415 0.028774 4.017802 3.870111 33.515266 8.821760 26.432214 83.050968 15.597812 0.152752 ... 17776.067815 0.054231 13.316627 60.676470 32812.973123 0.398693 0.937619 20.853156 0.029603 0.001324
2 246 0.110388 1.325208 4.297030 11.481138 2.315769 4.408114 8.930658 3.636795 1.000000 ... 54958.015580 0.058214 27.507441 53.117042 216595.359019 0.323117 0.784838 16.607145 0.009236 0.002879
3 277 0.478192 2.732637 4.713528 4.167451 1.776267 3.689214 5.194090 1.366452 0.931788 ... 78050.340373 0.062603 29.854903 70.854692 351081.933889 0.398664 0.862998 23.747635 0.009530 0.013940
4 271 0.680069 3.917490 6.768659 6.029714 2.573226 5.335117 8.758326 2.003362 0.696472 ... 27462.223773 0.067705 29.971743 52.001773 59309.608454 0.379304 0.890960 18.537922 0.016313 0.004461

5 rows × 118 columns
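
For reference, with row-standardised weights the spatial lag of a variable is simply the mean of its values over each observation's neighbours. A minimal, purely illustrative sketch on a toy lattice:

from libpysal.weights import lat2W
from libpysal.weights.spatial_lag import lag_spatial

w = lat2W(3, 3)  # 3x3 rook-contiguity lattice
w.transform = "R"  # row-standardise
y = np.arange(9, dtype=float)
lag_spatial(w, y)[4]  # centre cell: mean of neighbours 1, 3, 5, 7 = 4.0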

regressor_lag = HistGradientBoostingRegressor(
    random_state=0,
)
est_lag = GridSearchCV(regressor_lag, parameters, verbose=1)

Fit the data.

%%time
est_lag.fit(exvars[mask], data[mask].house_price_index)
Fitting 5 folds for each of 54 candidates, totalling 270 fits
CPU times: user 13min 29s, sys: 18min 57s, total: 32min 27s
Wall time: 18min 13s
GridSearchCV(estimator=HistGradientBoostingRegressor(random_state=0),
             param_grid={'learning_rate': (0.01, 0.05, 0.1, 0.2, 0.5, 1),
                         'max_bins': (64, 128, 255),
                         'max_iter': [100, 200, 500]},
             verbose=1)
est_lag.best_estimator_
HistGradientBoostingRegressor(learning_rate=0.05, max_bins=128, max_iter=500,
                              random_state=0)

Check the cross-validated score of the lagged model.

est_lag.best_score_
0.3852767425712931
y_pred_lag = cross_val_predict(
    est_lag.best_estimator_, exvars[mask], data[mask].house_price_index, cv=5
)

Get the prediction and residuals.

pred_lag = pd.Series(y_pred_lag, index=data[mask].index)
residuals_lag = data[mask].house_price_index - pred_lag

Check the error

mean_squared_error(data[mask].house_price_index, pred_lag)
201453.6577863544

MSE went down from 248,704 to 201,454, but that still seems a bit high (an RMSE of roughly 449 against a mean value of about 1,600).
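
The same comparison in terms of RMSE:

rmse_base = np.sqrt(mean_squared_error(data.house_price_index[mask], pred))
rmse_lag = np.sqrt(mean_squared_error(data.house_price_index[mask], pred_lag))
rmse_base, rmse_lag  # roughly 499 vs 449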

fig, ax = plt.subplots(figsize=(8, 8))
plt.scatter(data.house_price_index[mask], pred_lag, s=0.25)
plt.xlabel("Y test")
plt.ylabel("Y pred")
Text(0, 0.5, 'Y pred')

Plot prediction using the same bins as real values.

f, ax = plt.subplots(figsize=(18, 12))
extent.plot(ax=ax, alpha=0)
bins = mapclassify.EqualInterval(
    data["house_price_index"].fillna(method="pad").values, k=20
).bins

data.assign(pred=pred_lag).to_crs(3857).plot(
    "pred",
    scheme="userdefined",
    classification_kwds={"bins": bins},
    ax=ax,
    alpha=0.9,
    cmap="viridis",
)
legendgram(
    f,
    ax,
    pred_lag,
    bins,
    pal=palmpl.Viridis_10,
    legend_size=(0.35, 0.15),  # legend size in fractions of the axis
    loc="lower left",  # matplotlib-style legend locations
    clip=(
        0,
        data["house_price_index"].max(),
    ),  # clip the displayed range of the histogram
)
ax.set_axis_off()
contextily.add_basemap(
    ax=ax, source=contextily.providers.CartoDB.PositronNoLabels, attribution=""
)
contextily.add_basemap(
    ax=ax,
    source=contextily.providers.Stamen.TonerLines,
    alpha=0.4,
    attribution="(C) CARTO, Map tiles by Stamen Design, CC BY 3.0 -- Map data (C) OpenStreetMap contributors",
)
# plt.savefig(f"{data_folder}/outputs/figures/air_quality_index.png", dpi=150, bbox_inches="tight")

Plot residuals

f, ax = plt.subplots(figsize=(18, 12))
extent.plot(ax=ax, alpha=0)
data.assign(res=residuals_lag).to_crs(3857).plot(
    "res", ax=ax, alpha=0.9, cmap="RdBu", vmin=-2000, vmax=2000, legend=True
)
ax.set_axis_off()
contextily.add_basemap(
    ax=ax, source=contextily.providers.CartoDB.PositronNoLabels, attribution=""
)
contextily.add_basemap(
    ax=ax,
    source=contextily.providers.Stamen.TonerLines,
    alpha=0.4,
    attribution="(C) CARTO, Map tiles by Stamen Design, CC BY 3.0 -- Map data (C) OpenStreetMap contributors",
)
# plt.savefig(f"{data_folder}/outputs/figures/air_quality_index.png", dpi=150, bbox_inches="tight")

residuals_lag.plot.hist(bins=25)

Overlay of the residual histograms of the original and the lagged model. There is no clear indication that the lagged one is vastly superior.

ax = residuals_lag.plot.hist(bins=50)
residuals.plot.hist(bins=50, ax=ax, color="r", alpha=0.7)

The mean absolute residuals:

residuals_lag.abs().mean(), residuals.abs().mean()
(345.0198428517582, 381.3997073048114)
est_lag.best_estimator_
HistGradientBoostingRegressor(learning_rate=0.05, max_bins=128, max_iter=500,
                              random_state=0)
est.best_estimator_
HistGradientBoostingRegressor(learning_rate=0.05, max_iter=500, random_state=0)

L.1.1 Wider weights

queen = libpysal.weights.Queen.from_dataframe(data)
queen3 = libpysal.weights.higher_order(queen, k=3, lower_order=True)
queen3.transform = "R"
for col in exvars.columns.copy():
    exvars[f"{col}_lag"] = libpysal.weights.spatial_lag.lag_spatial(queen3, exvars[col])
regressor_lag = HistGradientBoostingRegressor(
    random_state=0,
)
est_lag = GridSearchCV(regressor_lag, parameters, verbose=1)

Fit the data.

%%time
est_lag.fit(exvars[mask], data.house_price_index[mask])
Fitting 5 folds for each of 54 candidates, totalling 270 fits
CPU times: user 10min 36s, sys: 11min 30s, total: 22min 6s
Wall time: 13min 58s
GridSearchCV(estimator=HistGradientBoostingRegressor(random_state=0),
             param_grid={'learning_rate': (0.01, 0.05, 0.1, 0.2, 0.5, 1),
                         'max_bins': (64, 128, 255),
                         'max_iter': [100, 200, 500]},
             verbose=1)
est_lag.best_estimator_
HistGradientBoostingRegressor(learning_rate=0.05, max_iter=500, random_state=0)

Check the cross-validated score of the model with wider (order-3) lags.

est_lag.best_score_
0.4753837295272674
%%time
est_lag.best_estimator_.predict(exvars[mask])
CPU times: user 147 ms, sys: 32.8 ms, total: 180 ms
Wall time: 86 ms
array([1731.20448464, 2830.44054758, 1613.82686231, ..., 1590.77525305,
       2612.76409275, 1496.54246374])
y_pred_lag = cross_val_predict(
    est_lag.best_estimator_, exvars[mask], data[mask].house_price_index, cv=5
)

Get the prediction and residuals.

pred_lag = pd.Series(y_pred_lag, index=data[mask].index)
residuals_lag = data[mask].house_price_index - pred_lag

Check the error

mean_squared_error(data[mask].house_price_index, pred_lag)
168855.00058162943
residuals_lag.abs().mean()
314.2761789390005
fig, ax = plt.subplots(figsize=(8, 8))
plt.scatter(data.house_price_index[mask], pred_lag, s=0.25)
plt.xlabel("Y test")
plt.ylabel("Y pred")
Text(0, 0.5, 'Y pred')

Plot prediction using the same bins as real values.

f, ax = plt.subplots(figsize=(18, 12))
extent.plot(ax=ax, alpha=0)
bins = mapclassify.NaturalBreaks(data["house_price_index"].dropna().values, k=10).bins

data.assign(pred=pred_lag).to_crs(3857).plot(
    "pred",
    scheme="userdefined",
    classification_kwds={"bins": bins},
    ax=ax,
    alpha=0.9,
    cmap="viridis",
)
legendgram(
    f,
    ax,
    pred_lag,
    bins,
    pal=palmpl.Viridis_10,
    legend_size=(0.35, 0.15),  # legend size in fractions of the axis
    loc="lower left",  # matplotlib-style legend locations
    clip=(
        0,
        data["house_price_index"].max(),
    ),  # clip the displayed range of the histogram
)
ax.set_axis_off()
contextily.add_basemap(
    ax=ax, source=contextily.providers.CartoDB.PositronNoLabels, attribution=""
)
contextily.add_basemap(
    ax=ax,
    source=contextily.providers.Stamen.TonerLines,
    alpha=0.4,
    attribution="(C) CARTO, Map tiles by Stamen Design, CC BY 3.0 -- Map data (C) OpenStreetMap contributors",
)
# plt.savefig(f"{data_folder}/outputs/figures/air_quality_index.png", dpi=150, bbox_inches="tight")

Plot residuals

f, ax = plt.subplots(figsize=(18, 12))
extent.plot(ax=ax, alpha=0)
data.assign(res=residuals_lag).to_crs(3857).plot(
    "res", ax=ax, alpha=0.9, cmap="RdBu", vmin=-2000, vmax=2000, legend=True
)
ax.set_axis_off()
contextily.add_basemap(
    ax=ax, source=contextily.providers.CartoDB.PositronNoLabels, attribution=""
)
contextily.add_basemap(
    ax=ax,
    source=contextily.providers.Stamen.TonerLines,
    alpha=0.4,
    attribution="(C) CARTO, Map tiles by Stamen Design, CC BY 3.0 -- Map data (C) OpenStreetMap contributors",
)
# plt.savefig(f"{data_folder}/outputs/figures/air_quality_index.png", dpi=150, bbox_inches="tight")

residuals_lag.plot.hist(bins=25)

L.2 Include accessibilities

We can try to include green space and jobs accessibility, as those are likely to affect the house price. We create a lagged model with these two variables on top of the explanatory variables.

exvars = data.drop(
    columns=["geo_code", "geometry", "air_quality_index", "house_price_index"]
)
queen = libpysal.weights.Queen.from_dataframe(data)
queen.transform = "R"
for col in exvars.columns.copy():
    exvars[f"{col}_lag"] = libpysal.weights.spatial_lag.lag_spatial(queen, exvars[col])
exvars.head()
population_estimate A, B, D, E. Agriculture, energy and water C. Manufacturing F. Construction G, I. Distribution, hotels and restaurants H, J. Transport and communication K, L, M, N. Financial, real estate, professional and administrative activities O,P,Q. Public administration, education and health R, S, T, U. Other Land cover [Discontinuous urban fabric] ... misCel_lag ltcRea_lag ldeAre_lag lseCCo_lag lseERI_lag lteOri_lag lteWNB_lag lieWCe_lag jobs_accessibility_index_lag greenspace_accessibility_index_lag
0 315 0.547024 3.146641 5.436732 4.839081 2.065436 4.283496 6.922660 1.606140 1.000000 ... 28.550796 51.060993 197839.799834 0.409687 0.881904 28.787310 0.014056 0.013885 1582.500000 2.650345e+05
1 415 0.028774 4.017802 3.870111 33.515266 8.821760 26.432214 83.050968 15.597812 0.152752 ... 13.316627 60.676470 32812.973123 0.398693 0.937619 20.853156 0.029603 0.001324 10238.400000 1.329411e+06
2 246 0.110388 1.325208 4.297030 11.481138 2.315769 4.408114 8.930658 3.636795 1.000000 ... 27.507441 53.117042 216595.359019 0.323117 0.784838 16.607145 0.009236 0.002879 1211.000000 5.170833e+05
3 277 0.478192 2.732637 4.713528 4.167451 1.776267 3.689214 5.194090 1.366452 0.931788 ... 29.854903 70.854692 351081.933889 0.398664 0.862998 23.747635 0.009530 0.013940 2275.500000 3.185030e+05
4 271 0.680069 3.917490 6.768659 6.029714 2.573226 5.335117 8.758326 2.003362 0.696472 ... 29.971743 52.001773 59309.608454 0.379304 0.890960 18.537922 0.016313 0.004461 1480.142857 3.063273e+05

5 rows × 122 columns

regressor_lag = HistGradientBoostingRegressor(
    random_state=0,
)
est_lag = GridSearchCV(regressor_lag, parameters, verbose=1)

Fit the data.

%%time
est_lag.fit(exvars[mask], data[mask].house_price_index)
Fitting 5 folds for each of 54 candidates, totalling 270 fits
CPU times: user 19min 33s, sys: 33min 44s, total: 53min 17s
Wall time: 26min 23s
GridSearchCV(estimator=HistGradientBoostingRegressor(random_state=0),
             param_grid={'learning_rate': (0.01, 0.05, 0.1, 0.2, 0.5, 1),
                         'max_bins': (64, 128, 255),
                         'max_iter': [100, 200, 500]},
             verbose=1)
est_lag.best_estimator_
HistGradientBoostingRegressor(learning_rate=0.05, max_iter=500, random_state=0)

Check the cross-validated score of the model with accessibilities included.

est_lag.best_score_
0.40067474727924707
y_pred_lag = cross_val_predict(
    est_lag.best_estimator_, exvars[mask], data[mask].house_price_index, cv=5
)

Get the prediction and residuals.

pred_lag = pd.Series(y_pred_lag, index=data[mask].index)
residuals_lag = data[mask].house_price_index - pred_lag

Check the error

mean_squared_error(data[mask].house_price_index, pred_lag)
196944.98165903118

MSE (196,945) is only marginally better than the first-order lag model (201,454) and still worse than the order-3 variant (168,855).

fig, ax = plt.subplots(figsize=(8, 8))
plt.scatter(data.house_price_index[mask], pred_lag, s=0.25)
plt.xlabel("Y test")
plt.ylabel("Y pred")
Text(0, 0.5, 'Y pred')

Plot prediction using the same bins as real values.

f, ax = plt.subplots(figsize=(18, 12))
extent.plot(ax=ax, alpha=0)
bins = mapclassify.EqualInterval(
    data["house_price_index"].fillna(method="pad").values, k=20
).bins

data.assign(pred=pred_lag).to_crs(3857).plot(
    "pred",
    scheme="userdefined",
    classification_kwds={"bins": bins},
    ax=ax,
    alpha=0.9,
    cmap="viridis",
)
legendgram(
    f,
    ax,
    pred_lag,
    bins,
    pal=palmpl.Viridis_10,
    legend_size=(0.35, 0.15),  # legend size in fractions of the axis
    loc="lower left",  # matplotlib-style legend locations
    clip=(
        0,
        data["house_price_index"].max(),
    ),  # clip the displayed range of the histogram
)
ax.set_axis_off()
contextily.add_basemap(
    ax=ax, source=contextily.providers.CartoDB.PositronNoLabels, attribution=""
)
contextily.add_basemap(
    ax=ax,
    source=contextily.providers.Stamen.TonerLines,
    alpha=0.4,
    attribution="(C) CARTO, Map tiles by Stamen Design, CC BY 3.0 -- Map data (C) OpenStreetMap contributors",
)
# plt.savefig(f"{data_folder}/outputs/figures/air_quality_index.png", dpi=150, bbox_inches="tight")

Plot residuals

f, ax = plt.subplots(figsize=(18, 12))
extent.plot(ax=ax, alpha=0)
data.assign(res=residuals_lag).to_crs(3857).plot(
    "res", ax=ax, alpha=0.9, cmap="RdBu", vmin=-2000, vmax=2000, legend=True
)
ax.set_axis_off()
contextily.add_basemap(
    ax=ax, source=contextily.providers.CartoDB.PositronNoLabels, attribution=""
)
contextily.add_basemap(
    ax=ax,
    source=contextily.providers.Stamen.TonerLines,
    alpha=0.4,
    attribution="(C) CARTO, Map tiles by Stamen Design, CC BY 3.0 -- Map data (C) OpenStreetMap contributors",
)
# plt.savefig(f"{data_folder}/outputs/figures/air_quality_index.png", dpi=150, bbox_inches="tight")

residuals_lag.plot.hist(bins=25)

Not great so far; it seems that our set of explanatory variables is not rich enough to predict house prices in this way…
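
For reference, the cross-validated MSEs of the variants explored so far (values copied from the runs above):

pd.Series(
    {
        "baseline": 248704,
        "Queen lag": 201454,
        "Queen lag up to order 3": 168855,
        "Queen lag + accessibilities": 196945,
    },
    name="MSE",
)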

L.3 With latent representation of Sentinel 2

Try including the lagged latent representation from the postcode Sentinel paper.

latent_oa = pd.read_parquet(f"{data_folder}/processed/sentinel/latent_oa.parquet")
queen = libpysal.weights.Queen.from_dataframe(data)
queen.transform = "R"
exvars_latent = pd.concat([exvars, latent_oa.drop(columns="geometry")], axis=1)
for col in exvars_latent.columns.copy():
    exvars_latent[f"{col}_lag"] = libpysal.weights.spatial_lag.lag_spatial(
        queen, exvars_latent[col]
    )
/var/folders/2f/fhks6w_d0k556plcv3rfmshw0000gn/T/ipykernel_54818/186550642.py:2: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  exvars_latent[f"{col}_lag"] = libpysal.weights.spatial_lag.lag_spatial(
(the same warning is repeated for each lagged column added)
exvars_latent = exvars_latent.copy()  # de-fragment the DataFrame after the repeated column inserts
parameters = {
    "learning_rate": (0.01, 0.05, 0.1),
    "max_iter": [500],
    "max_bins": (64, 128),
}

Initialise GridSearchCV with the Histogram-based Gradient Boosting Regression Tree, this time with a reduced parameter grid.

regressor = HistGradientBoostingRegressor(
    random_state=0,
)
est_latent = GridSearchCV(regressor, parameters, verbose=1)

Fit the data.

%%time
est_latent.fit(exvars_latent[mask], data[mask].house_price_index)
Fitting 5 folds for each of 6 candidates, totalling 30 fits
CPU times: user 3min 12s, sys: 2min 41s, total: 5min 53s
Wall time: 3min 30s
GridSearchCV(estimator=HistGradientBoostingRegressor(random_state=0),
             param_grid={'learning_rate': (0.01, 0.05, 0.1),
                         'max_bins': (64, 128), 'max_iter': [500]},
             verbose=1)
est_latent.best_estimator_
HistGradientBoostingRegressor(learning_rate=0.05, max_bins=64, max_iter=500,
                              random_state=0)

Check the cross-validated score of the model with the Sentinel 2 latent representation.

est_latent.best_score_
0.42214079367595386
y_pred_latent = cross_val_predict(
    est_latent.best_estimator_, exvars_latent[mask], data[mask].house_price_index, cv=5
)

Get the prediction and residuals.

pred_latent = pd.Series(y_pred_latent, index=data[mask].index)
residuals_latent = data[mask].house_price_index - pred_latent

Check the error

mean_squared_error(data[mask].house_price_index, pred_latent)
190508.98759834783

MSE (190,509) is again only marginally better than before.

fig, ax = plt.subplots(figsize=(8, 8))
plt.scatter(data.house_price_index[mask], pred_latent, s=0.25)
plt.xlabel("Y test")
plt.ylabel("Y pred")
Text(0, 0.5, 'Y pred')

Plot prediction using the same bins as real values.

f, ax = plt.subplots(figsize=(18, 12))
extent.plot(ax=ax, alpha=0)
bins = mapclassify.NaturalBreaks(data["house_price_index"].dropna().values, k=10).bins

data.assign(pred=pred_latent).to_crs(3857).plot(
    "pred",
    scheme="userdefined",
    classification_kwds={"bins": bins},
    ax=ax,
    alpha=0.9,
    cmap="viridis",
)
legendgram(
    f,
    ax,
    pred_latent,
    bins,
    pal=palmpl.Viridis_10,
    legend_size=(0.35, 0.15),  # legend size in fractions of the axis
    loc="lower left",  # matplotlib-style legend locations
    clip=(
        0,
        data["house_price_index"].max(),
    ),  # clip the displayed range of the histogram
)
ax.set_axis_off()
contextily.add_basemap(
    ax=ax, source=contextily.providers.CartoDB.PositronNoLabels, attribution=""
)
contextily.add_basemap(
    ax=ax,
    source=contextily.providers.Stamen.TonerLines,
    alpha=0.4,
    attribution="(C) CARTO, Map tiles by Stamen Design, CC BY 3.0 -- Map data (C) OpenStreetMap contributors",
)
# plt.savefig(f"{data_folder}/outputs/figures/air_quality_index.png", dpi=150, bbox_inches="tight")

Plot residuals

f, ax = plt.subplots(figsize=(18, 12))
extent.plot(ax=ax, alpha=0)
data.assign(res=residuals_latent).to_crs(3857).plot(
    "res", ax=ax, alpha=0.9, cmap="RdBu", vmin=-2000, vmax=2000, legend=True
)
ax.set_axis_off()
contextily.add_basemap(
    ax=ax, source=contextily.providers.CartoDB.PositronNoLabels, attribution=""
)
contextily.add_basemap(
    ax=ax,
    source=contextily.providers.Stamen.TonerLines,
    alpha=0.4,
    attribution="(C) CARTO, Map tiles by Stamen Design, CC BY 3.0 -- Map data (C) OpenStreetMap contributors",
)
# plt.savefig(f"{data_folder}/outputs/figures/air_quality_index.png", dpi=150, bbox_inches="tight")

residuals_latent.plot.hist(bins=25)

f, ax = plt.subplots(figsize=(18, 12))
extent.plot(ax=ax, alpha=0)
bins = mapclassify.NaturalBreaks(pred_latent.dropna().values, k=10).bins

data.assign(pred=pred_latent).to_crs(3857).plot(
    "pred",
    scheme="userdefined",
    classification_kwds={"bins": bins},
    ax=ax,
    alpha=0.9,
    cmap="viridis",
)
legendgram(
    f,
    ax,
    pred_latent,
    bins,
    pal=palmpl.Viridis_10,
    legend_size=(0.35, 0.15),  # legend size in fractions of the axis
    loc="lower left",  # matplotlib-style legend locations
    clip=(0, pred_latent.max()),  # clip the displayed range of the histogram
)
ax.set_axis_off()
contextily.add_basemap(
    ax=ax, source=contextily.providers.CartoDB.PositronNoLabels, attribution=""
)
contextily.add_basemap(
    ax=ax,
    source=contextily.providers.Stamen.TonerLines,
    alpha=0.4,
    attribution="(C) CARTO, Map tiles by Stamen Design, CC BY 3.0 -- Map data (C) OpenStreetMap contributors",
)
# plt.savefig(f"{data_folder}/outputs/figures/air_quality_index.png", dpi=150, bbox_inches="tight")