Appendix N — House price model training

Training of the final house price model based on England-wide training data.

import geopandas as gpd
import numpy as np
import joblib
import libpysal

from sklearn.ensemble import HistGradientBoostingRegressor
# Root folder holding the project data. This is a machine-specific absolute
# path; it is used below to read the processed input and to write the model.
data_folder = "/Users/martin/Library/CloudStorage/OneDrive-SharedLibraries-TheAlanTuringInstitute/Daniel Arribas-Bel - demoland_data"

Load the data.

# Read the processed England-wide table from the data folder.
oa_data_path = f"{data_folder}/processed/oa_data_england.parquet"
data = gpd.read_parquet(oa_data_path)

Filter only explanatory variables.

# Columns that are not explanatory variables: the geometry and the two
# columns excluded from the feature set (air quality and house price).
non_explanatory = ["geometry", "air_quality", "house_price"]
exvars = data.drop(columns=non_explanatory)

Create Queen contiguity weights of order 5 (including all lower orders), the order identified as optimal.

# Build first-order Queen contiguity weights. `use_index=False` is passed
# explicitly: it is the current default (positional ids), but libpysal warns
# that the default will flip to True, so pinning it keeps the ids stable and
# silences the FutureWarning.
queen = libpysal.weights.Queen.from_dataframe(data, use_index=False)
# Expand to neighbours up to and including order 5 (lower_order=True keeps
# the lower-order neighbours as well).
queen5 = libpysal.weights.higher_order(queen, k=5, lower_order=True)
/var/folders/2f/fhks6w_d0k556plcv3rfmshw0000gn/T/ipykernel_49371/1156976480.py:1: FutureWarning: `use_index` defaults to False but will default to True in future. Set True/False directly to control this behavior and silence this warning
  queen = libpysal.weights.Queen.from_dataframe(data)
/Users/martin/miniforge3/envs/demoland/lib/python3.11/site-packages/libpysal/weights/weights.py:224: UserWarning: The weights matrix is not fully connected: 
 There are 51 disconnected components.
 There are 19 islands with ids: 1676, 2132, 3036, 21306, 33133, 34428, 42635, 42654, 68863, 72800, 74393, 105153, 108399, 134057, 140649, 141143, 141475, 144653, 149708.
  warnings.warn(message)

Compute the row-standardised spatial lag of each explanatory variable.

# Row-standardise the weights before computing the lags.
queen5.transform = "r"

# Snapshot the column names first: the loop adds a "<name>_lag" column for
# each original column, so the live column index must not be iterated.
# NOTE(review): the weights contain islands (observations without
# neighbours); confirm the lag value they receive is acceptable for training.
lag_spatial = libpysal.weights.spatial_lag.lag_spatial
base_columns = list(exvars.columns)
for name in base_columns:
    exvars[f"{name}_lag"] = lag_spatial(queen5, exvars[name])
('WARNING: ', 1676, ' is an island (no neighbors)')
('WARNING: ', 2132, ' is an island (no neighbors)')
('WARNING: ', 3036, ' is an island (no neighbors)')
('WARNING: ', 21306, ' is an island (no neighbors)')
('WARNING: ', 33133, ' is an island (no neighbors)')
('WARNING: ', 34428, ' is an island (no neighbors)')
('WARNING: ', 42635, ' is an island (no neighbors)')
('WARNING: ', 42654, ' is an island (no neighbors)')
('WARNING: ', 68863, ' is an island (no neighbors)')
('WARNING: ', 72800, ' is an island (no neighbors)')
('WARNING: ', 74393, ' is an island (no neighbors)')
('WARNING: ', 105153, ' is an island (no neighbors)')
('WARNING: ', 108399, ' is an island (no neighbors)')
('WARNING: ', 134057, ' is an island (no neighbors)')
('WARNING: ', 140649, ' is an island (no neighbors)')
('WARNING: ', 141143, ' is an island (no neighbors)')
('WARNING: ', 141475, ' is an island (no neighbors)')
('WARNING: ', 144653, ' is an island (no neighbors)')
('WARNING: ', 149708, ' is an island (no neighbors)')

Create a mask to exclude observations with a missing house price from training.

# Boolean mask: True where a house price is available.
mask = data["house_price"].notna()

Initialise the model.

# Gradient-boosted regressor with a fixed seed so that training is repeatable.
regressor = HistGradientBoostingRegressor(
    max_iter=1000,
    max_bins=128,
    random_state=0,
)

Train the model on the log of house price.

# Use only rows with a known house price; the target is its natural log.
train_features = exvars[mask]
train_target = np.log(data.house_price[mask])
regressor.fit(train_features, train_target)
HistGradientBoostingRegressor(max_bins=128, max_iter=1000, random_state=0)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

Test the prediction (values are on the log scale, since the model was fit to the log of house price).

# Smoke test on the first ten rows; the output is in log-price units.
regressor.predict(exvars.head(10))
array([7.43681357, 7.52412204, 7.34451913, 7.64584292, 7.45234704,
       7.7307261 , 7.74621834, 7.5703991 , 7.53491617, 7.4739106 ])

Save to file.

# Persist the fitted model in the models subfolder of the data folder.
model_path = f"{data_folder}/models/house_price_model.joblib"
with open(model_path, "wb") as model_file:
    joblib.dump(regressor, model_file)