import geopandas as gpd
import numpy as np
import joblib
import libpysal
from sklearn.ensemble import HistGradientBoostingRegressor
Appendix N — House price model training
Training of the final house price model based on England-wide training data.
# Root folder of the project data; the input dataset and the saved model
# paths used later in this notebook are both built from it.
data_folder = "/Users/martin/Library/CloudStorage/OneDrive-SharedLibraries-TheAlanTuringInstitute/Daniel Arribas-Bel - demoland_data"
Load the data
# Read the processed England dataset (a Parquet file with geometry) into a
# GeoDataFrame.
data = gpd.read_parquet(f"{data_folder}/processed/oa_data_england.parquet")
Filter only explanatory variables.
# Keep only the explanatory variables: drop the geometry column together with
# `air_quality` and `house_price` (the latter is the training target below).
exvars = data.drop(
    columns=[
        "geometry",
        "air_quality",
        "house_price",
    ]
)
Create Queen contiguity weights of order 5, the order identified as optimal.
# First-order Queen contiguity built from the geometries in `data`.
# NOTE: this call emits a FutureWarning about the `use_index` default (see the
# captured output); the current default (False) is what is relied on here.
queen = libpysal.weights.Queen.from_dataframe(data)

# Extend to order 5; `lower_order=True` asks for orders below 5 to be included
# as well, not only the exact 5th-order neighbours.
queen5 = libpysal.weights.higher_order(queen, k=5, lower_order=True)
/var/folders/2f/fhks6w_d0k556plcv3rfmshw0000gn/T/ipykernel_49371/1156976480.py:1: FutureWarning: `use_index` defaults to False but will default to True in future. Set True/False directly to control this behavior and silence this warning
queen = libpysal.weights.Queen.from_dataframe(data)
/Users/martin/miniforge3/envs/demoland/lib/python3.11/site-packages/libpysal/weights/weights.py:224: UserWarning: The weights matrix is not fully connected:
There are 51 disconnected components.
There are 19 islands with ids: 1676, 2132, 3036, 21306, 33133, 34428, 42635, 42654, 68863, 72800, 74393, 105153, 108399, 134057, 140649, 141143, 141475, 144653, 149708.
warnings.warn(message)
Compute spatial lag.
# Row-standardise the weights ("r") before computing the lags.
queen5.transform = "r"

# Add a `<column>_lag` variable for every explanatory variable. Iterate over a
# copy of the column index because new columns are appended inside the loop.
for col in exvars.columns.copy():
    exvars[f"{col}_lag"] = libpysal.weights.spatial_lag.lag_spatial(
        queen5, exvars[col]
    )
('WARNING: ', 1676, ' is an island (no neighbors)')
('WARNING: ', 2132, ' is an island (no neighbors)')
('WARNING: ', 3036, ' is an island (no neighbors)')
('WARNING: ', 21306, ' is an island (no neighbors)')
('WARNING: ', 33133, ' is an island (no neighbors)')
('WARNING: ', 34428, ' is an island (no neighbors)')
('WARNING: ', 42635, ' is an island (no neighbors)')
('WARNING: ', 42654, ' is an island (no neighbors)')
('WARNING: ', 68863, ' is an island (no neighbors)')
('WARNING: ', 72800, ' is an island (no neighbors)')
('WARNING: ', 74393, ' is an island (no neighbors)')
('WARNING: ', 105153, ' is an island (no neighbors)')
('WARNING: ', 108399, ' is an island (no neighbors)')
('WARNING: ', 134057, ' is an island (no neighbors)')
('WARNING: ', 140649, ' is an island (no neighbors)')
('WARNING: ', 141143, ' is an island (no neighbors)')
('WARNING: ', 141475, ' is an island (no neighbors)')
('WARNING: ', 144653, ' is an island (no neighbors)')
('WARNING: ', 149708, ' is an island (no neighbors)')
Create mask to ignore missing values in training.
# Boolean mask that is True where `house_price` is present; used to restrict
# training to rows with a known target.
mask = data.house_price.notna()
Initialise the model.
# Gradient-boosting regressor with a fixed seed so the fit is reproducible.
regressor = HistGradientBoostingRegressor(
    random_state=0, max_bins=128, max_iter=1000
)
Train the model.
# Fit on the rows selected by `mask`; the target is the natural logarithm of
# the house price, so the model learns and predicts on the log scale.
regressor.fit(
    X=exvars.loc[mask],
    y=np.log(data.loc[mask, "house_price"]),
)
HistGradientBoostingRegressor(max_bins=128, max_iter=1000, random_state=0)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
HistGradientBoostingRegressor(max_bins=128, max_iter=1000, random_state=0)
Test the prediction.
# Quick sanity check: predict for the first 10 rows. The values are on the log
# scale because the model was fitted on np.log(house_price).
regressor.predict(exvars.iloc[:10])
array([7.43681357, 7.52412204, 7.34451913, 7.64584292, 7.45234704,
7.7307261 , 7.74621834, 7.5703991 , 7.53491617, 7.4739106 ])
Save to file.
# Persist the fitted model with joblib; the file is opened in binary mode and
# closed automatically by the context manager.
with open(f"{data_folder}/models/house_price_model.joblib", "wb") as f:
    joblib.dump(regressor, f)