import geopandas as gpd
import numpy as np
import joblib
import libpysal
from sklearn.ensemble import HistGradientBoostingRegressor

# Appendix N — House price model training
# Training of the final house price model based on England-wide training data.
# Root folder of the shared project data. This is a machine-specific absolute
# path; both the input table and the saved model are resolved relative to it.
data_folder = "/Users/martin/Library/CloudStorage/OneDrive-SharedLibraries-TheAlanTuringInstitute/Daniel Arribas-Bel - demoland_data"

# Load the data
# Read the processed England-wide table (Parquet with geometries) from the
# shared data folder.
data = gpd.read_parquet(f"{data_folder}/processed/oa_data_england.parquet")

# Filter only explanatory variables.
# Keep only the predictor columns: drop the geometry, the `house_price`
# target that the model is fitted against, and `air_quality`.
exvars = data.drop(
    columns=[
        "geometry",
        "air_quality",
        "house_price",
    ]
)

# Create weights of the order 5 identified as optimal.
# Queen contiguity between the geometries in `data`.
# `use_index=False` is passed explicitly: it is the current default (ids are
# positional), so behaviour is unchanged, and it silences the libpysal
# FutureWarning announcing that the default will switch to True.
queen = libpysal.weights.Queen.from_dataframe(data, use_index=False)

# Extend contiguity up to order 5; `lower_order=True` keeps the neighbours of
# orders 1-4 as well, not only those exactly five steps away.
queen5 = libpysal.weights.higher_order(queen, k=5, lower_order=True)

# Recorded output of this cell (UserWarning, libpysal/weights/weights.py:224):
#   The weights matrix is not fully connected:
#   There are 51 disconnected components.
#   There are 19 islands with ids: 1676, 2132, 3036, 21306, 33133, 34428,
#   42635, 42654, 68863, 72800, 74393, 105153, 108399, 134057, 140649, 141143,
#   141475, 144653, 149708.
# Compute spatial lag.
# Row-standardise the weights so that each lag is a weighted average of the
# neighbouring observations rather than a sum.
queen5.transform = "r"

# Iterate over a copy of the column index because the loop adds a new
# "<col>_lag" column to `exvars` on every iteration.
for col in exvars.columns.copy():
    exvars[f"{col}_lag"] = libpysal.weights.spatial_lag.lag_spatial(
        queen5, exvars[col]
    )

# NOTE(review): the islands listed below have no neighbours, so their lag
# values presumably evaluate to 0 - confirm this is acceptable for training.
#
# Recorded output of this cell:
# ('WARNING: ', 1676, ' is an island (no neighbors)')
# ('WARNING: ', 2132, ' is an island (no neighbors)')
# ('WARNING: ', 3036, ' is an island (no neighbors)')
# ('WARNING: ', 21306, ' is an island (no neighbors)')
# ('WARNING: ', 33133, ' is an island (no neighbors)')
# ('WARNING: ', 34428, ' is an island (no neighbors)')
# ('WARNING: ', 42635, ' is an island (no neighbors)')
# ('WARNING: ', 42654, ' is an island (no neighbors)')
# ('WARNING: ', 68863, ' is an island (no neighbors)')
# ('WARNING: ', 72800, ' is an island (no neighbors)')
# ('WARNING: ', 74393, ' is an island (no neighbors)')
# ('WARNING: ', 105153, ' is an island (no neighbors)')
# ('WARNING: ', 108399, ' is an island (no neighbors)')
# ('WARNING: ', 134057, ' is an island (no neighbors)')
# ('WARNING: ', 140649, ' is an island (no neighbors)')
# ('WARNING: ', 141143, ' is an island (no neighbors)')
# ('WARNING: ', 141475, ' is an island (no neighbors)')
# ('WARNING: ', 144653, ' is an island (no neighbors)')
# ('WARNING: ', 149708, ' is an island (no neighbors)')
# Create mask to ignore missing values in training.
# True for rows where `house_price` is present; used to subset both the
# predictors and the target when fitting.
mask = data.house_price.notna()

# Initialise the model.
# Histogram-based gradient boosting regressor. `random_state` is fixed so that
# repeated runs produce the same model.
regressor = HistGradientBoostingRegressor(
    random_state=0, max_bins=128, max_iter=1000
)

# Train the model.
# Fit on the rows with a known price only. The target is the natural log of
# the house price, so the model's predictions are on the log scale too.
regressor.fit(exvars[mask], np.log(data.house_price[mask]))

# Recorded output of this cell:
# HistGradientBoostingRegressor(max_bins=128, max_iter=1000, random_state=0)
# In a Jupyter environment, please rerun this cell to show the HTML
# representation or trust the notebook.
# On GitHub, the HTML representation is unable to render, please try loading
# this page with nbviewer.org.
# HistGradientBoostingRegressor(max_bins=128, max_iter=1000, random_state=0)
# Test the prediction.
# Smoke test on the first ten rows of the predictor table.
regressor.predict(exvars.iloc[:10])

# Recorded output of this cell:
# array([7.43681357, 7.52412204, 7.34451913, 7.64584292, 7.45234704,
#        7.7307261 , 7.74621834, 7.5703991 , 7.53491617, 7.4739106 ])
# Save to file.
# Serialise the fitted regressor with joblib into the shared models folder.
with open(f"{data_folder}/models/house_price_model.joblib", "wb") as f:
    joblib.dump(regressor, f)