Appendix I — Air Quality prediction model exploration

Exploration of the behaviour of scikit-learn models predicting the air quality indicator from the explanatory variables used to capture different scenarios.

import geopandas as gpd
import numpy as np
import pandas as pd

import contextily
import palettable.matplotlib as palmpl
import matplotlib.pyplot as plt
import mapclassify
import libpysal

from utils import legendgram

from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_predict, GridSearchCV
data_folder = "/Users/martin/Library/CloudStorage/OneDrive-SharedLibraries-TheAlanTuringInstitute/Daniel Arribas-Bel - demoland_data"

Load the data

data = gpd.read_parquet(f"{data_folder}/processed/interpolated/all_oa.parquet")

Filter only explanatory variables.

exvars = data.drop(
    columns=[
        "geo_code",
        "geometry",
        "air_quality_index",
        "house_price_index",
        "jobs_accessibility_index",
        "greenspace_accessibility_index",
    ]
)

Specify grid search parameters

parameters = {
    "learning_rate": (0.01, 0.05, 0.1, 0.2, 0.5, 1),
    "max_iter": [100, 200, 500],
    "max_bins": (64, 128, 255),
}

Initialise GridSearchCV with a histogram-based gradient boosting regression tree.

regressor = HistGradientBoostingRegressor(
    random_state=0,
)
est = GridSearchCV(regressor, parameters, verbose=1)

Fit the data.

%%time
est.fit(exvars, data.air_quality_index)
Fitting 5 folds for each of 54 candidates, totalling 270 fits
CPU times: user 8min 14s, sys: 13min 24s, total: 21min 38s
Wall time: 13min 40s
GridSearchCV(estimator=HistGradientBoostingRegressor(random_state=0),
             param_grid={'learning_rate': (0.01, 0.05, 0.1, 0.2, 0.5, 1),
                         'max_bins': (64, 128, 255),
                         'max_iter': [100, 200, 500]},
             verbose=1)
est.best_estimator_
HistGradientBoostingRegressor(learning_rate=0.05, max_bins=64, max_iter=500,
                              random_state=0)

Check the cross-validated score of HistGradientBoostingRegressor without any pre-processing as a baseline. GridSearchCV's best_score_ is the mean cross-validated R² of the best parameter combination.

est.best_score_
0.31432792904169593
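The grid can also be inspected to see how sensitive the score is to the parameters; a minimal sketch using the cv_results_ attribute of the fitted search (pandas is already imported above):

# Top parameter combinations by mean cross-validated score (R²).
cv_results = pd.DataFrame(est.cv_results_)
cv_results[
    ["param_learning_rate", "param_max_bins", "param_max_iter", "mean_test_score"]
].sort_values("mean_test_score", ascending=False).head()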
y_pred = cross_val_predict(est.best_estimator_, exvars, data.air_quality_index, cv=5)

Get the prediction and residuals.

pred = pd.Series(y_pred, index=data.index)
residuals = data.air_quality_index - pred

Check the error

mean_squared_error(data.air_quality_index, pred)
1.3402343701702735
data.air_quality_index.describe()
count    3795.000000
mean       15.983571
std         1.444698
min         9.638337
25%        15.209244
50%        16.273061
75%        16.983267
max        19.461291
Name: air_quality_index, dtype: float64

Given that the mean value is ~16, an MSE of 1.34 is not bad.
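To put the error on the same scale as the target, take the square root; a quick sketch using the pred series from above (an RMSE of roughly 1.16, i.e. about 7% of the mean):

# RMSE and its share of the mean air quality index.
rmse = np.sqrt(mean_squared_error(data.air_quality_index, pred))
rmse, rmse / data.air_quality_index.mean()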

fig, ax = plt.subplots(figsize=(8, 8))
plt.scatter(data.air_quality_index, pred, s=0.25)
plt.xlabel("Y test")
plt.ylabel("Y pred")
Text(0, 0.5, 'Y pred')

Plot results

from shapely.geometry import box

bds = data.total_bounds
extent = gpd.GeoSeries(
    [box((bds[0] - 7000), bds[1], bds[2] + 7000, bds[3])], crs=data.crs
).to_crs(3857)

Plot prediction using the same bins as real values.

f, ax = plt.subplots(figsize=(18, 12))
extent.plot(ax=ax, alpha=0)
bins = mapclassify.EqualInterval(data["air_quality_index"].values, k=20).bins

data.assign(pred=pred).to_crs(3857).plot(
    "pred",
    scheme="userdefined",
    classification_kwds={"bins": bins},
    ax=ax,
    alpha=0.9,
    cmap="magma_r",
)
legendgram(
    f,
    ax,
    pred,
    bins,
    pal=palmpl.Magma_20_r,
    legend_size=(0.35, 0.15),  # legend size in fractions of the axis
    loc="lower left",  # matplotlib-style legend locations
    clip=(10, 20),  # clip the displayed range of the histogram
)
ax.set_axis_off()
contextily.add_basemap(
    ax=ax, source=contextily.providers.CartoDB.PositronNoLabels, attribution=""
)
contextily.add_basemap(
    ax=ax,
    source=contextily.providers.Stamen.TonerLines,
    alpha=0.4,
    attribution="(C) CARTO, Map tiles by Stamen Design, CC BY 3.0 -- Map data (C) OpenStreetMap contributors",
)
# plt.savefig(f"{data_folder}/outputs/figures/air_quality_index.png", dpi=150, bbox_inches="tight")

Plot residuals

f, ax = plt.subplots(figsize=(18, 12))
extent.plot(ax=ax, alpha=0)
data.assign(res=residuals).to_crs(3857).plot(
    "res", ax=ax, alpha=0.9, cmap="RdBu", vmin=-4, vmax=4, legend=True
)
ax.set_axis_off()
contextily.add_basemap(
    ax=ax, source=contextily.providers.CartoDB.PositronNoLabels, attribution=""
)
contextily.add_basemap(
    ax=ax,
    source=contextily.providers.Stamen.TonerLines,
    alpha=0.4,
    attribution="(C) CARTO, Map tiles by Stamen Design, CC BY 3.0 -- Map data (C) OpenStreetMap contributors",
)
# plt.savefig(f"{data_folder}/outputs/figures/air_quality_index.png", dpi=150, bbox_inches="tight")

residuals.plot.hist(bins=25)

It seems that the baseline, while performing reasonably well overall, tends to overpredict the periphery and slightly underpredict urban areas.

I.1 Using spatial lag

We can add a spatial lag of the explanatory variables into the mix. Let's start with Queen contiguity weights.

queen = libpysal.weights.Queen.from_dataframe(data)
queen.transform = "R"
/Users/martin/mambaforge/envs/demoland/lib/python3.11/site-packages/libpysal/weights/weights.py:172: UserWarning: The weights matrix is not fully connected: 
 There are 3 disconnected components.
  warnings.warn(message)
for col in exvars.columns.copy():
    exvars[f"{col}_lag"] = libpysal.weights.spatial_lag.lag_spatial(queen, exvars[col])
exvars.head()
population_estimate A, B, D, E. Agriculture, energy and water C. Manufacturing F. Construction G, I. Distribution, hotels and restaurants H, J. Transport and communication K, L, M, N. Financial, real estate, professional and administrative activities O,P,Q. Public administration, education and health R, S, T, U. Other Land cover [Discontinuous urban fabric] ... sdsAre_lag sisBpM_lag misCel_lag ltcRea_lag ldeAre_lag lseCCo_lag lseERI_lag lteOri_lag lteWNB_lag lieWCe_lag
0 315 0.547024 3.146641 5.436732 4.839081 2.065436 4.283496 6.922660 1.606140 1.000000 ... 30280.039796 0.064310 28.550796 51.060993 197839.799834 0.409687 0.881904 28.787310 0.014056 0.013885
1 415 0.028774 4.017802 3.870111 33.515266 8.821760 26.432214 83.050968 15.597812 0.152752 ... 17776.067815 0.054231 13.316627 60.676470 32812.973123 0.398693 0.937619 20.853156 0.029603 0.001324
2 246 0.110388 1.325208 4.297030 11.481138 2.315769 4.408114 8.930658 3.636795 1.000000 ... 54958.015580 0.058214 27.507441 53.117042 216595.359019 0.323117 0.784838 16.607145 0.009236 0.002879
3 277 0.478192 2.732637 4.713528 4.167451 1.776267 3.689214 5.194090 1.366452 0.931788 ... 78050.340373 0.062603 29.854903 70.854692 351081.933889 0.398664 0.862998 23.747635 0.009530 0.013940
4 271 0.680069 3.917490 6.768659 6.029714 2.573226 5.335117 8.758326 2.003362 0.696472 ... 27462.223773 0.067705 29.971743 52.001773 59309.608454 0.379304 0.890960 18.537922 0.016313 0.004461

5 rows × 118 columns

regressor_lag = HistGradientBoostingRegressor(
    random_state=0,
)
est_lag = GridSearchCV(regressor_lag, parameters, verbose=1)

Fit the data.

%%time
est_lag.fit(exvars, data.air_quality_index)
Fitting 5 folds for each of 54 candidates, totalling 270 fits
CPU times: user 11min 1s, sys: 12min 4s, total: 23min 5s
Wall time: 14min 16s
GridSearchCV(estimator=HistGradientBoostingRegressor(random_state=0),
             param_grid={'learning_rate': (0.01, 0.05, 0.1, 0.2, 0.5, 1),
                         'max_bins': (64, 128, 255),
                         'max_iter': [100, 200, 500]},
             verbose=1)
est_lag.best_estimator_
HistGradientBoostingRegressor(learning_rate=0.05, max_bins=64, max_iter=500,
                              random_state=0)

Check the cross-validated score of the model including the first-order spatial lags.

est_lag.best_score_
0.5538384955550043
y_pred_lag = cross_val_predict(
    est_lag.best_estimator_, exvars, data.air_quality_index, cv=5
)

Get the prediction and residuals.

pred_lag = pd.Series(y_pred_lag, index=data.index)
residuals_lag = data.air_quality_index - pred_lag

Check the error

mean_squared_error(data.air_quality_index, pred_lag)
0.9191784959862161

MSE went down from 1.34 to 0.92, so there is a clear gain from including the lag.

fig, ax = plt.subplots(figsize=(8, 8))
plt.scatter(data.air_quality_index, pred_lag, s=0.25)
plt.xlabel("Y test")
plt.ylabel("Y pred")
Text(0, 0.5, 'Y pred')

Plot prediction using the same bins as real values.

f, ax = plt.subplots(figsize=(18, 12))
extent.plot(ax=ax, alpha=0)
bins = mapclassify.EqualInterval(data["air_quality_index"].values, k=20).bins

data.assign(pred=pred_lag).to_crs(3857).plot(
    "pred",
    scheme="userdefined",
    classification_kwds={"bins": bins},
    ax=ax,
    alpha=0.9,
    cmap="magma_r",
)
legendgram(
    f,
    ax,
    pred_lag,
    bins,
    pal=palmpl.Magma_20_r,
    legend_size=(0.35, 0.15),  # legend size in fractions of the axis
    loc="lower left",  # matplotlib-style legend locations
    clip=(10, 20),  # clip the displayed range of the histogram
)
ax.set_axis_off()
contextily.add_basemap(
    ax=ax, source=contextily.providers.CartoDB.PositronNoLabels, attribution=""
)
contextily.add_basemap(
    ax=ax,
    source=contextily.providers.Stamen.TonerLines,
    alpha=0.4,
    attribution="(C) CARTO, Map tiles by Stamen Design, CC BY 3.0 -- Map data (C) OpenStreetMap contributors",
)
# plt.savefig(f"{data_folder}/outputs/figures/air_quality_index.png", dpi=150, bbox_inches="tight")

Plot residuals

f, ax = plt.subplots(figsize=(18, 12))
extent.plot(ax=ax, alpha=0)
data.assign(res=residuals_lag).to_crs(3857).plot(
    "res", ax=ax, alpha=0.9, cmap="RdBu", vmin=-4, vmax=4, legend=True
)
ax.set_axis_off()
contextily.add_basemap(
    ax=ax, source=contextily.providers.CartoDB.PositronNoLabels, attribution=""
)
contextily.add_basemap(
    ax=ax,
    source=contextily.providers.Stamen.TonerLines,
    alpha=0.4,
    attribution="(C) CARTO, Map tiles by Stamen Design, CC BY 3.0 -- Map data (C) OpenStreetMap contributors",
)
# plt.savefig(f"{data_folder}/outputs/figures/air_quality_index.png", dpi=150, bbox_inches="tight")

The tendency to over- and underpredict remains, but the error is lower.

The main underpredicted area is south of the city along the highway - I believe that is due to relatively open OAs and the lack of any information about the highway in the explanatory variables.

The main overprediction happens in the green belt - there are very few of these low values, so the K-fold modelling exercise likely does not see enough low-value observations to learn them.
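A quick check of how rare such low values are (a sketch; the cut-off of 12 is an arbitrary illustration):

# Count OAs with a very low air quality index relative to the total.
(data.air_quality_index < 12).sum(), len(data)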

residuals_lag.plot.hist(bins=25)

Overlay of residuals from the original and the lagged model. We can see that the lagged one is more precise here as well.

ax = residuals_lag.plot.hist(bins=50)
residuals.plot.hist(bins=50, ax=ax, color="r", alpha=0.7)

The mean absolute residuals:

residuals_lag.abs().mean(), residuals.abs().mean()
(0.7127002931466176, 0.8861835077899286)
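The same comparison can be collected into a small summary table (a sketch using the cross-validated predictions and residuals computed above):

# Cross-validated MSE and mean absolute residual for the baseline and the lagged model.
pd.DataFrame(
    {
        "baseline": [
            mean_squared_error(data.air_quality_index, pred),
            residuals.abs().mean(),
        ],
        "queen lag": [
            mean_squared_error(data.air_quality_index, pred_lag),
            residuals_lag.abs().mean(),
        ],
    },
    index=["MSE", "mean |residual|"],
)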

Interestingly, both options arrive at the same best-model parameters.

est_lag.best_estimator_
HistGradientBoostingRegressor(learning_rate=0.05, max_bins=64, max_iter=500,
                              random_state=0)
est.best_estimator_
HistGradientBoostingRegressor(learning_rate=0.05, max_bins=64, max_iter=500,
                              random_state=0)

I.1.1 Wider weights

queen = libpysal.weights.Queen.from_dataframe(data)
# with lower_order=True, the result includes all neighbours up to the third order of contiguity
queen3 = libpysal.weights.higher_order(queen, k=3, lower_order=True)
/Users/martin/mambaforge/envs/demoland/lib/python3.11/site-packages/libpysal/weights/weights.py:172: UserWarning: The weights matrix is not fully connected: 
 There are 3 disconnected components.
  warnings.warn(message)
queen3.transform = "R"
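The cumulative third-order weights make the neighbourhoods considerably larger; a quick comparison of the average number of neighbours (sketch):

# Average neighbourhood size of the first-order and cumulative third-order weights.
queen.mean_neighbors, queen3.mean_neighbors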
for col in exvars.columns.copy():
    exvars[f"{col}_lag"] = libpysal.weights.spatial_lag.lag_spatial(queen3, exvars[col])
exvars.head()
population_estimate A, B, D, E. Agriculture, energy and water C. Manufacturing F. Construction G, I. Distribution, hotels and restaurants H, J. Transport and communication K, L, M, N. Financial, real estate, professional and administrative activities O,P,Q. Public administration, education and health R, S, T, U. Other Land cover [Discontinuous urban fabric] ... sdsAre_lag sisBpM_lag misCel_lag ltcRea_lag ldeAre_lag lseCCo_lag lseERI_lag lteOri_lag lteWNB_lag lieWCe_lag
0 315 0.547024 3.146641 5.436732 4.839081 2.065436 4.283496 6.922660 1.606140 1.000000 ... 30280.039796 0.064310 28.550796 51.060993 197839.799834 0.409687 0.881904 28.787310 0.014056 0.013885
1 415 0.028774 4.017802 3.870111 33.515266 8.821760 26.432214 83.050968 15.597812 0.152752 ... 17776.067815 0.054231 13.316627 60.676470 32812.973123 0.398693 0.937619 20.853156 0.029603 0.001324
2 246 0.110388 1.325208 4.297030 11.481138 2.315769 4.408114 8.930658 3.636795 1.000000 ... 54958.015580 0.058214 27.507441 53.117042 216595.359019 0.323117 0.784838 16.607145 0.009236 0.002879
3 277 0.478192 2.732637 4.713528 4.167451 1.776267 3.689214 5.194090 1.366452 0.931788 ... 78050.340373 0.062603 29.854903 70.854692 351081.933889 0.398664 0.862998 23.747635 0.009530 0.013940
4 271 0.680069 3.917490 6.768659 6.029714 2.573226 5.335117 8.758326 2.003362 0.696472 ... 27462.223773 0.067705 29.971743 52.001773 59309.608454 0.379304 0.890960 18.537922 0.016313 0.004461

5 rows × 118 columns

regressor_lag = HistGradientBoostingRegressor(
    random_state=0,
)
est_lag = GridSearchCV(regressor_lag, parameters, verbose=1)

Fit the data.

%%time
est_lag.fit(exvars, data.air_quality_index)
Fitting 5 folds for each of 54 candidates, totalling 270 fits
CPU times: user 10min 48s, sys: 11min 32s, total: 22min 21s
Wall time: 13min 59s
GridSearchCV(estimator=HistGradientBoostingRegressor(random_state=0),
             param_grid={'learning_rate': (0.01, 0.05, 0.1, 0.2, 0.5, 1),
                         'max_bins': (64, 128, 255),
                         'max_iter': [100, 200, 500]},
             verbose=1)
est_lag.best_estimator_
HistGradientBoostingRegressor(learning_rate=0.05, max_bins=128, max_iter=500,
                              random_state=0)

Check the cross-validated score of the model with lags based on the wider (third-order) weights.

est_lag.best_score_
0.76202775278368
y_pred_lag = cross_val_predict(
    est_lag.best_estimator_, exvars, data.air_quality_index, cv=5
)

Get the prediction and residuals.

pred_lag = pd.Series(y_pred_lag, index=data.index)
residuals_lag = data.air_quality_index - pred_lag

Check the error

mean_squared_error(data.air_quality_index, pred_lag)
0.5447792769804048
residuals_lag.abs().mean()
0.51491983590584
fig, ax = plt.subplots(figsize=(8, 8))
plt.scatter(data.air_quality_index, pred_lag, s=0.25)
plt.xlabel("Y test")
plt.ylabel("Y pred")
Text(0, 0.5, 'Y pred')

Plot prediction using the same bins as real values.

f, ax = plt.subplots(figsize=(18, 12))
extent.plot(ax=ax, alpha=0)
bins = mapclassify.EqualInterval(data["air_quality_index"].values, k=20).bins

data.assign(pred=pred_lag).to_crs(3857).plot(
    "pred",
    scheme="userdefined",
    classification_kwds={"bins": bins},
    ax=ax,
    alpha=0.9,
    cmap="magma_r",
)
legendgram(
    f,
    ax,
    pred_lag,
    bins,
    pal=palmpl.Magma_20_r,
    legend_size=(0.35, 0.15),  # legend size in fractions of the axis
    loc="lower left",  # matplotlib-style legend locations
    clip=(10, 20),  # clip the displayed range of the histogram
)
ax.set_axis_off()
contextily.add_basemap(
    ax=ax, source=contextily.providers.CartoDB.PositronNoLabels, attribution=""
)
contextily.add_basemap(
    ax=ax,
    source=contextily.providers.Stamen.TonerLines,
    alpha=0.4,
    attribution="(C) CARTO, Map tiles by Stamen Design, CC BY 3.0 -- Map data (C) OpenStreetMap contributors",
)
# plt.savefig(f"{data_folder}/outputs/figures/air_quality_index.png", dpi=150, bbox_inches="tight")

Plot residuals

f, ax = plt.subplots(figsize=(18, 12))
extent.plot(ax=ax, alpha=0)
data.assign(res=residuals_lag).to_crs(3857).plot(
    "res", ax=ax, alpha=0.9, cmap="RdBu", vmin=-4, vmax=4, legend=True
)
ax.set_axis_off()
contextily.add_basemap(
    ax=ax, source=contextily.providers.CartoDB.PositronNoLabels, attribution=""
)
contextily.add_basemap(
    ax=ax,
    source=contextily.providers.Stamen.TonerLines,
    alpha=0.4,
    attribution="(C) CARTO, Map tiles by Stamen Design, CC BY 3.0 -- Map data (C) OpenStreetMap contributors",
)
# plt.savefig(f"{data_folder}/outputs/figures/air_quality_index.png", dpi=150, bbox_inches="tight")

residuals_lag.plot.hist(bins=25)

I.2 With latent representation of Sentinel 2

Try including the lagged latent representation from the postcode Sentinel paper.

latent_oa = pd.read_parquet(f"{data_folder}/processed/sentinel/latent_oa.parquet")
queen = libpysal.weights.Queen.from_dataframe(data)
queen.transform = "R"
/Users/martin/mambaforge/envs/demoland/lib/python3.11/site-packages/libpysal/weights/weights.py:172: UserWarning: The weights matrix is not fully connected: 
 There are 3 disconnected components.
  warnings.warn(message)
exvars_latent = pd.concat([exvars, latent_oa.drop(columns="geometry")], axis=1)
for col in exvars_latent.columns.copy():
    exvars_latent[f"{col}_lag"] = libpysal.weights.spatial_lag.lag_spatial(
        queen, exvars_latent[col]
    )
/var/folders/2f/fhks6w_d0k556plcv3rfmshw0000gn/T/ipykernel_54819/186550642.py:2: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  exvars_latent[f"{col}_lag"] = libpysal.weights.spatial_lag.lag_spatial(
(the warning above is repeated once per inserted column)
exvars_latent = exvars_latent.copy()  # de-fragment the frame, as the warning suggests
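The PerformanceWarning above suggests building all the lag columns at once and joining them with pd.concat; a minimal sketch of that alternative (to be run in place of the loop above, not in addition to it):

# Compute every lag column in one pass and concatenate, avoiding repeated frame.insert calls.
lags = pd.DataFrame(
    {
        f"{col}_lag": libpysal.weights.spatial_lag.lag_spatial(queen, exvars_latent[col])
        for col in exvars_latent.columns
    },
    index=exvars_latent.index,
)
exvars_latent = pd.concat([exvars_latent, lags], axis=1)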
parameters = {
    "learning_rate": (0.01, 0.05, 0.1),
    "max_iter": [500],
    "max_bins": (64, 128),
}

Initialise GridSearchCV with a histogram-based gradient boosting regression tree.

regressor = HistGradientBoostingRegressor(
    random_state=0,
)
est_latent = GridSearchCV(regressor, parameters, verbose=1)

Fit the data.

%%time
est_latent.fit(exvars_latent, data.air_quality_index)
Fitting 5 folds for each of 6 candidates, totalling 30 fits
CPU times: user 3min 14s, sys: 2min 35s, total: 5min 49s
Wall time: 3min 26s
GridSearchCV(estimator=HistGradientBoostingRegressor(random_state=0),
             param_grid={'learning_rate': (0.01, 0.05, 0.1),
                         'max_bins': (64, 128), 'max_iter': [500]},
             verbose=1)
est_latent.best_estimator_
HistGradientBoostingRegressor(max_bins=128, max_iter=500, random_state=0)

Check the cross-validated score of the model including the Sentinel 2 latent representation.

est_latent.best_score_
0.6205192288572696
y_pred_latent = cross_val_predict(
    est_latent.best_estimator_, exvars_latent, data.air_quality_index, cv=5
)

Get the prediction and residuals.

pred_latent = pd.Series(y_pred_latent, index=data.index)
residuals_latent = data.air_quality_index - pred_latent

Check the error

mean_squared_error(data.air_quality_index, pred_latent)
0.8272652037619179

MSE of 0.83 improves on the baseline (1.34) and the first-order lag model (0.92), but does not reach the 0.54 of the model using the wider third-order lags.

fig, ax = plt.subplots(figsize=(8, 8))
plt.scatter(data.air_quality_index, pred_latent, s=0.25)
plt.xlabel("Y test")
plt.ylabel("Y pred")
Text(0, 0.5, 'Y pred')

Plot prediction using the same bins as real values.

f, ax = plt.subplots(figsize=(18, 12))
extent.plot(ax=ax, alpha=0)
bins = mapclassify.EqualInterval(data["air_quality_index"].values, k=20).bins

data.assign(pred=pred_latent).to_crs(3857).plot(
    "pred",
    scheme="userdefined",
    classification_kwds={"bins": bins},
    ax=ax,
    alpha=0.9,
    cmap="magma_r",
)
legendgram(
    f,
    ax,
    pred_latent,
    bins,
    pal=palmpl.Magma_20_r,
    legend_size=(0.35, 0.15),  # legend size in fractions of the axis
    loc="lower left",  # matplotlib-style legend locations
    clip=(10, 20),  # clip the displayed range of the histogram
)
ax.set_axis_off()
contextily.add_basemap(
    ax=ax, source=contextily.providers.CartoDB.PositronNoLabels, attribution=""
)
contextily.add_basemap(
    ax=ax,
    source=contextily.providers.Stamen.TonerLines,
    alpha=0.4,
    attribution="(C) CARTO, Map tiles by Stamen Design, CC BY 3.0 -- Map data (C) OpenStreetMap contributors",
)
# plt.savefig(f"{data_folder}/outputs/figures/air_quality_index.png", dpi=150, bbox_inches="tight")

Plot residuals

f, ax = plt.subplots(figsize=(18, 12))
extent.plot(ax=ax, alpha=0)
data.assign(res=residuals_latent).to_crs(3857).plot(
    "res", ax=ax, alpha=0.9, cmap="RdBu", vmin=-4, vmax=4, legend=True
)
ax.set_axis_off()
contextily.add_basemap(
    ax=ax, source=contextily.providers.CartoDB.PositronNoLabels, attribution=""
)
contextily.add_basemap(
    ax=ax,
    source=contextily.providers.Stamen.TonerLines,
    alpha=0.4,
    attribution="(C) CARTO, Map tiles by Stamen Design, CC BY 3.0 -- Map data (C) OpenStreetMap contributors",
)
# plt.savefig(f"{data_folder}/outputs/figures/air_quality_index.png", dpi=150, bbox_inches="tight")

residuals_latent.plot.hist(bins=25)

I.2.1 Only Sentinel 2

regressor = HistGradientBoostingRegressor(
    random_state=0,
)
est_s2 = GridSearchCV(regressor, parameters, verbose=1)
%%time
est_s2.fit(latent_oa.drop(columns="geometry"), data.air_quality_index)
Fitting 5 folds for each of 6 candidates, totalling 30 fits
CPU times: user 1min 32s, sys: 2min 19s, total: 3min 52s
Wall time: 2min 36s
GridSearchCV(estimator=HistGradientBoostingRegressor(random_state=0),
             param_grid={'learning_rate': (0.01, 0.05, 0.1),
                         'max_bins': (64, 128), 'max_iter': [500]},
             verbose=1)
est_s2.best_estimator_
HistGradientBoostingRegressor(learning_rate=0.05, max_bins=64, max_iter=500,
                              random_state=0)

Check the cross-validated score of the model using only the Sentinel 2 latent representation.

est_s2.best_score_
0.23497429291310237
y_pred_s2 = cross_val_predict(
    est_s2.best_estimator_,
    latent_oa.drop(columns="geometry"),
    data.air_quality_index,
    cv=5,
)

Get the prediction and residuals.

pred_s2 = pd.Series(y_pred_s2, index=data.index)
residuals_s2 = data.air_quality_index - pred_s2

Check the error

mean_squared_error(data.air_quality_index, pred_s2)
1.570113449347073

MSE of 1.57 is worse than the baseline's 1.34, so the Sentinel 2 latent representation alone is not enough to predict air quality.

fig, ax = plt.subplots(figsize=(8, 8))
plt.scatter(data.air_quality_index, pred_s2, s=0.25)
plt.xlabel("Y test")
plt.ylabel("Y pred")
Text(0, 0.5, 'Y pred')

Plot prediction using the same bins as real values.

f, ax = plt.subplots(figsize=(18, 12))
extent.plot(ax=ax, alpha=0)
bins = mapclassify.EqualInterval(data["air_quality_index"].values, k=20).bins

data.assign(pred=pred_s2).to_crs(3857).plot(
    "pred",
    scheme="userdefined",
    classification_kwds={"bins": bins},
    ax=ax,
    alpha=0.9,
    cmap="magma_r",
)
legendgram(
    f,
    ax,
    pred_s2,
    bins,
    pal=palmpl.Magma_20_r,
    legend_size=(0.35, 0.15),  # legend size in fractions of the axis
    loc="lower left",  # matplotlib-style legend locations
    clip=(10, 20),  # clip the displayed range of the histogram
)
ax.set_axis_off()
contextily.add_basemap(
    ax=ax, source=contextily.providers.CartoDB.PositronNoLabels, attribution=""
)
contextily.add_basemap(
    ax=ax,
    source=contextily.providers.Stamen.TonerLines,
    alpha=0.4,
    attribution="(C) CARTO, Map tiles by Stamen Design, CC BY 3.0 -- Map data (C) OpenStreetMap contributors",
)
# plt.savefig(f"{data_folder}/outputs/figures/air_quality_index.png", dpi=150, bbox_inches="tight")

Plot residuals

f, ax = plt.subplots(figsize=(18, 12))
extent.plot(ax=ax, alpha=0)
data.assign(res=residuals_s2).to_crs(3857).plot(
    "res", ax=ax, alpha=0.9, cmap="RdBu", vmin=-4, vmax=4, legend=True
)
ax.set_axis_off()
contextily.add_basemap(
    ax=ax, source=contextily.providers.CartoDB.PositronNoLabels, attribution=""
)
contextily.add_basemap(
    ax=ax,
    source=contextily.providers.Stamen.TonerLines,
    alpha=0.4,
    attribution="(C) CARTO, Map tiles by Stamen Design, CC BY 3.0 -- Map data (C) OpenStreetMap contributors",
)
# plt.savefig(f"{data_folder}/outputs/figures/air_quality_index.png", dpi=150, bbox_inches="tight")
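For reference, the cross-validated MSE of the variants explored above can be pulled together in one place (a sketch using the prediction series still in memory; pred_lag here is the third-order lag model from section I.1.1):

# Cross-validated MSE of each model variant.
pd.Series(
    {
        "baseline": mean_squared_error(data.air_quality_index, pred),
        "third-order lag": mean_squared_error(data.air_quality_index, pred_lag),
        "lag + Sentinel 2 latent": mean_squared_error(data.air_quality_index, pred_latent),
        "Sentinel 2 latent only": mean_squared_error(data.air_quality_index, pred_s2),
    },
    name="MSE",
)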