Appendix H — Training data sampled from the whole of England

This appendix retrieves air quality and house price data for the whole of England, together with their explanatory variables. This is useful if we want to train models on more data than we have for Tyne and Wear.

import datetime

import geopandas as gpd
import tobler
import pandas as pd
import numpy as np
import xarray as xr
import pooch

Specify a path to the data folder.

data_folder = "/Users/martin/Library/CloudStorage/OneDrive-SharedLibraries-TheAlanTuringInstitute/Daniel Arribas-Bel - demoland_data"

Load the 2011 Output Area geometry (generalised boundaries).

oa = gpd.read_file(
    f"{data_folder}/raw/Output_Areas_Dec_2011_Boundaries_EW_BGC_2022.zip"
)

H.1 Air quality

Read the DEFRA modelled background pollution maps for 2021 (PM10, PM2.5, NO2, SO2).

pm10_21 = (
    pd.read_csv(
        "https://uk-air.defra.gov.uk/datastore/pcm/mappm102021g.csv",
        header=5,
        na_values=["MISSING"],
    )
    .set_index(["x", "y"])
    .drop(columns="gridcode")
    .to_xarray()
)
pm25_21 = (
    pd.read_csv(
        "https://uk-air.defra.gov.uk/datastore/pcm/mappm252021g.csv",
        header=5,
        na_values=["MISSING"],
    )
    .set_index(["x", "y"])
    .drop(columns="gridcode")
    .to_xarray()
)
no2_21 = (
    pd.read_csv(
        "https://uk-air.defra.gov.uk/datastore/pcm/mapno22021.csv",
        header=5,
        na_values=["MISSING"],
    )
    .set_index(["x", "y"])
    .drop(columns="gridcode")
    .to_xarray()
)
so2_21 = (
    pd.read_csv(
        "https://uk-air.defra.gov.uk/datastore/pcm/mapso22021.csv",
        header=5,
        na_values=["MISSING"],
    )
    .set_index(["x", "y"])
    .drop(columns="gridcode")
    .to_xarray()
)
pollutants_2021 = xr.merge([pm10_21, pm25_21, no2_21, so2_21])
pollutants_2021
<xarray.Dataset>
Dimensions:    (x: 657, y: 1170)
Coordinates:
  * x          (x) int64 -500 500 1500 2500 3500 ... 652500 653500 654500 655500
  * y          (y) int64 5500 6500 7500 8500 ... 1216500 1217500 1218500 1219500
Data variables:
    pm102021g  (x, y) float64 nan nan nan nan nan nan ... nan nan nan nan nan
    pm252021g  (x, y) float64 nan nan nan nan nan nan ... nan nan nan nan nan
    no22021    (x, y) float64 nan nan nan nan nan nan ... nan nan nan nan nan
    so22021    (x, y) float64 nan nan nan nan nan nan ... nan nan nan nan nan

We can compute the air quality index as a weighted sum of the four pollutants, following the formula used in the main analysis.

aqi = (
    pollutants_2021.pm252021g
    + pollutants_2021.pm102021g / 2
    + pollutants_2021.no22021 / 4
    + pollutants_2021.so22021 / 10
)
pollutants_2021 = pollutants_2021.assign(aqi=aqi)

We convert the array to a GeoDataFrame with square polygons representing the 1 km grid cells. This will be needed for areal interpolation to output areas.

pollutants_2021 = pollutants_2021.to_dataframe().reset_index()
pollutants_2021 = gpd.GeoDataFrame(
    pollutants_2021,
    # a square buffer (cap_style=3) of 500 m around each grid point
    # reconstructs the original 1 km x 1 km cell
    geometry=gpd.points_from_xy(pollutants_2021.x, pollutants_2021.y, crs=27700).buffer(
        500, cap_style=3
    ),
)
pollutants_2021.to_parquet(
    f"{data_folder}/processed/air_quality/air_quality_grid_2021_england.parquet"
)
pollutants_2021
             x        y  pm102021g  pm252021g  no22021  so22021  aqi                                           geometry
0         -500     5500        NaN        NaN      NaN      NaN  NaN  POLYGON ((0.000 6000.000, 0.000 5000.000, -100...
1         -500     6500        NaN        NaN      NaN      NaN  NaN  POLYGON ((0.000 7000.000, 0.000 6000.000, -100...
2         -500     7500        NaN        NaN      NaN      NaN  NaN  POLYGON ((0.000 8000.000, 0.000 7000.000, -100...
3         -500     8500        NaN        NaN      NaN      NaN  NaN  POLYGON ((0.000 9000.000, 0.000 8000.000, -100...
4         -500     9500        NaN        NaN      NaN      NaN  NaN  POLYGON ((0.000 10000.000, 0.000 9000.000, -10...
...        ...      ...        ...        ...      ...      ...  ...                                                ...
768685  655500  1215500        NaN        NaN      NaN      NaN  NaN  POLYGON ((656000.000 1216000.000, 656000.000 1...
768686  655500  1216500        NaN        NaN      NaN      NaN  NaN  POLYGON ((656000.000 1217000.000, 656000.000 1...
768687  655500  1217500        NaN        NaN      NaN      NaN  NaN  POLYGON ((656000.000 1218000.000, 656000.000 1...
768688  655500  1218500        NaN        NaN      NaN      NaN  NaN  POLYGON ((656000.000 1219000.000, 656000.000 1...
768689  655500  1219500        NaN        NaN      NaN      NaN  NaN  POLYGON ((656000.000 1220000.000, 656000.000 1...

768690 rows × 8 columns
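
As a quick sanity check of the buffering trick (a minimal sketch, not part of the pipeline): a square-capped buffer of 500 m around a point reconstructs a 1,000 m × 1,000 m cell, matching the 1 km grid spacing.

from shapely.geometry import Point

# cap_style=3 (square) turns the 500 m point buffer into a square cell
cell = Point(0, 0).buffer(500, cap_style=3)
print(cell.bounds)  # (-500.0, -500.0, 500.0, 500.0)
print(cell.area)  # 1000000.0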

Interpolate to OAs.

%%time
interp = tobler.area_weighted.area_interpolate(
    pollutants_2021, oa, intensive_variables=["aqi"]
)
/Users/martin/mambaforge/envs/demoland/lib/python3.11/site-packages/pygeos/set_operations.py:129: RuntimeWarning: invalid value encountered in intersection
  return lib.intersection(a, b, **kwargs)
/Users/martin/mambaforge/envs/demoland/lib/python3.11/site-packages/tobler/util/util.py:32: UserWarning: nan values in variable: aqi, replacing with 0
  warn(f"nan values in variable: {column}, replacing with 0")
CPU times: user 17.2 s, sys: 215 ms, total: 17.4 s
Wall time: 17.4 s
oa["air_quality"] = interp.aqi

H.2 House price

Read the same dataset of house transactions linked to EPC certificates as before.

# It is 6.3GB...
linked_epc_path = "https://reshare.ukdataservice.ac.uk/854942/1/tranall2011_19.csv"

epc = pd.read_csv(linked_epc_path)
epc["dateoftransfer"] = pd.to_datetime(epc.dateoftransfer)
# the dataset covers transactions from 2011 to 2019, so this keeps roughly the last two years
last2years = epc[epc.dateoftransfer > datetime.datetime(2018, 1, 1)]

price_per_oa = last2years[["oa11", "priceper"]].groupby("oa11").mean().reset_index()
price_per_oa.to_parquet(f"{data_folder}/processed/house_prices/price_per_oa.parquet")
price_per_oa = pd.read_parquet(
    f"{data_folder}/processed/house_prices/price_per_oa.parquet"
)
price_per_oa
             oa11      priceper
0       E00000003  15722.758094
1       E00000005  12587.314452
2       E00000007  12640.262294
3       E00000010   9535.653363
4       E00000012  10341.463415
...           ...           ...
173553  W00010260   1632.955752
173554  W00010261   1834.401969
173555  W00010262   1511.677538
173556  W00010263   1679.802956
173557  W00010264   2046.069031

173558 rows × 2 columns
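
Loading the full 6.3GB CSV at once needs a lot of memory. A chunked alternative (a sketch of the same aggregation, not what was run above) accumulates per-OA sums and counts and only then computes the mean:

# stream the CSV in chunks to keep memory bounded; the chunk size is arbitrary
sums = None
for chunk in pd.read_csv(
    linked_epc_path,
    usecols=["oa11", "priceper", "dateoftransfer"],
    parse_dates=["dateoftransfer"],
    chunksize=1_000_000,
):
    chunk = chunk[chunk.dateoftransfer > "2018-01-01"]
    part = chunk.groupby("oa11")["priceper"].agg(["sum", "count"])
    sums = part if sums is None else sums.add(part, fill_value=0)

price_per_oa = (sums["sum"] / sums["count"]).rename("priceper").reset_index()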

oa
           OA11CD    LAD11CD                              GlobalID                                           geometry  air_quality
0       E00000001  E09000001  f8512cce-6727-42cf-9840-6866cdbb2deb  POLYGON ((532303.492 181814.110, 532213.378 18...    27.501372
1       E00000003  E09000001  9eeeb3aa-ce92-4cea-bd70-3a0e680cddc1  POLYGON ((532180.131 181763.020, 532155.909 18...    27.501372
2       E00000005  E09000001  012372dd-5e03-43c3-a915-b697d02f88e2  POLYGON ((532124.321 181682.675, 532127.299 18...    27.501372
3       E00000007  E09000001  b61eb464-9c5b-4f0e-8f78-000ea938ee78  POLYGON ((532124.321 181682.675, 532201.292 18...    27.545142
4       E00000010  E09000001  efc37450-7064-4f5e-bedd-433bf4de1167  POLYGON ((532071.302 182159.586, 532127.958 18...    27.587650
...           ...        ...                                   ...                                                ...          ...
181403  W00010261  W06000011  c1051d13-4fa6-48d8-8f32-5020938ce7d6  POLYGON ((262156.208 196600.223, 262074.703 19...    13.828992
181404  W00010262  W06000011  ea1fef0a-138d-4035-a401-48c459abbca0  POLYGON ((263241.217 197440.210, 263271.904 19...    13.670047
181405  W00010263  W06000011  38819942-d3db-47e1-a85d-eec4d7a091e4  POLYGON ((262156.208 196600.223, 262205.269 19...    14.125627
181406  W00010264  W06000011  61fbb0fb-10bf-4000-a669-7114865776ff  POLYGON ((268829.001 198038.000, 268708.179 19...    14.219191
181407  W00010265  W06000011  999facfa-dce5-42bf-82fe-c1f04221661e  POLYGON ((266130.758 192630.558, 265987.755 19...    14.927127

181408 rows × 5 columns

Link to our OA dataset.

oa = oa.merge(price_per_oa, left_on="OA11CD", right_on="oa11", how="left")
oa = oa.drop(columns="oa11").rename(columns={"priceper": "house_price"})
oa[["OA11CD", "geometry", "air_quality", "house_price"]].to_parquet(
    f"{data_folder}/processed/oa_data_england.parquet"
)

H.3 Explanatory variables

We also want all of our explanatory variables linked to the OA geometries.

H.3.1 Population estimates

ONS population estimates are reported at the OA level and can be merged directly.

Read the file processed in the Urban Grammar project.

pop = pd.read_parquet(f"{data_folder}/processed/population.parquet")

Attribute join.

oa = oa.merge(pop, left_on="OA11CD", right_on="code", how="left")
oa = oa.drop(columns=["code"])

H.3.2 Workplace population

Workplace population is reported for Workplace Zones and needs to be interpolated to OAs. We use the preprocessed data from the Urban Grammar project.
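
Unlike the intensive case above, extensive variables (counts) are not averaged but split: each target receives the source total scaled by the share of the source zone's area it covers. A minimal sketch with toy geometries (illustrative names only):

import geopandas as gpd
from shapely.geometry import box

# one Workplace Zone with 100 workers, straddling two OAs equally
wz = gpd.GeoDataFrame({"workers": [100.0]}, geometry=[box(0, 0, 2, 1)], crs=27700)
oas = gpd.GeoDataFrame(geometry=[box(0, 0, 1, 1), box(1, 0, 2, 1)], crs=27700)

pieces = gpd.overlay(wz, oas.reset_index(), how="intersection")
# scale the count by each piece's share of the (single) source zone area
share = pieces.area / wz.area.iloc[0]
print(pieces.workers.mul(share).groupby(pieces["index"]).sum())  # 50.0 for each OA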

wp = gpd.read_parquet(
    f"{data_folder}/raw/workplace_population/workplace_by_industry_gb.pq"
)
%%time
wp_interpolated = tobler.area_weighted.area_interpolate(
    wp,
    oa,
    extensive_variables=[
        "A, B, D, E. Agriculture, energy and water",
        "C. Manufacturing",
        "F. Construction",
        "G, I. Distribution, hotels and restaurants",
        "H, J. Transport and communication",
        "K, L, M, N. Financial, real estate, professional and administrative activities",
        "O,P,Q. Public administration, education and health",
        "R, S, T, U. Other",
    ],
)
/Users/martin/mambaforge/envs/demoland/lib/python3.11/site-packages/pygeos/set_operations.py:129: RuntimeWarning: invalid value encountered in intersection
  return lib.intersection(a, b, **kwargs)
CPU times: user 31.5 s, sys: 351 ms, total: 31.9 s
Wall time: 32 s
# tobler returns rows aligned with the target (oa) geometries,
# so the interpolated values can be assigned positionally
oa[
    [
        "A, B, D, E. Agriculture, energy and water",
        "C. Manufacturing",
        "F. Construction",
        "G, I. Distribution, hotels and restaurants",
        "H, J. Transport and communication",
        "K, L, M, N. Financial, real estate, professional and administrative activities",
        "O,P,Q. Public administration, education and health",
        "R, S, T, U. Other",
    ]
] = wp_interpolated.drop(columns="geometry").values
oa = oa.drop(columns=["LAD11CD", "GlobalID"])
oa.to_parquet(f"{data_folder}/processed/oa_data_england.parquet")

H.3.3 Land cover (CORINE)

CORINE land cover is shipped as irregular polygons rather than a regular grid, so it also needs areal interpolation, this time of categorical variables. We use the data downloaded for the Urban Grammar project.

corine = gpd.read_parquet(f"{data_folder}/raw/land_cover/corine_gb.pq")
%%time
corine_interpolated = tobler.area_weighted.area_interpolate(
    corine, oa, categorical_variables=["Code_18"]
)
/Users/martin/mambaforge/envs/demoland/lib/python3.11/site-packages/pygeos/set_operations.py:129: RuntimeWarning: invalid value encountered in intersection
  return lib.intersection(a, b, **kwargs)
CPU times: user 22min 56s, sys: 5.49 s, total: 23min 1s
Wall time: 23min 4s
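
For categorical variables, tobler returns one column per class (prefixed with the variable name, here Code_18_) holding the share of each OA's area covered by that class. A quick check, assuming CORINE fully covers the OAs, is that the shares sum to roughly one:

# per-OA class shares should sum to ~1 where coverage is complete
corine_interpolated.drop(columns="geometry").sum(axis=1).describe()
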
corine_names = {
    "Code_18_124": "Land cover [Airports]",
    "Code_18_211": "Land cover [Non-irrigated arable land]",
    "Code_18_121": "Land cover [Industrial or commercial units]",
    "Code_18_421": "Land cover [Salt marshes]",
    "Code_18_522": "Land cover [Estuaries]",
    "Code_18_142": "Land cover [Sport and leisure facilities]",
    "Code_18_141": "Land cover [Green urban areas]",
    "Code_18_112": "Land cover [Discontinuous urban fabric]",
    "Code_18_231": "Land cover [Pastures]",
    "Code_18_311": "Land cover [Broad-leaved forest]",
    "Code_18_131": "Land cover [Mineral extraction sites]",
    "Code_18_123": "Land cover [Port areas]",
    "Code_18_122": "Land cover [Road and rail networks and associated land]",
    "Code_18_512": "Land cover [Water bodies]",
    "Code_18_243": "Land cover [Land principally occupied by agriculture, with significant areas of natural vegetation]",
    "Code_18_313": "Land cover [Mixed forest]",
    "Code_18_412": "Land cover [Peat bogs]",
    "Code_18_321": "Land cover [Natural grasslands]",
    "Code_18_322": "Land cover [Moors and heathland]",
    "Code_18_324": "Land cover [Transitional woodland-shrub]",
    "Code_18_111": "Land cover [Continuous urban fabric]",
    "Code_18_423": "Land cover [Intertidal flats]",
    "Code_18_523": "Land cover [Sea and ocean]",
    "Code_18_312": "Land cover [Coniferous forest]",
    "Code_18_133": "Land cover [Construction sites]",
    "Code_18_333": "Land cover [Sparsely vegetated areas]",
    "Code_18_332": "Land cover [Bare rocks]",
    "Code_18_411": "Land cover [Inland marshes]",
    "Code_18_132": "Land cover [Dump sites]",
    "Code_18_222": "Land cover [Fruit trees and berry plantations]",
    "Code_18_242": "Land cover [Complex cultivation patterns]",
    "Code_18_331": "Land cover [Beaches, dunes, sands]",
    "Code_18_511": "Land cover [Water courses]",
    "Code_18_334": "Land cover [Burnt areas]",
    "Code_18_244": "Land cover [Agro-forestry areas]",
    "Code_18_521": "Land cover [Coastal lagoons]",
}

# rename (rather than mapping all columns) so the geometry column keeps its name
corine_interpolated = corine_interpolated.rename(columns=corine_names)

Assign the subset of classes that may be relevant for our purposes to the OA dataframe.

interesting = [
    "Land cover [Discontinuous urban fabric]",
    "Land cover [Continuous urban fabric]",
    "Land cover [Non-irrigated arable land]",
    "Land cover [Industrial or commercial units]",
    "Land cover [Green urban areas]",
    "Land cover [Pastures]",
    "Land cover [Sport and leisure facilities]",
]
oa[interesting] = corine_interpolated[interesting].values
oa.to_parquet(f"{data_folder}/processed/oa_data_england.parquet")

H.4 Morphometrics

Load the morphometric characters pre-processed in the Urban Grammar project and join them to the OA data.

oa_morpho = pd.read_parquet(f"{data_folder}/processed/morphometrics_oa.parquet")
oa = oa.merge(oa_morpho, left_on="OA11CD", right_index=True)
oa.to_parquet(f"{data_folder}/processed/oa_data_england.parquet")

H.5 Cleanup

Drop morphometric characters that are not usable, then remove rows with missing values.

oa = gpd.read_parquet(f"{data_folder}/processed/oa_data_england.parquet")
to_drop = [
    "sdbPer",
    "ssbElo",
    "stcOri",
    "sdcLAL",
    "mdcAre",
    "ltcAre",
    "ltcWRE",
    "mtdMDi",
    "lcdMes",
    "lddNDe",
    "sddAre",
    "mdsAre",
    "ldsAre",
    "lisCel",
    "ldePer",
    "lseCWA",
]

oa = oa.drop(columns=to_drop)
# keep only OAs with complete data across all variables
oa_clean = oa.dropna()

Remove London data, as London is a massive outlier that throws everything off.

london = gpd.read_file(f"{data_folder}/raw/london/OA_2011_London_gen_MHW.shp")
oa_clean = oa_clean[~oa_clean.OA11CD.isin(london.OA11CD)]
oa_clean.set_index("OA11CD").to_parquet(
    f"{data_folder}/processed/oa_data_england.parquet"
)