Appendix H — Training data sampled from the whole of England

This appendix retrieves air quality and house price data for the whole of England, together with their explanatory variables. This is useful if we want to train models on more data than we have for Tyne and Wear.

import datetime

import geopandas as gpd
import tobler
import pandas as pd
import numpy as np
import xarray as xr
import pooch

Specify a path to the data folder.

data_folder = "/Users/martin/Library/CloudStorage/OneDrive-SharedLibraries-TheAlanTuringInstitute/Daniel Arribas-Bel - demoland_data"

Load the 2011 Output Area geometry (generalised boundaries).

oa = gpd.read_file(
    f"{data_folder}/raw/Output_Areas_Dec_2011_Boundaries_EW_BGC_2022.zip"
)

H.1 Air quality

Read the DEFRA modelled background pollution maps for 2021 (PM10, PM2.5, NO2, SO2).

pm10_21 = (
    pd.read_csv(
        "https://uk-air.defra.gov.uk/datastore/pcm/mappm102021g.csv",
        header=5,
        na_values=["MISSING"],
    )
    .set_index(["x", "y"])
    .drop(columns="gridcode")
    .to_xarray()
)
pm25_21 = (
    pd.read_csv(
        "https://uk-air.defra.gov.uk/datastore/pcm/mappm252021g.csv",
        header=5,
        na_values=["MISSING"],
    )
    .set_index(["x", "y"])
    .drop(columns="gridcode")
    .to_xarray()
)
no2_21 = (
    pd.read_csv(
        "https://uk-air.defra.gov.uk/datastore/pcm/mapno22021.csv",
        header=5,
        na_values=["MISSING"],
    )
    .set_index(["x", "y"])
    .drop(columns="gridcode")
    .to_xarray()
)
so2_21 = (
    pd.read_csv(
        "https://uk-air.defra.gov.uk/datastore/pcm/mapso22021.csv",
        header=5,
        na_values=["MISSING"],
    )
    .set_index(["x", "y"])
    .drop(columns="gridcode")
    .to_xarray()
)
pollutants_2021 = xr.merge([pm10_21, pm25_21, no2_21, so2_21])
pollutants_2021
<xarray.Dataset>
Dimensions:    (x: 657, y: 1170)
Coordinates:
  * x          (x) int64 -500 500 1500 2500 3500 ... 652500 653500 654500 655500
  * y          (y) int64 5500 6500 7500 8500 ... 1216500 1217500 1218500 1219500
Data variables:
    pm102021g  (x, y) float64 nan nan nan nan nan nan ... nan nan nan nan nan
    pm252021g  (x, y) float64 nan nan nan nan nan nan ... nan nan nan nan nan
    no22021    (x, y) float64 nan nan nan nan nan nan ... nan nan nan nan nan
    so22021    (x, y) float64 nan nan nan nan nan nan ... nan nan nan nan nan

We can compute the air quality index as a weighted sum of the four pollutants, following the formula used in the main analysis.

aqi = (
    pollutants_2021.pm252021g
    + pollutants_2021.pm102021g / 2
    + pollutants_2021.no22021 / 4
    + pollutants_2021.so22021 / 10
)
pollutants_2021 = pollutants_2021.assign(aqi=aqi)

We convert the array to a GeoDataFrame with square polygons representing the 1 km grid cells. This will be needed for areal interpolation to output areas.

pollutants_2021 = pollutants_2021.to_dataframe().reset_index()
pollutants_2021 = gpd.GeoDataFrame(
    pollutants_2021,
    # a square buffer (cap_style=3) of 500 m around each grid point
    # reconstructs the original 1 km x 1 km cell
    geometry=gpd.points_from_xy(pollutants_2021.x, pollutants_2021.y, crs=27700).buffer(
        500, cap_style=3
    ),
)
pollutants_2021.to_parquet(
    f"{data_folder}/processed/air_quality/air_quality_grid_2021_england.parquet"
)
pollutants_2021
             x        y  pm102021g  pm252021g  no22021  so22021  aqi                                           geometry
0         -500     5500        NaN        NaN      NaN      NaN  NaN  POLYGON ((0.000 6000.000, 0.000 5000.000, -100...
1         -500     6500        NaN        NaN      NaN      NaN  NaN  POLYGON ((0.000 7000.000, 0.000 6000.000, -100...
2         -500     7500        NaN        NaN      NaN      NaN  NaN  POLYGON ((0.000 8000.000, 0.000 7000.000, -100...
3         -500     8500        NaN        NaN      NaN      NaN  NaN  POLYGON ((0.000 9000.000, 0.000 8000.000, -100...
4         -500     9500        NaN        NaN      NaN      NaN  NaN  POLYGON ((0.000 10000.000, 0.000 9000.000, -10...
...        ...      ...        ...        ...      ...      ...  ...                                                ...
768685  655500  1215500        NaN        NaN      NaN      NaN  NaN  POLYGON ((656000.000 1216000.000, 656000.000 1...
768686  655500  1216500        NaN        NaN      NaN      NaN  NaN  POLYGON ((656000.000 1217000.000, 656000.000 1...
768687  655500  1217500        NaN        NaN      NaN      NaN  NaN  POLYGON ((656000.000 1218000.000, 656000.000 1...
768688  655500  1218500        NaN        NaN      NaN      NaN  NaN  POLYGON ((656000.000 1219000.000, 656000.000 1...
768689  655500  1219500        NaN        NaN      NaN      NaN  NaN  POLYGON ((656000.000 1220000.000, 656000.000 1...

768690 rows × 8 columns
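
As a quick sanity check of the buffering trick (a minimal sketch, not part of the pipeline): a square-capped buffer of 500 m around a point reconstructs a 1,000 m × 1,000 m cell, matching the 1 km grid spacing.

from shapely.geometry import Point

# cap_style=3 (square) turns the 500 m point buffer into a square cell
cell = Point(0, 0).buffer(500, cap_style=3)
print(cell.bounds)  # (-500.0, -500.0, 500.0, 500.0)
print(cell.area)  # 1000000.0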

Interpolate to OAs.

%%time
interp = tobler.area_weighted.area_interpolate(
    pollutants_2021, oa, intensive_variables=["aqi"]
)
/Users/martin/mambaforge/envs/demoland/lib/python3.11/site-packages/pygeos/set_operations.py:129: RuntimeWarning: invalid value encountered in intersection
  return lib.intersection(a, b, **kwargs)
/Users/martin/mambaforge/envs/demoland/lib/python3.11/site-packages/tobler/util/util.py:32: UserWarning: nan values in variable: aqi, replacing with 0
  warn(f"nan values in variable: {column}, replacing with 0")
CPU times: user 17.2 s, sys: 215 ms, total: 17.4 s
Wall time: 17.4 s
oa["air_quality"] = interp.aqi

H.2 House price

Read the same dataset of house transactions linked to EPC certificates as before.

# It is 6.3GB...
linked_epc_path = "https://reshare.ukdataservice.ac.uk/854942/1/tranall2011_19.csv"

epc = pd.read_csv(linked_epc_path)
epc["dateoftransfer"] = pd.to_datetime(epc.dateoftransfer)
# the dataset covers transactions from 2011 to 2019, so this keeps roughly the last two years
last2years = epc[epc.dateoftransfer > datetime.datetime(2018, 1, 1)]

price_per_oa = last2years[["oa11", "priceper"]].groupby("oa11").mean().reset_index()
price_per_oa.to_parquet(f"{data_folder}/processed/house_prices/price_per_oa.parquet")
price_per_oa = pd.read_parquet(
    f"{data_folder}/processed/house_prices/price_per_oa.parquet"
)
price_per_oa
             oa11      priceper
0       E00000003  15722.758094
1       E00000005  12587.314452
2       E00000007  12640.262294
3       E00000010   9535.653363
4       E00000012  10341.463415
...           ...           ...
173553  W00010260   1632.955752
173554  W00010261   1834.401969
173555  W00010262   1511.677538
173556  W00010263   1679.802956
173557  W00010264   2046.069031

173558 rows × 2 columns
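
Loading the full 6.3GB CSV at once needs a lot of memory. A chunked alternative (a sketch of the same aggregation, not what was run above) accumulates per-OA sums and counts and only then computes the mean:

# stream the CSV in chunks to keep memory bounded; the chunk size is arbitrary
sums = None
for chunk in pd.read_csv(
    linked_epc_path,
    usecols=["oa11", "priceper", "dateoftransfer"],
    parse_dates=["dateoftransfer"],
    chunksize=1_000_000,
):
    chunk = chunk[chunk.dateoftransfer > "2018-01-01"]
    part = chunk.groupby("oa11")["priceper"].agg(["sum", "count"])
    sums = part if sums is None else sums.add(part, fill_value=0)

price_per_oa = (sums["sum"] / sums["count"]).rename("priceper").reset_index()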

oa
           OA11CD    LAD11CD                              GlobalID                                           geometry  air_quality
0       E00000001  E09000001  f8512cce-6727-42cf-9840-6866cdbb2deb  POLYGON ((532303.492 181814.110, 532213.378 18...    27.501372
1       E00000003  E09000001  9eeeb3aa-ce92-4cea-bd70-3a0e680cddc1  POLYGON ((532180.131 181763.020, 532155.909 18...    27.501372
2       E00000005  E09000001  012372dd-5e03-43c3-a915-b697d02f88e2  POLYGON ((532124.321 181682.675, 532127.299 18...    27.501372
3       E00000007  E09000001  b61eb464-9c5b-4f0e-8f78-000ea938ee78  POLYGON ((532124.321 181682.675, 532201.292 18...    27.545142
4       E00000010  E09000001  efc37450-7064-4f5e-bedd-433bf4de1167  POLYGON ((532071.302 182159.586, 532127.958 18...    27.587650
...           ...        ...                                   ...                                                ...          ...
181403  W00010261  W06000011  c1051d13-4fa6-48d8-8f32-5020938ce7d6  POLYGON ((262156.208 196600.223, 262074.703 19...    13.828992
181404  W00010262  W06000011  ea1fef0a-138d-4035-a401-48c459abbca0  POLYGON ((263241.217 197440.210, 263271.904 19...    13.670047
181405  W00010263  W06000011  38819942-d3db-47e1-a85d-eec4d7a091e4  POLYGON ((262156.208 196600.223, 262205.269 19...    14.125627
181406  W00010264  W06000011  61fbb0fb-10bf-4000-a669-7114865776ff  POLYGON ((268829.001 198038.000, 268708.179 19...    14.219191
181407  W00010265  W06000011  999facfa-dce5-42bf-82fe-c1f04221661e  POLYGON ((266130.758 192630.558, 265987.755 19...    14.927127

181408 rows × 5 columns

Link to our OA dataset.

oa = oa.merge(price_per_oa, left_on="OA11CD", right_on="oa11", how="left")
oa = oa.drop(columns="oa11").rename(columns={"priceper": "house_price"})
oa[["OA11CD", "geometry", "air_quality", "house_price"]].to_parquet(
    f"{data_folder}/processed/oa_data_england.parquet"
)

H.3 Explanatory variables

We also want all of our explanatory variables linked to the OA geometries.

H.3.1 Population estimates

ONS population estimates are reported at the OA level and can be merged directly.

Read the file processed in the Urban Grammar project.

pop = pd.read_parquet(f"{data_folder}/processed/population.parquet")

Attribute join.

oa = oa.merge(pop, left_on="OA11CD", right_on="code", how="left")
oa = oa.drop(columns=["code"])

H.3.2 Workplace population

Workplace population is reported for Workplace Zones and needs to be interpolated to OAs. We use the preprocessed data from the Urban Grammar project.
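
Unlike the intensive case above, extensive variables (counts) are not averaged but split: each target receives the source total scaled by the share of the source zone's area it covers. A minimal sketch with toy geometries (illustrative names only):

import geopandas as gpd
from shapely.geometry import box

# one Workplace Zone with 100 workers, straddling two OAs equally
wz = gpd.GeoDataFrame({"workers": [100.0]}, geometry=[box(0, 0, 2, 1)], crs=27700)
oas = gpd.GeoDataFrame(geometry=[box(0, 0, 1, 1), box(1, 0, 2, 1)], crs=27700)

pieces = gpd.overlay(wz, oas.reset_index(), how="intersection")
# scale the count by each piece's share of the (single) source zone area
share = pieces.area / wz.area.iloc[0]
print(pieces.workers.mul(share).groupby(pieces["index"]).sum())  # 50.0 for each OA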

wp = gpd.read_parquet(
    f"{data_folder}/raw/workplace_population/workplace_by_industry_gb.pq"
)
%%time
wp_interpolated = tobler.area_weighted.area_interpolate(
    wp,
    oa,
    extensive_variables=[
        "A, B, D, E. Agriculture, energy and water",
        "C. Manufacturing",
        "F. Construction",
        "G, I. Distribution, hotels and restaurants",
        "H, J. Transport and communication",
        "K, L, M, N. Financial, real estate, professional and administrative activities",
        "O,P,Q. Public administration, education and health",
        "R, S, T, U. Other",
    ],
)
/Users/martin/mambaforge/envs/demoland/lib/python3.11/site-packages/pygeos/set_operations.py:129: RuntimeWarning: invalid value encountered in intersection
  return lib.intersection(a, b, **kwargs)
CPU times: user 31.5 s, sys: 351 ms, total: 31.9 s
Wall time: 32 s
# tobler returns rows aligned with the target (oa) geometries,
# so the interpolated values can be assigned positionally
oa[
    [
        "A, B, D, E. Agriculture, energy and water",
        "C. Manufacturing",
        "F. Construction",
        "G, I. Distribution, hotels and restaurants",
        "H, J. Transport and communication",
        "K, L, M, N. Financial, real estate, professional and administrative activities",
        "O,P,Q. Public administration, education and health",
        "R, S, T, U. Other",
    ]
] = wp_interpolated.drop(columns="geometry").values
oa = oa.drop(columns=["LAD11CD", "GlobalID"])
oa.to_parquet(f"{data_folder}/processed/oa_data_england.parquet")

H.3.3 Land cover (CORINE)

CORINE land cover is shipped as irregular polygons rather than a regular grid, so it also needs areal interpolation, this time of categorical variables. We use the data downloaded for the Urban Grammar project.

corine = gpd.read_parquet(f"{data_folder}/raw/land_cover/corine_gb.pq")
%%time
corine_interpolated = tobler.area_weighted.area_interpolate(
    corine, oa, categorical_variables=["Code_18"]
)
/Users/martin/mambaforge/envs/demoland/lib/python3.11/site-packages/pygeos/set_operations.py:129: RuntimeWarning: invalid value encountered in intersection
  return lib.intersection(a, b, **kwargs)
CPU times: user 22min 56s, sys: 5.49 s, total: 23min 1s
Wall time: 23min 4s
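
For categorical variables, tobler returns one column per class (prefixed with the variable name, here Code_18_) holding the share of each OA's area covered by that class. A quick check, assuming CORINE fully covers the OAs, is that the shares sum to roughly one:

# per-OA class shares should sum to ~1 where coverage is complete
corine_interpolated.drop(columns="geometry").sum(axis=1).describe()
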
corine_names = {
    "Code_18_124": "Land cover [Airports]",
    "Code_18_211": "Land cover [Non-irrigated arable land]",
    "Code_18_121": "Land cover [Industrial or commercial units]",
    "Code_18_421": "Land cover [Salt marshes]",
    "Code_18_522": "Land cover [Estuaries]",
    "Code_18_142": "Land cover [Sport and leisure facilities]",
    "Code_18_141": "Land cover [Green urban areas]",
    "Code_18_112": "Land cover [Discontinuous urban fabric]",
    "Code_18_231": "Land cover [Pastures]",
    "Code_18_311": "Land cover [Broad-leaved forest]",
    "Code_18_131": "Land cover [Mineral extraction sites]",
    "Code_18_123": "Land cover [Port areas]",
    "Code_18_122": "Land cover [Road and rail networks and associated land]",
    "Code_18_512": "Land cover [Water bodies]",
    "Code_18_243": "Land cover [Land principally occupied by agriculture, with significant areas of natural vegetation]",
    "Code_18_313": "Land cover [Mixed forest]",
    "Code_18_412": "Land cover [Peat bogs]",
    "Code_18_321": "Land cover [Natural grasslands]",
    "Code_18_322": "Land cover [Moors and heathland]",
    "Code_18_324": "Land cover [Transitional woodland-shrub]",
    "Code_18_111": "Land cover [Continuous urban fabric]",
    "Code_18_423": "Land cover [Intertidal flats]",
    "Code_18_523": "Land cover [Sea and ocean]",
    "Code_18_312": "Land cover [Coniferous forest]",
    "Code_18_133": "Land cover [Construction sites]",
    "Code_18_333": "Land cover [Sparsely vegetated areas]",
    "Code_18_332": "Land cover [Bare rocks]",
    "Code_18_411": "Land cover [Inland marshes]",
    "Code_18_132": "Land cover [Dump sites]",
    "Code_18_222": "Land cover [Fruit trees and berry plantations]",
    "Code_18_242": "Land cover [Complex cultivation patterns]",
    "Code_18_331": "Land cover [Beaches, dunes, sands]",
    "Code_18_511": "Land cover [Water courses]",
    "Code_18_334": "Land cover [Burnt areas]",
    "Code_18_244": "Land cover [Agro-forestry areas]",
    "Code_18_521": "Land cover [Coastal lagoons]",
}

# rename (rather than mapping all columns) so the geometry column keeps its name
corine_interpolated = corine_interpolated.rename(columns=corine_names)

Assign the subset of classes that may be relevant for our purposes to the OA dataframe.

interesting = [
    "Land cover [Discontinuous urban fabric]",
    "Land cover [Continuous urban fabric]",
    "Land cover [Non-irrigated arable land]",
    "Land cover [Industrial or commercial units]",
    "Land cover [Green urban areas]",
    "Land cover [Pastures]",
    "Land cover [Sport and leisure facilities]",
]
oa[interesting] = corine_interpolated[interesting].values
oa.to_parquet(f"{data_folder}/processed/oa_data_england.parquet")

H.4 Morphometrics

Load the morphometric characters pre-processed in the Urban Grammar project and join them to the OA data.

oa_morpho = pd.read_parquet(f"{data_folder}/processed/morphometrics_oa.parquet")
oa = oa.merge(oa_morpho, left_on="OA11CD", right_index=True)
oa.to_parquet(f"{data_folder}/processed/oa_data_england.parquet")

H.5 Cleanup

Drop morphometric characters that are not usable, then remove rows with missing values.

oa = gpd.read_parquet(f"{data_folder}/processed/oa_data_england.parquet")
to_drop = [
    "sdbPer",
    "ssbElo",
    "stcOri",
    "sdcLAL",
    "mdcAre",
    "ltcAre",
    "ltcWRE",
    "mtdMDi",
    "lcdMes",
    "lddNDe",
    "sddAre",
    "mdsAre",
    "ldsAre",
    "lisCel",
    "ldePer",
    "lseCWA",
]

oa = oa.drop(columns=to_drop)
# keep only OAs with complete data across all variables
oa_clean = oa.dropna()

Remove London data, as London is a massive outlier that throws everything off.

london = gpd.read_file(f"{data_folder}/raw/london/OA_2011_London_gen_MHW.shp")
oa_clean = oa_clean[~oa_clean.OA11CD.isin(london.OA11CD)]
oa_clean.set_index("OA11CD").to_parquet(
    f"{data_folder}/processed/oa_data_england.parquet"
)