import datetime
import geopandas as gpd
import tobler
import pandas as pd
import numpy as np
import xarray as xr
import pooch
Appendix H — Training data sampled from the whole of England
Retrieving air quality and house price data for the whole of England, including their explanatory variables. This is useful if we want to train models on more data than is available in Tyne and Wear.
Specify a path to the data folder.
= "/Users/martin/Library/CloudStorage/OneDrive-SharedLibraries-TheAlanTuringInstitute/Daniel Arribas-Bel - demoland_data" data_folder
Load the 2011 Output Area geometry (generalised).
oa = gpd.read_file(
    f"{data_folder}/raw/Output_Areas_Dec_2011_Boundaries_EW_BGC_2022.zip"
)
H.1 Air Quality
Read the DEFRA modelled background pollution data (1 km grid).
pm10_21 = (
    pd.read_csv(
        "https://uk-air.defra.gov.uk/datastore/pcm/mappm102021g.csv",
        header=5,
        na_values=["MISSING"],
    )
    .set_index(["x", "y"])
    .drop(columns="gridcode")
    .to_xarray()
)
pm25_21 = (
    pd.read_csv(
        "https://uk-air.defra.gov.uk/datastore/pcm/mappm252021g.csv",
        header=5,
        na_values=["MISSING"],
    )
    .set_index(["x", "y"])
    .drop(columns="gridcode")
    .to_xarray()
)
no2_21 = (
    pd.read_csv(
        "https://uk-air.defra.gov.uk/datastore/pcm/mapno22021.csv",
        header=5,
        na_values=["MISSING"],
    )
    .set_index(["x", "y"])
    .drop(columns="gridcode")
    .to_xarray()
)
so2_21 = (
    pd.read_csv(
        "https://uk-air.defra.gov.uk/datastore/pcm/mapso22021.csv",
        header=5,
        na_values=["MISSING"],
    )
    .set_index(["x", "y"])
    .drop(columns="gridcode")
    .to_xarray()
)
pollutants_2021 = xr.merge([pm10_21, pm25_21, no2_21, so2_21])
pollutants_2021
<xarray.Dataset>
Dimensions:    (x: 657, y: 1170)
Coordinates:
  * x          (x) int64 -500 500 1500 2500 3500 ... 652500 653500 654500 655500
  * y          (y) int64 5500 6500 7500 8500 ... 1216500 1217500 1218500 1219500
Data variables:
    pm102021g  (x, y) float64 nan nan nan nan nan nan ... nan nan nan nan nan
    pm252021g  (x, y) float64 nan nan nan nan nan nan ... nan nan nan nan nan
    no22021    (x, y) float64 nan nan nan nan nan nan ... nan nan nan nan nan
    so22021    (x, y) float64 nan nan nan nan nan nan ... nan nan nan nan nan
We can compute the index based on the formula, a weighted sum of the four pollutants: AQI = PM2.5 + PM10 / 2 + NO2 / 4 + SO2 / 10.
aqi = (
    pollutants_2021.pm252021g
    + pollutants_2021.pm102021g / 2
    + pollutants_2021.no22021 / 4
    + pollutants_2021.so22021 / 10
)
pollutants_2021 = pollutants_2021.assign(aqi=aqi)
We convert the array to a GeoDataFrame with square polygons representing the 1 km grid cells (each cell centre buffered by 500 m with square caps). This is needed for the areal interpolation to output areas.
pollutants_2021 = pollutants_2021.to_dataframe().reset_index()
pollutants_2021 = gpd.GeoDataFrame(
    pollutants_2021,
    geometry=gpd.points_from_xy(
        pollutants_2021.x, pollutants_2021.y, crs=27700
    ).buffer(500, cap_style=3),
)
pollutants_2021.to_parquet(
    f"{data_folder}/processed/air_quality/air_quality_grid_2021_england.parquet"
)
pollutants_2021
|        | x      | y       | pm102021g | pm252021g | no22021 | so22021 | aqi | geometry |
|--------|--------|---------|-----------|-----------|---------|---------|-----|----------|
| 0      | -500   | 5500    | NaN | NaN | NaN | NaN | NaN | POLYGON ((0.000 6000.000, 0.000 5000.000, -100... |
| 1      | -500   | 6500    | NaN | NaN | NaN | NaN | NaN | POLYGON ((0.000 7000.000, 0.000 6000.000, -100... |
| 2      | -500   | 7500    | NaN | NaN | NaN | NaN | NaN | POLYGON ((0.000 8000.000, 0.000 7000.000, -100... |
| 3      | -500   | 8500    | NaN | NaN | NaN | NaN | NaN | POLYGON ((0.000 9000.000, 0.000 8000.000, -100... |
| 4      | -500   | 9500    | NaN | NaN | NaN | NaN | NaN | POLYGON ((0.000 10000.000, 0.000 9000.000, -10... |
| ...    | ...    | ...     | ... | ... | ... | ... | ... | ... |
| 768685 | 655500 | 1215500 | NaN | NaN | NaN | NaN | NaN | POLYGON ((656000.000 1216000.000, 656000.000 1... |
| 768686 | 655500 | 1216500 | NaN | NaN | NaN | NaN | NaN | POLYGON ((656000.000 1217000.000, 656000.000 1... |
| 768687 | 655500 | 1217500 | NaN | NaN | NaN | NaN | NaN | POLYGON ((656000.000 1218000.000, 656000.000 1... |
| 768688 | 655500 | 1218500 | NaN | NaN | NaN | NaN | NaN | POLYGON ((656000.000 1219000.000, 656000.000 1... |
| 768689 | 655500 | 1219500 | NaN | NaN | NaN | NaN | NaN | POLYGON ((656000.000 1220000.000, 656000.000 1... |

768690 rows × 8 columns
Interpolate to OAs.
%%time
interp = tobler.area_weighted.area_interpolate(
    pollutants_2021, oa, intensive_variables=["aqi"]
)
/Users/martin/mambaforge/envs/demoland/lib/python3.11/site-packages/pygeos/set_operations.py:129: RuntimeWarning: invalid value encountered in intersection
return lib.intersection(a, b, **kwargs)
/Users/martin/mambaforge/envs/demoland/lib/python3.11/site-packages/tobler/util/util.py:32: UserWarning: nan values in variable: aqi, replacing with 0
warn(f"nan values in variable: {column}, replacing with 0")
CPU times: user 17.2 s, sys: 215 ms, total: 17.4 s
Wall time: 17.4 s
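Note the tobler warning above: grid cells with no modelled value (e.g. over the sea) are NaN, and tobler replaces them with 0, which can slightly dilute the interpolated AQI for coastal OAs. A minimal sketch of an alternative, assuming we would rather drop the empty cells before interpolating:

# Sketch: drop grid cells with no modelled AQI before interpolating,
# so coastal OAs are not averaged against zero-filled empty cells.
pollutants_valid = pollutants_2021.dropna(subset=["aqi"])
interp = tobler.area_weighted.area_interpolate(
    pollutants_valid, oa, intensive_variables=["aqi"]
)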
"air_quality"] = interp.aqi oa[
H.2 House price
Read the same linked EPC price dataset as before.
# It is 6.3GB...
linked_epc_path = "https://reshare.ukdataservice.ac.uk/854942/1/tranall2011_19.csv"

epc = pd.read_csv(linked_epc_path)
epc["dateoftransfer"] = pd.to_datetime(epc.dateoftransfer)
last2years = epc[epc.dateoftransfer > datetime.datetime(2018, 1, 1)]

price_per_oa = last2years[["oa11", "priceper"]].groupby("oa11").mean().reset_index()
price_per_oa.to_parquet(f"{data_folder}/processed/house_prices/price_per_oa.parquet")
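Since the CSV is 6.3 GB, reading it in one go needs a lot of memory. A lighter-weight sketch, assuming only the three columns used above are required:

# Sketch: stream the CSV in chunks, keep only the needed columns,
# and filter each chunk before concatenating.
chunks = pd.read_csv(
    linked_epc_path,
    usecols=["oa11", "priceper", "dateoftransfer"],
    parse_dates=["dateoftransfer"],
    chunksize=1_000_000,
)
last2years = pd.concat(
    chunk[chunk.dateoftransfer > datetime.datetime(2018, 1, 1)] for chunk in chunks
)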
price_per_oa = pd.read_parquet(
    f"{data_folder}/processed/house_prices/price_per_oa.parquet"
)
price_per_oa
|        | oa11      | priceper     |
|--------|-----------|--------------|
| 0      | E00000003 | 15722.758094 |
| 1      | E00000005 | 12587.314452 |
| 2      | E00000007 | 12640.262294 |
| 3      | E00000010 | 9535.653363  |
| 4      | E00000012 | 10341.463415 |
| ...    | ...       | ...          |
| 173553 | W00010260 | 1632.955752  |
| 173554 | W00010261 | 1834.401969  |
| 173555 | W00010262 | 1511.677538  |
| 173556 | W00010263 | 1679.802956  |
| 173557 | W00010264 | 2046.069031  |

173558 rows × 2 columns
oa
|        | OA11CD    | LAD11CD   | GlobalID | geometry | air_quality |
|--------|-----------|-----------|----------|----------|-------------|
| 0      | E00000001 | E09000001 | f8512cce-6727-42cf-9840-6866cdbb2deb | POLYGON ((532303.492 181814.110, 532213.378 18... | 27.501372 |
| 1      | E00000003 | E09000001 | 9eeeb3aa-ce92-4cea-bd70-3a0e680cddc1 | POLYGON ((532180.131 181763.020, 532155.909 18... | 27.501372 |
| 2      | E00000005 | E09000001 | 012372dd-5e03-43c3-a915-b697d02f88e2 | POLYGON ((532124.321 181682.675, 532127.299 18... | 27.501372 |
| 3      | E00000007 | E09000001 | b61eb464-9c5b-4f0e-8f78-000ea938ee78 | POLYGON ((532124.321 181682.675, 532201.292 18... | 27.545142 |
| 4      | E00000010 | E09000001 | efc37450-7064-4f5e-bedd-433bf4de1167 | POLYGON ((532071.302 182159.586, 532127.958 18... | 27.587650 |
| ...    | ...       | ...       | ... | ... | ... |
| 181403 | W00010261 | W06000011 | c1051d13-4fa6-48d8-8f32-5020938ce7d6 | POLYGON ((262156.208 196600.223, 262074.703 19... | 13.828992 |
| 181404 | W00010262 | W06000011 | ea1fef0a-138d-4035-a401-48c459abbca0 | POLYGON ((263241.217 197440.210, 263271.904 19... | 13.670047 |
| 181405 | W00010263 | W06000011 | 38819942-d3db-47e1-a85d-eec4d7a091e4 | POLYGON ((262156.208 196600.223, 262205.269 19... | 14.125627 |
| 181406 | W00010264 | W06000011 | 61fbb0fb-10bf-4000-a669-7114865776ff | POLYGON ((268829.001 198038.000, 268708.179 19... | 14.219191 |
| 181407 | W00010265 | W06000011 | 999facfa-dce5-42bf-82fe-c1f04221661e | POLYGON ((266130.758 192630.558, 265987.755 19... | 14.927127 |

181408 rows × 5 columns
Link to our OA dataset.
oa = oa.merge(price_per_oa, left_on="OA11CD", right_on="oa11", how="left")
oa = oa.drop(columns="oa11").rename(columns={"priceper": "house_price"})
oa[["OA11CD", "geometry", "air_quality", "house_price"]].to_parquet(
    f"{data_folder}/processed/oa_data_england.parquet"
)
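Not every OA records a transaction in the period, so the left join leaves NaNs in house_price. A quick diagnostic sketch:

# Sketch: count OAs left without a transaction-derived price.
missing = oa.house_price.isna().sum()
print(f"{missing} of {len(oa)} OAs have no house price")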
H.3 Explanatory variables
We also want to have all our explanatory variables linked to the OA geometries.
H.3.1 Population estimates
ONS population estimates are reported at the OA level and can be merged directly.
Read the file processed in the Urban Grammar project.
= pd.read_parquet(f"{data_folder}/processed/population.parquet") pop
Attribute join.
oa = oa.merge(pop, left_on="OA11CD", right_on="code", how="left")
oa = oa.drop(columns=["code"])
H.3.2 Workplace population
Workplace population is reported for Workplace Zones and needs to be interpolated to OAs. We use the preprocessed data from the Urban Grammar project.
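Counts such as workplace population are extensive variables: tobler apportions them according to the overlapping area, while intensive variables (like the AQI above) are area-weighted averages. A toy sketch of the difference, using hypothetical one-row frames:

# Toy sketch: a source cell with a count of 100 and a rate of 5,
# split into two equal target cells.
from shapely.geometry import box

src = gpd.GeoDataFrame(
    {"count": [100.0], "rate": [5.0]}, geometry=[box(0, 0, 2, 1)], crs=27700
)
tgt = gpd.GeoDataFrame(geometry=[box(0, 0, 1, 1), box(1, 0, 2, 1)], crs=27700)
est = tobler.area_weighted.area_interpolate(
    src, tgt, extensive_variables=["count"], intensive_variables=["rate"]
)
# The count splits ~50/50; the rate stays ~5 in both target cells.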
wp = gpd.read_parquet(
    f"{data_folder}/raw/workplace_population/workplace_by_industry_gb.pq"
)
%%time
wp_interpolated = tobler.area_weighted.area_interpolate(
    wp,
    oa,
    extensive_variables=[
        "A, B, D, E. Agriculture, energy and water",
        "C. Manufacturing",
        "F. Construction",
        "G, I. Distribution, hotels and restaurants",
        "H, J. Transport and communication",
        "K, L, M, N. Financial, real estate, professional and administrative activities",
        "O,P,Q. Public administration, education and health",
        "R, S, T, U. Other",
    ],
)
/Users/martin/mambaforge/envs/demoland/lib/python3.11/site-packages/pygeos/set_operations.py:129: RuntimeWarning: invalid value encountered in intersection
return lib.intersection(a, b, **kwargs)
CPU times: user 31.5 s, sys: 351 ms, total: 31.9 s
Wall time: 32 s
oa[
    [
        "A, B, D, E. Agriculture, energy and water",
        "C. Manufacturing",
        "F. Construction",
        "G, I. Distribution, hotels and restaurants",
        "H, J. Transport and communication",
        "K, L, M, N. Financial, real estate, professional and administrative activities",
        "O,P,Q. Public administration, education and health",
        "R, S, T, U. Other",
    ]
] = wp_interpolated.drop(columns="geometry").values
= oa.drop(columns=["LAD11CD", "GlobalID"]) oa
f"{data_folder}/processed/oa_data_england.parquet") oa.to_parquet(
H.3.3 Land cover (CORINE)
CORINE is shipped as custom polygons. We use the data downloaded for the Urban Grammar project.
= gpd.read_parquet(f"{data_folder}/raw/land_cover/corine_gb.pq") corine
%%time
corine_interpolated = tobler.area_weighted.area_interpolate(
    corine, oa, categorical_variables=["Code_18"]
)
/Users/martin/mambaforge/envs/demoland/lib/python3.11/site-packages/pygeos/set_operations.py:129: RuntimeWarning: invalid value encountered in intersection
return lib.intersection(a, b, **kwargs)
CPU times: user 22min 56s, sys: 5.49 s, total: 23min 1s
Wall time: 23min 4s
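Categorical interpolation yields one column per land cover class, named Code_18_<code> and holding the share of each OA covered by that class. A quick sanity check (a sketch): the shares should sum to roughly one wherever an OA is fully covered by CORINE polygons.

# Sketch: class shares per OA should sum to ~1, up to floating point error
# and any OA area falling outside the CORINE coverage.
shares = corine_interpolated.drop(columns="geometry")
print(shares.sum(axis=1).describe())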
corine_names = {
    "Code_18_124": "Land cover [Airports]",
    "Code_18_211": "Land cover [Non-irrigated arable land]",
    "Code_18_121": "Land cover [Industrial or commercial units]",
    "Code_18_421": "Land cover [Salt marshes]",
    "Code_18_522": "Land cover [Estuaries]",
    "Code_18_142": "Land cover [Sport and leisure facilities]",
    "Code_18_141": "Land cover [Green urban areas]",
    "Code_18_112": "Land cover [Discontinuous urban fabric]",
    "Code_18_231": "Land cover [Pastures]",
    "Code_18_311": "Land cover [Broad-leaved forest]",
    "Code_18_131": "Land cover [Mineral extraction sites]",
    "Code_18_123": "Land cover [Port areas]",
    "Code_18_122": "Land cover [Road and rail networks and associated land]",
    "Code_18_512": "Land cover [Water bodies]",
    "Code_18_243": "Land cover [Land principally occupied by agriculture, with significant areas of natural vegetation]",
    "Code_18_313": "Land cover [Mixed forest]",
    "Code_18_412": "Land cover [Peat bogs]",
    "Code_18_321": "Land cover [Natural grasslands]",
    "Code_18_322": "Land cover [Moors and heathland]",
    "Code_18_324": "Land cover [Transitional woodland-shrub]",
    "Code_18_111": "Land cover [Continuous urban fabric]",
    "Code_18_423": "Land cover [Intertidal flats]",
    "Code_18_523": "Land cover [Sea and ocean]",
    "Code_18_312": "Land cover [Coniferous forest]",
    "Code_18_133": "Land cover [Construction sites]",
    "Code_18_333": "Land cover [Sparsely vegetated areas]",
    "Code_18_332": "Land cover [Bare rocks]",
    "Code_18_411": "Land cover [Inland marshes]",
    "Code_18_132": "Land cover [Dump sites]",
    "Code_18_222": "Land cover [Fruit trees and berry plantations]",
    "Code_18_242": "Land cover [Complex cultivation patterns]",
    "Code_18_331": "Land cover [Beaches, dunes, sands]",
    "Code_18_511": "Land cover [Water courses]",
    "Code_18_334": "Land cover [Burnt areas]",
    "Code_18_244": "Land cover [Agro-forestry areas]",
    "Code_18_521": "Land cover [Coastal lagoons]",
}
corine_interpolated.columns = corine_interpolated.columns.map(corine_names)
Assign the subset of classes that may be relevant for our purposes to the OA dataframe.
interesting = [
    "Land cover [Discontinuous urban fabric]",
    "Land cover [Continuous urban fabric]",
    "Land cover [Non-irrigated arable land]",
    "Land cover [Industrial or commercial units]",
    "Land cover [Green urban areas]",
    "Land cover [Pastures]",
    "Land cover [Sport and leisure facilities]",
]
oa[interesting] = corine_interpolated[interesting].values
f"{data_folder}/processed/oa_data_england.parquet") oa.to_parquet(
H.4 Morphometrics
Get the file pre-processed as part of the Urban Grammar project.
= pd.read_parquet(f"{data_folder}/processed/morphometrics_oa.parquet")
oa_morpho = oa.merge(oa_morpho, left_on="OA11CD", right_index=True) oa
f"{data_folder}/processed/oa_data_england.parquet") oa.to_parquet(
H.5 Cleanup
Drop unusable columns and rows.
oa = gpd.read_parquet(f"{data_folder}/processed/oa_data_england.parquet")
to_drop = [
    "sdbPer",
    "ssbElo",
    "stcOri",
    "sdcLAL",
    "mdcAre",
    "ltcAre",
    "ltcWRE",
    "mtdMDi",
    "lcdMes",
    "lddNDe",
    "sddAre",
    "mdsAre",
    "ldsAre",
    "lisCel",
    "ldePer",
    "lseCWA",
]
oa = oa.drop(columns=to_drop)
oa_clean = oa.dropna()
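To see what dropna() removes, one can first inspect the share of missing values in each remaining column; a small sketch:

# Sketch: share of missing values per column, and how many OAs dropna() removes.
na_share = oa.isna().mean().sort_values(ascending=False)
print(na_share.head(10))
print(f"dropna removes {len(oa) - len(oa_clean)} of {len(oa)} OAs")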
Remove London data, as London is a massive outlier that throws everything off.
london = gpd.read_file(f"{data_folder}/raw/london/OA_2011_London_gen_MHW.shp")
oa_clean = oa_clean[~oa_clean.OA11CD.isin(london.OA11CD)]
"OA11CD").to_parquet(
oa_clean.set_index(f"{data_folder}/processed/oa_data_england.parquet"
)