import datetime
import geopandas as gpd
import tobler
import pandas as pd
import numpy as np
import xarray as xr
import pooch

Appendix H — Training data sampled from the whole of England
Here we retrieve air quality and house price data for the whole of England, together with their explanatory variables. This is useful if we want to train the models on more data than we have for Tyne and Wear.
Specify a path to the data folder.
data_folder = "/Users/martin/Library/CloudStorage/OneDrive-SharedLibraries-TheAlanTuringInstitute/Daniel Arribas-Bel - demoland_data"

Load the 2011 Output Area geometry (generalised).
oa = gpd.read_file(
    f"{data_folder}/raw/Output_Areas_Dec_2011_Boundaries_EW_BGC_2022.zip"
)

H.1 Air Quality
Read the DEFRA air pollution data for 2021 (PM10, PM2.5, NO2 and SO2).
pm10_21 = (
    pd.read_csv(
        "https://uk-air.defra.gov.uk/datastore/pcm/mappm102021g.csv",
        header=5,
        na_values=["MISSING"],
    )
    .set_index(["x", "y"])
    .drop(columns="gridcode")
    .to_xarray()
)

pm25_21 = (
    pd.read_csv(
        "https://uk-air.defra.gov.uk/datastore/pcm/mappm252021g.csv",
        header=5,
        na_values=["MISSING"],
    )
    .set_index(["x", "y"])
    .drop(columns="gridcode")
    .to_xarray()
)

no2_21 = (
    pd.read_csv(
        "https://uk-air.defra.gov.uk/datastore/pcm/mapno22021.csv",
        header=5,
        na_values=["MISSING"],
    )
    .set_index(["x", "y"])
    .drop(columns="gridcode")
    .to_xarray()
)

so2_21 = (
    pd.read_csv(
        "https://uk-air.defra.gov.uk/datastore/pcm/mapso22021.csv",
        header=5,
        na_values=["MISSING"],
    )
    .set_index(["x", "y"])
    .drop(columns="gridcode")
    .to_xarray()
)
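The four blocks above follow an identical pattern, so the same arrays can be built with a small helper; a minimal sketch (not part of the original workflow) that also puts the pooch import from the top of the appendix to use for caching the downloads locally:

# Hedged sketch: one helper replaces the four repeated pipelines above.
def read_defra(url):
    # pooch caches the CSV locally, so re-running does not re-download it
    fname = pooch.retrieve(url, known_hash=None)
    return (
        pd.read_csv(fname, header=5, na_values=["MISSING"])
        .set_index(["x", "y"])
        .drop(columns="gridcode")
        .to_xarray()
    )

pm10_21, pm25_21, no2_21, so2_21 = (
    read_defra(url)
    for url in [
        "https://uk-air.defra.gov.uk/datastore/pcm/mappm102021g.csv",
        "https://uk-air.defra.gov.uk/datastore/pcm/mappm252021g.csv",
        "https://uk-air.defra.gov.uk/datastore/pcm/mapno22021.csv",
        "https://uk-air.defra.gov.uk/datastore/pcm/mapso22021.csv",
    ]
)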
pollutants_2021 = xr.merge([pm10_21, pm25_21, no2_21, so2_21])
pollutants_2021

<xarray.Dataset>
Dimensions: (x: 657, y: 1170)
Coordinates:
* x (x) int64 -500 500 1500 2500 3500 ... 652500 653500 654500 655500
* y (y) int64 5500 6500 7500 8500 ... 1216500 1217500 1218500 1219500
Data variables:
pm102021g (x, y) float64 nan nan nan nan nan nan ... nan nan nan nan nan
pm252021g (x, y) float64 nan nan nan nan nan nan ... nan nan nan nan nan
no22021 (x, y) float64 nan nan nan nan nan nan ... nan nan nan nan nan
so22021 (x, y) float64 nan nan nan nan nan nan ... nan nan nan nan nan

We can compute the air quality index as AQI = PM2.5 + PM10/2 + NO2/4 + SO2/10. For example, a cell with PM2.5 = 10, PM10 = 16, NO2 = 20 and SO2 = 5 (all in µg/m³) gets an AQI of 10 + 8 + 5 + 0.5 = 23.5.
aqi = (
    pollutants_2021.pm252021g
    + pollutants_2021.pm102021g / 2
    + pollutants_2021.no22021 / 4
    + pollutants_2021.so22021 / 10
)
pollutants_2021 = pollutants_2021.assign(aqi=aqi)

We convert the array to a GeoDataFrame with polygons representing the grid cells. This is needed for the areal interpolation to the OA geometries below.
pollutants_2021 = pollutants_2021.to_dataframe().reset_index()
pollutants_2021 = gpd.GeoDataFrame(
    pollutants_2021,
    geometry=gpd.points_from_xy(pollutants_2021.x, pollutants_2021.y, crs=27700).buffer(
        500, cap_style=3  # square caps turn each grid centroid into its 1 km cell
    ),
)

pollutants_2021.to_parquet(
    f"{data_folder}/processed/air_quality/air_quality_grid_2021_england.parquet"
)

pollutants_2021

| | x | y | pm102021g | pm252021g | no22021 | so22021 | aqi | geometry |
|---|---|---|---|---|---|---|---|---|
| 0 | -500 | 5500 | NaN | NaN | NaN | NaN | NaN | POLYGON ((0.000 6000.000, 0.000 5000.000, -100... |
| 1 | -500 | 6500 | NaN | NaN | NaN | NaN | NaN | POLYGON ((0.000 7000.000, 0.000 6000.000, -100... |
| 2 | -500 | 7500 | NaN | NaN | NaN | NaN | NaN | POLYGON ((0.000 8000.000, 0.000 7000.000, -100... |
| 3 | -500 | 8500 | NaN | NaN | NaN | NaN | NaN | POLYGON ((0.000 9000.000, 0.000 8000.000, -100... |
| 4 | -500 | 9500 | NaN | NaN | NaN | NaN | NaN | POLYGON ((0.000 10000.000, 0.000 9000.000, -10... |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 768685 | 655500 | 1215500 | NaN | NaN | NaN | NaN | NaN | POLYGON ((656000.000 1216000.000, 656000.000 1... |
| 768686 | 655500 | 1216500 | NaN | NaN | NaN | NaN | NaN | POLYGON ((656000.000 1217000.000, 656000.000 1... |
| 768687 | 655500 | 1217500 | NaN | NaN | NaN | NaN | NaN | POLYGON ((656000.000 1218000.000, 656000.000 1... |
| 768688 | 655500 | 1218500 | NaN | NaN | NaN | NaN | NaN | POLYGON ((656000.000 1219000.000, 656000.000 1... |
| 768689 | 655500 | 1219500 | NaN | NaN | NaN | NaN | NaN | POLYGON ((656000.000 1220000.000, 656000.000 1... |
768690 rows × 8 columns
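The cap_style=3 argument above is what makes this work: square caps mean that buffering a point by 500 m yields the 1 km × 1 km grid cell centred on it. A tiny check (not in the original notebook):

# Hedged sketch: a square buffer of 500 m around a point is a 1 km x 1 km cell.
from shapely.geometry import Point

cell = Point(0, 0).buffer(500, cap_style=3)
print(cell.bounds)  # (-500.0, -500.0, 500.0, 500.0)
print(cell.area)    # 1000000.0 m^2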
Interpolate the grid to OAs. The AQI is an intensive (density-like) variable, so tobler averages it across each OA, weighting every grid cell by its area of overlap with the OA.
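In other words, an OA covered 75% by a cell with AQI 10 and 25% by a cell with AQI 20 ends up with 12.5. A toy illustration of the weighting (hypothetical numbers, not part of the pipeline):

# Hedged toy example of area-weighted averaging for an intensive variable.
overlaps = np.array([750_000.0, 250_000.0])  # m^2 of overlap with two grid cells
aqi_cells = np.array([10.0, 20.0])           # AQI of those cells
weights = overlaps / overlaps.sum()
print(weights @ aqi_cells)                   # 12.5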
%%time
interp = tobler.area_weighted.area_interpolate(
    pollutants_2021, oa, intensive_variables=["aqi"]
)

/Users/martin/mambaforge/envs/demoland/lib/python3.11/site-packages/pygeos/set_operations.py:129: RuntimeWarning: invalid value encountered in intersection
  return lib.intersection(a, b, **kwargs)
/Users/martin/mambaforge/envs/demoland/lib/python3.11/site-packages/tobler/util/util.py:32: UserWarning: nan values in variable: aqi, replacing with 0
  warn(f"nan values in variable: {column}, replacing with 0")
CPU times: user 17.2 s, sys: 215 ms, total: 17.4 s
Wall time: 17.4 s

oa["air_quality"] = interp.aqi

H.2 House price
Read the same dataset as before.
# It is 6.3GB...
linked_epc_path = "https://reshare.ukdataservice.ac.uk/854942/1/tranall2011_19.csv"
epc = pd.read_csv(linked_epc_path)
epc["dateoftransfer"] = pd.to_datetime(epc.dateoftransfer)
last2years = epc[epc.dateoftransfer > datetime.datetime(2018, 1, 1)]
price_per_oa = last2years[["oa11", "priceper"]].groupby("oa11").mean().reset_index()
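Given the size of the file, reading only the columns that are actually used keeps memory in check; a minimal sketch (column names taken from the code above):

# Hedged sketch: load just the three columns needed for the aggregation.
epc = pd.read_csv(
    linked_epc_path,
    usecols=["oa11", "priceper", "dateoftransfer"],
    parse_dates=["dateoftransfer"],
)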
price_per_oa.to_parquet(f"{data_folder}/processed/house_prices/price_per_oa.parquet")

price_per_oa = pd.read_parquet(
    f"{data_folder}/processed/house_prices/price_per_oa.parquet"
)

price_per_oa

| | oa11 | priceper |
|---|---|---|
| 0 | E00000003 | 15722.758094 |
| 1 | E00000005 | 12587.314452 |
| 2 | E00000007 | 12640.262294 |
| 3 | E00000010 | 9535.653363 |
| 4 | E00000012 | 10341.463415 |
| ... | ... | ... |
| 173553 | W00010260 | 1632.955752 |
| 173554 | W00010261 | 1834.401969 |
| 173555 | W00010262 | 1511.677538 |
| 173556 | W00010263 | 1679.802956 |
| 173557 | W00010264 | 2046.069031 |
173558 rows × 2 columns
oa

| | OA11CD | LAD11CD | GlobalID | geometry | air_quality |
|---|---|---|---|---|---|
| 0 | E00000001 | E09000001 | f8512cce-6727-42cf-9840-6866cdbb2deb | POLYGON ((532303.492 181814.110, 532213.378 18... | 27.501372 |
| 1 | E00000003 | E09000001 | 9eeeb3aa-ce92-4cea-bd70-3a0e680cddc1 | POLYGON ((532180.131 181763.020, 532155.909 18... | 27.501372 |
| 2 | E00000005 | E09000001 | 012372dd-5e03-43c3-a915-b697d02f88e2 | POLYGON ((532124.321 181682.675, 532127.299 18... | 27.501372 |
| 3 | E00000007 | E09000001 | b61eb464-9c5b-4f0e-8f78-000ea938ee78 | POLYGON ((532124.321 181682.675, 532201.292 18... | 27.545142 |
| 4 | E00000010 | E09000001 | efc37450-7064-4f5e-bedd-433bf4de1167 | POLYGON ((532071.302 182159.586, 532127.958 18... | 27.587650 |
| ... | ... | ... | ... | ... | ... |
| 181403 | W00010261 | W06000011 | c1051d13-4fa6-48d8-8f32-5020938ce7d6 | POLYGON ((262156.208 196600.223, 262074.703 19... | 13.828992 |
| 181404 | W00010262 | W06000011 | ea1fef0a-138d-4035-a401-48c459abbca0 | POLYGON ((263241.217 197440.210, 263271.904 19... | 13.670047 |
| 181405 | W00010263 | W06000011 | 38819942-d3db-47e1-a85d-eec4d7a091e4 | POLYGON ((262156.208 196600.223, 262205.269 19... | 14.125627 |
| 181406 | W00010264 | W06000011 | 61fbb0fb-10bf-4000-a669-7114865776ff | POLYGON ((268829.001 198038.000, 268708.179 19... | 14.219191 |
| 181407 | W00010265 | W06000011 | 999facfa-dce5-42bf-82fe-c1f04221661e | POLYGON ((266130.758 192630.558, 265987.755 19... | 14.927127 |
181408 rows × 5 columns
Link to our OA dataset.
oa = oa.merge(price_per_oa, left_on="OA11CD", right_on="oa11", how="left")
oa = oa.drop(columns="oa11").rename(columns={"priceper": "house_price"})
oa[["OA11CD", "geometry", "air_quality", "house_price"]].to_parquet(
f"{data_folder}/processed/oa_data_england.parquet"
)H.3 Explanatory variables
We also want to have all our explanatory variables linked to the OA geometries.
H.3.1 Population estimates
ONS population estimates are reported at the OA level, so they can be merged directly.
Read the file processed in the Urban Grammar project.
pop = pd.read_parquet(f"{data_folder}/processed/population.parquet")

Attribute join.

oa = oa.merge(pop, left_on="OA11CD", right_on="code", how="left")

oa = oa.drop(columns=["code"])

H.3.2 Workplace population
Workplace population is reported on Workplace Zones and needs to be interpolated to OAs. The industry counts are extensive variables: tobler splits each zone's count across OAs in proportion to the share of the zone's area that falls within each OA. We use the pre-processed data from the Urban Grammar project.
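A toy illustration of that allocation (hypothetical numbers, not part of the pipeline):

# Hedged toy example of areal interpolation for an extensive variable (a count).
workers = 120.0        # workplace population of one Workplace Zone
share_in_oa = 2 / 3    # fraction of the zone's area that overlaps a given OA
print(workers * share_in_oa)  # 80.0 workers allocated to that OA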
wp = gpd.read_parquet(
    f"{data_folder}/raw/workplace_population/workplace_by_industry_gb.pq"
)

%%time
wp_interpolated = tobler.area_weighted.area_interpolate(
    wp,
    oa,
    extensive_variables=[
        "A, B, D, E. Agriculture, energy and water",
        "C. Manufacturing",
        "F. Construction",
        "G, I. Distribution, hotels and restaurants",
        "H, J. Transport and communication",
        "K, L, M, N. Financial, real estate, professional and administrative activities",
        "O,P,Q. Public administration, education and health",
        "R, S, T, U. Other",
    ],
)

/Users/martin/mambaforge/envs/demoland/lib/python3.11/site-packages/pygeos/set_operations.py:129: RuntimeWarning: invalid value encountered in intersection
  return lib.intersection(a, b, **kwargs)
CPU times: user 31.5 s, sys: 351 ms, total: 31.9 s
Wall time: 32 s
oa[
    [
        "A, B, D, E. Agriculture, energy and water",
        "C. Manufacturing",
        "F. Construction",
        "G, I. Distribution, hotels and restaurants",
        "H, J. Transport and communication",
        "K, L, M, N. Financial, real estate, professional and administrative activities",
        "O,P,Q. Public administration, education and health",
        "R, S, T, U. Other",
    ]
] = wp_interpolated.drop(columns="geometry").values

oa = oa.drop(columns=["LAD11CD", "GlobalID"])

oa.to_parquet(f"{data_folder}/processed/oa_data_england.parquet")

H.3.3 Land cover (CORINE)
CORINE land cover is shipped as custom polygons; we use the data downloaded for the Urban Grammar project. Interpolating Code_18 as a categorical variable yields, for each OA, the proportion of its area covered by each land cover class.
corine = gpd.read_parquet(f"{data_folder}/raw/land_cover/corine_gb.pq")

%%time
corine_interpolated = tobler.area_weighted.area_interpolate(
    corine, oa, categorical_variables=["Code_18"]
)

/Users/martin/mambaforge/envs/demoland/lib/python3.11/site-packages/pygeos/set_operations.py:129: RuntimeWarning: invalid value encountered in intersection
  return lib.intersection(a, b, **kwargs)
CPU times: user 22min 56s, sys: 5.49 s, total: 23min 1s
Wall time: 23min 4s
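The resulting class proportions should sum to roughly one wherever an OA is fully covered by CORINE polygons; a quick sanity check (not part of the original workflow):

# Hedged sketch: per-OA class shares should be close to 1 where coverage is complete.
shares = corine_interpolated.filter(like="Code_18_").sum(axis=1)
print(shares.describe())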
corine_names = {
    "Code_18_124": "Land cover [Airports]",
    "Code_18_211": "Land cover [Non-irrigated arable land]",
    "Code_18_121": "Land cover [Industrial or commercial units]",
    "Code_18_421": "Land cover [Salt marshes]",
    "Code_18_522": "Land cover [Estuaries]",
    "Code_18_142": "Land cover [Sport and leisure facilities]",
    "Code_18_141": "Land cover [Green urban areas]",
    "Code_18_112": "Land cover [Discontinuous urban fabric]",
    "Code_18_231": "Land cover [Pastures]",
    "Code_18_311": "Land cover [Broad-leaved forest]",
    "Code_18_131": "Land cover [Mineral extraction sites]",
    "Code_18_123": "Land cover [Port areas]",
    "Code_18_122": "Land cover [Road and rail networks and associated land]",
    "Code_18_512": "Land cover [Water bodies]",
    "Code_18_243": "Land cover [Land principally occupied by agriculture, with significant areas of natural vegetation]",
    "Code_18_313": "Land cover [Mixed forest]",
    "Code_18_412": "Land cover [Peat bogs]",
    "Code_18_321": "Land cover [Natural grasslands]",
    "Code_18_322": "Land cover [Moors and heathland]",
    "Code_18_324": "Land cover [Transitional woodland-shrub]",
    "Code_18_111": "Land cover [Continuous urban fabric]",
    "Code_18_423": "Land cover [Intertidal flats]",
    "Code_18_523": "Land cover [Sea and ocean]",
    "Code_18_312": "Land cover [Coniferous forest]",
    "Code_18_133": "Land cover [Construction sites]",
    "Code_18_333": "Land cover [Sparsely vegetated areas]",
    "Code_18_332": "Land cover [Bare rocks]",
    "Code_18_411": "Land cover [Inland marshes]",
    "Code_18_132": "Land cover [Dump sites]",
    "Code_18_222": "Land cover [Fruit trees and berry plantations]",
    "Code_18_242": "Land cover [Complex cultivation patterns]",
    "Code_18_331": "Land cover [Beaches, dunes, sands]",
    "Code_18_511": "Land cover [Water courses]",
    "Code_18_334": "Land cover [Burnt areas]",
    "Code_18_244": "Land cover [Agro-forestry areas]",
    "Code_18_521": "Land cover [Coastal lagoons]",
}
# rename() leaves unmapped columns (like geometry) untouched, whereas
# Index.map would turn them into NaN
corine_interpolated = corine_interpolated.rename(columns=corine_names)

Assign the subset of classes that may be interesting for our purposes to the OA dataframe.
interesting = [
"Land cover [Discontinuous urban fabric]",
"Land cover [Continuous urban fabric]",
"Land cover [Non-irrigated arable land]",
"Land cover [Industrial or commercial units]",
"Land cover [Green urban areas]",
"Land cover [Pastures]",
"Land cover [Sport and leisure facilities]",
]
oa[interesting] = corine_interpolated[interesting].values

oa.to_parquet(f"{data_folder}/processed/oa_data_england.parquet")

H.4 Morphometrics
Read the morphometrics file pre-processed in the Urban Grammar project and join it to the OA dataframe.
oa_morpho = pd.read_parquet(f"{data_folder}/processed/morphometrics_oa.parquet")
oa = oa.merge(oa_morpho, left_on="OA11CD", right_index=True)

oa.to_parquet(f"{data_folder}/processed/oa_data_england.parquet")

H.5 Cleanup
Reload the saved dataset, drop columns we cannot use, and remove incomplete rows.
oa = gpd.read_parquet(f"{data_folder}/processed/oa_data_england.parquet")

to_drop = [
    "sdbPer",
    "ssbElo",
    "stcOri",
    "sdcLAL",
    "mdcAre",
    "ltcAre",
    "ltcWRE",
    "mtdMDi",
    "lcdMes",
    "lddNDe",
    "sddAre",
    "mdsAre",
    "ldsAre",
    "lisCel",
    "ldePer",
    "lseCWA",
]

oa = oa.drop(columns=to_drop)

oa_clean = oa.dropna()
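It is worth checking how many OAs the dropna() discards before committing to the cleaned table; a small check (using only the objects defined above):

# Hedged sketch: report how many OAs are lost to missing values.
n_dropped = len(oa) - len(oa_clean)
print(f"{n_dropped} of {len(oa)} OAs dropped due to missing values")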
Remove the London data, as London is a massive outlier that throws everything off.

london = gpd.read_file(f"{data_folder}/raw/london/OA_2011_London_gen_MHW.shp")

oa_clean = oa_clean[~oa_clean.OA11CD.isin(london.OA11CD)]

oa_clean.set_index("OA11CD").to_parquet(
    f"{data_folder}/processed/oa_data_england.parquet"
)