Appendix O — Create summary data for signatures

Compute the median and interquartile range (IQR) of each variable per signature type and store them as Parquet files.

Data available at https://github.com/urbangrammarai/signatures_gb.

import os

import dask.dataframe as dd
import pandas as pd

# Root folder of the shared demoland data. The default is a user-specific
# local OneDrive path; set the DEMOLAND_DATA_FOLDER environment variable to
# run this on another machine without editing the code.
data_folder = os.environ.get(
    "DEMOLAND_DATA_FOLDER",
    "/Users/martin/Library/CloudStorage/OneDrive-SharedLibraries-TheAlanTuringInstitute/Daniel Arribas-Bel - demoland_data",
)

Define a key to match data.

# Maps each descriptive character name (used as the column names requested
# from the "signatures_gb/form" dataset) to the short code the column is
# renamed to after loading. Insertion order matters: `list(form_key)` is the
# order in which the columns are requested.
form_key = {
    "area of building": "sdbAre",
    "courtyard area of building": "sdbCoA",
    "circular compactness of building": "ssbCCo",
    "corners of building": "ssbCor",
    "squareness of building": "ssbSqu",
    "equivalent rectangular index of building": "ssbERI",
    "centroid - corner mean distance of building": "ssbCCM",
    "centroid - corner distance deviation of building": "ssbCCD",
    "orientation of building": "stbOri",
    "area of ETC": "sdcAre",
    "circular compactness of ETC": "sscCCo",
    "equivalent rectangular index of ETC": "sscERI",
    "covered area ratio of ETC": "sicCAR",
    "cell alignment of building": "stbCeA",
    "alignment of neighbouring buildings": "mtbAli",
    "mean distance between neighbouring buildings": "mtbNDi",
    "perimeter-weighted neighbours of ETC": "mtcWNe",
    "mean inter-building distance": "ltbIBD",
    "width of street profile": "sdsSPW",
    "width deviation of street profile": "sdsSWD",
    "openness of street profile": "sdsSPO",
    "length of street segment": "sdsLen",
    "linearity of street segment": "sssLin",
    "mean segment length within 3 steps": "ldsMSL",
    "node degree of junction": "mtdDeg",
    "local proportion of 3-way intersections of street network": "linP3W",
    "local proportion of 4-way intersections of street network": "linP4W",
    "local proportion of cul-de-sacs of street network": "linPDE",
    "local closeness of street network": "lcnClo",
    "local cul-de-sac length of street network": "ldsCDL",
    "square clustering of street network": "xcnSCl",
    "local degree weighted node density of street network": "linWID",
    "street alignment of building": "stbSAl",
    "area covered by edge-attached ETCs": "sdsAre",
    "buildings per meter of street segment": "sisBpM",
    "reached ETCs by neighbouring segments": "misCel",
    "reached ETCs by tessellation contiguity": "ltcRea",
    "area of enclosure": "ldeAre",
    "circular compactness of enclosure": "lseCCo",
    "equivalent rectangular index of enclosure": "lseERI",
    "orientation of enclosure": "lteOri",
    "perimeter-weighted neighbours of enclosure": "lteWNB",
    "area-weighted ETCs of enclosure": "lieWCe",
}

Read form.

# Request the identifier plus every column named in the key, materialise the
# dask frame as a pandas one, index it by "hindex" and switch the column
# names to their short codes.
form = dd.read_parquet("signatures_gb/form", columns=["hindex", *form_key])
form = form.compute()
form = form.set_index("hindex")
form = form.rename(columns=form_key)

Read signature labels.

# Load the "hindex_to_type" table into pandas and index it by "hindex" so it
# can be matched against the other tables on their index.
signature_type = dd.read_parquet(
    "urbangrammar_samba/spatial_signatures/signatures/hindex_to_type"
)
signature_type = signature_type.compute().set_index("hindex")

Sort and merge via index.

# Order both tables by their index before combining them.
form = form.sort_index()
signature_type = signature_type.sort_index()

# Left join on the index: every row of `form` is kept and receives the
# "type" label where a matching index value exists.
form = pd.merge(
    form,
    signature_type[["type"]],
    how="left",
    left_index=True,
    right_index=True,
)

form.head()

	sdbAre	sdbCoA	ssbCCo	ssbCor	ssbSqu	ssbERI	ssbCCM	ssbCCD	stbOri	sdcAre	...	sisBpM	misCel	ltcRea	ldeAre	lseCCo	lseERI	lteOri	lteWNB	lieWCe	type
hindex
c000e094707t0000	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	296.055450	...	0.073677	7.185852	33	296.05545	0.350319	0.845480	6.651016	0.135074	0.000516	Wild countryside
c000e094763t0000	868.66235	0.0	0.371499	10.0	1.046911	0.700460	17.571433	7.996062	36.659023	4968.852718	...	0.006263	34.602315	55	42494.97405	0.077278	0.682760	37.918157	0.010778	0.000206	Countryside agriculture
c000e094763t0001	1387.68690	0.0	0.576501	5.0	30.424307	1.015852	26.366200	1.672909	37.459323	23541.033893	...	0.008039	7.692777	61	42494.97405	0.077278	0.682760	37.918157	0.010778	0.000206	Countryside agriculture
c000e094763t0002	183.95600	0.0	0.480131	4.0	0.501584	0.999954	11.015825	0.044073	37.969960	13985.087439	...	0.009475	86.912080	45	42494.97405	0.077278	0.682760	37.918157	0.010778	0.000206	Countryside agriculture
c000e094764t0000	1990.40485	0.0	0.640383	14.0	26.845785	0.918766	24.646409	6.637691	39.216097	27866.551253	...	0.008404	8.999952	46	41009.22230	0.101179	0.680202	37.948923	0.017903	0.000007	Countryside agriculture

5 rows × 44 columns

Group by signature type and compute the median, the quartiles and the IQR.

# A single groupby object is reused for all three quantiles.
grouper = form.groupby("type")

# Median and both quartiles per signature type; the IQR is the difference
# between the upper and the lower quartile.
median, q25, q75 = (grouper.quantile(q) for q in (0.5, 0.25, 0.75))
iqr = q75.sub(q25)

Properly name types.

# Index labels to replace in the summary tables; a type that is not listed
# here keeps its current name.
renamer = {
    "Warehouse land": "Warehouse/Park land",
    "Hyper distilled urbanity": "Hyper concentrated urbanity",
    "Distilled urbanity": "Concentrated urbanity",
}
median, iqr = (table.rename(index=renamer) for table in (median, iqr))

# Write both form summaries into the "sampling" folder of the data directory.
median.to_parquet(f"{data_folder}/sampling/median_form.parquet")
iqr.to_parquet(f"{data_folder}/sampling/iqr_form.parquet")

O.1 Non-morphological data

Define a key to rename the workplace population columns.

# Maps the workplace population column names requested from the
# "signatures_gb/function" dataset to the labels those columns are renamed to
# after loading. Insertion order matters: `list(fn_key)` is appended to the
# list of requested columns in this order.
# NOTE(review): the letter prefixes look like industry section codes
# (presumably SIC sections) - confirm against the data source.
# NOTE(review): "O,P,Q." is written without spaces, unlike the other labels.
# It is left untouched because the label becomes a column name in the saved
# output and is repeated verbatim where the columns are normalised.
fn_key = {
    "Workplace population [Agriculture, energy and water]": "A, B, D, E. Agriculture, energy and water",
    "Workplace population [Manufacturing]": "C. Manufacturing",
    "Workplace population [Construction]": "F. Construction",
    "Workplace population [Distribution, hotels and restaurants]": "G, I. Distribution, hotels and restaurants",
    "Workplace population [Transport and communication]": "H, J. Transport and communication",
    "Workplace population [Financial, real estate, professional and administrative activities]": "K, L, M, N. Financial, real estate, professional and administrative activities",
    "Workplace population [Public administration, education and health]": "O,P,Q. Public administration, education and health",
    "Workplace population [Other]": "R, S, T, U. Other",
}

Read and merge labels (and areas).

# Request the identifier, the population, the selected land cover classes and
# every workplace population column named in the key; then materialise the
# dask frame as a pandas one and index it by "hindex".
function = dd.read_parquet(
    "signatures_gb/function",
    columns=[
        "hindex",
        "Population",
        "Land cover [Non-irrigated arable land]",
        "Land cover [Industrial or commercial units]",
        "Land cover [Sport and leisure facilities]",
        "Land cover [Green urban areas]",
        "Land cover [Discontinuous urban fabric]",
        "Land cover [Pastures]",
        "Land cover [Continuous urban fabric]",
        *fn_key,
    ],
)
function = function.compute().set_index("hindex")

# Attach the signature label to every row (left join on the index).
function = function.merge(
    signature_type[["type"]], how="left", left_index=True, right_index=True
)
# Attach the area of the ETC ("sdcAre") taken from the form table, again as a
# left join on the index.
function = function.merge(
    form[["sdcAre"]], how="left", left_index=True, right_index=True
)
# Switch the workplace population columns to the labels defined in the key.
function = function.rename(columns=fn_key)

Normalize the population and workplace population columns by the area of the ETC (`sdcAre`).

# Columns to normalise by area: the population plus every workplace
# population column. The latter are taken from the values of `fn_key` (the
# names those columns carry after the rename) rather than from a second
# hand-typed copy of the same eight labels, so the two cannot drift apart.
# Dicts preserve insertion order, so the column order is unchanged.
subset = ["Population", *fn_key.values()]

# Divide each selected column by the area of the ETC ("sdcAre"), aligning the
# divisor with the rows on the index.
function[subset] = function[subset].divide(function["sdcAre"], axis=0)

Group by signature type and compute the median, the quartiles and the IQR.

grouper = function.groupby("type")

# Median, lower and upper quartile per signature type; the IQR is the upper
# quartile minus the lower one.
median, q25, q75 = (grouper.quantile(q) for q in (0.5, 0.25, 0.75))
iqr = q75.sub(q25)

# Replace the index labels listed in `renamer` in both summaries.
median, iqr = (table.rename(index=renamer) for table in (median, iqr))

Save.

# "sdcAre" served as the divisor for the area normalisation and is left out
# of the saved summaries.
median.drop("sdcAre", axis="columns").to_parquet(
    f"{data_folder}/sampling/median_function.parquet"
)
iqr.drop("sdcAre", axis="columns").to_parquet(
    f"{data_folder}/sampling/iqr_function.parquet"
)