Appendix O — Create summary data for signatures

Compute the median and interquartile range (IQR) of each variable per signature type and store them as Parquet files.

Data available at https://github.com/urbangrammarai/signatures_gb.

import dask.dataframe as dd
import pandas as pd
# Root folder for the outputs; the summary tables below are written to its
# "sampling" subfolder.
# NOTE(review): this is a machine-specific absolute path — adjust it before
# running the notebook elsewhere.
data_folder = "/Users/martin/Library/CloudStorage/OneDrive-SharedLibraries-TheAlanTuringInstitute/Daniel Arribas-Bel - demoland_data"

Define a key that maps the descriptive column names in the data to their short variable codes.

# Maps the descriptive column names of the form dataset to short character
# codes. The keys select which columns are read (together with "hindex") and
# the values become the column names once the data is loaded, so the order of
# the entries is also the order of the resulting columns.
form_key = {
    "area of building": "sdbAre",
    "courtyard area of building": "sdbCoA",
    "circular compactness of building": "ssbCCo",
    "corners of building": "ssbCor",
    "squareness of building": "ssbSqu",
    "equivalent rectangular index of building": "ssbERI",
    "centroid - corner mean distance of building": "ssbCCM",
    "centroid - corner distance deviation of building": "ssbCCD",
    "orientation of building": "stbOri",
    "area of ETC": "sdcAre",
    "circular compactness of ETC": "sscCCo",
    "equivalent rectangular index of ETC": "sscERI",
    "covered area ratio of ETC": "sicCAR",
    "cell alignment of building": "stbCeA",
    "alignment of neighbouring buildings": "mtbAli",
    "mean distance between neighbouring buildings": "mtbNDi",
    "perimeter-weighted neighbours of ETC": "mtcWNe",
    "mean inter-building distance": "ltbIBD",
    "width of street profile": "sdsSPW",
    "width deviation of street profile": "sdsSWD",
    "openness of street profile": "sdsSPO",
    "length of street segment": "sdsLen",
    "linearity of street segment": "sssLin",
    "mean segment length within 3 steps": "ldsMSL",
    "node degree of junction": "mtdDeg",
    "local proportion of 3-way intersections of street network": "linP3W",
    "local proportion of 4-way intersections of street network": "linP4W",
    "local proportion of cul-de-sacs of street network": "linPDE",
    "local closeness of street network": "lcnClo",
    "local cul-de-sac length of street network": "ldsCDL",
    "square clustering of street network": "xcnSCl",
    "local degree weighted node density of street network": "linWID",
    "street alignment of building": "stbSAl",
    "area covered by edge-attached ETCs": "sdsAre",
    "buildings per meter of street segment": "sisBpM",
    "reached ETCs by neighbouring segments": "misCel",
    "reached ETCs by tessellation contiguity": "ltcRea",
    "area of enclosure": "ldeAre",
    "circular compactness of enclosure": "lseCCo",
    "equivalent rectangular index of enclosure": "lseERI",
    "orientation of enclosure": "lteOri",
    "perimeter-weighted neighbours of enclosure": "lteWNB",
    "area-weighted ETCs of enclosure": "lieWCe",
}

Read the form data, keeping only the keyed columns and renaming them to their short codes.

# Read only the "hindex" column plus the variables listed in `form_key`,
# materialise the dask frame in memory, then index it by "hindex" and switch
# the column names to their short codes.
form_columns = ["hindex", *form_key]
form = dd.read_parquet("signatures_gb/form", columns=form_columns).compute()
form = form.set_index("hindex").rename(columns=form_key)

Read the signature type labels.

# Load the table of signature labels into memory and index it by "hindex" so
# it can be aligned with the other tables on that index.
signature_type = dd.read_parquet(
    "urbangrammar_samba/spatial_signatures/signatures/hindex_to_type"
).compute()
signature_type = signature_type.set_index("hindex")

Sort and merge via index.

# Order both tables by their shared "hindex" index before combining them.
signature_type = signature_type.sort_index()
form = form.sort_index()

# Index-on-index left join: every row of `form` is kept and gains the "type"
# label of its signature.
form = form.join(signature_type[["type"]], how="left")
form.head()
sdbAre sdbCoA ssbCCo ssbCor ssbSqu ssbERI ssbCCM ssbCCD stbOri sdcAre ... sisBpM misCel ltcRea ldeAre lseCCo lseERI lteOri lteWNB lieWCe type
hindex
c000e094707t0000 NaN NaN NaN NaN NaN NaN NaN NaN NaN 296.055450 ... 0.073677 7.185852 33 296.05545 0.350319 0.845480 6.651016 0.135074 0.000516 Wild countryside
c000e094763t0000 868.66235 0.0 0.371499 10.0 1.046911 0.700460 17.571433 7.996062 36.659023 4968.852718 ... 0.006263 34.602315 55 42494.97405 0.077278 0.682760 37.918157 0.010778 0.000206 Countryside agriculture
c000e094763t0001 1387.68690 0.0 0.576501 5.0 30.424307 1.015852 26.366200 1.672909 37.459323 23541.033893 ... 0.008039 7.692777 61 42494.97405 0.077278 0.682760 37.918157 0.010778 0.000206 Countryside agriculture
c000e094763t0002 183.95600 0.0 0.480131 4.0 0.501584 0.999954 11.015825 0.044073 37.969960 13985.087439 ... 0.009475 86.912080 45 42494.97405 0.077278 0.682760 37.918157 0.010778 0.000206 Countryside agriculture
c000e094764t0000 1990.40485 0.0 0.640383 14.0 26.845785 0.918766 24.646409 6.637691 39.216097 27866.551253 ... 0.008404 8.999952 46 41009.22230 0.101179 0.680202 37.948923 0.017903 0.000007 Countryside agriculture

5 rows × 44 columns

Group by a signature type, get quartiles and IQR.

# Per-type quartiles of every form variable; the IQR is the spread between
# the upper and lower quartile.
grouper = form.groupby("type")
q25, median, q75 = (grouper.quantile(level) for level in (0.25, 0.5, 0.75))
iqr = q75 - q25

Rename the signature types to their final names and save the summaries.

# Replacement labels for signature types, applied to the index of the summary
# tables before they are saved. Only the types listed here are relabelled.
renamer = {
    "Warehouse land": "Warehouse/Park land",
    "Hyper distilled urbanity": "Hyper concentrated urbanity",
    "Distilled urbanity": "Concentrated urbanity",
}
# Relabel the index of both summary tables, then write each one to the
# "sampling" folder under `data_folder`.
median, iqr = (table.rename(index=renamer) for table in (median, iqr))
median.to_parquet(f"{data_folder}/sampling/median_form.parquet")
iqr.to_parquet(f"{data_folder}/sampling/iqr_form.parquet")

O.1 Non-morphological data

Key mapping the workplace population column names to their output labels:

# Maps the workplace population column names of the function dataset to the
# labels used in the outputs. The keys are appended to the list of columns
# read from the function dataset and the values replace them as column names.
# NOTE(review): "O,P,Q." is written without the spaces used in the other
# prefixes; it is a runtime label, so it is left untouched here.
fn_key = {
    "Workplace population [Agriculture, energy and water]": "A, B, D, E. Agriculture, energy and water",
    "Workplace population [Manufacturing]": "C. Manufacturing",
    "Workplace population [Construction]": "F. Construction",
    "Workplace population [Distribution, hotels and restaurants]": "G, I. Distribution, hotels and restaurants",
    "Workplace population [Transport and communication]": "H, J. Transport and communication",
    "Workplace population [Financial, real estate, professional and administrative activities]": "K, L, M, N. Financial, real estate, professional and administrative activities",
    "Workplace population [Public administration, education and health]": "O,P,Q. Public administration, education and health",
    "Workplace population [Other]": "R, S, T, U. Other",
}

Read the function data and merge in the signature type labels and the ETC areas.

# Columns to load: the index column, population, the selected land cover
# classes and every workplace population column listed in `fn_key`.
function_columns = [
    "hindex",
    "Population",
    "Land cover [Non-irrigated arable land]",
    "Land cover [Industrial or commercial units]",
    "Land cover [Sport and leisure facilities]",
    "Land cover [Green urban areas]",
    "Land cover [Discontinuous urban fabric]",
    "Land cover [Pastures]",
    "Land cover [Continuous urban fabric]",
    *fn_key,
]

# Materialise the dask frame in memory and index it by "hindex".
function = dd.read_parquet(
    "signatures_gb/function", columns=function_columns
).compute()
function = function.set_index("hindex")

# Index-on-index left joins: attach the signature "type" label and the
# "sdcAre" area column to every row, then switch the workplace population
# columns to their output labels.
function = (
    function.join(signature_type[["type"]], how="left")
    .join(form[["sdcAre"]], how="left")
    .rename(columns=fn_key)
)

Normalize the population and workplace population columns by ETC area.

# Columns expressed as counts: "Population" followed by the relabelled
# workplace population columns, in the order they appear in `fn_key`.
subset = ["Population", *fn_key.values()]

# Divide each of those columns row-wise by the "sdcAre" area column.
function[subset] = function[subset].div(function["sdcAre"], axis="index")

Group by type, get quartiles and IQR.

# Per-type quartiles of every function variable.
grouper = function.groupby("type")
q25 = grouper.quantile(0.25)
q75 = grouper.quantile(0.75)

# The median and the IQR (upper minus lower quartile) are relabelled with the
# final signature type names.
median = grouper.quantile(0.5).rename(index=renamer)
iqr = (q75 - q25).rename(index=renamer)

Save.

# "sdcAre" was only needed for the normalisation, so it is dropped from both
# summary tables before they are written to the "sampling" folder.
median_out = median.drop(columns="sdcAre")
iqr_out = iqr.drop(columns="sdcAre")
median_out.to_parquet(f"{data_folder}/sampling/median_function.parquet")
iqr_out.to_parquet(f"{data_folder}/sampling/iqr_function.parquet")