import dask.dataframe as dd
import pandas as pd
Appendix O — Create summary data for signatures
Measure the median and interquartile range (IQR) of each variable per signature type and store them as Parquet files.
Data available at https://github.com/urbangrammarai/signatures_gb.
# Root folder that the summary Parquet files are written under.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
data_folder = "/Users/martin/Library/CloudStorage/OneDrive-SharedLibraries-TheAlanTuringInstitute/Daniel Arribas-Bel - demoland_data"
Define a key that maps the descriptive column names of the form data to their short codes.
# Maps each descriptive column name in the form data to its short code.
# The keys select the columns to read; the mapping is then passed to
# ``DataFrame.rename(columns=...)``.
form_key = {
    "area of building": "sdbAre",
    "courtyard area of building": "sdbCoA",
    "circular compactness of building": "ssbCCo",
    "corners of building": "ssbCor",
    "squareness of building": "ssbSqu",
    "equivalent rectangular index of building": "ssbERI",
    "centroid - corner mean distance of building": "ssbCCM",
    "centroid - corner distance deviation of building": "ssbCCD",
    "orientation of building": "stbOri",
    "area of ETC": "sdcAre",
    "circular compactness of ETC": "sscCCo",
    "equivalent rectangular index of ETC": "sscERI",
    "covered area ratio of ETC": "sicCAR",
    "cell alignment of building": "stbCeA",
    "alignment of neighbouring buildings": "mtbAli",
    "mean distance between neighbouring buildings": "mtbNDi",
    "perimeter-weighted neighbours of ETC": "mtcWNe",
    "mean inter-building distance": "ltbIBD",
    "width of street profile": "sdsSPW",
    "width deviation of street profile": "sdsSWD",
    "openness of street profile": "sdsSPO",
    "length of street segment": "sdsLen",
    "linearity of street segment": "sssLin",
    "mean segment length within 3 steps": "ldsMSL",
    "node degree of junction": "mtdDeg",
    "local proportion of 3-way intersections of street network": "linP3W",
    "local proportion of 4-way intersections of street network": "linP4W",
    "local proportion of cul-de-sacs of street network": "linPDE",
    "local closeness of street network": "lcnClo",
    "local cul-de-sac length of street network": "ldsCDL",
    "square clustering of street network": "xcnSCl",
    "local degree weighted node density of street network": "linWID",
    "street alignment of building": "stbSAl",
    "area covered by edge-attached ETCs": "sdsAre",
    "buildings per meter of street segment": "sisBpM",
    "reached ETCs by neighbouring segments": "misCel",
    "reached ETCs by tessellation contiguity": "ltcRea",
    "area of enclosure": "ldeAre",
    "circular compactness of enclosure": "lseCCo",
    "equivalent rectangular index of enclosure": "lseERI",
    "orientation of enclosure": "lteOri",
    "perimeter-weighted neighbours of enclosure": "lteWNB",
    "area-weighted ETCs of enclosure": "lieWCe",
}
Read form.
# Read the form table (only ``hindex`` plus the columns named in ``form_key``),
# materialise it as a pandas DataFrame, index it by ``hindex`` and rename the
# descriptive column names to their short codes.
form = (
    dd.read_parquet("signatures_gb/form", columns=["hindex"] + list(form_key))
    .compute()
    .set_index("hindex")
    .rename(columns=form_key)
)
Read signature labels
# Read the hindex -> signature type lookup, materialise it as a pandas
# DataFrame and index it by ``hindex`` so it can be joined on the index.
signature_type = (
    dd.read_parquet(
        "urbangrammar_samba/spatial_signatures/signatures/hindex_to_type"
    )
    .compute()
    .set_index("hindex")
)
Sort and merge via index.
# Sort both frames by their ``hindex`` index, then left-join the ``type``
# label onto the form data by index.
form = form.sort_index()
signature_type = signature_type.sort_index()
form = form.merge(
    signature_type[["type"]], how="left", left_index=True, right_index=True
)
# Preview the first rows (displayed as the cell output in a notebook).
form.head()
hindex | sdbAre | sdbCoA | ssbCCo | ssbCor | ssbSqu | ssbERI | ssbCCM | ssbCCD | stbOri | sdcAre | ... | sisBpM | misCel | ltcRea | ldeAre | lseCCo | lseERI | lteOri | lteWNB | lieWCe | type |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
c000e094707t0000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 296.055450 | ... | 0.073677 | 7.185852 | 33 | 296.05545 | 0.350319 | 0.845480 | 6.651016 | 0.135074 | 0.000516 | Wild countryside |
c000e094763t0000 | 868.66235 | 0.0 | 0.371499 | 10.0 | 1.046911 | 0.700460 | 17.571433 | 7.996062 | 36.659023 | 4968.852718 | ... | 0.006263 | 34.602315 | 55 | 42494.97405 | 0.077278 | 0.682760 | 37.918157 | 0.010778 | 0.000206 | Countryside agriculture |
c000e094763t0001 | 1387.68690 | 0.0 | 0.576501 | 5.0 | 30.424307 | 1.015852 | 26.366200 | 1.672909 | 37.459323 | 23541.033893 | ... | 0.008039 | 7.692777 | 61 | 42494.97405 | 0.077278 | 0.682760 | 37.918157 | 0.010778 | 0.000206 | Countryside agriculture |
c000e094763t0002 | 183.95600 | 0.0 | 0.480131 | 4.0 | 0.501584 | 0.999954 | 11.015825 | 0.044073 | 37.969960 | 13985.087439 | ... | 0.009475 | 86.912080 | 45 | 42494.97405 | 0.077278 | 0.682760 | 37.918157 | 0.010778 | 0.000206 | Countryside agriculture |
c000e094764t0000 | 1990.40485 | 0.0 | 0.640383 | 14.0 | 26.845785 | 0.918766 | 24.646409 | 6.637691 | 39.216097 | 27866.551253 | ... | 0.008404 | 8.999952 | 46 | 41009.22230 | 0.101179 | 0.680202 | 37.948923 | 0.017903 | 0.000007 | Countryside agriculture |
5 rows × 44 columns
Group by a signature type, get quartiles and IQR.
# Per signature type: median, lower and upper quartiles, and the
# interquartile range (q75 - q25) of every form column.
grouper = form.groupby("type")
median = grouper.quantile(0.5)
q25 = grouper.quantile(0.25)
q75 = grouper.quantile(0.75)
iqr = q75 - q25
Properly name types.
# Replacement labels for three signature types; index labels not listed
# here are left unchanged by ``rename``.
renamer = {
    "Warehouse land": "Warehouse/Park land",
    "Hyper distilled urbanity": "Hyper concentrated urbanity",
    "Distilled urbanity": "Concentrated urbanity",
}
median = median.rename(index=renamer)
iqr = iqr.rename(index=renamer)
# Persist the per-type form summaries under ``{data_folder}/sampling``.
median.to_parquet(f"{data_folder}/sampling/median_form.parquet")
iqr.to_parquet(f"{data_folder}/sampling/iqr_form.parquet")
O.1 Non-morphological data
Key:
# Maps the workplace-population column names of the function data to the
# shorter sector labels used in the output.
fn_key = {
    "Workplace population [Agriculture, energy and water]": "A, B, D, E. Agriculture, energy and water",
    "Workplace population [Manufacturing]": "C. Manufacturing",
    "Workplace population [Construction]": "F. Construction",
    "Workplace population [Distribution, hotels and restaurants]": "G, I. Distribution, hotels and restaurants",
    "Workplace population [Transport and communication]": "H, J. Transport and communication",
    "Workplace population [Financial, real estate, professional and administrative activities]": "K, L, M, N. Financial, real estate, professional and administrative activities",
    "Workplace population [Public administration, education and health]": "O,P,Q. Public administration, education and health",
    "Workplace population [Other]": "R, S, T, U. Other",
}
Read and merge labels (and areas).
# Read the function table (``hindex``, population, selected land-cover
# columns and the workplace-population columns named in ``fn_key``),
# materialise it as a pandas DataFrame and index it by ``hindex``.
function = (
    dd.read_parquet(
        "signatures_gb/function",
        columns=[
            "hindex",
            "Population",
            "Land cover [Non-irrigated arable land]",
            "Land cover [Industrial or commercial units]",
            "Land cover [Sport and leisure facilities]",
            "Land cover [Green urban areas]",
            "Land cover [Discontinuous urban fabric]",
            "Land cover [Pastures]",
            "Land cover [Continuous urban fabric]",
        ]
        + list(fn_key),
    )
    .compute()
    .set_index("hindex")
)
# Left-join the signature ``type`` label and the ``sdcAre`` column from the
# form data, both by index.
function = function.merge(
    signature_type[["type"]], how="left", left_index=True, right_index=True
).merge(form[["sdcAre"]], how="left", left_index=True, right_index=True)
# Replace the workplace-population column names with the sector labels.
function = function.rename(columns=fn_key)
Normalize subset by area.
# Columns that are divided row-wise by ``sdcAre``; the land-cover columns
# are not in this list and stay as read.
subset = [
    "Population",
    "A, B, D, E. Agriculture, energy and water",
    "C. Manufacturing",
    "F. Construction",
    "G, I. Distribution, hotels and restaurants",
    "H, J. Transport and communication",
    "K, L, M, N. Financial, real estate, professional and administrative activities",
    "O,P,Q. Public administration, education and health",
    "R, S, T, U. Other",
]
# axis=0 aligns the divisor on the row index, so every row is divided by
# its own ``sdcAre`` value.
function[subset] = function[subset].divide(function.sdcAre, axis=0)
Group by type, get quartiles and IQR.
# Per signature type: median, lower and upper quartiles, and the
# interquartile range (q75 - q25) of every function column.
grouper = function.groupby("type")
median = grouper.quantile(0.5)
q25 = grouper.quantile(0.25)
q75 = grouper.quantile(0.75)
iqr = q75 - q25
# Apply the same type-label replacements as for the form summaries.
median = median.rename(index=renamer)
iqr = iqr.rename(index=renamer)
Save.
# ``sdcAre`` was merged in only as the divisor for the normalisation, so it
# is dropped before the function summaries are written.
median.drop(columns="sdcAre").to_parquet(
    f"{data_folder}/sampling/median_function.parquet"
)
iqr.drop(columns="sdcAre").to_parquet(f"{data_folder}/sampling/iqr_function.parquet")