import pandas as pd
import geopandas as gpd
import seaborn as sns
import numpy as np
Appendix G — Filter collinear explanatory variables
Some explanatory variables, primarily those coming from morphometrics, may be collinear within our limited area of interest. Those shall be removed prior to modelling.
This notebook identifies correlations between variables and drops those that are correlated and less interpretable.
Specify a path to the data folder.
# Root of the project data folder; every path below is built from it.
# NOTE: this is an absolute, machine-specific path - adjust it to your setup.
data_folder = "/Users/martin/Library/CloudStorage/OneDrive-SharedLibraries-TheAlanTuringInstitute/Daniel Arribas-Bel - demoland_data"
Load the data
# Read the interpolated table from the processed data folder.
data = gpd.read_parquet(f"{data_folder}/processed/interpolated/all_oa.parquet")
Filter only explanatory variables.
# Keep only the explanatory variables: drop the identifier, the geometry
# and the three index columns.
exvars = data.drop(
    columns=[
        "geo_code",
        "geometry",
        "air_quality_index",
        "house_price_index",
        "jobs_accessibility_index",
    ]
)
Measure Pearson’s and Spearman’s Rank correlations.
# Absolute correlation matrices: the sign is irrelevant for detecting
# collinearity, only the strength of the relationship matters.
pearson = exvars.corr().abs()
spearman = exvars.corr("spearman").abs()
Zero out the upper triangle to keep each pair only once.
# np.tri yields ones on and below the diagonal and zeros above it, so the
# multiplication zeroes the upper triangle of the symmetric matrix.
pearson *= np.tri(*pearson.shape)
Remove self.
# Zero the diagonal in place so that a variable is never paired with itself.
# NOTE(review): this writes through `.values`, which needs a writable view of
# the frame's data - confirm it still works if pandas Copy-on-Write is enabled.
np.fill_diagonal(pearson.values, 0)
Unstack to get pairs.
# Reshape the matrix into a Series indexed by (variable, variable) pairs,
# then display the pairs whose absolute correlation exceeds 0.8.
pearson_pairs = pearson.unstack()
pearson_pairs[pearson_pairs > 0.8]
sdbAre sdbPer 0.804675
sdbPer ssbCor 0.831300
ssbCCo ssbElo 0.881746
stbOri stcOri 0.861596
sdcLAL sdcAre 0.897443
mtbNDi 0.925586
mdcAre 0.894625
sddAre 0.805920
ltcAre 0.821140
sdcAre mtbNDi 0.905940
mdcAre 0.946279
sddAre 0.875737
mdsAre 0.806880
mtbNDi mdcAre 0.922637
sddAre 0.841932
ltcAre 0.844478
mdcAre sddAre 0.886339
sdsAre 0.848497
mdsAre 0.853500
ltcAre 0.922004
ltcWRE lcnClo 0.852001
ltbIBD ltcAre 0.867520
sdsSPW sdsSPO 0.840288
sdsLen mtdMDi 0.937603
sdsAre 0.859295
lcdMes linPDE 0.862986
mtdMDi sddAre 0.805111
sdsAre 0.849652
mdsAre 0.811618
lddNDe linWID 0.944004
sddAre sdsAre 0.925253
mdsAre 0.889015
ltcAre 0.813191
sdsAre mdsAre 0.955687
ldsAre 0.883652
ltcAre 0.829096
misCel lisCel 0.869592
mdsAre ldsAre 0.956400
ltcAre 0.837981
ldeAre ldePer 0.872630
ldePer lseCWA 0.964556
dtype: float64
The same with Spearman
# Same procedure as for Pearson: zero the upper triangle, zero the diagonal,
# unstack into pairs and display those above 0.8.
spearman *= np.tri(*spearman.shape)
# NOTE(review): this writes through `.values`, which needs a writable view of
# the frame's data - confirm it still works if pandas Copy-on-Write is enabled.
np.fill_diagonal(spearman.values, 0)
spearman_pairs = spearman.unstack()
spearman_pairs[spearman_pairs > 0.8]
G, I. Distribution, hotels and restaurants K, L, M, N. Financial, real estate, professional and administrative activities 0.819823
R, S, T, U. Other 0.811958
sdbAre sdbPer 0.961472
ssbCCM 0.889109
sdbPer ssbCCM 0.951864
ssbCCo ssbElo 0.871556
ssbCor ssbCCD 0.901580
stbOri stcOri 0.858329
sdcLAL sdcAre 0.960952
mtcWNe 0.918613
mdcAre 0.928993
ltcRea 0.832384
ltcAre 0.830742
sdcAre mtcWNe 0.873712
mdcAre 0.957952
ltcRea 0.822674
ltcAre 0.841574
mtcWNe mdcAre 0.845915
mdcAre ltbIBD 0.822638
ltcRea 0.828665
ltcAre 0.933400
ltcWRE lcnClo 0.815142
ltbIBD ltcAre 0.862608
sdsSPW sdsSPO 0.821366
sdsLen mtdMDi 0.925780
lcdMes linPDE 0.890219
linPDE ldsCDL 0.810251
mtdMDi sddAre 0.823618
lddNDe linWID 0.893646
sddAre sdsAre 0.905564
mdsAre 0.859780
sdsAre mdsAre 0.946564
ldsAre 0.809841
misCel lisCel 0.909656
mdsAre ldsAre 0.916757
ldeAre ldePer 0.971434
lseCWA 0.899233
ldePer lseCWA 0.965517
lseERI lseCWA 0.812116
dtype: float64
Combine highly correlated pairs from both. We want to eliminate those with both indices above .8.
# Start from the pairs with Pearson > 0.8, attach the Spearman values that
# are > 0.8 (aligned on the pair index), and drop the rows where the Spearman
# value is missing, i.e. keep only pairs that exceed 0.8 on both measures.
high_both = (
    pearson_pairs[pearson_pairs > 0.8]
    .to_frame("pearson")
    .assign(spearman=spearman_pairs[spearman_pairs > 0.8])
    .dropna()
)
high_both
pearson | spearman | ||
---|---|---|---|
sdbAre | sdbPer | 0.804675 | 0.961472 |
ssbCCo | ssbElo | 0.881746 | 0.871556 |
stbOri | stcOri | 0.861596 | 0.858329 |
sdcLAL | sdcAre | 0.897443 | 0.960952 |
mdcAre | 0.894625 | 0.928993 | |
ltcAre | 0.821140 | 0.830742 | |
sdcAre | mdcAre | 0.946279 | 0.957952 |
mdcAre | ltcAre | 0.922004 | 0.933400 |
ltcWRE | lcnClo | 0.852001 | 0.815142 |
ltbIBD | ltcAre | 0.867520 | 0.862608 |
sdsSPW | sdsSPO | 0.840288 | 0.821366 |
sdsLen | mtdMDi | 0.937603 | 0.925780 |
lcdMes | linPDE | 0.862986 | 0.890219 |
mtdMDi | sddAre | 0.805111 | 0.823618 |
lddNDe | linWID | 0.944004 | 0.893646 |
sddAre | sdsAre | 0.925253 | 0.905564 |
mdsAre | 0.889015 | 0.859780 | |
sdsAre | mdsAre | 0.955687 | 0.946564 |
ldsAre | 0.883652 | 0.809841 | |
misCel | lisCel | 0.869592 | 0.909656 |
mdsAre | ldsAre | 0.956400 | 0.916757 |
ldeAre | ldePer | 0.872630 | 0.971434 |
ldePer | lseCWA | 0.964556 | 0.965517 |
Define variables to be dropped.
# Variables to remove from the explanatory set.
to_drop = [
    "sdbPer",
    "ssbElo",
    "stcOri",
    "sdcLAL",
    "mdcAre",
    "ltcAre",
    "ltcWRE",
    "mtdMDi",
    "lcdMes",
    "lddNDe",
    "sddAre",
    "mdsAre",
    "ldsAre",
    "lisCel",
    "ldePer",
    "lseCWA",
]
Check the result.
# Recompute both absolute correlation matrices without the dropped variables.
pearson_check = exvars.drop(columns=to_drop).corr().abs()
spearman_check = exvars.drop(columns=to_drop).corr("spearman").abs()

# Zero the upper triangle and the diagonal, then unstack into pairs.
# NOTE(review): fill_diagonal writes through `.values`, which needs a writable
# view of the frame's data - confirm it works if Copy-on-Write is enabled.
pearson_check *= np.tri(*pearson_check.shape)
np.fill_diagonal(pearson_check.values, 0)
pearson_check_pairs = pearson_check.unstack()

spearman_check *= np.tri(*spearman_check.shape)
np.fill_diagonal(spearman_check.values, 0)
spearman_check_pairs = spearman_check.unstack()

# Pairs that still exceed 0.8 on both Pearson and Spearman.
high_both_check = (
    pearson_check_pairs[pearson_check_pairs > 0.8]
    .to_frame("pearson")
    .assign(spearman=spearman_check_pairs[spearman_check_pairs > 0.8])
    .dropna()
)
high_both_check
pearson | spearman | ||
---|---|---|---|
sdsSPW | sdsSPO | 0.840288 | 0.821366 |
The street profile width - street profile openness pair is kept as there is not necessarily a logical (only empirical) relation between the two.
Check remaining high-correlation pairs if we consider only a single index.
# Remaining pairs above 0.8 on Pearson alone.
pearson_check_pairs[pearson_check_pairs > 0.8]
sdcAre mtbNDi 0.905940
sdsSPW sdsSPO 0.840288
sdsLen sdsAre 0.859295
dtype: float64
# Remaining pairs above 0.8 on Spearman alone.
spearman_check_pairs[spearman_check_pairs > 0.8]
G, I. Distribution, hotels and restaurants K, L, M, N. Financial, real estate, professional and administrative activities 0.819823
R, S, T, U. Other 0.811958
sdbAre ssbCCM 0.889109
ssbCor ssbCCD 0.901580
sdcAre mtcWNe 0.873712
ltcRea 0.822674
sdsSPW sdsSPO 0.821366
linPDE ldsCDL 0.810251
dtype: float64
We assume that these can stay in the dataset.
Drop the collinear variables from original data.
# Remove the selected variables from the full table (not just from exvars).
data = data.drop(columns=to_drop)
Save to file.
# NOTE: this is the same path the data was loaded from, so the input file is
# overwritten in place with the reduced table.
data.to_parquet(f"{data_folder}/processed/interpolated/all_oa.parquet")
Get a table of all.
Key to names of morphometric characters:
# Mapping from morphometric character codes to human-readable names.
key = {
    "sdbAre": "area of building",
    "sdbPer": "perimeter of building",
    "sdbCoA": "courtyard area of building",
    "ssbCCo": "circular compactness of building",
    "ssbCor": "corners of building",
    "ssbSqu": "squareness of building",
    "ssbERI": "equivalent rectangular index of building",
    "ssbElo": "elongation of building",
    "ssbCCM": "centroid - corner mean distance of building",
    "ssbCCD": "centroid - corner distance deviation of building",
    "stbOri": "orientation of building",
    "sdcLAL": "longest axis length of ETC",
    "sdcAre": "area of ETC",
    "sscCCo": "circular compactness of ETC",
    "sscERI": "equivalent rectangular index of ETC",
    "stcOri": "orientation of ETC",
    "sicCAR": "covered area ratio of ETC",
    "stbCeA": "cell alignment of building",
    "mtbAli": "alignment of neighbouring buildings",
    "mtbNDi": "mean distance between neighbouring buildings",
    "mtcWNe": "perimeter-weighted neighbours of ETC",
    "mdcAre": "area covered by neighbouring cells",
    "ltcWRE": "weighted reached enclosures of ETC",
    "ltbIBD": "mean inter-building distance",
    "sdsSPW": "width of street profile",
    "sdsSWD": "width deviation of street profile",
    "sdsSPO": "openness of street profile",
    "sdsLen": "length of street segment",
    "sssLin": "linearity of street segment",
    "ldsMSL": "mean segment length within 3 steps",
    "mtdDeg": "node degree of junction",
    "lcdMes": "local meshedness of street network",
    "linP3W": "local proportion of 3-way intersections of street network",
    "linP4W": "local proportion of 4-way intersections of street network",
    "linPDE": "local proportion of cul-de-sacs of street network",
    "lcnClo": "local closeness of street network",
    "ldsCDL": "local cul-de-sac length of street network",
    "xcnSCl": "square clustering of street network",
    "mtdMDi": "mean distance to neighbouring nodes of street network",
    "lddNDe": "local node density of street network",
    "linWID": "local degree weighted node density of street network",
    "stbSAl": "street alignment of building",
    "sddAre": "area covered by node-attached ETCs",
    "sdsAre": "area covered by edge-attached ETCs",
    "sisBpM": "buildings per meter of street segment",
    "misCel": "reached ETCs by neighbouring segments",
    "mdsAre": "reached area by neighbouring segments",
    "lisCel": "reached ETCs by local street network",
    "ldsAre": "reached area by local street network",
    "ltcRea": "reached ETCs by tessellation contiguity",
    "ltcAre": "reached area by tessellation contiguity",
    "ldeAre": "area of enclosure",
    "ldePer": "perimeter of enclosure",
    "lseCCo": "circular compactness of enclosure",
    "lseERI": "equivalent rectangular index of enclosure",
    "lseCWA": "compactness-weighted axis of enclosure",
    "lteOri": "orientation of enclosure",
    "lteWNB": "perimeter-weighted neighbours of enclosure",
    "lieWCe": "area-weighted ETCs of enclosure",
}
List all characters with names:
# Translate the remaining column names through `key`; columns without an
# entry in `key` are shown under their original name.
[key[c] if c in key else c for c in exvars.drop(columns=to_drop).columns]
['population_estimate',
'A, B, D, E. Agriculture, energy and water',
'C. Manufacturing',
'F. Construction',
'G, I. Distribution, hotels and restaurants',
'H, J. Transport and communication',
'K, L, M, N. Financial, real estate, professional and administrative activities',
'O,P,Q. Public administration, education and health',
'R, S, T, U. Other',
'Land cover [Discontinuous urban fabric]',
'Land cover [Continuous urban fabric]',
'Land cover [Non-irrigated arable land]',
'Land cover [Industrial or commercial units]',
'Land cover [Green urban areas]',
'Land cover [Pastures]',
'Land cover [Sport and leisure facilities]',
'area of building',
'courtyard area of building',
'circular compactness of building',
'corners of building',
'squareness of building',
'equivalent rectangular index of building',
'centroid - corner mean distance of building',
'centroid - corner distance deviation of building',
'orientation of building',
'area of ETC',
'circular compactness of ETC',
'equivalent rectangular index of ETC',
'covered area ratio of ETC',
'cell alignment of building',
'alignment of neighbouring buildings',
'mean distance between neighbouring buildings',
'perimeter-weighted neighbours of ETC',
'mean inter-building distance',
'width of street profile',
'width deviation of street profile',
'openness of street profile',
'length of street segment',
'linearity of street segment',
'mean segment length within 3 steps',
'node degree of junction',
'local proportion of 3-way intersections of street network',
'local proportion of 4-way intersections of street network',
'local proportion of cul-de-sacs of street network',
'local closeness of street network',
'local cul-de-sac length of street network',
'square clustering of street network',
'local degree weighted node density of street network',
'street alignment of building',
'area covered by edge-attached ETCs',
'buildings per meter of street segment',
'reached ETCs by neighbouring segments',
'reached ETCs by tessellation contiguity',
'area of enclosure',
'circular compactness of enclosure',
'equivalent rectangular index of enclosure',
'orientation of enclosure',
'perimeter-weighted neighbours of enclosure',
'area-weighted ETCs of enclosure']