Welcome to the RPubs for the book ‘Applied Spatial Statistics And Econometrics: Data Analysis in R’ (Routledge, 2nd edition, 2026) (ed.Katarzyna Kopczewska).

This part is for Chapters 8-10.

To download and read data, please see RPubs with Intro.

This site is a collection of all R codes included in this book. These are just the codes - no comments, no explanations what, why and how. All those details are in the book.

The book was written by Spatial Warsaw team based at the Faculty of Economic Sciences at University of Warsaw. Visit us on LinkedIn.

Chapter 8: Geographically Weighted Regression - modelling spatial heterogeneity (Kateryna Zabarina, Katarzyna Kopczewska, Monika Kot, Piotr Ćwiakowski)

8.2 Estimation of the basic GWR model

data22<-data[data$year==2022, ] # select data for year 2022

data22$Y<-data22$housing_price_sqm_PLN      # Y variable
data22$X1<-data22$urbanisation          # X1 variable
data22$X2<-data22$dist                  # X2 variable
data22$X3<-data22$birth_per_1K          # X3 variable
data22$X4<-data22$new_firms_per_10K_inhab       # X4 variable
data22$X5<-data22$salary_average            # X5 variable
data22$X6<-data22$unemployment_rate         # X6 variable

eq<-Y~X1+X2+X3+X4+X5+X6     # equation to be estimated in all GWR models

lm.model<-lm(eq, data=data22) # global linear regression model (OLS)
summary(lm.model)

8.2.1 Estimating GWR, bandwidth and diagnostics with the spgwr:: package

# poviat data for one year with coordinates of centroids
pov<-st_transform(pov, 4326)        # sf object, projections in WGS84
crds.pov<-st_coordinates(st_centroid(st_make_valid(pov))) # crds matrix
crds.pov<-as.data.frame(crds.pov)       # crds data.frame

data22$crds.x<-crds.pov$X           # add coordinates to dataset
data22$crds.y<-crds.pov$Y

library(sp)                     # for dataset in sp class
data22.sp<-data22                           # copy the data.frame object
coordinates(data22.sp)<-c("crds.x", "crds.y")   # change to sp class
proj4string(data22.sp)<-CRS("+proj=longlat +datum=WGS84") # projection
class(data22.sp)

# find optimal bandwidth, adaptive, with Gaussian (default) kernel
library(spgwr)  # for gwr.sel()
bw.adapt.spgwr<-gwr.sel(eq, data=data22.sp, adapt=T, RMSE=T, method="cv")
bw.adapt.spgwr

# find the optimal bandwidth – fixed, in km
bw.fixed.spgwr<-gwr.sel(eq, data=data22.sp, adapt=F, RMSE=F, longlat=T)
# estimation of the GWR model, hatmatrix=T for diagnostic tests
gwr.mod.adapt<-gwr(eq, data=data22.sp, adapt=bw.adapt.spgwr,  hatmatrix=T) 
gwr.mod.adapt       # summary of GWR model
summary(gwr.mod.adapt)  # structure of output

library(viridis)        # for viridis colours
library(gridExtra)  # for joint plot of individual figures

pov$gwr.X1<-gwr.mod.adapt$SDF$X1
pov$gwr.X3<-gwr.mod.adapt$SDF$X3

p1<-ggplot() + geom_sf(pov, mapping=aes(geometry=geometry, fill=gwr.X1), inherit.aes=FALSE) + scale_fill_viridis(option="H") + theme_minimal()
p2<-ggplot() + geom_sf(pov, mapping=aes(geometry=geometry, fill=gwr.X3), inherit.aes=FALSE) + scale_fill_viridis(option="H") + theme_minimal()
p12<-arrangeGrob(p1, p2, ncol=2)    # with gridExtra::
plot(p12)

Figure 8.2: Spatial distribution of selected GWR parameters: a) variable X1, b) variable X3

LMZ.F1GWR.test(gwr.mod.adapt) # H1: GWR is better than OLS – confirmed
LMZ.F2GWR.test(gwr.mod.adapt) # H1: GWR is better than OLS – confirmed
BFC99.gwr.test(gwr.mod.adapt)   # H1: GWR is better than OLS – confirmed
LMZ.F3GWR.test(gwr.mod.adapt) # H1: GWR coefficients differ in space

eq<-Y~X1+X2+X3+X4+X5    # equation without X6 which is stable over space
bw.adapt.spgwr<-gwr.sel(eq, data=data22.sp, adapt=T, RMSE=T, method="cv")
bw.adapt.spgwr      # which gives 0.12*380=46 neighbours

gwr.mod.adapt<-gwr(eq, data=data22.sp, adapt=bw.adapt.spgwr,  hatmatrix=T)
LMZ.F1GWR.test(gwr.mod.adapt)       # GWR is better than OLS
LMZ.F2GWR.test(gwr.mod.adapt)   # GWR is better than OLS
BFC99.gwr.test(gwr.mod.adapt)   # GWR is better than OLS
LMZ.F3GWR.test(gwr.mod.adapt)   # all coefficients spatially non-stationary
library(spdep)
neighb<-knearneigh(coordinates(data22.sp), k=46) # knn class, spdep::
neighb.listw<-nb2listw(make.sym.nb(knn2nb(neighb))) # listw, spdep::
gwr.morantest(gwr.mod.adapt, neighb.listw) # Moran test for GWR, spgwr::

steps<-(1:10)*10
moran.stepwise<-matrix(0, nrow=length(steps), ncol=3)
colnames(moran.stepwise)<-c("No of neighb", "Moran’s I", "p-value")
for(i in 1:10){
neighb<-knearneigh(coordinates(data22.sp), k=steps[i])
neighb.listw<-nb2listw(make.sym.nb(knn2nb(neighb)))
a<-gwr.morantest(gwr.mod.adapt, neighb.listw)
moran.stepwise[i,1]<-steps[i]
moran.stepwise[i,2]<-a$estimate
moran.stepwise[i,3]<-a$ p.value
}
moran.stepwise

lm.model<-lm(eq, data=data22) # rerun OLS after equation has changed
lm.morantest(lm.model, neighb.listw)    # Moran test for OLS, spdep::

8.2.2. Estimating GWR, bandwidth and diagnostics with the GWmodel:: package

library(GWmodel)
# bandwidth estimation with GWmodel::
bw.adapt.gw<-bw.gwr(eq, data=data22.sp, kernel="gaussian", adaptive=T, p=2)
gwr.gw<-gwr.basic(eq, data=data22.sp, kernel="gaussian",  adaptive=T, bw=bw.adapt.gw, F123.test=T)  # GWR estimation, GWmodel:: 
gwr.gw                  # results printout

gwr.msc.gw<-gwr.multiscale(eq, data22.sp, kernel="gaussian", adaptive=T, bws0=c(0,0,0,0,0,0))   # GWR with various bandwidths, GWmodel::
gwr.msc.gw
gwr.mc.gw<-gwr.montecarlo(eq, data=data22.sp, kernel="gaussian", nsims=99, bw=bw.adapt.gw, adaptive=T, longlat=T) 

# significance tests of local regression parameters with p-value correction
# p-values for a single bandwidth model 
head(gwr.t.adjust(gwr.gw)$results$fb, 5)  # first 5 observations

# p-values for a multi bandwidth model 
head(gwr.t.adjust(gwr.msc.gw)$results$fb, 5)  # first 5 observations

8.2.3 Selection of the optimal functional form of the model

model.sel<-model.selection.gwr(DeVar='Y', InDeVars=c('X1', 'X2', 'X3', 'X4', 'X5'), data=data22.sp, kernel="gaussian", adaptive=TRUE, bw=bw.adapt.gw)
model.sel[[1]][15]  # structure of models
model.sel[[2]][1:2,]    # statistics of GWR models

sorted.models<-model.sort.gwr(model.sel, numVars=5, ruler.vector= model.sel[[2]][,3])           # sort models by AICc
model.list<-sorted.models[[1]]  # save list of sorted models

# Fig.8.3a - illustration of the steps of step regression
model.view.gwr('Y', c('X1', 'X2', 'X3', 'X4', 'X5'), model.list=model.list)

# Fig.8.3b - AICc of sorted models
plot(sorted.models[[2]][,3], col="black", pch=20, lty=5, main="AICc for GWR models", ylab="AICc", xlab="Model number", type="b")

Figure 8.3: An illustration of the GWR stepwise selection process

# statistics of sorted models
colnames(sorted.models[[2]])<-c("bandwidth", "AIC", "AICc", "RSS")
sorted.models[[2]]

8.2.4 GWR with heteroskedastic random errors

gwr_het<-gwr.hetero(eq, data=data22.sp, regression.points=data22.sp, longlat=T, bw=bw.adapt.gw, adaptive=TRUE, kernel='gaussian')
summary(gwr_het)

gwr_het<-gwr.hetero(eq, data=data22.sp, regression.points=data22.sp, longlat=T, bw=bw.adapt.gw, adaptive=TRUE, kernel='gaussian')
pov$gwr.gw.X1<-gwr.gw$SDF$X1    # add variables to the map
pov$gwr.het.X1<-gwr_het$X1
ggplot() + geom_sf(pov, mapping=aes(fill=gwr.gw.X1), inherit.aes=FALSE) + scale_fill_viridis(option="H") + theme_minimal()  # Fig.8.4a 
ggplot() + geom_sf(pov, mapping=aes(fill=gwr.het.X1), inherit.aes=FALSE) + scale_fill_viridis(option="H") + theme_minimal() # Fig.8.4b

Figure 8.4: Maps of coefficients for variable X1: a) simple GWR, b) heteroscedastic GWR

8.3 Collinearity in GWR models and regularisation as a solution

8.3.2 Diagnostics of local collinearity – local condition number (CN)

library(GWmodel)                # for gwr.collin.diagno()
library(RColorBrewer)           # for colours from colorRampPalette()
diag_gwr<-gwr.collin.diagno(eq, data22.sp, bw=bw.adapt.gw, kernel="gaussian", adaptive=T)   # diagnostic statistics in GWR 
summary(diag_gwr)               # list class object
names(diag_gwr)
names(diag_gwr$SDF)             # includes local_CN, VIF, etc.
names(gwr.gw$SDF)       # structure of GWR estimation output (see 8.2.2)

# Fig.8.5a - local Condition Numbers (CN)
spplot(diag_gwr$SDF, "local_CN", main="Local condition number", col.regions=colorRampPalette(brewer.pal(11, "Spectral"))(100), cex=2.5)

# Fig.8.5b - local R2 map
spplot(gwr.gw$SDF, "Local_R2", main="Local R-squared", col.regions = colorRampPalette(brewer.pal(9, "Blues"))(100), cex=2.5)

# Fig.8.5c – local coefficients of the selected variable
coef_name<-"X2"
se_name<-"X2_SE"
spplot(gwr.gw$SDF, coef_name, main=paste("Local coefficient:", coef_name), col.regions=colorRampPalette(brewer.pal(11, "RdBu"))(100), cex=2.5)

# Fig.8.5d - standard error surface
spplot(gwr.gw$SDF, se_name, main=paste("Standard error of", coef_name), col.regions=colorRampPalette(brewer.pal(9, "YlOrRd"))(100), cex=2.5)

Figure 8.5: Diagnostics of collinearity: a) condition numbers, b) R2, c) β coeefficient, d) standard error (SE) of β coefficient

# two alternative bandwidths: small (more local) vs large (more global)
bw_small<-bw.gwr(eq, data=data22.sp, kernel="bisquare", adaptive=TRUE)*0.5  
bw_large<-bw.gwr(eq, data=data22.sp, kernel="bisquare", adaptive=TRUE)*1.5

# collinearity diagnostics for each bandwidth
colldiag_small<-gwr.collin.diagno(eq, data=data22.sp, bw=bw_small, kernel= "bisquare", adaptive=TRUE)
colldiag_large<-gwr.collin.diagno(eq, data=data22.sp, bw=bw_large, kernel= "bisquare", adaptive=TRUE)

pov$CN_small<-colldiag_small$SDF$local_CN       # add CN to sf map
pov$CN_large<-colldiag_large$SDF$local_CN

ggplot()+ geom_sf(pov, mapping=aes(fill=CN_small))+ scale_fill_viridis( option="H") + theme_minimal() + ggtitle("CN for small bandwidth")
ggplot()+ geom_sf(pov, mapping=aes(fill=CN_large))+ scale_fill_viridis( option="H") + theme_minimal() + ggtitle("CN for large bandwidth")

# alternative plot with spplot() using sp class, not shown
data22.sp$CN_small<-colldiag_small$SDF$local_CN
data22.sp$CN_large<-colldiag_large$SDF$local_CN
spplot(data22.sp, c("CN_small", "CN_large"), main="Local condition number: small vs large bandwidth", col.regions=colorRampPalette(brewer.pal(11, "Spectral"))(100), cex=2.5)

Figure 8.6: Maps showing CN values for alternative bandwidths: a) small bandwidth, b) large bandwidth

8.3.3 Diagnostics of local collinearity – local design matrix

head(round(diag_gwr$corr.mat,3), 3) # only three rows displayed

# add selected pairs of variables to the map
pov$X2X4<-diag_gwr$corr.mat[,11]
#pov$X2X4<-diag_gwr$SDF$Corr_X2.X4  # equivalent notation
pov$X4X5<-diag_gwr$corr.mat[,15]
#pov$X4X5<-diag_gwr$SDF$Corr_X4.X5  # equivalent notation

sd(pov$X2X4)/mean(pov$X2X4) # coefficient of variation of local corr
sd(pov$X4X5)/mean(pov$X4X5)

library(viridis)                # Fig.8.7
ggplot()+ geom_sf(pov, mapping=aes(fill=X2X4))+ scale_fill_viridis( option="H") + theme_minimal() + ggtitle("Corr(X2,X4)")
ggplot()+ geom_sf(pov, mapping=aes(fill=X4X5))+ scale_fill_viridis( option="H") + theme_minimal() + ggtitle("Corr(X4,X5)")

Figure 8.7: Spatial distribution of the correlation coefficients between the predictors used in the local regressions: a) correlation between X2 and X4, b) correlation between X4 and X5

local_CN<-diag_gwr$SDF$local_CN
spar<-0.8           # smoothing parameter 
mat<-diag_gwr$SDF

par(mar=c(4,4,4,4))
plot(smooth.spline(local_CN, mat$Corr_X1.X2, spar=spar), ylim=c(-0.5, 0.6), xlim=c(15, 50), type="l", xlab="local Condition Number CN", ylab="local correlations between predictors", lty=1, lwd=1)
lines(smooth.spline(local_CN, mat$Corr_X1.X3, spar=spar), lty=2, lwd=2)
lines(smooth.spline(local_CN, mat$Corr_X1.X4, spar=spar), lty=3, lwd=3)
lines(smooth.spline(local_CN, mat$Corr_X1.X5, spar=spar), lty=4, lwd=1)
lines(smooth.spline(local_CN, mat$Corr_X2.X3, spar=spar), lty=5, lwd=2)
lines(smooth.spline(local_CN, mat$Corr_X2.X4, spar=spar), lty=1, lwd=3)
lines(smooth.spline(local_CN, mat$Corr_X2.X5, spar=spar), lty=2, lwd=1)
lines(smooth.spline(local_CN, mat$Corr_X3.X4, spar=spar), lty=3, lwd=2)
lines(smooth.spline(local_CN, mat$Corr_X3.X5, spar=spar), lty=4, lwd=3)
lines(smooth.spline(local_CN, mat$Corr_X4.X5, spar=spar), lty=5, lwd=1)
abline(v=c(30, 50, 100), lty=3, lwd=2)
abline(h=(-3:3)*0.2, lty=3)
legend("bottomleft", legend=c("Corr_X1.X2", "Corr_X1.X3", "Corr_X1.X4", "Corr_X1.X5", "Corr_X2.X3", "Corr_X2.X4", "Corr_X2.X5", "Corr_X3.X4", "Corr_X3.X5", "Corr_X4.X5"), lty=c(1,2,3,4,5,1,2,3,4,5), lwd=c(1, 2, 3, 1,2,3,1,2,3,1), bty="n")
title(main="Relationship between bilateral correlations of variables 
and condition number")

Figure 8.8: Relationships between local condition numbers (CN) and local correlations of predictors

8.3.4 Diagnostics of local collinearity – variance inflation factor (VIF) and variance decomposition proportion (VDP)

head(diag_gwr$VIF, 6)
summary(diag_gwr$VIF)
head(diag_gwr$VDP, 6)

library(car)    # for vif()
vif(lm.model)   # compute VIF statistics

8.3.5 Diagnostics of local collinearity – correlation between coefficients

library(GGally)                 # for ggpairs()
library(PerformanceAnalytics)       # for chart.Correlation()
coefs<-data.frame(x1=gwr.gw$SDF$X1, x2=gwr.gw$SDF$X2, x3=gwr.gw$SDF$X3, x4=gwr.gw$SDF$X4, x5=gwr.gw$SDF$X5) # save GWR coefficients in data.frame
ggpairs(coefs)              # generate a correlogram, GGally::
chart.Correlation(coefs, method="spearman", hist=F) #PerformanceAnalytics::

Figure 8.9: Correlation between parameters from GWR: a) with ggpairs() reporting Pearson correlation, b) with chat.Correlation() reporting Spearman correlation

8.4. LASSO GWR – regularisation as a solution to the collinearity problem

library(gwrr)               # for gwl.est()
gwr.lasso<-gwl.est(eq, locs=coordinates(data22.sp),  kernel="gauss", data=as.data.frame(data22.sp))     # model estimation
summary(gwr.lasso) # results
gwr.lasso   # results limited for few observations only

library(Metrics)        # for rmse()
lm.model<-lm(eq, data=data22) # linear regression
rmse(data22$Y, predict(lm.model, data22)) # from metrics::

gwr.model<-gwr(eq, data=data22.sp, adapt=NULL, hatmatrix=TRUE, predictions=TRUE, bandwidth=bw.adapt.spgwr)  # standard GWR from spgwr::
rmse(data22$Y, gwr.model$SDF$pred)

gwr.lasso<-gwl.est(eq, locs=coordinates(data22.sp),  kernel="gauss", data=as.data.frame(data22.sp)) # GWR with Lasso from gwrr::
rmse(data22$Y, gwr.lasso$yhat)

# data in sf class
data22.sf<-st_as_sf(data22, coords=c("crds.x", "crds.y"), crs=4326)
# bandwidth estimation
bw<-bw.gwr.lcr(eq, data=data22.sf, kernel="gaussian", adaptive=FALSE, lambda.adjust=TRUE, cn.thresh=20)
bw


gwrr.model.reg<-gwr.lcr(eq, data=data22.sf, bw=bw, kernel="gaussian", adaptive=FALSE, lambda.adjust=TRUE, cn.thresh=20)
gwrr.model.reg

8.5 Mixed GWR

model.mixed<-gwr.mixed(eq, data=data22.sp, fixed.vars=c('X6'), bw=bw.adapt.gw, diagnostic=T, intercept.fixed=T, adaptive=F, kernel='gaussian', longlat=T)

8.6. Robust regression in the GWR model

model.robust<-gwr.robust(eq, data=data22.sp, bw=bw.adapt.gw, adaptive=TRUE, kernel='gaussian', filtered=TRUE, cut.filter=3) # from GWmodel::
model.robust

8.7. Geographically and temporally weighted regression (GTWR)

# data processing 
data_t<-data[data$year>2019, ]      # panel data for years 2020-2023
data_t$code1<-data_t$code/1000      # add modified code variable

# add 0 as first digit to 3-digit codes to make them 4-digit
data_t$code_chr<-ifelse(is.na(data_t$code1), NA_character_, sprintf(paste0("%0", 4, "d"), as.integer(data_t$code1))) 
data_t[1:5, c("code", "code1", "code_chr")] # evidence of changes

data_t$Y<-data_t$housing_price_sqm_PLN  # Y variable
data_t$X1<-data_t$urbanisation      # X1 variable
data_t$X2<-data_t$dist              # X2 variable
data_t$X3<-data_t$birth_per_1K      # X3 variable
data_t$X4<-data_t$new_firms_per_10K_inhab   # X4 variable
data_t$X5<-data_t$salary_average        # X5 variable
data_t$X6<-data_t$unemployment_rate     # X6 variable

pov_data_t<-merge(pov, data_t, by.x="JPT_KOD_JE", by.y="code_chr", all.y=TRUE)  # sf class data
nrow(pov_data_t)    # it is 4*380 - panel added to a map

# Fig.8.10 – a map of the dependent variable
ggplot()+ geom_sf(pov_data_t, mapping=aes(fill=Y), inherit.aes=FALSE) + scale_fill_viridis(option="H") + facet_wrap(~year, nrow=2)+ theme_minimal()

Figure 8.10: Panel plot of dependent variable (housing price per sq.m) over the period 2020-2023

# prepare the sp class objects
pov_data_sp_t<-as_Spatial(pov_data_t) # SpatialPolygonsDataFrame
# spatio-temporal distance object
x<-st.dist(coordinates(pov_data_sp_t), obs.tv=pov_data_sp_t@data$year) 

t0<-Sys.time()      # start counting time
bw.gtwr.adapt<-bw.gtwr(eq, data=pov_data_sp_t, obs.tv=pov_data_sp_t@data$year, adaptive=TRUE, st.dMat=x)
t1<-Sys.time()      # end counting time
t1-t0           # duration of computation, ~10 min

gtwr.adapt<-gtwr(eq, data=pov_data_sp_t, obs.tv=pov_data_sp_t@data$year, st.bw=bw.gtwr.adapt, adaptive=TRUE)
gtwr.adapt

pov_data_t$gtwr.X3<-gtwr.adapt$SDF$X3
ggplot()+ geom_sf(pov_data_t, mapping=aes(fill=gtwr.X3)) + facet_wrap(~year, nrow=2) + scale_fill_viridis(option="H") + theme_minimal() # Fig.8.11

Figure 8.11: Coefficients from the GTWR model for variable X3 over time and space

8.8. A-spatial and Geographically Weighted Principal Component Analysis

8.8.1 Classical principal component analysis (PCA) applied to spatial data

library(dplyr)      # for data manipulation with pipe ‘|>’ 

# selection of variables from the poviats dataset
selected.vars<-c("dist", "salary_average", "salary_avg_Poland_100p", "invest_share_pp", "birth_per_1K", "death_per_1K", "area_km2",
"number_of_flats", "area_of_flats_m2", "persons_per_1km2", "persons_in_K",
"persons_women_in_K", "persons_men_in_K", "urbanisation_pp", "persons_preproductive_age_K", "persons_productive_age_K", "persons_postproductive_age_K", "new_firms_per_10K_inhab", "closed_firms_per_10K_inhab", "small_firms_per_10k_inhab", "total_firms_empl.49_per_10K_inhab", "all_firms_per_10K_inhab", "new_medical_firms_pp", "new_creative_firms_pp", "housing_price_sqm_PLN", "unemployment_rate")
data23.selected<-data23[ ,selected.vars] # keeping the selected variables

data23.sel.num<-data23.selected |> mutate(across(everything(), as.numeric)) # make sure that all variables are numeric

# save the names of poviats as well as their identification codes in vectors
pov.codes<-as.integer(data23[, "subregion_number"])
pov.names<-as.character(data23[, "subregion_name_noPL_font"])
head(data23.sel.num, 3) # first 5 variables from the dataset, 3 rows
head(pov.codes, 5)  # ID identification codes for the first five poviats
head(pov.names, 5)# names of the first five poviats

cor.mat<-cor(data23.sel.num, use="pairwise.complete.obs") # correlation

library(corrplot)   # to plot matrix in colour using corrplot()
corrplot(cor.mat, method="color", type="upper", order="hclust", tl.cex=0.6,        
  tl.srt=45)        # visualisation of the matrix

Figure 8.12: Correlation matrix for the variables selected for analysis

data23.sel.s<-scale(data23.sel.num)     # scale the data (z-score)
head(data23.sel.s, 5)   # first 5 scaled variables from the dataset

classic.pca<-prcomp(data23.sel.s, center=FALSE, scale.=FALSE) 
summary(classic.pca)
classic.pca$rotation[, 1:5] # loadings matrix for the first 5 PCs
classic.pca$x[, 1:5] # scores matrix for first 5 poviats and first 5 PCs
summary(classic.pca$x[, 1])# summary statistics for PC1 scores
sd(classic.pca$x[, 1])# standard deviation of PC1 scores

library(factoextra)     # for fviz_pca_var() and fviz_eig()
fviz_pca_var(classic.pca, labelsize=3, repel=TRUE)  # Fig.8.13a
fviz_eig(classic.pca)                       # Fig.8.13b

Figure 8.13: Graph of PCA results: a) impact (direction and strength) of variables, b) scree plot with explained variance

fviz_contrib(classic.pca, "var", axes=1, xtickslab.rt=70) # Fig.8.14a, PC1
fviz_contrib(classic.pca, "var", axes=2, xtickslab.rt=70) # Fig.8.14b, PC2

Figure 8.14: Contribution of variables to: a) first principal component, b) second principal component

library(psych)
rotated.pca<-principal(data23.sel.s, nfactors=5, rotate="varimax")
print(loadings(rotated.pca), digits=2, cutoff=0.5, sort=T)

pov.rotated.pca.scores<-data.frame(pov.codes, pov.names, rotated.pca$scores[, 1:5]) # df with rotated scores, pov names and codes
colnames(pov.rotated.pca.scores)<-c("pov.code", "pov.name", "popul.scale", "urban.struct", "business.act", "wage.level", "demogr.dynamics") # add the “umbrella names” for rotated scores
head(pov.rotated.pca.scores)    # 3 rows only

pov.rotated.pca.scores.sf<-pov.rotated.pca.scores %>% left_join(pov %>% select(JPT_KOD_JE, geometry) %>% mutate(JPT_KOD_JE= as.integer(JPT_KOD_JE)), by=c("pov.code"="JPT_KOD_JE")) %>% st_as_sf()
class(pov.rotated.pca.scores.sf)
head(pov.rotated.pca.scores.sf) # 3 rows only

ggplot() +geom_sf(data=pov.rotated.pca.scores.sf, aes(fill= popul.scale)) + scale_fill_viridis_c() + labs(title="Population scale",   fill="RC1") + theme_minimal() +theme(plot.title= element_text(hjust=0.5, size=12))# RC1

ggplot() + geom_sf(data=pov.rotated.pca.scores.sf, aes(fill= urban.struct)) + scale_fill_viridis_c() + labs(title= "Urban structure",fill="RC2") + theme_minimal() +theme(plot.title= element_text(hjust=0.5, size=12)) #RC2

ggplot() +geom_sf(data= pov.rotated.pca.scores.sf, aes(fill= business.act)) + scale_fill_viridis_c() + labs(title="Business activity", fill="RC3") + theme_minimal() + theme(plot.title=element_text(hjust=0.5, size=12)) #RC3

ggplot() + geom_sf(data= pov.rotated.pca.scores.sf, aes(fill=wage.level))+
scale_fill_viridis_c() + labs(title="Wage level", fill="RC5") + theme_minimal()+ theme(plot.title=element_text(hjust=0.5, size=12)) # RC5

ggplot() + geom_sf(data= pov.rotated.pca.scores.sf, aes(fill= demogr.dynamics)) + scale_fill_viridis_c() + labs(title="Demographic dynamics", fill="RC4") + theme_minimal() + theme(plot.title= element_text(hjust=0.5,size=12))# RC4

Figure 8.15: Spatial distribution of rotated principal components: a) population scale (RC1), b) urban structure (RC2), c) business activity (RC3), d) wage level (RC5), e) demographic dynamics (RC4)

8.8.2 Geographically weighted principal component analysis (GWPCA)

# df integrated: standardised variables, poviat names and codes
data23.sel.s.int<-data.frame(pov.codes, pov.names, data23.sel.s)

# attach the geometries to the df
data23.sel.s.int.sf<-data23.sel.s.int %>% left_join(pov %>%
select(JPT_KOD_JE, geometry) %>% mutate(JPT_KOD_JE= as.integer(JPT_KOD_JE)), by=c("pov.codes"="JPT_KOD_JE")) %>% st_as_sf() %>% st_make_valid() 
class(data23.sel.s.int.sf)

data23.sel.s.int.sp<-as(data23.sel.s.int.sf, "Spatial")
class(data23.sel.s.int.sp)

# save names of the variables in a vector
my.vars<-colnames(data23.sel.s.int.sp@data[3:28])

library(GWmodel)        # for bw.gwpca() and gwpca()
# search for the optimal bandwidth: data in sp class, 5 components
my.bw<-bw.gwpca(data=data23.sel.s.int.sp, vars=my.vars, k=5, longlat=TRUE)
my.bw

gwpca.pov<-gwpca(data=data23.sel.s.int.sp, vars=my.vars, k=5, bw=my.bw, longlat=TRUE, scores=TRUE)
gwpca.pov
head(gwpca.pov$SDF@data)

# plot in sp class
library(sp)         # for spplot()

# Fig.8.16a - local proportion of variance explained by the first PC
spplot(gwpca.pov$SDF, "Comp.1_PV", main="Local proportion of variance: PC1", par.settings=list(layout.heights=list(main.key.padding=2, top.padding=2))) 

# Fig.8.16b - cumulative local proportion of variance explained by 5 PCs
spplot(gwpca.pov$SDF, "local_CP", main="Cumulative local proportion of variance: first 5 PCs", par.settings= list(layout.heights= list(main.key.padding=2, top.padding=2))) 

# Fig.8.16c - winning variable for the first PC
gwpca.pov$SDF@data$win_var_PC1<-as.factor(gwpca.pov$SDF@data$win_var_PC1)
spplot(gwpca.pov$SDF, "win_var_PC1", main="Winning variable for PC1",
par.settings=list(layout.heights=list(main.key.padding=2, top.padding=2)))

# Fig.8.17 – the same plot in sf class
gwpca.pov.sf<-st_as_sf(gwpca.pov$SDF)
head(gwpca.pov.sf)

ggplot() + geom_sf(data=gwpca.pov.sf, aes(fill=win_var_PC1)) +  scale_fill_discrete(name=NULL) + theme_minimal() + theme(plot.title= element_text(hjust=0.5, size=12))+ labs(title="Winning variable for PC1")

Figure 8.16: GWPCA results visualised using sp class objects: a) local variance explained by PC1, b) cumulative local variance explained by the first five PCs, c) dominant variable for local PC1

Figure 8.17: Dominant variable for the first local principal component (PC1) derived from GWPCA, visualised using sf objects

# local loadings 
ld_pc1<-gwpca.pov$loadings[,c(2,7,11),1] # selected variable loadings for PC1
ld_pc2<-gwpca.pov$loadings[,c(2,7,11),2] # selected variable loadings for PC1
head(ld_pc1)    # local loadings per variable in PC1, 3 rows only
summary(ld_pc1) # summary statistics for selected local loadings (PC1)
head(ld_pc2)    # local loadings per variable in PC2, 3 rows only
summary(ld_pc2) # summary statistics for selected local loadings (PC2)
loc<-coordinates(gwpca.pov$SDF) # coordinates of the same locations as in SDF

# Fig.8.18a - glyph plot for local loadings of variables for PC1
plot(data23.sel.s.int.sp, col=NA, border="grey80", lwd=0.2) # contour
gwpca.glyph.plot(ld=ld_pc1, loc=loc, r1=20, add=TRUE) 
title(main="Local PCA loadings (PC1)")

# Fig.8.18b - glyph plot for local loadings of variables for PC2
plot(data23.sel.s.int.sp, col=NA, border="grey80", lwd=0.2) # contour
gwpca.glyph.plot(ld=ld_pc2, loc=loc, r1=20, add=TRUE) 
title(main="Local PCA loadings (PC2)")

Figure 8.18: Glyph plot illustrating spatial variation in geographically weighted PCA loadings: a) local loadings for the first principal component (PC1) and b) local loadings for the second principal component (PC2)

Chapter 9: Spatial point pattern analysis (Kateryna Zabarina)

9.1. Spatial point patterns - introduction and main definitions

# data pre-processing – subset for more efficient analysis
points$random<-runif(nrow(points), 0, 1)    # random values from uniform dist
points.lim<-points[points$random<=0.2,]     # select ~20% of all obs.
nrow(points.lim)                    # 7392 obs.

# summary of industry sector, SEC_PKD7
table(points.lim$SEC_PKD7)

# summary by employment group
# empl_group=1 -> between 1 and 9 persons
# empl_group=2 -> between 10 and 49 persons
# empl_group=3 -> between 50 and 249 persons
# empl_group=4 -> between 250 and 999 persons
# empl_group=5 -> more than 1500 persons

table(points.lim$empl_group)
# summary by legal form
table(points.lim$legal_form)

9.1.1 Basic classes in spatstat:: package - owin and ppp

library(spatstat)
voi<-st_read("wojewodztwa.shp", options="ENCODING=UTF-8") # import a map 
voi.p<-st_transform(voi, 3857)              # re-project to planar
region.voi.p<-voi.p[voi.p$JPT_NAZWA_=="lubelskie",]     # create a map subset
W<-as.owin(region.voi.p)        # convert to owin class
# W<-as(region.voi.p, "owin")       # alternative code for the same
class(W) 

# transform point data
points.lim.sf<-st_as_sf(points.lim, coords=c("crds.x", "crds.y"), crs=4326)
points.lim.sf.p<-st_transform(points.lim.sf, 3857) # re-project to planar
crds.p<-st_coordinates(points.lim.sf.p) # matrix with points coordinates

# unmarked point pattern
firms.ppp.um<-ppp(x=crds.p[,1], y=crds.p[,2], window=W) # class ppp

# marked point pattern - marks are type of sector
firms.ppp.m<-ppp(x=crds.p[,1], y=crds.p[,2], window=W, marks=points.lim$SEC_agg)    # class ppp

# Fig.9.2 - plot point patterns from example above
par(mar=c(1,1,1,1))
plot(firms.ppp.um, main="Unmarked point pattern") # with spatstat::
plot(firms.ppp.m, main="Marked point pattern") # with spatstat::

Figure 9.2: Visualisation of point patterns in spatstat:: a) unmarked, b) marked

# analyse the range of data
centre.region<-st_coordinates(st_centroid(st_geometry(region.voi.p)))
centre.region

bbox<-st_bbox(region.voi.p) # get coordinates of bounding box
bbox

max(crds.p[,1])-min(crds.p[,1]) # range of x coordinates
max(crds.p[,2])-min(crds.p[,2]) # range of y coordinates

# Fig.9.3a - create a circle window in spatstat::
# center of the circle approximately in the center of region
# parameters: a and b – ellipse radii, centre – center of circle, 
# phi – anticlockwise rotation angle
# radii of ellipse are related to the range of points
circle<-ellipse(a=200000, b=200000, centre=centre.region, phi=pi) 
firms.circle<-ppp(x=crds.p[,1], y=crds.p[,2], window=circle) # class ppp
plot(firms.circle)          # plot class ppp in spatstat

# Fig.9.3b - create a rectangle window in spatstat::
# specify in owin: c(xmin, xmax), c(ymin, ymax)
rect<-owin(c(2406231, 2687896), c(6490003, 6852339)) # create a rectangle
firms.rect<-ppp(x=crds.p[,1], y=crds.p[,2], window=rect) # class ppp
plot(firms.rect)                # plot class ppp in spatstat

Figure 9.3: Point patterns with various windows: a) circle, b) rectangle

9.1.2. Marks in point pattern

# marked point patterns 
# numerical marks - the size of employment
firms.ppp.empl<-ppp(x=crds.p[,1], y=crds.p[,2], window=W, marks=points.lim$empl_group)  

# categorical marks – legal form of the firm
firms.ppp.lf<-ppp(x=crds.p[,1], y=crds.p[,2], window=W, marks=as.factor(points.lim$legal_form)) 

# marked point pattern with numerical marks – firms.ppp.empl
# control for size of points according to number of employees
plot(firms.ppp.empl, markscale=0.002, pch=1)    # not shown

# marked point pattern with categorical marks – firms.ppp.lf
# control for colors of points according to legal form
plot(firms.ppp.lf, pch=16, cex=0.8, cols=c("red", "green", "blue"))

# split for numeric marks
# convert the numerical values to class factor
marks(firms.ppp.empl)<-as.factor(marks(firms.ppp.empl))
split_ppp_empl<-split(firms.ppp.empl) # split the pattern by factor variable

# Fig.9.4a - plot the splitted numeric mark point pattern
par(mar=c(2,2,2,2))     # window of plot
plot(W, main="Marked point pattern \n representing number of employees")

# layers of point pattern, the first layer is for the biggest grup
plot(split_ppp_empl[[1]], add=TRUE, cex=0.75, pch=16, col="green")
plot(split_ppp_empl[[2]], add=TRUE, cex=1, pch=16, col="blue")
plot(split_ppp_empl[[3]], add=TRUE, cex=1.25, pch=16, col="red")
plot(split_ppp_empl[[4]], add=TRUE, cex=1.5, pch=16, col="black")
#plot(split_ppp_empl[[5]], add=TRUE, cex=1.75, pch=16, col="orange")
legend("bottomleft", legend=c("5","30","150","600","1500"), col=c("green", "blue", "red", "black", "orange"), pch=16, cex=0.8)

# Fig.9.4b - split for categorical marks
split_ppp_firms<-split(firms.ppp.lf, reduce=TRUE) 

cols<-c("red", "green", "blue")
par(mar = c(2,2,2,2))
plot(W, main="Marked point pattern \n representing legal form")
plot(split_ppp_firms[[2]], add=TRUE, col=cols[2], pch=18)
plot(split_ppp_firms[[3]], add=TRUE, col=cols[3], pch=18)
plot(split_ppp_firms[[1]], add=TRUE, col=cols[1], pch=18)
legend("bottomleft", legend=c("natural person", "organizational entity \n without legal personality", "legal person"), col=c("green", "blue", "red"), pch=18, cex=0.8, bty="n")

Figure 9.4: Marked point patterns: a) with numerical marks, b) with categorical marks

table(firms.ppp.m$marks) # old mark values 
marks(firms.ppp.m)<-as.factor(points.lim$dummy_agri) # change values 
summary(marks(firms.ppp.m))                     # new mark values
summary(setmarks(firms.ppp.m, as.factor(points.lim$SEC_PKD7)))
summary(marks(firms.ppp.m)) # no changes can be observed

# alternative code - use pipeline operator from package magrittr::
library(magrittr)
firms.ppp.m %mark% as.factor(points.lim$SEC_PKD7)

unmark(firms.ppp.m)     # option 1
marks(firms.ppp.m)<-NULL    # option 2
firms.ppp.m

marks(firms.ppp.m)<-as.factor(points.lim$SEC_PKD7)
summary(marks(firms.ppp.m))

9.1.3 Duplicated points

sum(duplicated(firms.ppp.um))   # how many points are duplicated
firms.ppp.um<-unique(firms.ppp.um)  # keep unique points only
firms.ppp.um
summary(duplicated(firms.ppp.um)) # how many unique points are in dataset

# alternative way - result is the same as above
firms.ppp.um<-firms.ppp.um[!duplicated(firms.ppp.um)] 
summary(duplicated(firms.ppp.m))
firms.ppp.m.jitter<-rjitter(firms.ppp.m, 0.02)
summary(duplicated(firms.ppp.m.jitter))

9.1.4 Rescaling and other basic operations

area.owin(W)
summary(firms.ppp.um)

lubelskie.area<-25122.46            # area in km2
rsc<-sqrt(area.owin(W)/lubelskie.area)      # convertion ratio
rsc                         # 1595.627
firms.ppp.um<-rescale.ppp(firms.ppp.um, rsc, "km")
summary(firms.ppp.um)               # 25122.5 km2

9.1.5 Distances and covariates

between_dist<-pairdist.ppp(firms.ppp.um) # between all points in dataset (km)
between_dist[1:5, 1:5]               # output in class matrix
dim(between_dist) # exactly this amount is left after removing duplicates

# object as data.frame, 2nd minimum distance as the last column
between_dist<-as.data.frame(between_dist)
between_dist$min_d<-apply(between_dist[,1:6088], 1, function(x) sort(x)[2])

first_neib<-nndist(firms.ppp.um) # by default k=1, class numeric

# comparison of distances – 2nd minimum dist and knn=1 dist
head(first_neib)
head(between_dist$min_d)

cities_vec<-c('Lublin', 'Chelm', 'Biala Podlaska', 'Zamosc')
xc<-c(22.570278, 23.477778, 23.116667, 23.258611)
yc<-c(51.248056, 51.132222, 52.033333, 50.720556)
cities<-data.frame(xc, yc, cities_vec)

pwt.sf<-st_as_sf(cities, coords=c("xc", "yc"), crs=4326) # spherical
pwt.sf<-st_transform(st_as_sf(pwt.sf), crs=st_crs(3857)) # planar 

pwt_ppp<-ppp(x=st_coordinates(pwt.sf)[,1],                y=st_coordinates(pwt.sf)[,2], window=W)   # ppp class
pwt_ppp<-rescale.ppp(pwt_ppp, rsc, "km")    # planar point pattern for 4 cities

# apply crossdist() to obtain distances from all points to cities
d<-crossdist(firms.ppp.um, pwt_ppp)
colnames(d)<-cities_vec
d<-as.data.frame(d)
d$min_d<-apply(d[,1:4], 1, FUN=min) # which city is closest to the point
head(d)

dist.fun<-distfun(firms.ppp.um) # similar to above
class(dist.fun)             # "distfun"  "funxy"    "function"
summary(dist.fun)

dist.map<-distmap(firms.ppp.um) 
class(dist.map)         # im class
summary(dist.map)

# Fig.9.6 - comparison of distfun() and distmap()
par(mar=c(2,2,2,2))
plot(dist.fun)  # plot of distance function
plot(dist.map)  # plot by pixels, class im

Figure 9.6: Comparison of a) distfun() and b) distmap() functions outputs

# W<-as.owin(lubelskie)     # convert to owin class object, as previously
pattern<-firms.ppp.um       # unmarked point pattern
marks(pattern)<-d$min_d     # distance to closest powiat city as marks

# change class from ppp to im
mat_pix0<-as.im.ppp(pattern, W)     # 128 x 128 pixel array (ny, nx)

par(mar=c(2,2,2,2))
# Fig.9.7a
mat_pix1<-pixellate(pattern)        # measure the amount of data in each pixel
mat_pix1 # shows description of pixel image resolution and bounding box 
plot(mat_pix1, main="pixellate")    # count of firms in pixels

# Fig.9.7b
mat_pix2<-Smooth.ppp(pattern)  # kernel smooth of values in irregular loc.
plot(mat_pix2, main="Smooth.ppp")   # smoothed distance (mark)

# Fig.9.7c
mat_pix3<-idw(pattern)  # inverse distance smoothing of obs.values
plot(mat_pix3, main="idw")  # smoothed distance (mark)

# Fig.9.7d - duplicate initial pattern, firms.ppp.um, and add new marks
first_neib_ppp<-firms.ppp.um                # copy-paste ppp object
marks(first_neib_ppp)<-first_neib           # change marks – dist to knn1
first_neib_pix<-Smooth.ppp(first_neib_ppp)  # smooth
plot(first_neib_pix, main="nndist")         # plot pixeled dist to knn1

Figure 9.7. Pixel image covariate – distance to closest poviat city: a) count of firms from pixellate(), b) kernel smoothed distance from Smooth.ppp(), c) inverse-distance smoothed values from idw(), d) kernel smoothed new marks

9.1.6 Summary and preparations for further analysis

# create a comprehensive point pattern for further analysis
points$random<-runif(nrow(points), 0, 1)    # random values from uniform dist
firms<-points[points$random<=0.07,] # select ~7% of all observations
nrow(firms)                 # 2242 obs  

# transform point data to spherical WGS84 and then Mercator planar projection
firms.sf<-st_as_sf(firms, coords=c("crds.x", "crds.y"), crs=4326) #spherical
firms.sf2<-st_transform(st_as_sf(firms.sf), crs=st_crs(3857))   # planar

# create owin class window
voi<-st_read("wojewodztwa.shp", options="ENCODING=UTF-8") # import a map 
voi.p<-st_transform(voi, 3857)              # re-project to planar
region.voi.p<-voi.p[voi.p$JPT_NAZWA_=="lubelskie",]     # create a map subset
W<-as.owin(region.voi.p)        # convert to owin class

# create unmarked point pattern
pattern.um<-ppp(x=st_coordinates(firms.sf2)[,1], y=st_coordinates(firms.sf2)[,2], window=W)         # create ppp object
lubelskie.area<-25122.46                # area of region
rsc<-sqrt(area.owin(W)/lubelskie.area)      # rescaling factor
pattern.um<-rescale.ppp(pattern.um, rsc, "km")
pattern.um<-unique(pattern.um)  # keep unique points only
summary(pattern.um)             # 2043 points, no duplicates

# create marked point pattern, marks are industry sectors
pattern.m<-ppp(x=st_coordinates(firms.sf2)[,1], y=st_coordinates(firms.sf2)[,2], window=W, marks=as.factor(firms.sf$SEC_PKD7)) # add marks and make ppp class
pattern.m<-rjitter(pattern.m, 0.02) # relocate points to avoid overlaps
pattern.m<-rescale.ppp(pattern.m, rsc, "km") # rescale
summary(pattern.m) # 2242 points, of marks as industry sectors

# pixel image covariate: distance to closest poviat city
cities_vec<-c('Lublin', 'Chelm', 'Biala Podlaska', 'Zamosc')
xc<-c(22.570278, 23.477778, 23.116667, 23.258611)
yc<-c(51.248056, 51.132222, 52.033333, 50.720556)
cities<-data.frame(xc, yc, cities_vec)
pwt.sf<-st_as_sf(cities, coords=c("xc", "yc"), crs=4326) # spherical
pwt.sf<-st_transform(st_as_sf(pwt.sf), crs=st_crs(3857)) # planar 
pwt_ppp<-ppp(x=st_coordinates(pwt.sf)[,1],
             y=st_coordinates(pwt.sf)[,2], window=W)    # class ppp
pwt_ppp<-rescale.ppp(pwt_ppp, rsc, "km")        # rescale to new units in km2

d<-crossdist(pattern.um, pwt_ppp)   # distances between points and cities
colnames(d)<-cities_vec
d<-as.data.frame(d)
d$min_d<-apply(d[,1:4], 1, FUN=min) 
head(d)

pattern.im<-pattern.um          # unmarked point pattern
marks(pattern.im)<-d$min_d      # distance to closest city as marks
mat_pix<-Smooth.ppp(pattern.im)     # no rescaling, it was done before

9.2. Intensity-based analysis of unmarked point pattern

9.2.1. Quadrat test

# H0: homogeneity => CSR, H1: heterogeneity => non-CSR
q1<-quadrat.test(pattern.um, nx=5, ny=5)    # 25 squares
q1                      # H1 as p-value is small
names(q1)   # names of slots available in output
q1$method

q2<-quadrat.test(pattern.um, nx=10, ny=10)  # 100 squares
q2$p.value

q3<-quadrat.test(pattern.um, nx=25, ny=25)  # 625 squares
q3$p.value

q4<-quadrat.test(pattern.um, nx=50, ny=50)  # 2.500 squares
q4$p.value

q5<-quadrat.test(pattern.um, nx=100, ny=100)    # 10.000 squares
q5$p.value

9.2.2. Tests with spatial covariates

# H0: CSR => random locations, H1: non-CSR => non-random locations
ks.rand<-cdf.test(pattern.um, "x")  # Kolmogorov-Smirnov test
ks.rand
plot(ks.rand) # Fig.9.8a cumulative distributions (theoretical & empirical)

cvm.rand<-cdf.test(pattern.um, "x", "cvm") # Cramer-Von Mises test
cvm.rand

ad.rand<-cdf.test(pattern.um, "x", "ad") # Anderson-Darling test
ad.rand

bt1<-berman.test(pattern.um, "x") # Berman Z1
bt1

bt2<-berman.test(pattern.um, "x", "Z2") # Berman Z2
bt2

ks.rand.im<-cdf.test(pattern.um, mat_pix)
ks.rand.im

cvm.rand.im<-cdf.test(pattern.um, mat_pix, "cvm")
cvm.rand.im

ad.rand.im<-cdf.test(pattern.um, mat_pix, "ad")
ad.rand.im

bt1.im<-berman.test(pattern.um, mat_pix) 
bt1.im

bt2.im<-berman.test(pattern.um, mat_pix, "Z2") 
bt2.im

plot(ks.rand)       # Fig.9.8a
plot(ks.rand.im)    # Fig.9.8b

Figure 9.8: Plot of Kolmogorov-Smirnov test under a presence of a covariate: a) x covariate, b) pixel image covariate

9.2.3. Kernel density estimation

d.ppp<-density(pattern.um) # calculated for pixels, sigma not precalculated
summary(d.ppp)

# Fig.9.9a – kernel density estimation with contours (isolines)
plot(d.ppp, main="Density of firms witin a region")
contour(d.ppp, add=TRUE)

# Fig.9.9b – 3D density rotated
persp(d.ppp, main="3D density of firms witin a region", phi=30, theta=-30)

Figure 9.9: Density estimation of study pattern: a) KDE with isolines, b) 3D perspective plot

# example does not pre-calculate bandwidth, formally it is density.ppp()
d0.ppp<-density(pattern.um) # by default - given above
d1.ppp<-density(pattern.um, kernel="epanechnikov")
d2.ppp<-density(pattern.um, kernel="quartic")
d3.ppp<-density(pattern.um, kernel="disc")

par(mfrow=c(2,2))       # Fig.9.10, four figures (2x2) jointly
par(mar=c(1,1,1,1))
plot(d0.ppp, main="Gaussian")
plot(d1.ppp, main="Epanechnikov")
plot(d2.ppp, main="quartic")
plot(d3.ppp, main="disc")
par(mfrow=c(1,1))       # back to single figure setting

Figure 9.10. Density estimation – various kernels

# bandwidths – radii (sigma)
diggle_ppp<-bw.diggle(pattern.um)   # Diggle
ppl_ppp<-bw.ppl(pattern.um)         # likelihood CV
scott_ppp<-bw.scott(pattern.um)     # Scott, outputs two sigma
CvL_ppp<-bw.CvL(pattern.um)         # Cronie and van Lieshout
sigma<-c(diggle_ppp, ppl_ppp, scott_ppp, CvL_ppp)

# Gaussian kernel estimation with pre-calculated sigma; edge correction TRUE 
# positive=TRUE -> density values are forced to be positive
d_diggle_ppp<-density(pattern.um, sigma=diggle_ppp, positive=TRUE)
d_ppl_ppp<-density(pattern.um,  sigma=ppl_ppp, positive=TRUE)
d_scott_ppp<-density(pattern.um,  sigma=scott_ppp, positive=TRUE)
d_cvl_ppp<-density(pattern.um,  sigma=CvL_ppp, positive=TRUE)

plot(d_diggle_ppp, main="Diggle")   # Fig.9.11a
#plot(d_ppl_ppp, main="likelihood CV") # looks similar to Diggle
plot(d_scott_ppp, main="Scott") # Fig.9.11b
# plot(d_cvl_ppp, main="Cronie & van Lieshout") # looks similar to Scott

Figure 9.11: Kernel density estimation with various bandwidths: a) Diggle, b) Scott

9.3. Distance-based analysis of the unmarked point pattern

9.3.1. Distance-based measures

Ki<-Kinhom(pattern.um)
Ki

Fi<-Finhom(pattern.um)
Fi

Gi<-Ginhom(pattern.um)
Gi

Ji<-Jinhom(pattern.um)
Ji

plot(Ki) # Fig.9.12a
plot(Fi) # Fig.9.12b
plot(Gi) # Fig.9.12c
plot(Ji) # Fig.9.12d

Figure 9.12: Inhomogeneous functions for unmarked point pattern: a) K function, b) F function, c) G function, d) J function

ce.test<-clarkevans.test(pattern.um)
ce.test

ht<-hopskel.test(pattern.um)
ht

9.3.2. Monte-Carlo tests

# time consuming procedure
t0<-Sys.time()      # may take ~ 5 hours!
E.ptwise<-envelope(pattern.um, Linhom, nsim=19, fix.n=T)
t1<-Sys.time()
t1-t0
E.ptwise
plot(E.ptwise)      # Fig.9.13a

# Global, p-value = 1/(1+nsim)
t2<-Sys.time() # smaller number of simulation to save time
E.g<-envelope(pattern.um, Linhom, nsim=5, rank=1, global=TRUE)
t3<-Sys.time()
t3-t2 # ~1.7 hours
E.g
plot(E.g)       # Fig.9.13b

Figure 9.13: Estimation of point pattern: a) pointwise envelope and b) global envelope of inhomogeneous L function for study point pattern

dclf.ppp<-dclf.test(pattern.um, Lest, nsim=5)
dclf.ppp # ~1,7 hours

mad.ppp<-mad.test(pattern.um, Lest, nsim=5)
mad.ppp

9.4. Intensity-based analysis of marked point pattern

pattern.m2<-ppp(x=st_coordinates(firms.sf2)[,1], y=st_coordinates(firms.sf2)[,2], window=W, marks=as.factor(firms.sf$SEC_agg)) # add marks and make ppp class
pattern.m2<-rjitter(pattern.m2, 0.02)   # relocate points to avoid overlaps
pattern.m2<-rescale.ppp(pattern.m2, rsc, "km") # rescale
s<-segregation.test(pattern.m2, nsim=19)
s

9.5. Correlation and spacing analysis of the marked point pattern

9.5.1. Analysis under assumption of stationarity

Kfm<-Kcross(pattern.m, "F", "M")
Kfm
summary(Kfm)

Kdi<-Kdot(pattern.m, "G")
Kdi

ma<-markconnect(pattern.m, "J", "K") # time consuming for analysed pattern
ma

mc<-markcorr(pattern.m)
mc

Ic<-Iest(pattern.m)
Ic

pattern.m$window # initial window

# set a new window
Window(pattern.m)<-owin(c(1508.058,1684.539), c(4067.368,4294.449)) 
pattern.m$window # changed window

E<-envelope(pattern.m, Lcross, nsim=99, i="A", j="C", 
            simulate=expression(rshift(pattern.m, radius=35)))
summary(E)

# Fig.9.14a, Kcross() for sectors F and M
plot(Kfm, main="Homogeneous K cross function, \n sectors F and M")  
# Fig.9.14b, Kdot() for sector G
plot(Kdi, main="Homogeneous K dot function, \n sector G vs other")  
plot(ma, main="Mark connection function, \n sectors J and K")# Fig.9.14c
plot(mc, main="Mark equality function \n for study pattern")# Fig.9.14d
plot(Ic, main="I function \n for study pattern")        # Fig.9.14e
# Fig.9.14f, envelope() for sectors A and C
plot(E, main="Randomisation test of \n components’ independence")

Figure 9.14: Graphic analysis of a multitype point pattern under assumption of stationarity: a) homogeneous Kcross function, b) homogeneous Kdot function, c) mark connection function, d) mark equality function, e) I function, f) randomisation of test components’ independence

9.5.2. Analysis under assumption of non-stationarity

Kfmi<-Kcross.inhom(pattern.m, "F", "M")
Kfmi

Kdii<-Kdot.inhom(mpattern.m, "G")
Kdii

# Fig.9.15
plot(Kfmi, main="Inhomogeneous K cross function, \n sectors F and M")
plot(Kdii, main="Inhomogeneous K dot function, \n sector G vs other")

Figure 9.15: Graphic analysis of a multitype point pattern under assumption of non-stationarity: a – inhomogeneous Kcross function, b – inhomogeneous Kdot function

Chapter 10: Spatial sampling and bootstrapping (Katarzyna Kopczewska)

10.1 Spatial sampling - generating new points on the surface

# Fig.10.1a - sampling of points - regular distribution
plot(st_geometry(region.voi), main="Regular distribution")
points(st_sample(region.voi, 1000, type="regular"), pch=".", cex=3)

# Fig.10.1b - sampling of points - random distribution
plot(st_geometry(region.voi), main="Random distribution")
points(st_sample(region.voi, 1000, type="random"), pch=".", cex=3)

# Fig.10.1c - sampling of points - hexagonal distribution in planar projection
region.voi.p<-st_transform(region.voi, 3857) # re-project to planar
plot(st_geometry(region.voi.p), main="Hexagonal distribution")
points(st_sample(region.voi.p, 1000, type="hexagonal"), pch=".", cex=3)

# Fig.10.1d - own function to generate clustered points
clustered_sample<-function(region, n_points=1000, n_clusters=20, sd=2000) {  
#1.cluster centers
centers<-st_sample(region, size=n_clusters, type="random")
    centers_xy<-st_coordinates(centers)
#2.how many points per cluster?
sizes<-as.vector(rmultinom(1, n_points, rep(1, n_clusters)))
#3.generate offspring points around each center
x_all<-y_all<-numeric(0)
for(i in seq_len(n_clusters)) {
x_all<-c(x_all, rnorm(sizes[i], mean=centers_xy[i, "X"], sd=sd))
y_all<-c(y_all, rnorm(sizes[i], mean=centers_xy[i, "Y"], sd=sd))}
pts<-st_as_sf(data.frame(x=x_all, y=y_all), coords=c("x", "y"), crs=st_crs(region))

#4.keep points inside polygon
pts_inside<-pts[region, , op=st_within]
pts_inside
}           # end of the function

# create clustered points using own function
clust_pts<-clustered_sample(region.voi.p, n_points=1000, n_clusters=15, sd=10000)

# plot
plot(st_geometry(region.voi.p), main="Clustered spatial distribution")
plot(st_geometry(clust_pts), pch=16, cex=0.6, add=TRUE)

Figure 10.1: Sampling points in space: a) regular, b) random, c) hexagonal, d) clustered

# 5x5 grid over the extent of region.voi in "sfc_POLYGON" and "sfc" class
grid5<-st_make_grid(region.voi, n=c(5, 5), what="polygons")

# turn grid into an sf object – now in "sf" and "data.frame" class
grid5_sf<-st_as_sf(data.frame(id=1:length(grid5)), geometry=grid5)

# change geometry to other type (MULTIPOLYGON)
grid5_mp<-st_cast(grid5_sf, "MULTIPOLYGON")
st_geometry_type(grid5_mp)  # check results, all are "MULTIPOLYGON"

# Fig.10.2a – full grid over the regional contour
plot(st_geometry(region.voi), border="grey")
plot(st_geometry(grid5_mp), add=TRUE)

# clip the grid cells to region 
grid5_clipped<-st_intersection(grid5_mp, region.voi)
grid5_clipped<-st_collection_extract(grid5_clipped, "MULTIPOLYGON")

# Fig.10.2b – limited (clipped) grid witin the regional contour
plot(st_geometry(region.voi), border="grey")
plot(st_geometry(grid5_clipped), add=TRUE)

# add a vector of values to grids – to set how many points do sample
dim(grid5_clipped)  # 24    36 - 24 grid cells after clip (and 36 columns)
val<-round(runif(n=24, min=3, max=15),0) # random values for each grid
val

# Fig.10.2c - plot the coloured grid cells
grid5_clipped$val<-val  # assign drawn values to grid cells
ggplot() + geom_sf(data=grid5_clipped, aes(fill=val))

# sample points within grid cell according to created vector
pts<-st_sample(x=st_make_valid(grid5_clipped), size=grid5_clipped$val, type="random", by_polygon=TRUE)

# Fig.10.2d – plot the sampled points, add text layer in grid centroids
centers<-st_centroid(st_make_valid(grid5_clipped)) # grid centroids
ggplot() + geom_sf(data=grid5_clipped) + geom_sf(data=pts, size=0.6) + geom_sf_text(data=centers, aes(label=val), size=5) + theme_minimal()

Figure 10.2: Sampling points with multipolygon: a) not-clipped grid, b) clipped grid, c) values assigned to grid, d) sampling points according to values in grid

library(spatstat.geom)      # for rsyst()
library(spatstat.random)    # for rstrat()

region.voi.p<-st_transform(region.voi, 3857) # re-project to planar
region.voi.owin<-as.owin(st_as_sf(region.voi.p)) # owin class

# Fig.10.3a - stratified sampling, with background divided into tiles
r1<-rstrat(win=region.voi.owin, nx=10, ny=10, k=5)
tiles<-quadrats(region.voi.owin, nx=10, ny=10)  # to see those tiles
par(mar=c(1,1,1,1))     # narrow margins – so bigger figure
plot(r1, main="Strafified sampling, k=5 obs. in the tile") 
plot(tiles, add=TRUE, lty=3, col="grey90")  # add tiles 

# Fig.10.3b - generating points evenly distributed in space
r2<-rsyst(win=region.voi.owin, nx=10, ny=10) 
plot(r2, main="Regular sampling") # regular points and regional contour

Figure 10.3: Spatial sampling using spatstat:: a) stratified sampling, b) regular sampling

10.2 Spatial sampling for bootstrap – selecting sub-sample from existing points

10.2.1 Simple sampling – case resampling

selected<-matrix(sample(1:500, size=500*30, replace=TRUE), byrow=TRUE, nrow=500, ncol=30)   # matrix of random IDs
selected[1:5, 1:5] # the result of the ID draw
freq<-as.data.frame(table(selected))    # frequency of draws of IDs
dim(freq)                       # all IDs were sampled
head(freq)                      # frequency of sampled IDs

sub<-points[1:500, ]    # subsample with 500 obs.
sub$ID<-1:500       # add ID variable to the dataset
sub<-merge(sub, freq, by.x="ID", by.y="selected", all.x=TRUE)

# Fig.10.4a - under- and over-representation in sampling
sub$color<-"grey70"             # as expected
sub$color[is.na(sub$Freq)==TRUE]="red"  # never selected
sub$color[sub$Freq<25]="blue"           # deeply under-selected
sub$color[sub$Freq>35]="green"      # deeply over-selected
plot(sub[,c("crds.x", "crds.y")], bg=sub$color, pch=21, cex=2)
legend("bottomleft", pch=21, pt.bg=c("grey70", "red", "blue", "green"), c("as expected", "never selected", "under-selected", "over-selected"), bty="n", cex=0.8)
plot(st_geometry(region.voi), add=TRUE)

# data to analyse spatial distributions of under- and over-representation
sub2<-sub[sub$color=="blue" | sub$color=="green",]
sub2$marks<-0                       # column of values 0
sub2$marks[sub2$color=="blue"]<-"case"      # under-selected obs.
sub2$marks[sub2$color=="green"]<-"control"  # over-selected obs.
sub2<-sub2[,c("crds.x", "crds.y", "marks")] # selected columns only
sub2

# convert classes: sf spherical to sf planar (Mercator)
sub2.sf<-st_as_sf(sub2, coords=c("crds.x", "crds.y"), crs=4326) # spherical
sub2.sf2<-st_transform(st_as_sf(sub2.sf), crs=st_crs(3857)) # planar
coords<-st_coordinates(sub2.sf2)    # coordinates of points

# make ppp class objects
library(spatstat)       # package for ppp class and relrisk()
region.merc<-st_transform(st_as_sf(region.voi), crs=st_crs(3857)) # sf
region.owin<-spatstat.geom::as.owin(sf::st_geometry(region.merc)) # owin
sub2.ppp<-ppp(x=coords[,1], y=coords[,2], window=region.owin, marks=factor(sub2.sf2$marks))         # ppp class

# Fig.10.4b - spatial relative risk (srr)
srr<-relrisk(sub2.ppp, se=TRUE)             # from spatstat::
plot(srr$estimate, main="Spatial Relative Risk")    # plot estimates of srr

Figure 10.4: Results of multiple sampling: a) location of drawn observations, b) relative risk of under- and over-represented points

corr.raw<-cor(sub$locPdens, sub$locAggTOTAL) # original correlation
corr.raw

# matrix where IDs where replaced with the appropriate values of variable
sub.boot.v1<-matrix(sub$locPdens[selected], nrow=nrow(selected), ncol= ncol(selected))  # 500 rows x 30 columns
sub.boot.v2<-matrix(sub$locAggTOTAL[selected], nrow=nrow(selected), ncol= ncol(selected))   # 500 rows x 30 columns

# correlation coefficients for each iteration using sapply()
corr.boot<-sapply(seq_len(ncol(sub.boot.v1)), function(i) cor(sub.boot.v1[, i], sub.boot.v2[, i]))
corr.boot
summary(corr.boot)
ci<-quantile(corr.boot, c(0.025, 0.975))  # 95% bootstrap confidence interval
ci

10.2.2 Sampling points from k-means clusters - block bootstrap

# prepare data
crds<-st_coordinates(st_centroid(st_make_valid(pov)))    # coordinates
data23<-data[data$year==2023, c("ID_MAP", "unemployment_rate", "dist")]
data23$crds.x<-crds[,1]         # add x coordinates to the data set
data23$crds.y<-crds[,2]         # add y coordinates to the data set
data23$crds<-crds   # add xy coordinates jointly (crds.X and crds.Y)

n<-dim(data23)[1]   # number of observations (rows) - 380
b<-10           # average cluster length (#obs. in cluster)
k<-n/b          # the number of clusters - 38

c1<-kmeans(crds, k)     # k-means algorithm applied to coordinates
data23$clust<-c1$cluster    # clustering vector added to dataset
names(data23)   # coordinates were added separately and jointly

library(viridisLite)    # allows for e.g. 38 colours
cols<-inferno(k)        # colour palette, other option is e.g. plasma(k)
brks<-1:k           # intervals

plot(st_union(voi))     # all regions merged to one, plot not shown 
points(data23$crds, bg=cols[findInterval(data23$clust, brks)], pch=21, cex=2.5)
title(main="Clusters from the k-average algorithm")

# turn data into sf points (380 obs.)
df.sf<-st_as_sf(data23, coords=c("crds.x", "crds.y"), crs=4326)

# group observations by cluster and compute centroid of each cluster
# st_combine() puts all points geometries into one container
# st_union() truly merges geometries into single shape

library(dplyr)  # for pipe operations and data processing
crds <- df.sf |> group_by(clust) |>     # average values within clusters
  summarise(unemp_rate=mean(unemployment_rate, na.rm=TRUE),
    geometry=st_centroid(st_combine(geometry))) |> ungroup()

# Voronoi tessellation for the cluster centroids
region.p<-sf::st_transform(st_union(voi), crs=3857) # planar contour map
crds.p<-sf::st_transform(crds, crs=3857)        # planar point data

crds.sfc<-st_geometry(crds.p)       # class sfc from points (coordinates)
box.sfc<-st_geometry(region.p)  # class sfc from region
crds.union<-st_union(crds.sfc)  # union of points
tess<-st_voronoi(crds.union, box.sfc)   # Voronoi tesselation
tess.clip<-st_intersection(st_cast(tess), st_union(box.sfc)) # sfc class
tess.clip.wgs<-st_transform(tess.clip, crs=4326) # spherical tesselation
tess.sf<-st_as_sf(tess.clip.wgs)        # tessellation in sf class 
tess.sf$unemp_rate<-crds$unemp_rate     # add data to Voronoi tiles

# Fig.10.5a – using plot()
plot(st_geometry(tess.clip.wgs), lwd=2, lty=3)
points(data23[,c("crds.x", "crds.y")], bg=cols[findInterval(data23$clust, brks)], pch=21, cex=1.5)

# Fig.10.5b – using ggplot()
ggplot() + geom_sf(data=tess.sf, aes(fill=unemp_rate), alpha=0.6) +
geom_sf(data=df.sf, aes(col=factor(clust))) + coord_sf() + labs(fill="Unemployment rate", title="Voronoi tessellation based on cluster centroids") + theme_minimal()

Figure 10.5: Clusters of points generated with k-means plotted over tessellated surface: a) using plot(), b) using ggplot()

b_emp<-aggregate(data23$unemployment_rate, by=list(data23$clust), length)
head(b_emp)     # structure of clusters’ size

mmax<-max(b_emp$x) # Fig.10.6a, using hist()
hist(b_emp$x, breaks=1:mmax, ylim=c(0,12), labels=TRUE, col="orange") 
abline(h=(1:6)*2, lty=3)

# Fig.10.6b using ggplot()
ggplot(b_emp, aes(x=x)) + geom_histogram(binwidth=1, fill="grey50", color="white") + labs(title="Size of k-means clusters") + theme_minimal()

Fig.10.6: Histogram of cluster’s size: a) using hist(), b) using ggplot()

iter<-100           # number of iterations
result<-matrix(0, nrow=iter, ncol=4) # to write the results of analysis
colnames(result)<-c("unique clusters", "sample length", "unique observations", "correlation")
selected<-matrix(0, nrow=k, ncol=iter) # empty object to save sampled IDs

for(i in 1:iter){               # external loop for each iteration
ss<-sample(1:k, size=k, replace=TRUE)   # samples 38 values from 1:38
selected[,i]<-ss                # save IDs of randomly drawn clusters
result[i,1]<-length(unique(ss))     # save a number of unique clusters
boot1<-data23[1,] # empty object - the same structure as original dataset

# append from the bottom of the blocks of observation
for(j in 1:k){          # internal loop for each selected cluster
boot1<-rbind(boot1, data23[data23$clust==ss[j], ])
}                       # end of the internal loop

boot1<-boot1[-1,]
result[i,2]<-dim(boot1)[1]          # length of bootstrapped sample
result[i,3]<-length(unique(boot1[,1]))  # number of unique observations
result[i,4]<-cor(boot1[,2], boot1[,3]) # correlation between two variables
}                       # end of the external loop
head(result)            # IDs of iterations as rownames
selected[1:6, 1:6]

summary(result[,1]/38) # the percentage of unique clusters
summary(result[,2]/380) # length of the sample relative to the full sample
summary(result[,3]/380) # the percentage of unique observations
# correlation coefficient
summary(result[,4]/cor(data23$unemployment_rate, data23$dist)) 

# Fig.10.7a - statistical distribution of correlation coefficient
plot(density(result[,4]), main="Distribution of the correlation coefficient")
abline(v=cor(data23$unemployment_rate, data23$dist), lty=3, col="coral", lwd=2)

# Fig.10.7b – statistical distribution of parameters of sampled datsets
par(mar=c(4,4,4,4))
plot(density(result[,1]/38), xlim=c(0,2), ylim=c(0,8), main="Structure of the bootstrapped sample")                 # unique clusters
lines(density(result[,2]/380), lwd=2)   # the length of the trial
lines(density(result[,3]/380), lty=3)   # unique observations
legend(0.99,8,c("unique clusters%", "sample length%", "unique observations%"), lty=c(1,1,3), lwd=c(1,2,1), cex=1, bty="n")

Figure 10.7: Distributions of block bootstrap results: a) correlation of two variables, b) structure of a bootstrapped sample

iter<-100       # number of bootstrap iterations
n<-nrow(data23) # number of observations (380)
b<-10           # block length (expected points drawn per cluster)
k<-n/b      # number of clusters (38)
data23$ID<-seq_len(n) # add the ID to the basic file, the same as 1:n

result<-matrix(0, nrow=iter, ncol=2)    # empty object for results
colnames(result)<-c("unique_obs.", "correlation")
selected2<-matrix(0, nrow=n, ncol=iter)  # history of sampling

for(i in 1:iter) {  # first loop, over i iterations
c1<-kmeans(data23$crds, centers=k)  # new clustering in each iteration
data23$clust<-c1$cluster            # clustering vector
  boot1<-data23[0, ]        # truly empty data.frame with same columns
  pos<-0                    # position counter for filling selected2

for(m in 1:k) {     # second loop, over m clusters
sub<-data23[data23$clust==m, ]  # subset for a given cluster m
how.many.obs<-nrow(sub)     # check the length of subset
ss<-sample.int(how.many.obs, size=b, replace=TRUE)  # draw b rows 

for(j in 1:b) { # third loop, append row-by-row
      boot1<-rbind(boot1, sub[ss[j], ])
      pos<-pos+1    # record sampling history (IDs), including duplicates
      selected2[pos, i]<-sub$ID[ss[j]]
    }       # end of the third loop
  }     # end of the second loop
result[i, 1]<-length(unique(boot1$ID))  # number of unique observations
result[i, 2]<-cor(boot1$unemployment_rate, boot1$dist, use="complete.obs")
}       # end of the first loop

# Fig.10.8a – statistical distribution of bootstrapped correlation coeff.
ggplot(as.data.frame(result), aes(x=result[,2])) + geom_density(fill= "lightblue") + geom_vline(xintercept=cor(data23$unemployment_rate, data23$dist), linetype = "dashed") + ggtitle("Distribution of the correlation coefficient")

# ratio: bootstrapped-to-empirical correlation coefficient
summary(result[,2]/cor(data23$unemployment_rate, data23$dist))

# Fig.10.8b
ggplot(as.data.frame(result), aes(x=result[,1]/380)) + geom_density(fill= "lightcoral", alpha=0.8) + ggtitle("Percentage of unique observations")

# ratio: unique bootstrapped IDs to all IDs of observations
summary(result[,1]/380)

Figure 10.8: Results of stratified block bootstrap: a) density of correlation coefficient between variables and b) percentage of unique observations

10.2.3 Sampling points from moving blocks (moving block bootstrap, MBB)

# prepare data
crds<-st_coordinates(st_centroid(st_make_valid(pov)))       # coordinates
data.mbb<-data.frame(ID_MAP=data$ID_MAP[data$year==2023], unemp23=data$unemployment_rate[data$year==2023], unemp22=data$unemployment_rate[data$year==2022])
data.mbb$crds.x<-crds[,1]       # add x coordinates to the data set
data.mbb$crds.y<-crds[,2]       # add y coordinates to the data set
data.mbb$crds<-crds         # add xy coordinates once again - jointly

# determine the k nearest neighbors based on coordinates
library(spdep)
knn.set<-knearneigh(crds, k=9) # matrix of k=9 nearest neighbours, sp::
knn.set$nn # fragment of the knn matrix

# bootstrap iterations
lead<-sample(1:380, size=38, replace=FALSE) # drawing of main points
boot<-as.vector(knn.set$nn[lead,]) # vector from randomly drawn neighbours
boot2<-c(boot, lead) # 380 IDs of the main points and neighbours
boot2           # first several IDs of bootstrapped dataset
length(boot2) # total length of the drawn IDs
length(unique(boot2)) # number of unique IDs

# bootstrapped sample – original dataset by sampled IDs
boot3<-data.mbb[boot2,]
head(boot3)
cor(data.mbb$unemp22, data.mbb$unemp23) # correlation in a full sample
cor(boot3$unemp22, boot3$unemp23) # correlation in a bootstrap sample

vec<-(20:1)*5/100       # degree of coverage – 20 values, i in iterations
b=10                # block length, for Fig.10.9a b=20, Fig.10.9b b=10
knn.set<-knearneigh(crds, k=b-1) # matrix of k nearest neighbours

result<-matrix(0, nrow=30, ncol=20)             # place for output
colnames(result)<-paste(rep("cov", times=20), vec)  # columns for coverage
rownames(result)<-paste(rep("iter", times=30), 1:30)    # rows for iterations
result[1:5, 1:5]

for(i in 1:20){     # loop for i coverage levels - iterations by columns
samp.size<-380*vec[i] # the total number of observations to be drawn

for(j in 1:30){ # loop for iterations for a given coverage – results in rows
lead<-sample(1:380, size=ceiling(samp.size/b), replace=TRUE) # local sample
boot<-as.vector(knn.set$nn[lead,]) # vector from randomly drawn neighbours
boot1<-c(boot, lead) # join IDs of lead  points and their neighbours
data.boot<-data.mbb[boot1,]     # bootstrapped data
result[j, i]<-cor(data.boot$unemp23, data.boot$unemp22) # correlation
}} # end of two loops, for coverage and iterations
result[1:6, 1:6]    # iterations j in rows, coverage i in columns

mea<-as.vector(apply(result, 2, mean))     # mean by columns
sd<-as.vector(apply(result, 2, sd))     # sd by columns 
for.figure<-as.data.frame(cbind(mea, sd))       # mean and sd in one object
colnames(for.figure)<-c("mea", "sd")        # names of columns of object
for.figure$lower<-for.figure$mea-for.figure$sd      # lower bound 
for.figure$upper<-for.figure$mea+for.figure$sd      # upper bound
for.figure$model<-rep("block10", times=20) #change to "block10" for Fig.10.9b
for.figure$interval<-(20:1)*5/100       # coverage, the same as vec object
head(for.figure)

cor.fullsample<-cor(data.mbb$unemp22, data.mbb$unemp23) # for a full sample

# Fig.10.9a
p<-ggplot(data=for.figure, aes(x=interval, y=mea, colour=model)) + geom_point() + geom_line() + geom_ribbon(aes(ymin=lower, ymax=upper), linetype=2, alpha=0.1) + xlab("Coverage of the sample") + ylab("Average correlation") + geom_hline(yintercept= cor.fullsample, linetype="dashed") + ggtitle("MBB for b=10")
plot(p)  

# save the previous result as data1
for.figure1<-for.figure
# ///repeat the above analysis for block b=10 ///
# bind both datasets for block10 and block20
for.figure11<-rbind(for.figure, for.figure1)

# Fig.10.9b
p1<-ggplot(data=for.figure11, aes(x=interval, y=mea, colour=model)) + geom_point() + geom_line() + geom_ribbon(aes(ymin=lower, ymax=upper), linetype=2, alpha=0.1) + xlab("Coverage of the sample") + ylab("Average correlation") + geom_hline(yintercept=cor.fullsample, linetype="dashed") + ggtitle("MBB for b=10 and b=20")
plot(p1)

Figure 10.9: Average correlation and fluctuation corridor (by std.dev) in the moving block bootstrap (MBB) model for: a) block length b = 10, b) blocks of length b=10 and b=20

# simple bootstrap – case sampling
vec<-(20:1)*5/100       # degree of coverage – 20 values, i in iterations
result2<-matrix(0, nrow=30, ncol=20)            # place for output
colnames(result2)<-paste(rep("cov", times=20), vec) # columns for coverage
rownames(result2)<-paste(rep("iter", times=30), 1:30) # rows for iterations

for(i in 1:20){     # loop for i coverage levels - iterations by columns
samp.size<-380*vec[i] # the total number of observations to be drawn

# replace=TRUE for Fig.10.10a, replace=FALSE for Fig.10.10b
for(j in 1:30){ # loop for iterations for a given coverage – results in rows
lead<-sample(1:380, size=samp.size, replace=TRUE) # use also replace=FALSE 
boot<-matrix(0, nrow=samp.size, ncol=2)
data.boot<-data.mbb[lead,]  # bootstrapped data
result2[j, i]<-cor(data.boot$unemp23, data.boot$unemp22) # correlation
}} # end of two loops, for coverage and iterations

mea2<-as.vector(apply(result2, 2, mean))    # mean by columns
sd2<-as.vector(apply(result2, 2, sd))   # sd by columns
for.figure2<-as.data.frame(cbind(mea2, sd2))    # mean and sd in one object
colnames(for.figure2)<-c("mea", "sd")       # names of columns of object
for.figure2$lower<-for.figure2$mea-for.figure2$sd       # lower bound
for.figure2$upper<-for.figure2$mea+for.figure2$sd       # upper bound
for.figure2$model<-rep("simple", times=20)  # label for legend
for.figure2$interval<-(20:1)*5/100      # coverage, the same as vec object

for.figure3<-rbind(for.figure11, for.figure2)   # combine MBB and CR results

# joint MMB and CR chart - change label to ‘without’ for Fig.10.10b
p<-ggplot(data=for.figure3, aes(x=interval, y=mea, colour=model)) + geom_point() + geom_line() + geom_ribbon(aes(ymin=lower, ymax=upper), linetype=2, alpha=0.1) + xlab ("sample coverage") + ylab ("average correlation") + ggtitle("drawing with replacement") 
plot(p)     # Fig.10a,b

Figure 10.10: Comparison of movable block bootstrap and simple drawing typical for: a) bootstrap drawing with replacement and b) modification - drawing without replacement

10.3 Sampling as a tool in cross-validation of models

10.3.1 Spatial and non-spatial cross-validation – example of generated data

library(spdep)      # for spatial weights matrix
library(spatialreg) # for spatial regressions

# 1. Simulate strongly spatially autocorrelated SAR data
# create regular grid 20x20 = 400 units
nx<-20
ny<-20
coords<-expand.grid(x=1:nx, y=1:ny)     # regular grid of points
n<-nrow(coords)                 # n=400 for 20x20 grid
sf_dat<-st_as_sf(coords, coords=c("x", "y"), crs=3857)  # sf dataset

# knn neighbours in full set
knn<-knearneigh(st_coordinates(sf_dat), k=4)    #list of knn=4 for each point
nb<-make.sym.nb(knn2nb(knn))        # symmetric neighbourhood matrix
listw_full<-nb2listw(nb, style="W", zero.policy=TRUE)    # spatial weights

# strong spatial lag model: y=rho*W*y + X*b + eps
rho_true<-0.8   # strong spatial dependence
beta0<-2        # assumed intercept
beta1<-0.4      # relatively weak covariate effects for x1
beta2<-(-0.2)   # negative relation for x2
sigma_eps<-1    # standard deviation of residual term

x1<-rnorm(n)    # random values of X1
x2<-rnorm(n)    # random values of X2
eps<-rnorm(n, sd=sigma_eps) # random values of error term

W<-listw2mat(listw_full)    # matrix class of spatial weights matrix
I<-diag(n)              # diagonal elements

A_inv<-solve(I-rho_true*W)  # generate dependent variable y
y<-as.vector(A_inv %*% (beta0 + beta1 * x1 + beta2 * x2 + eps) )

sf_dat$y<-y     # attach generated variables to locations       
sf_dat$x1<-x1
sf_dat$x2<-x2
form_sar<-y~x1+x2   # equation to be estimated
sf_dat

library(viridis)        # for nice colours
ggplot(sf_dat) + geom_sf(aes(size=y, colour=y)) + scale_size_continuous(range=c(1.5, 6)) + scale_colour_viridis(option="C", direction=-1) + labs(title="Spatial distribution of simulated SAR outcome (y)", size="Value (y)", colour="Value (y)") + theme_minimal(base_size=18) + theme(legend.position="right", panel.grid.major=element_line(colour = "grey90"), panel.grid.minor=element_blank(), plot.title= element_text(size=12, face="bold")) # Fig.10.11a – distribution of y val.

# 2. Cross-Validation (CV) function for SAR model based on given folds
# note that most input objects is created later
# at that stage they do not have to exist in memory

cv_sar<-function(sf_dat, form_sar, nb_full, fold_id) {  # own function
  K<-length(unique(fold_id))    # number of folds, mostly assumed by user
  n<-nrow(sf_dat)           # size of dataset
  y_obs<-sf_dat$y           # dependent variable
  y_pred<-rep(NA_real_, n)  # predicted values
  X_full<-model.matrix(form_sar, data=sf_dat)   # matrix with data for model
  
for(k in sort(unique(fold_id))) {       # loop for each fold
  cat("  Fold", k, "of", K, "\n")       # text to display when waiting 
  test_idx<-which(fold_id==k)   # IDs of obs. in given fold for testing
  train_idx<-setdiff(seq_len(n), test_idx)  # IDs of obs. for training
  dat_train<-sf_dat[train_idx, ]    # training dataset
  dat_test<-sf_dat[test_idx, ]  # testing dataset
  is_train<-seq_len(n) %in% train_idx   # vector of TRUE/FALSE to label IDs
  nb_train<-subset.nb(nb_full, subset=is_train) # subset of neighbours
  listw_train<-nb2listw(nb_train, style="W", zero.policy=TRUE)  # W matrix
sar_k<-lagsarlm(form_sar, data=dat_train, listw=listw_train, method="LU", zero.policy=TRUE)    
  beta_hat<-coef(sar_k)     # beta coefficients from estimated model
X_test<-model.matrix(form_sar, data=dat_test) # data for prediction
y_pred[test_idx]<-drop(X_test %*% beta_hat[colnames(X_full)]) # predicted y
  }     # end of the loop

valid_idx<-which(!is.na(y_pred))    # check which predicted y are not NA
rmse<-sqrt(mean((y_obs[valid_idx] - y_pred[valid_idx])^2)) # RMSE
mae<-mean(abs(y_obs[valid_idx] - y_pred[valid_idx]))        # MAE
  
# construct and execute the function – outputs a table with results
fold_stats<-do.call(rbind, lapply(sort(unique(fold_id)), function(k) {
    idx<-which(fold_id==k)  # select proper obs. within fold
    data.frame(fold=k, rmse=sqrt(mean((y_obs[idx] - y_pred[idx])^2)),
    mae=mean(abs(y_obs[idx] - y_pred[idx])))}))
# list what is the output of the function
list(rmse=rmse, mae=mae, y_pred=y_pred, fold_stats=fold_stats)
}   # end of the function

# 3. Random (sampled) vs spatial (k-means) folds
K<-5        # number of folds, 5 is very typical number

# random (non-spatial) folds – simple sampling of IDs
fold_id_random<-sample(rep(1:K, length.out=n))

# spatial folds – from k-means on xy coordinates
coords_mat<-st_coordinates(sf_dat)      # coordinates from sf object
km<-kmeans(coords_mat, centers=K, nstart=25)    # k-means clustering
fold_id_spatial<-km$cluster

# 4. Cross-validation CV for both schemes - uses own function written above
cat("\n=== RANDOM (NON-SPATIAL) CV ===\n")  # text for waiting
res_random<-cv_sar(sf_dat, form_sar, nb_full=nb, fold_id=fold_id_random) 

cat("\n=== SPATIAL (BLOCK) CV ===\n")   # text for waiting
res_spatial<-cv_sar(sf_dat, form_sar, nb_full=nb, fold_id=fold_id_spatial)
data.frame(scheme=c("random", "spatial"), RMSE=c(res_random$rmse, res_spatial$rmse), MAE=c(res_random$mae,  res_spatial$mae)) # output

# 5. Visualisation of fold-wise RMSE
fold_stats_random<-transform(res_random$fold_stats, scheme="random")
fold_stats_spatial<-transform(res_spatial$fold_stats, scheme="spatial")
fold_stats_all<-rbind(fold_stats_random, fold_stats_spatial)
fold_stats_all

# Fig.10.11b – simple no-colour version
ggplot(fold_stats_all, aes(x=scheme, y=rmse)) + geom_boxplot() + geom_jitter(width=0.1, alpha=0.7) + labs(title="Fold-wise RMSE: Random vs Spatial Cross-Validation (SAR)", x="CV scheme", y="Fold RMSE") + theme_minimal()

# Fig.10.11b – more developed colour-version
ggplot(fold_stats_all, aes(x=scheme, y=rmse)) + geom_boxplot(outlier.shape= NA, fill="grey95", color="grey40") + geom_jitter(aes(size=rmse, colour= rmse), width=0.15, alpha=0.8) + scale_size_continuous(range=c(3, 8)) + scale_colour_viridis_c(option="C", end=0.95) + labs(title="Fold-wise RMSE: Random vs Spatial Cross-Validation", subtitle="Point size and colour represent RMSE", x="CV scheme", y="Fold RMSE", colour="RMSE", size="RMSE") + theme_minimal(base_size=14) + theme(panel.grid.minor= element_blank(), plot.title=element_text(face="bold"), legend.position="right")

Figure 10.11: Spatial and non-spatial cross-validation on generated data: a) spatial distribution of dependent variable; b) comparison of RMSE in both schemes

10.3.2 Spatial and non-spatial cross-validation – an example of empirical data

# spatial cross-validation in SEM model
library(spdep)      # for spatial weights matrix
library(spatialreg) # for spatial regressions

# 1. Prepare data for the analysis
data23<-data[data$year==2023, ]
form_sem<-unemployment_rate ~ salary_avg_Poland_100p + dist + birth_per_1K
X_full<-model.matrix(form_sem, data=data23) # data for model
p<-ncol(X_full)     # number of beta coefficients
n<-nrow(pov)        # number of observations in the original dataset

# spatial weights matrix for spatial econometric model
coords<-st_coordinates(st_centroid(st_make_valid(pov)))
knn<-knearneigh(coords, k=5)        # k=5 nearest neighbours
nb_full<-make.sym.nb(knn2nb(knn))   # symmetric neighbourhood
listw_full<-nb2listw(nb_full, style="W", zero.policy=TRUE) # W

# cluster the coordinates to obtain K folds
K<-5                # number of spatial folds
km<-kmeans(coords, centers=K, nstart=25)    # kmeans clustering
fold_id<-km$cluster # an integer from 1 to K for each observation
table(fold_id)      # check spatial fold sizes

# place for results - coefficients in each fold (betas + lambda)
coef_mat<-matrix(NA, nrow=K, ncol=p+1)          # empty matrix
colnames(coef_mat)<-c(colnames(X_full), "lambda")   # names of columns
y_obs<-data23$y                         # dependent variable
y_pred<-rep(NA_real_, n)        # empty vector for predictions fold-by-fold

# 2. Estimate models on test data and predict on train data
for (k in 1:K) {        # loop by folds
  cat("Running fold", k, "of", K, "...\n")  # text for waiting
  test_idx<-which(fold_id==k)               # IDs of obs. in given fold
  train_idx<-setdiff(seq_len(n), test_idx)  # IDs of obs. In other folds
  dat_train<-data23[train_idx, ]            # train data
  dat_test<-data23[test_idx, ]          # test data

# subset of neighbours: drop all neighbours that include test units
is_train<-seq_len(n) %in% train_idx  # vector of TRUE/FALSE to label IDs
nb_train<-subset.nb(nb_full, subset=is_train)   # limited neighbourhood
listw_train<-nb2listw(nb_train, style="W", zero.policy=TRUE)    # W
  
sem_k<-errorsarlm(form_sem, data=dat_train, listw=listw_train, zero.policy=TRUE, method="LU") # fit spatial error model on training data
beta_hat<-coef(sem_k)           # extract regression coefficients
lambda_hat<-sem_k$lambda        # extract lambda spatial coefficient
  
# transfer the results into common coefficient matrix
coef_mat[k, colnames(X_full)]<-beta_hat[colnames(X_full)]
coef_mat[k, "lambda"]<-lambda_hat
  
# predict on test set using linear predictor X * beta
X_test<-model.matrix(form_sem, data=dat_test)
y_pred[test_idx]<-drop(X_test %*% beta_hat[colnames(X_full)])
}                       # end of the loop

# 3 Cross-validated performance
valid_idx<-which(!is.na(y_pred))    # check IDs with non-NA prediction
rmse_cv<-sqrt(mean((y_obs[valid_idx] - y_pred[valid_idx])^2))   # RMSE
mae_cv<-mean(abs(y_obs[valid_idx] - y_pred[valid_idx]))     # MAE

cat("Spatial CV RMSE:", rmse_cv, "\n")  # text to be displayed
cat("Spatial CV MAE :", mae_cv,  "\n")

coef_mean<-colMeans(coef_mat, na.rm=TRUE)       # average of coeffcients
coef_sd<-apply(coef_mat, 2, sd, na.rm=TRUE) # sd of coefficients
coef_summary<-data.frame(term=names(coef_mean), mean=coef_mean, sd=coef_sd)
coef_summary                # set of coefficients for each fold

sem_full<-errorsarlm(form_sem, data=data23, listw=listw_full, zero.policy=TRUE, method="LU")    # full-data model
coef_full<-c(coef(sem_full)[colnames(X_full)], sem_full$lambda)

comparison<-data.frame(term=names(coef_full), full=unname(coef_full), cv_mean=coef_mean[names(coef_full)], cv_sd=coef_sd[names(coef_full)])
comparison      # comparison of CV and full model coefficients

# Fig.10.12 - line plot comparing model performance in folds
library(tidyr)  # for pivot_longer()
library(dplyr)  # for pipe notation %>%, group_by() and mutate()
library(viridis)    # for scale_colour_viridis_d()

coef_df<-as.data.frame(coef_mat)    # class data.frame
coef_df$fold<-factor(1:K)       # IDs of folds
coef_long<-pivot_longer(coef_df, cols=colnames(coef_mat), names_to="term", values_to="estimate") # convert coef_mat object into tidy long format

# standardised (z-score) coefficients per term
coef_long<-coef_long %>% group_by(term) %>% mutate(estimate_z=(estimate - mean(estimate, na.rm=TRUE)) / sd(estimate, na.rm=TRUE))

# full-model estimates (standardised)
coef_full_df<-data.frame(term=names(coef_full), full=coef_full)
coef_full_df<-coef_full_df %>% group_by(term) %>% mutate(full_z =(full - mean(coef_long$estimate[coef_long$term == term])) / sd(coef_long$estimate[coef_long$term == term]))
coef_plot_df<-merge(coef_long, coef_full_df, by="term") # merged objects

ggplot(coef_plot_df, aes(x=fold, y=estimate_z, colour=term, group=term)) + geom_point(size=3, alpha=0.7) + geom_line(alpha=0.7) +   geom_hline(aes(yintercept=full_z, colour=term), linetype="dashed", size=1)+ scale_colour_viridis_d(option="C") + labs(title="Standardised Coefficient Stability Across Spatial CV Folds", subtitle="Coefficients are z-scored within each variable; dashed line = full-sample estimate", x="Fold", y="Standardised coefficient (z-score)", colour="Term") + theme_minimal(base_size=14) + theme(legend.position="bottom", plot.title= element_text(face="bold", size=13)) # Fig.10.12

Figure 10.12: Model coefficients in different folds compared with full-sample model

# Fig.10.13a – map of folds used in cross-validation
pov$fold_id<-fold_id
ggplot(pov) + geom_sf(aes(fill=factor(fold_id)), colour="grey40", size=0.1) + scale_fill_viridis_d(option="C", name="Fold") + labs(title="Spatial Cross-Validation Folds (k=5)") + theme_minimal(base_size=14) + theme(legend.position="bottom", plot.title= element_text(face="bold", size=13))

# Fig.10.13b – map of residuals from the SEM model
pov$error<-y_obs - y_pred   # errors from the models
ggplot(pov) + geom_sf(aes(fill=error), colour="grey50", size=0.1) + scale_fill_viridis(option="B", direction=-1, name="Prediction error\n(y - ŷ)") + labs(title="Spatial Distribution of Prediction Errors") + theme_minimal(base_size=14) + theme(legend.position="bottom", plot.title= element_text(face="bold", size=13))

Figure 10.13: Maps of analysed phenomena: a) K=5 folds for cross-validation, b) residuals from the model

11. Other technical details

All figures for a book were saved in a following way:

# In R console: 
i<-"01_01a"
savePlot(filename=paste0("Fig_",i), type="jpg", device=dev.cur(), restoreConsole=TRUE)
savePlot(filename=paste0("Fig_",i), type="pdf", device=dev.cur(), restoreConsole=TRUE)
savePlot(filename=paste0("Fig_",i), type="tif", device=dev.cur(), restoreConsole=TRUE)
savePlot(filename=paste0("Fig_",i), type="emf", device=dev.cur(), restoreConsole=TRUE)
savePlot(filename=paste0("Fig_",i), type="eps", device=dev.cur(), restoreConsole=TRUE)

# In R studio:
save_plot_5formats<-function(i, plot_fun, width=10, height=10, res=300) {fname<-paste0("figures/Fig_", i)
  
## JPG
jpeg(paste0(fname, ".jpg"), width=width*res, height=height*res, res=res)
plot_fun()
dev.off()
  
## PDF
pdf(paste0(fname, ".pdf"), width=width, height=height)
plot_fun()
dev.off()
  
## TIFF
tiff(paste0(fname, ".tif"), width=width*res, height=height*res, res=res, compression="lzw")
plot_fun()
dev.off()
  
## EMF (Windows)
if (.Platform$OS.type == "windows") {
    win.metafile(paste0(fname, ".emf"), width=width, height=height)
    plot_fun()
    dev.off()
  } else {
    warning("EMF skipped: available only on Windows.")
  }
  
## EPS
postscript(paste0(fname, ".eps"), width=width, height=height, horizontal=FALSE, onefile=FALSE, paper="special")
plot_fun()
dev.off()
}

i<-"01_01a"
save_plot_5formats(i)

spatial_book_4

Katarzyna Kopczewska

2026-02-04