Welcome to the RPubs for the book ‘Applied Spatial Statistics And Econometrics: Data Analysis in R’ (Routledge, 2nd edition, 2026) (ed.Katarzyna Kopczewska).
This part is for Chapters 8-10.
To download and read data, please see RPubs with Intro.
This site is a collection of all R codes included in this book. These are just the codes - no comments, no explanations what, why and how. All those details are in the book.
The book was written by Spatial Warsaw team based at the Faculty of Economic Sciences at University of Warsaw. Visit us on LinkedIn.
data22<-data[data$year==2022, ] # select data for year 2022
data22$Y<-data22$housing_price_sqm_PLN # Y variable
data22$X1<-data22$urbanisation # X1 variable
data22$X2<-data22$dist # X2 variable
data22$X3<-data22$birth_per_1K # X3 variable
data22$X4<-data22$new_firms_per_10K_inhab # X4 variable
data22$X5<-data22$salary_average # X5 variable
data22$X6<-data22$unemployment_rate # X6 variable
eq<-Y~X1+X2+X3+X4+X5+X6 # equation to be estimated in all GWR models
lm.model<-lm(eq, data=data22) # global linear regression model (OLS)
summary(lm.model)# poviat data for one year with coordinates of centroids
pov<-st_transform(pov, 4326) # sf object, projections in WGS84
crds.pov<-st_coordinates(st_centroid(st_make_valid(pov))) # crds matrix
crds.pov<-as.data.frame(crds.pov) # crds data.frame
data22$crds.x<-crds.pov$X # add coordinates to dataset
data22$crds.y<-crds.pov$Y
library(sp) # for dataset in sp class
data22.sp<-data22 # copy the data.frame object
coordinates(data22.sp)<-c("crds.x", "crds.y") # change to sp class
proj4string(data22.sp)<-CRS("+proj=longlat +datum=WGS84") # projection
class(data22.sp)
# find optimal bandwidth, adaptive, with Gaussian (default) kernel
library(spgwr) # for gwr.sel()
bw.adapt.spgwr<-gwr.sel(eq, data=data22.sp, adapt=T, RMSE=T, method="cv")
bw.adapt.spgwr
# find the optimal bandwidth – fixed, in km
bw.fixed.spgwr<-gwr.sel(eq, data=data22.sp, adapt=F, RMSE=F, longlat=T)
# estimation of the GWR model, hatmatrix=T for diagnostic tests
gwr.mod.adapt<-gwr(eq, data=data22.sp, adapt=bw.adapt.spgwr, hatmatrix=T)
gwr.mod.adapt # summary of GWR model
summary(gwr.mod.adapt) # structure of output
library(viridis) # for viridis colours
library(gridExtra) # for joint plot of individual figures
pov$gwr.X1<-gwr.mod.adapt$SDF$X1
pov$gwr.X3<-gwr.mod.adapt$SDF$X3
p1<-ggplot() + geom_sf(pov, mapping=aes(geometry=geometry, fill=gwr.X1), inherit.aes=FALSE) + scale_fill_viridis(option="H") + theme_minimal()
p2<-ggplot() + geom_sf(pov, mapping=aes(geometry=geometry, fill=gwr.X3), inherit.aes=FALSE) + scale_fill_viridis(option="H") + theme_minimal()
p12<-arrangeGrob(p1, p2, ncol=2) # with gridExtra::
plot(p12)Figure 8.2: Spatial distribution of selected GWR parameters:
a) variable X1, b) variable X3
LMZ.F1GWR.test(gwr.mod.adapt) # H1: GWR is better than OLS – confirmed
LMZ.F2GWR.test(gwr.mod.adapt) # H1: GWR is better than OLS – confirmed
BFC99.gwr.test(gwr.mod.adapt) # H1: GWR is better than OLS – confirmed
LMZ.F3GWR.test(gwr.mod.adapt) # H1: GWR coefficients differ in space
eq<-Y~X1+X2+X3+X4+X5 # equation without X6 which is stable over space
bw.adapt.spgwr<-gwr.sel(eq, data=data22.sp, adapt=T, RMSE=T, method="cv")
bw.adapt.spgwr # which gives 0.12*380=46 neighbours
gwr.mod.adapt<-gwr(eq, data=data22.sp, adapt=bw.adapt.spgwr, hatmatrix=T)
LMZ.F1GWR.test(gwr.mod.adapt) # GWR is better than OLS
LMZ.F2GWR.test(gwr.mod.adapt) # GWR is better than OLS
BFC99.gwr.test(gwr.mod.adapt) # GWR is better than OLS
LMZ.F3GWR.test(gwr.mod.adapt) # all coefficients spatially non-stationary
library(spdep)
neighb<-knearneigh(coordinates(data22.sp), k=46) # knn class, spdep::
neighb.listw<-nb2listw(make.sym.nb(knn2nb(neighb))) # listw, spdep::
gwr.morantest(gwr.mod.adapt, neighb.listw) # Moran test for GWR, spgwr::
steps<-(1:10)*10
moran.stepwise<-matrix(0, nrow=length(steps), ncol=3)
colnames(moran.stepwise)<-c("No of neighb", "Moran’s I", "p-value")
for(i in 1:10){
neighb<-knearneigh(coordinates(data22.sp), k=steps[i])
neighb.listw<-nb2listw(make.sym.nb(knn2nb(neighb)))
a<-gwr.morantest(gwr.mod.adapt, neighb.listw)
moran.stepwise[i,1]<-steps[i]
moran.stepwise[i,2]<-a$estimate
moran.stepwise[i,3]<-a$ p.value
}
moran.stepwise
lm.model<-lm(eq, data=data22) # rerun OLS after equation has changed
lm.morantest(lm.model, neighb.listw) # Moran test for OLS, spdep::library(GWmodel)
# bandwidth estimation with GWmodel::
bw.adapt.gw<-bw.gwr(eq, data=data22.sp, kernel="gaussian", adaptive=T, p=2)
gwr.gw<-gwr.basic(eq, data=data22.sp, kernel="gaussian", adaptive=T, bw=bw.adapt.gw, F123.test=T) # GWR estimation, GWmodel::
gwr.gw # results printout
gwr.msc.gw<-gwr.multiscale(eq, data22.sp, kernel="gaussian", adaptive=T, bws0=c(0,0,0,0,0,0)) # GWR with various bandwidths, GWmodel::
gwr.msc.gw
gwr.mc.gw<-gwr.montecarlo(eq, data=data22.sp, kernel="gaussian", nsims=99, bw=bw.adapt.gw, adaptive=T, longlat=T)
# significance tests of local regression parameters with p-value correction
# p-values for a single bandwidth model
head(gwr.t.adjust(gwr.gw)$results$fb, 5) # first 5 observations
# p-values for a multi bandwidth model
head(gwr.t.adjust(gwr.msc.gw)$results$fb, 5) # first 5 observationsmodel.sel<-model.selection.gwr(DeVar='Y', InDeVars=c('X1', 'X2', 'X3', 'X4', 'X5'), data=data22.sp, kernel="gaussian", adaptive=TRUE, bw=bw.adapt.gw)
model.sel[[1]][15] # structure of models
model.sel[[2]][1:2,] # statistics of GWR models
sorted.models<-model.sort.gwr(model.sel, numVars=5, ruler.vector= model.sel[[2]][,3]) # sort models by AICc
model.list<-sorted.models[[1]] # save list of sorted models
# Fig.8.3a - illustration of the steps of step regression
model.view.gwr('Y', c('X1', 'X2', 'X3', 'X4', 'X5'), model.list=model.list)
# Fig.8.3b - AICc of sorted models
plot(sorted.models[[2]][,3], col="black", pch=20, lty=5, main="AICc for GWR models", ylab="AICc", xlab="Model number", type="b")Figure 8.3: An illustration of the GWR stepwise selection process
gwr_het<-gwr.hetero(eq, data=data22.sp, regression.points=data22.sp, longlat=T, bw=bw.adapt.gw, adaptive=TRUE, kernel='gaussian')
summary(gwr_het)
gwr_het<-gwr.hetero(eq, data=data22.sp, regression.points=data22.sp, longlat=T, bw=bw.adapt.gw, adaptive=TRUE, kernel='gaussian')
pov$gwr.gw.X1<-gwr.gw$SDF$X1 # add variables to the map
pov$gwr.het.X1<-gwr_het$X1
ggplot() + geom_sf(pov, mapping=aes(fill=gwr.gw.X1), inherit.aes=FALSE) + scale_fill_viridis(option="H") + theme_minimal() # Fig.8.4a
ggplot() + geom_sf(pov, mapping=aes(fill=gwr.het.X1), inherit.aes=FALSE) + scale_fill_viridis(option="H") + theme_minimal() # Fig.8.4b Figure 8.4: Maps of coefficients for variable X1: a) simple
GWR, b) heteroscedastic GWR
library(GWmodel) # for gwr.collin.diagno()
library(RColorBrewer) # for colours from colorRampPalette()
diag_gwr<-gwr.collin.diagno(eq, data22.sp, bw=bw.adapt.gw, kernel="gaussian", adaptive=T) # diagnostic statistics in GWR
summary(diag_gwr) # list class object
names(diag_gwr)
names(diag_gwr$SDF) # includes local_CN, VIF, etc.
names(gwr.gw$SDF) # structure of GWR estimation output (see 8.2.2)
# Fig.8.5a - local Condition Numbers (CN)
spplot(diag_gwr$SDF, "local_CN", main="Local condition number", col.regions=colorRampPalette(brewer.pal(11, "Spectral"))(100), cex=2.5)
# Fig.8.5b - local R2 map
spplot(gwr.gw$SDF, "Local_R2", main="Local R-squared", col.regions = colorRampPalette(brewer.pal(9, "Blues"))(100), cex=2.5)
# Fig.8.5c – local coefficients of the selected variable
coef_name<-"X2"
se_name<-"X2_SE"
spplot(gwr.gw$SDF, coef_name, main=paste("Local coefficient:", coef_name), col.regions=colorRampPalette(brewer.pal(11, "RdBu"))(100), cex=2.5)
# Fig.8.5d - standard error surface
spplot(gwr.gw$SDF, se_name, main=paste("Standard error of", coef_name), col.regions=colorRampPalette(brewer.pal(9, "YlOrRd"))(100), cex=2.5)Figure 8.5: Diagnostics of collinearity: a) condition
numbers, b) R2, c) β coeefficient, d) standard error (SE) of β
coefficient
# two alternative bandwidths: small (more local) vs large (more global)
bw_small<-bw.gwr(eq, data=data22.sp, kernel="bisquare", adaptive=TRUE)*0.5
bw_large<-bw.gwr(eq, data=data22.sp, kernel="bisquare", adaptive=TRUE)*1.5
# collinearity diagnostics for each bandwidth
colldiag_small<-gwr.collin.diagno(eq, data=data22.sp, bw=bw_small, kernel= "bisquare", adaptive=TRUE)
colldiag_large<-gwr.collin.diagno(eq, data=data22.sp, bw=bw_large, kernel= "bisquare", adaptive=TRUE)
pov$CN_small<-colldiag_small$SDF$local_CN # add CN to sf map
pov$CN_large<-colldiag_large$SDF$local_CN
ggplot()+ geom_sf(pov, mapping=aes(fill=CN_small))+ scale_fill_viridis( option="H") + theme_minimal() + ggtitle("CN for small bandwidth")
ggplot()+ geom_sf(pov, mapping=aes(fill=CN_large))+ scale_fill_viridis( option="H") + theme_minimal() + ggtitle("CN for large bandwidth")
# alternative plot with spplot() using sp class, not shown
data22.sp$CN_small<-colldiag_small$SDF$local_CN
data22.sp$CN_large<-colldiag_large$SDF$local_CN
spplot(data22.sp, c("CN_small", "CN_large"), main="Local condition number: small vs large bandwidth", col.regions=colorRampPalette(brewer.pal(11, "Spectral"))(100), cex=2.5)Figure 8.6: Maps showing CN values for alternative
bandwidths: a) small bandwidth, b) large bandwidth
head(round(diag_gwr$corr.mat,3), 3) # only three rows displayed
# add selected pairs of variables to the map
pov$X2X4<-diag_gwr$corr.mat[,11]
#pov$X2X4<-diag_gwr$SDF$Corr_X2.X4 # equivalent notation
pov$X4X5<-diag_gwr$corr.mat[,15]
#pov$X4X5<-diag_gwr$SDF$Corr_X4.X5 # equivalent notation
sd(pov$X2X4)/mean(pov$X2X4) # coefficient of variation of local corr
sd(pov$X4X5)/mean(pov$X4X5)
library(viridis) # Fig.8.7
ggplot()+ geom_sf(pov, mapping=aes(fill=X2X4))+ scale_fill_viridis( option="H") + theme_minimal() + ggtitle("Corr(X2,X4)")
ggplot()+ geom_sf(pov, mapping=aes(fill=X4X5))+ scale_fill_viridis( option="H") + theme_minimal() + ggtitle("Corr(X4,X5)")Figure 8.7: Spatial distribution of the correlation coefficients between the predictors used in the local regressions: a) correlation between X2 and X4, b) correlation between X4 and X5
local_CN<-diag_gwr$SDF$local_CN
spar<-0.8 # smoothing parameter
mat<-diag_gwr$SDF
par(mar=c(4,4,4,4))
plot(smooth.spline(local_CN, mat$Corr_X1.X2, spar=spar), ylim=c(-0.5, 0.6), xlim=c(15, 50), type="l", xlab="local Condition Number CN", ylab="local correlations between predictors", lty=1, lwd=1)
lines(smooth.spline(local_CN, mat$Corr_X1.X3, spar=spar), lty=2, lwd=2)
lines(smooth.spline(local_CN, mat$Corr_X1.X4, spar=spar), lty=3, lwd=3)
lines(smooth.spline(local_CN, mat$Corr_X1.X5, spar=spar), lty=4, lwd=1)
lines(smooth.spline(local_CN, mat$Corr_X2.X3, spar=spar), lty=5, lwd=2)
lines(smooth.spline(local_CN, mat$Corr_X2.X4, spar=spar), lty=1, lwd=3)
lines(smooth.spline(local_CN, mat$Corr_X2.X5, spar=spar), lty=2, lwd=1)
lines(smooth.spline(local_CN, mat$Corr_X3.X4, spar=spar), lty=3, lwd=2)
lines(smooth.spline(local_CN, mat$Corr_X3.X5, spar=spar), lty=4, lwd=3)
lines(smooth.spline(local_CN, mat$Corr_X4.X5, spar=spar), lty=5, lwd=1)
abline(v=c(30, 50, 100), lty=3, lwd=2)
abline(h=(-3:3)*0.2, lty=3)
legend("bottomleft", legend=c("Corr_X1.X2", "Corr_X1.X3", "Corr_X1.X4", "Corr_X1.X5", "Corr_X2.X3", "Corr_X2.X4", "Corr_X2.X5", "Corr_X3.X4", "Corr_X3.X5", "Corr_X4.X5"), lty=c(1,2,3,4,5,1,2,3,4,5), lwd=c(1, 2, 3, 1,2,3,1,2,3,1), bty="n")
title(main="Relationship between bilateral correlations of variables
and condition number")Figure 8.8: Relationships between local condition numbers
(CN) and local correlations of predictors
library(GGally) # for ggpairs()
library(PerformanceAnalytics) # for chart.Correlation()
coefs<-data.frame(x1=gwr.gw$SDF$X1, x2=gwr.gw$SDF$X2, x3=gwr.gw$SDF$X3, x4=gwr.gw$SDF$X4, x5=gwr.gw$SDF$X5) # save GWR coefficients in data.frame
ggpairs(coefs) # generate a correlogram, GGally::
chart.Correlation(coefs, method="spearman", hist=F) #PerformanceAnalytics::Figure 8.9: Correlation between parameters from GWR: a) with ggpairs() reporting Pearson correlation, b) with chat.Correlation() reporting Spearman correlation
library(gwrr) # for gwl.est()
gwr.lasso<-gwl.est(eq, locs=coordinates(data22.sp), kernel="gauss", data=as.data.frame(data22.sp)) # model estimation
summary(gwr.lasso) # results
gwr.lasso # results limited for few observations only
library(Metrics) # for rmse()
lm.model<-lm(eq, data=data22) # linear regression
rmse(data22$Y, predict(lm.model, data22)) # from metrics::
gwr.model<-gwr(eq, data=data22.sp, adapt=NULL, hatmatrix=TRUE, predictions=TRUE, bandwidth=bw.adapt.spgwr) # standard GWR from spgwr::
rmse(data22$Y, gwr.model$SDF$pred)
gwr.lasso<-gwl.est(eq, locs=coordinates(data22.sp), kernel="gauss", data=as.data.frame(data22.sp)) # GWR with Lasso from gwrr::
rmse(data22$Y, gwr.lasso$yhat)
# data in sf class
data22.sf<-st_as_sf(data22, coords=c("crds.x", "crds.y"), crs=4326)
# bandwidth estimation
bw<-bw.gwr.lcr(eq, data=data22.sf, kernel="gaussian", adaptive=FALSE, lambda.adjust=TRUE, cn.thresh=20)
bw
gwrr.model.reg<-gwr.lcr(eq, data=data22.sf, bw=bw, kernel="gaussian", adaptive=FALSE, lambda.adjust=TRUE, cn.thresh=20)
gwrr.model.reg# data processing
data_t<-data[data$year>2019, ] # panel data for years 2020-2023
data_t$code1<-data_t$code/1000 # add modified code variable
# add 0 as first digit to 3-digit codes to make them 4-digit
data_t$code_chr<-ifelse(is.na(data_t$code1), NA_character_, sprintf(paste0("%0", 4, "d"), as.integer(data_t$code1)))
data_t[1:5, c("code", "code1", "code_chr")] # evidence of changes
data_t$Y<-data_t$housing_price_sqm_PLN # Y variable
data_t$X1<-data_t$urbanisation # X1 variable
data_t$X2<-data_t$dist # X2 variable
data_t$X3<-data_t$birth_per_1K # X3 variable
data_t$X4<-data_t$new_firms_per_10K_inhab # X4 variable
data_t$X5<-data_t$salary_average # X5 variable
data_t$X6<-data_t$unemployment_rate # X6 variable
pov_data_t<-merge(pov, data_t, by.x="JPT_KOD_JE", by.y="code_chr", all.y=TRUE) # sf class data
nrow(pov_data_t) # it is 4*380 - panel added to a map
# Fig.8.10 – a map of the dependent variable
ggplot()+ geom_sf(pov_data_t, mapping=aes(fill=Y), inherit.aes=FALSE) + scale_fill_viridis(option="H") + facet_wrap(~year, nrow=2)+ theme_minimal() Figure 8.10: Panel plot of dependent variable (housing price
per sq.m) over the period 2020-2023
# prepare the sp class objects
pov_data_sp_t<-as_Spatial(pov_data_t) # SpatialPolygonsDataFrame
# spatio-temporal distance object
x<-st.dist(coordinates(pov_data_sp_t), obs.tv=pov_data_sp_t@data$year)
t0<-Sys.time() # start counting time
bw.gtwr.adapt<-bw.gtwr(eq, data=pov_data_sp_t, obs.tv=pov_data_sp_t@data$year, adaptive=TRUE, st.dMat=x)
t1<-Sys.time() # end counting time
t1-t0 # duration of computation, ~10 min
gtwr.adapt<-gtwr(eq, data=pov_data_sp_t, obs.tv=pov_data_sp_t@data$year, st.bw=bw.gtwr.adapt, adaptive=TRUE)
gtwr.adapt
pov_data_t$gtwr.X3<-gtwr.adapt$SDF$X3
ggplot()+ geom_sf(pov_data_t, mapping=aes(fill=gtwr.X3)) + facet_wrap(~year, nrow=2) + scale_fill_viridis(option="H") + theme_minimal() # Fig.8.11Figure 8.11: Coefficients from the GTWR model for variable X3
over time and space
library(dplyr) # for data manipulation with pipe ‘|>’
# selection of variables from the poviats dataset
selected.vars<-c("dist", "salary_average", "salary_avg_Poland_100p", "invest_share_pp", "birth_per_1K", "death_per_1K", "area_km2",
"number_of_flats", "area_of_flats_m2", "persons_per_1km2", "persons_in_K",
"persons_women_in_K", "persons_men_in_K", "urbanisation_pp", "persons_preproductive_age_K", "persons_productive_age_K", "persons_postproductive_age_K", "new_firms_per_10K_inhab", "closed_firms_per_10K_inhab", "small_firms_per_10k_inhab", "total_firms_empl.49_per_10K_inhab", "all_firms_per_10K_inhab", "new_medical_firms_pp", "new_creative_firms_pp", "housing_price_sqm_PLN", "unemployment_rate")
data23.selected<-data23[ ,selected.vars] # keeping the selected variables
data23.sel.num<-data23.selected |> mutate(across(everything(), as.numeric)) # make sure that all variables are numeric
# save the names of poviats as well as their identification codes in vectors
pov.codes<-as.integer(data23[, "subregion_number"])
pov.names<-as.character(data23[, "subregion_name_noPL_font"])
head(data23.sel.num, 3) # first 5 variables from the dataset, 3 rows
head(pov.codes, 5) # ID identification codes for the first five poviats
head(pov.names, 5)# names of the first five poviats
cor.mat<-cor(data23.sel.num, use="pairwise.complete.obs") # correlation
library(corrplot) # to plot matrix in colour using corrplot()
corrplot(cor.mat, method="color", type="upper", order="hclust", tl.cex=0.6,
tl.srt=45) # visualisation of the matrixFigure 8.12: Correlation matrix for the variables selected
for analysis
data23.sel.s<-scale(data23.sel.num) # scale the data (z-score)
head(data23.sel.s, 5) # first 5 scaled variables from the dataset
classic.pca<-prcomp(data23.sel.s, center=FALSE, scale.=FALSE)
summary(classic.pca)
classic.pca$rotation[, 1:5] # loadings matrix for the first 5 PCs
classic.pca$x[, 1:5] # scores matrix for first 5 poviats and first 5 PCs
summary(classic.pca$x[, 1])# summary statistics for PC1 scores
sd(classic.pca$x[, 1])# standard deviation of PC1 scores
library(factoextra) # for fviz_pca_var() and fviz_eig()
fviz_pca_var(classic.pca, labelsize=3, repel=TRUE) # Fig.8.13a
fviz_eig(classic.pca) # Fig.8.13b Figure 8.13: Graph of PCA results: a) impact (direction and
strength) of variables, b) scree plot with explained variance
fviz_contrib(classic.pca, "var", axes=1, xtickslab.rt=70) # Fig.8.14a, PC1
fviz_contrib(classic.pca, "var", axes=2, xtickslab.rt=70) # Fig.8.14b, PC2Figure 8.14: Contribution of variables to: a) first principal
component, b) second principal component
library(psych)
rotated.pca<-principal(data23.sel.s, nfactors=5, rotate="varimax")
print(loadings(rotated.pca), digits=2, cutoff=0.5, sort=T)
pov.rotated.pca.scores<-data.frame(pov.codes, pov.names, rotated.pca$scores[, 1:5]) # df with rotated scores, pov names and codes
colnames(pov.rotated.pca.scores)<-c("pov.code", "pov.name", "popul.scale", "urban.struct", "business.act", "wage.level", "demogr.dynamics") # add the “umbrella names” for rotated scores
head(pov.rotated.pca.scores) # 3 rows only
pov.rotated.pca.scores.sf<-pov.rotated.pca.scores %>% left_join(pov %>% select(JPT_KOD_JE, geometry) %>% mutate(JPT_KOD_JE= as.integer(JPT_KOD_JE)), by=c("pov.code"="JPT_KOD_JE")) %>% st_as_sf()
class(pov.rotated.pca.scores.sf)
head(pov.rotated.pca.scores.sf) # 3 rows only
ggplot() +geom_sf(data=pov.rotated.pca.scores.sf, aes(fill= popul.scale)) + scale_fill_viridis_c() + labs(title="Population scale", fill="RC1") + theme_minimal() +theme(plot.title= element_text(hjust=0.5, size=12))# RC1
ggplot() + geom_sf(data=pov.rotated.pca.scores.sf, aes(fill= urban.struct)) + scale_fill_viridis_c() + labs(title= "Urban structure",fill="RC2") + theme_minimal() +theme(plot.title= element_text(hjust=0.5, size=12)) #RC2
ggplot() +geom_sf(data= pov.rotated.pca.scores.sf, aes(fill= business.act)) + scale_fill_viridis_c() + labs(title="Business activity", fill="RC3") + theme_minimal() + theme(plot.title=element_text(hjust=0.5, size=12)) #RC3
ggplot() + geom_sf(data= pov.rotated.pca.scores.sf, aes(fill=wage.level))+
scale_fill_viridis_c() + labs(title="Wage level", fill="RC5") + theme_minimal()+ theme(plot.title=element_text(hjust=0.5, size=12)) # RC5
ggplot() + geom_sf(data= pov.rotated.pca.scores.sf, aes(fill= demogr.dynamics)) + scale_fill_viridis_c() + labs(title="Demographic dynamics", fill="RC4") + theme_minimal() + theme(plot.title= element_text(hjust=0.5,size=12))# RC4Figure 8.15: Spatial distribution of rotated principal components: a) population scale (RC1), b) urban structure (RC2), c) business activity (RC3), d) wage level (RC5), e) demographic dynamics (RC4)
# df integrated: standardised variables, poviat names and codes
data23.sel.s.int<-data.frame(pov.codes, pov.names, data23.sel.s)
# attach the geometries to the df
data23.sel.s.int.sf<-data23.sel.s.int %>% left_join(pov %>%
select(JPT_KOD_JE, geometry) %>% mutate(JPT_KOD_JE= as.integer(JPT_KOD_JE)), by=c("pov.codes"="JPT_KOD_JE")) %>% st_as_sf() %>% st_make_valid()
class(data23.sel.s.int.sf)
data23.sel.s.int.sp<-as(data23.sel.s.int.sf, "Spatial")
class(data23.sel.s.int.sp)
# save names of the variables in a vector
my.vars<-colnames(data23.sel.s.int.sp@data[3:28])
library(GWmodel) # for bw.gwpca() and gwpca()
# search for the optimal bandwidth: data in sp class, 5 components
my.bw<-bw.gwpca(data=data23.sel.s.int.sp, vars=my.vars, k=5, longlat=TRUE)
my.bw
gwpca.pov<-gwpca(data=data23.sel.s.int.sp, vars=my.vars, k=5, bw=my.bw, longlat=TRUE, scores=TRUE)
gwpca.pov
head(gwpca.pov$SDF@data)
# plot in sp class
library(sp) # for spplot()
# Fig.8.16a - local proportion of variance explained by the first PC
spplot(gwpca.pov$SDF, "Comp.1_PV", main="Local proportion of variance: PC1", par.settings=list(layout.heights=list(main.key.padding=2, top.padding=2)))
# Fig.8.16b - cumulative local proportion of variance explained by 5 PCs
spplot(gwpca.pov$SDF, "local_CP", main="Cumulative local proportion of variance: first 5 PCs", par.settings= list(layout.heights= list(main.key.padding=2, top.padding=2)))
# Fig.8.16c - winning variable for the first PC
gwpca.pov$SDF@data$win_var_PC1<-as.factor(gwpca.pov$SDF@data$win_var_PC1)
spplot(gwpca.pov$SDF, "win_var_PC1", main="Winning variable for PC1",
par.settings=list(layout.heights=list(main.key.padding=2, top.padding=2)))
# Fig.8.17 – the same plot in sf class
gwpca.pov.sf<-st_as_sf(gwpca.pov$SDF)
head(gwpca.pov.sf)
ggplot() + geom_sf(data=gwpca.pov.sf, aes(fill=win_var_PC1)) + scale_fill_discrete(name=NULL) + theme_minimal() + theme(plot.title= element_text(hjust=0.5, size=12))+ labs(title="Winning variable for PC1")Figure 8.16: GWPCA results visualised using sp class objects: a) local variance explained by PC1, b) cumulative local variance explained by the first five PCs, c) dominant variable for local PC1
Figure 8.17: Dominant variable for the first local principal
component (PC1) derived from GWPCA, visualised using sf objects
# local loadings
ld_pc1<-gwpca.pov$loadings[,c(2,7,11),1] # selected variable loadings for PC1
ld_pc2<-gwpca.pov$loadings[,c(2,7,11),2] # selected variable loadings for PC1
head(ld_pc1) # local loadings per variable in PC1, 3 rows only
summary(ld_pc1) # summary statistics for selected local loadings (PC1)
head(ld_pc2) # local loadings per variable in PC2, 3 rows only
summary(ld_pc2) # summary statistics for selected local loadings (PC2)
loc<-coordinates(gwpca.pov$SDF) # coordinates of the same locations as in SDF
# Fig.8.18a - glyph plot for local loadings of variables for PC1
plot(data23.sel.s.int.sp, col=NA, border="grey80", lwd=0.2) # contour
gwpca.glyph.plot(ld=ld_pc1, loc=loc, r1=20, add=TRUE)
title(main="Local PCA loadings (PC1)")
# Fig.8.18b - glyph plot for local loadings of variables for PC2
plot(data23.sel.s.int.sp, col=NA, border="grey80", lwd=0.2) # contour
gwpca.glyph.plot(ld=ld_pc2, loc=loc, r1=20, add=TRUE)
title(main="Local PCA loadings (PC2)")Figure 8.18: Glyph plot illustrating spatial variation in
geographically weighted PCA loadings: a) local loadings for the first
principal component (PC1) and b) local loadings for the second principal
component (PC2)
# data pre-processing – subset for more efficient analysis
points$random<-runif(nrow(points), 0, 1) # random values from uniform dist
points.lim<-points[points$random<=0.2,] # select ~20% of all obs.
nrow(points.lim) # 7392 obs.
# summary of industry sector, SEC_PKD7
table(points.lim$SEC_PKD7)
# summary by employment group
# empl_group=1 -> between 1 and 9 persons
# empl_group=2 -> between 10 and 49 persons
# empl_group=3 -> between 50 and 249 persons
# empl_group=4 -> between 250 and 999 persons
# empl_group=5 -> more than 1500 persons
table(points.lim$empl_group)
# summary by legal form
table(points.lim$legal_form)library(spatstat)
voi<-st_read("wojewodztwa.shp", options="ENCODING=UTF-8") # import a map
voi.p<-st_transform(voi, 3857) # re-project to planar
region.voi.p<-voi.p[voi.p$JPT_NAZWA_=="lubelskie",] # create a map subset
W<-as.owin(region.voi.p) # convert to owin class
# W<-as(region.voi.p, "owin") # alternative code for the same
class(W)
# transform point data
points.lim.sf<-st_as_sf(points.lim, coords=c("crds.x", "crds.y"), crs=4326)
points.lim.sf.p<-st_transform(points.lim.sf, 3857) # re-project to planar
crds.p<-st_coordinates(points.lim.sf.p) # matrix with points coordinates
# unmarked point pattern
firms.ppp.um<-ppp(x=crds.p[,1], y=crds.p[,2], window=W) # class ppp
# marked point pattern - marks are type of sector
firms.ppp.m<-ppp(x=crds.p[,1], y=crds.p[,2], window=W, marks=points.lim$SEC_agg) # class ppp
# Fig.9.2 - plot point patterns from example above
par(mar=c(1,1,1,1))
plot(firms.ppp.um, main="Unmarked point pattern") # with spatstat::
plot(firms.ppp.m, main="Marked point pattern") # with spatstat::Figure 9.2: Visualisation of point patterns in spatstat:: a)
unmarked, b) marked
# analyse the range of data
centre.region<-st_coordinates(st_centroid(st_geometry(region.voi.p)))
centre.region
bbox<-st_bbox(region.voi.p) # get coordinates of bounding box
bbox
max(crds.p[,1])-min(crds.p[,1]) # range of x coordinates
max(crds.p[,2])-min(crds.p[,2]) # range of y coordinates
# Fig.9.3a - create a circle window in spatstat::
# center of the circle approximately in the center of region
# parameters: a and b – ellipse radii, centre – center of circle,
# phi – anticlockwise rotation angle
# radii of ellipse are related to the range of points
circle<-ellipse(a=200000, b=200000, centre=centre.region, phi=pi)
firms.circle<-ppp(x=crds.p[,1], y=crds.p[,2], window=circle) # class ppp
plot(firms.circle) # plot class ppp in spatstat
# Fig.9.3b - create a rectangle window in spatstat::
# specify in owin: c(xmin, xmax), c(ymin, ymax)
rect<-owin(c(2406231, 2687896), c(6490003, 6852339)) # create a rectangle
firms.rect<-ppp(x=crds.p[,1], y=crds.p[,2], window=rect) # class ppp
plot(firms.rect) # plot class ppp in spatstatFigure 9.3: Point patterns with various windows: a) circle,
b) rectangle
# marked point patterns
# numerical marks - the size of employment
firms.ppp.empl<-ppp(x=crds.p[,1], y=crds.p[,2], window=W, marks=points.lim$empl_group)
# categorical marks – legal form of the firm
firms.ppp.lf<-ppp(x=crds.p[,1], y=crds.p[,2], window=W, marks=as.factor(points.lim$legal_form))
# marked point pattern with numerical marks – firms.ppp.empl
# control for size of points according to number of employees
plot(firms.ppp.empl, markscale=0.002, pch=1) # not shown
# marked point pattern with categorical marks – firms.ppp.lf
# control for colors of points according to legal form
plot(firms.ppp.lf, pch=16, cex=0.8, cols=c("red", "green", "blue"))
# split for numeric marks
# convert the numerical values to class factor
marks(firms.ppp.empl)<-as.factor(marks(firms.ppp.empl))
split_ppp_empl<-split(firms.ppp.empl) # split the pattern by factor variable
# Fig.9.4a - plot the splitted numeric mark point pattern
par(mar=c(2,2,2,2)) # window of plot
plot(W, main="Marked point pattern \n representing number of employees")
# layers of point pattern, the first layer is for the biggest grup
plot(split_ppp_empl[[1]], add=TRUE, cex=0.75, pch=16, col="green")
plot(split_ppp_empl[[2]], add=TRUE, cex=1, pch=16, col="blue")
plot(split_ppp_empl[[3]], add=TRUE, cex=1.25, pch=16, col="red")
plot(split_ppp_empl[[4]], add=TRUE, cex=1.5, pch=16, col="black")
#plot(split_ppp_empl[[5]], add=TRUE, cex=1.75, pch=16, col="orange")
legend("bottomleft", legend=c("5","30","150","600","1500"), col=c("green", "blue", "red", "black", "orange"), pch=16, cex=0.8)
# Fig.9.4b - split for categorical marks
split_ppp_firms<-split(firms.ppp.lf, reduce=TRUE)
cols<-c("red", "green", "blue")
par(mar = c(2,2,2,2))
plot(W, main="Marked point pattern \n representing legal form")
plot(split_ppp_firms[[2]], add=TRUE, col=cols[2], pch=18)
plot(split_ppp_firms[[3]], add=TRUE, col=cols[3], pch=18)
plot(split_ppp_firms[[1]], add=TRUE, col=cols[1], pch=18)
legend("bottomleft", legend=c("natural person", "organizational entity \n without legal personality", "legal person"), col=c("green", "blue", "red"), pch=18, cex=0.8, bty="n")Figure 9.4: Marked point patterns: a) with numerical marks,
b) with categorical marks
table(firms.ppp.m$marks) # old mark values
marks(firms.ppp.m)<-as.factor(points.lim$dummy_agri) # change values
summary(marks(firms.ppp.m)) # new mark values
summary(setmarks(firms.ppp.m, as.factor(points.lim$SEC_PKD7)))
summary(marks(firms.ppp.m)) # no changes can be observed
# alternative code - use pipeline operator from package magrittr::
library(magrittr)
firms.ppp.m %mark% as.factor(points.lim$SEC_PKD7)
unmark(firms.ppp.m) # option 1
marks(firms.ppp.m)<-NULL # option 2
firms.ppp.m
marks(firms.ppp.m)<-as.factor(points.lim$SEC_PKD7)
summary(marks(firms.ppp.m))sum(duplicated(firms.ppp.um)) # how many points are duplicated
firms.ppp.um<-unique(firms.ppp.um) # keep unique points only
firms.ppp.um
summary(duplicated(firms.ppp.um)) # how many unique points are in dataset
# alternative way - result is the same as above
firms.ppp.um<-firms.ppp.um[!duplicated(firms.ppp.um)]
summary(duplicated(firms.ppp.m))
firms.ppp.m.jitter<-rjitter(firms.ppp.m, 0.02)
summary(duplicated(firms.ppp.m.jitter))between_dist<-pairdist.ppp(firms.ppp.um) # between all points in dataset (km)
between_dist[1:5, 1:5] # output in class matrix
dim(between_dist) # exactly this amount is left after removing duplicates
# object as data.frame, 2nd minimum distance as the last column
between_dist<-as.data.frame(between_dist)
between_dist$min_d<-apply(between_dist[,1:6088], 1, function(x) sort(x)[2])
first_neib<-nndist(firms.ppp.um) # by default k=1, class numeric
# comparison of distances – 2nd minimum dist and knn=1 dist
head(first_neib)
head(between_dist$min_d)
cities_vec<-c('Lublin', 'Chelm', 'Biala Podlaska', 'Zamosc')
xc<-c(22.570278, 23.477778, 23.116667, 23.258611)
yc<-c(51.248056, 51.132222, 52.033333, 50.720556)
cities<-data.frame(xc, yc, cities_vec)
pwt.sf<-st_as_sf(cities, coords=c("xc", "yc"), crs=4326) # spherical
pwt.sf<-st_transform(st_as_sf(pwt.sf), crs=st_crs(3857)) # planar
pwt_ppp<-ppp(x=st_coordinates(pwt.sf)[,1], y=st_coordinates(pwt.sf)[,2], window=W) # ppp class
pwt_ppp<-rescale.ppp(pwt_ppp, rsc, "km") # planar point pattern for 4 cities
# apply crossdist() to obtain distances from all points to cities
d<-crossdist(firms.ppp.um, pwt_ppp)
colnames(d)<-cities_vec
d<-as.data.frame(d)
d$min_d<-apply(d[,1:4], 1, FUN=min) # which city is closest to the point
head(d)
dist.fun<-distfun(firms.ppp.um) # similar to above
class(dist.fun) # "distfun" "funxy" "function"
summary(dist.fun)
dist.map<-distmap(firms.ppp.um)
class(dist.map) # im class
summary(dist.map)
# Fig.9.6 - comparison of distfun() and distmap()
par(mar=c(2,2,2,2))
plot(dist.fun) # plot of distance function
plot(dist.map) # plot by pixels, class imFigure 9.6: Comparison of a) distfun() and b) distmap()
functions outputs
# W<-as.owin(lubelskie) # convert to owin class object, as previously
pattern<-firms.ppp.um # unmarked point pattern
marks(pattern)<-d$min_d # distance to closest powiat city as marks
# change class from ppp to im
mat_pix0<-as.im.ppp(pattern, W) # 128 x 128 pixel array (ny, nx)
par(mar=c(2,2,2,2))
# Fig.9.7a
mat_pix1<-pixellate(pattern) # measure the amount of data in each pixel
mat_pix1 # shows description of pixel image resolution and bounding box
plot(mat_pix1, main="pixellate") # count of firms in pixels
# Fig.9.7b
mat_pix2<-Smooth.ppp(pattern) # kernel smooth of values in irregular loc.
plot(mat_pix2, main="Smooth.ppp") # smoothed distance (mark)
# Fig.9.7c
mat_pix3<-idw(pattern) # inverse distance smoothing of obs.values
plot(mat_pix3, main="idw") # smoothed distance (mark)
# Fig.9.7d - duplicate initial pattern, firms.ppp.um, and add new marks
first_neib_ppp<-firms.ppp.um # copy-paste ppp object
marks(first_neib_ppp)<-first_neib # change marks – dist to knn1
first_neib_pix<-Smooth.ppp(first_neib_ppp) # smooth
plot(first_neib_pix, main="nndist") # plot pixeled dist to knn1Figure 9.7. Pixel image covariate – distance to closest
poviat city: a) count of firms from pixellate(), b) kernel smoothed
distance from Smooth.ppp(), c) inverse-distance smoothed values from
idw(), d) kernel smoothed new marks
# create a comprehensive point pattern for further analysis
points$random<-runif(nrow(points), 0, 1) # random values from uniform dist
firms<-points[points$random<=0.07,] # select ~7% of all observations
nrow(firms) # 2242 obs
# transform point data to spherical WGS84 and then Mercator planar projection
firms.sf<-st_as_sf(firms, coords=c("crds.x", "crds.y"), crs=4326) #spherical
firms.sf2<-st_transform(st_as_sf(firms.sf), crs=st_crs(3857)) # planar
# create owin class window
voi<-st_read("wojewodztwa.shp", options="ENCODING=UTF-8") # import a map
voi.p<-st_transform(voi, 3857) # re-project to planar
region.voi.p<-voi.p[voi.p$JPT_NAZWA_=="lubelskie",] # create a map subset
W<-as.owin(region.voi.p) # convert to owin class
# create unmarked point pattern
pattern.um<-ppp(x=st_coordinates(firms.sf2)[,1], y=st_coordinates(firms.sf2)[,2], window=W) # create ppp object
lubelskie.area<-25122.46 # area of region
rsc<-sqrt(area.owin(W)/lubelskie.area) # rescaling factor
pattern.um<-rescale.ppp(pattern.um, rsc, "km")
pattern.um<-unique(pattern.um) # keep unique points only
summary(pattern.um) # 2043 points, no duplicates
# create marked point pattern, marks are industry sectors
pattern.m<-ppp(x=st_coordinates(firms.sf2)[,1], y=st_coordinates(firms.sf2)[,2], window=W, marks=as.factor(firms.sf$SEC_PKD7)) # add marks and make ppp class
pattern.m<-rjitter(pattern.m, 0.02) # relocate points to avoid overlaps
pattern.m<-rescale.ppp(pattern.m, rsc, "km") # rescale
summary(pattern.m) # 2242 points, of marks as industry sectors
# pixel image covariate: distance to closest poviat city
cities_vec<-c('Lublin', 'Chelm', 'Biala Podlaska', 'Zamosc')
xc<-c(22.570278, 23.477778, 23.116667, 23.258611)
yc<-c(51.248056, 51.132222, 52.033333, 50.720556)
cities<-data.frame(xc, yc, cities_vec)
pwt.sf<-st_as_sf(cities, coords=c("xc", "yc"), crs=4326) # spherical
pwt.sf<-st_transform(st_as_sf(pwt.sf), crs=st_crs(3857)) # planar
pwt_ppp<-ppp(x=st_coordinates(pwt.sf)[,1],
y=st_coordinates(pwt.sf)[,2], window=W) # class ppp
pwt_ppp<-rescale.ppp(pwt_ppp, rsc, "km") # rescale to new units in km2
d<-crossdist(pattern.um, pwt_ppp) # distances between points and cities
colnames(d)<-cities_vec
d<-as.data.frame(d)
d$min_d<-apply(d[,1:4], 1, FUN=min)
head(d)
pattern.im<-pattern.um # unmarked point pattern
marks(pattern.im)<-d$min_d # distance to closest city as marks
mat_pix<-Smooth.ppp(pattern.im) # no rescaling, it was done before# H0: homogeneity => CSR, H1: heterogeneity => non-CSR
q1<-quadrat.test(pattern.um, nx=5, ny=5) # 25 squares
q1 # H1 as p-value is small
names(q1) # names of slots available in output
q1$method
q2<-quadrat.test(pattern.um, nx=10, ny=10) # 100 squares
q2$p.value
q3<-quadrat.test(pattern.um, nx=25, ny=25) # 625 squares
q3$p.value
q4<-quadrat.test(pattern.um, nx=50, ny=50) # 2.500 squares
q4$p.value
q5<-quadrat.test(pattern.um, nx=100, ny=100) # 10.000 squares
q5$p.value # H0: CSR => random locations, H1: non-CSR => non-random locations
ks.rand<-cdf.test(pattern.um, "x") # Kolmogorov-Smirnov test
ks.rand
plot(ks.rand) # Fig.9.8a cumulative distributions (theoretical & empirical)
cvm.rand<-cdf.test(pattern.um, "x", "cvm") # Cramer-Von Mises test
cvm.rand
ad.rand<-cdf.test(pattern.um, "x", "ad") # Anderson-Darling test
ad.rand
bt1<-berman.test(pattern.um, "x") # Berman Z1
bt1
bt2<-berman.test(pattern.um, "x", "Z2") # Berman Z2
bt2
ks.rand.im<-cdf.test(pattern.um, mat_pix)
ks.rand.im
cvm.rand.im<-cdf.test(pattern.um, mat_pix, "cvm")
cvm.rand.im
ad.rand.im<-cdf.test(pattern.um, mat_pix, "ad")
ad.rand.im
bt1.im<-berman.test(pattern.um, mat_pix)
bt1.im
bt2.im<-berman.test(pattern.um, mat_pix, "Z2")
bt2.im
plot(ks.rand) # Fig.9.8a
plot(ks.rand.im) # Fig.9.8b Figure 9.8: Plot of Kolmogorov-Smirnov test under a presence
of a covariate: a) x covariate, b) pixel image covariate
d.ppp<-density(pattern.um) # calculated for pixels, sigma not precalculated
summary(d.ppp)
# Fig.9.9a – kernel density estimation with contours (isolines)
plot(d.ppp, main="Density of firms witin a region")
contour(d.ppp, add=TRUE)
# Fig.9.9b – 3D density rotated
persp(d.ppp, main="3D density of firms witin a region", phi=30, theta=-30)Figure 9.9: Density estimation of study pattern: a) KDE with
isolines, b) 3D perspective plot
# example does not pre-calculate bandwidth, formally it is density.ppp()
d0.ppp<-density(pattern.um) # by default - given above
d1.ppp<-density(pattern.um, kernel="epanechnikov")
d2.ppp<-density(pattern.um, kernel="quartic")
d3.ppp<-density(pattern.um, kernel="disc")
par(mfrow=c(2,2)) # Fig.9.10, four figures (2x2) jointly
par(mar=c(1,1,1,1))
plot(d0.ppp, main="Gaussian")
plot(d1.ppp, main="Epanechnikov")
plot(d2.ppp, main="quartic")
plot(d3.ppp, main="disc")
par(mfrow=c(1,1)) # back to single figure settingFigure 9.10. Density estimation – various kernels
# bandwidths – radii (sigma)
diggle_ppp<-bw.diggle(pattern.um) # Diggle
ppl_ppp<-bw.ppl(pattern.um) # likelihood CV
scott_ppp<-bw.scott(pattern.um) # Scott, outputs two sigma
CvL_ppp<-bw.CvL(pattern.um) # Cronie and van Lieshout
sigma<-c(diggle_ppp, ppl_ppp, scott_ppp, CvL_ppp)
# Gaussian kernel estimation with pre-calculated sigma; edge correction TRUE
# positive=TRUE -> density values are forced to be positive
d_diggle_ppp<-density(pattern.um, sigma=diggle_ppp, positive=TRUE)
d_ppl_ppp<-density(pattern.um, sigma=ppl_ppp, positive=TRUE)
d_scott_ppp<-density(pattern.um, sigma=scott_ppp, positive=TRUE)
d_cvl_ppp<-density(pattern.um, sigma=CvL_ppp, positive=TRUE)
plot(d_diggle_ppp, main="Diggle") # Fig.9.11a
#plot(d_ppl_ppp, main="likelihood CV") # looks similar to Diggle
plot(d_scott_ppp, main="Scott") # Fig.9.11b
# plot(d_cvl_ppp, main="Cronie & van Lieshout") # looks similar to ScottFigure 9.11: Kernel density estimation with various
bandwidths: a) Diggle, b) Scott
Ki<-Kinhom(pattern.um)
Ki
Fi<-Finhom(pattern.um)
Fi
Gi<-Ginhom(pattern.um)
Gi
Ji<-Jinhom(pattern.um)
Ji
plot(Ki) # Fig.9.12a
plot(Fi) # Fig.9.12b
plot(Gi) # Fig.9.12c
plot(Ji) # Fig.9.12dFigure 9.12: Inhomogeneous functions for unmarked point
pattern: a) K function, b) F function, c) G function, d) J
function
# time consuming procedure
t0<-Sys.time() # may take ~ 5 hours!
E.ptwise<-envelope(pattern.um, Linhom, nsim=19, fix.n=T)
t1<-Sys.time()
t1-t0
E.ptwise
plot(E.ptwise) # Fig.9.13a
# Global, p-value = 1/(1+nsim)
t2<-Sys.time() # smaller number of simulation to save time
E.g<-envelope(pattern.um, Linhom, nsim=5, rank=1, global=TRUE)
t3<-Sys.time()
t3-t2 # ~1.7 hours
E.g
plot(E.g) # Fig.9.13bFigure 9.13: Estimation of point pattern: a) pointwise envelope and b) global envelope of inhomogeneous L function for study point pattern
pattern.m2<-ppp(x=st_coordinates(firms.sf2)[,1], y=st_coordinates(firms.sf2)[,2], window=W, marks=as.factor(firms.sf$SEC_agg)) # add marks and make ppp class
pattern.m2<-rjitter(pattern.m2, 0.02) # relocate points to avoid overlaps
pattern.m2<-rescale.ppp(pattern.m2, rsc, "km") # rescale
s<-segregation.test(pattern.m2, nsim=19)
sKfm<-Kcross(pattern.m, "F", "M")
Kfm
summary(Kfm)
Kdi<-Kdot(pattern.m, "G")
Kdi
ma<-markconnect(pattern.m, "J", "K") # time consuming for analysed pattern
ma
mc<-markcorr(pattern.m)
mc
Ic<-Iest(pattern.m)
Ic
pattern.m$window # initial window
# set a new window
Window(pattern.m)<-owin(c(1508.058,1684.539), c(4067.368,4294.449))
pattern.m$window # changed window
E<-envelope(pattern.m, Lcross, nsim=99, i="A", j="C",
simulate=expression(rshift(pattern.m, radius=35)))
summary(E)
# Fig.9.14a, Kcross() for sectors F and M
plot(Kfm, main="Homogeneous K cross function, \n sectors F and M")
# Fig.9.14b, Kdot() for sector G
plot(Kdi, main="Homogeneous K dot function, \n sector G vs other")
plot(ma, main="Mark connection function, \n sectors J and K")# Fig.9.14c
plot(mc, main="Mark equality function \n for study pattern")# Fig.9.14d
plot(Ic, main="I function \n for study pattern") # Fig.9.14e
# Fig.9.14f, envelope() for sectors A and C
plot(E, main="Randomisation test of \n components’ independence") Figure 9.14: Graphic analysis of a multitype point pattern under assumption of stationarity: a) homogeneous Kcross function, b) homogeneous Kdot function, c) mark connection function, d) mark equality function, e) I function, f) randomisation of test components’ independence
Kfmi<-Kcross.inhom(pattern.m, "F", "M")
Kfmi
Kdii<-Kdot.inhom(mpattern.m, "G")
Kdii
# Fig.9.15
plot(Kfmi, main="Inhomogeneous K cross function, \n sectors F and M")
plot(Kdii, main="Inhomogeneous K dot function, \n sector G vs other")Figure 9.15: Graphic analysis of a multitype point pattern under assumption of non-stationarity: a – inhomogeneous Kcross function, b – inhomogeneous Kdot function
# Fig.10.1a - sampling of points - regular distribution
plot(st_geometry(region.voi), main="Regular distribution")
points(st_sample(region.voi, 1000, type="regular"), pch=".", cex=3)
# Fig.10.1b - sampling of points - random distribution
plot(st_geometry(region.voi), main="Random distribution")
points(st_sample(region.voi, 1000, type="random"), pch=".", cex=3)
# Fig.10.1c - sampling of points - hexagonal distribution in planar projection
region.voi.p<-st_transform(region.voi, 3857) # re-project to planar
plot(st_geometry(region.voi.p), main="Hexagonal distribution")
points(st_sample(region.voi.p, 1000, type="hexagonal"), pch=".", cex=3)
# Fig.10.1d - own function to generate clustered points
clustered_sample<-function(region, n_points=1000, n_clusters=20, sd=2000) {
#1.cluster centers
centers<-st_sample(region, size=n_clusters, type="random")
centers_xy<-st_coordinates(centers)
#2.how many points per cluster?
sizes<-as.vector(rmultinom(1, n_points, rep(1, n_clusters)))
#3.generate offspring points around each center
x_all<-y_all<-numeric(0)
for(i in seq_len(n_clusters)) {
x_all<-c(x_all, rnorm(sizes[i], mean=centers_xy[i, "X"], sd=sd))
y_all<-c(y_all, rnorm(sizes[i], mean=centers_xy[i, "Y"], sd=sd))}
pts<-st_as_sf(data.frame(x=x_all, y=y_all), coords=c("x", "y"), crs=st_crs(region))
#4.keep points inside polygon
pts_inside<-pts[region, , op=st_within]
pts_inside
} # end of the function
# create clustered points using own function
clust_pts<-clustered_sample(region.voi.p, n_points=1000, n_clusters=15, sd=10000)
# plot
plot(st_geometry(region.voi.p), main="Clustered spatial distribution")
plot(st_geometry(clust_pts), pch=16, cex=0.6, add=TRUE)Figure 10.1: Sampling points in space: a) regular, b) random,
c) hexagonal, d) clustered
# 5x5 grid over the extent of region.voi in "sfc_POLYGON" and "sfc" class
grid5<-st_make_grid(region.voi, n=c(5, 5), what="polygons")
# turn grid into an sf object – now in "sf" and "data.frame" class
grid5_sf<-st_as_sf(data.frame(id=1:length(grid5)), geometry=grid5)
# change geometry to other type (MULTIPOLYGON)
grid5_mp<-st_cast(grid5_sf, "MULTIPOLYGON")
st_geometry_type(grid5_mp) # check results, all are "MULTIPOLYGON"
# Fig.10.2a – full grid over the regional contour
plot(st_geometry(region.voi), border="grey")
plot(st_geometry(grid5_mp), add=TRUE)
# clip the grid cells to region
grid5_clipped<-st_intersection(grid5_mp, region.voi)
grid5_clipped<-st_collection_extract(grid5_clipped, "MULTIPOLYGON")
# Fig.10.2b – limited (clipped) grid witin the regional contour
plot(st_geometry(region.voi), border="grey")
plot(st_geometry(grid5_clipped), add=TRUE)
# add a vector of values to grids – to set how many points do sample
dim(grid5_clipped) # 24 36 - 24 grid cells after clip (and 36 columns)
val<-round(runif(n=24, min=3, max=15),0) # random values for each grid
val
# Fig.10.2c - plot the coloured grid cells
grid5_clipped$val<-val # assign drawn values to grid cells
ggplot() + geom_sf(data=grid5_clipped, aes(fill=val))
# sample points within grid cell according to created vector
pts<-st_sample(x=st_make_valid(grid5_clipped), size=grid5_clipped$val, type="random", by_polygon=TRUE)
# Fig.10.2d – plot the sampled points, add text layer in grid centroids
centers<-st_centroid(st_make_valid(grid5_clipped)) # grid centroids
ggplot() + geom_sf(data=grid5_clipped) + geom_sf(data=pts, size=0.6) + geom_sf_text(data=centers, aes(label=val), size=5) + theme_minimal()Figure 10.2: Sampling points with multipolygon: a) not-clipped grid, b) clipped grid, c) values assigned to grid, d) sampling points according to values in grid
library(spatstat.geom) # for rsyst()
library(spatstat.random) # for rstrat()
region.voi.p<-st_transform(region.voi, 3857) # re-project to planar
region.voi.owin<-as.owin(st_as_sf(region.voi.p)) # owin class
# Fig.10.3a - stratified sampling, with background divided into tiles
r1<-rstrat(win=region.voi.owin, nx=10, ny=10, k=5)
tiles<-quadrats(region.voi.owin, nx=10, ny=10) # to see those tiles
par(mar=c(1,1,1,1)) # narrow margins – so bigger figure
plot(r1, main="Strafified sampling, k=5 obs. in the tile")
plot(tiles, add=TRUE, lty=3, col="grey90") # add tiles
# Fig.10.3b - generating points evenly distributed in space
r2<-rsyst(win=region.voi.owin, nx=10, ny=10)
plot(r2, main="Regular sampling") # regular points and regional contourFigure 10.3: Spatial sampling using spatstat:: a) stratified
sampling, b) regular sampling
selected<-matrix(sample(1:500, size=500*30, replace=TRUE), byrow=TRUE, nrow=500, ncol=30) # matrix of random IDs
selected[1:5, 1:5] # the result of the ID draw
freq<-as.data.frame(table(selected)) # frequency of draws of IDs
dim(freq) # all IDs were sampled
head(freq) # frequency of sampled IDs
sub<-points[1:500, ] # subsample with 500 obs.
sub$ID<-1:500 # add ID variable to the dataset
sub<-merge(sub, freq, by.x="ID", by.y="selected", all.x=TRUE)
# Fig.10.4a - under- and over-representation in sampling
sub$color<-"grey70" # as expected
sub$color[is.na(sub$Freq)==TRUE]="red" # never selected
sub$color[sub$Freq<25]="blue" # deeply under-selected
sub$color[sub$Freq>35]="green" # deeply over-selected
plot(sub[,c("crds.x", "crds.y")], bg=sub$color, pch=21, cex=2)
legend("bottomleft", pch=21, pt.bg=c("grey70", "red", "blue", "green"), c("as expected", "never selected", "under-selected", "over-selected"), bty="n", cex=0.8)
plot(st_geometry(region.voi), add=TRUE)
# data to analyse spatial distributions of under- and over-representation
sub2<-sub[sub$color=="blue" | sub$color=="green",]
sub2$marks<-0 # column of values 0
sub2$marks[sub2$color=="blue"]<-"case" # under-selected obs.
sub2$marks[sub2$color=="green"]<-"control" # over-selected obs.
sub2<-sub2[,c("crds.x", "crds.y", "marks")] # selected columns only
sub2
# convert classes: sf spherical to sf planar (Mercator)
sub2.sf<-st_as_sf(sub2, coords=c("crds.x", "crds.y"), crs=4326) # spherical
sub2.sf2<-st_transform(st_as_sf(sub2.sf), crs=st_crs(3857)) # planar
coords<-st_coordinates(sub2.sf2) # coordinates of points
# make ppp class objects
library(spatstat) # package for ppp class and relrisk()
region.merc<-st_transform(st_as_sf(region.voi), crs=st_crs(3857)) # sf
region.owin<-spatstat.geom::as.owin(sf::st_geometry(region.merc)) # owin
sub2.ppp<-ppp(x=coords[,1], y=coords[,2], window=region.owin, marks=factor(sub2.sf2$marks)) # ppp class
# Fig.10.4b - spatial relative risk (srr)
srr<-relrisk(sub2.ppp, se=TRUE) # from spatstat::
plot(srr$estimate, main="Spatial Relative Risk") # plot estimates of srrFigure 10.4: Results of multiple sampling: a) location of drawn observations, b) relative risk of under- and over-represented points
corr.raw<-cor(sub$locPdens, sub$locAggTOTAL) # original correlation
corr.raw
# matrix where IDs where replaced with the appropriate values of variable
sub.boot.v1<-matrix(sub$locPdens[selected], nrow=nrow(selected), ncol= ncol(selected)) # 500 rows x 30 columns
sub.boot.v2<-matrix(sub$locAggTOTAL[selected], nrow=nrow(selected), ncol= ncol(selected)) # 500 rows x 30 columns
# correlation coefficients for each iteration using sapply()
corr.boot<-sapply(seq_len(ncol(sub.boot.v1)), function(i) cor(sub.boot.v1[, i], sub.boot.v2[, i]))
corr.boot
summary(corr.boot)
ci<-quantile(corr.boot, c(0.025, 0.975)) # 95% bootstrap confidence interval
ci# prepare data
crds<-st_coordinates(st_centroid(st_make_valid(pov))) # coordinates
data23<-data[data$year==2023, c("ID_MAP", "unemployment_rate", "dist")]
data23$crds.x<-crds[,1] # add x coordinates to the data set
data23$crds.y<-crds[,2] # add y coordinates to the data set
data23$crds<-crds # add xy coordinates jointly (crds.X and crds.Y)
n<-dim(data23)[1] # number of observations (rows) - 380
b<-10 # average cluster length (#obs. in cluster)
k<-n/b # the number of clusters - 38
c1<-kmeans(crds, k) # k-means algorithm applied to coordinates
data23$clust<-c1$cluster # clustering vector added to dataset
names(data23) # coordinates were added separately and jointly
library(viridisLite) # allows for e.g. 38 colours
cols<-inferno(k) # colour palette, other option is e.g. plasma(k)
brks<-1:k # intervals
plot(st_union(voi)) # all regions merged to one, plot not shown
points(data23$crds, bg=cols[findInterval(data23$clust, brks)], pch=21, cex=2.5)
title(main="Clusters from the k-average algorithm")
# turn data into sf points (380 obs.)
df.sf<-st_as_sf(data23, coords=c("crds.x", "crds.y"), crs=4326)
# group observations by cluster and compute centroid of each cluster
# st_combine() puts all points geometries into one container
# st_union() truly merges geometries into single shape
library(dplyr) # for pipe operations and data processing
crds <- df.sf |> group_by(clust) |> # average values within clusters
summarise(unemp_rate=mean(unemployment_rate, na.rm=TRUE),
geometry=st_centroid(st_combine(geometry))) |> ungroup()
# Voronoi tessellation for the cluster centroids
region.p<-sf::st_transform(st_union(voi), crs=3857) # planar contour map
crds.p<-sf::st_transform(crds, crs=3857) # planar point data
crds.sfc<-st_geometry(crds.p) # class sfc from points (coordinates)
box.sfc<-st_geometry(region.p) # class sfc from region
crds.union<-st_union(crds.sfc) # union of points
tess<-st_voronoi(crds.union, box.sfc) # Voronoi tesselation
tess.clip<-st_intersection(st_cast(tess), st_union(box.sfc)) # sfc class
tess.clip.wgs<-st_transform(tess.clip, crs=4326) # spherical tesselation
tess.sf<-st_as_sf(tess.clip.wgs) # tessellation in sf class
tess.sf$unemp_rate<-crds$unemp_rate # add data to Voronoi tiles
# Fig.10.5a – using plot()
plot(st_geometry(tess.clip.wgs), lwd=2, lty=3)
points(data23[,c("crds.x", "crds.y")], bg=cols[findInterval(data23$clust, brks)], pch=21, cex=1.5)
# Fig.10.5b – using ggplot()
ggplot() + geom_sf(data=tess.sf, aes(fill=unemp_rate), alpha=0.6) +
geom_sf(data=df.sf, aes(col=factor(clust))) + coord_sf() + labs(fill="Unemployment rate", title="Voronoi tessellation based on cluster centroids") + theme_minimal()Figure 10.5: Clusters of points generated with k-means
plotted over tessellated surface: a) using plot(), b) using
ggplot()
b_emp<-aggregate(data23$unemployment_rate, by=list(data23$clust), length)
head(b_emp) # structure of clusters’ size
mmax<-max(b_emp$x) # Fig.10.6a, using hist()
hist(b_emp$x, breaks=1:mmax, ylim=c(0,12), labels=TRUE, col="orange")
abline(h=(1:6)*2, lty=3)
# Fig.10.6b using ggplot()
ggplot(b_emp, aes(x=x)) + geom_histogram(binwidth=1, fill="grey50", color="white") + labs(title="Size of k-means clusters") + theme_minimal()Fig.10.6: Histogram of cluster’s size: a) using hist(), b) using ggplot()
iter<-100 # number of iterations
result<-matrix(0, nrow=iter, ncol=4) # to write the results of analysis
colnames(result)<-c("unique clusters", "sample length", "unique observations", "correlation")
selected<-matrix(0, nrow=k, ncol=iter) # empty object to save sampled IDs
for(i in 1:iter){ # external loop for each iteration
ss<-sample(1:k, size=k, replace=TRUE) # samples 38 values from 1:38
selected[,i]<-ss # save IDs of randomly drawn clusters
result[i,1]<-length(unique(ss)) # save a number of unique clusters
boot1<-data23[1,] # empty object - the same structure as original dataset
# append from the bottom of the blocks of observation
for(j in 1:k){ # internal loop for each selected cluster
boot1<-rbind(boot1, data23[data23$clust==ss[j], ])
} # end of the internal loop
boot1<-boot1[-1,]
result[i,2]<-dim(boot1)[1] # length of bootstrapped sample
result[i,3]<-length(unique(boot1[,1])) # number of unique observations
result[i,4]<-cor(boot1[,2], boot1[,3]) # correlation between two variables
} # end of the external loop
head(result) # IDs of iterations as rownames
selected[1:6, 1:6]
summary(result[,1]/38) # the percentage of unique clusters
summary(result[,2]/380) # length of the sample relative to the full sample
summary(result[,3]/380) # the percentage of unique observations
# correlation coefficient
summary(result[,4]/cor(data23$unemployment_rate, data23$dist))
# Fig.10.7a - statistical distribution of correlation coefficient
plot(density(result[,4]), main="Distribution of the correlation coefficient")
abline(v=cor(data23$unemployment_rate, data23$dist), lty=3, col="coral", lwd=2)
# Fig.10.7b – statistical distribution of parameters of sampled datsets
par(mar=c(4,4,4,4))
plot(density(result[,1]/38), xlim=c(0,2), ylim=c(0,8), main="Structure of the bootstrapped sample") # unique clusters
lines(density(result[,2]/380), lwd=2) # the length of the trial
lines(density(result[,3]/380), lty=3) # unique observations
legend(0.99,8,c("unique clusters%", "sample length%", "unique observations%"), lty=c(1,1,3), lwd=c(1,2,1), cex=1, bty="n")Figure 10.7: Distributions of block bootstrap results: a)
correlation of two variables, b) structure of a bootstrapped
sample
iter<-100 # number of bootstrap iterations
n<-nrow(data23) # number of observations (380)
b<-10 # block length (expected points drawn per cluster)
k<-n/b # number of clusters (38)
data23$ID<-seq_len(n) # add the ID to the basic file, the same as 1:n
result<-matrix(0, nrow=iter, ncol=2) # empty object for results
colnames(result)<-c("unique_obs.", "correlation")
selected2<-matrix(0, nrow=n, ncol=iter) # history of sampling
for(i in 1:iter) { # first loop, over i iterations
c1<-kmeans(data23$crds, centers=k) # new clustering in each iteration
data23$clust<-c1$cluster # clustering vector
boot1<-data23[0, ] # truly empty data.frame with same columns
pos<-0 # position counter for filling selected2
for(m in 1:k) { # second loop, over m clusters
sub<-data23[data23$clust==m, ] # subset for a given cluster m
how.many.obs<-nrow(sub) # check the length of subset
ss<-sample.int(how.many.obs, size=b, replace=TRUE) # draw b rows
for(j in 1:b) { # third loop, append row-by-row
boot1<-rbind(boot1, sub[ss[j], ])
pos<-pos+1 # record sampling history (IDs), including duplicates
selected2[pos, i]<-sub$ID[ss[j]]
} # end of the third loop
} # end of the second loop
result[i, 1]<-length(unique(boot1$ID)) # number of unique observations
result[i, 2]<-cor(boot1$unemployment_rate, boot1$dist, use="complete.obs")
} # end of the first loop
# Fig.10.8a – statistical distribution of bootstrapped correlation coeff.
ggplot(as.data.frame(result), aes(x=result[,2])) + geom_density(fill= "lightblue") + geom_vline(xintercept=cor(data23$unemployment_rate, data23$dist), linetype = "dashed") + ggtitle("Distribution of the correlation coefficient")
# ratio: bootstrapped-to-empirical correlation coefficient
summary(result[,2]/cor(data23$unemployment_rate, data23$dist))
# Fig.10.8b
ggplot(as.data.frame(result), aes(x=result[,1]/380)) + geom_density(fill= "lightcoral", alpha=0.8) + ggtitle("Percentage of unique observations")
# ratio: unique bootstrapped IDs to all IDs of observations
summary(result[,1]/380) Figure 10.8: Results of stratified block bootstrap: a) density of correlation coefficient between variables and b) percentage of unique observations
# prepare data
crds<-st_coordinates(st_centroid(st_make_valid(pov))) # coordinates
data.mbb<-data.frame(ID_MAP=data$ID_MAP[data$year==2023], unemp23=data$unemployment_rate[data$year==2023], unemp22=data$unemployment_rate[data$year==2022])
data.mbb$crds.x<-crds[,1] # add x coordinates to the data set
data.mbb$crds.y<-crds[,2] # add y coordinates to the data set
data.mbb$crds<-crds # add xy coordinates once again - jointly
# determine the k nearest neighbors based on coordinates
library(spdep)
knn.set<-knearneigh(crds, k=9) # matrix of k=9 nearest neighbours, sp::
knn.set$nn # fragment of the knn matrix
# bootstrap iterations
lead<-sample(1:380, size=38, replace=FALSE) # drawing of main points
boot<-as.vector(knn.set$nn[lead,]) # vector from randomly drawn neighbours
boot2<-c(boot, lead) # 380 IDs of the main points and neighbours
boot2 # first several IDs of bootstrapped dataset
length(boot2) # total length of the drawn IDs
length(unique(boot2)) # number of unique IDs
# bootstrapped sample – original dataset by sampled IDs
boot3<-data.mbb[boot2,]
head(boot3)
cor(data.mbb$unemp22, data.mbb$unemp23) # correlation in a full sample
cor(boot3$unemp22, boot3$unemp23) # correlation in a bootstrap sample
vec<-(20:1)*5/100 # degree of coverage – 20 values, i in iterations
b=10 # block length, for Fig.10.9a b=20, Fig.10.9b b=10
knn.set<-knearneigh(crds, k=b-1) # matrix of k nearest neighbours
result<-matrix(0, nrow=30, ncol=20) # place for output
colnames(result)<-paste(rep("cov", times=20), vec) # columns for coverage
rownames(result)<-paste(rep("iter", times=30), 1:30) # rows for iterations
result[1:5, 1:5]
for(i in 1:20){ # loop for i coverage levels - iterations by columns
samp.size<-380*vec[i] # the total number of observations to be drawn
for(j in 1:30){ # loop for iterations for a given coverage – results in rows
lead<-sample(1:380, size=ceiling(samp.size/b), replace=TRUE) # local sample
boot<-as.vector(knn.set$nn[lead,]) # vector from randomly drawn neighbours
boot1<-c(boot, lead) # join IDs of lead points and their neighbours
data.boot<-data.mbb[boot1,] # bootstrapped data
result[j, i]<-cor(data.boot$unemp23, data.boot$unemp22) # correlation
}} # end of two loops, for coverage and iterations
result[1:6, 1:6] # iterations j in rows, coverage i in columns
mea<-as.vector(apply(result, 2, mean)) # mean by columns
sd<-as.vector(apply(result, 2, sd)) # sd by columns
for.figure<-as.data.frame(cbind(mea, sd)) # mean and sd in one object
colnames(for.figure)<-c("mea", "sd") # names of columns of object
for.figure$lower<-for.figure$mea-for.figure$sd # lower bound
for.figure$upper<-for.figure$mea+for.figure$sd # upper bound
for.figure$model<-rep("block10", times=20) #change to "block10" for Fig.10.9b
for.figure$interval<-(20:1)*5/100 # coverage, the same as vec object
head(for.figure)
cor.fullsample<-cor(data.mbb$unemp22, data.mbb$unemp23) # for a full sample
# Fig.10.9a
p<-ggplot(data=for.figure, aes(x=interval, y=mea, colour=model)) + geom_point() + geom_line() + geom_ribbon(aes(ymin=lower, ymax=upper), linetype=2, alpha=0.1) + xlab("Coverage of the sample") + ylab("Average correlation") + geom_hline(yintercept= cor.fullsample, linetype="dashed") + ggtitle("MBB for b=10")
plot(p)
# save the previous result as data1
for.figure1<-for.figure
# ///repeat the above analysis for block b=10 ///
# bind both datasets for block10 and block20
for.figure11<-rbind(for.figure, for.figure1)
# Fig.10.9b
p1<-ggplot(data=for.figure11, aes(x=interval, y=mea, colour=model)) + geom_point() + geom_line() + geom_ribbon(aes(ymin=lower, ymax=upper), linetype=2, alpha=0.1) + xlab("Coverage of the sample") + ylab("Average correlation") + geom_hline(yintercept=cor.fullsample, linetype="dashed") + ggtitle("MBB for b=10 and b=20")
plot(p1) Figure 10.9: Average correlation and fluctuation corridor (by std.dev) in the moving block bootstrap (MBB) model for: a) block length b = 10, b) blocks of length b=10 and b=20
# simple bootstrap – case sampling
vec<-(20:1)*5/100 # degree of coverage – 20 values, i in iterations
result2<-matrix(0, nrow=30, ncol=20) # place for output
colnames(result2)<-paste(rep("cov", times=20), vec) # columns for coverage
rownames(result2)<-paste(rep("iter", times=30), 1:30) # rows for iterations
for(i in 1:20){ # loop for i coverage levels - iterations by columns
samp.size<-380*vec[i] # the total number of observations to be drawn
# replace=TRUE for Fig.10.10a, replace=FALSE for Fig.10.10b
for(j in 1:30){ # loop for iterations for a given coverage – results in rows
lead<-sample(1:380, size=samp.size, replace=TRUE) # use also replace=FALSE
boot<-matrix(0, nrow=samp.size, ncol=2)
data.boot<-data.mbb[lead,] # bootstrapped data
result2[j, i]<-cor(data.boot$unemp23, data.boot$unemp22) # correlation
}} # end of two loops, for coverage and iterations
mea2<-as.vector(apply(result2, 2, mean)) # mean by columns
sd2<-as.vector(apply(result2, 2, sd)) # sd by columns
for.figure2<-as.data.frame(cbind(mea2, sd2)) # mean and sd in one object
colnames(for.figure2)<-c("mea", "sd") # names of columns of object
for.figure2$lower<-for.figure2$mea-for.figure2$sd # lower bound
for.figure2$upper<-for.figure2$mea+for.figure2$sd # upper bound
for.figure2$model<-rep("simple", times=20) # label for legend
for.figure2$interval<-(20:1)*5/100 # coverage, the same as vec object
for.figure3<-rbind(for.figure11, for.figure2) # combine MBB and CR results
# joint MMB and CR chart - change label to ‘without’ for Fig.10.10b
p<-ggplot(data=for.figure3, aes(x=interval, y=mea, colour=model)) + geom_point() + geom_line() + geom_ribbon(aes(ymin=lower, ymax=upper), linetype=2, alpha=0.1) + xlab ("sample coverage") + ylab ("average correlation") + ggtitle("drawing with replacement")
plot(p) # Fig.10a,bFigure 10.10: Comparison of movable block bootstrap and simple drawing typical for: a) bootstrap drawing with replacement and b) modification - drawing without replacement
library(spdep) # for spatial weights matrix
library(spatialreg) # for spatial regressions
# 1. Simulate strongly spatially autocorrelated SAR data
# create regular grid 20x20 = 400 units
nx<-20
ny<-20
coords<-expand.grid(x=1:nx, y=1:ny) # regular grid of points
n<-nrow(coords) # n=400 for 20x20 grid
sf_dat<-st_as_sf(coords, coords=c("x", "y"), crs=3857) # sf dataset
# knn neighbours in full set
knn<-knearneigh(st_coordinates(sf_dat), k=4) #list of knn=4 for each point
nb<-make.sym.nb(knn2nb(knn)) # symmetric neighbourhood matrix
listw_full<-nb2listw(nb, style="W", zero.policy=TRUE) # spatial weights
# strong spatial lag model: y=rho*W*y + X*b + eps
rho_true<-0.8 # strong spatial dependence
beta0<-2 # assumed intercept
beta1<-0.4 # relatively weak covariate effects for x1
beta2<-(-0.2) # negative relation for x2
sigma_eps<-1 # standard deviation of residual term
x1<-rnorm(n) # random values of X1
x2<-rnorm(n) # random values of X2
eps<-rnorm(n, sd=sigma_eps) # random values of error term
W<-listw2mat(listw_full) # matrix class of spatial weights matrix
I<-diag(n) # diagonal elements
A_inv<-solve(I-rho_true*W) # generate dependent variable y
y<-as.vector(A_inv %*% (beta0 + beta1 * x1 + beta2 * x2 + eps) )
sf_dat$y<-y # attach generated variables to locations
sf_dat$x1<-x1
sf_dat$x2<-x2
form_sar<-y~x1+x2 # equation to be estimated
sf_dat
library(viridis) # for nice colours
ggplot(sf_dat) + geom_sf(aes(size=y, colour=y)) + scale_size_continuous(range=c(1.5, 6)) + scale_colour_viridis(option="C", direction=-1) + labs(title="Spatial distribution of simulated SAR outcome (y)", size="Value (y)", colour="Value (y)") + theme_minimal(base_size=18) + theme(legend.position="right", panel.grid.major=element_line(colour = "grey90"), panel.grid.minor=element_blank(), plot.title= element_text(size=12, face="bold")) # Fig.10.11a – distribution of y val.
# 2. Cross-Validation (CV) function for SAR model based on given folds
# note that most input objects is created later
# at that stage they do not have to exist in memory
cv_sar<-function(sf_dat, form_sar, nb_full, fold_id) { # own function
K<-length(unique(fold_id)) # number of folds, mostly assumed by user
n<-nrow(sf_dat) # size of dataset
y_obs<-sf_dat$y # dependent variable
y_pred<-rep(NA_real_, n) # predicted values
X_full<-model.matrix(form_sar, data=sf_dat) # matrix with data for model
for(k in sort(unique(fold_id))) { # loop for each fold
cat(" Fold", k, "of", K, "\n") # text to display when waiting
test_idx<-which(fold_id==k) # IDs of obs. in given fold for testing
train_idx<-setdiff(seq_len(n), test_idx) # IDs of obs. for training
dat_train<-sf_dat[train_idx, ] # training dataset
dat_test<-sf_dat[test_idx, ] # testing dataset
is_train<-seq_len(n) %in% train_idx # vector of TRUE/FALSE to label IDs
nb_train<-subset.nb(nb_full, subset=is_train) # subset of neighbours
listw_train<-nb2listw(nb_train, style="W", zero.policy=TRUE) # W matrix
sar_k<-lagsarlm(form_sar, data=dat_train, listw=listw_train, method="LU", zero.policy=TRUE)
beta_hat<-coef(sar_k) # beta coefficients from estimated model
X_test<-model.matrix(form_sar, data=dat_test) # data for prediction
y_pred[test_idx]<-drop(X_test %*% beta_hat[colnames(X_full)]) # predicted y
} # end of the loop
valid_idx<-which(!is.na(y_pred)) # check which predicted y are not NA
rmse<-sqrt(mean((y_obs[valid_idx] - y_pred[valid_idx])^2)) # RMSE
mae<-mean(abs(y_obs[valid_idx] - y_pred[valid_idx])) # MAE
# construct and execute the function – outputs a table with results
fold_stats<-do.call(rbind, lapply(sort(unique(fold_id)), function(k) {
idx<-which(fold_id==k) # select proper obs. within fold
data.frame(fold=k, rmse=sqrt(mean((y_obs[idx] - y_pred[idx])^2)),
mae=mean(abs(y_obs[idx] - y_pred[idx])))}))
# list what is the output of the function
list(rmse=rmse, mae=mae, y_pred=y_pred, fold_stats=fold_stats)
} # end of the function
# 3. Random (sampled) vs spatial (k-means) folds
K<-5 # number of folds, 5 is very typical number
# random (non-spatial) folds – simple sampling of IDs
fold_id_random<-sample(rep(1:K, length.out=n))
# spatial folds – from k-means on xy coordinates
coords_mat<-st_coordinates(sf_dat) # coordinates from sf object
km<-kmeans(coords_mat, centers=K, nstart=25) # k-means clustering
fold_id_spatial<-km$cluster
# 4. Cross-validation CV for both schemes - uses own function written above
cat("\n=== RANDOM (NON-SPATIAL) CV ===\n") # text for waiting
res_random<-cv_sar(sf_dat, form_sar, nb_full=nb, fold_id=fold_id_random)
cat("\n=== SPATIAL (BLOCK) CV ===\n") # text for waiting
res_spatial<-cv_sar(sf_dat, form_sar, nb_full=nb, fold_id=fold_id_spatial)
data.frame(scheme=c("random", "spatial"), RMSE=c(res_random$rmse, res_spatial$rmse), MAE=c(res_random$mae, res_spatial$mae)) # output
# 5. Visualisation of fold-wise RMSE
fold_stats_random<-transform(res_random$fold_stats, scheme="random")
fold_stats_spatial<-transform(res_spatial$fold_stats, scheme="spatial")
fold_stats_all<-rbind(fold_stats_random, fold_stats_spatial)
fold_stats_all
# Fig.10.11b – simple no-colour version
ggplot(fold_stats_all, aes(x=scheme, y=rmse)) + geom_boxplot() + geom_jitter(width=0.1, alpha=0.7) + labs(title="Fold-wise RMSE: Random vs Spatial Cross-Validation (SAR)", x="CV scheme", y="Fold RMSE") + theme_minimal()
# Fig.10.11b – more developed colour-version
ggplot(fold_stats_all, aes(x=scheme, y=rmse)) + geom_boxplot(outlier.shape= NA, fill="grey95", color="grey40") + geom_jitter(aes(size=rmse, colour= rmse), width=0.15, alpha=0.8) + scale_size_continuous(range=c(3, 8)) + scale_colour_viridis_c(option="C", end=0.95) + labs(title="Fold-wise RMSE: Random vs Spatial Cross-Validation", subtitle="Point size and colour represent RMSE", x="CV scheme", y="Fold RMSE", colour="RMSE", size="RMSE") + theme_minimal(base_size=14) + theme(panel.grid.minor= element_blank(), plot.title=element_text(face="bold"), legend.position="right")Figure 10.11: Spatial and non-spatial cross-validation on generated data: a) spatial distribution of dependent variable; b) comparison of RMSE in both schemes
# spatial cross-validation in SEM model
library(spdep) # for spatial weights matrix
library(spatialreg) # for spatial regressions
# 1. Prepare data for the analysis
data23<-data[data$year==2023, ]
form_sem<-unemployment_rate ~ salary_avg_Poland_100p + dist + birth_per_1K
X_full<-model.matrix(form_sem, data=data23) # data for model
p<-ncol(X_full) # number of beta coefficients
n<-nrow(pov) # number of observations in the original dataset
# spatial weights matrix for spatial econometric model
coords<-st_coordinates(st_centroid(st_make_valid(pov)))
knn<-knearneigh(coords, k=5) # k=5 nearest neighbours
nb_full<-make.sym.nb(knn2nb(knn)) # symmetric neighbourhood
listw_full<-nb2listw(nb_full, style="W", zero.policy=TRUE) # W
# cluster the coordinates to obtain K folds
K<-5 # number of spatial folds
km<-kmeans(coords, centers=K, nstart=25) # kmeans clustering
fold_id<-km$cluster # an integer from 1 to K for each observation
table(fold_id) # check spatial fold sizes
# place for results - coefficients in each fold (betas + lambda)
coef_mat<-matrix(NA, nrow=K, ncol=p+1) # empty matrix
colnames(coef_mat)<-c(colnames(X_full), "lambda") # names of columns
y_obs<-data23$y # dependent variable
y_pred<-rep(NA_real_, n) # empty vector for predictions fold-by-fold
# 2. Estimate models on test data and predict on train data
for (k in 1:K) { # loop by folds
cat("Running fold", k, "of", K, "...\n") # text for waiting
test_idx<-which(fold_id==k) # IDs of obs. in given fold
train_idx<-setdiff(seq_len(n), test_idx) # IDs of obs. In other folds
dat_train<-data23[train_idx, ] # train data
dat_test<-data23[test_idx, ] # test data
# subset of neighbours: drop all neighbours that include test units
is_train<-seq_len(n) %in% train_idx # vector of TRUE/FALSE to label IDs
nb_train<-subset.nb(nb_full, subset=is_train) # limited neighbourhood
listw_train<-nb2listw(nb_train, style="W", zero.policy=TRUE) # W
sem_k<-errorsarlm(form_sem, data=dat_train, listw=listw_train, zero.policy=TRUE, method="LU") # fit spatial error model on training data
beta_hat<-coef(sem_k) # extract regression coefficients
lambda_hat<-sem_k$lambda # extract lambda spatial coefficient
# transfer the results into common coefficient matrix
coef_mat[k, colnames(X_full)]<-beta_hat[colnames(X_full)]
coef_mat[k, "lambda"]<-lambda_hat
# predict on test set using linear predictor X * beta
X_test<-model.matrix(form_sem, data=dat_test)
y_pred[test_idx]<-drop(X_test %*% beta_hat[colnames(X_full)])
} # end of the loop
# 3 Cross-validated performance
valid_idx<-which(!is.na(y_pred)) # check IDs with non-NA prediction
rmse_cv<-sqrt(mean((y_obs[valid_idx] - y_pred[valid_idx])^2)) # RMSE
mae_cv<-mean(abs(y_obs[valid_idx] - y_pred[valid_idx])) # MAE
cat("Spatial CV RMSE:", rmse_cv, "\n") # text to be displayed
cat("Spatial CV MAE :", mae_cv, "\n")
coef_mean<-colMeans(coef_mat, na.rm=TRUE) # average of coeffcients
coef_sd<-apply(coef_mat, 2, sd, na.rm=TRUE) # sd of coefficients
coef_summary<-data.frame(term=names(coef_mean), mean=coef_mean, sd=coef_sd)
coef_summary # set of coefficients for each fold
sem_full<-errorsarlm(form_sem, data=data23, listw=listw_full, zero.policy=TRUE, method="LU") # full-data model
coef_full<-c(coef(sem_full)[colnames(X_full)], sem_full$lambda)
comparison<-data.frame(term=names(coef_full), full=unname(coef_full), cv_mean=coef_mean[names(coef_full)], cv_sd=coef_sd[names(coef_full)])
comparison # comparison of CV and full model coefficients
# Fig.10.12 - line plot comparing model performance in folds
library(tidyr) # for pivot_longer()
library(dplyr) # for pipe notation %>%, group_by() and mutate()
library(viridis) # for scale_colour_viridis_d()
coef_df<-as.data.frame(coef_mat) # class data.frame
coef_df$fold<-factor(1:K) # IDs of folds
coef_long<-pivot_longer(coef_df, cols=colnames(coef_mat), names_to="term", values_to="estimate") # convert coef_mat object into tidy long format
# standardised (z-score) coefficients per term
coef_long<-coef_long %>% group_by(term) %>% mutate(estimate_z=(estimate - mean(estimate, na.rm=TRUE)) / sd(estimate, na.rm=TRUE))
# full-model estimates (standardised)
coef_full_df<-data.frame(term=names(coef_full), full=coef_full)
coef_full_df<-coef_full_df %>% group_by(term) %>% mutate(full_z =(full - mean(coef_long$estimate[coef_long$term == term])) / sd(coef_long$estimate[coef_long$term == term]))
coef_plot_df<-merge(coef_long, coef_full_df, by="term") # merged objects
ggplot(coef_plot_df, aes(x=fold, y=estimate_z, colour=term, group=term)) + geom_point(size=3, alpha=0.7) + geom_line(alpha=0.7) + geom_hline(aes(yintercept=full_z, colour=term), linetype="dashed", size=1)+ scale_colour_viridis_d(option="C") + labs(title="Standardised Coefficient Stability Across Spatial CV Folds", subtitle="Coefficients are z-scored within each variable; dashed line = full-sample estimate", x="Fold", y="Standardised coefficient (z-score)", colour="Term") + theme_minimal(base_size=14) + theme(legend.position="bottom", plot.title= element_text(face="bold", size=13)) # Fig.10.12Figure 10.12: Model coefficients in different folds compared
with full-sample model
# Fig.10.13a – map of folds used in cross-validation
pov$fold_id<-fold_id
ggplot(pov) + geom_sf(aes(fill=factor(fold_id)), colour="grey40", size=0.1) + scale_fill_viridis_d(option="C", name="Fold") + labs(title="Spatial Cross-Validation Folds (k=5)") + theme_minimal(base_size=14) + theme(legend.position="bottom", plot.title= element_text(face="bold", size=13))
# Fig.10.13b – map of residuals from the SEM model
pov$error<-y_obs - y_pred # errors from the models
ggplot(pov) + geom_sf(aes(fill=error), colour="grey50", size=0.1) + scale_fill_viridis(option="B", direction=-1, name="Prediction error\n(y - ŷ)") + labs(title="Spatial Distribution of Prediction Errors") + theme_minimal(base_size=14) + theme(legend.position="bottom", plot.title= element_text(face="bold", size=13))Figure 10.13: Maps of analysed phenomena: a) K=5 folds for
cross-validation, b) residuals from the model
All figures for a book were saved in a following way:
# In R console:
i<-"01_01a"
savePlot(filename=paste0("Fig_",i), type="jpg", device=dev.cur(), restoreConsole=TRUE)
savePlot(filename=paste0("Fig_",i), type="pdf", device=dev.cur(), restoreConsole=TRUE)
savePlot(filename=paste0("Fig_",i), type="tif", device=dev.cur(), restoreConsole=TRUE)
savePlot(filename=paste0("Fig_",i), type="emf", device=dev.cur(), restoreConsole=TRUE)
savePlot(filename=paste0("Fig_",i), type="eps", device=dev.cur(), restoreConsole=TRUE)# In R studio:
save_plot_5formats<-function(i, plot_fun, width=10, height=10, res=300) {fname<-paste0("figures/Fig_", i)
## JPG
jpeg(paste0(fname, ".jpg"), width=width*res, height=height*res, res=res)
plot_fun()
dev.off()
## PDF
pdf(paste0(fname, ".pdf"), width=width, height=height)
plot_fun()
dev.off()
## TIFF
tiff(paste0(fname, ".tif"), width=width*res, height=height*res, res=res, compression="lzw")
plot_fun()
dev.off()
## EMF (Windows)
if (.Platform$OS.type == "windows") {
win.metafile(paste0(fname, ".emf"), width=width, height=height)
plot_fun()
dev.off()
} else {
warning("EMF skipped: available only on Windows.")
}
## EPS
postscript(paste0(fname, ".eps"), width=width, height=height, horizontal=FALSE, onefile=FALSE, paper="special")
plot_fun()
dev.off()
}
i<-"01_01a"
save_plot_5formats(i)