Welcome to the RPubs for the book ‘Applied Spatial Statistics And Econometrics: Data Analysis in R’ (Routledge, 2nd edition, 2026) (ed.Katarzyna Kopczewska).
This part is for Chapters 5-7.
To download and read data, please see RPubs with Intro.
This site is a collection of all R codes included in this book. These are just the codes - no comments, no explanations what, why and how. All those details are in the book.
The book was written by Spatial Warsaw team based at the Faculty of Economic Sciences at University of Warsaw. Visit us on LinkedIn.
selector<-sample(1:37373, 10, replace=FALSE) # random 10 rows
points.10<-as.matrix(points[selector, c("crds.x", "crds.y")]) # subset
points.10 # coordinates only for random 10 obs. in class matrix
my.dist<-dist(points.10, diag=TRUE, upper=TRUE) # full matrix of distances
print(my.dist, digits=2) # print only two non-zero numbers
# objects with geo-coordinates of city of Lublin (WGS84)
crds.Lublin.vec<-c(22.4236877, 51.2180254) # vector
crds.Lublin.df<-data.frame(lon=22.4236877, lat=51.2180254) # data.frame
crds.Lublin.sf<-st_as_sf(crds.Lublin.df, coords=c("lon", "lat"), crs=4326)
# xy coordinates of points
points.sf<-st_as_sf(points, coords=c("crds.x", "crds.y"), crs=4326)
head(st_coordinates(points.sf))
# distances between all points and centre of Lublin
dist<-st_distance(points.sf, crds.Lublin.sf) # distances in metres (m)
dist<-dist/1000 # convert to kilometres (km), 1 km=1000 m
# colour palette for plotting with plot()
cols<-terrain.colors(8)
cols # display selected colours
# Fig.5.1a – a point map of the distance between the point and the centre
summary(dist) # statistics of distances, from 0 to 140 km
brks<-c(0, 2, 25, 50, 75, 100, 125, 150) # intervals for colours
plot(st_geometry(region.voi)) # regional contour map
plot(st_geometry(region.pov), add=TRUE) # extra contours
points(points[, c("crds.x", "crds.y")], col=cols[findInterval(dist, brks)], pch=21, bg=cols[findInterval(dist, brks)], cex=0.2) # points in colours
legend("bottomleft", legend=brks, pt.bg=cols, bty="n", pch=21) # legend
title(main="Distance from the centre") # title
points.sf$dist<-as.numeric(dist) # add dist variable to sf object
ggplot()+ geom_sf(data=points.sf, aes(col=dist)) + ggtitle("Distance from the centre of Lublin city") # Fig.5.1b
library(fields) # Fig.5.2b
bubblePlot(points[,c("crds.x", "crds.y")], points$min.dist)Figure 5.1: Visualisation of calculated distance: a) with
plot(), b) with ggplot()
library(fields) # for quilt.plot() - Fig.5.2a plot of gridded values
quilt.plot(points[,c("crds.x", "crds.y")], points$min.dist) # Fig.5.2aFigure 5.2: Visualisation of point dataset with values with fields:: package: a) quilt.plot() with gridded data; b) bubblePlot() for point data
library(sp) # for coordinates(), CRS(), proj4string(), zerodist()
points.sp<-points # point data in sp class
coordinates(points.sp)<-c("crds.x", "crds.y")
proj4string(points.sp)<-CRS("+proj=longlat +datum=WGS84")
duplicates<-zerodist(points.sp) # detect duplicated locations
# remove duplicated coordinates from the dataset
# option ‘zero’ sets the threshold of distance to be duplicates
removed.duplicates<-remove.duplicates(points.sp, zero=0, remove.second=TRUE)
dim(removed.duplicates) # size of dataset after removing duplicates
dim(points.sp) # size of original dataset
# jittering – shifting locations by epsilon
# add a small value to all coordinates to avoid spatial overlap
n<-dim(points)[1] # size of dataset, number of observations
x.epsilon<-rnorm(n, 0, 0.001) # small values from normal distribution
y.epsilon<-rnorm(n, 0, 0.001)
jitter<-data.frame(x=points[,"crds.x"], y=points[,"crds.y"], x.epsilon=x.epsilon, y.epsilon=y.epsilon, new.x=0, new.y=0) # new df
jitter$new.x<-jitter$x+jitter$x.epsilon # new locs, shifted by epsilon
jitter$new.y<-jitter$y+jitter$y.epsilon
head(jitter)
# new data in sp class
jitter.sp<-jitter # copy data.frame object
coordinates(jitter.sp)<-c("new.x", "new.y") # change class to sp
proj4string(jitter.sp)<-CRS("+proj=longlat +datum=WGS84") # re-project
removed.dupli.jitter<-remove.duplicates(jitter.sp, remove.second=TRUE)
dim(removed.dupli.jitter) # no duplicates detected, size of data is the same
# find geometrically identical points
the.same<-st_equals(points.sf) # find duplicates
the.same # display the first 10 elements
the.same2<-st_equals(points.sf, remove_self=TRUE) # duplicates without self
the.same2 # display the first 10 elements# k=10 nearest neighbours for point data
library(spdep)
points.knn<-knearneigh(points[ ,c("crds.x", "crds.y")], k=10) # spdep::
head(points.knn$nn) # IDs of k=10 neighbours for each observation
# convert class of object to obtain spatial lag – average in neighbourhood
points.nb<-knn2nb(points.knn) # convert from knn to nb class
points.listw<-nb2listw(points.nb) # convert from nb to listw class
points.lag<-lag.listw(points.listw, points$min.dist) # spatial lag
cor(points$min.dist, points.lag) # similarity of data in neigbourhoods
plot(points$min.dist, points.lag) # Fig.5.3a
# k=10 nearest neighbours for regions
crds<-st_coordinates(st_centroid(st_make_valid(pov)))# centres of regions
regions.knn<-knearneigh(crds, k=10) # knn with spdep::
regions.listw<-nb2listw(knn2nb(regions.knn)) # knn -> nb -> listw class
regions.lag<-lag.listw(regions.listw, data$ unemployment_rate [data$year==2023]) # spatial lag – average in surrouding regions
cor(data$unemployment_rate[data$year==2023], regions.lag) # correlation
plot(data$unemployment_rate[data$year==2023], regions.lag, pch=21, bg="black", xlim=c(0,25), ylim=c(0,15)) # Fig.5.3b
abline(0,1, lty=3) # line of 45 degreesFigure 5.3: Spatial lag for nearest neighbours: a) average distance to regional city in knn=10 points; b) average unemployment rate in knn=10 poviats
library(dbscan) # for kNN() and frNN()
points.knn<-kNN(as.matrix(points[ ,c("crds.x", "crds.y")]), k=10)
head(points.knn$id) # IDs of knn=10 nearest neighbours
round(head(points.knn$dist),5) # distance to knn=10 nearest neighbours
# fixed radius nearest neighbours – radius is ca. 5 km (0.05 degree)
points.frnn<-frNN(as.matrix(points[ ,c("crds.x", "crds.y")]), eps=0.05)
points.frnn$id[[3]] # IDs of neighbours of point no. 3
points.frnn$dist[[3]] # distance to nearest neighbours from point no.3
# count the number of neighbours of each point and report as a vector
# function lapply() is for list class
# function length() reports the number of elements for each element of list
# function unlist() outputs the results as a vector (not a list)
nn.count<-unlist(lapply(points.frnn$id, length))
nn.count[1:10] # show counts for the first ten points (out of 37373)
points$nn.count<-nn.count # Fig.5.4a
ggplot(points) + aes(crds.x, crds.y, color=nn.count) + geom_point(size=1.5) + scale_color_viridis_c(option="turbo") + theme_bw()
border.Lublin<-pov[pov$JPT_NAZWA_=="powiat Lublin", ]
points.Lublin<-st_join(points.sf, border.Lublin, join=st_intersects)
nn1.points.Lublin<-st_nearest_feature(points.Lublin[1:50,]) #nn
nn1.points.Lublin # IDs of nn from the same dataset
nn2.points.Lublin<-st_nearest_feature(points.Lublin[1:50,], points.Lublin[51:100,])
nn2.points.Lublin # IDs of nn from the different dataset
subsample<-points.Lublin[1:50,]
crds.org<-st_coordinates(subsample) # coordinates of origin
crds.dest<-st_coordinates(subsample[nn1.points.Lublin,]) # destination
plot(st_geometry(subsample), pch=21, bg="black") # Fig.5.4b
# loop for all points to draw lines,
for(i in 1:50){ # from c(X0,X1) to c(Y0,Y1)
lines(c(crds.org[i,1], crds.dest[i,1]), c(crds.org[i,2], crds.dest[i,2]))}
title(main="Nearest points linked")Figure 5.4: Nearest neighbours: a) a map of a number of nearest neighbours of each point b) nearest neighbours from st_nearest_feature() linked
library(nabor) # for knn() function
selector<-sample(1:37373, 10, replace=FALSE) # random 10 rows
points.10<-points[selector, c("crds.x", "crds.y")] # xy crds of subset
popul.around.firms<-knn(as.matrix(popul.points[,1:2]), as.matrix(points.10), k=7) # 7 nearest neighbours (popul) of firms
print(popul.around.firms, digits=2) # IDs and dist of nearest neighbours# check the belonginess of points to regions
# perform the spatial join and keep both geometries
# output object is a merge of both input objects
joined_points<-st_join(points.sf, region.pov)
names(joined_points) # structure of ouput, parts from region.pov in bold
# count points within each polygon
joined_points$ones<-1 # vector of values of 1 for all observations
how.many.pts.in.pov1<-aggregate(joined_points$ones, by=list(joined_points$JPT_NAZWA_), sum) # sum of ones in groups ‘by’
head(how.many.pts.in.pov1) # data.frame output with groups and counts
# merge the sf object with the counts of points
region.pov<-merge(region.pov, how.many.pts.in.pov1, by.x="JPT_NAZWA_", by.y="Group.1", sort=FALSE) # new variable added as ‘x’
# alternative coding for the same output – using pipe notation
how.many.pts.in.pov2<-joined_points %>% group_by(JPT_NAZWA_) %>% summarise(count=n())
how.many.pts.in.pov2 # class "sf", "tbl_df", "tbl", "data.frame"
# Fig.5.5a - plot the number of points within each region
ggplot(data=region.pov) + geom_sf(aes(fill=x)) + labs(title="Number of points within each region", x="Longitude", y="Latitude")
# calculate the area of each region and add it to the sf object
region.pov$area_m2<-st_area(region.pov) # in meters^2
region.pov$area_km2<-st_area(region.pov)/(1000*1000) # in kilometeres^2
# calculate the density of points – number of points per km2
region.pov$density<-region.pov$x/region.pov$area_km2
# calculate quintile breaks for points density
quintile_breaks<-quantile(region.pov$density, probs=seq(0, 1, by=0.2), na.rm=TRUE)
# categorize density into quintiles using custom breaks
region.pov$density_cat<-cut(region.pov$density, breaks=quintile_breaks, include.lowest=TRUE)
# Fig.5.5b - plot the data as the quintile-based categories
ggplot(data=region.pov) + geom_sf(aes(fill=density_cat)) + scale_fill_brewer(palette="Reds", name="Quintiles of density") + theme_minimal() + theme(legend.position="bottom")Figure 5.5: Number of points within the region: a) as nominal
values, b) as relative values (density=count/area)
# convert the sf objects from spherical to planar coordinates
region.p<-sf::st_transform(region.voi, crs=3857)
points.sf.p<-sf::st_transform(points.sf, crs=3857)
a<-sample(1:dim(points.sf.p)[1], size=500, replace=FALSE) # subsample
points.sf.p.sel<-points.sf.p[a,]
# tesselation in sf::
points.sfc<-st_geometry(points.sf.p.sel)
box.sfc<-st_geometry(region.p)
points.union<-st_union(points.sfc)
tess<-st_voronoi(points.union, box.sfc)
tess.clip<-st_intersection(st_cast(tess), st_union(box.sfc))
# Fig.5.6a - tessellation tiles and their centroids (original points)
plot(st_geometry(tess.clip), main="Voronoi tesselation", sub="Subsample of 500 observations")
plot(st_geometry(points.sfc), add=TRUE, bg="darkblue", pch=21, cex=0.7)
# calculate the areas of tiles and plot them divided into small and large
tess.sf<-st_as_sf(tess.clip) # convert from sfc to sf class
area<-st_area(tess.sf) # absolute area of tessellation tiles
mean.exp.area<-sum(area)/length(area) # mean expected area
size<-rep("small", times=length(area)) # vector of labels
size[area>mean.exp.area]<-"large"
tess.sf$size<-size # labels attached to Voronoi map
# Fig.5.6b - plot the Voronoi tiles coloured by size
ggplot() + geom_sf(tess.sf, mapping=aes(fill=size)) + labs(title="Size of Voronoi tiles: \n small for high density areas, large for low density areas") # \n for a text in a new line Figure 5.6: Voronoi tesselation: a) pure division with
centroids of tiles, b) relative size of tiles to detect points
density
# convert the sf objects from spherical to planar projection
region.p<-sf::st_transform(region.voi, crs=3857)
points.sf.p<-sf::st_transform(points.sf, crs=3857)
a<-sample(1:dim(points.sf.p)[1], size=500, replace=FALSE) # subsample
points.sf.p.sel<-points.sf.p[a,]
# convert the sf planar objects to owin class
library(spatstat)
region.owin<-as.owin(region.p$geometry)
crds<-st_coordinates(points.sf.p.sel) # coordinates of sf points
data.ppp<-ppp(x=crds[ ,"X"], y=crds[ ,"Y"], window=region.owin)
data.tes<-dirichlet(data.ppp) # Dirichlet tessellation
plot(data.tes, main="Tessellation for n=500 points") # Fig.5.7a
plot(data.ppp, add=TRUE, pch=".", col="darkblue", cex=2)
degAxis(1)
degAxis(2)
deldist<-delaunayDistance(data.ppp)
plot(delaunay(data.ppp), lty=3, main="Delaunay triangulation for n=500 points") # Fig.5.7b
del.net<-delaunayNetwork(data.ppp)Figure 5.7: Tessellation and triangulation for randomly selected 500 points using the spatstat:: package: a) tessellation with the dirichlet() command, b) triangulation with the delaunay() command
# selected shape from pov object – powiat Lublin (city of Lublin)
border.Lublin<-pov[pov$JPT_NAZWA_=="powiat Lublin", ]
# selected points – located within the analysed shape
# assign points to regions
points.Lublin<-st_join(points.sf, border.Lublin, join=st_intersects)
points.Lublin$ones<-1 # vector of values 1 for easier summing up
dim(points.Lublin) # wider dataset attached to points
unique(points.Lublin$JPT_NAZWA_) # regional identification
table(points.Lublin$JPT_NAZWA_) # number of points within a region
NAs<-which(is.na(points.Lublin$JPT_NAZWA_)==TRUE) # IDs of NA obs.
points.Lublin<-points.Lublin[-NAs, ] # eliminate NA obs.
dim(points.Lublin) # less obs. in the dataset
# inputs for rasters
xy<-st_coordinates(points.Lublin) # geo-coordinates of points
z<-points.Lublin$ones # vector of values ‘1’
bbox<-st_bbox(border.Lublin) # bounding box for a contour
library(raster) # creating a raster with raster:: package
r.raster<-raster(nrows=50, ncols=50, xmn=bbox[1], ymn=bbox[2], xmx=bbox[3], ymx=bbox[4]) # raster 50 x 50
r1.raster<-rasterize(xy, r.raster, field=z, fun=sum) # counts of points
class(r1.raster)
r1.raster
library(terra) # the same with newer package terra::
r.terra<-rast(nrows=50, ncols=50, xmin=bbox[1], ymin=bbox[2], xmax=bbox[3], ymax=bbox[4]) # raster 50 x 50
r1.terra<-rasterize(xy, r.terra, "z", fun=length) # counts of points
class(r1.terra)
r1.terra
# Fig.5.8a - plot of raster – cells and counts of points
# old version with raster:: or newer version with terra::
plot(r1.raster, main="Total number of points within the raster cells")
plot(r1.terra, main="Total number of points within the raster cells")
Fig.5.8b – plot of raster with visible grid
# sf::st_make_grid() makes a grid which replicates raster and add on plot
plot(st_make_grid(border.Lublin, n=c(50,50)), add=TRUE, border="grey85")
plot(st_geometry(border.Lublin), add=TRUE) # plot the contour or regionFigure 5.8: Rasterized data with raster:: package: a) count of points in cells within a selected region; b) raster with a visible grid layer and city contour
# create a raster on 10,000 grids with random values
r2<-raster(ncols=100, nrows=100, ymn=-50, ymx=50, xmn=-50, xmx=50)
ncell(r2) # check how many cells were created
r2[]<-round(runif(ncell(r2), 1,8), digits=0) # assign values to raster
plot(r2) # plot not shown
# vector of raster values read by rows from upper left to bottom right corner
r.val<-extract(r1.raster, 1:2500) # NA values are for empty cells
points.Lublin.A<-points.Lublin[points.Lublin$SEC_PKD7=="A",]
points.Lublin.G<-points.Lublin[points.Lublin$SEC_PKD7=="G",]
xy.A<-st_coordinates(points.Lublin.A) # geo-coordinates of points
z.A<-points.Lublin.A$ones # vector of values ‘1’
xy.G<-st_coordinates(points.Lublin.G) # geo-coordinates of points
z.G<-points.Lublin.G$ones # vector of values ‘1’
bbox<-st_bbox(border.Lublin) # bounding box for a contour
r<-raster(nrows=50, ncols=50, xmn=bbox[1], ymn=bbox[2], xmx=bbox[3], ymx=bbox[4]) # raster of size 50 x 50, with raster::
rA<-rasterize(xy.A, r, field=z.A, fun=sum) # counts of points of sector A
rG<-rasterize(xy.G, r, field=z.G, fun=sum) # counts of points of sector G
plot(rA, col=colorRampPalette(c("cornsilk2", "indianred1", "brown3"))(255), main="Spatial distribution firms from the sector A") # Fig.5.9a
plot(border.Lublin$geometry, add=TRUE)
plot(rG, col=colorRampPalette(c("cornsilk2", "indianred1", "brown3"))(255), main="Spatial distribution of firms from the sector G") # Fig.5.9b
plot(border.Lublin$geometry, add=TRUE)Figure 5.9: Rasterized distributions for two subsets of data:
a) firms from sector A, b) firms from sector G
# Pearson's correlation and its visualisation
cor.r<-corLocal(rA, rG, ngb=5, method="pearson", test=TRUE)
plot(cor.r$pearson, main="Perason correlation for local counts in cells")
plot(border.Lublin$geometry, add=TRUE) # Fig.5.10a
plot(cor.r$p.value, main="p-value of correlations between the number \nof firms from sectors G and A ") # p-value graph, not shown
plot(border.Lublin$geometry, add=TRUE)Figure 5.10: Similarity of point patterns: a) local Person
correlation, b) differences of counts from a reclassified
raster
# reclassify difference into categories
# -1 => rG<rA (rA is bigger than rG)
# 0 => rG=rA (rA is the same as rG)
# 1 => rG>rA (rA is lower than rG)
diff_raster<-(rG-rA) # difference between rasters
threshold<-1 # define the threshold, change based on the data scale
reclass_mat<-matrix(c(-Inf, -threshold, -1, -threshold, threshold, 0, threshold, Inf, 1), ncol=3, byrow=TRUE) # reclassification matrix
reclass_mat
out<-reclassify(diff_raster, reclass_mat)
# plot the categorical raster (difference map) – Fig.5.10b
cols<-c("indianred1", "cornsilk2", "palegreen1")
plot(st_geometry(border.Lublin), main="Pattern Differences")
plot(out, add=TRUE, col=cols, legend=FALSE, axes=FALSE)
legend('bottomright', title='Differences', legend=c('rA>rG, more rA', 'rA=rG, no difference', 'rA<rG, more rG'), fill=cols, bg='white', bty='n')
ct<-crosstab(rA, rG) # layer.1 is rA, layer.2 is rG
ct # table reports co-occurrence of counts in both layers
library(stars) # stars:: package for raster->stars conversion
r1.stars<-st_as_stars(r1.terra, crs=4326) # raster->stars conversion
r1.sf<-st_as_sf(r1.stars, crs=4326) # stars->sf conversion
names(r1.sf)
# Fig.5.11a – it uses slot ‘values’ from r1.sf as a colour layer
ggplot(data=r1.sf) + geom_sf(aes(fill=values)) + scale_fill_continuous(low="thistle2", high="darkred", guide="colorbar", na.value="white") + labs(title="Grid in sf:: with values, converted from raster:: and stars::") + geom_sf(data=border.Lublin, fill=NA, linewidth=1, color="black") + theme_minimal()Figure 5.11: Rasters as sf objects: a) converted via stars::;
b) after data order processing
grid<-st_make_grid(border.Lublin, n=c(50,50))
class(grid)
grid<-st_as_sf(grid)
class(grid)
value<-getValues(r1.raster) # vector of values from raster
# centroids from raster – goes down by rows from top left to bottom right
xy<-coordinates(raster(r1.terra))
# centroids from sf grid – go up by rows from bottom left to upper right
crds.r.sf<-st_coordinates(st_centroid(grid))
library(nabor) # k nearest neighbour search, for both sets of centroids
knn.grid<-nabor::knn(as.matrix(xy), crds.r.sf, k=1)
# dataset with raster values, their coordinates and order for grid
xyz<-data.frame(x=xy[,1], y=xy[,2], z=value, ID=knn.grid$nn.idx)
library(doBy)
xyz.sorted<-orderBy(~ID, xyz) # sort the object by ID
grid$value<-xyz.sorted$z # merge sorted raster values to grid
# Fig.5.11b - plot the raster in sf:: with resorted values
ggplot(data=grid) + geom_sf(aes(fill=value)) + scale_fill_continuous(low="thistle2", high="darkred", guide="colorbar", na.value="white") + labs(title="Grid in sf:: with values, created from raster:: output") + geom_sf(data=border.Lublin, fill=NA, linewidth=2, color="black") + theme_light()
region.voi.sp<-as(region.voi, "Spatial") # convert contour from sf to sp
class(region.voi.sp)
grid.lub<-makegrid(region.voi.sp, n=100) # generate grid – set of points
grid.lub # output – set of coordinates of points
plot(region.voi.sp) # plot of contour, Fig.5.12a
points(grid.lub, pch=".", cex=3.5) # plot of regular points
# read a grid with population data + change the projection
grid<-st_read("POP_lub.shp", stringsAsFactors=FALSE)
grid<-st_transform(grid, 4326)
dim(grid)
# limit the grid to area of city of Lublin
lim<-st_join(grid, border.Lublin, join=st_intersects)
unique(lim$JPT_NAZWA_)
a<-which(lim$JPT_NAZWA_=="powiat Lublin") # cut the grid
grid.Lublin<-grid[a,]
#st_write(grid.Lubin, "grid_Lublin.shp") # save grid as a shapefile
dim(grid.Lublin)
ggplot(grid.Lublin) + geom_sf(aes(fill=TOT), color=NA) + # Fig.5.12b geom_sf(data=border.Lublin, fill=NA, color="white", size=2) + scale_fill_viridis_c(name="Population", option="viridis") + theme_minimal() + ggtitle("Population grid for city of Lublin") Figure 5.12: Grid for a selected bounding box: a) region with regular points – centroids of grid, b) population grid limited by the contour
#grid.Lublin grid shapefile with population for city of Lublin
#border.Lublin contour shapefile for city of Lublin
#points.Lublin point data for city of Lublin
# create a new variable in a grid dataset
# there are two ifelse() functions to avoid problems with NA and 0 values
# output is set NA or 0 if input is NA or 0
grid.Lublin$share_15_64<-ifelse(is.na(grid.Lublin$TOT_15_64), NA, ifelse( grid.Lublin$TOT==0, 0, grid.Lublin$TOT_15_64 / grid.Lublin$TOT))
# for each point - add the value of grid to point that belongs to that grid
points.Lublin<-st_join(points.Lublin, grid.Lublin["share_15_64"])
library(dplyr) # for pipe operations
library(h3r) # for H3 hexagons
library(h3jsr) # for polygon_to_cells()
st_crs(region.voi) # check if crs is 4326 (WGS 84)
st_is_valid(region.voi) |> table() # check if all contours are valid
region<-st_transform(st_make_valid(region.voi), 4326) # if the above fails
region_poly<-st_cast(st_union(region), "MULTIPOLYGON") # a single region
res<-5 # choose resolution, here 5 or 6
cells<- polygon_to_cells(region_poly, res=res) # H3 hexagons
head(cells) # part of output, in list class
hex_sf<-cell_to_polygon(cells) |> st_as_sf() # convert to sf
plot(st_geometry(hex_sf), main="H3 hexagon cell–resolution 6")
plot(st_geometry(region), add=TRUE)
st_write(hex_sf, "H3res5GRID.shp", delete_layer=TRUE) # save as shapefileFigure 5.13: H3 hexagons for regional contour: a) resolution
5, b) resolution 6
# create buffers (circular polygons), distance in metres (1000 m = 1 km)
points.buffer.1km<-st_buffer(points.Lublin, dist=1000) # output in sf class
# store the object of buffers for further calculations and read it
saveRDS(points.buffer.1km, file="points_buffer_1km.rds") # save
points.buffer.1km<-readRDS("points_buffer_1km.rds")) # load
# calculate a list of obs. that are within buffers
# option sparse=TRUE returns avoids outputting a dense logical matrix
points.within.1km<-st_intersects(points.buffer.1km, points.Lublin, sparse=TRUE)
points.within.1km
class(points.within.1km)
# store the list object for further calculations and read it
save(points.within.1km, file="points_within_1km.RData")# save the object
load("points_within_1km.RData") # load the object
# simple count of observations fulfilling some condition
# here: count the number of firms around from given sector, self-excluded
i=1 # example for the first row
sect="M" # which sector should be counted
idx<-points.within.1km[[i]] # IDs of points on the list
idx<-idx[idx != i] # exclude obs. itself
# number of observations double-conditioned (by sector and by IDs)
# sum counts the number of rows that fulfil the condition for sector
count<-sum(points.Lublin$SEC_PKD7[idx]==sect)
# create a function that makes a new column in a dataset
# and executes the above code for all the observations in a dataset
# function() is quicker than a for() loop
sect="M" # which sector should be counted
col_name<-paste0("firms_around_", sect) # firms_around_M
points.Lublin[[col_name]]<-sapply(1:nrow(points.Lublin), function(i) {
idx<-points.within.1km[[i]] # IDs of points on the list
idx<-idx[idx != i] # exclude obs. itself
count<-sum(points.Lublin$SEC_PKD7[idx]==sect)
return(count) })
# store the results
points.Lublin # display the last columns in the dataset
new_var<-st_drop_geometry(points.Lublin[ ,col_name]) # data.frame variable
write.csv(new_var, paste0(col_name, ".csv")), row.names=FALSE) # csv file
# build a loop for each sector
sectors<-sort(unique(points.Lublin$SEC_PKD7)) # elements to iterate
for(sect in sectors){ # for() loop, runs for each sector
col_name<-paste0("firms_around_", sect) # builds the name of the variable
points.Lublin[[col_name]]<-sapply(1:nrow(points.Lublin), function(i) {
idx<-points.within.1km[[i]] # IDs of points on the list
idx<-idx[idx != i] # exclude obs. itself
count<-sum(points.Lublin$SEC_PKD7[idx]==sect) # counts obs.
return(count) }) # store the result, end of the function
} # end of the loop
# Fig.5.14a - visualisation of buffers
plot(st_geometry(border.Lublin))
plot(st_geometry(points.buffer.1km[1:10,]), add=TRUE) # only 10 rings
plot(st_geometry(points.Lublin[1:10, ]), cex=2, pch=21, bg="red", add=TRUE)
plot(st_geometry(points.Lublin), cex=0.3, pch=21, bg="black", add=TRUE)
title(main="Selected buffers of 1 km")
degAxis(1); degAxis(2); add_grid() # from terra::
# boxplots for the relationship between two different values in buffers
# for total number of firms and total population, all in 500 m radius
par(mar=c(5,5,5,5)) # Fig.5.14b
boxplot(points$locAggTOTAL~cut(points$locPdens, breaks=0:16*25), xlab="Population density *100", ylab="Agglomeration in radius of 500 m", main="Selected firms in NUTS2 Lublin region", col="cornflowerblue")
abline(h=0:6*2000, lty=3, col="grey70")Figure 5.14: Visualisation of buffers: a) as circles on a map; b) as boxplots for relationship between two variables counted as buffers
# objects created before:
# points.buffer.1km<-st_buffer(points.Lublin, dist=1000) # buffers
# points.within.1km<-st_intersects(points.buffer.1km, points.Lublin, sparse=TRUE) # list of IDs within each buffer
# count neighbors around
points.Lublin$agglom_around<-sapply(points.within.1km, function(idx) {
length(points.within.1km[idx])})
# compute average profitability (roa) in firms in the neighbourhood
# the code adds a new variable
points.Lublin$roa_around<-sapply(points.within.1km, function(idx) {
if(length(idx)==0) return(NA)
mean(points.Lublin$roa[idx], na.rm=TRUE)})
plot(points.Lublin["agglom_around"], pch=20, cex=0.8, main="Agglomeration around - number of points in a radius of 1 km") # Fig.5.15a
degAxis(1); degAxis(2)Figure 5.15: Visualisation of point pattern: a) the number of
neighbours by location; b) SPAG from spatialWarsaw::
# buffers for selected points, with radius by other variable
sel.points<-points.Lublin[25:100,] # selected points
buffers.by.size<-st_buffer(sel.points, dist=sel.points$empl_av_size*100)
plot(st_geometry(border.Lublin), lwd=2) # Fig.5.16a
plot(st_geometry(buffers.by.size), add=TRUE, border="red", col=adjustcolor("red", alpha.f=0.2))
points(st_coordinates(sel.points), pch=20, col="blue")
# union of circles (buffers)
buffers_union<-st_union(buffers.by.size)
plot(st_geometry(border.Lublin), lwd=2) # Fig.5.16b
plot(st_geometry(buffers_union), add=TRUE, col=adjustcolor("grey", alpha.f= 0.3), border="blue")
sum(st_area(st_make_valid(buffers_union)))/sum(st_area(buffers.by.size))Figure 5.16: Visualisation of buffer zones: a) individual
circles, b) circles integrated as union
library(flexurba)
options(timeout=500)
# download the three grid layers for the whole globe
download_GHSLdata(output_directory="data/global", filenames=c("built.tif", "pop.tif", "land.tif"))
# transform the region boundary to Mollweide projection
region.voi.moll<-region.voi %>% sf::st_transform("ESRI:54009")
# crop the global grids to the extent of the study area
crop_GHSLdata(extent = terra::ext(region.voi.moll),
global_directory="data/global",
global_filenames=c("built.tif", "pop.tif", "land.tif"),
output_directory="data/lubelskie_region",
output_filenames=c("built_lub.tif", "pop_lub.tif", "land_lub.tif"))
data.lub<-DoU_preprocess_grid(directory="data/lubelskie_region", c("built_lub.tif","pop_lub.tif", "land_lub.tif"))
str(data.lub) # looking up the structure of the list
data.lub$built # SpatRaster object with information for built-up areas
data.lub$land # SpatRaster object with information for land coverage
data.lub$pop # SpatRaster object with information for population
# prepare the level 1 DEGURBA classification
degurba.level.1.voi<-DoU_classify_grid(data=data.lub)
degurba.level.1.voi
# Fig.5.16 - plot the result of the DEGURBA level 1 classification
DoU_plot_grid(degurba.level.1.voi) # Fig.5.17a - plot of grids, no contours
DoU_plot_grid(degurba.level.1.voi, scalebar=TRUE, title="DEGURBA L1 for Lubelskie") + geom_sf(data=region.voi.moll, fill=NA, linewidth=0.2, color="black") # Fig.5.17b – plot of grids with region contourFigure 5.17: Plots of DEGURBA classification at Level 1: a) classified grids only, b) classified grids with a contour of the region, scalebar and title
# prepare the level 2 DEGURBA classification
degurba.level.2.voi<-DoU_classify_grid(data=data.lub, level1=FALSE)
degurba.level.2.voi
# Fig.5.18 - plot the result of the DEGURBA level 2 classification
DoU_plot_grid(degurba.level.2.voi, scalebar=TRUE, title="DEGURBA L2 for Lubelskie", level1=FALSE) + geom_sf(data=region.voi.moll, fill=NA, linewidth=0.3, color="black")Figure 5.18: Plot of DEGURBA classification Level 2
# select the city boundary from the pov layer (powiat Lublin)
border.Lublin<-pov[pov$JPT_NAZWA_=="powiat Lublin", ]
# transform to Mollweide projection to match the raster grids
border.Lublin.moll <- border.Lublin %>% sf::st_transform("ESRI:54009")
# Fig.5.19a - plot Level 1 DEGURBA classification for the selected extent
DoU_plot_grid(degurba.level.1.voi, scalebar=TRUE, title="DEGURBA L1", extent=terra::ext(border.Lublin.moll)) + geom_sf(data=border.Lublin.moll, fill=NA, linewidth=0.2, color="black")
# Fig.5.19b - plot Level 2 DEGURBA classification for the selected extent
DoU_plot_grid(degurba.level.2.voi, scalebar=TRUE, title="DEGURBA L2", level1=FALSE, extent=terra::ext(border.Lublin.moll)) + geom_sf(data= border.Lublin.moll, fill=NA, linewidth=0.2, color="black")Figure 5.19: DEGURBA classification for a Lublin city: a)
Level 1, b) Level 2
# default parameters for Level 1 classification
DoU_get_grid_parameters(level1=TRUE)
# modify some of the thresholds for the level 1 classification
degurba.level.1.voi.adapt<-DoU_classify_grid(data=data.lub, level1=TRUE,
parameters= list(UC_density_threshold = 1650, # default = 1500
UC_size_threshold = 60000, # default = 50000
UC_contiguity_rule = 8, # default = 4 (rook)
UC_built_criterium = FALSE, # default = TRUE
UCL_density_threshold = 200, # default = 300
UCL_size_threshold = 4000)) # default = 5000
# create binary reclassification mask by comparing the two rasters
# 0=unchanged, 1=reclassified
degurba.level.1.voi.diff<-terra::ifel(degurba.level.1.voi.adapt != degurba.level.1.voi, 1, 0)
# convert the binary raster to a categorical (factor) raster
degurba.level.1.voi.diff.fac<-terra::as.factor(degurba.level.1.voi.diff)
# assign descriptive labels to the factor levels
levels(degurba.level.1.voi.diff.fac)<-data.frame(ID=c(0,1), changed=c("Unchanged", "Reclassified"))
# Fig.5.20a - Level 1 classification with adapted parameter settings
DoU_plot_grid(degurba.level.1.voi.adapt) + geom_sf(data=region.voi.moll, fill=NA, linewidth=0.2, color="black")
# Fig.5.20b - Locations of reclassified grid cells (default vs. adapted)
ggplot() + tidyterra::geom_spatraster(data=degurba.level.1.voi.diff.fac) +
scale_fill_manual(values=c("Unchanged"="grey95", "Reclassified"= "grey30"), name=NULL) + geom_sf(data=region.voi.moll, fill=NA, linewidth=0.25, color="black") + theme_void() + theme(legend.position="bottom", legend.direction="horizontal")Figure 5.20: DEGURBA Level 1 classification for Lubelskie region a) adapter parameter settings, b) locations of reclassified grid cells (default vs. adapted parameters)
degurba.L1.pols<-terra::as.polygons(degurba.level.1.voi, dissolve=FALSE, na.rm=TRUE) # raster->polygons: one polygon per grid cell
degurba.L1.pols
# convert to sf (one row per grid cell)
degurba.L1.sf<-sf::st_as_sf(degurba.L1.pols) %>% dplyr::rename(degurba.L1=layer) %>% dplyr::mutate(degurba.L1=factor(degurba.L1, levels=c(0, 1, 2, 3),
labels=c("water cell", "rural cell", "urban cluster", "urban centre")))
degurba.L1.sf # only two rows displayed
#Fig.5.21
ggplot() + geom_sf(data=degurba.L1.sf, aes(fill=degurba.L1), color=NA)+ geom_sf(data=region.voi.moll, fill=NA, linewidth=0.3) + scale_fill_manual(values=c("water cell"="#7AB5F5", "rural cell"="#73B273", "urban cluster"="#FFAA00", "urban centre"="#FF0000")) + theme_minimal() + labs(fill="DEGURBA Level 1")Figure 5.21: DEGURBA Level 1 classification visualized with ggplot()
degurba.L1.sf.lub<-sf::st_filter(degurba.L1.sf, region.voi.moll, .predicate=st_intersects) # keep grid cells within the contour
degurba.L1.sf.lub # only two rows displayed
table(degurba.L1.sf.lub$degurba.L1) # absolute number of grid cells
shares<-table(degurba.L1.sf.lub$degurba.L1)/dim(degurba.L1.sf.lub)[1]*100
round(shares, 2) # round the share values of grid cells
# Fig.5.22a - visualisation of the filtered grids in Mollweide projection
ggplot() + geom_sf(data=degurba.L1.sf.lub, aes(fill=degurba.L1), color=NA) + geom_sf(data=region.voi.moll, fill=NA, linewidth=0.3) + scale_fill_manual(values=c("water cell"= "#7AB5F5", "rural cell"= "#73B273", "urban cluster"= "#FFAA00", "urban centre"= "#FF0000")) + theme_minimal() + theme(legend.position="none")
# changing the projection to WGS84
degurba.L1.sf.lub.4326<-sf::st_transform(degurba.L1.sf.lub, 4326)
# Fig.5.22b - visualisation of the filtered grids in WGS84 projection
ggplot() + geom_sf(data=degurba.L1.sf.lub.4326, aes(fill=degurba.L1), color=NA) + geom_sf(data=region.voi.moll, fill=NA, linewidth=0.3) + scale_fill_manual(values=c("water cell"= "#7AB5F5", "rural cell"= "#73B273", "urban cluster"= "#FFAA00", "urban centre"= "#FF0000")) + theme_minimal() + labs(fill="DEGURBA Level 1")Figure 5.22: DEGURBA Level 1 classification for the exact shape of the region a) in Mollweide projection (ESRI:54009), b) WGS84 projection (EPSG:4326)
# dataset with point data (selected variables)
points.sf[, c("ID", "SEC_agg", "poviat")] # only 3 rows displayed
degurba.L1.sf.lub.4326 # grids with DEGURBA Level 1 classification
# CRS check: both datasets must use the same coordinate reference system
st_crs(points.sf) == st_crs(degurba.L1.sf.lub.4326)
# If FALSE, transform one dataset to match the other, e.g.:
# points.sf<-st_transform(points.sf, st_crs(degurba.L1.sf.lub.4326))
# perform spatial join where each firm (point) inherits the DEGURBA class
# of the grid cell (polygon) in which it is located
points.sf.degurba<-sf::st_join(points.sf, degurba.L1.sf.lub.4326, join=st_within) # point-in-polygon relations
points.sf.degurba[, c("ID", "SEC_agg", "poviat", "degurba.L1")]
table(points.sf.degurba$degurba.L1) # number of firms by DEGURBA classes# transform the poviats boundaries to Mollweide projection
region.pov.moll<-region.pov %>% sf::st_transform("ESRI:54009")
data.poviats.L1<-DoU_preprocess_units( # pre-process the units data
units=region.pov.moll, # spatial units – vector layer
classification=degurba.level.1.voi, #grid classification – raster layer
pop=data.lub$pop) # population in grids – raster layer
region.pov.moll$JPT_NAZWA_# IDs – names of poviats
# classification of units with path for saving the results
poviats.class.L1<-DoU_classify_units(data.poviats.L1, id="JPT_NAZWA_", filename="data/lubelskie_region/poviats_L1.csv")
head(poviats.class.L1) # first 3 lines shown only
# Fig.5.23a - visualisation of the resulting classification of units
DoU_plot_units(region.pov.moll, classification=poviats.class.L1)
# Fig.5.23b - DEGURBA L1 classification on grids with the poviats shape
DoU_plot_grid(degurba.level.1.voi) + geom_sf(data=region.pov.moll, fill=NA, linewidth=0.2, color="black")Figure 5.23: Plot of DEGURBA Level 1 classification: a) for
poviats, b) for grids with overlay of poviats’ contours
Figure 5.24: DEGURBA Level 1 classification for poviats in
WGS84 projection
data.poviats.L2<-DoU_preprocess_units(units=region.pov.moll, classification =degurba.level.2.voi, pop=data.lub$pop) # data pre-processing
poviats.class.L2<-DoU_classify_units(data.poviats.L2, id="JPT_NAZWA_", level1=FALSE, filename="poviats_L2.csv") # classification of units
head(poviats.class.L2) # only 3 rows displayed
DoU_plot_units(region.pov.moll, classification=poviats.class.L2, level1= FALSE) # Fig.5.25a - visualisation of the resulting classification of units
# Fig.5.25b - DEGURBA L2 classification on grids with the poviats shape
DoU_plot_grid(degurba.level.2.voi, level1=FALSE) + geom_sf(data= region.pov.moll, fill=NA, linewidth=0.4, color="black")Figure 5.25: DEGURBA Level 2 classification a) for poviats,
b) for grids with overlay of poviats’ shape
library(spdep) # spdep:: is for spatial weights matrices
# matrix of neighbourhood according to the contiguity criterion
cont.nb<-poly2nb(st_make_valid(pov), queen=TRUE) # class nb
# spatial weights matrix, row-standardised to one
cont.listw<-nb2listw(cont.nb, style="W") # class listw
cont.listw # displays a summary of the weight matrix
centroids<-st_centroid(st_geometry(st_make_valid(pov))) # centroids of polygons
crds.pov<-st_coordinates(centroids) # extract coordinates of centroids
plot(pov$geometry) # Fig.6.2a - plot contour map
plot(cont.nb, crds.pov, add=TRUE) # plot contiguity neighbourhood linksFigure 6.2: Illustration of the neighbourhood matrix
according to the contiguity criterion: a) for poviats, b) for
voivodships
cont.mat<-nb2mat(cont.nb) # conversion to matrix class
cont.mat[1:5, 1:5]
names_voi<-as.character(voi$JPT_NAZWA_) # region names vector
names_voi
cont.voi.nb<-poly2nb(voi, row.names=names_voi) # nb class
cont.voi.mat<-nb2mat(cont.voi.nb) # convert to matrix class
cont.voi.mat[1:4, 1:4] # display weights for regions
# create an object with names of regions and coordinates of centroids
crds.voi<-st_coordinates(st_centroid(st_make_valid(voi)))
labels<-data.frame(X=crds.voi[,1], Y=crds.voi[,2], Name=as.character(voi$ JPT_NAZWA_)) # it uses information from the shapefile map
labels
# convert spatial links to lines – for plot in ggplot()
nb_lines<-nb2lines(cont.voi.nb, coords=crds.voi, as_sf=TRUE)
st_crs(nb_lines)<-st_crs(voi) # make common projections of map and lines
# Fig.6.2b
library(ggrepel) # for labels in boxes
ggplot() + geom_sf(data=voi, fill="cornsilk", color="grey70") +
geom_sf(data=nb_lines, color="red", linewidth=0.3, alpha=0.8) +
geom_sf(data=st_centroid(voi), color="black", size=1) +
theme_minimal() +
theme(axis.title.x=element_blank(), #remove x axis title
axis.title.y=element_blank()) + #remove y axis title
geom_label_repel(aes(x=X, y=Y, label=Name), data=labels,
size=2, box.padding=0.2, point.padding=0.5, segment.color='grey50') +
labs(title="Contiguity-based Spatial Weights",
subtitle="Lines connect neighbouring polygons",
caption="Area: Voivodships in Poland")
names_voi[cont.voi.nb[[which(names_voi=="lubuskie")]]]# centroids of NTS 2 voivodeships
crds.voi<-st_coordinates(st_centroid(voi)) # regular notation
crds.voi<-st_centroid(voi) |> st_coordinates() # native pipe notation
library(magrittr)
crds.voi<-st_centroid(voi) %>% st_coordinates() # magrittr:: pipe notation
head(crds.voi)
crds.pov<-st_centroid(st_make_valid(pov)) # sf object
crds.pov.sfc<-st_centroid(st_geometry(st_make_valid(pov))) # "sfc_POINT"
# matrix k=5 nearest neighbours for areal data
voi.knn<-knearneigh(crds.voi, k=5) # create a knn object
voi.knn.nb<-knn2nb(voi.knn) # convert to a nb class
#Fig.6.3a – connections of k nearest neighbours
plot(st_geometry(voi), main="K nearest neighbours, k=5")# regional contour
plot(voi.knn.nb, crds.voi, add=TRUE) # relationship layer
print(is.symmetric.nb(voi.knn.nb)) # check the symmetry of the matrix
voi.knn.sym.nb<-make.sym.nb(voi.knn.nb) # make the matrix symmetric
print(is.symmetric.nb(voi.knn.sym.nb))
# create a listw class object from symmetric knn object
voi.knn.sym.listw<-nb2listw(voi.knn.sym.nb)
summary(voi.knn.sym.listw)Figure 6.3: K nearest neighbours for NUTS2 regions: a)
visualisation of links, b) statistics of symmetric matrix
library(dplyr) # for slice_sample()
points.sub<-slice_sample(points, n=100, replace=FALSE) # using dplyr::
dim(points.sub)
# matrix k=5 nearest neighbours for point data - knn class object
points.knn<-knearneigh(as.matrix(points.sub[, c("crds.x", "crds.y")]), k=5)
points.knn.nb<-knn2nb(points.knn)
points.knn.nb
# Fig.6.4 - plot of knn: region + points + links
plot(region.voi$geometry) # plot a single region
plot(points.knn.nb, as.matrix(points.sub[,c("crds.x", "crds.y")]), add=TRUE)
print(is.symmetric.nb(points.knn.nb))
points.knn.sym.nb<-make.sym.nb(points.knn.nb)
print(is.symmetric.nb(points.knn.sym.nb))
# create a listw class object
points.knn.sym.listw<-nb2listw(points.knn.sym.nb)
summary(points.knn.sym.listw)Figure 6.4: K nearest neighbours for points: a) visualisation
of links, b) statistics of symmetric matrix
install.packages("devtools") # install spatialWarsaw:: from GitHub
devtools::install_github("poktam/spatialWarsaw")
library(spatialWarsaw)
selector<-sample(1:dim(points)[1], 2000, replace=FALSE) # 1000 points
points.sub.sf<-points.sf[selector, ] # subset
csl<-corrSpatialLags(points.sub.sf, "roa", 2000, knn=1:15)
print(csl, digits=2) # empirical and theoretical correlationsFigure 6.5: Correlations between spatial lags for different number of k nearest neighbours: a) empirical correlation, b) theoretical correlation
eq<-roa ~ empl_av_size + dummy_prod + dummy_constr + dummy_serv + dist.lublin
best.W<-bestW(points.sub.sf, eq, model_type="SDM", 1000, knn=c(2, 5, 10, 15, 20)) # from spatialWarsaw::
best.W Figure 6.6: Relationships in econometric models due to
different k nearest neighbours (knn) included in the spatial weights
matrix W: a) AIC (Akaike Information Criterion) by knn, b) spatial
parameter rho (σ) in models with different knn
# spatial weights matrix of neighbours in a radius of d=30 km
pov<-st_transform(pov, 4326) # projections in WGS84
crds.pov<-st_centroid(st_make_valid(pov)) # sf object
# matrix class as input, distance in degrees
conti.d.30<-dnearneigh(st_coordinates(crds.pov), 0, 0.3, longlat=FALSE)# in°
# sf as input, distance in km
conti.d.30<-dnearneigh(crds.pov, d1=0, d2=30) # in km
# Fig.6.7a - a matrix of neighbours within a radius of d km
par(mar=c(1,1,1,1))
plot(st_geometry(pov)) # background contour map
plot(conti.d.30, st_coordinates(crds.pov), add=TRUE) # neighbourhood linksFigure 6.7: Spatial relationships in a weight matrix according to the distance criterion (in a radius d = 30 km): a) links between spatial units, b) in dark - regions without a link
conti.d.30.m<-nb2mat(conti.d.30, zero.policy=TRUE) # convert nb to matrix
a<-colMeans(t(conti.d.30.m)) # averages in rows
pov$a<-a # append a value vector to map
# Fig.6.7b
ggplot(data=pov) + geom_sf(aes(fill=a)) + scale_fill_viridis_c(option ="viridis") + labs(fill="Attribute 'a'") + theme_minimal()
conti.d.30.listw<-nb2listw(conti.d.30)
#Error in command 'nb2listw(conti.d.30)
# Empty neighbour sets found (zero.policy: FALSE)
# neighbours within a radius of d km, so that each region has a neighbour
kkk<-knn2nb(knearneigh(crds.pov)) # k=1 nearest neighbours
all<-max(unlist(nbdists(kkk, crds.pov))) # change the class
all.nb<-dnearneigh(crds.pov, 0, all) # find neighbours in a radius
summary(all.nb) # summary of output# knn matrix with all regions linked – base for distance calculations
pov.knn<-knearneigh(crds.pov, k=379)
pov.nb<-knn2nb(pov.knn)
# inverse distance matrix
dist<-nbdists(pov.nb, crds.pov) # distances between neighbours
dist1<-lapply(dist, function(x) 1/x) # inverse distance - list class
pov.dist.listw<-nb2listw(pov.nb, glist=dist1) # listw matrix
summary(pov.dist.listw)
dist2<-lapply(dist, function(x) x^(-2)) # squared inverse distance
pov.dist.listw2<-nb2listw(pov.nb, glist=dist2)
dist3<-lapply(dist, function(x) exp(-1.5*x)) # exponential distance decay
pov.dist.listw3<-nb2listw(pov.nb, glist=dist3)
# inverse distance matrix
inv.dist<-nb2listwdist(pov.nb, crds.pov, type="idw", style="raw", alpha=1, dmax=NULL, longlat=TRUE, zero.policy=NULL) # class "listw" "nb"
# inverse squared distance
dpd.dist<-nb2listwdist(pov.nb, crds.pov, type="dpd", style="raw", alpha=0, dmax=200, longlat=TRUE, zero.policy=NULL)
# exponential distance-decay
exp.dist<-nb2listwdist(pov.nb, crds.pov, type="exp", style="raw", alpha=0, dmax=NULL, longlat=TRUE, zero.policy=NULL)
inv.dist.mat<-listw2mat(inv.dist) # convert from listw to matrix class
# map of weights for a given region – Warsaw, row 355
pov$inv.dist.WAW<-inv.dist.mat[,355] # add weights to map
plot(pov["inv.dist.WAW"], key.pos=4) # Fig.6.8a
ggplot() + geom_sf(data=pov, aes(fill=inv.dist.WAW)) # Fig.6.8bFigure 6.8: Weighted distance spatial weights matrix: a)
inverse distance in plot(), b) inverse squared distance in
ggplot()
install.packages("devtools") # install spatialWarsaw:: from GitHub
devtools::install_github("poktam/spatialWarsaw")
library(spatialWarsaw)
selector<-sample(1:dim(points)[1], 1000, replace=FALSE) # 1000 points
points.sub.sf<-points.sf[selector, ] # subset
my.tess<-tessW(points.sub.sf, region.voi, sample_size=nrow(points.sub.sf))
my.tess # Fig.6.9a
selector<-sample(1:dim(points)[1], 30, replace=FALSE) # 30 points
points.sub.sf<-points.sf[selector, ] # subset
my.tess<-tessW(points.sub.sf, region.voi, sample_size=nrow(points.sub.sf))
my.tess # Fig.6.7bFigure 6.9: Spatial weights matrix based on Voronoi
tesselation: a) for 1,000 points; b) for 30 points
# number of poviats that build regional groups
sub<-data[data$year==2023,] # subset for one year
table(sub$region_name) # number of poviats within each region
matt<-matrix(0, nrow=380, ncol=380) # create an empty matrix
sub$region_name<-factor(sub$region_name) # change text to factor
reg.names<-levels(sub$region_name) # list all names
# spatial weights matrix based on group membership
for(i in 1:16){ # loop for each region
c1<-which(sub$region_name==reg.names[i]) # vector of IDs of poviats
matt[c1, c1]<-1} # neighbours within groups
diag(matt)<-0
matt[1:15, 1:15]
vec<-rowSums(matt) # sums in rows - de facto number of neighbours
matt.W<-matt/vec # matrix standardisation
matt.listw<-mat2listw(matt.W) # convert to listw
matt.listw
# Fig.6.10a - visualisation of values in the weight matrix
a<-apply(matt.W, 1, max) # maximum value in each row
pov$a<-a
# create 5 quantile-based categories using cut_number() from ggplot2::
pov$a_cat<-cut_number(pov$a, 5)
library(RColorBrewer) # colour palette and plot from RColorBrewer::
color_palette<-brewer.pal(5, "Reds")
ggplot(data=pov) + geom_sf(aes(fill=a_cat)) +
scale_fill_manual(values=color_palette, name="Max Weight Category") +
theme_minimal() + theme(legend.position="none") # Fig.6.10aFigure 6.10: Visualisation of W matrix: a) category-based
matrix, b) k-means groups of points
library(factoextra)
selector<-sample(1:nrow(points), 1000, replace=FALSE)
points.sub<-points[selector, ]
clus<-eclust(points.sub[,c("crds.x", "crds.y")], "kmeans", hc_metric="euclidean", k=10, graph=FALSE)
points.sub$cluster<-clus$cluster
points.sub.sf<-st_as_sf(points.sub, coords=c("crds.x","crds.y"), crs=4326)
pal<-brewer.pal(10, "Paired") # from RColorBrewer::
ggplot() + geom_sf(data=points.sub.sf, aes(col=cluster, size=2)) + scale_colour_gradientn(colors=pal) # Fig.6.10b
matt<-matrix(0, nrow=1000, ncol=1000) # create an empty matrix
for(i in 1:10){ # iterate by clusters
c1<-which(clus$cluster==i) # select observations from given cluster
matt[c1, c1]<-1} # label within-cluster obs. as neighbours
diag(matt)<-0 # diagonal with 0 values
vec<-rowSums(matt) # sums in rows - de facto number of neighbours
matt.W<-matt/vec # matrix standardisation
matt.listw<-mat2listw(matt.W) # convert to listw
matt.listwsummary(conti.d.30) # summary of the nb class neighbourhood matrix
all.listw<-nb2listw(all.nb)
summary(unlist(all.listw$weights))
all.mat<-nb2mat(all.nb) # nb in class matrix
apply(all.mat, 1, summary) # summart by rows (1)
table(card(all.nb))
voi.knn.mat<-nb2mat(voi.knn.nb) # full knn=5 matrix for 16 regions
print(voi.knn.mat)
cont.listw$weights
# launch the editing interface
# edit.nb(voi.knn.nb, crds.voi, polys=voi) # run this only in R console
cont.voi.nb
logic<-c(F,F,F,F,F,F,F,F,F,F,F,T,F,F,F,F) # TRUE means ‘delete’
name<-c("mazowieckie") # region to delete
id<-c(12) # ID of region to detele
change1<-droplinks(cont.voi.nb, logic) # option with logic vector
change2<-droplinks(cont.voi.nb, name) # option with name
change3<-droplinks(cont.voi.nb, id) # option with ID of region
change1 # others give identical results
subset<-c(FALSE, TRUE, FALSE, TRUE, rep(FALSE, 12))
subset.voi<-subset.nb(cont.voi.nb, subset)
subset.voi
cont.voi.nb.self<-include.self(cont.voi.nb)
cont.voi.nb.self
par(mfrow=c(1,3)) # 1 x 3 split window
par(mar=c(5.1, 2, 4.1, 1)) # margins of the figure
plot(st_geometry(voi), main="complement.nb()") # Fig.6.11a
plot(complement.nb(change1), crds.voi, add=T)
plot(st_geometry(voi), main="intersect.nb()") # Fig.6.11b
plot(intersect.nb(cont.voi.nb, change1), crds.voi, add=T)
plot(st_geometry(voi), main="setdiff.nb()") # Fig.6.11c
plot(setdiff.nb(cont.voi.nb, change1), crds.voi, add=T)
par(mfrow=c(1,1)) # 1 x 1 split windowFigure 6.11: Neighbour connections resulting from subsequent operations: a) completed links, b) common part of two matrices, c) difference of two sets
head(levels(as.factor(pov$JPT_NAZWA_)))
# spatial lags and higher order neighbourhood
pov$JPT_NAZWA_ # display the names of regions
pov$SHOCK<-rep(0,380) # new variable with 0 only
pov$SHOCK[pov$JPT_NAZWA_=="powiat Łódź "]<-1 # Łódź labelled with 1
pov$SHOCK[pov$JPT_NAZWA_=="powiat Warszawa"]<-1 # Warszawa labelled with 1
pov$SHOCK[pov$JPT_NAZWA_=="powiat Gdańsk"]<-1 # Gdańsk labelled with 1
pov$SHOCK[pov$JPT_NAZWA_=="powiat Poznań"]<-1 # Poznań labelled with 1
ggplot(data=pov) + geom_sf(aes(fill=SHOCK)) + scale_fill_viridis_c(option = "magma") + labs(fill="SHOCK") + theme_minimal() # Fig.6.12a
# spatial lag of the SHOCK variable - first row
pov$lagg<-lag.listw(cont.listw, pov$SHOCK)
summary(pov$lagg)
library(viridis)
ggplot(data=pov) + geom_sf(aes(fill=lagg)) + scale_fill_viridis_c(option = "magma", direction=1) + labs(fill="lagg") + theme_minimal() # Fig.6.12bFigure 6.12: Diffusion of shocks: a) centres, b) spatial lag
of third order
# spatial lag of the SHOCK variable - second row
poviats.2.list<-nblag(cont.nb, 2)
poviats.2.nb<-nblag_cumul(poviats.2.list)
pov$lagg<-lag.listw(nb2listw(poviats.2.nb), pov$SHOCK)
ggplot(data=pov) + geom_sf(aes(fill=lagg)) + scale_fill_viridis(option= "magma", direction=1) + labs(fill="lagg") + theme_minimal() # not shown
# spatial lag of the SHOCK variable - third row
poviats.3.list<-nblag(cont.nb, 3)
poviats.3.nb<-nblag_cumul(poviats.3.list)
pov$lagg<-lag.listw(nb2listw(poviats.3.nb), pov$SHOCK)
ggplot(data=pov) + geom_sf(aes(fill=lagg)) + scale_fill_viridis(option= "magma", direction=1) + labs(fill="lagg") + theme_minimal() # not shownpts_sub<-slice_sample(points, n=200, replace = FALSE) # subsample 200 firms
pts_sf<-st_as_sf(pts_sub, coords=c("crds.x", "crds.y"), crs=4326) # sf
pts_m<-st_transform(pts_sf, 2180) # metric CRS for metre-based distances
coords_m<-st_coordinates(pts_m) # xy coordinates of points
r_km<-3000 # 3 km # set the radius
nb_d<-dnearneigh(coords_m, d1=0, d2=r_km) # radius from d1 to d2 km
lw_d<-nb2listw(nb_d, style="W", zero.policy=TRUE) # listw matrix
# spatial lag – average in the neighbourhood
pts_m$roa_neigh_mean<-lag.listw(lw_d, pts_m$roa, zero.policy=T)
# number of neighbours within 5 km for each firm
pts_m$n_neigh<-card(nb_d) # from spdep::
head(pts_m[, c("roa", "roa_neigh_mean", "n_neigh")]) # 3 rows onlydata23<-data[data$year==2023,] # subset for one year
cont.nb<-poly2nb(st_make_valid(pov), queen=TRUE) # nb class matrix
cont.listw<-nb2listw(cont.nb, style="W", zero.policy=TRUE) # listw class
pov$unemp_rate23<-data23$unemployment_rate # copy variable to mapmoran(pov$unemp_rate23, cont.listw, length(cont.nb), Szero(cont.listw))
moran.test(pov$unemp_rate23, cont.listw, alternative="greater")
m3<-moran.test(pov$unemp_rate23, cont.listw, randomisation=FALSE)
m3
# p-value for Moran’s I (normal approximation)
pval.norm<-1-pnorm(m3$statistic, mean=0, sd=1)
pval.norm
moran.mc(pov$unemp_rate23, cont.listw, nsim=99, alternative="greater")
z<-as.numeric(scale(pov$unemp_rate23))
moran.plot(z, cont.listw, labels=as.character(pov$JPT_NAZWA_), pch=19, quiet=FALSE)Figure 6.13: Moran scatter plot: a) using the moran.plot(),
b) visualised on the map
# standardised variable and its spatial lag
z<-as.numeric(scale(pov$unemp_rate23))
Wz<-lag.listw(cont.listw, z)
fit<-lm(Wz~z) # fit line (slope ≈ Moran's I)
slope<-coef(fit)[2]
interc<-coef(fit)[1]
df<-data.frame(z=z, Wz=Wz, name=pov$JPT_NAZWA_)# data.frame for plotting
df$rad<-sqrt(df$z^2 + df$Wz^2) # radius – how far is obs. from (0,0)
df$label<-ifelse(rank(-df$rad)<=5, df$name, NA)
library(ggrepel) # for geom_text_repel()
ggplot(df, aes(x=z, y=Wz)) + geom_hline(yintercept=0, linetype=2) + geom_vline(xintercept=0, linetype=2) + geom_point(alpha=0.85, size=1.8) + geom_abline(intercept=interc, slope=slope) + geom_text_repel(aes(label= label), min.segment.length=0) + coord_equal() + labs(x="Standardised unemployment rate (z)", y="Spatial lag of z (Wz)", title="Moran scatterplot (2023, queen contiguity)") + theme_minimal()
# map of belonging to the quarters of the Moran scatterplot
quad<-ifelse(df$z >= 0 & df$Wz >= 0, "I:HH",
ifelse(df$z >= 0 & df$Wz < 0, "II:LH",
ifelse(df$z < 0 & df$Wz < 0, "III:LL", "IV:HL")))
pov$quad_moran<-factor(quad, levels=c("I:HH","II:LH","III:LL","IV:HL"))
ggplot(pov) + geom_sf(aes(fill=quad_moran), color="grey70", size=0.1) + scale_fill_manual(values=c("I:HH"="grey20", "II:LH"="grey55", "III:LL"="grey85", "IV:HL"="grey40"), name="Moran quadrants") + labs(title="Regions by Moran scatterplot quadrants") + theme_minimal() + theme(legend.position="bottom")summary(pov$unemp_rate23)
pov$rate_group<-factor(cut(pov$unemp_rate23, breaks=c(0, 5, 10, 30), labels=c("low", "medium", "high")))
head(pov$rate_group)
# define colour palette
cols<-c("low"="seagreen3", "medium"="dodgerblue3", "high"="firebrick3")
# Fig.6.14a - scatterplot by categories
ggplot(pov, aes(x=1:nrow(pov23), y=unemployment_rate, fill=rate_group)) + geom_point(shape=21, size=3, color="grey30") + geom_hline(yintercept=c(5, 10), linetype="dashed", color="black") + scale_fill_manual(values=cols, name="Rate group") + labs(x="Observation number", y="Unemployment rate (%)", title="Unemployment rates by category (2023)", subtitle="Dashed lines show group thresholds at 5% & 10%") + theme_minimal() + theme(legend.position = "bottom", panel.grid.minor=element_blank())
# Fig.6.14b - spatial distribution by categories
ggplot(pov) + geom_sf(aes(fill=rate_group), colour="grey60", size=0.1) + scale_fill_manual(values=cols, name="Unemployment rate group") + labs(title="Unemployment rate categories, 2023") + theme_minimal() + theme(legend.position="bottom")Figure 6.14: Graph of unemployment rates in 2023 a) scatterplot (x = ordinal number, y = variable value), b) spatial distribution of values according to value groups
locM<-localmoran(pov$unemp_rate23, cont.listw)
head(locM)
# attach the data
pov$Ii <-locM[, "Ii"]
pov$p_two <-locM[, "Pr(z != E(Ii))"]
# classify significance
pov$lisa_sig<-with(pov, ifelse(p_two<0.05 & Ii>0, "Significant positive (cluster)", ifelse(p_two<0.05 & Ii<0, "Significant negative (outlier)", "Not significant")))
pov$lisa_sig<-factor(pov$lisa_sig, levels=c("Significant positive (cluster)", "Not significant", "Significant negative (outlier)"))
# Fig.6.15a - map of local significance (LISA)
ggplot(pov) + geom_sf(aes(fill=lisa_sig), color="grey70", size=0.1) + scale_fill_manual(values=c("Significant positive (cluster)"="grey20", "Not significant"="grey90", "Significant negative (outlier)"="grey55"), name="Local Moran’s I (p < 0.05)") + labs(title="Significance of Local Moran’s I (unemployment rate, 2023)") + theme_minimal() + theme(legend.position ="bottom")# Gi (no self in W): hot/cold spots
gi_z<-localG(pov$unemp_rate23, cont.listw) # local statistic
gi_p<-2*(1-pnorm(abs(gi_z))) # two-sided p-value
pov$gi_z<-as.numeric(gi_z)
pov$gi_p<-gi_p
pov$gi_sig<-factor(ifelse(gi_p<0.05 & gi_z>0, "High (hot spot)", ifelse(gi_p<0.05 & gi_z<0, "Low (cold spot)", "Not significant")), levels= c("High (hot spot)", "Low (cold spot)", "Not significant"))
summary(pov$gi_sig)
# Gi* (self in W): include self-neighbour; expect more/larger hot spots
lw_star<-nb2listw(include.self(cont.nb), style="W", zero.policy=TRUE)
gstar_z<-localG(pov$unemp_rate23, lw_star)
gstar_p<-2*(1-pnorm(abs(gstar_z)))
pov$gstar_z<-as.numeric(gstar_z)
pov$gstar_p<-gstar_p
pov$gstar_sig<-factor(ifelse(gstar_p<0.05 & gstar_z>0, "High (hot spot)", ifelse(gstar_p<0.05 & gstar_z<0, "Low (cold spot)", "Not significant")), levels=c("High (hot spot)", "Low (cold spot)", "Not significant"))
summary(pov$gstar_sig)
# Fig.6.15b - map of Gi significance
ggplot(pov) + geom_sf(aes(fill=gi_sig), colour="grey75", size=0.08) + scale_fill_manual(values=c("High (hot spot)"= "#1a9850", "Low (cold spot)"="#d73027", "Not significant" ="grey85"), name="Gi (two-sided, 5%)") + labs(title="Local Getis–Ord Gi: hot/cold spots of unemployment") + theme_minimal() + theme(legend.position="bottom")Figure 6.15: Significant local statistics: a) local Moran, b)
local Getis-ord Gi
locG<-localG(pov$unemp_rate23, cont.listw) # local Gi statistics
a<-summary(locG)
a
# define colour breaks and palette
brks<-c(a[1], a[2], a[3], a[5], a[6])
colfunc<-colorRampPalette(c("royalblue", "springgreen", "yellow", "red"))
cols<-colfunc(5)
# Fig.6.16a - map of Gi statistics
ggplot(pov) + geom_sf(aes(fill=cut(locG, breaks=brks, include.lowest= TRUE)), colour="grey70", size=0.1) + scale_fill_manual(values=cols, labels= c("Very low", "Low", "Medium", "High", "Very high"), name="Gi statistic") + labs(title="Local Getis–Ord Gi (2023)") + theme_minimal() + theme(legend.position="bottom")
locH<-LOSH(pov$unemp_rate23, cont.listw, a=2, var_hi=TRUE, zero.policy=TRUE, na.action=na.exclude) # local Hi LOSH statistics
summary(locH)
b<-summary(locH[,"Hi"]) # summary of single element of output
b
# Fig.6.16b - map of Hi statistics
brks<-c(b[1], b[2], b[3], b[5], b[6])
ggplot(pov) + geom_sf(aes(fill=cut(locH[,"Hi"], breaks=brks, include.lowest=TRUE)), colour="grey70", size=0.1) + scale_fill_manual(values=cols, labels=c("Very low", "Low", "Medium", "High", "Very high"), name="Hi (LOSH) statistic") + labs(title="Local spatial heteroscedasticity (Hi, 2023)") + theme_minimal() + theme(legend.position ="bottom")Figure 6.16: Common analysis of local statistics: a) measure
of spatial correlation Gi, b) measure of spatial
heteroscedasticity
library(spatialEco) # for crossCorrelation()
library(spdep) # for nb2mat()
library(mice) # for imputation in case of missing values
# prepare variables: unemployment in 2008 and 2023
data08<-data[data$year==2008,]
data23<-data[data$year==2023,]
x_2008<-data08$unemployment_rate # this variable has NA, imputation needed
x_2023<-data23$unemployment_rate
myNA<-which(is.na(x_2008)==TRUE) # check which observation is missing
index<-rep(TRUE, times=length(x_2008)) # vector of TRUE for all obs.
index[myNA]<-FALSE # exchange TRUE to FALSE for missing obs.
imput.x_2008<- mice.impute.mean(x_2008, index) # simple imputation
x_2008[myNA]<-imput.x_2008 # change NA to values in vector of data
# prepare contiguity weights in matrix class for crossCorrelation()
W_mat<-nb2mat(cont.nb, style="W") # row-standardised queen neighbours
#W_mat<-listw2mat(cont.listw) # row-standardised queen neighbours
# version 1 - with contiguity W matrix
cc_W<-crossCorrelation(x=x_2008, y=x_2023, w=W_mat, dist.function="none", type=c("LSCI", "GSCI"), k=999, scale.xy=TRUE, scale.partial=TRUE)
cc_W
# version 2 - with coordinates and inverse distance matrix
crds<-st_coordinates(st_centroid(st_make_valid(pov)))
cc_IP<-crossCorrelation(x=x_2008, y=x_2023, coords=crds, type=c("LSCI", "GSCI"), k=999, dist.function="inv.power", scale.xy=TRUE, scale.partial=TRUE)
cc_IP
# non-spatial correlation between 2022 and 2023 unemployment
cor(x_2008, x_2023, use="complete.obs")
library(scales) # for number_format()
# attach LSCI (Local Spatial Cross-Correlation Index) (x→y) to map
pov$LSCI_W<-cc_W$SCI[ ,"lsci.xy"]
pov$LSCI_IP<-cc_IP$SCI[ ,"lsci.xy"]
# maps with a common colour scale
# diverging around 0, negative=local contrast, positive=local reinforcement)
lims<-c(-1, 1) # limits of scale from -1 to 1
lab_fun<-function(x) number_format(accuracy = 0.1)(x) # to round the ticks
# Fig.6.17a: LSCI with queen contiguity (W)
ggplot(pov) + geom_sf(aes(fill=pmin(pmax(LSCI_W, lims[1]), lims[2])), colour="grey75", size=0.08) + scale_fill_gradient2(low="#4575b4", mid="white", high="#d73027", midpoint=0, limits=lims, name="LSCI (2008 → 2023)", labels=lab_fun ) + labs(title="LSCI:2008 → 2023", subtitle="W = queen contiguity") + theme_minimal() + theme(legend.position="bottom")
# Fig.6.17b: LSCI with inverse-power distance W
ggplot(pov) + geom_sf(aes(fill=pmin(pmax(LSCI_IP, lims[1]), lims[2])), colour="grey75", size=0.08) + scale_fill_gradient2(low="#4575b4", mid="white", high="#d73027", midpoint=0, limits=lims, name="LSCI (2008 → 2023)", labels=lab_fun) + labs(title="LSCI:2008 → 2023", subtitle= "W = inverse-power distance") + theme_minimal() + theme(legend.position ="bottom")Figure 6.17: Local Spatial Cross-Correlation (LSCI) of
unemployment rate between years 2008 and 2023
# Moran’s I by neighbour order
corr_I<-sp.correlogram(cont.nb, data23$unemployment_rate, order=6, method="I", zero.policy=TRUE) # from spdep::
print(corr_I, digits=2)
# reconstruct Z and two-sided p-values for plotting
library(tibble) # to make a table in format for plotting
library(dplyr) # for mutate()
corrI_df<-tibble(order=seq_len(nrow(corr_I$res)),
estimate_I = corr_I$res[, 1],
expectation = corr_I$res[, 2],
variance = corr_I$res[, 3]) %>% mutate(
z = (estimate_I - expectation) / sqrt(variance),
p_two_sided = 2 * (1 - pnorm(abs(z))),
sig = p_two_sided < 0.05)
# Fig.6.18a — correlogram (Moran’s I) by neighbour order
ggplot(corrI_df, aes(order, estimate_I)) + geom_hline(yintercept=0, linetype=2) + geom_line() + geom_point(aes(shape=sig)) + scale_shape_manual(values=c(`TRUE`=16, `FALSE`=1), name="Significant (5%)") + labs(title="Correlogram (Moran’s I)", x="Neighbour order (queen contiguity)", y="Moran’s I") + theme_minimal() + theme(legend.position= "bottom")
# Pearson’s classic correlation by neighbour order
corr_R<-sp.correlogram(cont.nb, data23$unemployment_rate, order=6, method="corr", zero.policy=TRUE)
print(corr_R)
# test r statistic with t-test: df=n-2
n_obs<-sum(!is.na(data23$unemployment_rate))
r_vals<-as.numeric(corr_R$res)
t_stat<-sqrt(n_obs-2)*r_vals/sqrt(pmax(1-r_vals^2, .Machine$double.eps))
p_val<-2*(1-pt(abs(t_stat), df=n_obs-2))
corrR_df<-tibble(order=seq_along(r_vals), r=r_vals, t=t_stat, p=p_val, sig=p<0.05)
corrR_df
# correlogram (Pearson’s classic r) by neighbour order – not shown
ggplot(corrR_df, aes(order, r)) + geom_hline(yintercept=0, linetype=2) + geom_line() + geom_point(aes(shape=sig)) + scale_shape_manual(values= c(`TRUE`=16, `FALSE`=1), name="Significant (5%)") + labs(title="Correlogram (classic correlation) - unemployment rate, 2023", x="Neighbour order (queen contiguity)", y="Correlation r") + theme_minimal()
library(spatialEco) # for correlogram()
crds<-st_centroid(st_make_valid(pov)) # centroids of regions
crds$unemp_rate23<-data23$unemployment_rate # variable for analysis
cg<-correlogram(crds, crds$unemp_rate23, dist=35000) #dist in meters
cg # a few first rows of the output, originally there are 22 rows
# build a tidy data.frame for plotting
ac_df<-as.data.frame(cg$autocorrelation)
names(ac_df)<-c("acf", "dist_m", "lci", "uci")
ac_df$dist_km<-ac_df$dist_m / 1000
ac_df
# Fig.6.18b - distance correlogram with 95% CI ribbon
ggplot(ac_df, aes(dist_km, acf)) + geom_hline(yintercept=0, linetype=2) + geom_ribbon(aes(ymin=lci, ymax=uci), alpha=0.15) + geom_line() + geom_point(size=1.6) + labs(title="Distance correlogram", x="Distance (km, bin width ≈ 35 km)", y="Correlation") + theme_minimal()+ theme(legend.position="bottom")Figure 6.18: Correlogram for the variable unemployment rate in 2023: a) sixth order correlogram, b) correlation graph based on distance
selector<-sample(1:dim(points)[1], 2000, replace=FALSE) # 1000 points
points.sub.sf<-points.sf[selector, ]# subset
svk1<-semiVarKnn(points.sub.sf, "roa", 500, max_knn=20)
svk1 # Fig.6.19a
svk2<-semiVarKnn(points.sub.sf, "locAggTOTAL", 500, max_knn=20)
svk2 # Fig.6.19bFigure 6.19: Semi-variance by nearest neighbours: a) for a variable firm profitability (roa), b) for a variable local agglomeration of firms (locAggTOTAL)
# prepare data
sub<-data[data$year==2022, ] # selection of year
# select dependent and independent variables
sub$y<-sub$salary_avg_Poland_100p
sub$x1<-sub$unemployment_rate
sub$x2<-sub$dist
sub$x3<-sub$birth_per_1K
sub$x4<-sub$death_per_1K
sub$x5<-sub$invest_pc_PLN
sub$x6<-sub$fixed_assets_pc_PLN
sub$x7<-sub$new_firms_per_10K_inhab
eq1<-y~x1+x2+x3+x4+x5+x6+x7 # equation for the estimation
# reduced dataset
regdata<-sub[ ,c("y", "x1", "x2", "x3", "x4", "x5","x6", "x7")]
library(PerformanceAnalytics)
chart.Correlation(regdata, histogram=TRUE, pch=19) # Fig.7.3Figure 7.3: Examination of correlations between variables
selected for the model
model.lm<-lm(eq1, data=sub)
summary(model.lm)
library(spdep) # spdep:: for spatial weights matrices
cont.nb<-poly2nb(st_make_valid(pov), queen=TRUE) # neighbourhood matrix
cont.listw<-nb2listw(cont.nb, style="W") # contiguity spatial weights matrix
library(spatialreg) # for sacsarlm() and other spatial models
# Manski model (all three spatial coefficients)
# spatial lag of Y (rho)
# spatial lags of X (theta)
# spatial autocorrelation of error (lambda)
model.GNS<-sacsarlm(eq1, data=sub, listw=cont.listw, type="sacmixed")
summary(model.GNS, Nagelkerke=TRUE)
model.GNS$logLik_lm.model # OLS: logLik and degrees of freedom
model.GNS$LL # logLik of the GNS model
# SDM model (two spatial coefficients)
# spatial lag Y (rho ρ)
# spatial lags X (theta θ)
model.SDM<-lagsarlm(eq1, data=sub, listw=cont.listw, type="mixed", tol.solve=1.0e-20, method="LU")
summary(model.SDM, Nagelkerke=TRUE)
# Manski (GNS) model (all three spatial coefficients)
# spatial lag Y (rho)
# spatial lags X (theta)
# spatial autocorrelation of error (lambda)
model.GNS<-sacsarlm(eq1, data=sub, listw=cont.listw, type="sacmixed")
summary(model.GNS, correlation=TRUE)
# SAC model (two spatial coefficients)
# spatial lag Y (rho)
# spatial autocorrelation of error (lambda)
model.SAC<-sacsarlm(eq1, data=sub, listw=cont.listw, method="LU", tol.solve=1.0e-20)
# SDM model (two spatial coefficients)
# spatial lag Y (rho ρ)
# spatial lags X (theta θ)
model.SDM<-lagsarlm(eq1, data=sub, listw=cont.listw, type="mixed", tol.solve=1.0e-20, method="LU")
# SDEM model (two spatial coefficients)
# spatial lags X (theta)
# spatial autocorrelation of error (lambda)
model.SDEM<-errorsarlm(eq1, data=sub, listw=cont.listw, etype="emixed", method="LU", tol.solve=1.0e-20)
# SAR model (one spatial factor)
# spatial lag Y (rho)
model.SAR<-lagsarlm(eq1, data=sub, listw=cont.listw)
# SLX model (one spatial factor)
# spatial lags X (theta)
model.SLX<-lmSLX(eq1, data=sub, listw=cont.listw)
# SEM model (one spatial factor)
# spatial autocorrelation of error (lambda)
model.SEM<-errorsarlm(eq1, data=sub, listw=cont.listw)
# OLS model (no spatial factors)
model.OLS<-lm(eq1, data=sub)
library(texreg) # for nice summary of models
my.table<-screenreg(list(model.GNS, model.SAC, model.SDM, model.SDEM, model.SEM, model.SAR, model.SLX, model.OLS), custom.model.names=c("GNS", "SAC", "SDM", "SDEM", "SEM", "SAR", "SLX", "OLS")) # all models jointly
my.table# create time-space lag
sub0<-data[data$year==2021, ]
sub$y.stlag<-lag.listw(cont.listw, sub0$salary_avg_Poland_100p)
eq1<-y~x1+x2+x3+x4+x5+x6+x7 # initial regression equation
eq2<-y~x1+x2+x3+x4+x5+x6+x7+y.stlag # equation with spatio-temporal lag
# SEM model (one spatial factor lambda) and time-space lag y (y.stlag)
model.SEMst<-errorsarlm(eq2, data=sub, listw=cont.listw, tol.solve=1.0e-20, method="LU")
summary(model.SEMst)# impacts for SDM
model.SDM<-lagsarlm(eq1, data=sub, listw=cont.listw, type="mixed", tol.solve=1.0e-20, method="LU")
W.c<-as(as_dgRMatrix_listw(cont.listw), "CsparseMatrix")
trMat<-trW(W.c, type="mult")
model.SDM.imp<-impacts(model.SDM, tr=trMat, R=2000)
summary(model.SDM.imp, zstats=TRUE, short=TRUE)
a<-model.SDM.imp$res$direct # direct effects only
b<-model.SDM.imp$res$indirect # only indirect effects
c<-model.SDM.imp$res$total # only total effects
a/c # share of direct effect in the total
abs(a)/abs(b) # relation of direct to indirect effects# SAR model (one spatial factor rho)
model.SAR<-lagsarlm(eq1, data=sub, listw=cont.listw)
# SDM model (two spatial coefficients rho & theta)
model.SDM<-lagsarlm(eq1, data=sub, listw=cont.listw, type="mixed", tol.solve=1.0e-20, method="LU")
AIC(model.SAR)
BIC(model.SAR)
attributes(model.SAR)
model.SAR$AIC_lm.model
logLik(model.SAR)
model.SAR$LL
AIC(model.lm, model.SAR, model.SDM) # AIC information criteria
BIC(model.lm, model.SAR, model.SDM) # BIC information criteria
out<-AIC(model.lm, model.SAR, model.SDM)
out$BIC<-c(BIC(model.lm), BIC(model.SAR), BIC(model.SDM))
out$logLik<-c(logLik(model.lm), logLik(model.SAR), logLik(model.SDM))
out # displaying the resultlibrary(lmtest)
model.lm<-lm(eq1, data=sub)
bptest(model.lm) # BP test for residuals from the OLS model
# SAC model (two spatial coefficients rho and lambda)
model.SAC<-sacsarlm(eq1, data=sub, listw=cont.listw, method="LU")
bptest.Sarlm(model.SAC) # BP test for residuals from the SAC model
# LOSH statistics for the rest of the model
losh.stat.SAC<-LOSH(model.SAC$residuals, cont.listw, a=2, var_hi=TRUE, zero.policy=TRUE, na.action=na.exclude)
summary(losh.stat.SAC)
losh.stat.LM<-LOSH(model.lm$residuals, cont.listw, a=2, var_hi=TRUE, zero.policy=TRUE, na.action=na.exclude)
# Fig.7.4a - visualizing spatial distribution of model residuals
# attach LOSH heterogeneity to spatial object
pov$LOSH_LM<-losh.stat.LM[,3]
pov$LOSH_SAC<-losh.stat.SAC[,1]
# Define colour breaks and palette
a_lm<-summary(pov$LOSH_LM)
a_sac<-summary(pov$LOSH_SAC)
brks_lm<-c(a_lm[1], a_lm[2], a_lm[3], a_lm[5], a_lm[6])
brks_sac<-c(a_sac[1], a_sac[2], a_sac[3], a_sac[5], a_sac[6])
colfunc<-colorRampPalette(c("royalblue", "springgreen", "yellow", "red"))
cols<-colfunc(5)
ggplot(pov) + geom_sf(aes(fill=cut(LOSH_LM, breaks=brks_lm, include.lowest=TRUE)), colour="grey70", size=0.1) +
scale_fill_manual(values=cols, labels=c("Very low", "Low", "Medium", "High", "Very high"), name="LOSH variance") + labs(title="LOSH heterogeneity of OLS residuals") + theme_minimal() + theme(legend.position="bottom") # Fig.7.4a
ggplot(pov) + geom_sf(aes(fill=cut(LOSH_SAC, breaks=brks_sac, include.lowest=TRUE)), colour="grey70", size=0.1) +
scale_fill_manual(values=cols, labels=c("Very low", "Low", "Medium", "High", "Very high"), name="LOSH variance") + labs(title="LOSH heterogeneity of SAC residuals") + theme_minimal() + theme(legend.position="bottom") # Fig.7.4bFigure 7.4: LOSH heterogeneity for residuals: a) from OLS
model, b) from SAC model
model.lm<-lm(eq1, data=sub)
lm.morantest(model.lm, cont.listw) # Moran test for OLS residuals
# SAC model (two spatial coefficients rho and lambda)
model.SAC<-sacsarlm(eq1, data=sub, listw=cont.listw, method="LU")
moran.test(model.SAC$residuals, cont.listw)# Moran test for SAC residuals
# Fig.7.5a - spatial distribution of residuals from the OLS linear model
pov$res<-model.lm$residuals
ggplot(pov)+ geom_sf(aes(fill=cut(res, c(-Inf, mean(res)-sd(res), mean(res), mean(res)+sd(res), Inf), labels=c("(min; mean−sd]","(mean−sd; mean]", "(mean; mean+sd]", "(mean+sd; max]"))), colour=NA) + scale_fill_manual(name="OLS residuals", values=c("steelblue4", "lightskyblue", "thistle1", "plum3")) + labs(title = "Distribution of residuals in the OLS model") + theme_minimal()
# Fig.7.5b - residuals by sign
ggplot(pov)+geom_sf(aes(fill=res>0),colour=NA) + scale_fill_manual(name= "Residual sign", values=c("#b2182b", "#2166ac"), labels=c("Negative", "Positive")) + labs(title="Positive and negative residuals in the OLS model") + theme_minimal()Figure 7.5: Illustration of model residuals: a) by split according to mean and standard deviation, b) by sign – split into positive and negative
# join.count test for residuals (positive vs. negative)
res<-model.lm$residuals
resid<-factor(cut(res, breaks=c(-100, 0, 100), labels=c("Negative", "Positive")))
joincount.test(resid, cont.listw)# unlimited Manski model - three spatial coefficients - lambda, rho, theta
model.GNS<-sacsarlm(eq1, data=sub, listw=cont.listw, type="sacmixed")
# restricted model - SDM model (two spatial coefficients rho and theta)
model.SDM<-lagsarlm(eq1, data=sub, listw=cont.listw, type="mixed", tol.solve=1.0e-20, method="LU")
LR.Sarlm(model.GNS, model.SDM) # comparison of the two indicated models
Wald1.Sarlm(model.SDM) # comparison of the indicated model with OLS
LR1.Sarlm(model.SDM) # comparison of the indicated model with OLScrds<-st_coordinates(st_centroid(st_make_valid(pov))) # crds matrix
# kNN listw for k = 10,…,100
listw10<-nb2listw(make.sym.nb(knn2nb(knearneigh(crds, k=10))))
listw20<-nb2listw(make.sym.nb(knn2nb(knearneigh(crds, k=20))))
listw30<-nb2listw(make.sym.nb(knn2nb(knearneigh(crds, k=30))))
listw40<-nb2listw(make.sym.nb(knn2nb(knearneigh(crds, k=40))))
listw50<-nb2listw(make.sym.nb(knn2nb(knearneigh(crds, k=50))))
listw60<-nb2listw(make.sym.nb(knn2nb(knearneigh(crds, k=60))))
listw70<-nb2listw(make.sym.nb(knn2nb(knearneigh(crds, k=70))))
listw80<-nb2listw(make.sym.nb(knn2nb(knearneigh(crds, k=80))))
listw90<-nb2listw(make.sym.nb(knn2nb(knearneigh(crds, k=90))))
listw100<-nb2listw(make.sym.nb(knn2nb(knearneigh(crds, k=100))))
# estimate SAC models
model.SAC.10<-sacsarlm(eq1, data=sub, listw=listw10, method="LU")
model.SAC.20<-sacsarlm(eq1, data=sub, listw=listw20, method="LU")
model.SAC.30<-sacsarlm(eq1, data=sub, listw=listw30, method="LU")
model.SAC.40<-sacsarlm(eq1, data=sub, listw=listw40, method="LU")
model.SAC.50<-sacsarlm(eq1, data=sub, listw=listw50, method="LU")
model.SAC.60<-sacsarlm(eq1, data=sub, listw=listw60, method="LU")
model.SAC.70<-sacsarlm(eq1, data=sub, listw=listw70, method="LU")
model.SAC.80<-sacsarlm(eq1, data=sub, listw=listw80, method="LU")
model.SAC.90<-sacsarlm(eq1, data=sub, listw=listw90, method="LU")
model.SAC.100<-sacsarlm(eq1, data=sub, listw=listw100, method="LU")
# create a comparison table
out<-AIC(model.SAC.10, model.SAC.20, model.SAC.30, model.SAC.40, model.SAC.50, model.SAC.60, model.SAC.70, model.SAC.80, model.SAC.90, model.SAC.100)
out$knn<-seq(10, 100, by=10) # number of knn included in W matrixes
out$logLik<-c(logLik(model.SAC.10), logLik(model.SAC.20), logLik(model.SAC.30), logLik(model.SAC.40), logLik(model.SAC.50), logLik(model.SAC.60), logLik(model.SAC.70), logLik(model.SAC.80), logLik(model.SAC.90),logLik(model.SAC.100))
out$lambda<-c(model.SAC.10$lambda, model.SAC.20$lambda, model.SAC.30$lambda, model.SAC.40$lambda, model.SAC.50$lambda, model.SAC.60$lambda, model.SAC.70$lambda, model.SAC.80$lambda, model.SAC.90$lambda, model.SAC.100$lambda)
out$rho<-c(model.SAC.10$rho, model.SAC.20$rho, model.SAC.30$rho, model.SAC.40$rho, model.SAC.50$rho, model.SAC.60$rho, model.SAC.70$rho, model.SAC.80$rho, model.SAC.90$rho, model.SAC.100$rho)
out
# graph of spatial parameters in subsequent models – Fig.7.6a
ggplot(out, aes(x = knn)) + geom_line(aes(y=lambda, linetype="lambda")) +
geom_line(aes(y=rho, linetype="rho"), linewidth=1.1) +
scale_linetype_manual(values= c("lambda"="dashed", "rho"="solid")) +
scale_x_continuous(breaks= seq(10, 100, by=10)) +
scale_y_continuous(breaks= seq(-0.2, 0.4, by=0.1)) +
labs(x="Number of k nearest neighbours", y="Spatial parameters", linetype="") + theme_minimal() + theme(legend.position= "bottom", panel.grid.minor=element_blank(), panel.grid.major= element_line(colour="grey85"))
# AIC information criteria chart in subsequent models - Fig.7.6b
ggplot(out, aes(x=knn, y=AIC)) + geom_line(aes(linetype="AIC"), linewidth=1) + scale_linetype_manual(values="solid", name="") +
scale_x_continuous(breaks= seq(10, 100, by=10)) +
scale_y_continuous(breaks= c(2652, 2655, 2658, 2661)) +
labs(x="Number of k nearest neighbours", y="AIC") + theme_minimal() +
theme(legend.position="bottom", panel.grid.minor= element_blank(), panel.grid.major=element_line(colour="grey85"))Figure 7.6: Parameters in the SAC model depending on the number of k nearest neighbours in the spatial weight matrix: a) rho and lambda, b) AIC
# SDM model (two spatial coefficients, rho and theta)
model.SDM<-lagsarlm(eq1, data=sub, listw=cont.listw, type="mixed", tol.solve=1.0e-20, method="LU")
# SDEM model (two spatial coefficients, theta and lambda)
model.SDEM<-errorsarlm(eq1, data=sub, listw=cont.listw, etype="emixed", method="LU")
# SAR model (one spatial factor rho)
model.SAR<-lagsarlm(eq1, data=sub, listw=cont.listw)
# SEM model (one lambda spatial coefficient)
model.SEM<-errorsarlm(eq1, data=sub, listw=cont.listw)
model.SDM.p<-predict(model.SDM)
model.SDEM.p<-predict(model.SDEM)
model.SAR.p<-predict(model.SAR)
model.SEM.p<-predict(model.SEM)
model.SEM.p # only first 5 rows displayed
library(Metrics)
vec<-c("model.SDM.p", "model.SDEM.p", "model.SAR.p", "model.SEM.p")
metrics<-matrix(0, nrow=4, ncol=5)
rownames(metrics)<-vec
colnames(metrics)<-c("bias", "bias%", "MAE","MAPE", "RMSE")
for(i in 1:4){
metrics[i,1]<-bias(sub$y, get(vec[i]))
metrics[i,2]<-percent_bias(sub$y, get(vec[i]))
metrics[i,3]<-mae(sub$y, get(vec[i]))
metrics[i,4]<-mape(sub$y, get(vec[i]))
metrics[i,5]<-rmse(sub$y, get(vec[i]))}
metrics# plot of core cities and diffusion directions (Fig.7.7a)
voi.m<-st_transform(voi, 2180) # voi projected to metres (Poland CS92)
pov.m<-st_transform(pov, 2180) # pov projected to metres (Poland CS92)
cent<-st_coordinates(st_centroid(st_make_valid(pov.m))) # arrow start points
city.id<-which(data$core_city==1 & data$year==2008) # IDs of core cities
# starting coordinates of arrows
starts<-data.frame(x=cent[city.id, 1], y=cent[city.id, 2])
# create 5 random arrows per city
arrows_df <- starts[rep(seq_len(nrow(starts)), each=5), ] |>
mutate(angle=runif(n(), 0, 2*pi), # random direction
len_m=runif(n(), 20000, 60000), # random length from 20 to 60 km
xend=x+len_m*cos(angle), yend=y+len_m*sin(angle))
ggplot() + geom_sf(data=pov.m, aes(fill= factor(data$core_city[data$year== 2022], levels=c(0, 1))), color="grey70", linewidth=0.2) + geom_sf(data=voi.m, fill=NA, color="black", linewidth=0.6) + scale_fill_manual(values= c("0"="white", "1"="red"), guide="none") + theme_void() + geom_curve(data= arrows_df, curvature=0.2, aes(x=x, y=y, xend= xend, yend= yend), arrow = arrow(length=unit(0.1, "cm"), type="open"), linewidth=0.3, color="blue") # Fig.7.7a
library(RColorBrewer)
# core–periphery distance: poviat centroids to regional cores (Fig.7.7b)
pov$dist<-as.numeric(data$dist[data$year==2022]) # distances to core cities
pov$dist_class<-cut(pov$dist, breaks=c(0, 25, 40, 55, 70, 90, Inf), include.lowest=T) # assign distance classes
ggplot() + geom_sf(data=pov, aes(fill=dist_class), color="black", linewidth=0.2) + geom_sf(data=voi, fill=NA, color="black", linewidth=0.6) + scale_fill_manual(values=rev(brewer.pal(6, "Spectral")), name="Core periphery\ndistance (km)") + theme_void() +theme(legend.position="right")Figure 7.7: a) directions of the influence of cores, b)
distances between core and peripheral locations
# line chart based on panel data Fig.7.8a
# data in long panel format
df<-data %>% mutate(variable=unemployment_rate, # select the variable here
group=factor(case_when(core_city==1 ~ "core - main regional cities",
core_city==0 & dist<25 ~ "up to 25 km from the core",
dist>=25 & dist<50 ~ "25–50 km from the core",
dist>=50 & dist<100 ~ "50–100 km from the core",
dist>=100 ~ "more than 100 km from the core"),
levels= c("core - main regional cities", "up to 25 km from the core",
"25–50 km from the core", "50–100 km from the core",
"more than 100 km from the core"), ordered = TRUE)) %>%
group_by(year, group) %>% summarise(mean_value=mean(variable, na.rm=TRUE), .groups="drop")
head(df)
library(RColorBrewer)
ggplot(df, aes(x=year, y=mean_value, colour=group)) + geom_line(linewidth= 0.9) + scale_colour_manual(values= rev(brewer.pal(5, "Spectral")), name=NULL) + labs(title= "Unemployment rate in poviats", x=NULL, y=NULL) + coord_cartesian(ylim =c(0, 21)) + theme_classic() + theme(legend.position= c(0.98, 0.98), legend.justification=c("right", "top"), plot.title=element_text(hjust=0.5, face="bold")) # Fig.7.8a
# phenomenon depending on the distance – bubble chart Fig.7.8b
brks.pop<-c(0, 0.5, 0.75, 1.00, 1.25, 2, 5, 15) # intervals for population
# filter the data, compute relative population and create size classes
df1<-data %>% filter(region_name=="mazowieckie", year==2022) %>%
mutate(variable= unemployment_rate, pop_rel= persons_in_K/ mean(persons_in_K, na.rm=TRUE), pop_class=cut(pop_rel,breaks=brks.pop, include.lowest=TRUE, right=TRUE))
ggplot(df1, aes(x=dist, y=variable, size=pop_class)) + geom_point(shape=21, colour="chartreuse3", fill="chartreuse3", stroke=0) + scale_size_manual(values= brks.pop[-1]*2, name=NULL) + coord_cartesian(xlim=c(0, 120), ylim=c(0, 20)) + labs(title="Unemployment rate in Mazowieckie poviats in 2022", x="Distance of the poviat from the voivodeship city", y="Unemployment rate") + theme_classic() + guides(size="none") + theme(plot.title= element_text(hjust=0.5, face="bold")) # Fig.7.8bFigure 7.8: Dependence of the unemployment rate on the distance from the core city visualized with a) line chart, b) bubble chart
sub<-data[data$year==2022,]
sub$variable<-sub$unemployment_rate/mean(sub$unemployment_rate, na.rm=TRUE)
# visualisation of the distance - phenomenon relation
plot(log(sub$dist),log(sub$variable), main="log x, log y")
plot(log(sub$dist), sub$variable, main="x, log y")
plot(sub$dist, sub$variable, main="x, y")
# matrix of spatial weights according to the contiguity criterion
cont.nb<-poly2nb(st_make_valid(pov), queen=TRUE) # spdep:: package
cont.listw<-nb2listw(cont.nb, style="W") # spdep:: package
# model of polynomial distance (multinominal distance)
mod.multi.asp<-glm(variable~dist+I(dist^2)+I(dist^3)+ I(dist^4), data=sub)
mod.multi.sp<-errorsarlm(variable~dist+I(dist^2)+ I(dist^3)+ I(dist^4), data=sub, cont.listw, tol.solve=2e-40)
# power model
mod.power.asp<-glm(log1p(variable)~log1p(dist), data=sub)
mod.power.sp<-errorsarlm(log1p(variable)~log1p(dist), data=sub, cont.listw)
# exponential model
mod.exp.asp<-glm(log1p(variable)~dist, data=sub)
mod.exp.sp<-errorsarlm(log1p(variable)~dist, data=sub, cont.listw)
# goodness-of-fit measures
out<-matrix(0, nrow=2, ncol=6)
colnames(out)<-c("multi.asp", "multi.sp", "power.asp", "power.sp", "exp.asp", "exp.sp")
rownames(out)<-c("SRMSE", "lambda")
a<-mean(sub$variable)
b<-dim(sub)[1]
c<-sub$variable
out[1,1]<-sqrt(sum((mod.multi.asp$fitted.values-c)^2)/b)/a
out[1,2]<-sqrt(sum((mod.multi.sp$fitted.values-c)^2)/b)/a
out[1,3]<-sqrt(sum((mod.power.asp$fitted.values-c)^2)/b)/a
out[1,4]<-sqrt(sum((mod.power.sp$fitted.values-c)^2)/b)/a
out[1,5]<-sqrt(sum((mod.exp.asp$fitted.values-c)^2)/b)/a
out[1,6]<-sqrt(sum((mod.exp.sp$fitted.values-c)^2)/b)/a
out[2,2]<- mod.multi.sp$lambda
out[2,4]<- mod.power.sp$lambda
out[2,6]<- mod.exp.sp$lambda
out
# visualisation of model fit – Fig.7.9
plot(sub$dist, sub$variable, main="OLS, multinominal, fitted values")
points(sub$dist, mod.multi.asp$fitted.values, col="red")
abline(h=1, lty=3)
plot(sub$dist, sub$variable, main="SEM, multinominal, fitted values")
points(sub$dist, mod.multi.sp$fitted.values, col="red")
abline(h=1, lty=3)
plot(log1p(sub$dist), log1p(sub$variable), main="OLS, power, fitted values")
points(log1p(sub$dist), mod.power.asp$fitted.values, col="red")
abline(h=1, lty=3)
plot(log1p(sub$dist), log1p(sub$variable), main="SEM, power, fitted values")
points(log1p(sub$dist), mod.power.sp$fitted.values, col="red")
abline(h=1, lty=3)
plot(sub$dist, log1p(sub$variable), main="OLS, exponential, fitted values")
points(sub$dist, mod.exp.asp$fitted.values, col="red")
abline(h=1, lty=3)
plot(sub$dist, log1p(sub$variable), main="SEM, exponential, fitted values")
points(sub$dist, mod.exp.sp$fitted.values, col="red")
abline(h=1, lty=3)Figure 7.9: Visualisation of the quality of matching spatial
interaction models to empirical data
# spatial polynomial models
sub<-data[data$year==2022,]
sub$variable<-sub$unemployment_rate/mean(sub$unemployment_rate, na.rm=TRUE)
mod.multi.sp.2022<-errorsarlm(variable~poly(dist,4), data=sub, cont.listw, tol.solve=2e-40)
sqrt(sum((mod.multi.sp.2022$fitted.values-sub$variable)^2)/dim(sub)[1])/ mean(sub$variable)
sub<-data[data$year==2013,]
sub$variable<-sub$unemployment_rate/mean(sub$unemployment_rate, na.rm=TRUE)
mod.multi.sp.2013<-errorsarlm(variable~poly(dist,4), data=sub, cont.listw, tol.solve=2e-40)
sqrt(sum((mod.multi.sp.2013$fitted.values-sub$variable)^2)/dim(sub)[1])/ mean(sub$variable)
summary(mod.multi.sp.2013)
# matching visualisation
plot(sub$dist, mod.multi.sp.2022$fitted.values, col="red", pch=".", cex=1.5, xlab="Distance from the core city", ylab="Fitted value of the dependent variable")
abline(v=seq(0, max(sub$dist), by=5), col="grey90", lty=1)
lines(smooth.spline(sub$dist, mod.multi.sp.2022$fitted.values, spar=0.99), col="red")
points(sub$dist, mod.multi.sp.2013$fitted.values, col="black", pch=".", cex=1.3)
lines(smooth.spline(sub$dist, mod.multi.sp.2013$fitted.values, spar=0.99), col="black")
abline(h=1, lty=3)
legend("bottomright", legend=c("2013", "2022"), col=c("black", "red"), lty=c(1,1), bty="n")Figure 7.10: Smoothed fitting of polynomial models in 2013
and 2022
# prepare dataset
data$y<-data$fixed_assets_pc_PLN
data$x1<-data$invest_pc_PLN
data$x2<-data$salary_avg_Poland_100p
data$x3<-data$persons_productive_age_K/data$persons_postproductive_age_K
data$x4<-data$all_firms_per_10K_inhab
data$x5<-data$dist
# x6 has to be calculated for each year separately - done in next steps
data$x7<-data$unemployment_rate
# subsets for each year
sub15<-data[data$year==2015, ]
sub16<-data[data$year==2016, ]
sub17<-data[data$year==2017, ]
sub18<-data[data$year==2018, ]
sub19<-data[data$year==2019, ]
sub20<-data[data$year==2020, ]
sub21<-data[data$year==2021, ]
sub22<-data[data$year==2022, ]
# variables Poland=100%, referring to the average from a given year
sub15$x6<-sub15$persons_per_1km2/mean(sub15$persons_per_1km2, na.rm=TRUE)
sub16$x6<-sub16$persons_per_1km2/mean(sub16$persons_per_1km2, na.rm=TRUE)
sub17$x6<-sub17$persons_per_1km2/mean(sub17$persons_per_1km2, na.rm=TRUE)
sub18$x6<-sub18$persons_per_1km2/mean(sub18$persons_per_1km2, na.rm=TRUE)
sub19$x6<-sub19$persons_per_1km2/mean(sub19$persons_per_1km2, na.rm=TRUE)
sub20$x6<-sub20$persons_per_1km2/mean(sub20$persons_per_1km2, na.rm=TRUE)
sub21$x6<-sub21$persons_per_1km2/mean(sub21$persons_per_1km2, na.rm=TRUE)
sub22$x6<-sub22$persons_per_1km2/mean(sub22$persons_per_1km2, na.rm=TRUE)
# cumulative dependent variable (y)
sub15$y.cum<-sub15$y
sub16$y.cum<-sub15$y+sub16$y
sub17$y.cum<-sub15$y+sub16$y+sub17$y
sub18$y.cum<-sub15$y+sub16$y+sub17$y+sub18$y
sub19$y.cum<-sub15$y+sub16$y+sub17$y+sub18$y+sub19$y
sub20$y.cum<-sub15$y+sub16$y+sub17$y+sub18$y+sub19$y+sub20$y
sub21$y.cum<-sub15$y+sub16$y+sub17$y+sub18$y+sub19$y+sub20$y+sub21$y
sub22$y.cum<-sub15$y+sub16$y+sub17$y+sub18$y+sub19$y+sub20$y+sub21$y+ sub22$y
# cumulative explanatory variable (x1)
sub15$x1.cum<-sub15$x1
sub16$x1.cum<-sub15$x1+sub16$x1
sub17$x1.cum<-sub15$x1+sub16$x1+sub17$x1
sub18$x1.cum<-sub15$x1+sub16$x1+sub17$x1+sub18$x1
sub19$x1.cum<-sub15$x1+sub16$x1+sub17$x1+sub18$x1+sub19$x1
sub20$x1.cum<-sub15$x1+sub16$x1+sub17$x1+sub18$x1+sub19$x1+sub20$x1
sub21$x1.cum<-sub15$x1+sub16$x1+sub17$x1+sub18$x1+sub19$x1+sub20$x1+ sub21$x1
sub22$x1.cum<-sub15$x1+sub16$x1+sub17$x1+sub18$x1+sub19$x1+sub20$x1+ sub21$x1+sub22$x1
# model equation
eq<-y.cum~x1.cum+x2+x3+x4+x5+x6+x7 # form of a regression equation
library(spdep)
library(spatialreg)
# contiguity spatial weights matrix
cont.nb<-poly2nb(as(st_make_valid(pov), queen=TRUE)
cont.listw<-nb2listw(cont.nb, style="W")
# model estimation for subsequent years
m15<-errorsarlm(eq, data=sub15, cont.listw, etype="emixed", tol.solve=1e-22)
m16<-errorsarlm(eq, data=sub16, cont.listw, etype="emixed", tol.solve=1e-22)
m17<-errorsarlm(eq, data=sub17, cont.listw, etype="emixed", tol.solve=1e-22)
m18<-errorsarlm(eq, data=sub18, cont.listw, etype="emixed", tol.solve=1e-22)
m19<-errorsarlm(eq, data=sub19, cont.listw, etype="emixed", tol.solve=1e-22)
m20<-errorsarlm(eq, data=sub20, cont.listw, etype="emixed", tol.solve=1e-22)
m21<-errorsarlm(eq, data=sub21, cont.listw, etype="emixed", tol.solve=1e-22)
m22<-errorsarlm(eq, data=sub22, cont.listw, etype="emixed", tol.solve=1e-22)
# combine model results into one printout
options(scipen=999, digits=1)
result<-cbind(m15$coefficients, m16$coefficients, m17$coefficients, m18$coefficients, m19$coefficients, m20$coefficients, m21$coefficients, m22$coefficients)
colnames(result)<-paste(rep("mod", times=8),2015:2022)
lambda<-cbind(m15$lambda, m16$lambda, m17$lambda, m18$lambda, m19$lambda, m20$lambda, m21$lambda, m22$lambda)
AIC<-cbind(AIC(m15),AIC(m16), AIC(m17), AIC(m18), AIC(m19), AIC(m20), AIC(m21), AIC(m22))
result<-rbind(result,lambda,AIC)
rownames(result)[17]<-"AIC"
result
summary(m22)
# ratio of direct and indirect effects
abs(result[2:8,])/abs(result[9:15,])# subset of points.sf with necessary variables - points.sf.sub
points.sf.sub<-points.sf[, c("roa", "empl_av_size", "dummy_prod", "dummy_constr", "dummy_serv", "dummy_agri", "dist.lublin")]
dim(points.sf.sub) # dimensions of the dataset – 37373 rows & 8 columns
head(points.sf.sub) # displaying first 6 rows of the dataset
summary(points.sf.sub) # summary of the variables
# split data into the training and test data
points.in.sf<-points.sf.sub[1:30000,] # training data
points.out.sf<-points.sf.sub[30001:37374,] # test data
install.packages("devtools") # install SpatialWarsaw:: from GitHub
devtools::install_github("poktam/spatialWarsaw") # access to GitHub
library(spatialWarsaw)
# equation for modelling
eq<-roa ~ empl_av_size + dummy_prod + dummy_constr + dummy_serv + dist.lublin
# bootstrapped spatial regression estimation
bsr<-BootSpatReg(points_sf=points.in.sf, iter=5, sample_size=1500, eq=eq, knn=20, model_type="SEM")
bsr # display the full result
class(bsr$model.best)# class of the model
asr<-ApproxSERoot2(bsr$model.best)
print(asr, digits=2)Figure 7.11: Extrapolated standard errors of coefficients
based on the √2 rule
# prepare coordinates of best model observations (sf) – for knn matrix
crds.sf<-st_centroid(st_geometry(points.in.sf[rownames(bsr$data.best),]))
crds.sf
# symmetric k nearest neighbours spatial weight matrix for predictive model
knn.sym.listw<-nb2listw(make.sym.nb(knn2nb(knearneigh(crds.sf, k=20))))
knn.sym.listw
# prediction
my.pred<-SpatPredTess(model_spatial=bsr$model.best, points_spatial_sf= points.in.sf[rownames(bsr$data.best),], knnW=knn.sym.listw, points_new_sf=points.out.sf[1:10,], region_sf=region.voi)
my.pred # figure appears automaticallyFigure 7.12: Original (in-sample) and predicted
(out-of-sample) locations, with Voronoi tiles acting as catchment
areas
# point data for business locations - subset of points.sf dataframe
points.sf.sub <- points.sf[, c("roa", "empl_av_size", "dummy_prod", "dummy_constr", "dummy_serv", "dummy_agri", "dist.lublin")]
head(points.sf.sub)
# load grid data for population (not included in datasets, because of size)
pop.grid<-st_read("PD_STAT_GRID_CELL_2011.shp") # 315 857 units (grids)
pop.grid<-st_transform(pop.grid, 4326) # reproject to WGS84
head(pop.grid)
# cut the grid according to the contour of the lubelskie region
# extract contour of the lubelskie region
voi.lub<-voi[voi$JPT_NAZWA_=="lubelskie",]
# select grids only for lubelskie region
temp<-st_join(pop.grid, voi.lub, join=st_intersects)
a<-which(temp$JPT_NAZWA_=="lubelskie")
pop.grid.lub<-pop.grid[a,]
# read limited grid, included in datasets (section I.4.2)
grid<-st_read("POP_lub.shp", stringsAsFactors=FALSE)
grid<-st_transform(grid, 4326)
pop.grid.lub<-grid # copy-paste the object with a new name
# Fig.7.13a - administrative contour and grid
ggplot() + geom_sf(data=pop.grid.lub) + geom_sf(data=voi.lub, fill=NA, color="red", linewidth=0.5) + theme_minimal()
# Figure (not shown) - values of total population for the whole region
ggplot(pop.grid.lub) + geom_sf(aes(fill=TOT), linewidth=0) + scale_fill_viridis_c(option="C") + theme_minimal() + labs(fill="Population (TOT)")
# Fig.7.13b - values of the examined variable - zoomed district
pov.lub<-pov %>% filter(JPT_NAZWA_=="powiat Lublin")
bb<-st_bbox(pov.lub)
ggplot() + geom_sf(data=pov.lub, fill=NA, color="black", linewidth=0.8) +
geom_sf(data=pop.grid.lub, aes(fill=TOT), linewidth=0) +
geom_sf(data=pov.lub, fill=NA, color="black", linewidth=0.8) +
coord_sf(xlim=c(bb["xmin"], bb["xmax"]), ylim=c(bb["ymin"], bb["ymax"]), expand=FALSE) + scale_fill_viridis_c(option="C") + theme_minimal() + labs(fill="Population (TOT)")Figure 7.13: Illustration of grid: a) grid limited to the
contour of the voivodeship, b) zoom of the grid for a selected limited
area
# code to assign and aggregate point data by grid cells
library(dplyr)
# make sure both layers use the same CRS
st_crs(pop.grid.lub) == st_crs(points.sf.sub)
# add an explicit grid id (1..n) to the grid
pop.grid.lub<-pop.grid.lub %>% mutate(grid_id=row_number())
# assign each point to a grid and keep only the grid_id appended to points
pts.with.grid<-st_join(points.sf.sub, pop.grid.lub %>% select(grid_id), join =st_within)
pts.with.grid # only 3 first rows displayed
# aggregate point attributes by grid_id
# example: mean roa per cell + number of points per cell
grid.agg<-pts.with.grid %>% st_drop_geometry() %>% group_by(grid_id) %>%
summarise(roa_mean=mean(roa), n_points=n(), .groups="drop")
# join aggregated results to grid and replace NA with 0 in empty cells
pop.grid.lub<-pop.grid.lub %>% left_join(grid.agg, by="grid_id") %>%
mutate(roa_mean=ifelse(is.na(roa_mean), 0, roa_mean), n_points= ifelse(is.na(n_points), 0, n_points))
pop.grid.lub # only 3 first rows displayed
# summary for the new variables in grid – roa_mean and n_points
summary(pop.grid.lub %>% select(roa_mean, n_points))
# optional: choropleth map of aggregated roa_mean (not shown)
ggplot(pop.grid.lub) + geom_sf(aes(fill=roa_mean), linewidth=0) + theme_minimal() + scale_fill_viridis_c(option="C") + labs(fill="Mean ROA")
# spatial weights matrix (contiguity for grid polygons)
cont.nb<-poly2nb(pop.grid.lub, queen=TRUE)
cont.listw<-nb2listw(cont.nb, style="W", zero.policy=TRUE)
# preparation of variables for the model (created directly in pop.grid.lub)
pop.grid.lub<-pop.grid.lub %>% mutate(pop_prod=TOT_15_64/TOT,
pop_prod=ifelse(is.na(pop_prod) | is.infinite(pop_prod), 0, pop_prod))
# estimation of the spatial error model (SEM) on grid
model<-errorsarlm(roa_mean~FEM_RATIO+pop_prod + n_points, data=pop.grid.lub, listw=cont.listw, zero.policy=TRUE, method="LU")
summary(model, Nagelkerke=TRUE)library(plm) # for panel data estimators
library(splm) # for spatial panel models
library(spdep) # for spatial weights matrix
# prepare of the data – attach variable codes
data$y<-data$salary_avg_Poland_100p
data$x1<-data$unemployment_rate
data$x2<-data$birth_per_1K
data$x3<-data$death_per_1K
data$x4<-data$invest_pc_PLN
data$x5<-data$fixed_assets_pc_PLN
data$x6<-data$new_firms_per_10K_inhab
# limit the time period and number of variables
data.lim<-data[data$year>2012 & data$year<2023, ]
data.lim<-data.lim[,c("ID_MAP", "year", "y", "x1", "x2", "x3", "x4", "x5", "x6")]
# save to the panel data format
data.panel<-pdata.frame(data.lim, index=(c("ID_MAP", "year")))
head(data.panel)
pdim(data.panel)
# model equation
eq1<-y~x1+x2+x3+x4+x5+x6
# create the spatial weights matrix (contiguity)
cont.nb<-poly2nb(st_make_valid(pov), queen=TRUE)
cont.listw<-nb2listw(cont.nb, style="W")
# model with fixed effects (FE), SAC/SARAR specification:
# - lambda: here for spatial lag of y
# - rho: here for spatial dependence in the error term (Baltagi-type)
# - one-way (individual) fixed effects
model1<-spml(eq1, data=data.panel, listw=cont.listw, model="within", spatial.error="b", lag=TRUE, effect="individual", rel.tol=2e-40)
options(scipen=999, digits=2)
summary(model1)
# extract individual fixed effects
eff.m1<-effects(model1)
eff.m1 # only first 6 rows displayed
attributes(eff.m1) # attributes of specific effects object
# Fig.7.14a - density plot of fixed effects
fe.m1<-data.frame(fe=eff.m1$SETable[, 1])
ggplot(fe.m1, aes(x=fe)) + geom_density(fill="steelblue", alpha=0.4) +
labs(title = "Density of individual fixed effects", x="Fixed effect", y="Density") + theme_minimal()
# Fig.7.14b - map of individual fixed effects
pov$fe.m1<-eff.m1$SETable[, 1]
ggplot(pov) + geom_sf(aes(fill=fe.m1), color=NA) + theme_minimal() + scale_fill_viridis_c(option="plasma") + labs(title="Individual fixed effects", fill="FE")Figure 7.14: Inspection of fixed effects from spatial panel
model: a) statistical density distribution, b) map of values
# standardisation of variables according to temporal parameters (µ, σ)
library(doBy)
data.panel$y.sc<-transformBy(~year, data=data.panel, y=scale(y))$y
data.panel$x1.sc<-transformBy(~year, data=data.panel, x1=scale(x1))$x1
data.panel$x2.sc<-transformBy(~year, data=data.panel, x2=scale(x2))$x2
data.panel$x3.sc<-transformBy(~year, data=data.panel, x3=scale(x3))$x3
data.panel$x4.sc<-transformBy(~year, data=data.panel, x4=scale(x4))$x4
data.panel$x5.sc<-transformBy(~year, data=data.panel, x5=scale(x5))$x5
data.panel$x6.sc<-transformBy(~year, data=data.panel, x6=scale(x6))$x6
# equation for scaled variables
eq1.sc<-y.sc~x1.sc+x2.sc+x3.sc+x4.sc+x5.sc+x6.sc
model.spml<-spml(eq1.sc, data=data.panel, listw=cont.listw, model="within", spatial.error="b", lag=TRUE, effect="individual", rel.tol=2e-40)
summary(model.spml)
# BSJK conditional test (C.1) – for H1: spatial dependence in the error term,
# conditional on random effects and serial correlation
bsjktest(eq1, data=data.panel, listw=cont.listw, test="C.1")
# BSJK joint test (J) - H1: at least one of the following is present:
# random effects OR serial correlation OR spatial error dependence
bsjktest(eq1, data=data.panel, listw=cont.listw, test="J")
# LM-H test: joint test for random effects and spatial autocorrelation
bsktest(eq1,data=data.panel, listw=cont.listw, test="LMH", standardize=TRUE)
# LM1 test: marginal test for random effects
bsktest(eq1,data=data.panel, listw=cont.listw, test="LM1", standardize=TRUE)
# LM2 test: marginal test for spatial autocorrelation in the error term
bsktest(eq1,data=data.panel, listw=cont.listw, test="LM2", standardize=TRUE)
# impacts with simulation of significance
attr(model1, "have_factor_preds")<-FALSE
imp<-splm::impacts(model1, listw=cont.listw, time=10, R=1000)
summary(imp, zstats=TRUE, short=TRUE)