#install.packages('SpatialEpi',depend=TRUE)
These libraries (tmap
, SpatialEpi
,
sf
, spdep
) provide the tools needed for
spatial analysis and mapping. The Pennsylvania lung cancer dataset
(pennLC
) contains geographic information and smoking data —
the core variables for the study. This stage sets the foundation for
everything that follows.
# Make sure the necessary packages have been loaded
#install.packages("tmap")
#install.packages("tmaptools")
library(tmap)
library(tmaptools)
library(SpatialEpi)
## Loading required package: sp
# Read in the Pennsylvania lung cancer data
data(pennLC)
# Extract the SpatialPolygon info
penn.state.latlong <- pennLC$spatial.polygon
library(sf)
## Linking to GEOS 3.13.0, GDAL 3.10.1, PROJ 9.5.1; sf_use_s2() is TRUE
# Convert to an sf object
penn.state.sf <- st_as_sf(penn.state.latlong)
# Reproject to UTM zone 17N (EPSG code can be specified; here, using 3724 as given)
penn.state.sf <- st_transform(penn.state.sf, crs = 3724)
# Convert back to a Spatial object if needed
penn.state.utm <- as(penn.state.sf, "Spatial")
# Obtain the smoking rates
penn.state.utm$smk <- pennLC$smoking$smoking * 100
# Convert to sf object (this will add the geometry column)
penn.state.sf <- st_as_sf(penn.state.utm)
# Draw the choropleth map using tmap v4 syntax
tm_shape(penn.state.sf) +
tm_polygons(fill = "smk", fill.legend = tm_legend(title = "% of Popn."))
Visualizing smoking rates on a map helps identify patterns across counties — like clusters of high or low smoking rates. This map is our first look at how smoking behavior varies spatially, which is key for understanding any potential spatial dependence in health outcomes like lung cancer.
Creating Randomized Data:
By scrambling the smoking rate data and plotting both the real and fake
data, this stage helps us see if the observed spatial pattern in smoking
rates is real or could happen by chance. It also helps us visually
distinguish true spatial clustering from random noise.
# Set up a set of five 'fake' smoking update rates as well as the real one
# Create new columns in penn.state.utm for randomised data
# Here the seed 4676 is used. Use a different one to get an unknown outcome.
# Create new columns for random smoking rates
set.seed(123)
penn.state.sf$smk_rand1 <- sample(penn.state.sf$smk)
penn.state.sf$smk_rand2 <- sample(penn.state.sf$smk)
penn.state.sf$smk_rand3 <- sample(penn.state.sf$smk)
penn.state.sf$smk_rand4 <- sample(penn.state.sf$smk)
penn.state.sf$smk_rand5 <- sample(penn.state.sf$smk)
# Scramble the variables
vars <- sample(c('smk', 'smk_rand1', 'smk_rand2', 'smk_rand3', 'smk_rand4', 'smk_rand5'))
# Identify the real data
real.data.i <- which(vars == 'smk')
# Plot the maps in a 3x2 grid
tm_shape(penn.state.sf) +
tm_polygons(fill = vars, fill.legend.show = FALSE) +
tm_layout(title = 1:6, title.position = c("right", "top"))
## [tm_polygons()] Argument `fill.legend.show` unknown.
## [v3->v4] `tm_layout()`: use `tm_title()` instead of `tm_layout(title = )`
## [plot mode] fit legend/component: Some legend items or map compoments do not
## fit well, and are therefore rescaled.
## ℹ Set the tmap option `component.autoscale = FALSE` to disable rescaling.
real.data.i
## [1] 6
NEIGBOURS AND LAGGED MEAN PLOTS
Queen’s Case:
Think of a chessboard’s queen — she can move in any direction (side, corner, or diagonal).
In this method, two counties are neighbors if they share any boundary point, whether it’s a side or a corner.
This approach captures more neighbors because even touching at a corner counts.
require(spdep)
## Loading required package: spdep
## Loading required package: spData
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`
penn.state.nb <- poly2nb(penn.state.utm)
penn.state.nb
## Neighbour list object:
## Number of regions: 67
## Number of nonzero links: 346
## Percentage nonzero weights: 7.70773
## Average number of links: 5.164179
# Create a SpatialLinesDataFrame showing the Queen's case contiguities
# Convert SpatialPolygonsDataFrame to sf
penn.state.sf <- st_as_sf(penn.state.utm)
# Create a SpatialLinesDataFrame showing Queen's case contiguities
penn.state.net <- nb2lines(penn.state.nb, coords = coordinates(penn.state.utm))
## Warning in CRS(proj4string): CRS: projargs should not be NULL; set to NA
# Convert SpatialLinesDataFrame to sf
penn.state.net.sf <- st_as_sf(penn.state.net)
# Assign projection (assuming UTM Zone 17N)
st_crs(penn.state.sf) <- 3724
st_crs(penn.state.net.sf) <- 3724
# Plot the borders and the network
tm_shape(penn.state.sf) + tm_borders(col = 'lightgrey') +
tm_shape(penn.state.net.sf) + tm_lines(col = 'red')
Rook’s Case:
Like a chessboard’s rook, this method only allows movement in straight lines (rows or columns).
Two counties are neighbors only if they share a common side (border) — no diagonal neighbors.
# Calculate Rook’s case neighbors
penn.state.nb2 <- poly2nb(penn.state.utm, queen = FALSE)
# Convert neighbors to SpatialLinesDataFrame
penn.state.net2 <- nb2lines(penn.state.nb2, coords = coordinates(penn.state.utm))
## Warning in CRS(proj4string): CRS: projargs should not be NULL; set to NA
# Convert both SpatialLinesDataFrame and SpatialPolygonsDataFrame to sf
penn.state.sf <- st_as_sf(penn.state.utm)
penn.state.net.sf <- st_as_sf(penn.state.net)
penn.state.net2.sf <- st_as_sf(penn.state.net2)
# Set CRS for all
st_crs(penn.state.sf) <- 3724
st_crs(penn.state.net.sf) <- 3724
st_crs(penn.state.net2.sf) <- 3724
# Plot the counties, Queen’s and Rook’s case neighbors
tm_shape(penn.state.sf) + tm_borders(col = 'lightgrey') +
tm_shape(penn.state.net.sf) + tm_lines(col = 'blue', lwd = 2) +
tm_shape(penn.state.net2.sf) + tm_lines(col = 'yellow')
# Convert the neighbour list to a listw object - use Rook's case...
penn.state.lw <- nb2listw(penn.state.nb2)
penn.state.lw
## Characteristics of weights list object:
## Neighbour list object:
## Number of regions: 67
## Number of nonzero links: 330
## Percentage nonzero weights: 7.351303
## Average number of links: 4.925373
##
## Weights style: W
## Weights constants summary:
## n nn S0 S1 S2
## W 67 4489 67 28.73789 274.6157
# Calculate lagged means
penn.state.utm$smk.lagged.means <- lag.listw(penn.state.lw, penn.state.utm$smk)
# Convert SpatialPolygonsDataFrame to sf
penn.state.sf <- st_as_sf(penn.state.utm)
# Assign projection (UTM Zone 17N)
st_crs(penn.state.sf) <- 3724
# Plot lagged smoking rates
tm_shape(penn.state.sf) +
tm_polygons(fill = 'smk.lagged.means', fill.legend = tm_legend(title = '% of Popn.')) +
tm_layout(legend.bg.color = "white")
with(data.frame(penn.state.utm), {
plot(smk,smk.lagged.means,asp=1,xlim=range(smk),ylim=range(smk))
abline(a=0,b=1)
abline(v=mean(smk),lty=2)
abline(h=mean(smk.lagged.means),lty=2)
})
moran.plot(penn.state.utm$smk,penn.state.lw)
MORAN’S I : AN INDEX OF AUTOCORRELATION
moran.test(penn.state.utm$smk,penn.state.lw)
##
## Moran I test under randomisation
##
## data: penn.state.utm$smk
## weights: penn.state.lw
##
## Moran I statistic standard deviate = 5.4175, p-value = 3.022e-08
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 0.404431265 -0.015151515 0.005998405
Moran’s I statistic (0.404):
This value ranges between -1 and +1:
+1 means strong positive spatial autocorrelation — nearby counties tend to have similar smoking rates (either high or low).
0 means no spatial autocorrelation — smoking rates are randomly distributed across counties
-1 means negative spatial autocorrelation — neighboring counties have very different rates (like a checkerboard pattern).
The 0.404 we got is a clear positive value, showing that counties with similar smoking rates tend to cluster together.
p-value (3.02e-08):
This tiny p-value (< 0.001) means the result is highly significant — there’s almost no chance this spatial pattern is due to random chance.
So the clustering of smoking rates we see is real and meaningful.
moran.range <- function(lw) {
wmat <- listw2mat(lw)
return(range(eigen((wmat + t(wmat))/2)$values))
}
moran.range(penn.state.lw)
## [1] -0.5785577 1.0202308
moran.test(penn.state.utm$smk,penn.state.lw,randomisation=FALSE)
##
## Moran I test under normality
##
## data: penn.state.utm$smk
## weights: penn.state.lw
##
## Moran I statistic standard deviate = 5.4492, p-value = 2.53e-08
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 0.404431265 -0.015151515 0.005928887
A SIMMULATION BASED APPROACH
moran.mc(penn.state.utm$smk,penn.state.lw,10000)
##
## Monte-Carlo simulation of Moran I
##
## data: penn.state.utm$smk
## weights: penn.state.lw
## number of simulations + 1: 10001
##
## statistic = 0.40443, observed rank = 10001, p-value = 9.999e-05
## alternative hypothesis: greater
library(spatialreg)
## Loading required package: Matrix
##
## Attaching package: 'spatialreg'
## The following objects are masked from 'package:spdep':
##
## get.ClusterOption, get.coresOption, get.mcOption,
## get.VerboseOption, get.ZeroPolicyOption, set.ClusterOption,
## set.coresOption, set.mcOption, set.VerboseOption,
## set.ZeroPolicyOption
sar.res <- spautolm(smk~1,listw=penn.state.lw,data=penn.state.utm)
sar.res
##
## Call:
## spautolm(formula = smk ~ 1, data = penn.state.utm, listw = penn.state.lw)
##
## Coefficients:
## (Intercept) lambda
## 23.7689073 0.6179367
##
## Log likelihood: -142.8993
sar.res$lambda.se
## [1] 0.1130446
sar.res$lambda + c(-2,2)*sar.res$lambda.se
## [1] 0.3918474 0.8440260
head(pennLC$data)
## county cases population race gender age
## 1 adams 0 1492 o f Under.40
## 2 adams 0 365 o f 40.59
## 3 adams 1 68 o f 60.69
## 4 adams 0 73 o f 70+
## 5 adams 0 23351 w f Under.40
## 6 adams 5 12136 w f 40.59
tail(pennLC$data)
## county cases population race gender age
## 1067 york 1 513 o m 60.69
## 1068 york 1 319 o m 70+
## 1069 york 0 93689 w m Under.40
## 1070 york 30 51757 w m 40.59
## 1071 york 40 13863 w m 60.69
## 1072 york 84 14466 w m 70+
library(plyr)
require(plyr)
totcases <- ddply(pennLC$data,c("county"),numcolwise(sum))
head(totcases)
## county cases population
## 1 adams 55 91292
## 2 allegheny 1275 1281666
## 3 armstrong 49 72392
## 4 beaver 172 181412
## 5 bedford 37 49984
## 6 berks 308 373638
totcases <- transform(totcases,rate=10000*cases/population)
head(totcases)
## county cases population rate
## 1 adams 55 91292 6.024624
## 2 allegheny 1275 1281666 9.947990
## 3 armstrong 49 72392 6.768704
## 4 beaver 172 181412 9.481181
## 5 bedford 37 49984 7.402369
## 6 berks 308 373638 8.243273
#Check the distribution of rates
boxplot(totcases$rate,horizontal=TRUE,
xlab='Cancer Rate (Cases per 10,000 Popn.)')
sar.mod <- spautolm(rate~sqrt(penn.state.utm$smk),listw=penn.state.lw,
weight=population,data=totcases)
summary(sar.mod)
##
## Call: spautolm(formula = rate ~ sqrt(penn.state.utm$smk), data = totcases,
## listw = penn.state.lw, weights = population)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.45183 -1.10235 -0.31549 0.59901 5.00115
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.35263 2.26795 -0.1555 0.8764
## sqrt(penn.state.utm$smk) 1.80976 0.46064 3.9288 8.537e-05
##
## Lambda: 0.38063 LR test value: 6.3123 p-value: 0.01199
## Numerical Hessian standard error of lambda: 0.14049
##
## Log likelihood: -123.3056
## ML residual variance (sigma squared): 209030, (sigma: 457.19)
## Number of observations: 67
## Number of parameters estimated: 4
## AIC: NA (not available for weighted model)
Smoking rates are a strong predictor of lung cancer rates across Pennsylvania counties (p < 0.001).
There is significant spatial dependence (Lambda = 0.38, p = 0.012), meaning counties with high lung cancer rates are near other counties with high rates.
Ignoring spatial patterns would have biased our understanding of the smoking-cancer relationship.
Smoking intervention policies may need to target clusters of high-smoking and high-cancer counties.
This confirms that smoking is a key driver of lung cancer rates, and geography matters when studying health patterns.