#install.packages('SpatialEpi',depend=TRUE)

These libraries (tmap, SpatialEpi, sf, spdep) provide the tools needed for spatial analysis and mapping. The Pennsylvania lung cancer dataset (pennLC) contains geographic information and smoking data — the core variables for the study. This stage sets the foundation for everything that follows.

# Make sure the necessary packages have been loaded
#install.packages("tmap")
#install.packages("tmaptools")
library(tmap)
library(tmaptools)
library(SpatialEpi)
## Loading required package: sp
# Read in the Pennsylvania lung cancer data
data(pennLC)
# Extract the SpatialPolygon info
penn.state.latlong <- pennLC$spatial.polygon

library(sf)
## Linking to GEOS 3.13.0, GDAL 3.10.1, PROJ 9.5.1; sf_use_s2() is TRUE
# Convert to an sf object
penn.state.sf <- st_as_sf(penn.state.latlong)
# Reproject to UTM zone 17N (EPSG code can be specified; here, using 3724 as given)
penn.state.sf <- st_transform(penn.state.sf, crs = 3724)
# Convert back to a Spatial object if needed
penn.state.utm <- as(penn.state.sf, "Spatial")

# Obtain the smoking rates
penn.state.utm$smk <- pennLC$smoking$smoking * 100
# Convert to sf object (this will add the geometry column)
penn.state.sf <- st_as_sf(penn.state.utm)

# Draw the choropleth map using tmap v4 syntax
tm_shape(penn.state.sf) +
  tm_polygons(fill = "smk", fill.legend = tm_legend(title = "% of Popn.")) 

Visualizing smoking rates on a map helps identify patterns across counties — like clusters of high or low smoking rates. This map is our first look at how smoking behavior varies spatially, which is key for understanding any potential spatial dependence in health outcomes like lung cancer.

Creating Randomized Data:
By scrambling the smoking rate data and plotting both the real and fake data, this stage helps us see if the observed spatial pattern in smoking rates is real or could happen by chance. It also helps us visually distinguish true spatial clustering from random noise.

# Set up a set of five 'fake' smoking update rates as well as the real one
# Create new columns in penn.state.utm for randomised data
# Here the seed 4676 is used.  Use a different one to get an unknown outcome.


# Create new columns for random smoking rates
set.seed(123)
penn.state.sf$smk_rand1 <- sample(penn.state.sf$smk)
penn.state.sf$smk_rand2 <- sample(penn.state.sf$smk)
penn.state.sf$smk_rand3 <- sample(penn.state.sf$smk)
penn.state.sf$smk_rand4 <- sample(penn.state.sf$smk)
penn.state.sf$smk_rand5 <- sample(penn.state.sf$smk)

# Scramble the variables
vars <- sample(c('smk', 'smk_rand1', 'smk_rand2', 'smk_rand3', 'smk_rand4', 'smk_rand5'))

# Identify the real data
real.data.i <- which(vars == 'smk')

# Plot the maps in a 3x2 grid
tm_shape(penn.state.sf) +
  tm_polygons(fill = vars, fill.legend.show = FALSE) +
  tm_layout(title = 1:6, title.position = c("right", "top"))
## [tm_polygons()] Argument `fill.legend.show` unknown.
## [v3->v4] `tm_layout()`: use `tm_title()` instead of `tm_layout(title = )`
## [plot mode] fit legend/component: Some legend items or map compoments do not
## fit well, and are therefore rescaled.
## ℹ Set the tmap option `component.autoscale = FALSE` to disable rescaling.

real.data.i
## [1] 6

NEIGBOURS AND LAGGED MEAN PLOTS

require(spdep)
## Loading required package: spdep
## Loading required package: spData
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`
penn.state.nb <- poly2nb(penn.state.utm)
penn.state.nb
## Neighbour list object:
## Number of regions: 67 
## Number of nonzero links: 346 
## Percentage nonzero weights: 7.70773 
## Average number of links: 5.164179
# Create a SpatialLinesDataFrame showing the Queen's case contiguities
# Convert SpatialPolygonsDataFrame to sf
penn.state.sf <- st_as_sf(penn.state.utm)

# Create a SpatialLinesDataFrame showing Queen's case contiguities
penn.state.net <- nb2lines(penn.state.nb, coords = coordinates(penn.state.utm))
## Warning in CRS(proj4string): CRS: projargs should not be NULL; set to NA
# Convert SpatialLinesDataFrame to sf
penn.state.net.sf <- st_as_sf(penn.state.net)

# Assign projection (assuming UTM Zone 17N)
st_crs(penn.state.sf) <- 3724
st_crs(penn.state.net.sf) <- 3724

# Plot the borders and the network
tm_shape(penn.state.sf) + tm_borders(col = 'lightgrey') +
  tm_shape(penn.state.net.sf) + tm_lines(col = 'red')

Rook’s Case:

# Calculate Rook’s case neighbors
penn.state.nb2 <- poly2nb(penn.state.utm, queen = FALSE)

# Convert neighbors to SpatialLinesDataFrame
penn.state.net2 <- nb2lines(penn.state.nb2, coords = coordinates(penn.state.utm))
## Warning in CRS(proj4string): CRS: projargs should not be NULL; set to NA
# Convert both SpatialLinesDataFrame and SpatialPolygonsDataFrame to sf
penn.state.sf <- st_as_sf(penn.state.utm)
penn.state.net.sf <- st_as_sf(penn.state.net)
penn.state.net2.sf <- st_as_sf(penn.state.net2)

# Set CRS for all
st_crs(penn.state.sf) <- 3724
st_crs(penn.state.net.sf) <- 3724
st_crs(penn.state.net2.sf) <- 3724

# Plot the counties, Queen’s and Rook’s case neighbors
tm_shape(penn.state.sf) + tm_borders(col = 'lightgrey') +
  tm_shape(penn.state.net.sf) + tm_lines(col = 'blue', lwd = 2) +
  tm_shape(penn.state.net2.sf) + tm_lines(col = 'yellow')

# Convert the neighbour list to a listw object - use Rook's case...
penn.state.lw <- nb2listw(penn.state.nb2)
penn.state.lw
## Characteristics of weights list object:
## Neighbour list object:
## Number of regions: 67 
## Number of nonzero links: 330 
## Percentage nonzero weights: 7.351303 
## Average number of links: 4.925373 
## 
## Weights style: W 
## Weights constants summary:
##    n   nn S0       S1       S2
## W 67 4489 67 28.73789 274.6157
# Calculate lagged means
penn.state.utm$smk.lagged.means <- lag.listw(penn.state.lw, penn.state.utm$smk)

# Convert SpatialPolygonsDataFrame to sf
penn.state.sf <- st_as_sf(penn.state.utm)

# Assign projection (UTM Zone 17N)
st_crs(penn.state.sf) <- 3724

# Plot lagged smoking rates
tm_shape(penn.state.sf) +
  tm_polygons(fill = 'smk.lagged.means', fill.legend = tm_legend(title = '% of Popn.')) +
  tm_layout(legend.bg.color = "white")

with(data.frame(penn.state.utm), {
  plot(smk,smk.lagged.means,asp=1,xlim=range(smk),ylim=range(smk))
  abline(a=0,b=1)
  abline(v=mean(smk),lty=2)
  abline(h=mean(smk.lagged.means),lty=2)
})

moran.plot(penn.state.utm$smk,penn.state.lw)

MORAN’S I : AN INDEX OF AUTOCORRELATION

moran.test(penn.state.utm$smk,penn.state.lw)
## 
##  Moran I test under randomisation
## 
## data:  penn.state.utm$smk  
## weights: penn.state.lw    
## 
## Moran I statistic standard deviate = 5.4175, p-value = 3.022e-08
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##       0.404431265      -0.015151515       0.005998405
moran.range <- function(lw) {
  wmat <- listw2mat(lw)
  return(range(eigen((wmat + t(wmat))/2)$values))
}
moran.range(penn.state.lw)
## [1] -0.5785577  1.0202308
moran.test(penn.state.utm$smk,penn.state.lw,randomisation=FALSE)
## 
##  Moran I test under normality
## 
## data:  penn.state.utm$smk  
## weights: penn.state.lw    
## 
## Moran I statistic standard deviate = 5.4492, p-value = 2.53e-08
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##       0.404431265      -0.015151515       0.005928887

A SIMMULATION BASED APPROACH

moran.mc(penn.state.utm$smk,penn.state.lw,10000)
## 
##  Monte-Carlo simulation of Moran I
## 
## data:  penn.state.utm$smk 
## weights: penn.state.lw  
## number of simulations + 1: 10001 
## 
## statistic = 0.40443, observed rank = 10001, p-value = 9.999e-05
## alternative hypothesis: greater
library(spatialreg)
## Loading required package: Matrix
## 
## Attaching package: 'spatialreg'
## The following objects are masked from 'package:spdep':
## 
##     get.ClusterOption, get.coresOption, get.mcOption,
##     get.VerboseOption, get.ZeroPolicyOption, set.ClusterOption,
##     set.coresOption, set.mcOption, set.VerboseOption,
##     set.ZeroPolicyOption
sar.res <- spautolm(smk~1,listw=penn.state.lw,data=penn.state.utm)
sar.res
## 
## Call:
## spautolm(formula = smk ~ 1, data = penn.state.utm, listw = penn.state.lw)
## 
## Coefficients:
## (Intercept)      lambda 
##  23.7689073   0.6179367 
## 
## Log likelihood: -142.8993
sar.res$lambda.se
## [1] 0.1130446
sar.res$lambda + c(-2,2)*sar.res$lambda.se
## [1] 0.3918474 0.8440260
head(pennLC$data)
##   county cases population race gender      age
## 1  adams     0       1492    o      f Under.40
## 2  adams     0        365    o      f    40.59
## 3  adams     1         68    o      f    60.69
## 4  adams     0         73    o      f      70+
## 5  adams     0      23351    w      f Under.40
## 6  adams     5      12136    w      f    40.59
tail(pennLC$data)
##      county cases population race gender      age
## 1067   york     1        513    o      m    60.69
## 1068   york     1        319    o      m      70+
## 1069   york     0      93689    w      m Under.40
## 1070   york    30      51757    w      m    40.59
## 1071   york    40      13863    w      m    60.69
## 1072   york    84      14466    w      m      70+
library(plyr)
require(plyr)
totcases <- ddply(pennLC$data,c("county"),numcolwise(sum))
head(totcases)
##      county cases population
## 1     adams    55      91292
## 2 allegheny  1275    1281666
## 3 armstrong    49      72392
## 4    beaver   172     181412
## 5   bedford    37      49984
## 6     berks   308     373638
totcases <- transform(totcases,rate=10000*cases/population)
head(totcases)
##      county cases population     rate
## 1     adams    55      91292 6.024624
## 2 allegheny  1275    1281666 9.947990
## 3 armstrong    49      72392 6.768704
## 4    beaver   172     181412 9.481181
## 5   bedford    37      49984 7.402369
## 6     berks   308     373638 8.243273
#Check the distribution of rates
boxplot(totcases$rate,horizontal=TRUE,
        xlab='Cancer Rate (Cases per 10,000 Popn.)')

sar.mod <- spautolm(rate~sqrt(penn.state.utm$smk),listw=penn.state.lw,
                    weight=population,data=totcases)
summary(sar.mod)
## 
## Call: spautolm(formula = rate ~ sqrt(penn.state.utm$smk), data = totcases, 
##     listw = penn.state.lw, weights = population)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -5.45183 -1.10235 -0.31549  0.59901  5.00115 
## 
## Coefficients: 
##                          Estimate Std. Error z value  Pr(>|z|)
## (Intercept)              -0.35263    2.26795 -0.1555    0.8764
## sqrt(penn.state.utm$smk)  1.80976    0.46064  3.9288 8.537e-05
## 
## Lambda: 0.38063 LR test value: 6.3123 p-value: 0.01199 
## Numerical Hessian standard error of lambda: 0.14049 
## 
## Log likelihood: -123.3056 
## ML residual variance (sigma squared): 209030, (sigma: 457.19)
## Number of observations: 67 
## Number of parameters estimated: 4 
## AIC: NA (not available for weighted model)

Final Summary of Interpretation:

This confirms that smoking is a key driver of lung cancer rates, and geography matters when studying health patterns.