Geo Spatial Statistics in R

An R script to spatial analysis

Author
Affiliation

Vusumuzi Mabasa😉

Witwatersrand University

library(R2BayesX)
library(BayesXsrc)
library(BayesX)
library(INLA)
library(spdep)
library(stringdist)
library(sf)
library(SpatialEpi)
library(raster)
library(tidyverse)
library(dplyr)

0.1 The Moran’s I statistic

  • Checking for spatial autocorrelation
setwd("C:/Users/VUSI/Downloads/CAR_models-master/ZAF_adm")
(p <- st_read("ZAF_adm1_1.shp"))
Reading layer `ZAF_adm1_1' from data source 
  `C:\Users\VUSI\Downloads\CAR_models-master\ZAF_adm\ZAF_adm1_1.shp' 
  using driver `ESRI Shapefile'
Simple feature collection with 9 features and 9 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: 16.45802 ymin: -34.83514 xmax: 32.89125 ymax: -22.12661
Geodetic CRS:  WGS 84
Simple feature collection with 9 features and 9 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: 16.45802 ymin: -34.83514 xmax: 32.89125 ymax: -22.12661
Geodetic CRS:  WGS 84
  ID_0 ISO       NAME_0 ID_1        NAME_1    TYPE_1 ENGTYPE_1 NL_NAME_1
1  211 ZAF South Africa    1  Eastern Cape Provinsie  Province      <NA>
2  211 ZAF South Africa    2    Free State Provinsie  Province      <NA>
3  211 ZAF South Africa    3       Gauteng Provinsie  Province      <NA>
4  211 ZAF South Africa    4 KwaZulu-Natal Provinsie  Province      <NA>
5  211 ZAF South Africa    5       Limpopo Provinsie  Province      <NA>
6  211 ZAF South Africa    6    Mpumalanga Provinsie  Province      <NA>
7  211 ZAF South Africa    7    North West Provinsie  Province      <NA>
8  211 ZAF South Africa    8 Northern Cape Provinsie  Province      <NA>
9  211 ZAF South Africa    9  Western Cape Provinsie  Province      <NA>
                                                  VARNAME_1
1                                                  Oos-Kaap
2                                Orange Free State|Vrystaat
3                               Pretoria/Witwatersrand/Vaal
4                                        Natal and Zululand
5 Noordelike Provinsie|Northern Transvaal|Northern Province
6                                         Eastern Transvaal
7                                       North-West|Noordwes
8                                                Noord-Kaap
9                                                  Wes-Kaap
                        geometry
1 MULTIPOLYGON (((25.64375 -3...
2 MULTIPOLYGON (((27.98611 -2...
3 MULTIPOLYGON (((28.75139 -2...
4 MULTIPOLYGON (((31.63736 -2...
5 MULTIPOLYGON (((29.66671 -2...
6 MULTIPOLYGON (((31.87835 -2...
7 MULTIPOLYGON (((26.40328 -2...
8 MULTIPOLYGON (((16.99514 -2...
9 MULTIPOLYGON (((19.66764 -3...
library(RColorBrewer)

# ---- 1) Read shapefile as sf ----
(p <- st_read("C:/Users/VUSI/Downloads/CAR_models-master/ZAF_adm/ZAF_adm1_1.shp"))
Reading layer `ZAF_adm1_1' from data source 
  `C:\Users\VUSI\Downloads\CAR_models-master\ZAF_adm\ZAF_adm1_1.shp' 
  using driver `ESRI Shapefile'
Simple feature collection with 9 features and 9 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: 16.45802 ymin: -34.83514 xmax: 32.89125 ymax: -22.12661
Geodetic CRS:  WGS 84
Simple feature collection with 9 features and 9 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: 16.45802 ymin: -34.83514 xmax: 32.89125 ymax: -22.12661
Geodetic CRS:  WGS 84
  ID_0 ISO       NAME_0 ID_1        NAME_1    TYPE_1 ENGTYPE_1 NL_NAME_1
1  211 ZAF South Africa    1  Eastern Cape Provinsie  Province      <NA>
2  211 ZAF South Africa    2    Free State Provinsie  Province      <NA>
3  211 ZAF South Africa    3       Gauteng Provinsie  Province      <NA>
4  211 ZAF South Africa    4 KwaZulu-Natal Provinsie  Province      <NA>
5  211 ZAF South Africa    5       Limpopo Provinsie  Province      <NA>
6  211 ZAF South Africa    6    Mpumalanga Provinsie  Province      <NA>
7  211 ZAF South Africa    7    North West Provinsie  Province      <NA>
8  211 ZAF South Africa    8 Northern Cape Provinsie  Province      <NA>
9  211 ZAF South Africa    9  Western Cape Provinsie  Province      <NA>
                                                  VARNAME_1
1                                                  Oos-Kaap
2                                Orange Free State|Vrystaat
3                               Pretoria/Witwatersrand/Vaal
4                                        Natal and Zululand
5 Noordelike Provinsie|Northern Transvaal|Northern Province
6                                         Eastern Transvaal
7                                       North-West|Noordwes
8                                                Noord-Kaap
9                                                  Wes-Kaap
                        geometry
1 MULTIPOLYGON (((25.64375 -3...
2 MULTIPOLYGON (((27.98611 -2...
3 MULTIPOLYGON (((28.75139 -2...
4 MULTIPOLYGON (((31.63736 -2...
5 MULTIPOLYGON (((29.66671 -2...
6 MULTIPOLYGON (((31.87835 -2...
7 MULTIPOLYGON (((26.40328 -2...
8 MULTIPOLYGON (((16.99514 -2...
9 MULTIPOLYGON (((19.66764 -3...
# ---- 2) Add your variable (ensure numeric, length matches rows) ----
p$percent <- c(21.9, 24.6, 15.5, 15.5, 13.4, 16.2, 21.2, 25.5, 28.9)
stopifnot(nrow(p) == length(p$percent))
stopifnot(is.numeric(p$percent))
# ---- 3) Make neighbors (queen contiguity) ----
# Use a unique ID present in your data; NAME_1 exists and is unique
w  <- poly2nb(p, row.names = p$NAME_1)
ww <- nb2listw(w, style = "B")  # binary weights
# 4) Map - corrected tmap syntax for v4+
library(tmap)
tmap_mode("plot")

tm_shape(p) +
  tm_polygons(
    col = "percent",  # Use 'col' instead of 'fill' in newer tmap versions
    palette = "Reds",
    style = "quantile",
    n = 5,
    title = "Hypertension prevalence",
    border.alpha = 0.4  # Use dot notation instead of underscore
  ) +
  tm_text("NAME_1") +
  tm_compass() +
  tm_title("South AFRICA, DHIS2016") +
  tm_layout(
    legend.text.size = 0.5,
    legend.title.size = 0.5,
    legend.position = c("right", "bottom"),
    frame = FALSE
  )

# ---- 5) Moran's I (global) ----
# Quick statistic
n_areas <- nrow(p)
moran_stat <- moran(p$percent, ww, n = n_areas, S0 = Szero(ww))
print(moran_stat)
$I
[1] 0.2909315

$K
[1] 1.679342
# Analytical test
print(moran.test(p$percent, ww, randomisation = FALSE))

    Moran I test under normality

data:  p$percent  
weights: ww    

Moran I statistic standard deviate = 2.4985, p-value = 0.006237
alternative hypothesis: greater
sample estimates:
Moran I statistic       Expectation          Variance 
        0.2909315        -0.1250000         0.0277141 
# Permutation test
set.seed(123)
print(moran.mc(p$percent, ww, nsim = 99))               # permutation test

    Monte-Carlo simulation of Moran I

data:  p$percent 
weights: ww  
number of simulations + 1: 100 

statistic = 0.29093, observed rank = 99, p-value = 0.01
alternative hypothesis: greater

1 BayesX CAR model for hypertension in South Africa

library(sf)

setwd("C:/Users/VUSI/Downloads/CAR_models-master/ZAF_adm")
SA_shp<-st_read("ZAF_adm1_1.shp")
Reading layer `ZAF_adm1_1' from data source 
  `C:\Users\VUSI\Downloads\CAR_models-master\ZAF_adm\ZAF_adm1_1.shp' 
  using driver `ESRI Shapefile'
Simple feature collection with 9 features and 9 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: 16.45802 ymin: -34.83514 xmax: 32.89125 ymax: -22.12661
Geodetic CRS:  WGS 84
plot(SA_shp)

## read shapefile into bnd object

setwd("C:/Users/VUSI/Downloads/CAR_models-master/ZAF_adm")

shpname2 <- file.path("C:/Users/VUSI/Downloads/CAR_models-master/ZAF_adm/ZAF_adm1_1")
SA_bnd_prov <- BayesX::shp2bnd(shpname = shpname2, regionnames = "ID_1", check.is.in = F)
Reading map ... finished
Note: map consists originally of 53 polygons
Note: map consists of 9 regions
BayesX::drawmap(map=SA_bnd_prov)

sagra<-BayesX::bnd2gra(SA_bnd_prov)
Start neighbor search ...
progress: 50%
progress: 100%
Neighbor search finished.

1.1 BayesX Model

-uses a normal prior for the \(\beta\)s and an Inverse Gamma, \(IG(a,b)\) for precision parameter \(\tau^{2}\). \(a=b=0.001\) is assumed

  • The variance parameter \(tau^{2}_{j}\) is equivalent to the inverse smoothing parameter in a frequentist approach and controls the trade off between flexibility and smoothness.

  • Markov random fields (Fahrmeir et al. 2004): Defines a Markov random field prior for a spatial covariate, where geographical information is provided by a map object in boundary or graph file format.

  • mrf prior is used for the spatial effects and an MCMC algorithm implemented for sampling on the parameters

library(haven)
(hypertensiondata <- haven::read_dta("hypertensiondat.dta"))
# A tibble: 10,294 × 13
   current_age age_cat        region    type_residence residence education_level
         <dbl> <dbl+lbl>      <dbl+lbl> <dbl+lbl>      <dbl+lbl> <dbl+lbl>      
 1          45  7 [7. 45-49]  7 [7. ga… 1 [1. urban]   1 [1. ur… 2 [2. secondar…
 2          60 10 [10. 60-64] 7 [7. ga… 1 [1. urban]   1 [1. ur… 2 [2. secondar…
 3          30  4 [4. 30-34]  7 [7. ga… 1 [1. urban]   1 [1. ur… 2 [2. secondar…
 4          43  6 [6. 40-44]  7 [7. ga… 1 [1. urban]   1 [1. ur… 2 [2. secondar…
 5          15  1 [1. 15-19]  5 [5. kw… 2 [2. rural]   2 [2. ru… 2 [2. secondar…
 6          20  2 [2. 20-24]  5 [5. kw… 2 [2. rural]   2 [2. ru… 2 [2. secondar…
 7          15  1 [1. 15-19]  5 [5. kw… 2 [2. rural]   2 [2. ru… 2 [2. secondar…
 8          55  9 [9. 55-59]  5 [5. kw… 2 [2. rural]   2 [2. ru… 1 [1. primary] 
 9          17  1 [1. 15-19]  5 [5. kw… 2 [2. rural]   2 [2. ru… 2 [2. secondar…
10          23  2 [2. 20-24]  4 [4. fr… 1 [1. urban]   1 [1. ur… 1 [1. primary] 
# ℹ 10,284 more rows
# ℹ 7 more variables: wealth_level <dbl+lbl>, hypertension <dbl+lbl>,
#   sex <dbl+lbl>, hypertension2 <dbl+lbl>, caseid <dbl>, province <dbl+lbl>,
#   diabetes <dbl>
#recoding data to align with shapefile regions codes and names
hypertensiondata$province<-factor(hypertensiondata$province,
                       levels=c(1,2,3,4,5,6,7,8,9),
                       labels=c("Eastern cape","Freestate", "Gauteng","Kwazulu natal","Limpopo", "Mpumalanga", "North west", "Northern cape", "Western cape"))

#defining factor variables
hypertensiondata <- within(hypertensiondata, {
  residence <- factor(residence, levels=c(1,2), labels=c("Urban", "Rural"))
  education_level<-factor(education_level, levels = c(0,1,2,3), labels = c("no education", "primary", "secondary","higher"))
  wealth_level<- factor(wealth_level, levels = c(1,2,3,4,5), labels = c("poorest","poor","middle","richer","richest"))
  sex<-factor(sex, levels = c(0,1), labels = c("male","female"))
  
})

1.2 Logistic regression model using stata and R

library(Statamarkdown)
cd "C:\Users\VUSI\Downloads\CAR_models-master"

use "hypertensiondat.dta", clear

logit hypertension2 i.sex i.residence current_age i.wealth_level
C:\Users\VUSI\Downloads\CAR_models-master



Iteration 0:  Log likelihood = -5075.6754  
Iteration 1:  Log likelihood = -4043.8348  
Iteration 2:  Log likelihood = -3937.7108  
Iteration 3:  Log likelihood = -3936.8902  
Iteration 4:  Log likelihood =   -3936.89  

Logistic regression                                    Number of obs =  10,294
                                                       LR chi2(7)    = 2277.57
                                                       Prob > chi2   =  0.0000
Log likelihood = -3936.89                              Pseudo R2     =  0.2244

------------------------------------------------------------------------------
hypertensi~2 | Coefficient  Std. err.      z    P>|z|     [95% conf. interval]
-------------+----------------------------------------------------------------
         sex |
  1. Female  |   .6251579   .0614456    10.17   0.000     .5047268    .7455889
             |
   residence |
   2. rural  |  -.3940172   .0585302    -6.73   0.000    -.5087343   -.2793002
 current_age |   .0662251   .0016896    39.20   0.000     .0629135    .0695367
             |
wealth_level |
  2. poorer  |   .3269566   .0993283     3.29   0.001     .1322768    .5216364
  3. middle  |   .3187024   .0983854     3.24   0.001     .1258705    .5115343
  4. richer  |   .3845553   .0984207     3.91   0.000     .1916543    .5774563
 5. richest  |   .3996982   .0969397     4.12   0.000     .2096998    .5896966
             |
       _cons |  -4.956168   .1198338   -41.36   0.000    -5.191038   -4.721298
------------------------------------------------------------------------------
names(hypertensiondata)
 [1] "current_age"     "age_cat"         "region"          "type_residence" 
 [5] "residence"       "education_level" "wealth_level"    "hypertension"   
 [9] "sex"             "hypertension2"   "caseid"          "province"       
[13] "diabetes"       
library(haven); library(dplyr); library(survival); library(survminer); library(broom)

cat_vars <- c(
  "age_cat","region","type_residence","residence","education_level",
  "wealth_level","hypertension","sex","hypertension2","province"
)

(hyp <- hypertensiondata %>%
  mutate(across(
    all_of(cat_vars),
    ~ if (inherits(.x, "haven_labelled")) haven::as_factor(.x) else as.factor(.x)
  )))
# A tibble: 10,294 × 13
   current_age age_cat   region         type_residence residence education_level
         <dbl> <fct>     <fct>          <fct>          <fct>     <fct>          
 1          45 7. 45-49  7. gauteng     1. urban       Urban     secondary      
 2          60 10. 60-64 7. gauteng     1. urban       Urban     secondary      
 3          30 4. 30-34  7. gauteng     1. urban       Urban     secondary      
 4          43 6. 40-44  7. gauteng     1. urban       Urban     secondary      
 5          15 1. 15-19  5. kwazulu-na… 2. rural       Rural     secondary      
 6          20 2. 20-24  5. kwazulu-na… 2. rural       Rural     secondary      
 7          15 1. 15-19  5. kwazulu-na… 2. rural       Rural     secondary      
 8          55 9. 55-59  5. kwazulu-na… 2. rural       Rural     primary        
 9          17 1. 15-19  5. kwazulu-na… 2. rural       Rural     secondary      
10          23 2. 20-24  4. free state  1. urban       Urban     primary        
# ℹ 10,284 more rows
# ℹ 7 more variables: wealth_level <fct>, hypertension <fct>, sex <fct>,
#   hypertension2 <fct>, caseid <dbl>, province <fct>, diabetes <dbl>
model <- glm(hypertension2 ~ sex + residence + current_age + wealth_level,
             data=hypertensiondata, family=binomial())
summary(model)

Call:
glm(formula = hypertension2 ~ sex + residence + current_age + 
    wealth_level, family = binomial(), data = hypertensiondata)

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)         -4.95617    0.11983 -41.359  < 2e-16 ***
sexfemale            0.62516    0.06144  10.174  < 2e-16 ***
residenceRural      -0.39402    0.05853  -6.732 1.67e-11 ***
current_age          0.06623    0.00169  39.195  < 2e-16 ***
wealth_levelpoor     0.32696    0.09933   3.292 0.000996 ***
wealth_levelmiddle   0.31870    0.09839   3.239 0.001198 ** 
wealth_levelricher   0.38455    0.09842   3.907 9.33e-05 ***
wealth_levelrichest  0.39970    0.09694   4.123 3.74e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 10151.4  on 10293  degrees of freedom
Residual deviance:  7873.8  on 10286  degrees of freedom
AIC: 7889.8

Number of Fisher Scoring iterations: 5
exp(coef(model))   # odds ratios
        (Intercept)           sexfemale      residenceRural         current_age 
        0.007039851         1.868541187         0.674342357         1.068467207 
   wealth_levelpoor  wealth_levelmiddle  wealth_levelricher wealth_levelrichest 
        1.386741395         1.375342052         1.468961041         1.491374635 
library(colorspace)

imodel <- bayesx(hypertension2 ~ sex + residence + wealth_level + sx(current_age) + sx(province, bs="mrf", map=sagra) 
                 + sx(province, bs="re"), family = "binomial", method = "MCMC", iterations = 12000, burnin = 2000, 
                 step = 10, weights = NULL,subset = NULL, offset = NULL, na.action = NULL, seed = 12345, 
                 predict = TRUE, data = hypertensiondata)

summary(imodel)
Call:
bayesx(formula = hypertension2 ~ sex + residence + wealth_level + 
    sx(current_age) + sx(province, bs = "mrf", map = sagra) + 
    sx(province, bs = "re"), data = hypertensiondata, weights = NULL, 
    subset = NULL, offset = NULL, na.action = NULL, family = "binomial", 
    method = "MCMC", iterations = 12000, burnin = 2000, step = 10, 
    seed = 12345, predict = TRUE)
 
Fixed effects estimation results:

Parametric coefficients:
                       Mean      Sd    2.5%     50%   97.5%
(Intercept)         -1.7710  0.1226 -2.0097 -1.7741 -1.5310
sexfemale            0.6363  0.0605  0.5228  0.6347  0.7545
residenceRural      -0.1413  0.0695 -0.2744 -0.1431 -0.0066
wealth_levelpoor     0.4170  0.1050  0.2171  0.4170  0.6200
wealth_levelmiddle   0.3824  0.0995  0.1836  0.3829  0.5747
wealth_levelricher   0.4000  0.1026  0.1938  0.4042  0.5951
wealth_levelrichest  0.4161  0.1009  0.2231  0.4150  0.6087

Smooth terms variances:
                   Mean     Sd   2.5%    50%  97.5%    Min    Max
sx(current_age)  0.0153 0.0155 0.0027 0.0102 0.0621 0.0016 0.1353
sx(province):mrf 0.2329 0.3008 0.0015 0.1622 0.9720 0.0002 4.6444
 
Random effects variances:
                  Mean     Sd   2.5%    50%  97.5%    Min    Max
sx(province):re 0.0508 0.0727 0.0008 0.0240 0.2491 0.0003 0.8077
 
N = 10294  burnin = 2000  method = MCMC  family = binomial  
iterations = 12000  step = 10  
#Getting Odds Ratios
exp(coef(imodel))
                         Mean       Sd      2.5%       50%     97.5%
(Intercept)         0.1701610 1.130458 0.1340302 0.1696428 0.2163214
sexfemale           1.8895392 1.062382 1.6867760 1.8863675 2.1264544
residenceRural      0.8682583 1.071962 0.7600394 0.8666787 0.9933815
wealth_levelpoor    1.5174511 1.110705 1.2424249 1.5173661 1.8589355
wealth_levelmiddle  1.4657822 1.104662 1.2014895 1.4664742 1.7765531
wealth_levelricher  1.4918516 1.108018 1.2138778 1.4980571 1.8131760
wealth_levelrichest 1.5160678 1.106205 1.2498856 1.5143526 1.8380496
#wald.test(b = coef(imodel), Sigma = vcov(imodel), Terms=3:4)

## odds ratios and 95% CI
exp(cbind(OR = coef(imodel), confint(imodel)))
                      OR.Mean    OR.Sd   OR.2.5%    OR.50%  OR.97.5%      2.5%
(Intercept)         0.1701610 1.130458 0.1340302 0.1696428 0.2163214 0.1341410
sexfemale           1.8895392 1.062382 1.6867760 1.8863675 2.1264544 1.6868416
residenceRural      0.8682583 1.071962 0.7600394 0.8666787 0.9933815 0.7612302
wealth_levelpoor    1.5174511 1.110705 1.2424249 1.5173661 1.8589355 1.2425264
wealth_levelmiddle  1.4657822 1.104662 1.2014895 1.4664742 1.7765531 1.2021311
wealth_levelricher  1.4918516 1.108018 1.2138778 1.4980571 1.8131760 1.2147972
wealth_levelrichest 1.5160678 1.106205 1.2498856 1.5143526 1.8380496 1.2501492
                        97.5%
(Intercept)         0.2161817
sexfemale           2.1254891
residenceRural      0.9933567
wealth_levelpoor    1.8575973
wealth_levelmiddle  1.7761734
wealth_levelricher  1.8119689
wealth_levelrichest 1.8338462

1.3 Plotting some results

par(mar=c(1,1,1,1))

plot(imodel, term =  "sx(current_age)")

#Visualization of posterior mean of structured and unstructured spatial effects
## plot all spatial effects
plot(imodel, term = c("sx(province):mrf", "sx(province):re"))

par(mar=c(1,1,1,1))

#Plotting autocorrelation for age  variance samples
plot(imodel, term = "sx(current_age)", which = "var-samples", acf = TRUE)

par(mar=c(1,1,1,1))


#plot(imodel, which = "max-acf")

zs <- samples(imodel, term = "linear-samples")
par(mar=c(1,1,1,1))
plot(zs)

## extract model residuals and plot histogram
#par(mar=c(1,1,1,1))
hist(residuals(imodel))

1.4 Plot structured spatial effects maps etc

plot(imodel, term = "sx(province):mrf", map = SA_bnd_prov, main = "Structured spatial effect")

  • Plot unstructured spatial effects
plot(imodel, term = "sx(province):re", map = SA_bnd_prov, main = "Unstructured spatial effect")

  • Plot both structured and unstructured effects
plot(imodel, term = "sx(province):total", map = SA_bnd_prov, main = "Total spatial effect", digits = 4)

  • Map the fitted spatial term only
library(ggplot2)
fv <- fitted(imodel, term = "sx(province):mrf")[, "Mean"]
(SA_shp$spatial <- fv)
[1]  0.0989143  0.1146970 -0.1237380 -0.1069340 -0.3885810 -0.0552362  0.0642649
[8]  0.1957780  0.2008350
SA_shp
Simple feature collection with 9 features and 10 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: 16.45802 ymin: -34.83514 xmax: 32.89125 ymax: -22.12661
Geodetic CRS:  WGS 84
  ID_0 ISO       NAME_0 ID_1        NAME_1    TYPE_1 ENGTYPE_1 NL_NAME_1
1  211 ZAF South Africa    1  Eastern Cape Provinsie  Province      <NA>
2  211 ZAF South Africa    2    Free State Provinsie  Province      <NA>
3  211 ZAF South Africa    3       Gauteng Provinsie  Province      <NA>
4  211 ZAF South Africa    4 KwaZulu-Natal Provinsie  Province      <NA>
5  211 ZAF South Africa    5       Limpopo Provinsie  Province      <NA>
6  211 ZAF South Africa    6    Mpumalanga Provinsie  Province      <NA>
7  211 ZAF South Africa    7    North West Provinsie  Province      <NA>
8  211 ZAF South Africa    8 Northern Cape Provinsie  Province      <NA>
9  211 ZAF South Africa    9  Western Cape Provinsie  Province      <NA>
                                                  VARNAME_1
1                                                  Oos-Kaap
2                                Orange Free State|Vrystaat
3                               Pretoria/Witwatersrand/Vaal
4                                        Natal and Zululand
5 Noordelike Provinsie|Northern Transvaal|Northern Province
6                                         Eastern Transvaal
7                                       North-West|Noordwes
8                                                Noord-Kaap
9                                                  Wes-Kaap
                        geometry    spatial
1 MULTIPOLYGON (((25.64375 -3...  0.0989143
2 MULTIPOLYGON (((27.98611 -2...  0.1146970
3 MULTIPOLYGON (((28.75139 -2... -0.1237380
4 MULTIPOLYGON (((31.63736 -2... -0.1069340
5 MULTIPOLYGON (((29.66671 -2... -0.3885810
6 MULTIPOLYGON (((31.87835 -2... -0.0552362
7 MULTIPOLYGON (((26.40328 -2...  0.0642649
8 MULTIPOLYGON (((16.99514 -2...  0.1957780
9 MULTIPOLYGON (((19.66764 -3...  0.2008350
library(sf)
library(dplyr)
library(ggplot2)

# 0) Sanity checks
stopifnot(inherits(SA_shp, "sf"))             # must be an sf
stopifnot(!is.null(sf::st_geometry(SA_shp)))  # geometry must exist
stopifnot(length(SA_shp$spatial) == nrow(SA_shp))  # one value per region

# 1) Choropleth of the fitted spatial effect (log-odds)
ggplot(SA_shp) +
  geom_sf(aes(fill = spatial)) +
  scale_fill_distiller(palette = "RdBu", direction = -1,  # blue=low, red=high
                       limits = range(SA_shp$spatial, na.rm = TRUE),
                       oob = scales::squish) +
  labs(title = "Fitted spatial effect (logit scale)",
       fill = "log-odds") +
  theme_minimal()

1.5 Running same model with INLA

SA_OGR <- sf::st_read("C:/Users/VUSI/Downloads/CAR_models-master/ZAF_adm/ZAF_adm1_1.shp")
Reading layer `ZAF_adm1_1' from data source 
  `C:\Users\VUSI\Downloads\CAR_models-master\ZAF_adm\ZAF_adm1_1.shp' 
  using driver `ESRI Shapefile'
Simple feature collection with 9 features and 9 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: 16.45802 ymin: -34.83514 xmax: 32.89125 ymax: -22.12661
Geodetic CRS:  WGS 84
plot(SA_OGR, border = "blue", axes = TRUE, las = 1)

#SA_valid <- data.table::data.table(valid = gIsValid(SA_OGR, byid = TRUE)) 
SA_valid <- data.table::data.table(SA_OGR, byid = TRUE) 
sum(SA_valid$valid == F)
[1] 0
# install.packages(c("sf","spdep","ggplot2"))
library(sf)
library(spdep)
library(ggplot2)

# --- 1) Ensure SA_shp is sf ---
# If you already have SA_shp in memory but it's not sf, convert or re-read.
if (!inherits(SA_shp, "sf")) {
  if (inherits(SA_shp, "Spatial")) {
    SA_shp <- st_as_sf(SA_shp)                         # sp -> sf
  } else if ("geometry" %in% names(SA_shp)) {
    SA_shp <- st_as_sf(SA_shp)                         # list-column exists but class lost
  } else {
    SA_shp <- st_read("SA_shp")                    # read from disk (adjust path)
  }
}

# --- 2) Set CRS only AFTER it's sf ---
if (is.na(st_crs(SA_shp))) st_crs(SA_shp) <- 4326      # WGS84; change if you know better

# --- 3) Unique id for neighbors (if missing) ---
if (!"OBJECTID" %in% names(SA_shp)) SA_shp$OBJECTID <- seq_len(nrow(SA_shp))

# --- 4) Plot with ggplot2 ---
ggplot(SA_shp) + geom_sf(color = "blue", fill = NA) + theme_minimal()

# --- 5) Neighbors + WinBUGS vectors (optional) ---
nb <- spdep::poly2nb(SA_shp, row.names = SA_shp$OBJECTID)
wb <- spdep::nb2WB(nb)

# Write a simple 3-line text file (like sp2WB output)
writeLines(c(
  paste(wb$num,     collapse = " "),
  paste(wb$adj,     collapse = " "),
  paste(wb$weights, collapse = " ")
), "splus_911.txt")
SA_nb <- spdep::poly2nb(SA_shp)

# 2. Generate adjacency matrix for INLA
nb2INLA("sa_inla.graph", SA_nb) 
sa_adj <- file.path(getwd(), "sa_inla.graph")  # Use file.path for proper path handling

# 3. Read the graph
H <- inla.read.graph(filename = "sa_inla.graph") 
image(inla.graph2matrix(H), xlab = "", ylab = "")

# 4. Add ID_1 to the data - FIXED PATH (use forward slashes or double backslashes)
hyp_id <- read_csv("C:/Users/VUSI/Downloads/CAR_models-master/ZAF_adm/ZAF_adm.csv")  # Corrected path

# 5. PROPER JOINING with diagnostics
cat("Unique provinces in hypertensiondata:\n")
Unique provinces in hypertensiondata:
print(unique(hypertensiondata$province))
[1] Gauteng       Kwazulu natal Freestate     Mpumalanga    Eastern cape 
[6] North west    Western cape  Northern cape Limpopo      
9 Levels: Eastern cape Freestate Gauteng Kwazulu natal Limpopo ... Western cape
cat("\nUnique NAME_1 in hyp_id:\n")

Unique NAME_1 in hyp_id:
print(unique(hyp_id$NAME_1))
[1] "Eastern Cape"  "Free State"    "Gauteng"       "KwaZulu-Natal"
[5] "Limpopo"       "Mpumalanga"    "North West"    "Northern Cape"
[9] "Western Cape" 
# Clean and standardize names for joining
hypertensiondata <- hypertensiondata %>%
  mutate(
    NAME_1 = tolower(trimws(province)),  # Clean province names
    NAME_1 = gsub("[^a-zA-Z0-9]", " ", NAME_1)  # Remove special characters
  )

hyp_id <- hyp_id %>%
  mutate(
    NAME_1_clean = tolower(trimws(NAME_1)),
    NAME_1_clean = gsub("[^a-zA-Z0-9]", " ", NAME_1_clean)
  )
# Perform the join
hypertensiondata <- hypertensiondata %>% 
  left_join(hyp_id %>% select(NAME_0, ID_1, NAME_1_clean), 
            by = c("NAME_1" = "NAME_1_clean"))

# 6. Check if join worked and handle missing IDs
cat("\nMissing ID_1 values after join:", sum(is.na(hypertensiondata$ID_1)), "\n")

Missing ID_1 values after join: 1029 
# If there are missing IDs, check the first few mismatches
if(any(is.na(hypertensiondata$ID_1))) {
  mismatches <- hypertensiondata %>% 
    distinct(province, NAME_1) %>% 
    head(5)
  cat("Sample mismatches:\n")
  print(mismatches)
}
Sample mismatches:
# A tibble: 5 × 2
  province      NAME_1       
  <fct>         <chr>        
1 Gauteng       gauteng      
2 Kwazulu natal kwazulu natal
3 Freestate     freestate    
4 Mpumalanga    mpumalanga   
5 Eastern cape  eastern cape 
# 7. Ensure ID_1 is numeric and properly ordered
hypertensiondata_clean <- hypertensiondata %>%
  mutate(ID_1 = as.numeric(ID_1))

# 8. Verify spatial structure alignment
cat("Spatial regions:", length(SA_nb), "\n")
Spatial regions: 9 
cat("Unique ID_1 in data:", length(unique(hypertensiondata_clean$ID_1)), "\n")
Unique ID_1 in data: 9 
cat("ID_1 range in data:", range(hypertensiondata_clean$ID_1), "\n")
ID_1 range in data: NA NA 
# 9. Fitting CAR model - CORRECTED (use H instead of sa_adj)
formula_inla <- hypertension2 ~ 1 + 
  as.factor(sex) + 
  as.factor(residence) + 
  current_age + 
  as.factor(wealth_level) +
  f(ID_1, model = "bym", graph = H, scale.model = TRUE,  # Use H, not sa_adj
    hyper = list(
      prec.unstruct = list(prior = "loggamma", param = c(1, 0.001)), 
      prec.spatial = list(prior = "loggamma", param = c(1, 0.001))
    )
  )
# 10. Run INLA model
model_inla <- inla(
  formula_inla,
  family = "binomial", 
  data = hypertensiondata_clean, 
  control.compute = list(dic = TRUE),
  control.predictor = list(compute = TRUE),
  control.inla = list(strategy = "gaussian"),  # More stable
  verbose = TRUE
)

# 11. Check results
summary(model_inla)
Time used:
    Pre = 1.85, Running = 7.39, Post = 0.421, Total = 9.66 
Fixed effects:
                                 mean    sd 0.025quant 0.5quant 0.975quant
(Intercept)                    -4.952 0.141     -5.228   -4.953     -4.674
as.factor(sex)female            0.638 0.062      0.517    0.638      0.760
as.factor(residence)Rural      -0.201 0.069     -0.335   -0.201     -0.065
current_age                     0.067 0.002      0.063    0.067      0.070
as.factor(wealth_level)poor     0.404 0.102      0.205    0.404      0.603
as.factor(wealth_level)middle   0.379 0.101      0.181    0.379      0.577
as.factor(wealth_level)richer   0.398 0.101      0.200    0.398      0.595
as.factor(wealth_level)richest  0.442 0.100      0.246    0.442      0.638
                                 mode kld
(Intercept)                    -4.953   0
as.factor(sex)female            0.638   0
as.factor(residence)Rural      -0.201   0
current_age                     0.067   0
as.factor(wealth_level)poor     0.404   0
as.factor(wealth_level)middle   0.379   0
as.factor(wealth_level)richer   0.398   0
as.factor(wealth_level)richest  0.442   0

Random effects:
  Name    Model
    ID_1 BYM model

Model hyperparameters:
                                          mean      sd 0.025quant 0.5quant
Precision for ID_1 (iid component)       10.27    5.98       2.92     8.91
Precision for ID_1 (spatial component) 1044.23 1177.40      60.68   669.29
                                       0.975quant   mode
Precision for ID_1 (iid component)          25.58   6.61
Precision for ID_1 (spatial component)    4167.88 160.80

Deviance Information Criterion (DIC) ...............: 7793.05
Deviance Information Criterion (DIC, saturated) ....: 7778.16
Effective number of parameters .....................: 15.34

Marginal log-Likelihood:  -3941.12 
 is computed 
Posterior summaries for the linear predictor and the fitted values are computed
(Posterior marginals needs also 'control.compute=list(return.marginals.predictor=TRUE)')
SA_shp1 <- st_read("C:/Users/VUSI/Downloads/CAR_models-master/ZAF_adm/ZAF_adm.csv", quiet = TRUE)
SA_shp1$ID_1  <- as.character(SA_shp1$ID_1)  # use character IDs
SA_shp1$NAME_1 <- as.character(SA_shp1$NAME_1)
# --- 0) Read the SHAPEFILE (not the CSV) ---
SA_sf <- st_read("C:/Users/VUSI/Downloads/CAR_models-master/ZAF_adm/ZAF_adm1_1.shp")
Reading layer `ZAF_adm1_1' from data source 
  `C:\Users\VUSI\Downloads\CAR_models-master\ZAF_adm\ZAF_adm1_1.shp' 
  using driver `ESRI Shapefile'
Simple feature collection with 9 features and 9 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: 16.45802 ymin: -34.83514 xmax: 32.89125 ymax: -22.12661
Geodetic CRS:  WGS 84
SA_sf <- SA_sf %>% mutate(ID_1 = as.character(ID_1), NAME_1 = as.character(NAME_1))

# --- 1) Build adjacency only from polygons ---
sa_graph_path <- file.path(getwd(), "ZAF_adm", "sa_inla.graph")
if (!file.exists(sa_graph_path)) {
  SA_nb <- poly2nb(SA_sf, row.names = SA_sf$ID_1)  # use ID_1 as node name
  INLA::nb2INLA(sa_graph_path, SA_nb)
}
H <- INLA::inla.read.graph(sa_graph_path)
graph_names <- if (!is.null(H$names)) H$names else as.character(seq_len(H$n))

# --- 2) Make a robust lookup (NAME_1 -> ID_1) with normalization ---
norm_key <- function(x) x %>%
  str_trim() %>%
  str_to_lower() %>%
  str_replace_all("[^[:alnum:]]", "")   # drop spaces, hyphens, etc.

id_map <- SA_sf %>%
  st_drop_geometry() %>%
  transmute(NAME_1,
            ID_1,
            key = norm_key(NAME_1))
# Since your province codes (1-9) correspond directly to ID_1 values (1-9)
hyp <- hypertensiondata %>%
  mutate(
    ID_1 = as.character(as.numeric(province)),  # Convert to character to match shapefile
    y = case_when(
      hypertension2 %in% c(1, "1", TRUE, "TRUE", "Yes", "yes") ~ 1L,
      hypertension2 %in% c(0, "0", FALSE, "FALSE", "No", "no") ~ 0L,
      TRUE ~ NA_integer_
    ),
    sex = factor(sex),
    residence = factor(residence),
    wealth_level = factor(wealth_level),
    region = factor(ID_1, levels = graph_names)
  ) %>%
  tidyr::drop_na(y, region, sex, residence, wealth_level, current_age)
# --- 4) Fit your BYM model, passing the graph object directly ---
form_bym <- y ~ 1 + sex + residence + current_age + wealth_level +
  f(region,
    model  = "bym",
    graph  = H,              # graph object (not a file path string)
    values = graph_names,    # map factor levels to graph nodes
    scale.model = TRUE,
    hyper = list(
      prec.unstruct = list(prior = "loggamma", param = c(1, 0.001)),
      prec.spatial  = list(prior = "loggamma", param = c(1, 0.001))
    )
  )
model_inla <- INLA::inla(
  form_bym,
  family = "binomial",
  data   = hyp,
  control.compute = list(dic = TRUE)
)
summary(model_inla)
Time used:
    Pre = 1.37, Running = 4.63, Post = 0.936, Total = 6.94 
Fixed effects:
                      mean    sd 0.025quant 0.5quant 0.975quant   mode kld
(Intercept)         -5.103 0.127     -5.352   -5.103     -4.855 -5.103   0
sexfemale            0.635 0.062      0.514    0.635      0.757  0.635   0
residenceRural      -0.185 0.068     -0.318   -0.185     -0.051 -0.185   0
current_age          0.067 0.002      0.063    0.067      0.070  0.067   0
wealth_levelpoor     0.398 0.102      0.198    0.398      0.597  0.398   0
wealth_levelmiddle   0.375 0.101      0.177    0.375      0.573  0.375   0
wealth_levelricher   0.395 0.101      0.198    0.395      0.593  0.395   0
wealth_levelrichest  0.441 0.100      0.245    0.441      0.637  0.441   0

Random effects:
  Name    Model
    region BYM model

Model hyperparameters:
                                            mean      sd 0.025quant 0.5quant
Precision for region (iid component)     1089.97 1201.01      70.80   712.30
Precision for region (spatial component)   17.90    9.60       5.57    15.85
                                         0.975quant   mode
Precision for region (iid component)        4278.48 192.11
Precision for region (spatial component)      42.19  12.27

Deviance Information Criterion (DIC) ...............: 7791.47
Deviance Information Criterion (DIC, saturated) ....: 7776.58
Effective number of parameters .....................: 15.15

Marginal log-Likelihood:  -3938.87 
 is computed 
Posterior summaries for the linear predictor and the fitted values are computed
(Posterior marginals needs also 'control.compute=list(return.marginals.predictor=TRUE)')
round(model_inla$summary.fixed, 3)
                      mean    sd 0.025quant 0.5quant 0.975quant   mode kld
(Intercept)         -5.103 0.127     -5.352   -5.103     -4.855 -5.103   0
sexfemale            0.635 0.062      0.514    0.635      0.757  0.635   0
residenceRural      -0.185 0.068     -0.318   -0.185     -0.051 -0.185   0
current_age          0.067 0.002      0.063    0.067      0.070  0.067   0
wealth_levelpoor     0.398 0.102      0.198    0.398      0.597  0.398   0
wealth_levelmiddle   0.375 0.101      0.177    0.375      0.573  0.375   0
wealth_levelricher   0.395 0.101      0.198    0.395      0.593  0.395   0
wealth_levelrichest  0.441 0.100      0.245    0.441      0.637  0.441   0
head(model_inla$summary.random$region, 3)
  ID       mean         sd  0.025quant   0.5quant  0.975quant       mode
1  1  0.1361564 0.07520687 -0.01171638  0.1362717  0.28335261  0.1362484
2  2  0.2199034 0.08021864  0.06482728  0.2191213  0.37950370  0.2191920
3  3 -0.2026820 0.08840432 -0.37697060 -0.2024190 -0.02988096 -0.2024254
           kld
1 4.472543e-07
2 2.550223e-07
3 1.936613e-08

1.6 Plotting posterior graphs from INLA

Try this out in your own time….

2 Count Data at location level

  • For this we will use the COVID19 data

  • We use the Poisson log normal model for this analysis

2.1 Three steps for the lognormal model

  • Stage 1: The likelihood \[Y_{i}|\theta_{i}\sim Poisson(E_{i}\theta_{i})\] with \(i=1,2,\dots, n\) with \[log(\theta_{i})=\beta_{0}+\beta_{1}X_{i} + v_{i}\] where \(X_{i}\) is the covariate measured at county level (areal level)

  • Stage 2: The random effects (prior distribution) is \[v_{i}|\sigma^{2}_{i}\sim N(0,\sigma^{2}_{v})\]

  • stage 3: The hyperprior on the hyperparameters \(\beta_{0}, ~\beta_{1},~ \sigma^{2}_{v}\) is given by \[p(\beta_{0},\beta_{1}, \sigma^{2}_{v})=p(\beta_{0})p(\beta_{1})p(\sigma^{2}_{v})\]

  • priors are assumed independent and for ease of Bayesian implementation, we work with the precision \(\tau_{v}=\sigma^{-2}_{v}\) rather than \(\sigma^{2}_{v}\).

2.2 Lognormal spatial model with covariates

  • We now add spatial (ICAR) random effects to the model. and for this we need a graph file as before

  • INLA specification of the model

\[log(\theta_{i})=\beta_{0}+\beta_{1}X_{i}+u_{i}+v_{i}\] with \(u_{i}\) structured spatial effects with a CAR prior and \(v_{i}\) the unstructured random effects with an iid normal prior.

  • Data exploration and computing standardized mortality rate

3 Modelling corona virus deaths in South Africa

Hypothesis: COVID deaths in South Africa have a significant spatial association

setwd("C:/Users/VUSI/Downloads/CAR_models-master/ZAF_adm")
sadistricts <- shapefile("district2.shp")
sadistricts
class       : SpatialPolygonsDataFrame 
features    : 52 
extent      : 16.45189, 32.94498, -34.83417, -22.12503  (xmin, xmax, ymin, ymax)
crs         : +proj=longlat +datum=WGS84 +no_defs 
variables   : 21
names       : CATEGORY, DC_MDB_C, ID_3, DC_MN_C, DC_MN_C_st,    DC_NAME,         DC_NAME_C,                        MAP_TITLE, PR_MDB_C, PR_CODE, PR_CODE_st,      PR_NAME,    ALBERS_ARE,    SHAPE_Leng,    SHAPE_Area, ... 
min values  :        A,      BUF,    1,     101,        101, Alfred Nzo, Alfred Nzo(DC44 ), Alfred Nzo District Municipality,       EC,       1,          1, Eastern Cape, 1644.98312348,  2.8057080426, 0.14852545311, ... 
max values  :        C,      TSH,   52,     947,        947,   Zululand,   Zululand(DC26 ),   Zululand District Municipality,       WC,       9,          9, Western Cape, 126836.337661, 28.7266980377, 11.9243802163, ... 

Describe patterns of deaths with hypertension prevalence (assess association visually)

library(tmap)
library(tmap)
library(sf)

# Ensure sadistricts is an sf object with proper geometry column
if(!"sf" %in% class(sadistricts)) {
  sadistricts <- st_as_sf(sadistricts)
}

# Check the geometry column name
geometry_col <- attr(sadistricts, "sf_column")
print(paste("Geometry column name:", geometry_col))
[1] "Geometry column name: geometry"
# If the geometry column isn't named "geometry", rename it
if(geometry_col != "geometry") {
  sadistricts <- sadistricts %>%
    rename(geometry = all_of(geometry_col)) %>%
    st_set_geometry("geometry")
}

# Now create the map
tm_shape(sadistricts) + 
  tm_polygons(col = "hypertensi", title = "Hypertension") +
  tm_bubbles(size = "deaths", col = "deaths", palette = "Blues",
             style = "quantile", legend.size.show = FALSE, 
             title.col = "Corona virus deaths",
             border.col = "black", border.lwd = 0.1, border.alpha = 0.1) +
  tm_layout(legend.text.size = 0.8, legend.title.size = 1.1,
            frame = FALSE)

Pull the data points (information) from the map object for use in simple models

Simple feature collection with 52 features and 21 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: 16.45189 ymin: -34.83417 xmax: 32.94498 ymax: -22.12503
Geodetic CRS:  +proj=longlat +datum=WGS84 +no_defs
First 10 features:
   CATEGORY DC_MDB_C ID_3 DC_MN_C DC_MN_C_st              DC_NAME
1         A      MAN    1     499        499             Mangaung
2         A      NMA    2     299        299   Nelson Mandela Bay
3         A      TSH    3     799        799      City of Tshwane
4         A      JHB    4     798        798 City of Johannesburg
5         A      BUF    5     260        260         Buffalo City
6         A      CPT    6     199        199    City of Cape Town
7         C      DC1    7     101        101           West Coast
8         C     DC10    8     210        210               Cacadu
9         C     DC12    9     212        212             Amathole
10        C     DC13   10     213        213           Chris Hani
                    DC_NAME_C                                      MAP_TITLE
1              Mangaung(MAN )             Mangaung Metropolitan Municipality
2    Nelson Mandela Bay(NMA )   Nelson Mandela Bay Metropolitan Municipality
3       City of Tshwane(TSH )      City of Tshwane Metropolitan Municipality
4  City of Johannesburg(JHB ) City of Johannesburg Metropolitan Municipality
5          Buffalo City(BUF )         Buffalo City Metropolitan Municipality
6     City of Cape Town(CPT )    City of Cape Town Metropolitan Municipality
7            West Coast(DC1 )               West Coast District Municipality
8               Cacadu(DC10 )                   Cacadu District Municipality
9             Amathole(DC12 )                 Amathole District Municipality
10          Chris Hani(DC13 )               Chris Hani District Municipality
   PR_MDB_C PR_CODE PR_CODE_st      PR_NAME ALBERS_ARE SHAPE_Leng SHAPE_Area
1        FS       4          4   Free State   6283.988   6.196157  0.5827334
2        EC       2          2 Eastern Cape   1958.908   2.939828  0.1907285
3        GT       7          7      Gauteng   6297.833   6.356642  0.5661993
4        GT       7          7      Gauteng   1644.983   3.031305  0.1485255
5        EC       2          2 Eastern Cape   2535.932   4.458223  0.2445463
6        WC       1          1 Western Cape   2444.963   5.198175  0.2382967
7        WC       1          1 Western Cape  31118.684  14.651131  2.9732263
8        EC       2          2 Eastern Cape  58243.283  17.874817  5.6239269
9        EC       2          2 Eastern Cape  21594.916  16.058800  2.0734975
10       EC       2          2 Eastern Cape  36143.538  14.247915  3.4418078
   FID_1             district covidcases deaths expected hypertensi
1      1             Mangaung      13236    225   264.72       29.4
2      2   Nelson Mandela Bay      42831    885   856.62       35.0
3      3      City of Tshwane      77990    749  1559.80       24.5
4      4 City of Johannesburg      88609   1277  1772.18       30.8
5      5         Buffalo city      39635    629   792.70       40.1
6      6    City of Cape Town     138219   2764  2764.38       31.4
7      7           West coast       6769    134   135.38       46.1
8      8               Cacadu      11865    199   237.30       37.5
9      9             Amathole      16835    272   336.70       39.7
10    10           Chris Hani      16292    278   325.84       30.8
                         geometry
1  MULTIPOLYGON (((25.90767 -2...
2  MULTIPOLYGON (((25.46655 -3...
3  MULTIPOLYGON (((28.75833 -2...
4  MULTIPOLYGON (((27.98484 -2...
5  MULTIPOLYGON (((27.35458 -3...
6  MULTIPOLYGON (((18.52309 -3...
7  MULTIPOLYGON (((18.07458 -3...
8  MULTIPOLYGON (((24.4407 -31...
9  MULTIPOLYGON (((28.26665 -3...
10 MULTIPOLYGON (((26.37107 -3...

Simple poisson model withn intercept and random error. The exponentiated intercept is the average rate (relative risk rate) fo deaths across the districts.

deaths.fit1<-inla(deaths ~ 1 + f(FID_1, model = "iid", param = c(0.5, 5e-04)), data=sadata, family = "poisson", E=expected)


summary(deaths.fit1)
Time used:
    Pre = 1.45, Running = 1.07, Post = 0.204, Total = 2.72 
Fixed effects:
              mean    sd 0.025quant 0.5quant 0.975quant   mode kld
(Intercept) -0.683 0.081     -0.844   -0.683     -0.525 -0.683   0

Random effects:
  Name    Model
    FID_1 IID model

Model hyperparameters:
                    mean   sd 0.025quant 0.5quant 0.975quant mode
Precision for FID_1 3.27 0.71       2.05     3.21       4.82 3.10

Marginal log-Likelihood:  -288.30 
 is computed 
Posterior summaries for the linear predictor and the fitted values are computed
(Posterior marginals needs also 'control.compute=list(return.marginals.predictor=TRUE)')
exp(deaths.fit1$summary.fixed[1,])
                 mean       sd 0.025quant  0.5quant 0.975quant      mode kld
(Intercept) 0.5050245 1.084355   0.430014 0.5052593  0.5915584 0.5052577   1

3.1 Fitting non-spatial lognormal model with covariate

deaths.fit2<-inla(deaths ~ 1 + hypertensi + f(FID_1, model = "iid", param=c(1, 0.014)), data=sadata, family = "poisson", E = expected, control.compute =list(dic = TRUE))

summary(deaths.fit2)
Time used:
    Pre = 1.43, Running = 1.04, Post = 0.278, Total = 2.75 
Fixed effects:
              mean    sd 0.025quant 0.5quant 0.975quant   mode kld
(Intercept) -1.273 0.306     -1.874   -1.274     -0.667 -1.274   0
hypertensi   0.019 0.010      0.000    0.019      0.038  0.019   0

Random effects:
  Name    Model
    FID_1 IID model

Model hyperparameters:
                    mean    sd 0.025quant 0.5quant 0.975quant mode
Precision for FID_1 3.64 0.808       2.26     3.57       5.42 3.43

Deviance Information Criterion (DIC) ...............: 435.78
Deviance Information Criterion (DIC, saturated) ....: 104.08
Effective number of parameters .....................: 48.79

Marginal log-Likelihood:  -293.76 
 is computed 
Posterior summaries for the linear predictor and the fitted values are computed
(Posterior marginals needs also 'control.compute=list(return.marginals.predictor=TRUE)')
  • Examine the posterior summaries for covariate, on the original (log scale- log RR) or exponential (RR)
deaths.fit2$summary.fixed
                   mean          sd    0.025quant    0.5quant  0.975quant
(Intercept) -1.27297495 0.306313064 -1.8737596297 -1.27383261 -0.66731452
hypertensi   0.01920151 0.009593265  0.0001839012  0.01924635  0.03796449
                  mode          kld
(Intercept) -1.2738222 3.141363e-08
hypertensi   0.0192459 3.274789e-08
deaths.fit2$summary.fixed[2,]
                 mean          sd   0.025quant   0.5quant 0.975quant      mode
hypertensi 0.01920151 0.009593265 0.0001839012 0.01924635 0.03796449 0.0192459
                    kld
hypertensi 3.274789e-08
exp(deaths.fit2$summary.fixed[2,])
               mean       sd 0.025quant 0.5quant 0.975quant     mode kld
hypertensi 1.019387 1.009639   1.000184 1.019433   1.038694 1.019432   1

3.2 Lognormal spatial model with covariates

tmap_mode("plot")


if (!"smr" %in% names(sadistricts)) {
  stopifnot(all(c("deaths","expected") %in% names(sadistricts)))
  sadistricts$smr <- with(sadistricts, ifelse(expected > 0, deaths/expected, NA_real_))
}
if (!inherits(sadistricts, "sf")) sadistricts <- st_as_sf(sadistricts)
if (!is.numeric(sadistricts$smr)) sadistricts$smr <- as.numeric(sadistricts$smr)

tm_shape(sadistricts) +
  tm_polygons(
    fill        = "smr",
    fill.scale  = tm_scale_intervals(
      style  = "quantile",
      n      = 5,
      values = RColorBrewer::brewer.pal(5, "Blues")
    ),
    fill.legend = tm_legend(title = "Covid SMR"),
    col_alpha   = 0.4
  ) +
  tm_compass() +
  tm_title("South AFRICA, COVID2020") +
  tm_layout(
    legend.text.size  = 0.5,
    legend.title.size = 0.5,
    legend.position   = c("right","bottom"),
    frame             = FALSE
  )

# --- 0) Ensure `sadistricts` is sf and has an ID to use as node names ---
if (inherits(sadistricts, "Spatial")) sadistricts <- st_as_sf(sadistricts)

if (!"FID_1" %in% names(sadistricts)) sadistricts$FID_1 <- seq_len(nrow(sadistricts))
sadistricts$FID_1 <- as.character(sadistricts$FID_1)

sadist_nb <- spdep::poly2nb(sadistricts, row.names = sadistricts$FID_1)
graph_path <- file.path(getwd(), "sadist_inla.graph")
spdep::nb2INLA(graph_path, sadist_nb)


H <- INLA::inla.read.graph(graph_path)

sadistricts <- sadistricts %>%
  mutate(
    region1 = FID_1,
    region2 = FID_1
  )

Plotting the expected deaths

# summary of attributes (no geometry)
summary(sf::st_drop_geometry(sadistricts))
   CATEGORY           DC_MDB_C              ID_3          DC_MN_C     
 Length:52          Length:52          Min.   : 1.00   Min.   :101.0  
 Class :character   Class :character   1st Qu.:13.75   1st Qu.:289.2  
 Mode  :character   Mode  :character   Median :26.50   Median :522.5  
                                       Mean   :26.50   Mean   :498.7  
                                       3rd Qu.:39.25   3rd Qu.:665.5  
                                       Max.   :52.00   Max.   :947.0  
  DC_MN_C_st          DC_NAME           DC_NAME_C          MAP_TITLE        
 Length:52          Length:52          Length:52          Length:52         
 Class :character   Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character   Mode  :character  
                                                                            
                                                                            
                                                                            
   PR_MDB_C            PR_CODE       PR_CODE_st          PR_NAME         
 Length:52          Min.   :1.000   Length:52          Length:52         
 Class :character   1st Qu.:2.000   Class :character   Class :character  
 Mode  :character   Median :5.000   Mode  :character   Mode  :character  
                    Mean   :4.615                                        
                    3rd Qu.:6.250                                        
                    Max.   :9.000                                        
   ALBERS_ARE       SHAPE_Leng       SHAPE_Area         FID_1          
 Min.   :  1645   Min.   : 2.806   Min.   : 0.1485   Length:52         
 1st Qu.:  7888   1st Qu.: 6.261   1st Qu.: 0.7269   Class :character  
 Median : 15779   Median : 9.512   Median : 1.4313   Mode  :character  
 Mean   : 23477   Mean   :10.011   Mean   : 2.1766                     
 3rd Qu.: 28934   3rd Qu.:12.311   3rd Qu.: 2.6284                     
 Max.   :126836   Max.   :28.727   Max.   :11.9244                     
   district           covidcases         deaths           expected      
 Length:52          Min.   :   345   Min.   :   3.00   Min.   :   6.90  
 Class :character   1st Qu.:  4702   1st Qu.:  40.75   1st Qu.:  94.05  
 Mode  :character   Median :  8906   Median :  82.00   Median : 178.12  
                    Mean   : 19209   Mean   : 248.00   Mean   : 384.19  
                    3rd Qu.: 18274   3rd Qu.: 259.25   3rd Qu.: 365.47  
                    Max.   :138219   Max.   :2764.00   Max.   :2764.38  
   hypertensi         smr           region1            region2         
 Min.   :19.50   Min.   :0.1106   Length:52          Length:52         
 1st Qu.:24.38   1st Qu.:0.3683   Class :character   Class :character  
 Median :30.05   Median :0.5642   Mode  :character   Mode  :character  
 Mean   :31.11   Mean   :0.5755                                        
 3rd Qu.:37.58   3rd Qu.:0.8409                                        
 Max.   :56.00   Max.   :1.0331                                        
# map the 'expected' column with tmap v4
library(tmap)
library(RColorBrewer)

tmap_mode("plot")

tm_shape(sadistricts) +
  tm_polygons(
    fill = "expected",
    fill.scale  = tm_scale_intervals(
      style  = "quantile",
      n      = 5,
      values = RColorBrewer::brewer.pal(5, "Blues")
    ),
    fill.legend = tm_legend(title = "Expected"),
    col_alpha   = 0.4
  ) +
  tm_layout(legend.position = c("right","bottom"), frame = FALSE)

Plotting the standardized death rates

library(spdep)
library(INLA)

# neighbours -> INLA graph file
sadist_nb <- spdep::poly2nb(sadistricts, row.names = sadistricts$FID_1)
graph_path <- file.path(getwd(), "sadist_inla.graph")
spdep::nb2INLA(graph_path, sadist_nb)

# read the graph into an INLA graph object
H <- INLA::inla.read.graph(graph_path)

class(sadist_nb)   # "nb"         (spdep neighbours)
[1] "nb"
class(graph_path)  # "character"  (file path)
[1] "character"
class(H)           # "inla.graph" (use this in f(..., graph = H, values = H$names))
[1] "inla.graph"
form <- y ~ ... +
  f(region, model = "bym2", graph = H, values = H$names, scale.model = TRUE)
# ---------- 1) Neighbours and proper INLA graph ----------
sadist_nb <- poly2nb(sadistricts, row.names = sadistricts$FID_1)
graph_path <- file.path(getwd(), "sadist_inla.graph")
nb2INLA(graph_path, sadist_nb)
H <- INLA::inla.read.graph(graph_path)     # <-- graph OBJECT
g_names <- if (!is.null(H$names)) H$names else as.character(seq_len(H$n))

# ---------- 2) Build modelling data (sf-safe; no @data) ----------
dat <- sadistricts |>
  st_drop_geometry() |>
  mutate(
    region1 = FID_1,     # IID part index (character)
    region2 = FID_1      # Besag part index (character)
  )

# Safety checks
stopifnot(all(unique(dat$region2) %in% g_names))
dat <- dat[match(g_names, dat$region2), , drop = FALSE]  # align to graph order (optional but safe)
# ---------- 3) Fit Poisson BYM (IID + Besag) ----------
# Use hyper=list(prec=list(param=c(shape, rate))) for log-Gamma prior on precision
form <- deaths ~ 1 + 
  f(region1, model = "iid",   hyper = list(prec = list(param = c(1, 0.014)))) +
  f(region2, model = "besag", graph = H, values = g_names,
    hyper = list(prec = list(param = c(1, 0.68))), scale.model = TRUE)

fit <- INLA::inla(
  form, family = "poisson",
  data = dat,
  E = expected,                       # expected counts (offset via log(E))
  control.predictor = list(compute = TRUE),
  control.compute  = list(dic = TRUE, waic = TRUE, cpo = TRUE)
)
# ---------- 4) Summaries ----------
print(summary(fit))
Time used:
    Pre = 1.59, Running = 1.23, Post = 0.358, Total = 3.18 
Fixed effects:
             mean    sd 0.025quant 0.5quant 0.975quant  mode kld
(Intercept) -0.68 0.029     -0.738    -0.68     -0.623 -0.68   0

Random effects:
  Name    Model
    region1 IID model
   region2 Besags ICAR model

Model hyperparameters:
                       mean     sd 0.025quant 0.5quant 0.975quant  mode
Precision for region1 89.47 97.708       7.51    59.66     347.93 20.81
Precision for region2  2.96  0.756       1.75     2.87       4.71  2.70

Deviance Information Criterion (DIC) ...............: 433.99
Deviance Information Criterion (DIC, saturated) ....: 102.29
Effective number of parameters .....................: 47.07

Watanabe-Akaike information criterion (WAIC) ...: 425.72
Effective number of parameters .................: 27.96

Marginal log-Likelihood:  -291.71 
CPO, PIT is computed 
Posterior summaries for the linear predictor and the fitted values are computed
(Posterior marginals needs also 'control.compute=list(return.marginals.predictor=TRUE)')
print(round(fit$summary.fixed, 3))
             mean    sd 0.025quant 0.5quant 0.975quant  mode kld
(Intercept) -0.68 0.029     -0.738    -0.68     -0.623 -0.68   0
# ---------- 5) Extract RR for mapping (use posterior means) ----------
# Random effects are on log scale; exponentiate to get multiplicative RR
rr_nonspatial <- exp(fit$summary.random$region1$mean)
rr_spatial    <- exp(fit$summary.random$region2$mean)

# Convert sf -> sp for SpatialEpi::mapvariable
sadistricts_sp <- sf::as_Spatial(sadistricts)

library(tmap)
if (!inherits(sadistricts, "sf")) sadistricts <- sf::st_as_sf(sadistricts)
sadistricts$rr_nonspatial <- rr_nonspatial

tmap_mode("plot")
tm_shape(sadistricts) +
  tm_polygons(fill = "rr_nonspatial") +
  tm_title("Spatial RR (Besag)")

library(tmap)
if (!inherits(sadistricts, "sf")) sadistricts <- sf::st_as_sf(sadistricts)
sadistricts$rr_spatial <- rr_spatial

tmap_mode("plot")
tm_shape(sadistricts) +
  tm_polygons(fill = "rr_spatial") +
  tm_title("Spatial RR (Besag)")

About 90% of COVID deaths in the Western Cape is spatially related and this can be due to socio-economic dynamics in that region and more specifically those districts.