library(dplyr)
library(sf)
library(ggplot2)
library(tmap)
library(spdep)
library(spatialreg)
library(gstat)
library(CARBayes)
library(R2BayesX)
library(BayesX)
library(INLA)
1. Research Question
What is the spatial distribution of HIV prevalence in Zimbabwe, and how do socioeconomic factors such as education and wealth status influence HIV prevalence across districts?
Descriptive thematic/choropleth map
This choropleth map reveals significant geographical disparities in HIV prevalence across the region, with rates ranging from 4.1% to 48.0% between districts. The spatial distribution shows a clear pattern of higher prevalence concentrated in northern and central areas, while southern districts generally experience lower infection rates. The striking variation, with some districts showing nearly twelve times higher prevalence than others, highlights the uneven burden of the HIV epidemic and suggests underlying structural, socioeconomic, or behavioral factors driving transmission differences. The presence of missing data in some districts indicates surveillance gaps that need addressing.
library(sf)
library(dplyr)
library(tidyr)
library(tmap)
# --- Inputs you already have ---
<- read_rds("final_hiv_ZW.rds")
dat <- st_read("C:/Users/VUSI/Downloads/GEO_GROUP/ZW_2015_DHS_08222025_1651_42180/ZWGE72FL/ZWGE72FL.shp", quiet = TRUE)
shp <- st_read("C:/Users/VUSI/Downloads/GEO_GROUP/zwe_admbnda_adm2_zimstat_ocha_20180911", quiet = TRUE) # the file with ADM2_EN/ADM2_PCODE
zw_dist
# 1) Build cluster-level counts from individuals (this creates n, pos, hiv_prev)
<- dat %>%
dat_cluster group_by(v001) %>%
summarise(
n = sum(!is.na(hiv_status)),
pos = sum(hiv_status == 1, na.rm = TRUE),
hiv_prev = if_else(n > 0, pos / n, NA_real_),
.groups = "drop"
)
# 2) Attach counts to DHS cluster points
<- shp %>%
shp_pts left_join(dat_cluster, by = c("DHSCLUST" = "v001"))
# Sanity: ensure n/pos exist now
stopifnot(all(c("n","pos","hiv_prev") %in% names(shp_pts)))
# 3) Align CRS
<- st_transform(shp_pts, 4326)
shp_pts <- st_transform(zw_dist, 4326)
zw_dist
# 4) Spatial join: assign each point to a district polygon
<- st_join(shp_pts, zw_dist, join = st_within)
zw_pts_in_dist
# Handle displaced points that fall just outside polygons: snap to nearest district
if (any(is.na(zw_pts_in_dist$ADM2_PCODE))) {
<- which(is.na(zw_pts_in_dist$ADM2_PCODE))
miss_idx <- st_nearest_feature(shp_pts[miss_idx, ], zw_dist)
nearest_id $ADM2_PCODE[miss_idx] <- zw_dist$ADM2_PCODE[nearest_id]
zw_pts_in_dist$ADM2_EN[miss_idx] <- zw_dist$ADM2_EN[nearest_id]
zw_pts_in_dist
}
# 5) Aggregate to district counts & prevalence (now n/pos exist)
<- zw_pts_in_dist %>%
zw_district_prev st_drop_geometry() %>%
group_by(ADM2_PCODE, ADM2_EN) %>%
summarise(
clusters = dplyr::n(),
tested = sum(n, na.rm = TRUE),
positive = sum(pos, na.rm = TRUE),
prev = if_else(tested > 0, positive / tested, NA_real_),
.groups = "drop"
)
# 6) Join back to polygons for choropleth
<- zw_dist %>%
zw_dist_map left_join(zw_district_prev, by = c("ADM2_PCODE","ADM2_EN"))
#
# 7) Map
tmap_mode("plot")
tm_shape(zw_dist_map) +
tm_polygons("prev", style = "quantile", n = 5,
title = "HIV prevalence (district)") +
tm_borders(col = "grey40") +
tm_layout(legend.outside = TRUE, frame = FALSE) +
tm_shape(shp_pts) +
tm_dots(col = "hiv_prev", size = 0.05, alpha = 0.5,
palette = "Reds", title = "Cluster prevalence")
3. Spatial Clustering Investigation - The Moran Test
There is highly significant positive spatial autocorrelation in HIV prevalence across Zimbabwe’s districts. Neighboring districts tend to have similar prevalence rates (clusters of high prevalence and clusters of low prevalence).
library(spdep)
# Create neighbors from polygons
<- poly2nb(zw_dist_map, queen = TRUE)
nb <- nb2listw(nb, style = "W", zero.policy = TRUE)
lw
# Global Moran’s I test
moran.test(zw_dist_map$prev, lw, zero.policy = TRUE)
#>
#> Moran I test under randomisation
#>
#> data: zw_dist_map$prev
#> weights: lw
#>
#> Moran I statistic standard deviate = 5.2239, p-value = 8.76e-08
#> alternative hypothesis: greater
#> sample estimates:
#> Moran I statistic Expectation Variance
#> 0.39736905 -0.01111111 0.00611441
Hot-Spot and Cold-Spot Analysis (Local Indicators of Spatial Association – LISA)
This LISA (Local Indicators of Spatial Association) cluster map reveals significant spatial patterning of HIV prevalence at the district level. The presence of “High-High” clusters indicates districts with high HIV prevalence surrounded by neighboring districts that also have high prevalence, suggesting concentrated hotspots of transmission where the epidemic is spatially reinforced. Conversely, “Low-Low” clusters identify districts with low prevalence surrounded by similarly low-prevalence neighbors, indicating protective spatial patterns or successful localized intervention efforts. The “Not significant” areas show districts without statistically meaningful spatial autocorrelation, where prevalence rates appear more randomly distributed. This spatial analysis confirms that HIV prevalence is not randomly distributed but rather exhibits clear clustering patterns, emphasizing the importance of geographically targeted public health interventions that address both hotspot areas and protective regions to effectively combat the epidemic.
library(spdep)
library(dplyr)
library(tmap)
# 0) work on districts with non-missing prevalence
<- which(!is.na(zw_dist_map$prev))
ok <- zw_dist_map$prev[ok]
y
# 1) neighbors/weights on the same subset of polygons
<- poly2nb(zw_dist_map[ok, ], queen = TRUE)
nb_ok <- nb2listw(nb_ok, style = "W", zero.policy = TRUE)
lw_ok
# 2) local Moran
<- localmoran(y, lw_ok, zero.policy = TRUE)
loc
# 3) pull columns robustly across spdep versions
<- colnames(loc)
cn <- loc[, "Ii"]
Ii <- loc[, "Z.Ii"]
Z
# p-value column names vary; if missing, compute from Z
if ("Pr(z > 0)" %in% cn) {
<- loc[, "Pr(z > 0)"]
p else if ("Pr(Z > 0)" %in% cn) {
} <- loc[, "Pr(Z > 0)"]
p else if ("Pr(z > 0) " %in% cn) { # some builds have a trailing space
} <- loc[, "Pr(z > 0) "]
p else {
} <- pnorm(Z, lower.tail = FALSE) # one-sided (greater) p from Z
p
}
# 4) write results back into full sf (others -> NA)
$LISA_I <- NA_real_
zw_dist_map$LISA_Z <- NA_real_
zw_dist_map$LISA_p <- NA_real_
zw_dist_map$LISA_I[ok] <- Ii
zw_dist_map$LISA_Z[ok] <- Z
zw_dist_map$LISA_p[ok] <- p
zw_dist_map
# Optional: FDR-adjusted p-values
$LISA_p_fdr <- NA_real_
zw_dist_map$LISA_p_fdr[ok] <- p.adjust(p, method = "fdr")
zw_dist_map
# 5) Moran quadrant classification (High-High, Low-Low, etc.)
<- lag.listw(lw_ok, y, zero.policy = TRUE)
wy <- scale(y)[,1]
y_s <- scale(wy)
wy_s
<- rep(NA_character_, length(y))
quad >= 0 & wy_s >= 0] <- "High-High"
quad[y_s <= 0 & wy_s <= 0] <- "Low-Low"
quad[y_s >= 0 & wy_s <= 0] <- "High-Low"
quad[y_s <= 0 & wy_s >= 0] <- "Low-High"
quad[y_s
# keep only significant at 0.05 (change as you like)
<- ifelse(p < 0.05, quad, "Not sig")
lab
$LISA_quad <- NA_character_
zw_dist_map$LISA_quad[ok] <- lab
zw_dist_map
# 6) Maps
tmap_mode("plot")
tm_shape(zw_dist_map) +
tm_polygons("LISA_I", style = "quantile", n = 5,
title = "Local Moran's I (district)") +
tm_borders() +
tm_layout(legend.outside = TRUE)
tm_shape(zw_dist_map) +
tm_polygons("LISA_quad",
palette = c("High-High"="red","Low-Low"="blue",
"High-Low"="orange","Low-High"="lightblue",
"Not sig"="grey80"),
title = "LISA clusters (p<0.05)") +
tm_borders() +
tm_layout(legend.outside = TRUE)
5. Kriging Smoothing Procedures : Kriging is a geostatistical technique that uses known data points to estimate values at unsampled locations, incorporating spatial autocorrelation to produce optimal predictions.
Research question: To what extent do spatial spillover effects influence the heterogeneous distribution of HIV prevalence across districts in Zimbabwe?
The smooth, continuous gradient of predicted prevalence (ranging from 0.05 to 0.35) reveals clearly interconnected zones rather than isolated outbreaks, indicating that prevalence in one district strongly influences and is influenced by its neighbors. This pattern confirms that spillover effects propagate high prevalence from core hotspots into surrounding areas, forming regional clusters, while also maintaining protective low-prevalence zones. Consequently, the spatial structure of the HIV epidemic in Zimbabwe is deeply shaped by cross-district dynamics, underscoring the necessity for interventions that target entire transmission zones rather than individual districts.
# 1) Project to a metric CRS (Zimbabwe sits well in UTM 35S; 36S also ok for the east)
<- 32735 # EPSG:32735 UTM zone 35S (meters)
crs_proj
<- st_transform(shp_pts, crs_proj) # POINTS with hiv_prev
shp_pts_proj <- st_transform(zw_dist, crs_proj) # POLYGONS for context
zw_dist_proj
# 2) Build a regular prediction grid over Zimbabwe (e.g., ~10 km cells)
<- 10
cell_km <- cell_km * 1000
cell_m
<- st_make_grid(zw_dist_proj, cellsize = cell_m, what = "centers") |>
grid st_sf(geometry = _)
# 3) Empirical variogram & fit (use Spatial for gstat internals)
<- as(shp_pts_proj, "Spatial")
pts_sp <- variogram(hiv_prev ~ 1, pts_sp)
vgm_emp <- fit.variogram(vgm_emp, vgm("Sph"))
vgm_fit
# 4) Ordinary kriging to the grid (POINT predictions, not polygons)
<- as(grid, "Spatial")
grid_sp <- krige(hiv_prev ~ 1, locations = pts_sp, newdata = grid_sp, model = vgm_fit)
kriged #> [using ordinary kriging]
# 5) Back to sf for plotting
<- st_as_sf(kriged) # columns: var1.pred, var1.var
krig_sf
# 6) Plot kriged surface with districts on top
tmap_mode("plot")
tm_shape(krig_sf) +
tm_dots("var1.pred", title = "Kriged HIV prevalence") +
tm_shape(zw_dist_proj) + tm_borders(col = "grey40") +
tm_layout(legend.outside = TRUE)
# Optional: create a raster-looking surface (interpolate to a polygonal grid)
# If you prefer a smooth raster, use stars:
library(stars)
<- st_rasterize(krig_sf["var1.pred"], dx = cell_m, dy = cell_m)
kr_stars tm_shape(kr_stars) + tm_raster("var1.pred") + tm_shape(zw_dist_proj) + tm_borders()
6. Spatial regression for any three of the following modelling procedures using any software
Logistic Regression (No Spatial Effect)
We assume:
\[ Y_i \sim \text{Binomial}(n_i, p_i) \]
where:
\[ ( Y_i ): number of HIV cases in district ( i ) ( n_i ): total individuals sampled in district ( i ) ( p_i ): prevalence of HIV in district ( i ) \] We model the log-odds of HIV prevalence as:
\[ \text{logit}(p_i) = \log\left(\frac{p_i}{1 - p_i}\right) = \beta_0 + \sum_{k=1}^{K} \beta_k X_{ik} \]
<- readRDS("Data.rds") dat1
A. Logistic regression (binary outcome at individual level)
The logistic regression model identifies age, education, and household wealth as important predictors of HIV status. Age (v012) is strongly associated with HIV infection (β = 0.071, p < 0.001), meaning that each additional year of age increases the log-odds of being HIV positive. Compared to those with no education, individuals with primary schooling have significantly higher odds of HIV infection (β = 0.781, p = 0.005), while secondary education shows a borderline positive effect (β = 0.534, p = 0.052), and higher education shows no association. Wealth effects are mixed: individuals from poorer households are significantly less likely to be HIV positive than the poorest (β = −0.214, p = 0.041), and those from the richest households also have lower odds (β = −0.200, p = 0.047). Middle and richer categories are not significantly different, though the richer group trends toward increased odds (p = 0.064). The model fits the data substantially better than the null model (residual deviance = 7769.9 vs. null deviance = 8392.7) and has an AIC of 7787.9. Overall, the results suggest that HIV risk is shaped by age progression, modest educational attainment (primary/secondary), and nuanced wealth gradients, with both poorer and richest groups showing lower risk compared to the most disadvantaged baseline.
<- glm(hiv_status ~v012 + v106 + v190,
glm_mod data = dat1, family = binomial(link = "logit"))
summary(glm_mod)
#>
#> Call:
#> glm(formula = hiv_status ~ v012 + v106 + v190, family = binomial(link = "logit"),
#> data = dat1)
#>
#> Coefficients:
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) -4.236624 0.300561 -14.096 < 2e-16 ***
#> v012 0.071266 0.003176 22.438 < 2e-16 ***
#> v106primary 0.780698 0.275024 2.839 0.00453 **
#> v106secondary 0.534351 0.275240 1.941 0.05221 .
#> v106higher -0.025665 0.300481 -0.085 0.93193
#> v190poorer -0.213595 0.104485 -2.044 0.04093 *
#> v190middle 0.004287 0.100501 0.043 0.96598
#> v190richer 0.173014 0.093364 1.853 0.06387 .
#> v190richest -0.199872 0.100721 -1.984 0.04721 *
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for binomial family taken to be 1)
#>
#> Null deviance: 8392.7 on 9053 degrees of freedom
#> Residual deviance: 7769.9 on 9045 degrees of freedom
#> AIC: 7787.9
#>
#> Number of Fisher Scoring iterations: 4
Modelling - Random spatial effect
B. Logistic Regression with Conditional Autoregressive (CAR) Prior
We assume:
\[ Y_i \sim \text{Binomial}(n_i, p_i) \]
with the log-odds modeled as:
\[ \text{logit}(p_i) = \log\left(\frac{p_i}{1 - p_i}\right) = \beta_0 + \sum_{k=1}^{K} \beta_k X_{ik} + u_i \]
where: \[ ( X_{ik}): covariates (e.g., age, wealth, education) ( \beta_k): regression coefficients ( u_i ): spatial random effect for area (i) \]
library(sf)
# Load the shapefile
# Replace "path/to/your/shapefile.shp" with the actual path to your shapefile
<- st_read("C:/Users/VUSI/Downloads/GEO_GROUP/zwe_admbnda_adm2_zimstat_ocha_20180911/zwe_admbnda_adm2_zimstat_ocha_20180911.shp")
zw_shp #> Reading layer `zwe_admbnda_adm2_zimstat_ocha_20180911' from data source
#> `C:\Users\VUSI\Downloads\GEO_GROUP\zwe_admbnda_adm2_zimstat_ocha_20180911\zwe_admbnda_adm2_zimstat_ocha_20180911.shp'
#> using driver `ESRI Shapefile'
#> Simple feature collection with 91 features and 14 fields
#> Geometry type: POLYGON
#> Dimension: XY
#> Bounding box: xmin: 25.2376 ymin: -22.42105 xmax: 33.06718 ymax: -15.60714
#> Geodetic CRS: WGS 84
# Load the dataset from the .rds file
# Replace "path/to/your/Data.rds" with the actual path to your Data.rds file
<- readRDS("Data.rds") my_data
head(my_data, 3)
caseid | v001 | v002 | v003 | v012 | v106 | v190 | hiv03_code | hiv_status | hiv05 | hivage | hivsex | hiv_wt | ADM2_PCODE | ADM2_EN |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 1 2 | 1 | 1 | 2 | 38 | secondary | richer | 1 | 1 | 1770538 | 38 | female | 1.770538 | ZW1309 | Uzumba Maramba Pfungwe |
1 2 2 | 1 | 2 | 2 | 37 | primary | poorer | 0 | 0 | 1770538 | 37 | female | 1.770538 | ZW1309 | Uzumba Maramba Pfungwe |
1 6 2 | 1 | 6 | 2 | 37 | secondary | middle | 0 | 0 | 1770538 | 37 | female | 1.770538 | ZW1309 | Uzumba Maramba Pfungwe |
<-zw_shp %>%
zw_shp1 mutate(ID_1 = row_number())
# Load the dataset from the .rd
#st_write(zw_shp1 , "Zim_new1.shp")
plot(zw_shp1)
<-zw_shp1 %>%
zw_shp2as.data.frame() %>%
::select(ID_1,ADM2_PCODE) %>%
dplyrrename(district = ADM2_PCODE)
<- left_join(my_data, zw_shp2, by = c("ADM2_PCODE" = "district"))
my_data1head(my_data1, 3)
caseid | v001 | v002 | v003 | v012 | v106 | v190 | hiv03_code | hiv_status | hiv05 | hivage | hivsex | hiv_wt | ADM2_PCODE | ADM2_EN | ID_1 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 1 2 | 1 | 1 | 2 | 38 | secondary | richer | 1 | 1 | 1770538 | 38 | female | 1.770538 | ZW1309 | Uzumba Maramba Pfungwe | 86 |
1 2 2 | 1 | 2 | 2 | 37 | primary | poorer | 0 | 0 | 1770538 | 37 | female | 1.770538 | ZW1309 | Uzumba Maramba Pfungwe | 86 |
1 6 2 | 1 | 6 | 2 | 37 | secondary | middle | 0 | 0 | 1770538 | 37 | female | 1.770538 | ZW1309 | Uzumba Maramba Pfungwe | 86 |
$ID_1 <- as.factor(my_data1$ID_1) my_data1
<- file.path("Zim_new1")
shpname2 <- BayesX::shp2bnd(shpname = shpname2, regionnames = "ID_1", check.is.in = F)
zw_bnd_prov #> Reading map ... finished
#> Note: map consists originally of 108 polygons
#> Note: map consists of 91 regions
::drawmap(map=zw_bnd_prov) BayesX
<-BayesX::bnd2gra(zw_bnd_prov)
sagra#> Start neighbor search ...
#> progress: 20%
#> progress: 40%
#> progress: 60%
#> progress: 80%
#> progress: 100%
#> Neighbor search finished.
Building the CAR model
The Bayesian CAR model fitted with bayesx shows that both individual-level and spatial factors contribute to HIV risk. Among the fixed effects, individuals classified as poorer (β = −0.223, 95% CI: −0.449 to −0.011) and richest (β = −0.344, 95% CI: −0.608 to −0.091) have significantly lower odds of HIV infection compared to the poorest group, while the middle and richer categories are not significantly different. Education shows a positive effect for primary schooling (β = 0.497), though only borderline significant, while secondary and higher education do not show strong evidence of association. The smooth effect of age (v012) has a small but non-zero variance (0.0093), suggesting a modest nonlinear contribution to HIV risk. Importantly, the structured spatial effect (sx(ID_1):mrf) has a substantial variance (0.341), indicating that HIV prevalence varies geographically in a way consistent with spatial autocorrelation between neighboring districts, while the unstructured spatial effect (sx(ID_1):re) variance is small (0.011), suggesting limited additional random heterogeneity. Overall, this model highlights the importance of age, socioeconomic status, and regional clustering in explaining HIV distribution across districts, with clear evidence of spatial dependence captured by the CAR component.
<- bayesx(
imodel ~ v190 + v106 + sx(v012) +
hiv_status sx(ID_1, bs = "mrf", map = sagra) + # <— key changes
sx(ID_1, bs = "re"),
family = "binomial",
method = "MCMC",
iterations = 12000, burnin = 2000, step = 10,
data = my_data1
)summary(imodel)
#> Call:
#> bayesx(formula = hiv_status ~ v190 + v106 + sx(v012) + sx(ID_1,
#> bs = "mrf", map = sagra) + sx(ID_1, bs = "re"), data = my_data1,
#> family = "binomial", method = "MCMC", iterations = 12000,
#> burnin = 2000, step = 10)
#>
#> Fixed effects estimation results:
#>
#> Parametric coefficients:
#> Mean Sd 2.5% 50% 97.5%
#> (Intercept) -1.7176 0.2879 -2.3036 -1.7097 -1.1785
#> v190poorer -0.2198 0.1109 -0.4281 -0.2169 -0.0115
#> v190middle 0.0143 0.1071 -0.2005 0.0149 0.2222
#> v190richer 0.0241 0.1126 -0.2013 0.0198 0.2449
#> v190richest -0.3367 0.1280 -0.5772 -0.3357 -0.0904
#> v106primary 0.5119 0.2865 -0.0203 0.5026 1.0992
#> v106secondary 0.2242 0.2894 -0.3329 0.2189 0.8078
#> v106higher -0.4004 0.3085 -0.9715 -0.3991 0.2351
#>
#> Smooth terms variances:
#> Mean Sd 2.5% 50% 97.5% Min Max
#> sx(ID_1):mrf 0.3143 0.1028 0.1465 0.3004 0.5662 0.0945 0.7359
#> sx(v012) 0.0097 0.0107 0.0017 0.0068 0.0362 0.0008 0.1333
#>
#> Random effects variances:
#> Mean Sd 2.5% 50% 97.5% Min Max
#> sx(ID_1):re 0.0138 0.0174 0.0005 0.0076 0.0674 0.0002 0.1152
#>
#> N = 9054 burnin = 2000 method = MCMC family = binomial
#> iterations = 12000 step = 10
par(mar=c(1,1,1,1))
plot(imodel, term = "sx(v012)")
#Visualization of posterior mean of structured and unstructured spatial effects
## plot all spatial effects
plot(imodel, term = c("sx(ID_1):mrf", "sx(ID_1):re"))
par(mar=c(1,1,1,1))
#Plotting autocorrelation for age variance samples
plot(imodel, term = "sx(v012)", which = "var-samples", acf = TRUE)
par(mar=c(1,1,1,1))
#plot(imodel, which = "max-acf")
<- samples(imodel, term = "linear-samples")
zs par(mar=c(1,1,1,1))
plot(zs)
## extract model residuals and plot histogram
#par(mar=c(1,1,1,1))
hist(residuals(imodel))
Plot structured spatial effects maps etc
plot(imodel, term = "sx(ID_1):mrf", map = zw_bnd_prov, main = "Structured spatial effect")
- Plot unstructured spatial effects
plot(imodel, term = "sx(ID_1):re", map = zw_bnd_prov, main = "Unstructured spatial effect")
- Plot both structured and unstructured effects
plot(imodel, term = "sx(ID_1):total", map = zw_bnd_prov, main = "Total spatial effect", digits = 4)
- Map the fitted spatial term only
$spatial <- fitted(imodel, term = "sx(ID_1):mrf")[, "Mean"]
zw_shp1
# Create the map using ggplot2
ggplot(zw_shp1) +
geom_sf(aes(fill = spatial), color = "white", size = 0.3) +
scale_fill_gradient2(low = "blue", mid = "white", high = "red",
midpoint = 0,
name = "Spatial Effect") +
labs(title = "Fitted Spatial Random Effects",
subtitle = "Bayesian MRF Model Results") +
theme_minimal()
C. Logistic Regression with Simultaneous Autoregressive (SAR) Structure
We assume:
\[ Y_i \sim \text{Binomial}(n_i, p_i) \]
and model the log-odds with a spatial lag:
\[ \text{logit}(\mathbf{p}) = \mathbf{X} \boldsymbol{\beta} + \rho \mathbf{W} \, \text{logit}(\mathbf{p}) + \boldsymbol{\varepsilon} \]
<- my_data1 %>%
agg_data group_by(ID_1) %>%
summarise(
cases = sum(hiv_status, na.rm = TRUE),
total = n(),
prevalence = cases / total,
logit_prev = ifelse(prevalence > 0 & prevalence < 1,
log(prevalence / (1 - prevalence)),
NA), # Avoid Inf; handle 0/1 with small epsilon if needed
mean_age = mean(v012, na.rm = TRUE),
prop_poorer = mean(v190 == "poorer", na.rm = TRUE),
prop_middle = mean(v190 == "middle", na.rm = TRUE),
prop_richer = mean(v190 == "richer", na.rm = TRUE),
prop_richest = mean(v190 == "richest", na.rm = TRUE),
prop_primary = mean(v106 == "primary", na.rm = TRUE),
prop_secondary = mean(v106 == "secondary", na.rm = TRUE),
prop_higher = mean(v106 == "higher", na.rm = TRUE)
%>%
) filter(!is.na(logit_prev)) # Remove areas with extreme prevalence
<- st_read("Zim_new1.shp")
poly_sf #> Reading layer `Zim_new1' from data source
#> `C:\Users\VUSI\Downloads\GEO_GROUP\Zim_new1.shp' using driver `ESRI Shapefile'
#> Simple feature collection with 91 features and 15 fields
#> Geometry type: POLYGON
#> Dimension: XY
#> Bounding box: xmin: 25.2376 ymin: -22.42105 xmax: 33.06718 ymax: -15.60714
#> Geodetic CRS: WGS 84
<- poly2nb(poly_sf, queen = T) # Queen contiguity
sagra_nb
# Create row-standardized weights (W style, like MRF neighborhood averaging)
<- nb2listw(sagra_nb, style = "W", zero.policy = TRUE) listw
SAR Model
The spatial autoregressive lag model (SAR) shows strong evidence of spatial dependence in HIV prevalence across districts, with a significant spatial autocorrelation parameter (ρ = 0.545, p < 0.001). This indicates that HIV prevalence in one district is strongly influenced by prevalence in neighboring districts. Among covariates, mean age is positively associated with higher HIV prevalence (β = 0.070, p = 0.048), suggesting that older populations experience greater burden. Socioeconomic indicators are mixed: the poorer, middle, and richer categories are not individually significant, though the middle (p = 0.078) and richer (p = 0.096) groups show suggestive positive associations. Educational attainment is a strong predictor: higher proportions of individuals with primary (β = 6.66, p = 0.014), secondary (β = 6.23, p = 0.014), and higher education (β = 5.90, p = 0.032) are significantly associated with higher HIV prevalence, highlighting complex education-related risk dynamics. Model fit is good, with lower AIC (121.9) compared to the equivalent non-spatial model (AIC = 142.6), and no residual spatial autocorrelation (LM test p = 0.18), confirming that the spatial lag specification adequately accounts for spatial clustering. Overall, the SAR model emphasizes that HIV prevalence is spatially structured and shaped both by demographic composition and district-level educational profiles.
<- lagsarlm(logit_prev ~ mean_age + prop_poorer + prop_middle + prop_richer + prop_richest +
sar_model + prop_secondary + prop_higher,
prop_primary data = agg_data, listw = listw, method = "eigen", # ML estimation
zero.policy = TRUE) # Handle disconnected components if any
summary(sar_model)
#>
#> Call:lagsarlm(formula = logit_prev ~ mean_age + prop_poorer + prop_middle +
#> prop_richer + prop_richest + prop_primary + prop_secondary +
#> prop_higher, data = agg_data, listw = listw, method = "eigen",
#> zero.policy = TRUE)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -0.9705388 -0.2320103 -0.0087743 0.2804740 0.8823225
#>
#> Type: lag
#> Coefficients: (asymptotic standard errors)
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) -9.231316 2.856139 -3.2321 0.001229
#> mean_age 0.070108 0.035468 1.9767 0.048080
#> prop_poorer -0.521346 0.725753 -0.7184 0.472540
#> prop_middle 0.863969 0.489990 1.7632 0.077860
#> prop_richer 0.674169 0.404578 1.6664 0.095644
#> prop_richest 0.140786 0.500307 0.2814 0.778404
#> prop_primary 6.655751 2.709606 2.4564 0.014035
#> prop_secondary 6.231964 2.538164 2.4553 0.014077
#> prop_higher 5.898761 2.747626 2.1469 0.031805
#>
#> Rho: 0.54472, LR test value: 22.613, p-value: 1.9816e-06
#> Asymptotic standard error: 0.094786
#> z-value: 5.7469, p-value: 9.0905e-09
#> Wald statistic: 33.027, p-value: 9.0905e-09
#>
#> Log likelihood: -49.97736 for lag model
#> ML residual variance (sigma squared): 0.16336, (sigma: 0.40418)
#> Number of observations: 91
#> Number of parameters estimated: 11
#> AIC: 121.95, (AIC for lm: 142.57)
#> LM test for residual autocorrelation
#> test value: 1.7905, p-value: 0.18087
# Residual Moran's I (should be non-significant if SAR accounts for dependence)
moran.test(sar_model$residuals, listw = listw)
#>
#> Moran I test under randomisation
#>
#> data: sar_model$residuals
#> weights: listw
#>
#> Moran I statistic standard deviate = 0.77365, p-value = 0.2196
#> alternative hypothesis: greater
#> sample estimates:
#> Moran I statistic Expectation Variance
#> 0.050337639 -0.011111111 0.006308629
<- impacts(sar_model, listw = listw, R = 1000) # sar_model is your lagsarlm object
imp summary(imp, zstats = TRUE)
#> Impact measures (lag, exact):
#> Direct Indirect Total
#> mean_age 0.07582988 0.0781596 0.1539895
#> prop_poorer -0.56389806 -0.5812227 -1.1451208
#> prop_middle 0.93448544 0.9631957 1.8976811
#> prop_richer 0.72919458 0.7515977 1.4807923
#> prop_richest 0.15227687 0.1569553 0.3092322
#> prop_primary 7.19899064 7.4201657 14.6191564
#> prop_secondary 6.74061457 6.9477069 13.6883215
#> prop_higher 6.38021536 6.5762352 12.9564505
#> ========================================================
#> Simulation results ( variance matrix):
#> Direct:
#>
#> Iterations = 1:1000
#> Thinning interval = 1
#> Number of chains = 1
#> Sample size per chain = 1000
#>
#> 1. Empirical mean and standard deviation for each variable,
#> plus standard error of the mean:
#>
#> Mean SD Naive SE Time-series SE
#> mean_age 0.07741 0.04013 0.001269 0.001269
#> prop_poorer -0.60439 0.80714 0.025524 0.025524
#> prop_middle 0.93920 0.52533 0.016612 0.016612
#> prop_richer 0.71036 0.43970 0.013905 0.013905
#> prop_richest 0.12305 0.53638 0.016962 0.016962
#> prop_primary 7.09452 2.92757 0.092578 0.092578
#> prop_secondary 6.67277 2.72618 0.086209 0.086209
#> prop_higher 6.34025 2.90873 0.091982 0.091982
#>
#> 2. Quantiles for each variable:
#>
#> 2.5% 25% 50% 75% 97.5%
#> mean_age 0.0008837 0.05198 0.07663 0.10424 0.1533
#> prop_poorer -2.1624327 -1.14856 -0.60285 -0.05055 0.9267
#> prop_middle -0.1227156 0.59786 0.94599 1.29041 1.9560
#> prop_richer -0.1540754 0.41841 0.71311 1.01940 1.5286
#> prop_richest -0.8908204 -0.23380 0.11701 0.48990 1.1654
#> prop_primary 1.4219087 5.18498 7.13421 9.01260 12.7842
#> prop_secondary 1.3876210 4.90954 6.75284 8.49587 11.9877
#> prop_higher 0.6865110 4.37863 6.37405 8.28744 12.0347
#>
#> ========================================================
#> Indirect:
#>
#> Iterations = 1:1000
#> Thinning interval = 1
#> Number of chains = 1
#> Sample size per chain = 1000
#>
#> 1. Empirical mean and standard deviation for each variable,
#> plus standard error of the mean:
#>
#> Mean SD Naive SE Time-series SE
#> mean_age 0.08531 0.06614 0.002092 0.002278
#> prop_poorer -0.69325 1.02014 0.032260 0.034237
#> prop_middle 1.01237 0.71116 0.022489 0.022489
#> prop_richer 0.76022 0.58196 0.018403 0.018684
#> prop_richest 0.11856 0.63071 0.019945 0.019945
#> prop_primary 7.85831 5.42325 0.171498 0.179716
#> prop_secondary 7.37572 5.02491 0.158902 0.158902
#> prop_higher 7.03565 5.13777 0.162471 0.171834
#>
#> 2. Quantiles for each variable:
#>
#> 2.5% 25% 50% 75% 97.5%
#> mean_age 0.0009289 0.04651 0.0737 0.11260 0.2250
#> prop_poorer -2.9407086 -1.25393 -0.5411 -0.05492 0.9895
#> prop_middle -0.1245881 0.56182 0.9088 1.35861 2.6610
#> prop_richer -0.1874787 0.38543 0.6734 1.06352 1.9500
#> prop_richest -1.1593452 -0.23385 0.1009 0.47758 1.3908
#> prop_primary 1.2801237 4.60197 6.9787 10.07040 18.5778
#> prop_secondary 1.3269235 4.36443 6.5615 9.40112 18.1789
#> prop_higher 0.7464868 3.81616 6.0962 9.00596 16.9389
#>
#> ========================================================
#> Total:
#>
#> Iterations = 1:1000
#> Thinning interval = 1
#> Number of chains = 1
#> Sample size per chain = 1000
#>
#> 1. Empirical mean and standard deviation for each variable,
#> plus standard error of the mean:
#>
#> Mean SD Naive SE Time-series SE
#> mean_age 0.1627 0.09974 0.003154 0.003327
#> prop_poorer -1.2976 1.77909 0.056260 0.056260
#> prop_middle 1.9516 1.17144 0.037044 0.037044
#> prop_richer 1.4706 0.97010 0.030677 0.030346
#> prop_richest 0.2416 1.14711 0.036275 0.036275
#> prop_primary 14.9528 7.75029 0.245086 0.245086
#> prop_secondary 14.0485 7.18600 0.227241 0.227241
#> prop_higher 13.3759 7.51341 0.237595 0.248859
#>
#> 2. Quantiles for each variable:
#>
#> 2.5% 25% 50% 75% 97.5%
#> mean_age 0.001992 0.09995 0.1556 0.2154 0.3716
#> prop_poorer -4.876250 -2.48444 -1.1658 -0.1089 1.8740
#> prop_middle -0.250249 1.18051 1.8856 2.6304 4.6139
#> prop_richer -0.353037 0.83561 1.4319 2.0867 3.3595
#> prop_richest -1.958004 -0.48049 0.2247 0.9750 2.4730
#> prop_primary 2.791412 10.16210 14.1174 19.0338 30.2829
#> prop_secondary 3.219761 9.67190 13.3844 17.8587 29.0660
#> prop_higher 1.697840 8.38692 12.7578 17.3363 27.9132
#>
#> ========================================================
#> Simulated standard errors
#> Direct Indirect Total
#> mean_age 0.0401259 0.06614194 0.09973873
#> prop_poorer 0.8071399 1.02013692 1.77908722
#> prop_middle 0.5253266 0.71115639 1.17144453
#> prop_richer 0.4397032 0.58195671 0.97009991
#> prop_richest 0.5363839 0.63071394 1.14711275
#> prop_primary 2.9275692 5.42324751 7.75029210
#> prop_secondary 2.7261810 5.02490840 7.18600078
#> prop_higher 2.9087348 5.13776978 7.51340571
#>
#> Simulated z-values:
#> Direct Indirect Total
#> mean_age 1.9291688 1.2898360 1.6314814
#> prop_poorer -0.7488000 -0.6795619 -0.7293810
#> prop_middle 1.7878394 1.4235479 1.6659473
#> prop_richer 1.6155418 1.3063176 1.5159048
#> prop_richest 0.2294082 0.1879791 0.2106261
#> prop_primary 2.4233476 1.4490040 1.9293241
#> prop_secondary 2.4476609 1.4678317 1.9549798
#> prop_higher 2.1797288 1.3693970 1.7802711
#>
#> Simulated p-values:
#> Direct Indirect Total
#> mean_age 0.053710 0.19711 0.102789
#> prop_poorer 0.453978 0.49678 0.465769
#> prop_middle 0.073802 0.15458 0.095724
#> prop_richer 0.106193 0.19144 0.129543
#> prop_richest 0.818552 0.85089 0.833179
#> prop_primary 0.015378 0.14734 0.053691
#> prop_secondary 0.014379 0.14215 0.050585
#> prop_higher 0.029278 0.17088 0.075032
plot(imp) # distribution of direct, indirect, total impacts
<- fitted(sar_model)
fitted_vals plot(agg_data$logit_prev, fitted_vals,
xlab = "Observed logit(prevalence)",
ylab = "Fitted values",
main = "Observed vs Fitted (SAR model)")
abline(0, 1, col = "red")
<- residuals(sar_model)
resid_vals
# Histogram + QQ plot
par(mfrow = c(1,2))
hist(resid_vals, main = "Residuals Histogram", xlab = "Residuals")
qqnorm(resid_vals); qqline(resid_vals, col="red")
par(mfrow = c(1,1))
# Moran’s I on residuals
moran.test(resid_vals, listw = listw, zero.policy = TRUE)
#>
#> Moran I test under randomisation
#>
#> data: resid_vals
#> weights: listw
#>
#> Moran I statistic standard deviate = 0.77365, p-value = 0.2196
#> alternative hypothesis: greater
#> sample estimates:
#> Moran I statistic Expectation Variance
#> 0.050337639 -0.011111111 0.006308629
8. Model results comparisons between any 3 modelling approaches done in step 6. Display results on a single table within your main submission document and attach the supporting documentation
The three models—logistic regression, CAR, and SAR—offer complementary insights into the determinants and spatial dynamics of HIV risk. The logistic regression model provides a baseline, showing strong age effects, modest educational influences, and mixed wealth associations, but it assumes independence across geographic units. The CAR model (BayesX MRF specification) extends this by explicitly incorporating structured spatial dependence across districts, capturing residual clustering that cannot be explained by individual-level covariates; this reveals stronger spatial heterogeneity in HIV prevalence and smooth variation linked to geography. In contrast, the SAR model (spatial lag regression) emphasizes the spillover effect of HIV risk between neighboring districts through a significant spatial autoregressive parameter (ρ ≈ 0.54, p < 0.001), suggesting that prevalence in one area is strongly influenced by that in adjacent areas. While the logistic regression highlights individual-level socioeconomic and demographic predictors, the CAR model adjusts for latent spatial correlation, and the SAR model quantifies direct spatial diffusion processes. Together, they show that both individual risk factors and spatial dependence matter, with the spatial models improving fit and revealing patterns that a standard logistic regression would overlook.