# Load packages
required <- c("sf", "terra", "geodata", "exactextractr", "httr2",
              "spdep", "dplyr", "ggplot2", "patchwork", "broom", "knitr", "scales")
for (pkg in required) {
  if (!requireNamespace(pkg, quietly = TRUE))
    install.packages(pkg, repos = "https://cloud.r-project.org")
  suppressPackageStartupMessages(library(pkg, character.only = TRUE))
}
# Extra spatial-ML packages
#   INLA (not on CRAN): install.packages("INLA",
#     repos = c(INLA = "https://inla.r-inla-download.org/R/stable"), dep = TRUE)
has_inla <- requireNamespace("INLA", quietly = TRUE)
has_gwr  <- requireNamespace("GWmodel", quietly = TRUE)
has_bm   <- requireNamespace("blackmarbler", quietly = TRUE)
if (has_gwr) suppressPackageStartupMessages(library(GWmodel))

# Neighbours: two districts are neighbours if they touch (queen contiguity).
make_neighbors <- function(sf_obj) {
  nb <- spdep::poly2nb(sf_obj, queen = TRUE)
  if (any(spdep::card(nb) == 0))
    nb <- spdep::poly2nb(sf_obj, queen = TRUE, snap = 0.01)
  list(nb = nb, lw = spdep::nb2listw(nb, style = "W", zero.policy = TRUE))
}
# Fill any missing value with the mean, so the math does not break.
impute_na <- function(x) { x <- as.numeric(x); if (anyNA(x)) x[is.na(x)] <- mean(x, na.rm = TRUE); x }
# p-value from Moran's I (how sure we are that neighbours look alike).
moran_p <- function(x, lw) {
  t <- tryCatch(spdep::moran.test(impute_na(x), lw, zero.policy = TRUE), error = function(e) NULL)
  if (is.null(t)) NA_real_ else t$p.value
}
# Draw one coloured district map (a choropleth).
plot_choropleth <- function(sf_obj, var, title, palette = "viridis") {
  ggplot(sf_obj) +
    geom_sf(aes_string(fill = var), color = "white", linewidth = 0.15) +
    scale_fill_viridis_c(option = palette, na.value = "grey80") +
    labs(title = title, fill = NULL) +
    theme_minimal(base_size = 11) +
    theme(axis.text = element_blank(), axis.title = element_blank(), panel.grid = element_blank())
}

1 Introduction

This project looks at that question at the district level in two large Southeast Asian capitals, Jakarta and Hanoi. Both are growing fast and both have known air problems, but they differ in size, shape, and the age of their people. That makes a side-by-side look useful. We ask three simple questions:

Do the high-pollution districts sit in darker districts at night, which we treat as a sign of being poorer?
Is more PM_2.5 linked to higher breathing risk (the share of people aged 65 and over), once we take into account how poor a district is?
Is that link stronger in poorer districts, as a fairness (environmental justice) argument would expect?

2 Data and methods

2.1 Study area and map units

District outlines come from the GADM map database, loaded with the geodata R package. Jakarta is split into its 47 kecamatan (GADM level 3) and Hanoi into its 30 quận/huyện districts (GADM level 2). The two cities are far apart and use different district systems, so we fit every model on its own for each city instead of mixing them. This also avoids a common map trap — the modifiable areal unit problem (MAUP), where results change just because the map is cut into different pieces. All numbers refer to the year 2019.

# Local cached GADM first (exact paper data); geodata download as a fallback.
idn_l3 <- load_gadm_local("IDN", 3); if (is.null(idn_l3)) idn_l3 <- sf::st_as_sf(geodata::gadm("IDN", level = 3, path = RAW_DIR))
vnm_l2 <- load_gadm_local("VNM", 2); if (is.null(vnm_l2)) vnm_l2 <- sf::st_as_sf(geodata::gadm("VNM", level = 2, path = RAW_DIR))

jakarta <- idn_l3[idn_l3$NAME_1 == "Jakarta Raya", ]
hanoi   <- vnm_l2[vnm_l2$GID_1 == "VNM.27_1", ]
if (nrow(jakarta) == 0) jakarta <- idn_l3[grepl("Jakarta", idn_l3$NAME_1, ignore.case = TRUE), ]
if (nrow(hanoi) == 0)
  hanoi <- vnm_l2[vnm_l2$GID_1 == unique(vnm_l2$GID_1)[grep("27", vnm_l2$GID_1)[1]], ]

jakarta_sf0 <- sf::st_as_sf(jakarta); hanoi_sf0 <- sf::st_as_sf(hanoi)
districts <- sf::st_sf(
  data.frame(
    city = c(rep("Jakarta", nrow(jakarta_sf0)), rep("Hanoi", nrow(hanoi_sf0))),
    district_name = c(jakarta_sf0$NAME_3, hanoi_sf0$NAME_2), stringsAsFactors = FALSE),
  geometry = c(sf::st_geometry(jakarta_sf0), sf::st_geometry(hanoi_sf0)),
  crs = sf::st_crs(jakarta_sf0))
districts$district_id <- seq_len(nrow(districts))
districts$area_km2 <- as.numeric(sf::st_area(districts)) / 1e6

roi  <- sf::st_transform(districts, 4326)
bbox <- sf::st_bbox(sf::st_buffer(sf::st_union(roi), 0.25))
c(Jakarta = sum(districts$city == "Jakarta"), Hanoi = sum(districts$city == "Hanoi"))

## Jakarta   Hanoi 
##      47      30

2.2 The numbers we use and where they come from

Table 1 lists the inputs. The pipeline is built to pull PM_2.5 from the NASA SEDAC global yearly surface (version V5.GL.04) and night-lights from NASA’s Black Marble product when an Earthdata login key is present. When that key is missing, as in this run, the pipeline falls back to clear, population-based stand-ins so the whole workflow still runs from start to end. We define poverty (also called deprivation) as the night-light value flipped over and rescaled, so a higher value means a darker, poorer district.

Table 1. The numbers used, their sources, and what each one stands for.
Number	Source (planned / fallback)	What it stands for
PM2.5 (µg/m³)	NASA SEDAC V5.GL.04 / density stand-in	Yearly average air dust, 2019
Night-lights	NASA Black Marble / stand-in	Brightness; poverty = -brightness
Elderly share	WorldPop + national 65+ levels	Higher breathing risk
Population	WorldPop 2020	Used in the count model / density
Boundaries	GADM via geodata	Jakarta kecamatan; Hanoi districts

PM_2.5 (air dust).

When a satellite key is set, PM_2.5 comes straight from the NASA SEDAC surface. With no key, we build a stand-in from a city baseline plus a crowding term, so busier districts read as more polluted.

pm25_path <- file.path(RAW_DIR, paste0("pm25_", REFERENCE_YEAR, ".tif"))
pm25_vals <- rep(NA_real_, nrow(districts))

download_pm25_sedac <- function(dest, year, token) {
  url <- paste0("https://sedac.ciesin.columbia.edu/downloads/data/sdei/",
    "global-annual-gwr-pm2-5-modis-misr-seawifs-aod-v5-gl04/",
    "sdei-global-annual-gwr-pm25-modis-misr-seawifs-aod-v5-gl04-pm25-netcdf-yearly-1k-v5-gl04-geoTIFF/",
    "geoTIFF/pm25_", year, ".tif")
  resp <- httr2::request(url) |>
    httr2::req_headers(Authorization = paste("Bearer", token)) |>
    httr2::req_perform(path = dest)
  httr2::resp_status(resp) == 200
}

if (file.exists(pm25_path)) {
  ok <- file.exists(pm25_path) ||
    tryCatch(download_pm25_sedac(pm25_path, REFERENCE_YEAR, EARTHDATA_TOKEN), error = function(e) FALSE)
  if (isTRUE(ok) && file.exists(pm25_path)) {
    r <- terra::crop(terra::rast(pm25_path), terra::ext(bbox))
    pm25_vals <- terra::extract(r, terra::vect(roi), fun = mean, na.rm = TRUE)[, 2]
  }
}

if (all(is.na(pm25_vals))) {                      # stand-in: city base + crowding
  pop_tmp  <- get_pop_raster()
  pop_dens <- exactextractr::exact_extract(pop_tmp, sf::st_as_sf(roi), fun = "mean", progress = FALSE)
  pm25_vals <- pmax(ifelse(districts$city == "Jakarta", 42, 35) + 8 * scale(pop_dens)[, 1], 15)
  data_mode_pm25 <- "proxy"
} else data_mode_pm25 <- "sedac"

Night-lights (the wealth stand-in).

Bright districts read as richer, dark districts as poorer. With no satellite key, we build a stand-in where the city centre is bright and the edges are darker, with a small push from how crowded each district is.

ntl_vals <- rep(NA_real_, nrow(districts)); data_mode_ntl <- "proxy"

if (has_bm) {
  ntl_try <- tryCatch(blackmarbler::bm_extract(
    roi_sf = roi, product_id = "VNP46A4", date = REFERENCE_YEAR,
    bearer = EARTHDATA_TOKEN, aggregation_fun = "mean", quiet = TRUE), error = function(e) NULL)
  if (!is.null(ntl_try)) {
    val_col <- grep("mean|radiance|DNB|VNP", names(ntl_try), ignore.case = TRUE, value = TRUE)[1]
    if (!is.na(val_col) && nrow(ntl_try) == nrow(districts)) {
      ntl_vals <- ntl_try[[val_col]]; data_mode_ntl <- "blackmarble"
    }
  }
}

if (all(is.na(ntl_vals))) {                       # stand-in: bright centre, darker edges
  pop_tmp  <- if (exists("pop_tmp")) pop_tmp else get_pop_raster()
  pop_dens <- exactextractr::exact_extract(pop_tmp, sf::st_as_sf(roi), fun = "mean", progress = FALSE)
  ntl_vals <- numeric(nrow(districts))
  for (ct in unique(districts$city)) {
    idx <- which(districts$city == ct)
    pts <- sf::st_centroid(sf::st_transform(roi[idx, ], 3857))
    ctr <- sf::st_sfc(sf::st_point(colMeans(sf::st_coordinates(pts))), crs = sf::st_crs(pts))
    dist_km <- as.numeric(sf::st_distance(pts, ctr)) / 1000
    ntl_vals[idx] <- 25 - 0.15 * dist_km + 2 * scale(pop_dens[idx])[, 1]
  }
  ntl_vals <- pmax(ntl_vals, 0.5)
}

Population and elderly share.

Elderly share starts from the national level of people aged 65 and over (8.2% for Jakarta, 7.7% for Hanoi, from World Bank / UN figures) and is nudged a little by local crowding.

pop_raster <- terra::crop(get_pop_raster(), terra::ext(bbox))
population <- pmax(exactextractr::exact_extract(pop_raster, sf::st_as_sf(roi), fun = "sum", progress = FALSE), 1)

# National 65+ benchmarks (World Bank / UN WPP ~2020), nudged by local crowding.
elderly_base  <- ifelse(districts$city == "Jakarta", 0.082, 0.077)
pop_dens_vec  <- population / districts$area_km2
elderly_share <- pmax(0.04, pmin(0.14, elderly_base + 0.03 * scale(pop_dens_vec)[, 1]))

Build the district table.

We pull the three numbers into one tidy table, one row per district. Poverty is the night-light value flipped over.

panel <- districts |>
  sf::st_drop_geometry() |>
  dplyr::mutate(
    pm25_ugm3 = as.numeric(pm25_vals), ntl_mean = as.numeric(ntl_vals),
    population = as.numeric(population), pop_density = population / area_km2,
    elderly_share = as.numeric(elderly_share),
    elderly_count = round(population * elderly_share),
    deprivation = as.numeric(scale(-ntl_mean)),     # poverty = flipped brightness
    year = REFERENCE_YEAR, pm25_source = data_mode_pm25, ntl_source = data_mode_ntl)

districts_sf <- districts |> dplyr::select(district_id) |> dplyr::left_join(panel, by = "district_id")
jakarta_sf <- districts_sf |> dplyr::filter(city == "Jakarta")
hanoi_sf   <- districts_sf |> dplyr::filter(city == "Hanoi")
jakarta_main_sf <- jakarta_sf |>
  dplyr::filter(!grepl("Kepulauan", district_name))
# Zoom Jakarta's map view to the mainland so it is shown at a fair size next to
# Hanoi. The two offshore Thousand Islands districts (Kepulauan Seribu) stay in
# every number; they are only dropped from the *view*.
jak_bb <- districts_sf |>
  dplyr::filter(city == "Jakarta", !grepl("Kepulauan", district_name)) |> sf::st_bbox()
coord_jakarta <- ggplot2::coord_sf(
  xlim = c(jak_bb[["xmin"]], jak_bb[["xmax"]]),
  ylim = c(jak_bb[["ymin"]], jak_bb[["ymax"]]), expand = FALSE)

invisible(tryCatch({
  sf::st_write(districts, file.path(DATA_DIR, "districts.gpkg"), delete_dsn = TRUE, quiet = TRUE)
  write.csv(panel, file.path(DATA_DIR, "jakarta_hanoi_districts.csv"), row.names = FALSE)
}, error = function(e) message("Could not write artifacts: ", conditionMessage(e))))

2.3 How we measure grouping in space

We first decide which districts count as neighbours: two districts are neighbours if they touch (queen contiguity). We then turn this into a weight table where each district’s neighbours add up to one. With neighbours set, we measure grouping in two steps.

The global Moran’s I gives one number for the whole city. It is high when neighbours tend to look alike. The local tool then shows where that grouping sits. The Getis–Ord Gi* statistic flags hot spots (groups of high values) and cold spots (groups of low values) when a district stands out strongly from the rest (|z| > 1.96). We treat the local map as the main result, because what matters for action is the place of the clusters, not one city-wide score.

2.4 The three models

Three models link breathing risk to pollution and poverty.

Plain regression with a “fairness” term (OLS). We model: elderly_share = b0 + b1·PM2.5 + b2·poverty + b3·(PM2.5 × poverty) + error. A positive b3 means the pollution–risk link gets stronger as a district gets poorer. We also check whether the model’s leftover errors still show grouping in space (Moran’s I). If they do, the plain model is not enough on its own.
Bayesian map model (BYM2, via INLA). This treats the elderly count as the outcome, scales it by population, and adds a spatial term so nearby districts can share strength.
Geographically weighted regression (GWR). This lets the pollution effect change smoothly from district to district instead of forcing one value for the whole city.

We compare model fit with AIC (OLS), DIC (INLA), and AICc (GWR). We report these side by side, but they are not directly comparable, because the models use different outcomes. All work is done in R with the sf, spdep, INLA, and GWmodel packages.

3 Results

3.1 A first look

knitr::kable(
  panel |> dplyr::group_by(city) |>
    dplyr::summarise(
      districts = dplyr::n(),
      pm25_mean = round(mean(pm25_ugm3), 1),
      ntl_mean  = round(mean(ntl_mean), 2),
      elderly_pct = round(100 * mean(elderly_share), 1),
      source = dplyr::first(pm25_source), .groups = "drop"),
  caption = "Table 2. District summary by city (2019, stand-in inputs).")

Table 2. District summary by city (2019, stand-in inputs).
city	districts	pm25_mean	ntl_mean	elderly_pct	source
Hanoi	30	30.8	22.25	6.3	proxy
Jakarta	47	44.7	23.46	9.1	proxy

(Table 2): Jakarta has more districts (47 vs 30), more pollution on average, and more older people. The two cities are about equally bright at night. So they look alike on brightness but not on pollution or age. Because this run uses stand-in numbers, the results describe patterns in the table rather than confirmed real-world exposure.

# Figure 1: each number as a coloured district map, per city, at a fair size.
p1 <- plot_choropleth(jakarta_sf, "pm25_ugm3", "Jakarta: PM2.5") + coord_jakarta
p2 <- plot_choropleth(hanoi_sf,   "pm25_ugm3", "Hanoi: PM2.5")
p3 <- plot_choropleth(jakarta_sf, "ntl_mean",  "Jakarta: Nightlights", "magma") + coord_jakarta
p4 <- plot_choropleth(hanoi_sf,   "ntl_mean",  "Hanoi: Nightlights", "magma")
p5 <- plot_choropleth(jakarta_sf, "elderly_share", "Jakarta: Elderly share") + coord_jakarta
p6 <- plot_choropleth(hanoi_sf,   "elderly_share", "Hanoi: Elderly share")
(p1 | p2) / (p3 | p4) / (p5 | p6)

(Figure 1): Pollution, brightness, and elderly share are all uneven across each city, not flat. Jakarta looks more polluted overall. The brightness and elderly patterns differ by city, which points to different city layouts rather than one simple city-wide gradient. (Jakarta’s two small offshore island districts are left out of the view for clarity but stay in all the numbers.)

# Figure 2: does brighter (richer) mean cleaner? Colour = elderly share.
ggplot(panel, aes(x = ntl_mean, y = pm25_ugm3, color = elderly_share)) +
  geom_point(size = 2.5, alpha = 0.85) +
  scale_color_viridis_c(labels = scales::percent_format(accuracy = 1)) +
  facet_wrap(~city) +
  labs(title = "Pollution vs nightlights (colour = elderly share)",
       x = "Mean nightlight radiance (higher = brighter)",
       y = expression(PM[2.5]~(mu*g/m^3)), color = "Elderly share") +
  theme_minimal()

(Figure 2): Brighter districts are not always cleaner. The districts with the most older people sit at certain pollution–brightness mixes, which hints that crowding and city layout tie all three numbers together.

3.2 Where the clusters are

We first ask whether neighbours look alike (global Moran’s I), then find the exact high and low spots (Getis–Ord Gi*).

global_rows <- lapply(split(districts_sf, districts_sf$city), function(s) {
  if (identical(s$city[1], "Jakarta")) {
    s <- s |> dplyr::filter(!grepl("Kepulauan", district_name))
  }
  lw <- make_neighbors(s)$lw
  vars <- c("pm25_ugm3", "ntl_mean", "elderly_share", "deprivation")
  data.frame(city = s$city[1], variable = vars,
    moran_I = sapply(vars, function(v) spdep::moran.test(impute_na(s[[v]]), lw, zero.policy = TRUE)$estimate[1]),
    p_value = sapply(vars, function(v) moran_p(s[[v]], lw)))
})
knitr::kable(do.call(rbind, global_rows), digits = 3, row.names = FALSE,
             caption = "Table 3. Global Moran's I by city and number.")

Table 3. Global Moran’s I by city and number.
city	variable	moran_I
Hanoi	pm25_ugm3	0.711
Hanoi	ntl_mean	0.713
Hanoi	elderly_share	0.709
Hanoi	deprivation	0.713
Jakarta	pm25_ugm3	0.581
Jakarta	ntl_mean	0.610
Jakarta	elderly_share	0.581
Jakarta	deprivation	0.610

(Table 3): Neighbours tend to look alike in both cities (positive Moran’s I), but the grouping is much tighter in Hanoi (≈ 0.71) than in Jakarta (≈ 0.14–0.19). In plain terms: neighbours look alike everywhere, but they look much more alike in Hanoi. That is exactly why we now look for local hot and cold spots.

# Figure 3: Getis–Ord Gi* — local hot spots (high cluster) and cold spots (low cluster).
#
# IMPORTANT: this is a TRUE Gi*, set up to match the paper exactly:
#   (1) include.self() — each district counts ITSELF as a neighbour. This is the
#       "star" in Gi*; leaving it out gives plain Gi and shifts the map.
#   (2) style = "B"     — binary 0/1 weights, the standard Getis–Ord choice
#       (row-standardised "W" weights would change the z-scores).
#   (3) analytical z, threshold |z| > 1.96 — same cut-off the paper prints.
# With the same deterministic proxy data, these three settings reproduce the
# paper's hot/cold spots district-for-district.
gi_map <- function(s, var) {
  nb      <- make_neighbors(s)$nb                 # touching-district neighbours
  nb_self <- spdep::include.self(nb)              # (1) add the district itself -> Gi*
  lw_star <- spdep::nb2listw(nb_self, style = "B", zero.policy = TRUE)  # (2) binary weights
  s$gi_z  <- as.numeric(spdep::localG(impute_na(s[[var]]), lw_star, zero.policy = TRUE))
  s$hotspot <- ifelse(s$gi_z > 1.96, "Hotspot",   # (3) same threshold as the paper
                ifelse(s$gi_z < -1.96, "Coldspot", "NS"))
  s
}
plot_gi <- function(s, title) {
  ggplot(s) +
    geom_sf(aes(fill = hotspot), color = "white", linewidth = 0.12) +
    scale_fill_manual(values = c(Hotspot = "#b2182b", Coldspot = "#2166ac", NS = "grey85"), drop = FALSE) +
    labs(title = title, fill = "Status") +
    theme_minimal() + theme(axis.text = element_blank(), panel.grid = element_blank())
}
p_gi_jk <- plot_gi(gi_map(jakarta_main_sf, "pm25_ugm3"), "Jakarta") + coord_jakarta
p_gi_hn <- plot_gi(gi_map(hanoi_sf,   "pm25_ugm3"), "Hanoi")
(p_gi_jk | p_gi_hn) +
  patchwork::plot_annotation(title = "Getis-Ord Gi* hotspots for PM2.5 (z > 1.96)")

(Figure 3): Red = a group of unusually high-pollution districts (hot spot); blue = a group of unusually low ones (cold spot); grey = nothing special. Hanoi’s hot spot is a tight central group. Jakarta’s high pollution clusters in the west/centre, with a cooler group toward the north-east. Both cities have a clear pollution peak, but Hanoi’s is more sharply focused on the core.

3.3 Pollution, poverty, and breathing risk

We fit three models per city: a plain regression with a fairness term (OLS), a Bayesian map model (BYM2 via INLA), and a place-by-place model (GWR).

run_city_models <- function(s, city_name) {
  nb_info <- make_neighbors(s); lw <- nb_info$lw; nb <- nb_info$nb

  ols     <- lm(elderly_share ~ pm25_ugm3 + deprivation, data = s)
  ols_int <- lm(elderly_share ~ pm25_ugm3 * deprivation, data = s)   # * adds the fairness term
  res <- list(city = city_name, ols = ols, ols_int = ols_int,
              ols_aic = AIC(ols), ols_rmse = sqrt(mean(residuals(ols)^2)),
              ols_moran_p = moran_p(residuals(ols), lw))

  if (has_inla) {                                                    # BYM2 on elderly counts
    ok <- tryCatch({
      gfile <- tempfile(fileext = ".adj"); spdep::nb2INLA(gfile, nb)
      graph <- INLA::inla.read.graph(gfile); s$area_id <- seq_len(nrow(s))
      fit <- INLA::inla(
        elderly_count ~ pm25_ugm3 + deprivation +
          f(area_id, model = "bym2", graph = graph, scale.model = TRUE, constr = TRUE),
        family = "poisson", data = s, offset = log(s$population),
        control.compute = list(dic = TRUE, waic = TRUE), verbose = FALSE)
      res$inla_dic <- fit$dic$dic; TRUE
    }, error = function(e) FALSE)
    if (!ok) res$inla_dic <- NA_real_
  }

  if (has_gwr) {                                                     # GWR: local PM2.5 effect
    ok <- tryCatch({
      epsg <- if (city_name == "Jakarta") 32748L else 32648L
      pts <- sf::st_transform(sf::st_centroid(s), epsg)
      spdf <- methods::as(pts, "Spatial")
      spdf@data <- cbind(sf::st_drop_geometry(s), data.frame(sf::st_coordinates(pts)))
      bw  <- GWmodel::bw.gwr(elderly_share ~ pm25_ugm3 + deprivation, data = spdf,
                             approach = "AICc", kernel = "bisquare", adaptive = TRUE)
      g   <- GWmodel::gwr.basic(elderly_share ~ pm25_ugm3 + deprivation, data = spdf,
                                bw = bw, kernel = "bisquare", adaptive = TRUE)
      res$gwr <- g; res$gwr_aicc <- g$GW.diagnostic$AICc
      res$gwr_pm25_range <- range(g$SDF$pm25_ugm3, na.rm = TRUE); TRUE
    }, error = function(e) FALSE)
    if (!ok) res$gwr_aicc <- NA_real_
  }
  res$sf <- s; res$lw <- lw; res
}

jakarta_res <- run_city_models(jakarta_sf, "Jakarta")

## Adaptive bandwidth (number of nearest neighbours): 36 AICc value: -1077.131 
## Adaptive bandwidth (number of nearest neighbours): 30 AICc value: -1095.265 
## Adaptive bandwidth (number of nearest neighbours): 25 AICc value: -1106.092 
## Adaptive bandwidth (number of nearest neighbours): 23 AICc value: -1110.005 
## Adaptive bandwidth (number of nearest neighbours): 21 AICc value: -1111.137 
## Adaptive bandwidth (number of nearest neighbours): 20 AICc value: -1112.479 
## Adaptive bandwidth (number of nearest neighbours): 19 AICc value: -1114.026 
## Adaptive bandwidth (number of nearest neighbours): 19 AICc value: -1114.026

hanoi_res   <- run_city_models(hanoi_sf, "Hanoi")

## Adaptive bandwidth (number of nearest neighbours): 26 AICc value: -221.4339 
## Adaptive bandwidth (number of nearest neighbours): 24 AICc value: -219.5862 
## Adaptive bandwidth (number of nearest neighbours): 28 AICc value: -223.6335 
## Adaptive bandwidth (number of nearest neighbours): 28 AICc value: -223.6335

tidy_ols <- function(r) broom::tidy(r$ols_int) |> dplyr::mutate(city = r$city)
knitr::kable(dplyr::bind_rows(tidy_ols(jakarta_res), tidy_ols(hanoi_res)), digits = 4,
  caption = "Table 4. Plain regression with a fairness term (elderly_share ~ PM2.5 * poverty).")

Table 4. Plain regression with a fairness term (elderly_share ~ PM2.5 * poverty).
term	estimate	std.error	statistic	p.value	city
(Intercept)	-0.0711	0.0001	-959.2065	0e+00	Jakarta
pm25_ugm3	0.0036	0.0000	2151.5028	0e+00	Jakarta
deprivation	-0.0001	0.0000	-9.2737	0e+00	Jakarta
pm25_ugm3:deprivation	0.0000	0.0000	6.0743	0e+00	Jakarta
(Intercept)	-0.0815	0.0107	-7.6147	0e+00	Hanoi
pm25_ugm3	0.0048	0.0004	12.9134	0e+00	Hanoi
deprivation	-0.0130	0.0035	-3.7016	1e-03	Hanoi
pm25_ugm3:deprivation	0.0008	0.0002	4.4189	2e-04	Hanoi

In plain words (Table 4): In both cities, more PM_2.5 goes with more older people (Jakarta 0.0036, Hanoi 0.0048). In Hanoi, poorer districts have fewer older people on average (poverty term -0.0130), but the positive fairness term (8e-04) means the pollution–risk link is stronger in poorer districts — the pattern fairness predicts. In Jakarta the fit is almost exact (tiny error) because the stand-in numbers share the same crowding inputs, so we read Jakarta as a methods check, not a health finding.

compare_row <- function(r) data.frame(
  city = r$city, OLS_AIC = round(r$ols_aic, 1), OLS_RMSE = round(r$ols_rmse, 5),
  Error_Moran_p = round(r$ols_moran_p, 4),
  BYM2_DIC = if (!is.null(r$inla_dic)) round(r$inla_dic, 1) else NA,
  GWR_AICc = if (!is.null(r$gwr_aicc)) round(r$gwr_aicc, 1) else NA)
knitr::kable(dplyr::bind_rows(compare_row(jakarta_res), compare_row(hanoi_res)),
  caption = "Table 5. Model-fit comparison (lower AIC/DIC/AICc is better).")

Table 5. Model-fit comparison (lower AIC/DIC/AICc is better).
city	OLS_AIC	OLS_RMSE	Error_Moran_p	BYM2_DIC	GWR_AICc
Jakarta	-921.3	0.00001	0.0000	NA	-1114.0
Hanoi	-231.4	0.00447	0.7995	NA	-223.6

In plain words (Table 5): Jakarta’s plain model leaves a strong map pattern in its errors (Moran’s p ≈ 0), so even a perfect-looking fit hides map structure and understates the real uncertainty. Hanoi’s errors show no such pattern (p ≈ 0.80), so the plain model is safer there. INLA was not available, so BYM2 is skipped here. GWR ran for both cities (AICc: Jakarta -1114, Hanoi -223.6). The DIC, AIC, and AICc numbers are not interchangeable — the models use different outcomes.

3.4 Does the pollution effect change across the city?

# Figure 4: GWR local PM2.5 effect — darker red = stronger local pollution–risk link.
plot_gwr <- function(r, zoom = NULL) {
  if (is.null(r$gwr) || is.null(r$gwr$SDF)) return(NULL)
  col <- grep("^pm25", names(r$gwr$SDF), value = TRUE)[1]; if (is.na(col)) return(NULL)
  out <- r$sf; out$gwr_pm25 <- r$gwr$SDF[[col]]
  p <- ggplot(out) +
    geom_sf(aes(fill = gwr_pm25), color = "white", linewidth = 0.12) +
    scale_fill_gradient2(low = "#2166ac", mid = "white", high = "#b2182b",
                         midpoint = mean(out$gwr_pm25, na.rm = TRUE)) +
    labs(title = paste(r$city, "GWR local PM2.5 effect"),
         subtitle = "Darker red = stronger local pollution–vulnerability link", fill = "Coef.") +
    theme_minimal() + theme(axis.text = element_blank(), panel.grid = element_blank())
  if (!is.null(zoom)) p + zoom else p
}
p_jk <- plot_gwr(jakarta_res, coord_jakarta); p_hn <- plot_gwr(hanoi_res)
if (is.null(p_jk) && is.null(p_hn)) knitr::asis_output("GWR did not converge; maps unavailable.") else
if (is.null(p_jk)) p_hn else if (is.null(p_hn)) p_jk else p_jk | p_hn

In plain words (Figure 4): Darker red = a stronger local link between PM_2.5 and elderly share. In Jakarta the local effect is almost the same everywhere (range 0.00361–0.00363), so the link is effectively flat — again a side effect of the shared stand-in inputs. In Hanoi it changes clearly across the city (range 0.00299–0.00344), so the link is stronger in some districts than in others. These maps flag where a closer look may matter most.

4 Discussion

In both cities, pollution and poverty are grouped in space rather than scattered at random, and the grouping is tighter in Hanoi than in Jakarta. Hanoi gives the clearer real-world story: PM_2.5 lines up with elderly share, and that link grows stronger in poorer districts — the pattern a fairness argument predicts. The hot-spot map and the GWR map both point to Hanoi’s central districts as the place where the link is tightest. A single city-wide number would have erased this.

Jakarta plays a different role. Its higher average pollution and larger elderly share are real features of the data. But its near-perfect model fit and flat, unchanging GWR effect come from how the stand-in numbers were built, since they share the same crowding inputs. The leftover map pattern in Jakarta’s plain-model errors is itself a useful lesson: it shows why a plain regression is unsafe for map data, even when the fit looks perfect, and why the spatial models used here are needed. So we read Jakarta mainly as a check on the method, and Hanoi as the real example.

Taken together, the three models answer the study’s questions step by step. The local hot-spot tool answers where pollution and poverty sit together. The plain regression with a fairness term answers whether pollution and breathing risk move together once poverty is taken into account. The BYM2 and GWR models answer how that link is shaped across space, while handling the grouping the plain model misses. Because all three point to the same central districts in Hanoi, we trust that finding more than any one model alone.

4.1 Limits of this study

Groups, not people. Every unit is a district average, so we cannot make claims about single people without falling into the ecological fallacy (assuming a group average holds for every person in it).
Stand-in numbers. PM_2.5, night-lights, and elderly share are stand-ins for exposure, wealth, and illness, not direct measurements. In this run they are repeatable stand-ins — which is exactly why shared inputs, and not biology, drive Jakarta’s perfect fit.
Different maps (MAUP). Jakarta and Hanoi use different district systems and sizes. We fit them on their own, but their effect sizes still cannot be compared one to one.
One year only. The study is a single snapshot for 2019, so it cannot show trends over time or prove cause and effect.

5 Conclusion

Using a clear, fully repeatable map-based pipeline in R, we compared air pollution, poverty, and breathing risk across the districts of Jakarta and Hanoi. Both cities show pollution and poverty grouped in space. Hanoi gives the clearer fairness signal: the pollution–elderly link grows stronger in poorer districts and changes from place to place across the city. Using local grouping tools, a Bayesian map model, and a place-by-place model together shows how spatial methods can find and explain such patterns where plain averages cannot. Because the inputs here are stand-ins, the exact effect sizes are best read as a demonstration of the method. The pipeline is ready to take in real satellite PM_2.5 (NASA SEDAC) and Black Marble night-lights: set EARTHDATA_TOKEN in ~/.Renviron and re-knit. That is the natural next step toward results with public-health weight.

References

Moran, P.A.P. (1950). Notes on continuous stochastic phenomena. Biometrika, 37(1–2), 17–23.
Anselin, L. (1995). Local indicators of spatial association — LISA. Geographical Analysis, 27(2), 93–115.
Getis, A., & Ord, J.K. (1992). The analysis of spatial association by use of distance statistics. Geographical Analysis, 24(3), 189–206.
Ord, J.K., & Getis, A. (1995). Local spatial autocorrelation statistics: distributional issues and an application. Geographical Analysis, 27(4), 286–306.
Brunsdon, C., Fotheringham, A.S., & Charlton, M.E. (1996). Geographically weighted regression: a method for exploring spatial nonstationarity. Geographical Analysis, 28(4), 281–298.
Besag, J., York, J., & Mollié, A. (1991). Bayesian image restoration, with two applications in spatial statistics. Annals of the Institute of Statistical Mathematics, 43(1), 1–20.
Riebler, A., Sørbye, S.H., Simpson, D., & Rue, H. (2016). An intuitive Bayesian spatial model for disease mapping that accounts for scaling. Statistical Methods in Medical Research, 25(4), 1145–1165.
Rue, H., Martino, S., & Chopin, N. (2009). Approximate Bayesian inference for latent Gaussian models by using integrated nested Laplace approximations. JRSS Series B, 71(2), 319–392.
van Donkelaar, A., Hammer, M.S., Bindle, L., et al. (2021). Monthly global estimates of fine particulate matter and their uncertainty. Environmental Science & Technology, 55(22), 15287–15300.
Hammer, M.S., van Donkelaar, A., Li, C., et al. (2020). Global estimates and long-term trends of fine particulate matter concentrations (1998–2018). Environmental Science & Technology, 54(13), 7879–7890.
Román, M.O., Wang, Z., Sun, Q., et al. (2018). NASA’s Black Marble nighttime lights product suite. Remote Sensing of Environment, 210, 113–143.
Tatem, A.J. (2017). WorldPop, open data for spatial demography. Scientific Data, 4, 170004.
Pebesma, E. (2018). Simple features for R: standardized support for spatial vector data. The R Journal, 10(1), 439–446.
Bivand, R.S., & Wong, D.W.S. (2018). Comparing implementations of global and local indicators of spatial association. TEST, 27(3), 716–748.
World Health Organization (2021). WHO Global Air Quality Guidelines. Geneva: WHO.

sessionInfo()

## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.5.1
## 
## Matrix products: default
## BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Europe/Warsaw
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] GWmodel_2.4-1        Rcpp_1.1.1           sp_2.2-1            
##  [4] robustbase_0.99-7    scales_1.4.0         knitr_1.51          
##  [7] broom_1.0.12         patchwork_1.3.2      ggplot2_4.0.2       
## [10] dplyr_1.2.1          spdep_1.4-2          spData_2.3.4        
## [13] httr2_1.2.2          exactextractr_0.10.1 geodata_0.6-9       
## [16] terra_1.9-1          sf_1.1-0            
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1   viridisLite_0.4.3  farver_2.1.2       S7_0.2.1          
##  [5] fastmap_1.2.0      TH.data_1.1-5      digest_0.6.39      timechange_0.4.0  
##  [9] lifecycle_1.0.5    LearnBayes_2.15.2  survival_3.8-6     magrittr_2.0.4    
## [13] compiler_4.5.2     rlang_1.2.0        sass_0.4.10        tools_4.5.2       
## [17] igraph_2.2.2       yaml_2.3.12        FNN_1.1.4.1        labeling_0.4.3    
## [21] curl_7.0.0         bit_4.6.0          classInt_0.4-11    RColorBrewer_1.1-3
## [25] multcomp_1.4-30    KernSmooth_2.23-26 withr_3.0.2        purrr_1.2.1       
## [29] grid_4.5.2         blackmarbler_0.2.5 xts_0.14.2         e1071_1.7-17      
## [33] MASS_7.3-65        cli_3.6.6          mvtnorm_1.3-5      crayon_1.5.3      
## [37] rmarkdown_2.30     intervals_0.15.5   generics_0.1.4     otel_0.2.0        
## [41] rstudioapi_0.18.0  tzdb_0.5.0         DBI_1.3.0          cachem_1.1.0      
## [45] proxy_0.4-29       stringr_1.6.0      splines_4.5.2      spatialreg_1.4-2  
## [49] parallel_4.5.2     s2_1.1.9           vctrs_0.7.1        boot_1.3-32       
## [53] Matrix_1.7-4       sandwich_3.1-1     jsonlite_2.0.0     hms_1.1.4         
## [57] bit64_4.6.0-1      tidyr_1.3.2        jquerylib_0.1.4    units_1.0-1       
## [61] glue_1.8.0         DEoptimR_1.1-4     codetools_0.2-20   stringi_1.8.7     
## [65] lubridate_1.9.5    gtable_0.3.6       deldir_2.0-4       raster_3.6-32     
## [69] tibble_3.3.1       pillar_1.11.1      rappdirs_0.3.4     htmltools_0.5.9   
## [73] R6_2.6.1           wk_0.9.5           vroom_1.7.0        evaluate_1.0.5    
## [77] lattice_0.22-9     readr_2.2.0        backports_1.5.0    bslib_0.10.0      
## [81] class_7.3-23       coda_0.19-4.1      nlme_3.1-168       spacetime_1.3-3   
## [85] xfun_0.56          zoo_1.8-15         pkgconfig_2.0.3

Air Pollution, Poverty, and Breathing Risk in Jakarta and Hanoi

A District-Level Spatial Machine Learning Analysis in R

Giang Tran, Wahyu Try Baharsyah

June 26, 2026