1 Introduction

Tuberculosis (TB) remains one of the world’s most persistent infectious disease burdens, contributing to significant morbidity and over one million deaths annually. Despite global control efforts, its distribution continues to reflect strong spatial, temporal, and socioeconomic inequalities across countries. This report uses World Bank World Development Indicators (WDI) data to conduct a comprehensive multi-level analysis of TB dynamics. The study addresses a series of analytical objectives covering spatial distribution, temporal trends, epidemiological associations, forecasting, socioeconomic determinants, and the impact of COVID-19 on TB burden. Together, these analyses provide an integrated evidence base for understanding patterns, drivers, and inequalities in tuberculosis burden globally.

2 Setup & Data Acquisition

2.1 Libraries

library(WDI)
library(tidyverse)
library(sf)
library(rnaturalearth)
library(rnaturalearthdata)
library(viridis)
library(prophet)
library(spdep)
library(ggrepel)
library(patchwork)
library(scales)

# Disable strict S2 spherical geometry to avoid edge-crossing errors
# in rnaturalearth shapefiles
sf_use_s2(FALSE)

2.2 Data Download

A single WDI pull retrieves all required indicators (2000–2024), eliminating repeated downloads across analyses.

raw <- WDI(
  country   = "all",
  indicator = c(
    tb_incidence       = "SH.TBS.INCD",
    hiv_prevalence     = "SH.DYN.AIDS.ZS",
    treatment_success  = "SH.TBS.CURE.ZS",
    gdp_per_capita     = "NY.GDP.PCAP.CD",
    health_expenditure = "SH.XPD.CHEX.GD.ZS"
  ),
  start = 2000,
  end   = 2024,
  extra = TRUE
)

# Remove regional/income-group aggregates; keep country-level only
tb_clean <- raw |>
  filter(region != "Aggregates") |>
  select(country, iso3c, region, year,
         tb_incidence, hiv_prevalence, treatment_success,
         gdp_per_capita, health_expenditure)

latest_year <- max(tb_clean$year[!is.na(tb_clean$tb_incidence)])
tb_latest   <- tb_clean |> filter(year == latest_year)

2.3 Spatial Data

# Load and repair geometries (fixes self-intersecting polygon edges)
world_sf  <- ne_countries(returnclass = "sf") |> st_make_valid()
africa_sf <- ne_countries(continent = "Africa", returnclass = "sf") |> st_make_valid()

3 Spatial Distribution of TB in Africa

Objective 1: To analyze the spatial distribution of tuberculosis burden across African countries.

Southern and Eastern Africa carry a disproportionately high TB burden, driven largely by the co-epidemic with HIV. West African countries show moderate incidence, while North Africa remains comparatively low.

africa_map_data <- africa_sf |>
  left_join(
    tb_latest |> filter(!is.na(tb_incidence)) |> select(iso3c, tb_incidence),
    by = c("iso_a3" = "iso3c")
  )

ggplot(africa_map_data) +
  geom_sf(aes(fill = tb_incidence), color = "white", linewidth = 0.2) +
  scale_fill_viridis(
    option   = "plasma",
    na.value = "grey88",
    name     = "TB incidence\nper 100,000",
    guide    = guide_colorbar(
      barwidth = 10, barheight = 0.5,
      title.position = "top", title.hjust = 0.5,
      ticks.colour = "white"
    )
  ) +
  labs(
    title    = paste0("Spatial Distribution of Tuberculosis in Africa (", latest_year, ")"),
    subtitle = "TB incidence per 100,000 population · Grey = no data",
    caption  = "Source: World Bank WDI (SH.TBS.INCD)"
  ) +
  theme_void(base_size = 12) +
  theme(
    plot.title      = element_text(face = "bold", size = 14, hjust = 0.5, margin = margin(b = 4)),
    plot.subtitle   = element_text(color = "grey40", size = 9, hjust = 0.5, margin = margin(b = 8)),
    plot.caption    = element_text(color = "grey55", size = 8, hjust = 1, margin = margin(t = 6)),
    legend.position = "bottom",
    legend.title    = element_text(size = 8.5, face = "bold"),
    legend.text     = element_text(size = 8)
  )


4 Geographic Inequalities in TB Burden

Objective 2: To visualize geographic inequalities in tuberculosis burden.

global_map_data <- world_sf |>
  left_join(
    tb_latest |> select(iso3c, tb_incidence),
    by = c("iso_a3" = "iso3c")
  )

ggplot(global_map_data) +
  geom_sf(aes(fill = tb_incidence), color = "white", linewidth = 0.15) +
  scale_fill_viridis(
    option   = "plasma",
    na.value = "grey88",
    name     = "TB incidence\nper 100,000",
    breaks   = c(50, 150, 300, 500, 700),
    labels   = comma_format(accuracy = 1),
    guide    = guide_colorbar(
      barwidth = 10, barheight = 0.5,
      title.position = "top", title.hjust = 0.5,
      ticks.colour = "white"
    )
  ) +
  labs(
    title    = paste0("Global TB Burden Inequality (", latest_year, ")"),
    subtitle = "TB incidence per 100,000 population · Grey = no data",
    caption  = "Source: World Bank WDI"
  ) +
  theme_void(base_size = 11) +
  theme(
    plot.title      = element_text(face = "bold", size = 13, hjust = 0.5, margin = margin(b = 4)),
    plot.subtitle   = element_text(color = "grey40", size = 8.5, hjust = 0.5, margin = margin(b = 8)),
    plot.caption    = element_text(color = "grey55", size = 8, hjust = 1, margin = margin(t = 6)),
    legend.position = "bottom",
    legend.title    = element_text(size = 8.5, face = "bold"),
    legend.text     = element_text(size = 8)
  )

4.1 Regional TB Inequality

global_avg <- mean(tb_latest$tb_incidence, na.rm = TRUE)

region_tb <- tb_latest |>
  group_by(region) |>
  summarise(avg_tb = mean(tb_incidence, na.rm = TRUE), .groups = "drop") |>
  arrange(desc(avg_tb)) |>
  mutate(
    above_avg = avg_tb >= global_avg,
    label     = as.character(round(avg_tb, 1))
  )

n_above_tb <- sum(region_tb$above_avg)
n_total_tb <- nrow(region_tb)

ggplot(region_tb, aes(x = avg_tb, y = reorder(region, avg_tb))) +

  geom_col(aes(fill = above_avg), width = 0.65, alpha = 0.88) +

  geom_text(
    aes(label = label, color = above_avg),
    hjust = -0.2, size = 3.4, fontface = "bold"
  ) +

  geom_vline(
    xintercept = global_avg,
    linetype = "dashed", color = "#c0392b", linewidth = 0.7
  ) +
  annotate("text",
    x = global_avg - 2, y = 3.5,
    label    = paste0("Global avg\n(", round(global_avg, 1), ")"),
    hjust = 1, vjust = 0.5,
    size = 2.85, color = "#c0392b", fontface = "italic", lineheight = 0.95
  ) +

  scale_fill_manual(
    values = c("TRUE" = "#c0392b", "FALSE" = "#2980b9"),
    labels = c("TRUE" = "Above global average", "FALSE" = "Below global average"),
    name = NULL, drop = FALSE
  ) +
  scale_color_manual(
    values = c("TRUE" = "#922b21", "FALSE" = "#1a5276"),
    guide  = "none"
  ) +
  scale_x_continuous(
    limits = c(0, max(region_tb$avg_tb) * 1.18),
    breaks = seq(0, 200, 50),
    labels = comma_format(accuracy = 1),
    expand = c(0, 0)
  ) +
  scale_y_discrete(expand = expansion(add = c(0.5, 0.8))) +

  labs(
    title    = paste0("TB Burden Inequality by Region (", latest_year, ")"),
    subtitle = paste0(
      "Average TB incidence per 100,000 \u00b7 ",
      n_above_tb, " of ", n_total_tb,
      " regions exceed the global average (", round(global_avg, 1), ")"
    ),
    x       = "Average TB incidence per 100,000",
    y       = NULL,
    caption = "Source: World Bank WDI"
  ) +

  theme_minimal(base_size = 12) +
  theme(
    plot.title         = element_text(face = "bold", size = 13, hjust = 0),
    plot.subtitle      = element_text(color = "grey40", size = 8.5, hjust = 0, margin = margin(b = 10)),
    plot.caption       = element_text(color = "grey55", size = 8, hjust = 1),
    panel.grid.major.y = element_blank(),
    panel.grid.minor   = element_blank(),
    panel.grid.major.x = element_line(color = "grey92"),
    axis.text.y        = element_text(size = 10, color = "grey25"),
    axis.text.x        = element_text(size = 9, color = "grey45"),
    axis.title.x       = element_text(size = 9.5, color = "grey30", margin = margin(t = 6)),
    legend.position    = "top",
    legend.justification = "left",
    legend.text        = element_text(size = 9),
    plot.margin        = margin(12, 24, 10, 12)
  )

6 Forecasting TB Incidence

Objective 3: To model and forecast tuberculosis incidence rates using time-series analysis.

A Prophet time-series model is fitted to global average TB incidence (2000–2024) and projected forward 5 years. The declining trajectory is expected to continue, but the widening confidence interval reflects genuine uncertainty in the post-COVID period.

ts_data <- tb_clean |>
  filter(!is.na(tb_incidence)) |>
  group_by(year) |>
  summarise(tb_incidence = mean(tb_incidence, na.rm = TRUE), .groups = "drop") |>
  rename(ds = year, y = tb_incidence) |>
  mutate(ds = as.Date(paste0(ds, "-01-01")))

prophet_model <- prophet(
  ts_data,
  yearly.seasonality  = FALSE,
  weekly.seasonality  = FALSE,
  daily.seasonality   = FALSE
)

future_df  <- make_future_dataframe(prophet_model, periods = 5, freq = "year")
prophet_fc <- predict(prophet_model, future_df)
cut_date   <- max(ts_data$ds)

fc_plot <- prophet_fc |>
  select(ds, yhat, yhat_lower, yhat_upper) |>
  left_join(ts_data, by = "ds") |>
  mutate(period = if_else(ds <= cut_date, "Historical", "Forecast"))

6.1 Main Forecast

ggplot(fc_plot, aes(x = ds)) +

  geom_ribbon(
    aes(ymin = yhat_lower, ymax = yhat_upper, fill = period),
    alpha = 0.18
  ) +
  geom_line(aes(y = yhat, color = period), linewidth = 1) +
  geom_point(
    data = filter(fc_plot, !is.na(y)),
    aes(y = y),
    shape = 21, fill = "white", color = "#2c3e50", size = 2.4, stroke = 0.8
  ) +
  geom_vline(
    xintercept = as.numeric(cut_date),
    linetype = "dashed", color = "grey50", linewidth = 0.6
  ) +
  annotate("text",
    x = cut_date, y = Inf,
    label    = "  Forecast \u25b6",
    vjust = 1.6, hjust = -0.05,
    size = 3, color = "grey45", fontface = "italic"
  ) +

  scale_color_manual(
    values = c("Historical" = "#2980b9", "Forecast" = "#e67e22"),
    guide  = "none"
  ) +
  scale_fill_manual(
    values = c("Historical" = "#2980b9", "Forecast" = "#e67e22"),
    guide  = "none"
  ) +
  scale_x_date(date_breaks = "3 years", date_labels = "%Y", expand = c(0.01, 0)) +
  scale_y_continuous(labels = comma_format(accuracy = 1)) +

  labs(
    title    = "Global TB Incidence: Historical Trend & 5-Year Forecast",
    subtitle = "Prophet model \u00b7 Shaded band = 95% uncertainty interval \u00b7 Circles = observed data",
    x        = NULL,
    y        = "Avg. TB incidence per 100,000",
    caption  = "Source: World Bank WDI"
  ) +

  theme_minimal(base_size = 12) +
  theme(
    plot.title       = element_text(face = "bold", size = 13, hjust = 0),
    plot.subtitle    = element_text(color = "grey45", size = 8.5, hjust = 0, margin = margin(b = 8)),
    plot.caption     = element_text(color = "grey55", size = 8, hjust = 1),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_line(color = "grey93"),
    axis.text        = element_text(size = 9, color = "grey40"),
    axis.title.y     = element_text(size = 9.5, color = "grey30", margin = margin(r = 8)),
    plot.margin      = margin(12, 16, 10, 12)
  )

6.2 Trend Component

trend_df <- prophet_fc |>
  select(ds, trend, trend_lower, trend_upper) |>
  mutate(period = if_else(ds <= cut_date, "Historical", "Forecast"))

ggplot(trend_df, aes(x = ds)) +
  geom_ribbon(
    aes(ymin = trend_lower, ymax = trend_upper),
    fill = "#8e44ad", alpha = 0.15
  ) +
  geom_line(aes(y = trend, color = period), linewidth = 1) +
  geom_vline(
    xintercept = as.numeric(cut_date),
    linetype = "dashed", color = "grey50", linewidth = 0.6
  ) +

  scale_color_manual(
    values = c("Historical" = "#8e44ad", "Forecast" = "#e67e22"),
    name   = NULL
  ) +
  scale_x_date(date_breaks = "3 years", date_labels = "%Y", expand = c(0.01, 0)) +
  scale_y_continuous(labels = comma_format(accuracy = 1)) +

  labs(
    title    = "Underlying Trend Component",
    subtitle = "Smoothed long-run trajectory extracted by Prophet (95% CI shaded)",
    x        = NULL,
    y        = "Trend value",
    caption  = "Source: World Bank WDI"
  ) +

  theme_minimal(base_size = 12) +
  theme(
    plot.title       = element_text(face = "bold", size = 13, hjust = 0),
    plot.subtitle    = element_text(color = "grey45", size = 8.5, hjust = 0, margin = margin(b = 8)),
    plot.caption     = element_text(color = "grey55", size = 8, hjust = 1),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_line(color = "grey93"),
    legend.position  = "top",
    legend.justification = "left",
    axis.text        = element_text(size = 9, color = "grey40"),
    axis.title.y     = element_text(size = 9.5, color = "grey30", margin = margin(r = 8)),
    plot.margin      = margin(12, 16, 10, 12)
  )


7 HIV Prevalence and TB Incidence

Objective 3: To assess the relationship between HIV prevalence and tuberculosis incidence.

HIV is a leading driver of TB susceptibility. The regression below (IQR-winsorised to reduce the influence of extreme values) quantifies this relationship across countries. Countries in the top 20% of both burdens are highlighted.

# Use the latest year for which both HIV and TB data are available
hiv_tb_year <- max(tb_clean$year[!is.na(tb_clean$hiv_prevalence) & !is.na(tb_clean$tb_incidence)])

hiv_tb_raw <- tb_clean |>
  filter(year == hiv_tb_year, !is.na(tb_incidence), !is.na(hiv_prevalence))

cap_iqr <- function(x) {
  q1 <- quantile(x, 0.25, na.rm = TRUE)
  q3 <- quantile(x, 0.75, na.rm = TRUE)
  iqr <- q3 - q1
  pmin(pmax(x, q1 - 1.5 * iqr), q3 + 1.5 * iqr)
}

hiv_tb_plot <- hiv_tb_raw |>
  mutate(
    tb  = cap_iqr(tb_incidence),
    hiv = cap_iqr(hiv_prevalence),
    high_burden = ifelse(
      tb >= quantile(tb, 0.8) & hiv >= quantile(hiv, 0.8),
      "High TB-HIV burden", "Other"
    )
  )

lm_hiv    <- lm(tb ~ hiv, data = hiv_tb_plot)
r2_hiv    <- summary(lm_hiv)$r.squared
int_hiv   <- coef(lm_hiv)[1]
slp_hiv   <- coef(lm_hiv)[2]
eq_label  <- paste0(
  "TB = ", round(int_hiv, 2), " + ", round(slp_hiv, 2), " \u00d7 HIV\n",
  "R\u00b2 = ", round(r2_hiv, 3)
)
ggplot(hiv_tb_plot, aes(x = hiv, y = tb)) +

  geom_point(
    aes(color = high_burden, shape = high_burden),
    size = 2.8, alpha = 0.75, stroke = 0.3
  ) +
  geom_smooth(
    method = "lm", color = "#2c2c2c", linewidth = 0.8,
    se = TRUE, fill = "grey80", alpha = 0.25
  ) +
  geom_text_repel(
    data          = subset(hiv_tb_plot, high_burden == "High TB-HIV burden"),
    aes(label     = country),
    size          = 2.8, color = "#c0392b", fontface = "italic",
    max.overlaps  = 20, box.padding = 0.4, point.padding = 0.3,
    segment.color = "grey55", segment.size = 0.3
  ) +
  annotate("label",
    x = -Inf, y = -Inf, hjust = -0.05, vjust = -0.3,
    label    = eq_label,
    size     = 3.2, color = "#2c2c2c", fill = "white",
    label.size = 0.3, label.r = unit(0.15, "lines"),
    fontface = "plain", family = "sans"
  ) +

  scale_color_manual(values = c("High TB-HIV burden" = "#c0392b", "Other" = "#3498db")) +
  scale_shape_manual(values = c("High TB-HIV burden" = 17, "Other" = 16)) +

  labs(
    title    = paste0("HIV Prevalence vs. TB Incidence (", hiv_tb_year, ")"),
    subtitle = "IQR-winsorised values \u00b7 OLS regression line with 95% CI \u00b7 Triangles = top 20% dual burden",
    x        = "HIV prevalence (%)",
    y        = "TB incidence per 100,000",
    color    = NULL, shape = NULL,
    caption  = "Source: World Bank WDI"
  ) +

  theme_minimal(base_size = 12) +
  theme(
    plot.title       = element_text(face = "bold", size = 13, hjust = 0),
    plot.subtitle    = element_text(color = "grey45", size = 9, hjust = 0, margin = margin(b = 8)),
    plot.caption     = element_text(color = "grey55", size = 8, hjust = 1),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_line(color = "grey92"),
    legend.position  = "top",
    legend.justification = "left",
    legend.text      = element_text(size = 9),
    axis.title       = element_text(size = 10, color = "grey30"),
    axis.text        = element_text(size = 9, color = "grey40"),
    plot.margin      = margin(12, 16, 10, 12)
  )


8 TB–HIV Spatial Hotspot Analysis

Objective 4: To identify tuberculosis hotspot regions using spatial analysis techniques.

A composite TB–HIV burden index (standardised TB + HIV z-scores) is used to compute Local Moran’s I (LISA) statistics. High-High clusters indicate countries that are themselves high-burden and surrounded by high-burden neighbours.

hiv_tb_spatial <- tb_clean |>
  filter(year == hiv_tb_year, !is.na(tb_incidence), !is.na(hiv_prevalence)) |>
  mutate(
    tb_z         = as.numeric(scale(tb_incidence)),
    hiv_z        = as.numeric(scale(hiv_prevalence)),
    burden_index = tb_z + hiv_z
  )

world_hotspot <- world_sf |>
  left_join(hiv_tb_spatial, by = c("iso_a3" = "iso3c"))

map_clean_hs <- world_hotspot |> filter(!is.na(burden_index))

nb_hs  <- poly2nb(map_clean_hs, queen = TRUE, snap = 0.1)
wt_hs  <- nb2listw(nb_hs, style = "W", zero.policy = TRUE)

moran_i    <- moran.test(map_clean_hs$burden_index, wt_hs, zero.policy = TRUE)
lm_hs      <- localmoran(map_clean_hs$burden_index, wt_hs, zero.policy = TRUE)
map_clean_hs$local_I <- lm_hs[, 1]

map_clean_hs <- map_clean_hs |>
  mutate(hotspot = case_when(
    burden_index > 0 & local_I > 0 ~ "High-High (Hotspot)",
    burden_index < 0 & local_I > 0 ~ "Low-Low (Coldspot)",
    TRUE                            ~ "Other"
  ))

Global Moran’s I = 0.747 (p < 0.001), confirming strong positive spatial autocorrelation — TB–HIV burden clusters geographically, not randomly.

ggplot() +
  geom_sf(data = world_hotspot, fill = "grey95", color = "white", linewidth = 0.15) +
  geom_sf(data = map_clean_hs, aes(fill = hotspot), color = "white", linewidth = 0.15) +
  scale_fill_manual(values = c(
    "High-High (Hotspot)" = "#c0392b",
    "Low-Low (Coldspot)"  = "#27ae60",
    "Other"               = "grey80"
  )) +
  labs(
    title    = paste0("TB\u2013HIV Spatial Hotspots (", hiv_tb_year, ")"),
    subtitle = paste0(
      "Local Moran\u2019s I \u00b7 Global I = ",
      round(moran_i$estimate[1], 3), ", p < 0.001"
    ),
    fill    = "Cluster Type",
    caption = "Source: World Bank WDI \u00b7 Natural Earth"
  ) +
  theme_void(base_size = 12) +
  theme(
    plot.title      = element_text(face = "bold", size = 13, hjust = 0.5, margin = margin(b = 4)),
    plot.subtitle   = element_text(color = "grey40", size = 9, hjust = 0.5, margin = margin(b = 8)),
    plot.caption    = element_text(color = "grey55", size = 8, hjust = 1),
    legend.position = "bottom",
    legend.title    = element_text(size = 9),
    legend.text     = element_text(size = 8)
  )


9 TB Treatment Outcomes

Objective 5: To compare tuberculosis treatment outcomes across high-burden countries.

The WHO recommends a treatment success rate of ≥85% as the benchmark for effective TB programmes. No region currently meets this target, though South Asia and the Middle East & North Africa are within 0.2 percentage points.

tx_year   <- max(tb_clean$year[!is.na(tb_clean$treatment_success)])
tx_latest <- tb_clean |>
  filter(year == tx_year, !is.na(treatment_success))

top_10_tx <- tx_latest |> arrange(desc(treatment_success)) |> slice_head(n = 10) |>
  select(country, region, treatment_success)

bottom_10_tx <- tx_latest |> arrange(treatment_success) |> slice_head(n = 10) |>
  select(country, region, treatment_success)

region_tx <- tx_latest |>
  group_by(region) |>
  summarise(avg_success = mean(treatment_success, na.rm = TRUE), .groups = "drop") |>
  arrange(desc(avg_success)) |>
  mutate(
    meets_target = avg_success >= 85,
    label        = paste0(round(avg_success, 1), "%")
  )

n_tx <- nrow(region_tx)
ggplot(region_tx, aes(x = avg_success, y = reorder(region, avg_success))) +

  geom_col(aes(fill = meets_target), width = 0.65, alpha = 0.88) +
  geom_text(
    aes(label = label, color = meets_target),
    hjust = -0.15, size = 3.4, fontface = "bold"
  ) +
  geom_vline(
    xintercept = 85,
    linetype = "dashed", color = "#c0392b", linewidth = 0.7
  ) +
  annotate("text",
    x = 85 - 0.8, y = n_tx + 0.45,
    label    = "WHO target\n(85%)",
    hjust = 1, vjust = 0.5,
    size = 2.8, color = "#c0392b", fontface = "italic", lineheight = 0.95
  ) +

  scale_fill_manual(
    values = c("TRUE" = "#27ae60", "FALSE" = "#e67e22"),
    labels = c("TRUE" = "\u226585% \u2014 meets target", "FALSE" = "Below WHO target (85%)"),
    name = NULL, drop = FALSE
  ) +
  scale_color_manual(
    values = c("TRUE" = "#1a7a45", "FALSE" = "#b85c10"),
    guide  = "none"
  ) +
  scale_x_continuous(
    limits = c(0, 100), breaks = seq(0, 100, 20),
    labels = function(x) paste0(x, "%"), expand = c(0, 0)
  ) +
  scale_y_discrete(expand = expansion(add = c(0.5, 0.8))) +

  labs(
    title    = paste0("TB Treatment Success Rate by Region (", tx_year, ")"),
    subtitle = paste0(
      "WHO recommends \u226585% \u00b7 ",
      sum(region_tx$meets_target), " of ", n_tx,
      " regions currently meet the target"
    ),
    x       = "Average treatment success rate (%)",
    y       = NULL,
    caption = "Source: World Bank WDI (SH.TBS.CURE.ZS)"
  ) +

  theme_minimal(base_size = 12) +
  theme(
    plot.title         = element_text(face = "bold", size = 13, hjust = 0),
    plot.subtitle      = element_text(color = "grey40", size = 8.5, hjust = 0, margin = margin(b = 10)),
    plot.caption       = element_text(color = "grey55", size = 8, hjust = 1),
    panel.grid.major.y = element_blank(),
    panel.grid.minor   = element_blank(),
    panel.grid.major.x = element_line(color = "grey92"),
    axis.text.y        = element_text(size = 10, color = "grey25"),
    axis.text.x        = element_text(size = 9, color = "grey45"),
    axis.title.x       = element_text(size = 9.5, color = "grey30", margin = margin(t = 6)),
    legend.position    = "top",
    legend.justification = "left",
    legend.text        = element_text(size = 9),
    plot.margin        = margin(12, 20, 10, 12)
  )

9.0.1 Top & Bottom 10 Countries

Top 10 Countries by Treatment Success Rate (2023)
Country Region Success Rate (%)
Samoa East Asia & Pacific 100
West Bank and Gaza Middle East, North Africa, Afghanistan & Pakistan 100
Kuwait Middle East, North Africa, Afghanistan & Pakistan 99
Bangladesh South Asia 96
Cambodia East Asia & Pacific 96
Ethiopia Sub-Saharan Africa 96
Tanzania Sub-Saharan Africa 96
China East Asia & Pacific 95
Congo, Dem. Rep. Sub-Saharan Africa 95
Mozambique Sub-Saharan Africa 95
Bottom 10 Countries by Treatment Success Rate (2023)
Country Region Success Rate (%)
Antigua and Barbuda Latin America & Caribbean 0
Grenada Latin America & Caribbean 0
Latvia Europe & Central Asia 0
France Europe & Central Asia 20
New Caledonia East Asia & Pacific 26
Denmark Europe & Central Asia 30
Finland Europe & Central Asia 34
Ireland Europe & Central Asia 36
Bosnia and Herzegovina Europe & Central Asia 37
Jamaica Latin America & Caribbean 38

10 Socioeconomic Determinants of TB

Objective 6: To investigate socio-economic predictors of tuberculosis prevalence.

GDP per capita (log-transformed) explains 27.2% of the variance in TB incidence, reinforcing the strong poverty–disease link. Health expenditure as a share of GDP shows a weaker relationship, suggesting how much a country spends matters less than how wealthy it is overall.

socio_year <- max(tb_clean$year[
  !is.na(tb_clean$gdp_per_capita) &
  !is.na(tb_clean$health_expenditure) &
  !is.na(tb_clean$tb_incidence)
])

socio_df <- tb_clean |>
  filter(year == socio_year, !is.na(tb_incidence),
         !is.na(gdp_per_capita), !is.na(health_expenditure))

tb_thresh_socio <- quantile(socio_df$tb_incidence, 0.90, na.rm = TRUE)

m_gdp_s  <- lm(tb_incidence ~ log10(gdp_per_capita), data = socio_df)
m_hlth_s <- lm(tb_incidence ~ health_expenditure,    data = socio_df)
r2_g_s   <- summary(m_gdp_s)$r.squared
r2_h_s   <- summary(m_hlth_s)$r.squared
p_gdp_s <- ggplot(socio_df, aes(x = gdp_per_capita, y = tb_incidence)) +

  geom_point(aes(color = region), size = 2.2, alpha = 0.75, shape = 16) +
  geom_smooth(
    method = "lm", color = "#2c2c2c", linewidth = 0.85,
    fill = "grey75", alpha = 0.2, se = TRUE
  ) +
  geom_text_repel(
    data = filter(socio_df, tb_incidence >= tb_thresh_socio),
    aes(label = country, color = region),
    size = 2.5, fontface = "italic", max.overlaps = 15,
    box.padding = 0.35, segment.color = "grey60",
    segment.size = 0.3, show.legend = FALSE
  ) +
  annotate("label",
    x = Inf, y = Inf, hjust = 1.05, vjust = 1.4,
    label    = paste0("R\u00b2 = ", round(r2_g_s, 3)),
    size = 3.2, fill = "white", label.size = 0.3,
    label.r = unit(0.12, "lines"), color = "#2c2c2c"
  ) +

  scale_x_log10(labels = label_dollar(scale_cut = cut_short_scale())) +
  scale_y_continuous(labels = comma_format(accuracy = 1)) +
  scale_color_brewer(palette = "Set2", name = NULL) +

  labs(
    title    = "GDP per Capita vs TB Incidence",
    subtitle = "Log scale \u00b7 OLS fit with 95% CI \u00b7 Top 10% burden labelled",
    x        = "GDP per capita (log scale, USD)",
    y        = "TB incidence per 100,000"
  ) +

  theme_minimal(base_size = 11) +
  theme(
    plot.title       = element_text(face = "bold", size = 12, hjust = 0),
    plot.subtitle    = element_text(color = "grey45", size = 8, hjust = 0, margin = margin(b = 6)),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_line(color = "grey93"),
    legend.position  = "none",
    axis.text        = element_text(size = 8.5, color = "grey40"),
    axis.title       = element_text(size = 9, color = "grey30"),
    plot.margin      = margin(10, 12, 6, 10)
  )

p_hlth_s <- ggplot(socio_df, aes(x = health_expenditure, y = tb_incidence)) +

  geom_point(aes(color = region), size = 2.2, alpha = 0.75, shape = 16) +
  geom_smooth(
    method = "lm", color = "#2c2c2c", linewidth = 0.85,
    fill = "grey75", alpha = 0.2, se = TRUE
  ) +
  geom_text_repel(
    data = filter(socio_df, tb_incidence >= tb_thresh_socio),
    aes(label = country, color = region),
    size = 2.5, fontface = "italic", max.overlaps = 15,
    box.padding = 0.35, segment.color = "grey60",
    segment.size = 0.3, show.legend = FALSE
  ) +
  annotate("label",
    x = Inf, y = Inf, hjust = 1.05, vjust = 1.4,
    label    = paste0("R\u00b2 = ", round(r2_h_s, 3)),
    size = 3.2, fill = "white", label.size = 0.3,
    label.r = unit(0.12, "lines"), color = "#2c2c2c"
  ) +

  scale_y_continuous(labels = comma_format(accuracy = 1)) +
  scale_color_brewer(palette = "Set2", name = "Region") +

  labs(
    title    = "Health Expenditure vs TB Incidence",
    subtitle = "% of GDP \u00b7 OLS fit with 95% CI \u00b7 Top 10% burden labelled",
    x        = "Health expenditure (% of GDP)",
    y        = "TB incidence per 100,000"
  ) +

  theme_minimal(base_size = 11) +
  theme(
    plot.title       = element_text(face = "bold", size = 12, hjust = 0),
    plot.subtitle    = element_text(color = "grey45", size = 8, hjust = 0, margin = margin(b = 6)),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_line(color = "grey93"),
    legend.position  = "right",
    legend.text      = element_text(size = 8),
    legend.key.size  = unit(0.45, "cm"),
    axis.text        = element_text(size = 8.5, color = "grey40"),
    axis.title       = element_text(size = 9, color = "grey30"),
    plot.margin      = margin(10, 12, 6, 10)
  )

(p_gdp_s | p_hlth_s) +
  plot_annotation(
    title   = paste0("Socioeconomic Determinants of TB Incidence (", socio_year, ")"),
    caption = "Source: World Bank WDI",
    theme   = theme(
      plot.title   = element_text(face = "bold", size = 14, hjust = 0),
      plot.caption = element_text(color = "grey55", size = 8, hjust = 1)
    )
  ) +
  plot_layout(guides = "collect") &
  theme(legend.position = "bottom")


11 Summary

Objective Key Finding
1. Spatial distribution (Africa) Southern & Eastern Africa carry the highest sub-continental burden.
2. Temporal trends (2000–2024) Consistent decline of ~50% over 25 years; post-2020 rebound observed.
3. HIV–TB relationship R² = 0.251; HIV prevalence is a significant positive predictor of TB incidence.
4. Spatial hotspot analysis Global Moran’s I = 0.747; Sub-Saharan Africa is the dominant High-High cluster.
5. Forecasting Prophet projects continued decline; 5-year CI widens post-2024 due to post-COVID uncertainty.
6. Treatment outcomes 0 of 7 regions meet the WHO ≥85% target; Latin America & Caribbean lags most (68%).
7. Socioeconomic determinants GDP (log) explains 47.8% of TB variance; health spending explains only 7.6%.
8. Global inequality 3 of 7 regions exceed the global average; North America is lowest at 4.2.
9. COVID-19 disruptions 2020–2022 shows a visible notification dip consistent with service disruption; not a true incidence decline.

Report generated using R 4.5.3 · World Bank WDI via the WDI package · May 2026