1 Introduction

Tuberculosis (TB) remains one of the world’s most persistent infectious disease burdens, contributing to significant morbidity and over one million deaths annually. Despite global control efforts, its distribution continues to reflect strong spatial, temporal, and socioeconomic inequalities across countries. This report uses World Bank World Development Indicators (WDI) data to conduct a comprehensive multi-level analysis of TB dynamics. The study addresses a series of analytical objectives covering spatial distribution, temporal trends, epidemiological associations, forecasting, socioeconomic determinants, and the impact of COVID-19 on TB burden. Together, these analyses provide an integrated evidence base for understanding patterns, drivers, and inequalities in tuberculosis burden globally.

2 Setup & Data Acquisition

2.1 Libraries

library(WDI)
library(tidyverse)
library(sf)
library(rnaturalearth)
library(rnaturalearthdata)
library(viridis)
library(prophet)
library(spdep)
library(ggrepel)
library(patchwork)
library(scales)

# Disable strict S2 spherical geometry to avoid edge-crossing errors
# in rnaturalearth shapefiles
sf_use_s2(FALSE)

2.2 Data Download

A single WDI pull retrieves all required indicators (2000–2024), eliminating repeated downloads across analyses.

raw <- WDI(
  country   = "all",
  indicator = c(
    tb_incidence       = "SH.TBS.INCD",
    hiv_prevalence     = "SH.DYN.AIDS.ZS",
    treatment_success  = "SH.TBS.CURE.ZS",
    gdp_per_capita     = "NY.GDP.PCAP.CD",
    health_expenditure = "SH.XPD.CHEX.GD.ZS"
  ),
  start = 2000,
  end   = 2024,
  extra = TRUE
)

# Remove regional/income-group aggregates; keep country-level only
tb_clean <- raw |>
  filter(region != "Aggregates") |>
  select(country, iso3c, region, year,
         tb_incidence, hiv_prevalence, treatment_success,
         gdp_per_capita, health_expenditure)

latest_year <- max(tb_clean$year[!is.na(tb_clean$tb_incidence)])
tb_latest   <- tb_clean |> filter(year == latest_year)

2.3 Spatial Data

# Load and repair geometries (fixes self-intersecting polygon edges)
world_sf  <- ne_countries(returnclass = "sf") |> st_make_valid()
africa_sf <- ne_countries(continent = "Africa", returnclass = "sf") |> st_make_valid()

3 Spatial Distribution of TB in Africa

Objective 1: To analyze the spatial distribution of tuberculosis burden across African countries.

Southern and Eastern Africa carry a disproportionately high TB burden, driven largely by the co-epidemic with HIV. West African countries show moderate incidence, while North Africa remains comparatively low.

africa_map_data <- africa_sf |>
  left_join(
    tb_latest |> filter(!is.na(tb_incidence)) |> select(iso3c, tb_incidence),
    by = c("iso_a3" = "iso3c")
  )

ggplot(africa_map_data) +
  geom_sf(aes(fill = tb_incidence), color = "white", linewidth = 0.2) +
  scale_fill_viridis(
    option   = "plasma",
    na.value = "grey88",
    name     = "TB incidence\nper 100,000",
    guide    = guide_colorbar(
      barwidth = 10, barheight = 0.5,
      title.position = "top", title.hjust = 0.5,
      ticks.colour = "white"
    )
  ) +
  labs(
    title    = paste0("Spatial Distribution of Tuberculosis in Africa (", latest_year, ")"),
    subtitle = "TB incidence per 100,000 population · Grey = no data",
    caption  = "Source: World Bank WDI (SH.TBS.INCD)"
  ) +
  theme_void(base_size = 12) +
  theme(
    plot.title      = element_text(face = "bold", size = 14, hjust = 0.5, margin = margin(b = 4)),
    plot.subtitle   = element_text(color = "grey40", size = 9, hjust = 0.5, margin = margin(b = 8)),
    plot.caption    = element_text(color = "grey55", size = 8, hjust = 1, margin = margin(t = 6)),
    legend.position = "bottom",
    legend.title    = element_text(size = 8.5, face = "bold"),
    legend.text     = element_text(size = 8)
  )

4 Geographic Inequalities in TB Burden

Objective 2: To visualize geographic inequalities in tuberculosis burden.

global_map_data <- world_sf |>
  left_join(
    tb_latest |> select(iso3c, tb_incidence),
    by = c("iso_a3" = "iso3c")
  )

ggplot(global_map_data) +
  geom_sf(aes(fill = tb_incidence), color = "white", linewidth = 0.15) +
  scale_fill_viridis(
    option   = "plasma",
    na.value = "grey88",
    name     = "TB incidence\nper 100,000",
    breaks   = c(50, 150, 300, 500, 700),
    labels   = comma_format(accuracy = 1),
    guide    = guide_colorbar(
      barwidth = 10, barheight = 0.5,
      title.position = "top", title.hjust = 0.5,
      ticks.colour = "white"
    )
  ) +
  labs(
    title    = paste0("Global TB Burden Inequality (", latest_year, ")"),
    subtitle = "TB incidence per 100,000 population · Grey = no data",
    caption  = "Source: World Bank WDI"
  ) +
  theme_void(base_size = 11) +
  theme(
    plot.title      = element_text(face = "bold", size = 13, hjust = 0.5, margin = margin(b = 4)),
    plot.subtitle   = element_text(color = "grey40", size = 8.5, hjust = 0.5, margin = margin(b = 8)),
    plot.caption    = element_text(color = "grey55", size = 8, hjust = 1, margin = margin(t = 6)),
    legend.position = "bottom",
    legend.title    = element_text(size = 8.5, face = "bold"),
    legend.text     = element_text(size = 8)
  )

4.1 Regional TB Inequality

global_avg <- mean(tb_latest$tb_incidence, na.rm = TRUE)

region_tb <- tb_latest |>
  group_by(region) |>
  summarise(avg_tb = mean(tb_incidence, na.rm = TRUE), .groups = "drop") |>
  arrange(desc(avg_tb)) |>
  mutate(
    above_avg = avg_tb >= global_avg,
    label     = as.character(round(avg_tb, 1))
  )

n_above_tb <- sum(region_tb$above_avg)
n_total_tb <- nrow(region_tb)

ggplot(region_tb, aes(x = avg_tb, y = reorder(region, avg_tb))) +

  geom_col(aes(fill = above_avg), width = 0.65, alpha = 0.88) +

  geom_text(
    aes(label = label, color = above_avg),
    hjust = -0.2, size = 3.4, fontface = "bold"
  ) +

  geom_vline(
    xintercept = global_avg,
    linetype = "dashed", color = "#c0392b", linewidth = 0.7
  ) +
  annotate("text",
    x = global_avg - 2, y = 3.5,
    label    = paste0("Global avg\n(", round(global_avg, 1), ")"),
    hjust = 1, vjust = 0.5,
    size = 2.85, color = "#c0392b", fontface = "italic", lineheight = 0.95
  ) +

  scale_fill_manual(
    values = c("TRUE" = "#c0392b", "FALSE" = "#2980b9"),
    labels = c("TRUE" = "Above global average", "FALSE" = "Below global average"),
    name = NULL, drop = FALSE
  ) +
  scale_color_manual(
    values = c("TRUE" = "#922b21", "FALSE" = "#1a5276"),
    guide  = "none"
  ) +
  scale_x_continuous(
    limits = c(0, max(region_tb$avg_tb) * 1.18),
    breaks = seq(0, 200, 50),
    labels = comma_format(accuracy = 1),
    expand = c(0, 0)
  ) +
  scale_y_discrete(expand = expansion(add = c(0.5, 0.8))) +

  labs(
    title    = paste0("TB Burden Inequality by Region (", latest_year, ")"),
    subtitle = paste0(
      "Average TB incidence per 100,000 \u00b7 ",
      n_above_tb, " of ", n_total_tb,
      " regions exceed the global average (", round(global_avg, 1), ")"
    ),
    x       = "Average TB incidence per 100,000",
    y       = NULL,
    caption = "Source: World Bank WDI"
  ) +

  theme_minimal(base_size = 12) +
  theme(
    plot.title         = element_text(face = "bold", size = 13, hjust = 0),
    plot.subtitle      = element_text(color = "grey40", size = 8.5, hjust = 0, margin = margin(b = 10)),
    plot.caption       = element_text(color = "grey55", size = 8, hjust = 1),
    panel.grid.major.y = element_blank(),
    panel.grid.minor   = element_blank(),
    panel.grid.major.x = element_line(color = "grey92"),
    axis.text.y        = element_text(size = 10, color = "grey25"),
    axis.text.x        = element_text(size = 9, color = "grey45"),
    axis.title.x       = element_text(size = 9.5, color = "grey30", margin = margin(t = 6)),
    legend.position    = "top",
    legend.justification = "left",
    legend.text        = element_text(size = 9),
    plot.margin        = margin(12, 24, 10, 12)
  )

5 Temporal Trends & COVID-19 Disruption

Objectives 2: To examine temporal trends in TB incidence (2000–2024) and the impact of COVID-19 disruptions on TB notifications.

The global average TB incidence has declined steadily since 2000. The COVID-19 pandemic (2020–2022) disrupted health services, suppressing TB notifications — a data artefact rather than a true reduction, as subsequent years show a partial rebound.

tb_global_trend <- tb_clean |>
  filter(!is.na(tb_incidence)) |>
  group_by(year) |>
  summarise(avg_tb = mean(tb_incidence, na.rm = TRUE), .groups = "drop")

ggplot(tb_global_trend, aes(x = year, y = avg_tb)) +

  # COVID-19 disruption shading
  annotate("rect",
    xmin = 2019.5, xmax = 2022.5,
    ymin = -Inf,   ymax = Inf,
    fill = "#f39c12", alpha = 0.10
  ) +
  annotate("text",
    x = 2021, y = max(tb_global_trend$avg_tb) * 0.97,
    label    = "COVID-19\ndisruption",
    size = 3, color = "#e67e22", fontface = "italic", lineheight = 0.9
  ) +

  geom_line(color = "#2980b9", linewidth = 1.1) +
  geom_point(
    shape = 21, fill = "white", color = "#2980b9",
    size = 2.5, stroke = 0.9
  ) +

  scale_x_continuous(breaks = seq(2000, 2024, 3)) +
  scale_y_continuous(labels = comma_format(accuracy = 1)) +

  labs(
    title    = "Global Average TB Incidence Trend (2000\u20132024)",
    subtitle = "Mean TB incidence per 100,000 \u00b7 Shaded area = COVID-19 disruption period (2020\u20132022)",
    x        = NULL,
    y        = "Avg. TB incidence per 100,000",
    caption  = "Source: World Bank WDI"
  ) +

  theme_minimal(base_size = 12) +
  theme(
    plot.title       = element_text(face = "bold", size = 13, hjust = 0),
    plot.subtitle    = element_text(color = "grey45", size = 8.5, hjust = 0, margin = margin(b = 8)),
    plot.caption     = element_text(color = "grey55", size = 8, hjust = 1),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_line(color = "grey93"),
    axis.text        = element_text(size = 9, color = "grey40"),
    axis.title.y     = element_text(size = 9.5, color = "grey30", margin = margin(r = 8))
  )

6 Forecasting TB Incidence

Objective 3: To model and forecast tuberculosis incidence rates using time-series analysis.

A Prophet time-series model is fitted to global average TB incidence (2000–2024) and projected forward 5 years. The declining trajectory is expected to continue, but the widening confidence interval reflects genuine uncertainty in the post-COVID period.

ts_data <- tb_clean |>
  filter(!is.na(tb_incidence)) |>
  group_by(year) |>
  summarise(tb_incidence = mean(tb_incidence, na.rm = TRUE), .groups = "drop") |>
  rename(ds = year, y = tb_incidence) |>
  mutate(ds = as.Date(paste0(ds, "-01-01")))

prophet_model <- prophet(
  ts_data,
  yearly.seasonality  = FALSE,
  weekly.seasonality  = FALSE,
  daily.seasonality   = FALSE
)

future_df  <- make_future_dataframe(prophet_model, periods = 5, freq = "year")
prophet_fc <- predict(prophet_model, future_df)
cut_date   <- max(ts_data$ds)

fc_plot <- prophet_fc |>
  select(ds, yhat, yhat_lower, yhat_upper) |>
  left_join(ts_data, by = "ds") |>
  mutate(period = if_else(ds <= cut_date, "Historical", "Forecast"))

6.1 Main Forecast

ggplot(fc_plot, aes(x = ds)) +

  geom_ribbon(
    aes(ymin = yhat_lower, ymax = yhat_upper, fill = period),
    alpha = 0.18
  ) +
  geom_line(aes(y = yhat, color = period), linewidth = 1) +
  geom_point(
    data = filter(fc_plot, !is.na(y)),
    aes(y = y),
    shape = 21, fill = "white", color = "#2c3e50", size = 2.4, stroke = 0.8
  ) +
  geom_vline(
    xintercept = as.numeric(cut_date),
    linetype = "dashed", color = "grey50", linewidth = 0.6
  ) +
  annotate("text",
    x = cut_date, y = Inf,
    label    = "  Forecast \u25b6",
    vjust = 1.6, hjust = -0.05,
    size = 3, color = "grey45", fontface = "italic"
  ) +

  scale_color_manual(
    values = c("Historical" = "#2980b9", "Forecast" = "#e67e22"),
    guide  = "none"
  ) +
  scale_fill_manual(
    values = c("Historical" = "#2980b9", "Forecast" = "#e67e22"),
    guide  = "none"
  ) +
  scale_x_date(date_breaks = "3 years", date_labels = "%Y", expand = c(0.01, 0)) +
  scale_y_continuous(labels = comma_format(accuracy = 1)) +

  labs(
    title    = "Global TB Incidence: Historical Trend & 5-Year Forecast",
    subtitle = "Prophet model \u00b7 Shaded band = 95% uncertainty interval \u00b7 Circles = observed data",
    x        = NULL,
    y        = "Avg. TB incidence per 100,000",
    caption  = "Source: World Bank WDI"
  ) +

  theme_minimal(base_size = 12) +
  theme(
    plot.title       = element_text(face = "bold", size = 13, hjust = 0),
    plot.subtitle    = element_text(color = "grey45", size = 8.5, hjust = 0, margin = margin(b = 8)),
    plot.caption     = element_text(color = "grey55", size = 8, hjust = 1),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_line(color = "grey93"),
    axis.text        = element_text(size = 9, color = "grey40"),
    axis.title.y     = element_text(size = 9.5, color = "grey30", margin = margin(r = 8)),
    plot.margin      = margin(12, 16, 10, 12)
  )

6.2 Trend Component

trend_df <- prophet_fc |>
  select(ds, trend, trend_lower, trend_upper) |>
  mutate(period = if_else(ds <= cut_date, "Historical", "Forecast"))

ggplot(trend_df, aes(x = ds)) +
  geom_ribbon(
    aes(ymin = trend_lower, ymax = trend_upper),
    fill = "#8e44ad", alpha = 0.15
  ) +
  geom_line(aes(y = trend, color = period), linewidth = 1) +
  geom_vline(
    xintercept = as.numeric(cut_date),
    linetype = "dashed", color = "grey50", linewidth = 0.6
  ) +

  scale_color_manual(
    values = c("Historical" = "#8e44ad", "Forecast" = "#e67e22"),
    name   = NULL
  ) +
  scale_x_date(date_breaks = "3 years", date_labels = "%Y", expand = c(0.01, 0)) +
  scale_y_continuous(labels = comma_format(accuracy = 1)) +

  labs(
    title    = "Underlying Trend Component",
    subtitle = "Smoothed long-run trajectory extracted by Prophet (95% CI shaded)",
    x        = NULL,
    y        = "Trend value",
    caption  = "Source: World Bank WDI"
  ) +

  theme_minimal(base_size = 12) +
  theme(
    plot.title       = element_text(face = "bold", size = 13, hjust = 0),
    plot.subtitle    = element_text(color = "grey45", size = 8.5, hjust = 0, margin = margin(b = 8)),
    plot.caption     = element_text(color = "grey55", size = 8, hjust = 1),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_line(color = "grey93"),
    legend.position  = "top",
    legend.justification = "left",
    axis.text        = element_text(size = 9, color = "grey40"),
    axis.title.y     = element_text(size = 9.5, color = "grey30", margin = margin(r = 8)),
    plot.margin      = margin(12, 16, 10, 12)
  )

7 HIV Prevalence and TB Incidence

Objective 3: To assess the relationship between HIV prevalence and tuberculosis incidence.

HIV is a leading driver of TB susceptibility. The regression below (IQR-winsorised to reduce the influence of extreme values) quantifies this relationship across countries. Countries in the top 20% of both burdens are highlighted.

# Use the latest year for which both HIV and TB data are available
hiv_tb_year <- max(tb_clean$year[!is.na(tb_clean$hiv_prevalence) & !is.na(tb_clean$tb_incidence)])

hiv_tb_raw <- tb_clean |>
  filter(year == hiv_tb_year, !is.na(tb_incidence), !is.na(hiv_prevalence))

cap_iqr <- function(x) {
  q1 <- quantile(x, 0.25, na.rm = TRUE)
  q3 <- quantile(x, 0.75, na.rm = TRUE)
  iqr <- q3 - q1
  pmin(pmax(x, q1 - 1.5 * iqr), q3 + 1.5 * iqr)
}

hiv_tb_plot <- hiv_tb_raw |>
  mutate(
    tb  = cap_iqr(tb_incidence),
    hiv = cap_iqr(hiv_prevalence),
    high_burden = ifelse(
      tb >= quantile(tb, 0.8) & hiv >= quantile(hiv, 0.8),
      "High TB-HIV burden", "Other"
    )
  )

lm_hiv    <- lm(tb ~ hiv, data = hiv_tb_plot)
r2_hiv    <- summary(lm_hiv)$r.squared
int_hiv   <- coef(lm_hiv)[1]
slp_hiv   <- coef(lm_hiv)[2]
eq_label  <- paste0(
  "TB = ", round(int_hiv, 2), " + ", round(slp_hiv, 2), " \u00d7 HIV\n",
  "R\u00b2 = ", round(r2_hiv, 3)
)

ggplot(hiv_tb_plot, aes(x = hiv, y = tb)) +

  geom_point(
    aes(color = high_burden, shape = high_burden),
    size = 2.8, alpha = 0.75, stroke = 0.3
  ) +
  geom_smooth(
    method = "lm", color = "#2c2c2c", linewidth = 0.8,
    se = TRUE, fill = "grey80", alpha = 0.25
  ) +
  geom_text_repel(
    data          = subset(hiv_tb_plot, high_burden == "High TB-HIV burden"),
    aes(label     = country),
    size          = 2.8, color = "#c0392b", fontface = "italic",
    max.overlaps  = 20, box.padding = 0.4, point.padding = 0.3,
    segment.color = "grey55", segment.size = 0.3
  ) +
  annotate("label",
    x = -Inf, y = -Inf, hjust = -0.05, vjust = -0.3,
    label    = eq_label,
    size     = 3.2, color = "#2c2c2c", fill = "white",
    label.size = 0.3, label.r = unit(0.15, "lines"),
    fontface = "plain", family = "sans"
  ) +

  scale_color_manual(values = c("High TB-HIV burden" = "#c0392b", "Other" = "#3498db")) +
  scale_shape_manual(values = c("High TB-HIV burden" = 17, "Other" = 16)) +

  labs(
    title    = paste0("HIV Prevalence vs. TB Incidence (", hiv_tb_year, ")"),
    subtitle = "IQR-winsorised values \u00b7 OLS regression line with 95% CI \u00b7 Triangles = top 20% dual burden",
    x        = "HIV prevalence (%)",
    y        = "TB incidence per 100,000",
    color    = NULL, shape = NULL,
    caption  = "Source: World Bank WDI"
  ) +

  theme_minimal(base_size = 12) +
  theme(
    plot.title       = element_text(face = "bold", size = 13, hjust = 0),
    plot.subtitle    = element_text(color = "grey45", size = 9, hjust = 0, margin = margin(b = 8)),
    plot.caption     = element_text(color = "grey55", size = 8, hjust = 1),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_line(color = "grey92"),
    legend.position  = "top",
    legend.justification = "left",
    legend.text      = element_text(size = 9),
    axis.title       = element_text(size = 10, color = "grey30"),
    axis.text        = element_text(size = 9, color = "grey40"),
    plot.margin      = margin(12, 16, 10, 12)
  )

8 TB–HIV Spatial Hotspot Analysis

Objective 4: To identify tuberculosis hotspot regions using spatial analysis techniques.

A composite TB–HIV burden index (standardised TB + HIV z-scores) is used to compute Local Moran’s I (LISA) statistics. High-High clusters indicate countries that are themselves high-burden and surrounded by high-burden neighbours.

hiv_tb_spatial <- tb_clean |>
  filter(year == hiv_tb_year, !is.na(tb_incidence), !is.na(hiv_prevalence)) |>
  mutate(
    tb_z         = as.numeric(scale(tb_incidence)),
    hiv_z        = as.numeric(scale(hiv_prevalence)),
    burden_index = tb_z + hiv_z
  )

world_hotspot <- world_sf |>
  left_join(hiv_tb_spatial, by = c("iso_a3" = "iso3c"))

map_clean_hs <- world_hotspot |> filter(!is.na(burden_index))

nb_hs  <- poly2nb(map_clean_hs, queen = TRUE, snap = 0.1)
wt_hs  <- nb2listw(nb_hs, style = "W", zero.policy = TRUE)

moran_i    <- moran.test(map_clean_hs$burden_index, wt_hs, zero.policy = TRUE)
lm_hs      <- localmoran(map_clean_hs$burden_index, wt_hs, zero.policy = TRUE)
map_clean_hs$local_I <- lm_hs[, 1]

map_clean_hs <- map_clean_hs |>
  mutate(hotspot = case_when(
    burden_index > 0 & local_I > 0 ~ "High-High (Hotspot)",
    burden_index < 0 & local_I > 0 ~ "Low-Low (Coldspot)",
    TRUE                            ~ "Other"
  ))

Global Moran’s I = 0.747 (p < 0.001), confirming strong positive spatial autocorrelation — TB–HIV burden clusters geographically, not randomly.

ggplot() +
  geom_sf(data = world_hotspot, fill = "grey95", color = "white", linewidth = 0.15) +
  geom_sf(data = map_clean_hs, aes(fill = hotspot), color = "white", linewidth = 0.15) +
  scale_fill_manual(values = c(
    "High-High (Hotspot)" = "#c0392b",
    "Low-Low (Coldspot)"  = "#27ae60",
    "Other"               = "grey80"
  )) +
  labs(
    title    = paste0("TB\u2013HIV Spatial Hotspots (", hiv_tb_year, ")"),
    subtitle = paste0(
      "Local Moran\u2019s I \u00b7 Global I = ",
      round(moran_i$estimate[1], 3), ", p < 0.001"
    ),
    fill    = "Cluster Type",
    caption = "Source: World Bank WDI \u00b7 Natural Earth"
  ) +
  theme_void(base_size = 12) +
  theme(
    plot.title      = element_text(face = "bold", size = 13, hjust = 0.5, margin = margin(b = 4)),
    plot.subtitle   = element_text(color = "grey40", size = 9, hjust = 0.5, margin = margin(b = 8)),
    plot.caption    = element_text(color = "grey55", size = 8, hjust = 1),
    legend.position = "bottom",
    legend.title    = element_text(size = 9),
    legend.text     = element_text(size = 8)
  )

9 TB Treatment Outcomes

Objective 5: To compare tuberculosis treatment outcomes across high-burden countries.

The WHO recommends a treatment success rate of ≥85% as the benchmark for effective TB programmes. No region currently meets this target, though South Asia and the Middle East & North Africa are within 0.2 percentage points.

tx_year   <- max(tb_clean$year[!is.na(tb_clean$treatment_success)])
tx_latest <- tb_clean |>
  filter(year == tx_year, !is.na(treatment_success))

top_10_tx <- tx_latest |> arrange(desc(treatment_success)) |> slice_head(n = 10) |>
  select(country, region, treatment_success)

bottom_10_tx <- tx_latest |> arrange(treatment_success) |> slice_head(n = 10) |>
  select(country, region, treatment_success)

region_tx <- tx_latest |>
  group_by(region) |>
  summarise(avg_success = mean(treatment_success, na.rm = TRUE), .groups = "drop") |>
  arrange(desc(avg_success)) |>
  mutate(
    meets_target = avg_success >= 85,
    label        = paste0(round(avg_success, 1), "%")
  )

n_tx <- nrow(region_tx)

ggplot(region_tx, aes(x = avg_success, y = reorder(region, avg_success))) +

  geom_col(aes(fill = meets_target), width = 0.65, alpha = 0.88) +
  geom_text(
    aes(label = label, color = meets_target),
    hjust = -0.15, size = 3.4, fontface = "bold"
  ) +
  geom_vline(
    xintercept = 85,
    linetype = "dashed", color = "#c0392b", linewidth = 0.7
  ) +
  annotate("text",
    x = 85 - 0.8, y = n_tx + 0.45,
    label    = "WHO target\n(85%)",
    hjust = 1, vjust = 0.5,
    size = 2.8, color = "#c0392b", fontface = "italic", lineheight = 0.95
  ) +

  scale_fill_manual(
    values = c("TRUE" = "#27ae60", "FALSE" = "#e67e22"),
    labels = c("TRUE" = "\u226585% \u2014 meets target", "FALSE" = "Below WHO target (85%)"),
    name = NULL, drop = FALSE
  ) +
  scale_color_manual(
    values = c("TRUE" = "#1a7a45", "FALSE" = "#b85c10"),
    guide  = "none"
  ) +
  scale_x_continuous(
    limits = c(0, 100), breaks = seq(0, 100, 20),
    labels = function(x) paste0(x, "%"), expand = c(0, 0)
  ) +
  scale_y_discrete(expand = expansion(add = c(0.5, 0.8))) +

  labs(
    title    = paste0("TB Treatment Success Rate by Region (", tx_year, ")"),
    subtitle = paste0(
      "WHO recommends \u226585% \u00b7 ",
      sum(region_tx$meets_target), " of ", n_tx,
      " regions currently meet the target"
    ),
    x       = "Average treatment success rate (%)",
    y       = NULL,
    caption = "Source: World Bank WDI (SH.TBS.CURE.ZS)"
  ) +

  theme_minimal(base_size = 12) +
  theme(
    plot.title         = element_text(face = "bold", size = 13, hjust = 0),
    plot.subtitle      = element_text(color = "grey40", size = 8.5, hjust = 0, margin = margin(b = 10)),
    plot.caption       = element_text(color = "grey55", size = 8, hjust = 1),
    panel.grid.major.y = element_blank(),
    panel.grid.minor   = element_blank(),
    panel.grid.major.x = element_line(color = "grey92"),
    axis.text.y        = element_text(size = 10, color = "grey25"),
    axis.text.x        = element_text(size = 9, color = "grey45"),
    axis.title.x       = element_text(size = 9.5, color = "grey30", margin = margin(t = 6)),
    legend.position    = "top",
    legend.justification = "left",
    legend.text        = element_text(size = 9),
    plot.margin        = margin(12, 20, 10, 12)
  )

9.0.1 Top & Bottom 10 Countries

Top 10 Countries by Treatment Success Rate (2023)
Country	Region	Success Rate (%)
Samoa	East Asia & Pacific	100
West Bank and Gaza	Middle East, North Africa, Afghanistan & Pakistan	100
Kuwait	Middle East, North Africa, Afghanistan & Pakistan	99
Bangladesh	South Asia	96
Cambodia	East Asia & Pacific	96
Ethiopia	Sub-Saharan Africa	96
Tanzania	Sub-Saharan Africa	96
China	East Asia & Pacific	95
Congo, Dem. Rep.	Sub-Saharan Africa	95
Mozambique	Sub-Saharan Africa	95

Bottom 10 Countries by Treatment Success Rate (2023)
Country	Region	Success Rate (%)
Antigua and Barbuda	Latin America & Caribbean	0
Grenada	Latin America & Caribbean	0
Latvia	Europe & Central Asia	0
France	Europe & Central Asia	20
New Caledonia	East Asia & Pacific	26
Denmark	Europe & Central Asia	30
Finland	Europe & Central Asia	34
Ireland	Europe & Central Asia	36
Bosnia and Herzegovina	Europe & Central Asia	37
Jamaica	Latin America & Caribbean	38

10 Socioeconomic Determinants of TB

Objective 6: To investigate socio-economic predictors of tuberculosis prevalence.

GDP per capita (log-transformed) explains 27.2% of the variance in TB incidence, reinforcing the strong poverty–disease link. Health expenditure as a share of GDP shows a weaker relationship, suggesting how much a country spends matters less than how wealthy it is overall.

socio_year <- max(tb_clean$year[
  !is.na(tb_clean$gdp_per_capita) &
  !is.na(tb_clean$health_expenditure) &
  !is.na(tb_clean$tb_incidence)
])

socio_df <- tb_clean |>
  filter(year == socio_year, !is.na(tb_incidence),
         !is.na(gdp_per_capita), !is.na(health_expenditure))

tb_thresh_socio <- quantile(socio_df$tb_incidence, 0.90, na.rm = TRUE)

m_gdp_s  <- lm(tb_incidence ~ log10(gdp_per_capita), data = socio_df)
m_hlth_s <- lm(tb_incidence ~ health_expenditure,    data = socio_df)
r2_g_s   <- summary(m_gdp_s)$r.squared
r2_h_s   <- summary(m_hlth_s)$r.squared

p_gdp_s <- ggplot(socio_df, aes(x = gdp_per_capita, y = tb_incidence)) +

  geom_point(aes(color = region), size = 2.2, alpha = 0.75, shape = 16) +
  geom_smooth(
    method = "lm", color = "#2c2c2c", linewidth = 0.85,
    fill = "grey75", alpha = 0.2, se = TRUE
  ) +
  geom_text_repel(
    data = filter(socio_df, tb_incidence >= tb_thresh_socio),
    aes(label = country, color = region),
    size = 2.5, fontface = "italic", max.overlaps = 15,
    box.padding = 0.35, segment.color = "grey60",
    segment.size = 0.3, show.legend = FALSE
  ) +
  annotate("label",
    x = Inf, y = Inf, hjust = 1.05, vjust = 1.4,
    label    = paste0("R\u00b2 = ", round(r2_g_s, 3)),
    size = 3.2, fill = "white", label.size = 0.3,
    label.r = unit(0.12, "lines"), color = "#2c2c2c"
  ) +

  scale_x_log10(labels = label_dollar(scale_cut = cut_short_scale())) +
  scale_y_continuous(labels = comma_format(accuracy = 1)) +
  scale_color_brewer(palette = "Set2", name = NULL) +

  labs(
    title    = "GDP per Capita vs TB Incidence",
    subtitle = "Log scale \u00b7 OLS fit with 95% CI \u00b7 Top 10% burden labelled",
    x        = "GDP per capita (log scale, USD)",
    y        = "TB incidence per 100,000"
  ) +

  theme_minimal(base_size = 11) +
  theme(
    plot.title       = element_text(face = "bold", size = 12, hjust = 0),
    plot.subtitle    = element_text(color = "grey45", size = 8, hjust = 0, margin = margin(b = 6)),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_line(color = "grey93"),
    legend.position  = "none",
    axis.text        = element_text(size = 8.5, color = "grey40"),
    axis.title       = element_text(size = 9, color = "grey30"),
    plot.margin      = margin(10, 12, 6, 10)
  )

p_hlth_s <- ggplot(socio_df, aes(x = health_expenditure, y = tb_incidence)) +

  geom_point(aes(color = region), size = 2.2, alpha = 0.75, shape = 16) +
  geom_smooth(
    method = "lm", color = "#2c2c2c", linewidth = 0.85,
    fill = "grey75", alpha = 0.2, se = TRUE
  ) +
  geom_text_repel(
    data = filter(socio_df, tb_incidence >= tb_thresh_socio),
    aes(label = country, color = region),
    size = 2.5, fontface = "italic", max.overlaps = 15,
    box.padding = 0.35, segment.color = "grey60",
    segment.size = 0.3, show.legend = FALSE
  ) +
  annotate("label",
    x = Inf, y = Inf, hjust = 1.05, vjust = 1.4,
    label    = paste0("R\u00b2 = ", round(r2_h_s, 3)),
    size = 3.2, fill = "white", label.size = 0.3,
    label.r = unit(0.12, "lines"), color = "#2c2c2c"
  ) +

  scale_y_continuous(labels = comma_format(accuracy = 1)) +
  scale_color_brewer(palette = "Set2", name = "Region") +

  labs(
    title    = "Health Expenditure vs TB Incidence",
    subtitle = "% of GDP \u00b7 OLS fit with 95% CI \u00b7 Top 10% burden labelled",
    x        = "Health expenditure (% of GDP)",
    y        = "TB incidence per 100,000"
  ) +

  theme_minimal(base_size = 11) +
  theme(
    plot.title       = element_text(face = "bold", size = 12, hjust = 0),
    plot.subtitle    = element_text(color = "grey45", size = 8, hjust = 0, margin = margin(b = 6)),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_line(color = "grey93"),
    legend.position  = "right",
    legend.text      = element_text(size = 8),
    legend.key.size  = unit(0.45, "cm"),
    axis.text        = element_text(size = 8.5, color = "grey40"),
    axis.title       = element_text(size = 9, color = "grey30"),
    plot.margin      = margin(10, 12, 6, 10)
  )

(p_gdp_s | p_hlth_s) +
  plot_annotation(
    title   = paste0("Socioeconomic Determinants of TB Incidence (", socio_year, ")"),
    caption = "Source: World Bank WDI",
    theme   = theme(
      plot.title   = element_text(face = "bold", size = 14, hjust = 0),
      plot.caption = element_text(color = "grey55", size = 8, hjust = 1)
    )
  ) +
  plot_layout(guides = "collect") &
  theme(legend.position = "bottom")

11 Summary

Objective	Key Finding
1. Spatial distribution (Africa)	Southern & Eastern Africa carry the highest sub-continental burden.
2. Temporal trends (2000–2024)	Consistent decline of ~50% over 25 years; post-2020 rebound observed.
3. HIV–TB relationship	R² = 0.251; HIV prevalence is a significant positive predictor of TB incidence.
4. Spatial hotspot analysis	Global Moran’s I = 0.747; Sub-Saharan Africa is the dominant High-High cluster.
5. Forecasting	Prophet projects continued decline; 5-year CI widens post-2024 due to post-COVID uncertainty.
6. Treatment outcomes	0 of 7 regions meet the WHO ≥85% target; Latin America & Caribbean lags most (68%).
7. Socioeconomic determinants	GDP (log) explains 47.8% of TB variance; health spending explains only 7.6%.
8. Global inequality	3 of 7 regions exceed the global average; North America is lowest at 4.2.
9. COVID-19 disruptions	2020–2022 shows a visible notification dip consistent with service disruption; not a true incidence decline.

Report generated using R 4.5.3 · World Bank WDI via the WDI package · May 2026

Global Tuberculosis Burden: Surveillance, Trends & Determinants

A Multi-Objective Evidence-Based Analysis (2000–2024)

May 20, 2026