Weekly Malaria Case Prediction

Negative Binomial Regression with Environmental Predictors

Author

Timothy Achala

Overview

This report models weekly disease case counts in a specific district using Negative Binomial Regression, which is well-suited for over-dispersed count data. Environmental predictors include:

  • Rainfall (mm/week)
  • Temperature (°C — mean weekly)
  • NDVI (Normalised Difference Vegetation Index — proxy for vegetation density and humidity)

1. Setup: Load Libraries

Code
# Install missing packages automatically
packages <- c(
  "tidyverse", "MASS", "ggplot2", "performance",
  "broom", "lubridate", "patchwork", "corrplot",
  "GGally", "DHARMa", "car", "kableExtra",
  "modelsummary", "ggeffects", "scales"
)

installed <- rownames(installed.packages())
to_install <- setdiff(packages, installed)
if (length(to_install) > 0) install.packages(to_install, repos = "https://cloud.r-project.org")

library(tidyverse)
library(MASS)          # Negative Binomial regression (glm.nb)
library(ggplot2)
library(performance)   # Model diagnostics
library(broom)         # Tidy model outputs
library(lubridate)     # Date handling
library(patchwork)     # Combine plots
library(corrplot)      # Correlation matrix
library(GGally)        # Pairs plot
library(DHARMa)        # Residual diagnostics for GLMs
library(car)           # VIF
library(kableExtra)    # Table formatting
library(modelsummary)  # Regression tables
library(ggeffects)     # Marginal effects
library(scales)        # Axis formatting

conflicted::conflict_prefer("select", "dplyr")
conflicted::conflict_prefer("filter", "dplyr")
conflicted::conflict_prefer("recode", "dplyr")
set.seed(123)  # Reproducibility

Code
# ── Option A: Simulate realistic data ─────────────────────────────────────────
n_weeks <- 156  # 3 years of weekly data

# Create weekly time series
week_start <- seq(as.Date("2021-01-04"), by = "week", length.out = n_weeks)
week_num   <- 1:n_weeks

# Seasonal patterns (tropical region)
seasonal_rain  <- 40 + 35 * sin(2 * pi * week_num / 52 - 1.2) + rnorm(n_weeks, 0, 8)
seasonal_temp  <- 26 + 4  * sin(2 * pi * week_num / 52 - 0.5) + rnorm(n_weeks, 0, 1.2)
seasonal_ndvi  <- 0.45 + 0.18 * sin(2 * pi * week_num / 52 - 0.8) + rnorm(n_weeks, 0, 0.04)

# Clip to realistic ranges
rainfall_mm <- pmax(0, seasonal_rain)
temp_mean_c <- pmin(pmax(seasonal_temp, 18), 38)
ndvi        <- pmin(pmax(seasonal_ndvi, 0.1), 0.9)

# True log-linear relationship (Negative Binomial outcome)
log_mu <- -2.5 +
  0.015 * rainfall_mm +
  0.12  * temp_mean_c +
  3.2   * ndvi +
  0.4   * sin(2 * pi * week_num / 52)   # additional seasonal trend

# Simulate NB counts (theta = dispersion parameter)
cases <- rnegbin(n_weeks, mu = exp(log_mu), theta = 3.5)

district_data <- tibble(
  week_start   = week_start,
  year         = year(week_start),
  week_of_year = week(week_start),
  cases        = cases,
  rainfall_mm  = round(rainfall_mm, 1),
  temp_mean_c  = round(temp_mean_c, 2),
  ndvi         = round(ndvi, 4)
)

# ── Option B: Import your own data ────────────────────────────────────────────
# district_data <- read_csv("your_data.csv") |>
#   mutate(week_start = as.Date(week_start))

head(district_data, 10)

3. Exploratory Data Analysis (EDA)

3.1 Summary Statistics

Code
district_data |>
  dplyr::select(cases, rainfall_mm, temp_mean_c, ndvi) |>
  summary() |>
  kable(caption = "Summary Statistics of Key Variables") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Summary Statistics of Key Variables
cases rainfall_mm temp_mean_c ndvi
Min. : 0.00 Min. : 0.00 Min. :20.01 Min. :0.1739
1st Qu.: 3.75 1st Qu.:16.10 1st Qu.:23.59 1st Qu.:0.3337
Median : 13.00 Median :40.50 Median :26.21 Median :0.4577
Mean : 28.00 Mean :39.91 Mean :26.10 Mean :0.4497
3rd Qu.: 39.25 3rd Qu.:63.25 3rd Qu.:28.72 3rd Qu.:0.5614
Max. :162.00 Max. :88.80 Max. :32.53 Max. :0.6834

3.2 Time Series of Cases and Predictors

Code
p_cases <- ggplot(district_data, aes(x = week_start, y = cases)) +
  geom_col(fill = "#c0392b", alpha = 0.75) +
  geom_smooth(color = "#7f0000", se = FALSE, linewidth = 1) +
  labs(title = "Weekly Disease Cases", x = NULL, y = "Cases") +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(face = "bold"))

p_rain <- ggplot(district_data, aes(x = week_start, y = rainfall_mm)) +
  geom_line(color = "#2980b9", linewidth = 0.8) +
  geom_area(fill = "#2980b9", alpha = 0.2) +
  labs(title = "Weekly Rainfall (mm)", x = NULL, y = "mm") +
  theme_minimal(base_size = 12)

p_temp <- ggplot(district_data, aes(x = week_start, y = temp_mean_c)) +
  geom_line(color = "#e67e22", linewidth = 0.8) +
  labs(title = "Mean Weekly Temperature (°C)", x = NULL, y = "°C") +
  theme_minimal(base_size = 12)

p_ndvi <- ggplot(district_data, aes(x = week_start, y = ndvi)) +
  geom_line(color = "#27ae60", linewidth = 0.8) +
  geom_area(fill = "#27ae60", alpha = 0.2) +
  labs(title = "NDVI (Vegetation Index)", x = NULL, y = "NDVI") +
  theme_minimal(base_size = 12)

(p_cases / p_rain / p_temp / p_ndvi) +
  plot_annotation(
    title    = "District Environmental & Epidemiological Trends",
    subtitle = "3-Year Weekly Data",
    theme    = theme(plot.title = element_text(face = "bold", size = 14))
  )

3.3 Distribution of Cases

Code
p_hist <- ggplot(district_data, aes(x = cases)) +
  geom_histogram(binwidth = 2, fill = "#c0392b", color = "white", alpha = 0.8) +
  labs(title = "Distribution of Weekly Cases",
       subtitle = sprintf("Mean = %.1f | Variance = %.1f | Var/Mean = %.1f",
                          mean(district_data$cases),
                          var(district_data$cases),
                          var(district_data$cases) / mean(district_data$cases)),
       x = "Weekly Cases", y = "Frequency") +
  theme_minimal(base_size = 12)

p_qq <- ggplot(district_data, aes(sample = cases)) +
  stat_qq(color = "#c0392b", alpha = 0.6) +
  stat_qq_line(color = "black", linetype = "dashed") +
  labs(title = "Q-Q Plot of Cases", x = "Theoretical Quantiles", y = "Sample Quantiles") +
  theme_minimal(base_size = 12)

p_hist + p_qq +
  plot_annotation(title = "Case Count Distribution — Over-dispersion Check",
                  theme = theme(plot.title = element_text(face = "bold")))

Interpretation: A Variance/Mean ratio >> 1 confirms over-dispersion, justifying the Negative Binomial model over Poisson.

3.4 Correlation Matrix

Code
cor_data <- district_data |> select(cases, rainfall_mm, temp_mean_c, ndvi)
cor_matrix <- cor(cor_data, method = "spearman")

corrplot(cor_matrix,
         method = "color",
         type   = "upper",
         addCoef.col = "black",
         tl.col  = "black",
         tl.srt  = 45,
         col     = colorRampPalette(c("#2980b9", "white", "#c0392b"))(200),
         title   = "Spearman Correlation Matrix",
         mar     = c(0, 0, 2, 0))

3.5 Scatter Plots: Cases vs Predictors

Code
p_s1 <- ggplot(district_data, aes(x = rainfall_mm, y = cases)) +
  geom_point(alpha = 0.5, color = "#2980b9") +
  geom_smooth(method = "loess", color = "#c0392b", se = TRUE) +
  labs(title = "Cases vs Rainfall", x = "Rainfall (mm)", y = "Cases") +
  theme_minimal()

p_s2 <- ggplot(district_data, aes(x = temp_mean_c, y = cases)) +
  geom_point(alpha = 0.5, color = "#e67e22") +
  geom_smooth(method = "loess", color = "#c0392b", se = TRUE) +
  labs(title = "Cases vs Temperature", x = "Temperature (°C)", y = "Cases") +
  theme_minimal()

p_s3 <- ggplot(district_data, aes(x = ndvi, y = cases)) +
  geom_point(alpha = 0.5, color = "#27ae60") +
  geom_smooth(method = "loess", color = "#c0392b", se = TRUE) +
  labs(title = "Cases vs NDVI", x = "NDVI", y = "Cases") +
  theme_minimal()

p_s1 + p_s2 + p_s3


4. Model Building

4.1 Data Splitting (Train / Test)

Code
# Use last 20 weeks as test set
n_test  <- 20
n_train <- nrow(district_data) - n_test

train_data <- district_data[1:n_train, ]
test_data  <- district_data[(n_train + 1):nrow(district_data), ]

cat(sprintf("Training weeks: %d | Test weeks: %d\n", n_train, n_test))
Training weeks: 136 | Test weeks: 20

4.2 Fit Negative Binomial Regression

Code
# ── Main model ─────────────────────────────────────────────────────────────────
nb_model <- glm.nb(
  cases ~ rainfall_mm + temp_mean_c + ndvi,
  data = train_data
)

# ── Model with seasonal term ───────────────────────────────────────────────────
nb_model_seasonal <- glm.nb(
  cases ~ rainfall_mm + temp_mean_c + ndvi +
          sin(2 * pi * week_of_year / 52) +
          cos(2 * pi * week_of_year / 52),
  data = train_data
)

# ── Null model (intercept only) for comparison ────────────────────────────────
nb_null <- glm.nb(cases ~ 1, data = train_data)

summary(nb_model_seasonal)

Call:
glm.nb(formula = cases ~ rainfall_mm + temp_mean_c + ndvi + sin(2 * 
    pi * week_of_year/52) + cos(2 * pi * week_of_year/52), data = train_data, 
    init.theta = 5.443376253, link = log)

Coefficients:
                               Estimate Std. Error z value Pr(>|z|)    
(Intercept)                   -3.207958   1.166219  -2.751 0.005946 ** 
rainfall_mm                    0.021725   0.006243   3.480 0.000501 ***
temp_mean_c                    0.133247   0.039806   3.347 0.000816 ***
ndvi                           3.327279   1.113229   2.989 0.002800 ** 
sin(2 * pi * week_of_year/52)  0.276690   0.216640   1.277 0.201537    
cos(2 * pi * week_of_year/52)  0.355841   0.252943   1.407 0.159485    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for Negative Binomial(5.4434) family taken to be 1)

    Null deviance: 834.22  on 135  degrees of freedom
Residual deviance: 142.35  on 130  degrees of freedom
AIC: 976.93

Number of Fisher Scoring iterations: 1

              Theta:  5.443 
          Std. Err.:  0.888 

 2 x log-likelihood:  -962.928 

4.3 Model Comparison

Code
model_list <- list(
  "NB: Null"            = nb_null,
  "NB: Environmental"   = nb_model,
  "NB: + Seasonal Terms" = nb_model_seasonal
)

modelsummary(
  model_list,
  statistic  = "({p.value})",
  stars      = TRUE,
  fmt        = 3,
  gof_map    = c("nobs", "AIC", "BIC", "logLik"),
  title      = "Negative Binomial Model Comparison",
  notes      = "p-values in parentheses. * p<0.05, ** p<0.01, *** p<0.001"
)
Negative Binomial Model Comparison
NB: Null NB: Environmental NB: + Seasonal Terms
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001
p-values in parentheses. * p<0.05, ** p<0.01, *** p<0.001
(Intercept) 3.454*** -4.719*** -3.208**
(<0.001) (<0.001) (0.006)
rainfall_mm 0.006+ 0.022***
(0.072) (<0.001)
temp_mean_c 0.204*** 0.133***
(<0.001) (<0.001)
ndvi 4.046*** 3.327**
(<0.001) (0.003)
sin(2 * pi * week_of_year/52) 0.277
(0.202)
cos(2 * pi * week_of_year/52) 0.356
(0.159)
Num.Obs. 136 136 136
Log.Lik. -605.484 -488.506 -481.464
Code
aic_table <- AIC(nb_null, nb_model, nb_model_seasonal) |>
  rownames_to_column("Model") |>
  mutate(Model = c("Null", "Environmental Only", "Environmental + Seasonal"),
         Delta_AIC = AIC - min(AIC)) |>
  arrange(AIC)

kable(aic_table, digits = 2, caption = "AIC Comparison (lower = better)") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) |>
  row_spec(1, bold = TRUE, background = "#d5e8d4")
AIC Comparison (lower = better)
Model df AIC Delta_AIC
Environmental + Seasonal 7 976.93 0.00
Environmental Only 5 987.01 10.08
Null 2 1214.97 238.04

Use the model with lowest AIC as your final model.

Code
# Select best model automatically
final_model <- nb_model_seasonal
cat("Selected model: NB with Environmental + Seasonal Terms\n")
Selected model: NB with Environmental + Seasonal Terms
Code
cat("Theta (dispersion):", final_model$theta, "\n")
Theta (dispersion): 5.443376 

5. Model Interpretation

5.1 Coefficient Table

Code
tidy(final_model, exponentiate = TRUE, conf.int = TRUE) |>
  rename(
    Term          = term,
    IRR           = estimate,
    `Std. Error`  = std.error,
    `z-value`     = statistic,
    `p-value`     = p.value,
    `CI Lower`    = conf.low,
    `CI Upper`    = conf.high
  ) |>
  mutate(Significance = case_when(
    `p-value` < 0.001 ~ "***",
    `p-value` < 0.01  ~ "**",
    `p-value` < 0.05  ~ "*",
    `p-value` < 0.1   ~ ".",
    TRUE              ~ ""
  )) |>
  kable(digits = 4,
        caption = "Incidence Rate Ratios (IRR) from Negative Binomial Model") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) |>
  add_header_above(c(" " = 1, "Exponentiated Coefficients (IRR)" = 3,
                     "95% CI" = 2, " " = 2))
Incidence Rate Ratios (IRR) from Negative Binomial Model
Exponentiated Coefficients (IRR)
95% CI
Term IRR Std. Error z-value p-value CI Lower CI Upper Significance
(Intercept) 0.0404 1.1662 -2.7507 0.0059 0.0040 0.4069 **
rainfall_mm 1.0220 0.0062 3.4800 0.0005 1.0094 1.0348 ***
temp_mean_c 1.1425 0.0398 3.3474 0.0008 1.0548 1.2384 ***
ndvi 27.8624 1.1132 2.9889 0.0028 3.1793 245.5287 **
sin(2 * pi * week_of_year/52) 1.3188 0.2166 1.2772 0.2015 0.8576 2.0305
cos(2 * pi * week_of_year/52) 1.4274 0.2529 1.4068 0.1595 0.8602 2.3650

IRR interpretation:
- IRR > 1 → predictor increases expected case count
- IRR < 1 → predictor decreases expected case count
- E.g., IRR of 1.05 for rainfall → each extra mm/week increases cases by ~5%

5.2 Coefficient Forest Plot

Code
coef_df <- tidy(final_model, exponentiate = TRUE, conf.int = TRUE) |>
  filter(term != "(Intercept)") |>
  mutate(term = recode(term,
    rainfall_mm              = "Rainfall (mm)",
    temp_mean_c              = "Temperature (°C)",
    ndvi                     = "NDVI",
    `sin(2 * pi * week_of_year/52)` = "Seasonal (sin)",
    `cos(2 * pi * week_of_year/52)` = "Seasonal (cos)"
  ),
  significant = p.value < 0.05)

ggplot(coef_df, aes(x = estimate, y = reorder(term, estimate), color = significant)) +
  geom_vline(xintercept = 1, linetype = "dashed", color = "grey50") +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high), height = 0.25, linewidth = 1) +
  geom_point(size = 4) +
  scale_color_manual(values = c("FALSE" = "grey60", "TRUE" = "#c0392b"),
                     labels = c("Non-significant", "Significant (p<0.05)")) +
  labs(title    = "Incidence Rate Ratios (IRR) — Negative Binomial Model",
       subtitle  = "Points right of dashed line → increased disease risk",
       x        = "IRR (with 95% CI)",
       y        = NULL,
       color    = NULL) +
  theme_minimal(base_size = 12) +
  theme(legend.position = "bottom", plot.title = element_text(face = "bold"))

5.3 Marginal Effects (Predicted Cases vs Each Predictor)

Code
me_rain <- ggpredict(final_model, terms = "rainfall_mm [all]")
me_temp <- ggpredict(final_model, terms = "temp_mean_c [all]")
me_ndvi <- ggpredict(final_model, terms = "ndvi [all]")

p_me1 <- plot(me_rain) +
  labs(title = "Effect of Rainfall", x = "Rainfall (mm)", y = "Predicted Cases") +
  theme_minimal()

p_me2 <- plot(me_temp) +
  labs(title = "Effect of Temperature", x = "Temperature (°C)", y = "Predicted Cases") +
  theme_minimal()

p_me3 <- plot(me_ndvi) +
  labs(title = "Effect of NDVI", x = "NDVI", y = "Predicted Cases") +
  theme_minimal()

p_me1 + p_me2 + p_me3 +
  plot_annotation(title = "Marginal Effects: Predicted Cases by Predictor",
                  theme = theme(plot.title = element_text(face = "bold")))


6. Model Diagnostics

6.1 Multicollinearity (VIF)

Code
vif_vals <- vif(final_model)

tibble(
  Predictor = names(vif_vals),
  VIF       = round(vif_vals, 3),
  Flag      = ifelse(vif_vals > 5, "⚠ High", "✓ OK")
) |>
  kable(caption = "Variance Inflation Factors (VIF < 5 is acceptable)") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Variance Inflation Factors (VIF < 5 is acceptable)
Predictor VIF Flag
rainfall_mm 10.895 ⚠ High
temp_mean_c 5.705 ⚠ High
ndvi 8.884 ⚠ High
sin(2 * pi * week_of_year/52) 10.065 ⚠ High
cos(2 * pi * week_of_year/52) 16.138 ⚠ High

6.2 DHARMa Residual Diagnostics

Code
# DHARMa creates scaled residuals suitable for count models
sim_residuals <- simulateResiduals(fittedModel = final_model, n = 1000)

par(mfrow = c(1, 2))
plot(sim_residuals)

Code
par(mfrow = c(1, 1))
Code
# Formal tests
cat("=== DHARMa Diagnostic Tests ===\n\n")
=== DHARMa Diagnostic Tests ===
Code
test_unif    <- testUniformity(sim_residuals)

Code
test_disp    <- testDispersion(sim_residuals)

Code
test_outlier <- testOutliers(sim_residuals)

Code
tibble(
  Test           = c("Uniformity (KS)", "Dispersion", "Outliers"),
  Statistic      = c(test_unif$statistic, test_disp$statistic, test_outlier$statistic),
  `p-value`      = c(test_unif$p.value, test_disp$p.value, test_outlier$p.value),
  Interpretation = c(
    ifelse(test_unif$p.value > 0.05, " Residuals uniform", "Deviation detected"),
    ifelse(test_disp$p.value > 0.05, " Dispersion OK", "Mis-specified dispersion"),
    ifelse(test_outlier$p.value > 0.05, " No outliers", "Outliers present")
  )
) |>
  kable(digits = 4, caption = "DHARMa Formal Residual Tests") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
DHARMa Formal Residual Tests
Test Statistic p-value Interpretation
Uniformity (KS) 0.0792 0.3605 Residuals uniform
Dispersion 1.0307 0.7940 Dispersion OK
Outliers 0.0000 1.0000 No outliers

6.3 Observed vs Fitted (Training Set)

Code
train_data <- train_data |>
  mutate(fitted_cases = fitted(final_model))

ggplot(train_data, aes(x = week_start)) +
  geom_col(aes(y = cases), fill = "#c0392b", alpha = 0.5, width = 5) +
  geom_line(aes(y = fitted_cases), color = "#2c3e50", linewidth = 1.2) +
  geom_ribbon(
    aes(ymin = fitted_cases * 0.8, ymax = fitted_cases * 1.2),
    alpha = 0.15, fill = "#2c3e50"
  ) +
  labs(title    = "Observed vs Fitted Cases (Training Set)",
       subtitle = "Red bars = observed | Dark line = model fit",
       x = NULL, y = "Weekly Cases") +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(face = "bold"))


7. Prediction on Test Set

7.1 Generate Predictions

Code
# Predict on test set with confidence intervals
pred_obj <- predict(final_model, newdata = test_data,
                    type = "link", se.fit = TRUE)

# Back-transform from log scale
test_data <- test_data |>
  mutate(
    pred_log    = pred_obj$fit,
    pred_se     = pred_obj$se.fit,
    pred_cases  = exp(pred_log),
    pred_lower  = exp(pred_log - 1.96 * pred_se),
    pred_upper  = exp(pred_log + 1.96 * pred_se)
  )

head(test_data |> select(week_start, cases, pred_cases, pred_lower, pred_upper), 10) |>
  kable(digits = 1, caption = "Predicted vs Actual Cases (Test Set, first 10 weeks)") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Predicted vs Actual Cases (Test Set, first 10 weeks)
week_start cases pred_cases pred_lower pred_upper
2023-08-14 6 6.6 5.0 8.9
2023-08-21 9 6.3 5.0 8.0
2023-08-28 6 14.5 10.4 20.2
2023-09-04 6 4.7 3.6 6.2
2023-09-11 3 7.1 5.5 9.1
2023-09-18 0 3.2 2.4 4.1
2023-09-25 6 2.8 2.1 3.8
2023-10-02 2 1.9 1.3 2.7
2023-10-09 3 2.1 1.5 2.9
2023-10-16 1 1.9 1.5 2.4

7.2 Prediction Plot

Code
ggplot(test_data, aes(x = week_start)) +
  geom_ribbon(aes(ymin = pred_lower, ymax = pred_upper),
              fill = "#2980b9", alpha = 0.25) +
  geom_line(aes(y = pred_cases), color = "#2980b9", linewidth = 1.4, linetype = "dashed") +
  geom_col(aes(y = cases), fill = "#c0392b", alpha = 0.7, width = 4) +
  labs(title    = "Predicted vs Actual Cases — Test Set",
       subtitle  = "Red bars = observed | Blue dashed = predicted | Shaded = 95% PI",
       x = NULL, y = "Weekly Cases") +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(face = "bold"))

7.3 Prediction Accuracy Metrics

Code
metrics <- test_data |>
  summarise(
    MAE   = mean(abs(cases - pred_cases)),
    RMSE  = sqrt(mean((cases - pred_cases)^2)),
    MAPE  = mean(abs((cases - pred_cases) / (cases + 0.1))) * 100,
    Corr  = cor(cases, pred_cases),
    Coverage_95 = mean(cases >= pred_lower & cases <= pred_upper) * 100
  )

kable(metrics, digits = 2,
      caption = "Prediction Performance Metrics (Test Set)",
      col.names = c("MAE", "RMSE", "MAPE (%)", "Pearson r", "95% PI Coverage (%)")) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Prediction Performance Metrics (Test Set)
MAE RMSE MAPE (%) Pearson r 95% PI Coverage (%)
2.49 3.15 257.56 0.41 20

8. Scenario Prediction Tool

Use this section to predict cases for specific environmental scenarios.

Code
# Define scenarios of interest
scenarios <- tibble(
  Scenario    = c("Low Risk", "Moderate Risk", "High Risk", "Extreme Risk"),
  rainfall_mm = c(10,  30,  55,  80),
  temp_mean_c = c(22,  25,  28,  32),
  ndvi        = c(0.2, 0.4, 0.6, 0.75),
  week_of_year = c(20, 30, 40, 15)   # representative weeks
)

# Predict
scen_pred <- predict(final_model, newdata = scenarios,
                     type = "link", se.fit = TRUE)

scenarios <- scenarios |>
  mutate(
    Predicted_Cases = round(exp(scen_pred$fit), 1),
    Lower_95_PI     = round(exp(scen_pred$fit - 1.96 * scen_pred$se.fit), 1),
    Upper_95_PI     = round(exp(scen_pred$fit + 1.96 * scen_pred$se.fit), 1)
  )

kable(scenarios,
      caption = "Predicted Weekly Cases Under Environmental Scenarios",
      col.names = c("Scenario", "Rainfall (mm)", "Temp (°C)", "NDVI",
                    "Week", "Predicted Cases", "Lower 95%", "Upper 95%")) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) |>
  row_spec(3:4, background = "#fde8e8")  # Highlight high risk
Predicted Weekly Cases Under Environmental Scenarios
Scenario Rainfall (mm) Temp (°C) NDVI Week Predicted Cases Lower 95% Upper 95%
Low Risk 10 22 0.20 20 1.7 0.5 6.3
Moderate Risk 30 25 0.40 30 5.3 3.2 8.6
High Risk 55 28 0.60 40 32.5 14.0 75.7
Extreme Risk 80 32 0.75 15 238.1 153.4 369.7
Code
scenarios |>
  ggplot(aes(x = fct_inorder(Scenario), y = Predicted_Cases, fill = Scenario)) +
  geom_col(alpha = 0.85) +
  geom_errorbar(aes(ymin = Lower_95_PI, ymax = Upper_95_PI), width = 0.3, linewidth = 1) +
  scale_fill_manual(values = c("#27ae60", "#f39c12", "#e74c3c", "#8e1010")) +
  labs(title    = "Predicted Weekly Cases by Risk Scenario",
       subtitle  = "Error bars = 95% Prediction Interval",
       x = "Scenario", y = "Predicted Cases") +
  theme_minimal(base_size = 13) +
  theme(legend.position = "none", plot.title = element_text(face = "bold"))


9. Model Summary & Conclusions

Code
# Final model summary
glance_df <- glance(final_model)

tibble(
  Metric = c("Model Type", "Outcome", "Predictors", "N (Training)",
             "AIC", "Log-Likelihood", "Theta (Dispersion)"),
  Value  = c(
    "Negative Binomial GLM",
    "Weekly Disease Cases",
    "Rainfall, Temperature, NDVI, Seasonal Terms",
    as.character(n_train),
    round(AIC(final_model), 2),
    round(logLik(final_model), 2),
    round(final_model$theta, 3)
  )
) |>
  kable(caption = "Final Model Summary") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Final Model Summary
Metric Value
Model Type Negative Binomial GLM
Outcome Weekly Disease Cases
Predictors Rainfall, Temperature, NDVI, Seasonal Terms
N (Training) 136
AIC 976.93
Log-Likelihood -481.46
Theta (Dispersion) 5.443
Key Takeaways
  • Over-dispersion confirmed: Variance >> Mean in raw counts justifies Negative Binomial over Poisson.
  • Environmental drivers: Rainfall, temperature, and NDVI are significant predictors of weekly case burden.
  • Seasonal patterns: Including harmonic (sin/cos) terms captures residual seasonality not explained by climate variables alone.
  • Prediction intervals: Use 95% prediction intervals for operational planning and early-warning thresholds.
  • Theta (dispersion): A lower theta = greater over-dispersion; the model accounts for this explicitly.