Overview

This report models weekly disease case counts in a specific district using Negative Binomial Regression, which is well-suited for over-dispersed count data. Environmental predictors include:

Rainfall (mm/week)
Temperature (°C — mean weekly)
NDVI (Normalised Difference Vegetation Index — proxy for vegetation density and humidity)

1. Setup: Load Libraries

Code

# Install missing packages automatically
packages <- c(
  "tidyverse", "MASS", "ggplot2", "performance",
  "broom", "lubridate", "patchwork", "corrplot",
  "GGally", "DHARMa", "car", "kableExtra",
  "modelsummary", "ggeffects", "scales"
)

installed <- rownames(installed.packages())
to_install <- setdiff(packages, installed)
if (length(to_install) > 0) install.packages(to_install, repos = "https://cloud.r-project.org")

library(tidyverse)
library(MASS)          # Negative Binomial regression (glm.nb)
library(ggplot2)
library(performance)   # Model diagnostics
library(broom)         # Tidy model outputs
library(lubridate)     # Date handling
library(patchwork)     # Combine plots
library(corrplot)      # Correlation matrix
library(GGally)        # Pairs plot
library(DHARMa)        # Residual diagnostics for GLMs
library(car)           # VIF
library(kableExtra)    # Table formatting
library(modelsummary)  # Regression tables
library(ggeffects)     # Marginal effects
library(scales)        # Axis formatting

conflicted::conflict_prefer("select", "dplyr")
conflicted::conflict_prefer("filter", "dplyr")
conflicted::conflict_prefer("recode", "dplyr")
set.seed(123)  # Reproducibility

Code

# ── Option A: Simulate realistic data ─────────────────────────────────────────
n_weeks <- 156  # 3 years of weekly data

# Create weekly time series
week_start <- seq(as.Date("2021-01-04"), by = "week", length.out = n_weeks)
week_num   <- 1:n_weeks

# Seasonal patterns (tropical region)
seasonal_rain  <- 40 + 35 * sin(2 * pi * week_num / 52 - 1.2) + rnorm(n_weeks, 0, 8)
seasonal_temp  <- 26 + 4  * sin(2 * pi * week_num / 52 - 0.5) + rnorm(n_weeks, 0, 1.2)
seasonal_ndvi  <- 0.45 + 0.18 * sin(2 * pi * week_num / 52 - 0.8) + rnorm(n_weeks, 0, 0.04)

# Clip to realistic ranges
rainfall_mm <- pmax(0, seasonal_rain)
temp_mean_c <- pmin(pmax(seasonal_temp, 18), 38)
ndvi        <- pmin(pmax(seasonal_ndvi, 0.1), 0.9)

# True log-linear relationship (Negative Binomial outcome)
log_mu <- -2.5 +
  0.015 * rainfall_mm +
  0.12  * temp_mean_c +
  3.2   * ndvi +
  0.4   * sin(2 * pi * week_num / 52)   # additional seasonal trend

# Simulate NB counts (theta = dispersion parameter)
cases <- rnegbin(n_weeks, mu = exp(log_mu), theta = 3.5)

district_data <- tibble(
  week_start   = week_start,
  year         = year(week_start),
  week_of_year = week(week_start),
  cases        = cases,
  rainfall_mm  = round(rainfall_mm, 1),
  temp_mean_c  = round(temp_mean_c, 2),
  ndvi         = round(ndvi, 4)
)

# ── Option B: Import your own data ────────────────────────────────────────────
# district_data <- read_csv("your_data.csv") |>
#   mutate(week_start = as.Date(week_start))

head(district_data, 10)

3. Exploratory Data Analysis (EDA)

3.1 Summary Statistics

Code

district_data |>
  dplyr::select(cases, rainfall_mm, temp_mean_c, ndvi) |>
  summary() |>
  kable(caption = "Summary Statistics of Key Variables") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Summary Statistics of Key Variables
cases	rainfall_mm	temp_mean_c	ndvi
Min. : 0.00	Min. : 0.00	Min. :20.01	Min. :0.1739
1st Qu.: 3.75	1st Qu.:16.10	1st Qu.:23.59	1st Qu.:0.3337
Median : 13.00	Median :40.50	Median :26.21	Median :0.4577
Mean : 28.00	Mean :39.91	Mean :26.10	Mean :0.4497
3rd Qu.: 39.25	3rd Qu.:63.25	3rd Qu.:28.72	3rd Qu.:0.5614
Max. :162.00	Max. :88.80	Max. :32.53	Max. :0.6834

3.2 Time Series of Cases and Predictors

Code

p_cases <- ggplot(district_data, aes(x = week_start, y = cases)) +
  geom_col(fill = "#c0392b", alpha = 0.75) +
  geom_smooth(color = "#7f0000", se = FALSE, linewidth = 1) +
  labs(title = "Weekly Disease Cases", x = NULL, y = "Cases") +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(face = "bold"))

p_rain <- ggplot(district_data, aes(x = week_start, y = rainfall_mm)) +
  geom_line(color = "#2980b9", linewidth = 0.8) +
  geom_area(fill = "#2980b9", alpha = 0.2) +
  labs(title = "Weekly Rainfall (mm)", x = NULL, y = "mm") +
  theme_minimal(base_size = 12)

p_temp <- ggplot(district_data, aes(x = week_start, y = temp_mean_c)) +
  geom_line(color = "#e67e22", linewidth = 0.8) +
  labs(title = "Mean Weekly Temperature (°C)", x = NULL, y = "°C") +
  theme_minimal(base_size = 12)

p_ndvi <- ggplot(district_data, aes(x = week_start, y = ndvi)) +
  geom_line(color = "#27ae60", linewidth = 0.8) +
  geom_area(fill = "#27ae60", alpha = 0.2) +
  labs(title = "NDVI (Vegetation Index)", x = NULL, y = "NDVI") +
  theme_minimal(base_size = 12)

(p_cases / p_rain / p_temp / p_ndvi) +
  plot_annotation(
    title    = "District Environmental & Epidemiological Trends",
    subtitle = "3-Year Weekly Data",
    theme    = theme(plot.title = element_text(face = "bold", size = 14))
  )

3.3 Distribution of Cases

Code

p_hist <- ggplot(district_data, aes(x = cases)) +
  geom_histogram(binwidth = 2, fill = "#c0392b", color = "white", alpha = 0.8) +
  labs(title = "Distribution of Weekly Cases",
       subtitle = sprintf("Mean = %.1f | Variance = %.1f | Var/Mean = %.1f",
                          mean(district_data$cases),
                          var(district_data$cases),
                          var(district_data$cases) / mean(district_data$cases)),
       x = "Weekly Cases", y = "Frequency") +
  theme_minimal(base_size = 12)

p_qq <- ggplot(district_data, aes(sample = cases)) +
  stat_qq(color = "#c0392b", alpha = 0.6) +
  stat_qq_line(color = "black", linetype = "dashed") +
  labs(title = "Q-Q Plot of Cases", x = "Theoretical Quantiles", y = "Sample Quantiles") +
  theme_minimal(base_size = 12)

p_hist + p_qq +
  plot_annotation(title = "Case Count Distribution — Over-dispersion Check",
                  theme = theme(plot.title = element_text(face = "bold")))

Interpretation: A Variance/Mean ratio >> 1 confirms over-dispersion, justifying the Negative Binomial model over Poisson.

3.4 Correlation Matrix

Code

cor_data <- district_data |> select(cases, rainfall_mm, temp_mean_c, ndvi)
cor_matrix <- cor(cor_data, method = "spearman")

corrplot(cor_matrix,
         method = "color",
         type   = "upper",
         addCoef.col = "black",
         tl.col  = "black",
         tl.srt  = 45,
         col     = colorRampPalette(c("#2980b9", "white", "#c0392b"))(200),
         title   = "Spearman Correlation Matrix",
         mar     = c(0, 0, 2, 0))

3.5 Scatter Plots: Cases vs Predictors

Code

p_s1 <- ggplot(district_data, aes(x = rainfall_mm, y = cases)) +
  geom_point(alpha = 0.5, color = "#2980b9") +
  geom_smooth(method = "loess", color = "#c0392b", se = TRUE) +
  labs(title = "Cases vs Rainfall", x = "Rainfall (mm)", y = "Cases") +
  theme_minimal()

p_s2 <- ggplot(district_data, aes(x = temp_mean_c, y = cases)) +
  geom_point(alpha = 0.5, color = "#e67e22") +
  geom_smooth(method = "loess", color = "#c0392b", se = TRUE) +
  labs(title = "Cases vs Temperature", x = "Temperature (°C)", y = "Cases") +
  theme_minimal()

p_s3 <- ggplot(district_data, aes(x = ndvi, y = cases)) +
  geom_point(alpha = 0.5, color = "#27ae60") +
  geom_smooth(method = "loess", color = "#c0392b", se = TRUE) +
  labs(title = "Cases vs NDVI", x = "NDVI", y = "Cases") +
  theme_minimal()

p_s1 + p_s2 + p_s3

4. Model Building

4.1 Data Splitting (Train / Test)

Code

# Use last 20 weeks as test set
n_test  <- 20
n_train <- nrow(district_data) - n_test

train_data <- district_data[1:n_train, ]
test_data  <- district_data[(n_train + 1):nrow(district_data), ]

cat(sprintf("Training weeks: %d | Test weeks: %d\n", n_train, n_test))

Training weeks: 136 | Test weeks: 20

4.2 Fit Negative Binomial Regression

Code

# ── Main model ─────────────────────────────────────────────────────────────────
nb_model <- glm.nb(
  cases ~ rainfall_mm + temp_mean_c + ndvi,
  data = train_data
)

# ── Model with seasonal term ───────────────────────────────────────────────────
nb_model_seasonal <- glm.nb(
  cases ~ rainfall_mm + temp_mean_c + ndvi +
          sin(2 * pi * week_of_year / 52) +
          cos(2 * pi * week_of_year / 52),
  data = train_data
)

# ── Null model (intercept only) for comparison ────────────────────────────────
nb_null <- glm.nb(cases ~ 1, data = train_data)

summary(nb_model_seasonal)


Call:
glm.nb(formula = cases ~ rainfall_mm + temp_mean_c + ndvi + sin(2 * 
    pi * week_of_year/52) + cos(2 * pi * week_of_year/52), data = train_data, 
    init.theta = 5.443376253, link = log)

Coefficients:
                               Estimate Std. Error z value Pr(>|z|)    
(Intercept)                   -3.207958   1.166219  -2.751 0.005946 ** 
rainfall_mm                    0.021725   0.006243   3.480 0.000501 ***
temp_mean_c                    0.133247   0.039806   3.347 0.000816 ***
ndvi                           3.327279   1.113229   2.989 0.002800 ** 
sin(2 * pi * week_of_year/52)  0.276690   0.216640   1.277 0.201537    
cos(2 * pi * week_of_year/52)  0.355841   0.252943   1.407 0.159485    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for Negative Binomial(5.4434) family taken to be 1)

    Null deviance: 834.22  on 135  degrees of freedom
Residual deviance: 142.35  on 130  degrees of freedom
AIC: 976.93

Number of Fisher Scoring iterations: 1

              Theta:  5.443 
          Std. Err.:  0.888 

 2 x log-likelihood:  -962.928

4.3 Model Comparison

Code

model_list <- list(
  "NB: Null"            = nb_null,
  "NB: Environmental"   = nb_model,
  "NB: + Seasonal Terms" = nb_model_seasonal
)

modelsummary(
  model_list,
  statistic  = "({p.value})",
  stars      = TRUE,
  fmt        = 3,
  gof_map    = c("nobs", "AIC", "BIC", "logLik"),
  title      = "Negative Binomial Model Comparison",
  notes      = "p-values in parentheses. * p<0.05, ** p<0.01, *** p<0.001"
)

Negative Binomial Model Comparison
	NB: Null	NB: Environmental	NB: + Seasonal Terms
+ p < 0.1, * p < 0.05, p < 0.01, * p < 0.001
p-values in parentheses. * p<0.05, p<0.01, * p<0.001
(Intercept)	3.454***	-4.719***	-3.208**
	(<0.001)	(<0.001)	(0.006)
rainfall_mm		0.006+	0.022***
		(0.072)	(<0.001)
temp_mean_c		0.204***	0.133***
		(<0.001)	(<0.001)
ndvi		4.046***	3.327**
		(<0.001)	(0.003)
sin(2 * pi * week_of_year/52)			0.277
			(0.202)
cos(2 * pi * week_of_year/52)			0.356
			(0.159)
Num.Obs.	136	136	136
Log.Lik.	-605.484	-488.506	-481.464

Code

aic_table <- AIC(nb_null, nb_model, nb_model_seasonal) |>
  rownames_to_column("Model") |>
  mutate(Model = c("Null", "Environmental Only", "Environmental + Seasonal"),
         Delta_AIC = AIC - min(AIC)) |>
  arrange(AIC)

kable(aic_table, digits = 2, caption = "AIC Comparison (lower = better)") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) |>
  row_spec(1, bold = TRUE, background = "#d5e8d4")

AIC Comparison (lower = better)
Model	df	AIC	Delta_AIC
Environmental + Seasonal	7	976.93	0.00
Environmental Only	5	987.01	10.08
Null	2	1214.97	238.04

Use the model with lowest AIC as your final model.

Code

# Select best model automatically
final_model <- nb_model_seasonal
cat("Selected model: NB with Environmental + Seasonal Terms\n")

Selected model: NB with Environmental + Seasonal Terms

Code

cat("Theta (dispersion):", final_model$theta, "\n")

Theta (dispersion): 5.443376

5. Model Interpretation

5.1 Coefficient Table

Code

tidy(final_model, exponentiate = TRUE, conf.int = TRUE) |>
  rename(
    Term          = term,
    IRR           = estimate,
    `Std. Error`  = std.error,
    `z-value`     = statistic,
    `p-value`     = p.value,
    `CI Lower`    = conf.low,
    `CI Upper`    = conf.high
  ) |>
  mutate(Significance = case_when(
    `p-value` < 0.001 ~ "***",
    `p-value` < 0.01  ~ "**",
    `p-value` < 0.05  ~ "*",
    `p-value` < 0.1   ~ ".",
    TRUE              ~ ""
  )) |>
  kable(digits = 4,
        caption = "Incidence Rate Ratios (IRR) from Negative Binomial Model") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) |>
  add_header_above(c(" " = 1, "Exponentiated Coefficients (IRR)" = 3,
                     "95% CI" = 2, " " = 2))

Incidence Rate Ratios (IRR) from Negative Binomial Model
	Exponentiated Coefficients (IRR)			95% CI
Term	IRR	Std. Error	z-value	p-value	CI Lower	CI Upper	Significance
(Intercept)	0.0404	1.1662	-2.7507	0.0059	0.0040	0.4069	**
rainfall_mm	1.0220	0.0062	3.4800	0.0005	1.0094	1.0348	***
temp_mean_c	1.1425	0.0398	3.3474	0.0008	1.0548	1.2384	***
ndvi	27.8624	1.1132	2.9889	0.0028	3.1793	245.5287	**
sin(2 * pi * week_of_year/52)	1.3188	0.2166	1.2772	0.2015	0.8576	2.0305
cos(2 * pi * week_of_year/52)	1.4274	0.2529	1.4068	0.1595	0.8602	2.3650

IRR interpretation:
- IRR > 1 → predictor increases expected case count
- IRR < 1 → predictor decreases expected case count
- E.g., IRR of 1.05 for rainfall → each extra mm/week increases cases by ~5%

5.2 Coefficient Forest Plot

Code

coef_df <- tidy(final_model, exponentiate = TRUE, conf.int = TRUE) |>
  filter(term != "(Intercept)") |>
  mutate(term = recode(term,
    rainfall_mm              = "Rainfall (mm)",
    temp_mean_c              = "Temperature (°C)",
    ndvi                     = "NDVI",
    `sin(2 * pi * week_of_year/52)` = "Seasonal (sin)",
    `cos(2 * pi * week_of_year/52)` = "Seasonal (cos)"
  ),
  significant = p.value < 0.05)

ggplot(coef_df, aes(x = estimate, y = reorder(term, estimate), color = significant)) +
  geom_vline(xintercept = 1, linetype = "dashed", color = "grey50") +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high), height = 0.25, linewidth = 1) +
  geom_point(size = 4) +
  scale_color_manual(values = c("FALSE" = "grey60", "TRUE" = "#c0392b"),
                     labels = c("Non-significant", "Significant (p<0.05)")) +
  labs(title    = "Incidence Rate Ratios (IRR) — Negative Binomial Model",
       subtitle  = "Points right of dashed line → increased disease risk",
       x        = "IRR (with 95% CI)",
       y        = NULL,
       color    = NULL) +
  theme_minimal(base_size = 12) +
  theme(legend.position = "bottom", plot.title = element_text(face = "bold"))

5.3 Marginal Effects (Predicted Cases vs Each Predictor)

Code

me_rain <- ggpredict(final_model, terms = "rainfall_mm [all]")
me_temp <- ggpredict(final_model, terms = "temp_mean_c [all]")
me_ndvi <- ggpredict(final_model, terms = "ndvi [all]")

p_me1 <- plot(me_rain) +
  labs(title = "Effect of Rainfall", x = "Rainfall (mm)", y = "Predicted Cases") +
  theme_minimal()

p_me2 <- plot(me_temp) +
  labs(title = "Effect of Temperature", x = "Temperature (°C)", y = "Predicted Cases") +
  theme_minimal()

p_me3 <- plot(me_ndvi) +
  labs(title = "Effect of NDVI", x = "NDVI", y = "Predicted Cases") +
  theme_minimal()

p_me1 + p_me2 + p_me3 +
  plot_annotation(title = "Marginal Effects: Predicted Cases by Predictor",
                  theme = theme(plot.title = element_text(face = "bold")))

6. Model Diagnostics

6.1 Multicollinearity (VIF)

Code

vif_vals <- vif(final_model)

tibble(
  Predictor = names(vif_vals),
  VIF       = round(vif_vals, 3),
  Flag      = ifelse(vif_vals > 5, "⚠ High", "✓ OK")
) |>
  kable(caption = "Variance Inflation Factors (VIF < 5 is acceptable)") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Variance Inflation Factors (VIF < 5 is acceptable)
Predictor	VIF	Flag
rainfall_mm	10.895	⚠ High
temp_mean_c	5.705	⚠ High
ndvi	8.884	⚠ High
sin(2 * pi * week_of_year/52)	10.065	⚠ High
cos(2 * pi * week_of_year/52)	16.138	⚠ High

6.2 DHARMa Residual Diagnostics

Code

# DHARMa creates scaled residuals suitable for count models
sim_residuals <- simulateResiduals(fittedModel = final_model, n = 1000)

par(mfrow = c(1, 2))
plot(sim_residuals)

Code

par(mfrow = c(1, 1))

Code

# Formal tests
cat("=== DHARMa Diagnostic Tests ===\n\n")

=== DHARMa Diagnostic Tests ===

Code

test_unif    <- testUniformity(sim_residuals)

Code

test_disp    <- testDispersion(sim_residuals)

Code

test_outlier <- testOutliers(sim_residuals)

Code

tibble(
  Test           = c("Uniformity (KS)", "Dispersion", "Outliers"),
  Statistic      = c(test_unif$statistic, test_disp$statistic, test_outlier$statistic),
  `p-value`      = c(test_unif$p.value, test_disp$p.value, test_outlier$p.value),
  Interpretation = c(
    ifelse(test_unif$p.value > 0.05, " Residuals uniform", "Deviation detected"),
    ifelse(test_disp$p.value > 0.05, " Dispersion OK", "Mis-specified dispersion"),
    ifelse(test_outlier$p.value > 0.05, " No outliers", "Outliers present")
  )
) |>
  kable(digits = 4, caption = "DHARMa Formal Residual Tests") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

DHARMa Formal Residual Tests
Test	Statistic	p-value	Interpretation
Uniformity (KS)	0.0792	0.3605	Residuals uniform
Dispersion	1.0307	0.7940	Dispersion OK
Outliers	0.0000	1.0000	No outliers

6.3 Observed vs Fitted (Training Set)

Code

train_data <- train_data |>
  mutate(fitted_cases = fitted(final_model))

ggplot(train_data, aes(x = week_start)) +
  geom_col(aes(y = cases), fill = "#c0392b", alpha = 0.5, width = 5) +
  geom_line(aes(y = fitted_cases), color = "#2c3e50", linewidth = 1.2) +
  geom_ribbon(
    aes(ymin = fitted_cases * 0.8, ymax = fitted_cases * 1.2),
    alpha = 0.15, fill = "#2c3e50"
  ) +
  labs(title    = "Observed vs Fitted Cases (Training Set)",
       subtitle = "Red bars = observed | Dark line = model fit",
       x = NULL, y = "Weekly Cases") +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(face = "bold"))

7. Prediction on Test Set

7.1 Generate Predictions

Code

# Predict on test set with confidence intervals
pred_obj <- predict(final_model, newdata = test_data,
                    type = "link", se.fit = TRUE)

# Back-transform from log scale
test_data <- test_data |>
  mutate(
    pred_log    = pred_obj$fit,
    pred_se     = pred_obj$se.fit,
    pred_cases  = exp(pred_log),
    pred_lower  = exp(pred_log - 1.96 * pred_se),
    pred_upper  = exp(pred_log + 1.96 * pred_se)
  )

head(test_data |> select(week_start, cases, pred_cases, pred_lower, pred_upper), 10) |>
  kable(digits = 1, caption = "Predicted vs Actual Cases (Test Set, first 10 weeks)") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Predicted vs Actual Cases (Test Set, first 10 weeks)
week_start	cases	pred_cases	pred_lower	pred_upper
2023-08-14	6	6.6	5.0	8.9
2023-08-21	9	6.3	5.0	8.0
2023-08-28	6	14.5	10.4	20.2
2023-09-04	6	4.7	3.6	6.2
2023-09-11	3	7.1	5.5	9.1
2023-09-18	0	3.2	2.4	4.1
2023-09-25	6	2.8	2.1	3.8
2023-10-02	2	1.9	1.3	2.7
2023-10-09	3	2.1	1.5	2.9
2023-10-16	1	1.9	1.5	2.4

7.2 Prediction Plot

Code

ggplot(test_data, aes(x = week_start)) +
  geom_ribbon(aes(ymin = pred_lower, ymax = pred_upper),
              fill = "#2980b9", alpha = 0.25) +
  geom_line(aes(y = pred_cases), color = "#2980b9", linewidth = 1.4, linetype = "dashed") +
  geom_col(aes(y = cases), fill = "#c0392b", alpha = 0.7, width = 4) +
  labs(title    = "Predicted vs Actual Cases — Test Set",
       subtitle  = "Red bars = observed | Blue dashed = predicted | Shaded = 95% PI",
       x = NULL, y = "Weekly Cases") +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(face = "bold"))

7.3 Prediction Accuracy Metrics

Code

metrics <- test_data |>
  summarise(
    MAE   = mean(abs(cases - pred_cases)),
    RMSE  = sqrt(mean((cases - pred_cases)^2)),
    MAPE  = mean(abs((cases - pred_cases) / (cases + 0.1))) * 100,
    Corr  = cor(cases, pred_cases),
    Coverage_95 = mean(cases >= pred_lower & cases <= pred_upper) * 100
  )

kable(metrics, digits = 2,
      caption = "Prediction Performance Metrics (Test Set)",
      col.names = c("MAE", "RMSE", "MAPE (%)", "Pearson r", "95% PI Coverage (%)")) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Prediction Performance Metrics (Test Set)
MAE	RMSE	MAPE (%)	Pearson r	95% PI Coverage (%)
2.49	3.15	257.56	0.41	20

8. Scenario Prediction Tool

Use this section to predict cases for specific environmental scenarios.

Code

# Define scenarios of interest
scenarios <- tibble(
  Scenario    = c("Low Risk", "Moderate Risk", "High Risk", "Extreme Risk"),
  rainfall_mm = c(10,  30,  55,  80),
  temp_mean_c = c(22,  25,  28,  32),
  ndvi        = c(0.2, 0.4, 0.6, 0.75),
  week_of_year = c(20, 30, 40, 15)   # representative weeks
)

# Predict
scen_pred <- predict(final_model, newdata = scenarios,
                     type = "link", se.fit = TRUE)

scenarios <- scenarios |>
  mutate(
    Predicted_Cases = round(exp(scen_pred$fit), 1),
    Lower_95_PI     = round(exp(scen_pred$fit - 1.96 * scen_pred$se.fit), 1),
    Upper_95_PI     = round(exp(scen_pred$fit + 1.96 * scen_pred$se.fit), 1)
  )

kable(scenarios,
      caption = "Predicted Weekly Cases Under Environmental Scenarios",
      col.names = c("Scenario", "Rainfall (mm)", "Temp (°C)", "NDVI",
                    "Week", "Predicted Cases", "Lower 95%", "Upper 95%")) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) |>
  row_spec(3:4, background = "#fde8e8")  # Highlight high risk

Predicted Weekly Cases Under Environmental Scenarios
Scenario	Rainfall (mm)	Temp (°C)	NDVI	Week	Predicted Cases	Lower 95%	Upper 95%
Low Risk	10	22	0.20	20	1.7	0.5	6.3
Moderate Risk	30	25	0.40	30	5.3	3.2	8.6
High Risk	55	28	0.60	40	32.5	14.0	75.7
Extreme Risk	80	32	0.75	15	238.1	153.4	369.7

Code

scenarios |>
  ggplot(aes(x = fct_inorder(Scenario), y = Predicted_Cases, fill = Scenario)) +
  geom_col(alpha = 0.85) +
  geom_errorbar(aes(ymin = Lower_95_PI, ymax = Upper_95_PI), width = 0.3, linewidth = 1) +
  scale_fill_manual(values = c("#27ae60", "#f39c12", "#e74c3c", "#8e1010")) +
  labs(title    = "Predicted Weekly Cases by Risk Scenario",
       subtitle  = "Error bars = 95% Prediction Interval",
       x = "Scenario", y = "Predicted Cases") +
  theme_minimal(base_size = 13) +
  theme(legend.position = "none", plot.title = element_text(face = "bold"))

9. Model Summary & Conclusions

Code

# Final model summary
glance_df <- glance(final_model)

tibble(
  Metric = c("Model Type", "Outcome", "Predictors", "N (Training)",
             "AIC", "Log-Likelihood", "Theta (Dispersion)"),
  Value  = c(
    "Negative Binomial GLM",
    "Weekly Disease Cases",
    "Rainfall, Temperature, NDVI, Seasonal Terms",
    as.character(n_train),
    round(AIC(final_model), 2),
    round(logLik(final_model), 2),
    round(final_model$theta, 3)
  )
) |>
  kable(caption = "Final Model Summary") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Final Model Summary
Metric	Value
Model Type	Negative Binomial GLM
Outcome	Weekly Disease Cases
Predictors	Rainfall, Temperature, NDVI, Seasonal Terms
N (Training)	136
AIC	976.93
Log-Likelihood	-481.46
Theta (Dispersion)	5.443

Key Takeaways

Over-dispersion confirmed: Variance >> Mean in raw counts justifies Negative Binomial over Poisson.
Environmental drivers: Rainfall, temperature, and NDVI are significant predictors of weekly case burden.
Seasonal patterns: Including harmonic (sin/cos) terms captures residual seasonality not explained by climate variables alone.
Prediction intervals: Use 95% prediction intervals for operational planning and early-warning thresholds.
Theta (dispersion): A lower theta = greater over-dispersion; the model accounts for this explicitly.