---
title: "Weekly Malaria Case Prediction"
subtitle: "Negative Binomial Regression with Environmental Predictors"
author: "Timothy Achala"
format:
html:
toc: true
toc-depth: 3
toc-title: "Contents"
theme: cosmo
code-fold: true
code-tools: true
fig-width: 9
fig-height: 6
embed-resources: true
df-print: paged
execute:
warning: false
message: false
echo: true
---
# Overview
This report models weekly disease case counts in a specific district using **Negative Binomial Regression**, which is well-suited for over-dispersed count data. Environmental predictors include:
- **Rainfall** (mm/week)
- **Temperature** (°C — mean weekly)
- **NDVI** (Normalised Difference Vegetation Index — proxy for vegetation density and humidity)
---
# 1. Setup: Load Libraries
```{r setup}
# Install missing packages automatically
packages <- c (
"tidyverse" , "MASS" , "ggplot2" , "performance" ,
"broom" , "lubridate" , "patchwork" , "corrplot" ,
"GGally" , "DHARMa" , "car" , "kableExtra" ,
"modelsummary" , "ggeffects" , "scales"
)
installed <- rownames (installed.packages ())
to_install <- setdiff (packages, installed)
if (length (to_install) > 0 ) install.packages (to_install, repos = "https://cloud.r-project.org" )
library (tidyverse)
library (MASS) # Negative Binomial regression (glm.nb)
library (ggplot2)
library (performance) # Model diagnostics
library (broom) # Tidy model outputs
library (lubridate) # Date handling
library (patchwork) # Combine plots
library (corrplot) # Correlation matrix
library (GGally) # Pairs plot
library (DHARMa) # Residual diagnostics for GLMs
library (car) # VIF
library (kableExtra) # Table formatting
library (modelsummary) # Regression tables
library (ggeffects) # Marginal effects
library (scales) # Axis formatting
conflicted:: conflict_prefer ("select" , "dplyr" )
conflicted:: conflict_prefer ("filter" , "dplyr" )
conflicted:: conflict_prefer ("recode" , "dplyr" )
set.seed (123 ) # Reproducibility
```
---
```{r data}
# ── Option A: Simulate realistic data ─────────────────────────────────────────
n_weeks <- 156 # 3 years of weekly data
# Create weekly time series
week_start <- seq (as.Date ("2021-01-04" ), by = "week" , length.out = n_weeks)
week_num <- 1 : n_weeks
# Seasonal patterns (tropical region)
seasonal_rain <- 40 + 35 * sin (2 * pi * week_num / 52 - 1.2 ) + rnorm (n_weeks, 0 , 8 )
seasonal_temp <- 26 + 4 * sin (2 * pi * week_num / 52 - 0.5 ) + rnorm (n_weeks, 0 , 1.2 )
seasonal_ndvi <- 0.45 + 0.18 * sin (2 * pi * week_num / 52 - 0.8 ) + rnorm (n_weeks, 0 , 0.04 )
# Clip to realistic ranges
rainfall_mm <- pmax (0 , seasonal_rain)
temp_mean_c <- pmin (pmax (seasonal_temp, 18 ), 38 )
ndvi <- pmin (pmax (seasonal_ndvi, 0.1 ), 0.9 )
# True log-linear relationship (Negative Binomial outcome)
log_mu <- - 2.5 +
0.015 * rainfall_mm +
0.12 * temp_mean_c +
3.2 * ndvi +
0.4 * sin (2 * pi * week_num / 52 ) # additional seasonal trend
# Simulate NB counts (theta = dispersion parameter)
cases <- rnegbin (n_weeks, mu = exp (log_mu), theta = 3.5 )
district_data <- tibble (
week_start = week_start,
year = year (week_start),
week_of_year = week (week_start),
cases = cases,
rainfall_mm = round (rainfall_mm, 1 ),
temp_mean_c = round (temp_mean_c, 2 ),
ndvi = round (ndvi, 4 )
)
# ── Option B: Import your own data ────────────────────────────────────────────
# district_data <- read_csv("your_data.csv") |>
# mutate(week_start = as.Date(week_start))
head (district_data, 10 )
```
---
# 3. Exploratory Data Analysis (EDA)
## 3.1 Summary Statistics
```{r summary-stats}
district_data |>
dplyr:: select (cases, rainfall_mm, temp_mean_c, ndvi) |>
summary () |>
kable (caption = "Summary Statistics of Key Variables" ) |>
kable_styling (bootstrap_options = c ("striped" , "hover" ), full_width = FALSE )
```
## 3.2 Time Series of Cases and Predictors
```{r time-series, fig.height=8}
p_cases <- ggplot (district_data, aes (x = week_start, y = cases)) +
geom_col (fill = "#c0392b" , alpha = 0.75 ) +
geom_smooth (color = "#7f0000" , se = FALSE , linewidth = 1 ) +
labs (title = "Weekly Disease Cases" , x = NULL , y = "Cases" ) +
theme_minimal (base_size = 12 ) +
theme (plot.title = element_text (face = "bold" ))
p_rain <- ggplot (district_data, aes (x = week_start, y = rainfall_mm)) +
geom_line (color = "#2980b9" , linewidth = 0.8 ) +
geom_area (fill = "#2980b9" , alpha = 0.2 ) +
labs (title = "Weekly Rainfall (mm)" , x = NULL , y = "mm" ) +
theme_minimal (base_size = 12 )
p_temp <- ggplot (district_data, aes (x = week_start, y = temp_mean_c)) +
geom_line (color = "#e67e22" , linewidth = 0.8 ) +
labs (title = "Mean Weekly Temperature (°C)" , x = NULL , y = "°C" ) +
theme_minimal (base_size = 12 )
p_ndvi <- ggplot (district_data, aes (x = week_start, y = ndvi)) +
geom_line (color = "#27ae60" , linewidth = 0.8 ) +
geom_area (fill = "#27ae60" , alpha = 0.2 ) +
labs (title = "NDVI (Vegetation Index)" , x = NULL , y = "NDVI" ) +
theme_minimal (base_size = 12 )
(p_cases / p_rain / p_temp / p_ndvi) +
plot_annotation (
title = "District Environmental & Epidemiological Trends" ,
subtitle = "3-Year Weekly Data" ,
theme = theme (plot.title = element_text (face = "bold" , size = 14 ))
)
```
## 3.3 Distribution of Cases
```{r case-distribution}
p_hist <- ggplot (district_data, aes (x = cases)) +
geom_histogram (binwidth = 2 , fill = "#c0392b" , color = "white" , alpha = 0.8 ) +
labs (title = "Distribution of Weekly Cases" ,
subtitle = sprintf ("Mean = %.1f | Variance = %.1f | Var/Mean = %.1f" ,
mean (district_data$ cases),
var (district_data$ cases),
var (district_data$ cases) / mean (district_data$ cases)),
x = "Weekly Cases" , y = "Frequency" ) +
theme_minimal (base_size = 12 )
p_qq <- ggplot (district_data, aes (sample = cases)) +
stat_qq (color = "#c0392b" , alpha = 0.6 ) +
stat_qq_line (color = "black" , linetype = "dashed" ) +
labs (title = "Q-Q Plot of Cases" , x = "Theoretical Quantiles" , y = "Sample Quantiles" ) +
theme_minimal (base_size = 12 )
p_hist + p_qq +
plot_annotation (title = "Case Count Distribution — Over-dispersion Check" ,
theme = theme (plot.title = element_text (face = "bold" )))
```
> **Interpretation:** A Variance/Mean ratio >> 1 confirms **over-dispersion**, justifying the Negative Binomial model over Poisson.
## 3.4 Correlation Matrix
```{r correlation}
cor_data <- district_data |> select (cases, rainfall_mm, temp_mean_c, ndvi)
cor_matrix <- cor (cor_data, method = "spearman" )
corrplot (cor_matrix,
method = "color" ,
type = "upper" ,
addCoef.col = "black" ,
tl.col = "black" ,
tl.srt = 45 ,
col = colorRampPalette (c ("#2980b9" , "white" , "#c0392b" ))(200 ),
title = "Spearman Correlation Matrix" ,
mar = c (0 , 0 , 2 , 0 ))
```
## 3.5 Scatter Plots: Cases vs Predictors
```{r scatter-plots, fig.height=5}
p_s1 <- ggplot (district_data, aes (x = rainfall_mm, y = cases)) +
geom_point (alpha = 0.5 , color = "#2980b9" ) +
geom_smooth (method = "loess" , color = "#c0392b" , se = TRUE ) +
labs (title = "Cases vs Rainfall" , x = "Rainfall (mm)" , y = "Cases" ) +
theme_minimal ()
p_s2 <- ggplot (district_data, aes (x = temp_mean_c, y = cases)) +
geom_point (alpha = 0.5 , color = "#e67e22" ) +
geom_smooth (method = "loess" , color = "#c0392b" , se = TRUE ) +
labs (title = "Cases vs Temperature" , x = "Temperature (°C)" , y = "Cases" ) +
theme_minimal ()
p_s3 <- ggplot (district_data, aes (x = ndvi, y = cases)) +
geom_point (alpha = 0.5 , color = "#27ae60" ) +
geom_smooth (method = "loess" , color = "#c0392b" , se = TRUE ) +
labs (title = "Cases vs NDVI" , x = "NDVI" , y = "Cases" ) +
theme_minimal ()
p_s1 + p_s2 + p_s3
```
---
# 4. Model Building
## 4.1 Data Splitting (Train / Test)
```{r split}
# Use last 20 weeks as test set
n_test <- 20
n_train <- nrow (district_data) - n_test
train_data <- district_data[1 : n_train, ]
test_data <- district_data[(n_train + 1 ): nrow (district_data), ]
cat (sprintf ("Training weeks: %d | Test weeks: %d \n " , n_train, n_test))
```
## 4.2 Fit Negative Binomial Regression
```{r model-fit}
# ── Main model ─────────────────────────────────────────────────────────────────
nb_model <- glm.nb (
cases ~ rainfall_mm + temp_mean_c + ndvi,
data = train_data
)
# ── Model with seasonal term ───────────────────────────────────────────────────
nb_model_seasonal <- glm.nb (
cases ~ rainfall_mm + temp_mean_c + ndvi +
sin (2 * pi * week_of_year / 52 ) +
cos (2 * pi * week_of_year / 52 ),
data = train_data
)
# ── Null model (intercept only) for comparison ────────────────────────────────
nb_null <- glm.nb (cases ~ 1 , data = train_data)
summary (nb_model_seasonal)
```
## 4.3 Model Comparison
```{r model-comparison}
model_list <- list (
"NB: Null" = nb_null,
"NB: Environmental" = nb_model,
"NB: + Seasonal Terms" = nb_model_seasonal
)
modelsummary (
model_list,
statistic = "({p.value})" ,
stars = TRUE ,
fmt = 3 ,
gof_map = c ("nobs" , "AIC" , "BIC" , "logLik" ),
title = "Negative Binomial Model Comparison" ,
notes = "p-values in parentheses. * p<0.05, ** p<0.01, *** p<0.001"
)
```
```{r aic-comparison}
aic_table <- AIC (nb_null, nb_model, nb_model_seasonal) |>
rownames_to_column ("Model" ) |>
mutate (Model = c ("Null" , "Environmental Only" , "Environmental + Seasonal" ),
Delta_AIC = AIC - min (AIC)) |>
arrange (AIC)
kable (aic_table, digits = 2 , caption = "AIC Comparison (lower = better)" ) |>
kable_styling (bootstrap_options = c ("striped" , "hover" ), full_width = FALSE ) |>
row_spec (1 , bold = TRUE , background = "#d5e8d4" )
```
> **Use the model with lowest AIC as your final model.**
```{r select-final}
# Select best model automatically
final_model <- nb_model_seasonal
cat ("Selected model: NB with Environmental + Seasonal Terms \n " )
cat ("Theta (dispersion):" , final_model$ theta, " \n " )
```
---
# 5. Model Interpretation
## 5.1 Coefficient Table
```{r coef-table}
tidy (final_model, exponentiate = TRUE , conf.int = TRUE ) |>
rename (
Term = term,
IRR = estimate,
` Std. Error ` = std.error,
` z-value ` = statistic,
` p-value ` = p.value,
` CI Lower ` = conf.low,
` CI Upper ` = conf.high
) |>
mutate (Significance = case_when (
` p-value ` < 0.001 ~ "***" ,
` p-value ` < 0.01 ~ "**" ,
` p-value ` < 0.05 ~ "*" ,
` p-value ` < 0.1 ~ "." ,
TRUE ~ ""
)) |>
kable (digits = 4 ,
caption = "Incidence Rate Ratios (IRR) from Negative Binomial Model" ) |>
kable_styling (bootstrap_options = c ("striped" , "hover" ), full_width = FALSE ) |>
add_header_above (c (" " = 1 , "Exponentiated Coefficients (IRR)" = 3 ,
"95% CI" = 2 , " " = 2 ))
```
> **IRR interpretation:**
> - IRR > 1 → predictor increases expected case count
> - IRR < 1 → predictor decreases expected case count
> - E.g., IRR of 1.05 for rainfall → each extra mm/week increases cases by ~5%
## 5.2 Coefficient Forest Plot
```{r forest-plot}
coef_df <- tidy (final_model, exponentiate = TRUE , conf.int = TRUE ) |>
filter (term != "(Intercept)" ) |>
mutate (term = recode (term,
rainfall_mm = "Rainfall (mm)" ,
temp_mean_c = "Temperature (°C)" ,
ndvi = "NDVI" ,
` sin(2 * pi * week_of_year/52) ` = "Seasonal (sin)" ,
` cos(2 * pi * week_of_year/52) ` = "Seasonal (cos)"
),
significant = p.value < 0.05 )
ggplot (coef_df, aes (x = estimate, y = reorder (term, estimate), color = significant)) +
geom_vline (xintercept = 1 , linetype = "dashed" , color = "grey50" ) +
geom_errorbarh (aes (xmin = conf.low, xmax = conf.high), height = 0.25 , linewidth = 1 ) +
geom_point (size = 4 ) +
scale_color_manual (values = c ("FALSE" = "grey60" , "TRUE" = "#c0392b" ),
labels = c ("Non-significant" , "Significant (p<0.05)" )) +
labs (title = "Incidence Rate Ratios (IRR) — Negative Binomial Model" ,
subtitle = "Points right of dashed line → increased disease risk" ,
x = "IRR (with 95% CI)" ,
y = NULL ,
color = NULL ) +
theme_minimal (base_size = 12 ) +
theme (legend.position = "bottom" , plot.title = element_text (face = "bold" ))
```
## 5.3 Marginal Effects (Predicted Cases vs Each Predictor)
```{r marginal-effects, fig.height=5}
me_rain <- ggpredict (final_model, terms = "rainfall_mm [all]" )
me_temp <- ggpredict (final_model, terms = "temp_mean_c [all]" )
me_ndvi <- ggpredict (final_model, terms = "ndvi [all]" )
p_me1 <- plot (me_rain) +
labs (title = "Effect of Rainfall" , x = "Rainfall (mm)" , y = "Predicted Cases" ) +
theme_minimal ()
p_me2 <- plot (me_temp) +
labs (title = "Effect of Temperature" , x = "Temperature (°C)" , y = "Predicted Cases" ) +
theme_minimal ()
p_me3 <- plot (me_ndvi) +
labs (title = "Effect of NDVI" , x = "NDVI" , y = "Predicted Cases" ) +
theme_minimal ()
p_me1 + p_me2 + p_me3 +
plot_annotation (title = "Marginal Effects: Predicted Cases by Predictor" ,
theme = theme (plot.title = element_text (face = "bold" )))
```
---
# 6. Model Diagnostics
## 6.1 Multicollinearity (VIF)
```{r vif}
vif_vals <- vif (final_model)
tibble (
Predictor = names (vif_vals),
VIF = round (vif_vals, 3 ),
Flag = ifelse (vif_vals > 5 , "⚠ High" , "✓ OK" )
) |>
kable (caption = "Variance Inflation Factors (VIF < 5 is acceptable)" ) |>
kable_styling (bootstrap_options = c ("striped" , "hover" ), full_width = FALSE )
```
## 6.2 DHARMa Residual Diagnostics
```{r dharma-diagnostics, fig.height=6}
# DHARMa creates scaled residuals suitable for count models
sim_residuals <- simulateResiduals (fittedModel = final_model, n = 1000 )
par (mfrow = c (1 , 2 ))
plot (sim_residuals)
par (mfrow = c (1 , 1 ))
```
```{r dharma-tests}
# Formal tests
cat ("=== DHARMa Diagnostic Tests === \n\n " )
test_unif <- testUniformity (sim_residuals)
test_disp <- testDispersion (sim_residuals)
test_outlier <- testOutliers (sim_residuals)
tibble (
Test = c ("Uniformity (KS)" , "Dispersion" , "Outliers" ),
Statistic = c (test_unif$ statistic, test_disp$ statistic, test_outlier$ statistic),
` p-value ` = c (test_unif$ p.value, test_disp$ p.value, test_outlier$ p.value),
Interpretation = c (
ifelse (test_unif$ p.value > 0.05 , " Residuals uniform" , "Deviation detected" ),
ifelse (test_disp$ p.value > 0.05 , " Dispersion OK" , "Mis-specified dispersion" ),
ifelse (test_outlier$ p.value > 0.05 , " No outliers" , "Outliers present" )
)
) |>
kable (digits = 4 , caption = "DHARMa Formal Residual Tests" ) |>
kable_styling (bootstrap_options = c ("striped" , "hover" ), full_width = FALSE )
```
## 6.3 Observed vs Fitted (Training Set)
```{r obs-vs-fitted}
train_data <- train_data |>
mutate (fitted_cases = fitted (final_model))
ggplot (train_data, aes (x = week_start)) +
geom_col (aes (y = cases), fill = "#c0392b" , alpha = 0.5 , width = 5 ) +
geom_line (aes (y = fitted_cases), color = "#2c3e50" , linewidth = 1.2 ) +
geom_ribbon (
aes (ymin = fitted_cases * 0.8 , ymax = fitted_cases * 1.2 ),
alpha = 0.15 , fill = "#2c3e50"
) +
labs (title = "Observed vs Fitted Cases (Training Set)" ,
subtitle = "Red bars = observed | Dark line = model fit" ,
x = NULL , y = "Weekly Cases" ) +
theme_minimal (base_size = 12 ) +
theme (plot.title = element_text (face = "bold" ))
```
---
# 7. Prediction on Test Set
## 7.1 Generate Predictions
```{r predictions}
# Predict on test set with confidence intervals
pred_obj <- predict (final_model, newdata = test_data,
type = "link" , se.fit = TRUE )
# Back-transform from log scale
test_data <- test_data |>
mutate (
pred_log = pred_obj$ fit,
pred_se = pred_obj$ se.fit,
pred_cases = exp (pred_log),
pred_lower = exp (pred_log - 1.96 * pred_se),
pred_upper = exp (pred_log + 1.96 * pred_se)
)
head (test_data |> select (week_start, cases, pred_cases, pred_lower, pred_upper), 10 ) |>
kable (digits = 1 , caption = "Predicted vs Actual Cases (Test Set, first 10 weeks)" ) |>
kable_styling (bootstrap_options = c ("striped" , "hover" ), full_width = FALSE )
```
## 7.2 Prediction Plot
```{r prediction-plot}
ggplot (test_data, aes (x = week_start)) +
geom_ribbon (aes (ymin = pred_lower, ymax = pred_upper),
fill = "#2980b9" , alpha = 0.25 ) +
geom_line (aes (y = pred_cases), color = "#2980b9" , linewidth = 1.4 , linetype = "dashed" ) +
geom_col (aes (y = cases), fill = "#c0392b" , alpha = 0.7 , width = 4 ) +
labs (title = "Predicted vs Actual Cases — Test Set" ,
subtitle = "Red bars = observed | Blue dashed = predicted | Shaded = 95% PI" ,
x = NULL , y = "Weekly Cases" ) +
theme_minimal (base_size = 12 ) +
theme (plot.title = element_text (face = "bold" ))
```
## 7.3 Prediction Accuracy Metrics
```{r accuracy}
metrics <- test_data |>
summarise (
MAE = mean (abs (cases - pred_cases)),
RMSE = sqrt (mean ((cases - pred_cases)^ 2 )),
MAPE = mean (abs ((cases - pred_cases) / (cases + 0.1 ))) * 100 ,
Corr = cor (cases, pred_cases),
Coverage_95 = mean (cases >= pred_lower & cases <= pred_upper) * 100
)
kable (metrics, digits = 2 ,
caption = "Prediction Performance Metrics (Test Set)" ,
col.names = c ("MAE" , "RMSE" , "MAPE (%)" , "Pearson r" , "95% PI Coverage (%)" )) |>
kable_styling (bootstrap_options = c ("striped" , "hover" ), full_width = FALSE )
```
---
# 8. Scenario Prediction Tool
> Use this section to predict cases for **specific environmental scenarios**.
```{r scenario-prediction}
# Define scenarios of interest
scenarios <- tibble (
Scenario = c ("Low Risk" , "Moderate Risk" , "High Risk" , "Extreme Risk" ),
rainfall_mm = c (10 , 30 , 55 , 80 ),
temp_mean_c = c (22 , 25 , 28 , 32 ),
ndvi = c (0.2 , 0.4 , 0.6 , 0.75 ),
week_of_year = c (20 , 30 , 40 , 15 ) # representative weeks
)
# Predict
scen_pred <- predict (final_model, newdata = scenarios,
type = "link" , se.fit = TRUE )
scenarios <- scenarios |>
mutate (
Predicted_Cases = round (exp (scen_pred$ fit), 1 ),
Lower_95_PI = round (exp (scen_pred$ fit - 1.96 * scen_pred$ se.fit), 1 ),
Upper_95_PI = round (exp (scen_pred$ fit + 1.96 * scen_pred$ se.fit), 1 )
)
kable (scenarios,
caption = "Predicted Weekly Cases Under Environmental Scenarios" ,
col.names = c ("Scenario" , "Rainfall (mm)" , "Temp (°C)" , "NDVI" ,
"Week" , "Predicted Cases" , "Lower 95%" , "Upper 95%" )) |>
kable_styling (bootstrap_options = c ("striped" , "hover" ), full_width = FALSE ) |>
row_spec (3 : 4 , background = "#fde8e8" ) # Highlight high risk
```
```{r scenario-plot}
scenarios |>
ggplot (aes (x = fct_inorder (Scenario), y = Predicted_Cases, fill = Scenario)) +
geom_col (alpha = 0.85 ) +
geom_errorbar (aes (ymin = Lower_95_PI, ymax = Upper_95_PI), width = 0.3 , linewidth = 1 ) +
scale_fill_manual (values = c ("#27ae60" , "#f39c12" , "#e74c3c" , "#8e1010" )) +
labs (title = "Predicted Weekly Cases by Risk Scenario" ,
subtitle = "Error bars = 95% Prediction Interval" ,
x = "Scenario" , y = "Predicted Cases" ) +
theme_minimal (base_size = 13 ) +
theme (legend.position = "none" , plot.title = element_text (face = "bold" ))
```
---
# 9. Model Summary & Conclusions
```{r conclusions}
# Final model summary
glance_df <- glance (final_model)
tibble (
Metric = c ("Model Type" , "Outcome" , "Predictors" , "N (Training)" ,
"AIC" , "Log-Likelihood" , "Theta (Dispersion)" ),
Value = c (
"Negative Binomial GLM" ,
"Weekly Disease Cases" ,
"Rainfall, Temperature, NDVI, Seasonal Terms" ,
as.character (n_train),
round (AIC (final_model), 2 ),
round (logLik (final_model), 2 ),
round (final_model$ theta, 3 )
)
) |>
kable (caption = "Final Model Summary" ) |>
kable_styling (bootstrap_options = c ("striped" , "hover" ), full_width = FALSE )
```
::: {.callout-note}
## Key Takeaways
- **Over-dispersion confirmed**: Variance >> Mean in raw counts justifies Negative Binomial over Poisson.
- **Environmental drivers**: Rainfall, temperature, and NDVI are significant predictors of weekly case burden.
- **Seasonal patterns**: Including harmonic (sin/cos) terms captures residual seasonality not explained by climate variables alone.
- **Prediction intervals**: Use 95% prediction intervals for operational planning and early-warning thresholds.
- **Theta (dispersion)**: A lower theta = greater over-dispersion; the model accounts for this explicitly.
:::