1 Introduction

This document presents a comprehensive time series analysis of dengue cases in San Juan, Puerto Rico, from 1990 to 2009. The analysis includes:

  • Exploratory data analysis
  • Stationarity testing
  • Time series decomposition
  • Multiple forecasting models
  • Model comparison and selection
  • Future predictions

2 Load Required Libraries

# Install packages if needed
# install.packages(c("tidyverse", "lubridate", "tseries", "forecast", "gridExtra", "knitr", "kableExtra"))

library(tidyverse)
library(lubridate)
library(tseries)
library(forecast)
library(ggplot2)
library(gridExtra)
library(knitr)
library(kableExtra)

3 Load and Prepare Data

# Read the data
dengue_data <- read.csv("San_Juan_Training_Data.csv")

# Convert week_start_date to Date format
dengue_data$week_start_date <- as.Date(dengue_data$week_start_date)

# Display first few rows
head(dengue_data) %>%
  kable(caption = "First 6 rows of dengue data") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
First 6 rows of dengue data
season season_week week_start_date denv1_cases denv2_cases denv3_cases denv4_cases other_positive_cases additional_cases total_cases
1990/1991 1 1990-04-30 0 0 0 0 4 0 4
1990/1991 2 1990-05-07 0 0 0 0 5 0 5
1990/1991 3 1990-05-14 0 0 0 0 4 0 4
1990/1991 4 1990-05-21 0 0 0 0 3 0 3
1990/1991 5 1990-05-28 0 0 0 0 6 0 6
1990/1991 6 1990-06-04 1 0 0 0 1 0 2
# Data structure
str(dengue_data)
## 'data.frame':    988 obs. of  10 variables:
##  $ season              : chr  "1990/1991" "1990/1991" "1990/1991" "1990/1991" ...
##  $ season_week         : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ week_start_date     : Date, format: "1990-04-30" "1990-05-07" ...
##  $ denv1_cases         : int  0 0 0 0 0 1 0 0 2 0 ...
##  $ denv2_cases         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ denv3_cases         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ denv4_cases         : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ other_positive_cases: int  4 5 4 3 6 1 4 5 8 5 ...
##  $ additional_cases    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ total_cases         : int  4 5 4 3 6 2 4 5 10 6 ...

3.1 Summary Statistics

# Summary statistics for total cases
summary_stats <- data.frame(
  Statistic = c("Min", "1st Qu", "Median", "Mean", "3rd Qu", "Max", "SD"),
  Value = c(
    min(dengue_data$total_cases),
    quantile(dengue_data$total_cases, 0.25),
    median(dengue_data$total_cases),
    mean(dengue_data$total_cases),
    quantile(dengue_data$total_cases, 0.75),
    max(dengue_data$total_cases),
    sd(dengue_data$total_cases)
  )
)

summary_stats %>%
  kable(caption = "Summary Statistics for Total Dengue Cases", digits = 2) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Summary Statistics for Total Dengue Cases
Statistic Value
Min 0.00
1st Qu 9.00
Median 19.00
Mean 33.43
3rd Qu 37.00
Max 461.00
SD 50.21

3.2 Create Time Series Object

# Create a time series object for total cases
# Weekly data with 52 weeks per year
ts_total_cases <- ts(dengue_data$total_cases, 
                     start = c(1990, 1), 
                     frequency = 52)

cat("Time series created with", length(ts_total_cases), "observations\n")
## Time series created with 988 observations
cat("Start:", start(ts_total_cases), "\n")
## Start: 1990 1
cat("End:", end(ts_total_cases), "\n")
## End: 2008 52
cat("Frequency:", frequency(ts_total_cases), "\n")
## Frequency: 52
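
One caveat: ts() with frequency = 52 treats every year as exactly 52 weeks, so the start and end labels above are index positions rather than calendar weeks (the data actually run from late April 1990 into early 2009). A minimal sketch of an alternative representation using forecast::msts, which carries the true average weekly period and which tbats() and stlf() can use directly:

# A weekly series averages about 52.18 weeks per year; msts() stores that
# non-integer seasonal period for models that can exploit it
ts_total_cases_msts <- msts(dengue_data$total_cases, seasonal.periods = 365.25 / 7)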

4 Exploratory Data Analysis

4.1 Time Series Plot

ggplot(dengue_data, aes(x = week_start_date, y = total_cases)) +
  geom_line(color = "steelblue", linewidth = 0.7) +
  geom_smooth(method = "loess", color = "red", se = TRUE, alpha = 0.2) +
  labs(
    title = "Dengue Cases Over Time (1990-2009)",
    subtitle = "Weekly reported cases in San Juan",
    x = "Date",
    y = "Total Cases"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    plot.subtitle = element_text(size = 12)
  )

Observation: The time series shows seasonal patterns with varying amplitude over the years, with notable epidemic peaks.
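
As a quick check on these epidemic peaks, the following sketch (using dplyr, already attached via tidyverse) pulls the week with the highest case count in each season:

# Week with the maximum reported cases in each season
peak_weeks <- dengue_data %>%
  group_by(season) %>%
  slice_max(total_cases, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  select(season, week_start_date, total_cases)

head(peak_weeks)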

4.2 Seasonal Patterns

ggseasonplot(ts_total_cases, year.labels = FALSE, continuous = TRUE) +
  labs(
    title = "Seasonal Plot of Dengue Cases",
    subtitle = "Pattern across weeks of the year",
    y = "Total Cases"
  ) +
  theme_minimal()

4.3 Subseries Plot

ggsubseriesplot(ts_total_cases) +
  labs(
    title = "Subseries Plot",
    subtitle = "Average cases by week of year"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 6))

4.4 Monthly Aggregation

# Aggregate by month for clearer visualization
dengue_monthly <- dengue_data %>%
  mutate(
    year = year(week_start_date),
    month = month(week_start_date)
  ) %>%
  group_by(year, month) %>%
  summarise(
    total_cases = sum(total_cases),
    .groups = "drop"
  ) %>%
  mutate(date = as.Date(paste(year, month, "01", sep = "-")))

ggplot(dengue_monthly, aes(x = date, y = total_cases)) +
  geom_line(color = "darkgreen", linewidth = 0.8) +
  geom_point(color = "darkgreen", size = 1) +
  labs(
    title = "Monthly Aggregated Dengue Cases",
    x = "Date",
    y = "Total Cases"
  ) +
  theme_minimal()

5 Time Series Decomposition

5.1 STL Decomposition

# STL Decomposition (Seasonal and Trend decomposition using Loess)
stl_decomp <- stl(ts_total_cases, s.window = "periodic")
plot(stl_decomp, main = "STL Decomposition of Dengue Cases")

Interpretation:

  • Trend: Shows long-term changes in dengue incidence
  • Seasonal: Reveals consistent within-year patterns (see the seasonally adjusted series sketched below)
  • Remainder: Captures irregular fluctuations
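
Removing the seasonal component gives the seasonally adjusted series (trend plus remainder); a minimal sketch using forecast::seasadj():

# Seasonally adjusted series: original series minus the STL seasonal component
seasadj_cases <- seasadj(stl_decomp)
autoplot(seasadj_cases) +
  labs(title = "Seasonally Adjusted Dengue Cases", y = "Total Cases") +
  theme_minimal()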

5.2 Classical Decomposition

# Classical decomposition
classical_decomp <- decompose(ts_total_cases, type = "additive")
plot(classical_decomp)

5.3 Seasonal Strength

# Calculate seasonal strength
seasonal_strength <- 1 - var(stl_decomp$time.series[, "remainder"]) / 
                         var(stl_decomp$time.series[, "seasonal"] + 
                             stl_decomp$time.series[, "remainder"])

trend_strength <- 1 - var(stl_decomp$time.series[, "remainder"]) / 
                      var(stl_decomp$time.series[, "trend"] + 
                          stl_decomp$time.series[, "remainder"])

cat("Seasonal Strength:", round(seasonal_strength, 4), "\n")
## Seasonal Strength: 0.2744
cat("Trend Strength:", round(trend_strength, 4), "\n")
## Trend Strength: 0.521

6 Stationarity Analysis

Before we can model a time series, we must understand its underlying properties. A key property is stationarity, which means the statistical properties of the series (like its mean and variance) do not change over time. Most time series models, including ARIMA, assume stationarity. In this section, we will test whether the dengue cases series is stationary using several methods:

  • Visual Inspection: Plotting the original and differenced series to look for obvious trends or changes in variance.
  • Statistical Tests: Applying formal tests such as the Augmented Dickey-Fuller (ADF), KPSS, and Phillips-Perron tests, which have opposing null hypotheses, to assess stationarity rigorously.
  • Correlation Plots: Examining the Autocorrelation Function (ACF) and Partial Autocorrelation Function (PACF) plots to see how observations are related to each other over time.

If the series is non-stationary, we will apply transformations such as differencing to stabilize it (an automated check of the suggested number of differences is sketched below).
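
forecast::ndiffs() and nsdiffs() report how many ordinary and seasonal differences their built-in tests suggest; a minimal sketch:

# Suggested number of ordinary and seasonal differences
ndiffs(ts_total_cases)   # ordinary differencing (KPSS-based by default)
nsdiffs(ts_total_cases)  # seasonal differencing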

6.1 Visual Inspection

par(mfrow = c(2, 1))
plot(ts_total_cases, main = "Original Series", ylab = "Cases", col = "blue")
plot(diff(ts_total_cases), main = "First Differenced Series", 
     ylab = "Differenced Cases", col = "darkgreen")

par(mfrow = c(1, 1))

6.2 Statistical Tests

6.2.1 Augmented Dickey-Fuller Test

adf_test <- adf.test(ts_total_cases)
print(adf_test)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_total_cases
## Dickey-Fuller = -7.0547, Lag order = 9, p-value = 0.01
## alternative hypothesis: stationary
adf_result <- data.frame(
  Test = "ADF Test",
  Statistic = adf_test$statistic,
  P_Value = adf_test$p.value,
  Conclusion = ifelse(adf_test$p.value < 0.05, 
                     "Stationary", "Non-Stationary")
)

adf_result %>%
  kable(caption = "Augmented Dickey-Fuller Test Results", digits = 4) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Augmented Dickey-Fuller Test Results
Test      Statistic   P_Value   Conclusion
ADF Test  -7.0547     0.01      Stationary

6.2.2 KPSS Test

kpss_test <- kpss.test(ts_total_cases)
print(kpss_test)
## 
##  KPSS Test for Level Stationarity
## 
## data:  ts_total_cases
## KPSS Level = 0.78673, Truncation lag parameter = 7, p-value = 0.01
kpss_result <- data.frame(
  Test = "KPSS Test",
  Statistic = kpss_test$statistic,
  P_Value = kpss_test$p.value,
  Conclusion = ifelse(kpss_test$p.value < 0.05, 
                     "Non-Stationary", "Stationary")
)

kpss_result %>%
  kable(caption = "KPSS Test Results", digits = 4) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
KPSS Test Results
Test       Statistic   P_Value   Conclusion
KPSS Test  0.7867      0.01      Non-Stationary

6.2.3 Phillips-Perron Test

pp_test <- pp.test(ts_total_cases)
print(pp_test)
## 
##  Phillips-Perron Unit Root Test
## 
## data:  ts_total_cases
## Dickey-Fuller Z(alpha) = -60.935, Truncation lag parameter = 7, p-value
## = 0.01
## alternative hypothesis: stationary
pp_result <- data.frame(
  Test = "Phillips-Perron Test",
  Statistic = pp_test$statistic,
  P_Value = pp_test$p.value,
  Conclusion = ifelse(pp_test$p.value < 0.05, 
                     "Stationary", "Non-Stationary")
)

pp_result %>%
  kable(caption = "Phillips-Perron Test Results", digits = 4) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Phillips-Perron Test Results
Test                   Statistic   P_Value   Conclusion
Phillips-Perron Test   -60.9353    0.01      Stationary

6.2.4 Summary of Stationarity Tests

stationarity_summary <- rbind(adf_result, kpss_result, pp_result)

stationarity_summary %>%
  kable(caption = "Summary of Stationarity Tests", digits = 4) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
  row_spec(which(stationarity_summary$Conclusion == "Stationary"), 
           background = "#d4edda") %>%
  row_spec(which(stationarity_summary$Conclusion == "Non-Stationary"), 
           background = "#f8d7da")
Summary of Stationarity Tests
Test                   Statistic   P_Value   Conclusion
ADF Test               -7.0547     0.01      Stationary
KPSS Test              0.7867      0.01      Non-Stationary
Phillips-Perron Test   -60.9353    0.01      Stationary

6.3 ACF and PACF Analysis

par(mfrow = c(2, 1))
acf(ts_total_cases, main = "ACF of Total Cases", lag.max = 104)
pacf(ts_total_cases, main = "PACF of Total Cases", lag.max = 104)

par(mfrow = c(1, 1))

Interpretation:

  • The ACF decays slowly, indicating strong persistence in the series (consistent with the KPSS result)
  • Pronounced seasonal peaks are visible around lags 52 and 104
  • The PACF shows significant spikes at the first few lags, suggesting low-order autoregressive structure

6.4 Differencing

# First difference
diff1_cases <- diff(ts_total_cases)

# Test stationarity after differencing
adf_diff1 <- adf.test(diff1_cases)

cat("ADF Test after first differencing:\n")
## ADF Test after first differencing:
print(adf_diff1)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff1_cases
## Dickey-Fuller = -9.5459, Lag order = 9, p-value = 0.01
## alternative hypothesis: stationary
# ACF and PACF of differenced series
par(mfrow = c(2, 1))
acf(diff1_cases, main = "ACF of First Differenced Series", lag.max = 104)
pacf(diff1_cases, main = "PACF of First Differenced Series", lag.max = 104)

par(mfrow = c(1, 1))

7 Model Fitting

With a better understanding of the data’s properties, we can now fit forecasting models. The goal is to find a model that accurately captures the underlying patterns (trend and seasonality) in the data. To evaluate performance objectively, we first split the data into a training set (the first 80% of observations) and a test set (the final 20%), fit each model on the training data, and assess forecast accuracy on the unseen test data. This section explores four time series models:

  • Auto ARIMA: An automated algorithm that selects the best ARIMA model parameters.
  • SARIMA: A manual Seasonal ARIMA model where we specify the parameters based on our exploratory analysis.
  • ETS (Exponential Smoothing): A model that forecasts by smoothing past observations, with components for error, trend, and seasonality.
  • TBATS: A complex model designed to handle intricate seasonal patterns, including multiple seasonalities.

7.1 Train-Test Split

# Split data into training and testing sets (80-20 split)
train_size <- floor(0.8 * length(ts_total_cases))
train_data <- window(ts_total_cases, end = time(ts_total_cases)[train_size])
test_data <- window(ts_total_cases, start = time(ts_total_cases)[train_size + 1])

cat("Training set size:", length(train_data), "observations\n")
## Training set size: 790 observations
cat("Test set size:", length(test_data), "observations\n")
## Test set size: 198 observations
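
Before fitting the candidate models, a seasonal naïve benchmark (each forecast equals the value observed 52 weeks earlier) provides a useful floor for test-set accuracy; a minimal sketch using the split above:

# Seasonal naive benchmark: repeat the most recent seasonal cycle
snaive_forecast <- snaive(train_data, h = length(test_data))
accuracy(snaive_forecast, test_data)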

7.2 Model 1: Auto ARIMA

auto_arima_model <- auto.arima(train_data, seasonal = TRUE, stepwise = FALSE)
summary(auto_arima_model)
## Series: train_data 
## ARIMA(0,1,2)(2,0,1)[52] 
## 
## Coefficients:
##          ma1     ma2    sar1    sar2     sma1
##       0.1716  0.0858  0.3878  0.0492  -0.3614
## s.e.  0.0363  0.0342  0.4663  0.0436   0.4671
## 
## sigma^2 = 184.8:  log likelihood = -3176.28
## AIC=6364.57   AICc=6364.68   BIC=6392.59
## 
## Training set error measures:
##                     ME     RMSE      MAE MPE MAPE      MASE         ACF1
## Training set 0.0150855 13.54193 8.162633 NaN  Inf 0.2141114 -0.003929785
# Forecast
arima_forecast <- forecast(auto_arima_model, h = length(test_data))
autoplot(arima_forecast) +
  autolayer(test_data, series = "Actual", color = "red") +
  labs(
    title = "Auto ARIMA Forecast vs Actual",
    x = "Time",
    y = "Total Cases"
  ) +
  theme_minimal()

7.3 Model 2: SARIMA

sarima_model <- Arima(train_data, 
                      order = c(1, 1, 1), 
                      seasonal = list(order = c(1, 1, 1), period = 52))
summary(sarima_model)
## Series: train_data 
## ARIMA(1,1,1)(1,1,1)[52] 
## 
## Coefficients:
##          ar1      ma1     sar1     sma1
##       0.4514  -0.3089  -0.0150  -0.9047
## s.e.  0.3187   0.3435   0.0467   0.0616
## 
## sigma^2 = 196.7:  log likelihood = -3033.9
## AIC=6077.8   AICc=6077.88   BIC=6100.81
## 
## Training set error measures:
##                       ME    RMSE      MAE MPE MAPE      MASE        ACF1
## Training set -0.03546395 13.5107 8.388034 NaN  Inf 0.2200238 0.003011365
# Forecast
sarima_forecast <- forecast(sarima_model, h = length(test_data))
autoplot(sarima_forecast) +
  autolayer(test_data, series = "Actual", color = "red") +
  labs(
    title = "SARIMA Forecast vs Actual",
    x = "Time",
    y = "Total Cases"
  ) +
  theme_minimal()

7.4 Model 3: ETS (Exponential Smoothing)

ets_model <- ets(train_data)
summary(ets_model)
## ETS(A,Ad,N) 
## 
## Call:
## ets(y = train_data)
## 
##   Smoothing parameters:
##     alpha = 0.9999 
##     beta  = 0.1524 
##     phi   = 0.8 
## 
##   Initial states:
##     l = 4.0358 
##     b = -0.7328 
## 
##   sigma:  13.6366
## 
##      AIC     AICc      BIC 
## 9406.046 9406.153 9434.078 
## 
## Training set error measures:
##                      ME     RMSE      MAE MPE MAPE      MASE       ACF1
## Training set 0.01778684 13.59337 8.075487 NaN  Inf 0.2118255 0.03668071
# Forecast
ets_forecast <- forecast(ets_model, h = length(test_data))
autoplot(ets_forecast) +
  autolayer(test_data, series = "Actual", color = "red") +
  labs(
    title = "ETS Forecast vs Actual",
    x = "Time",
    y = "Total Cases"
  ) +
  theme_minimal()
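
Note that the selected specification, ETS(A,Ad,N), contains no seasonal component: ets() does not fit seasonal models when the seasonal period exceeds 24, as it does here (52). A common workaround, sketched below, is to seasonally adjust with STL and apply ETS to the adjusted series via stlf():

# STL decomposition + ETS on the seasonally adjusted series, reseasonalised for forecasting
stl_ets_forecast <- stlf(train_data, method = "ets", h = length(test_data))
accuracy(stl_ets_forecast, test_data)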

7.5 Model 4: TBATS

tbats_model <- tbats(train_data)
summary(tbats_model)
##                   Length Class  Mode     
## lambda                0  -none- NULL     
## alpha                 1  -none- numeric  
## beta                  1  -none- numeric  
## damping.parameter     1  -none- numeric  
## gamma.one.values      1  -none- numeric  
## gamma.two.values      1  -none- numeric  
## ar.coefficients       0  -none- NULL     
## ma.coefficients       0  -none- NULL     
## likelihood            1  -none- numeric  
## optim.return.code     1  -none- numeric  
## variance              1  -none- numeric  
## AIC                   1  -none- numeric  
## parameters            2  -none- list     
## seed.states          18  -none- numeric  
## fitted.values       790  ts     numeric  
## errors              790  ts     numeric  
## x                 14220  -none- numeric  
## seasonal.periods      1  -none- numeric  
## k.vector              1  -none- numeric  
## y                   790  ts     numeric  
## p                     1  -none- numeric  
## q                     1  -none- numeric  
## call                  2  -none- call     
## series                1  -none- character
## method                1  -none- character
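
Note that summary() on a tbats object merely lists its components; printing the model reports the fitted specification (Box-Cox parameter, damping, ARMA errors, and Fourier terms):

# Print the fitted TBATS specification
print(tbats_model)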
# Forecast
tbats_forecast <- forecast(tbats_model, h = length(test_data))
autoplot(tbats_forecast) +
  autolayer(test_data, series = "Actual", color = "red") +
  labs(
    title = "TBATS Forecast vs Actual",
    x = "Time",
    y = "Total Cases"
  ) +
  theme_minimal()

8 Model Diagnostics

8.1 Residual Analysis for Auto ARIMA

checkresiduals(auto_arima_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,2)(2,0,1)[52]
## Q* = 101.92, df = 99, p-value = 0.4003
## 
## Model df: 5.   Total lags used: 104

8.2 Ljung-Box Test

lb_test <- Box.test(residuals(auto_arima_model), lag = 20, type = "Ljung-Box")
print(lb_test)
## 
##  Box-Ljung test
## 
## data:  residuals(auto_arima_model)
## X-squared = 48.636, df = 20, p-value = 0.0003461
lb_result <- data.frame(
  Test = "Ljung-Box",
  Statistic = lb_test$statistic,
  P_Value = lb_test$p.value,
  Conclusion = ifelse(lb_test$p.value > 0.05, 
                     "No autocorrelation (Good)", 
                     "Autocorrelation present")
)

lb_result %>%
  kable(caption = "Ljung-Box Test for Residual Autocorrelation", digits = 4) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Ljung-Box Test for Residual Autocorrelation
Test       Statistic   P_Value   Conclusion
Ljung-Box  48.6362     3e-04     Autocorrelation present

Note: this manual test examines only the first 20 lags (without adjusting the degrees of freedom for the 5 estimated coefficients) and detects remaining short-lag autocorrelation, whereas checkresiduals() above spreads the test over 104 lags with the adjustment and does not reject (p ≈ 0.40). Some short-lag autocorrelation therefore remains in the ARIMA residuals.

8.3 Residual Normality Test

# Using subset for Shapiro test (max 5000 observations)
residuals_subset <- residuals(auto_arima_model)
if(length(residuals_subset) > 5000) {
  residuals_subset <- sample(residuals_subset, 5000)
}
shapiro_test <- shapiro.test(residuals_subset)
print(shapiro_test)
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals_subset
## W = 0.82127, p-value < 2.2e-16
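
The Shapiro-Wilk test strongly rejects normality of the residuals (W = 0.82), so prediction intervals that assume Gaussian errors should be treated with caution. One option, sketched below, is to simulate the intervals from resampled residuals instead:

# Prediction intervals simulated from resampled residuals rather than a normal assumption
arima_forecast_boot <- forecast(auto_arima_model, h = length(test_data),
                                bootstrap = TRUE, npaths = 1000)
autoplot(arima_forecast_boot) +
  labs(title = "Auto ARIMA Forecast with Bootstrap Prediction Intervals") +
  theme_minimal()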

9 Model Comparison

# Calculate accuracy metrics
arima_accuracy <- accuracy(arima_forecast, test_data)
sarima_accuracy <- accuracy(sarima_forecast, test_data)
ets_accuracy <- accuracy(ets_forecast, test_data)
tbats_accuracy <- accuracy(tbats_forecast, test_data)

comparison_df <- data.frame(
  Model = c("Auto ARIMA", "SARIMA", "ETS", "TBATS"),
  RMSE = c(arima_accuracy[2, "RMSE"], 
           sarima_accuracy[2, "RMSE"],
           ets_accuracy[2, "RMSE"],
           tbats_accuracy[2, "RMSE"]),
  MAE = c(arima_accuracy[2, "MAE"],
          sarima_accuracy[2, "MAE"],
          ets_accuracy[2, "MAE"],
          tbats_accuracy[2, "MAE"]),
  MAPE = c(arima_accuracy[2, "MAPE"],
           sarima_accuracy[2, "MAPE"],
           ets_accuracy[2, "MAPE"],
           tbats_accuracy[2, "MAPE"]),
  AIC = c(AIC(auto_arima_model),
          AIC(sarima_model),
          AIC(ets_model),
          NA)  # AIC not extracted for TBATS here; the value is stored in tbats_model$AIC
)

# Highlight best model for each metric
comparison_df %>%
  kable(caption = "Model Comparison - Test Set Performance", digits = 2) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
  column_spec(2, background = ifelse(comparison_df$RMSE == min(comparison_df$RMSE), 
                                     "#d4edda", "white")) %>%
  column_spec(3, background = ifelse(comparison_df$MAE == min(comparison_df$MAE), 
                                     "#d4edda", "white")) %>%
  column_spec(4, background = ifelse(comparison_df$MAPE == min(comparison_df$MAPE), 
                                     "#d4edda", "white"))
Model Comparison - Test Set Performance
Model RMSE MAE MAPE AIC
Auto ARIMA 30.68 19.26 Inf 6364.57
SARIMA 27.54 21.01 Inf 6077.80
ETS 30.65 20.50 Inf 9406.05
TBATS 27.19 19.73 Inf NA

Best Model: MAPE is infinite for every model because the series contains weeks with zero cases, so RMSE and MAE are the usable test-set metrics here. TBATS achieves the lowest RMSE (27.19) and Auto ARIMA the lowest MAE (19.26), with SARIMA close behind on RMSE.

9.1 Visual Comparison

# Create comparison plot
comparison_data <- data.frame(
  Time = time(test_data),
  Actual = as.numeric(test_data),
  Auto_ARIMA = as.numeric(arima_forecast$mean),
  SARIMA = as.numeric(sarima_forecast$mean),
  ETS = as.numeric(ets_forecast$mean),
  TBATS = as.numeric(tbats_forecast$mean)
)

comparison_long <- comparison_data %>%
  pivot_longer(cols = -Time, names_to = "Model", values_to = "Cases")

ggplot(comparison_long, aes(x = Time, y = Cases, color = Model)) +
  geom_line(linewidth = 0.8) +
  scale_color_manual(values = c("Actual" = "black", 
                                 "Auto_ARIMA" = "blue",
                                 "SARIMA" = "red",
                                 "ETS" = "green",
                                 "TBATS" = "purple")) +
  labs(
    title = "Model Comparison: Forecasts vs Actual",
    x = "Time",
    y = "Total Cases"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

10 Final Forecast

10.1 Refit Best Model on Full Data

# Refit a model on the full series (Auto ARIMA with its default stepwise search, shown as an example)
best_model <- auto.arima(ts_total_cases, seasonal = TRUE)
summary(best_model)
## Series: ts_total_cases 
## ARIMA(1,1,1) 
## 
## Coefficients:
##          ar1      ma1
##       0.7202  -0.6081
## s.e.  0.0887   0.1004
## 
## sigma^2 = 175.1:  log likelihood = -3948.49
## AIC=7902.99   AICc=7903.01   BIC=7917.67
## 
## Training set error measures:
##                       ME     RMSE      MAE MPE MAPE      MASE          ACF1
## Training set 0.007622131 13.21081 7.933374 NaN  Inf 0.2182599 -0.0005099813
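
Because this refit used auto.arima() with its default stepwise search, a non-seasonal ARIMA(1,1,1) was selected, and its point forecasts flatten to a constant level almost immediately (visible in the forecast table below). To retain the seasonal structure identified earlier, one could repeat the exhaustive search from Section 7.2 or refit TBATS on the full series; a sketch of both options:

# Option 1: repeat the exhaustive seasonal search on the full series (slower)
best_seasonal_arima <- auto.arima(ts_total_cases, seasonal = TRUE, stepwise = FALSE)

# Option 2: refit TBATS (lowest test-set RMSE) on the full series
best_tbats <- tbats(ts_total_cases)
final_forecast_tbats <- forecast(best_tbats, h = 52)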

10.2 Forecast Next 52 Weeks

# Forecast next 52 weeks (1 year)
final_forecast <- forecast(best_model, h = 52)

autoplot(final_forecast) +
  labs(
    title = "Dengue Cases Forecast for Next Year",
    subtitle = paste("Model:", best_model$method),
    x = "Time",
    y = "Total Cases"
  ) +
  theme_minimal()

10.3 Forecast Table

forecast_df <- data.frame(
  Week = 1:52,
  Point_Forecast = round(as.numeric(final_forecast$mean), 0),
  Lower_80 = round(as.numeric(final_forecast$lower[, 1]), 0),
  Upper_80 = round(as.numeric(final_forecast$upper[, 1]), 0),
  Lower_95 = round(as.numeric(final_forecast$lower[, 2]), 0),
  Upper_95 = round(as.numeric(final_forecast$upper[, 2]), 0)
)

# Display first 12 weeks
head(forecast_df, 12) %>%
  kable(caption = "Forecast for Next 12 Weeks (with confidence intervals)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Forecast for Next 12 Weeks (with confidence intervals)
Week Point_Forecast Lower_80 Upper_80 Lower_95 Upper_95
1 15 -2 32 -11 41
2 15 -11 40 -24 54
3 15 -18 47 -35 64
4 15 -24 53 -45 74
5 15 -30 59 -53 83
6 15 -35 64 -62 91
7 15 -40 69 -69 98
8 15 -45 74 -76 106
9 15 -49 78 -83 112
10 15 -53 83 -89 119
11 15 -57 87 -95 125
12 15 -61 90 -101 130
# Save complete forecast
write.csv(forecast_df, "dengue_forecast.csv", row.names = FALSE)
cat("Complete forecast saved to 'dengue_forecast.csv'\n")
## Complete forecast saved to 'dengue_forecast.csv'

11 Serotype-Specific Analysis

11.1 Time Series by Serotype

# Create time series for each serotype
ts_denv1 <- ts(dengue_data$denv1_cases, start = c(1990, 1), frequency = 52)
ts_denv2 <- ts(dengue_data$denv2_cases, start = c(1990, 1), frequency = 52)
ts_denv3 <- ts(dengue_data$denv3_cases, start = c(1990, 1), frequency = 52)
ts_denv4 <- ts(dengue_data$denv4_cases, start = c(1990, 1), frequency = 52)
par(mfrow = c(4, 1), mar = c(4, 4, 2, 1))
plot(ts_denv1, main = "DENV-1 Cases", ylab = "Cases", col = "red", lwd = 1.5)
plot(ts_denv2, main = "DENV-2 Cases", ylab = "Cases", col = "blue", lwd = 1.5)
plot(ts_denv3, main = "DENV-3 Cases", ylab = "Cases", col = "green", lwd = 1.5)
plot(ts_denv4, main = "DENV-4 Cases", ylab = "Cases", col = "purple", lwd = 1.5)

par(mfrow = c(1, 1))

11.2 Serotype Proportions Over Time

serotype_data <- dengue_data %>%
  mutate(
    year = year(week_start_date),
    total_serotyped = denv1_cases + denv2_cases + denv3_cases + denv4_cases
  ) %>%
  filter(total_serotyped > 0) %>%
  group_by(year) %>%
  summarise(
    DENV1 = sum(denv1_cases) / sum(total_serotyped) * 100,
    DENV2 = sum(denv2_cases) / sum(total_serotyped) * 100,
    DENV3 = sum(denv3_cases) / sum(total_serotyped) * 100,
    DENV4 = sum(denv4_cases) / sum(total_serotyped) * 100,
    .groups = "drop"
  ) %>%
  pivot_longer(cols = starts_with("DENV"), 
               names_to = "Serotype", 
               values_to = "Percentage")

ggplot(serotype_data, aes(x = year, y = Percentage, fill = Serotype)) +
  geom_area(alpha = 0.7) +
  scale_fill_manual(values = c("DENV1" = "red", "DENV2" = "blue", 
                                "DENV3" = "green", "DENV4" = "purple")) +
  labs(
    title = "Dengue Serotype Distribution Over Time",
    subtitle = "Proportion of each serotype among typed cases",
    x = "Year",
    y = "Percentage (%)"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

12 Conclusions

12.1 Key Findings

  1. Seasonality: Strong seasonal patterns with peaks typically occurring in the latter half of the year
  2. Trend: Variable long-term trends with epidemic cycles
  3. Stationarity: The original series is non-stationary but becomes stationary after differencing
  4. Best Model: On the held-out test set, TBATS achieved the lowest RMSE and Auto ARIMA the lowest MAE; MAPE was uninformative because of zero-case weeks
  5. Forecast: The refit non-seasonal ARIMA yields a nearly flat point forecast with rapidly widening intervals; projecting the within-year pattern requires a seasonal model such as TBATS or the seasonal ARIMA from Section 7

12.2 Recommendations

  1. Public Health Planning: Prepare for seasonal peaks, especially during high-risk months
  2. Surveillance: Enhanced monitoring during predicted high-incidence periods
  3. Model Updates: Regularly update models with new data for improved accuracy
  4. Serotype Monitoring: Track serotype distribution for epidemic preparedness

12.3 Limitations

  • Model assumes historical patterns continue
  • Does not account for climate variables or vector control interventions (a sketch of how external regressors could be added follows this list)
  • Uncertainty increases with longer forecast horizons
  • Extreme events may not be well-predicted
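
If weekly climate covariates were available, they could in principle enter an ARIMA model as external regressors. A purely illustrative sketch, assuming hypothetical data frames climate_data and future_climate (not part of this dataset) with temperature and precipitation columns aligned to the weekly series:

# Hypothetical: ARIMA with external climate regressors (climate_data is NOT in this dataset)
xreg_train <- as.matrix(climate_data[, c("temperature", "precipitation")])
arimax_model <- auto.arima(ts_total_cases, xreg = xreg_train, seasonal = TRUE)

# Forecasting requires future values (or forecasts) of the regressors
xreg_future <- as.matrix(future_climate[, c("temperature", "precipitation")])
forecast(arimax_model, xreg = xreg_future, h = nrow(xreg_future))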

13 Session Information

sessionInfo()
## R version 4.5.0 (2025-04-11)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sequoia 15.6.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Puerto_Rico
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] kableExtra_1.4.0 knitr_1.50       gridExtra_2.3    forecast_8.24.0 
##  [5] tseries_0.10-58  lubridate_1.9.4  forcats_1.0.0    stringr_1.5.1   
##  [9] dplyr_1.1.4      purrr_1.0.4      readr_2.1.5      tidyr_1.3.1     
## [13] tibble_3.2.1     ggplot2_3.5.2    tidyverse_2.0.0 
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6      xfun_0.52         bslib_0.9.0       lattice_0.22-6   
##  [5] tzdb_0.5.0        quadprog_1.5-8    vctrs_0.6.5       tools_4.5.0      
##  [9] generics_0.1.3    curl_6.2.2        parallel_4.5.0    xts_0.14.1       
## [13] pkgconfig_2.0.3   Matrix_1.7-3      lifecycle_1.0.4   farver_2.1.2     
## [17] compiler_4.5.0    textshaping_1.0.0 munsell_0.5.1     htmltools_0.5.8.1
## [21] sass_0.4.10       yaml_2.3.10       pillar_1.10.2     jquerylib_0.1.4  
## [25] cachem_1.1.0      nlme_3.1-168      fracdiff_1.5-3    tidyselect_1.2.1 
## [29] digest_0.6.37     stringi_1.8.7     labeling_0.4.3    splines_4.5.0    
## [33] fastmap_1.2.0     grid_4.5.0        colorspace_2.1-1  cli_3.6.4        
## [37] magrittr_2.0.3    withr_3.0.2       scales_1.3.0      timechange_0.3.0 
## [41] TTR_0.24.4        rmarkdown_2.29    quantmod_0.4.28   nnet_7.3-20      
## [45] timeDate_4051.111 zoo_1.8-14        hms_1.1.3         urca_1.3-4       
## [49] evaluate_1.0.3    lmtest_0.9-40     viridisLite_0.4.2 mgcv_1.9-1       
## [53] rlang_1.1.6       Rcpp_1.0.14       glue_1.8.0        xml2_1.3.8       
## [57] svglite_2.2.1     rstudioapi_0.17.1 jsonlite_2.0.0    R6_2.6.1         
## [61] systemfonts_1.3.1

End of Report