1 Introduction

This document presents a comprehensive time series analysis of dengue cases in San Juan, Puerto Rico, from 1990 to 2009. The analysis includes:

  • Exploratory data analysis
  • Stationarity testing
  • Time series decomposition
  • Multiple forecasting models
  • Model comparison and selection
  • Future predictions

2 Load Required Libraries

# Install packages if needed
# install.packages(c("tidyverse", "lubridate", "tseries", "forecast", "gridExtra", "knitr", "kableExtra"))

library(tidyverse)
library(lubridate)
library(tseries)
library(forecast)
library(ggplot2)
library(gridExtra)
library(knitr)
library(kableExtra)

3 Load and Prepare Data

# Read the data
dengue_data <- read.csv("San_Juan_Training_Data.csv")

# Convert week_start_date to Date format
dengue_data$week_start_date <- as.Date(dengue_data$week_start_date)

# Display first few rows
head(dengue_data) %>%
  kable(caption = "First 6 rows of dengue data") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
First 6 rows of dengue data
season season_week week_start_date denv1_cases denv2_cases denv3_cases denv4_cases other_positive_cases additional_cases total_cases
1990/1991 1 1990-04-30 0 0 0 0 4 0 4
1990/1991 2 1990-05-07 0 0 0 0 5 0 5
1990/1991 3 1990-05-14 0 0 0 0 4 0 4
1990/1991 4 1990-05-21 0 0 0 0 3 0 3
1990/1991 5 1990-05-28 0 0 0 0 6 0 6
1990/1991 6 1990-06-04 1 0 0 0 1 0 2
# Data structure
str(dengue_data)
## 'data.frame':    988 obs. of  10 variables:
##  $ season              : chr  "1990/1991" "1990/1991" "1990/1991" "1990/1991" ...
##  $ season_week         : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ week_start_date     : Date, format: "1990-04-30" "1990-05-07" ...
##  $ denv1_cases         : int  0 0 0 0 0 1 0 0 2 0 ...
##  $ denv2_cases         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ denv3_cases         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ denv4_cases         : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ other_positive_cases: int  4 5 4 3 6 1 4 5 8 5 ...
##  $ additional_cases    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ total_cases         : int  4 5 4 3 6 2 4 5 10 6 ...

3.1 Summary Statistics

# Summary statistics for total cases
summary_stats <- data.frame(
  Statistic = c("Min", "1st Qu", "Median", "Mean", "3rd Qu", "Max", "SD"),
  Value = c(
    min(dengue_data$total_cases),
    quantile(dengue_data$total_cases, 0.25),
    median(dengue_data$total_cases),
    mean(dengue_data$total_cases),
    quantile(dengue_data$total_cases, 0.75),
    max(dengue_data$total_cases),
    sd(dengue_data$total_cases)
  )
)

summary_stats %>%
  kable(caption = "Summary Statistics for Total Dengue Cases", digits = 2) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Summary Statistics for Total Dengue Cases
Statistic Value
Min 0.00
1st Qu 9.00
Median 19.00
Mean 33.43
3rd Qu 37.00
Max 461.00
SD 50.21

3.2 Create Time Series Object

# Create a time series object for total cases
# Weekly data with 52 weeks per year
ts_total_cases <- ts(dengue_data$total_cases, 
                     start = c(1990, 1), 
                     frequency = 52)

cat("Time series created with", length(ts_total_cases), "observations\n")
## Time series created with 988 observations
cat("Start:", start(ts_total_cases), "\n")
## Start: 1990 1
cat("End:", end(ts_total_cases), "\n")
## End: 2008 52
cat("Frequency:", frequency(ts_total_cases), "\n")
## Frequency: 52
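
One caveat: ts() with frequency = 52 treats every year as exactly 52 weeks, so the start and end labels above are index positions rather than calendar weeks (the data actually run from late April 1990 into early 2009). A minimal sketch of an alternative representation using forecast::msts, which carries the true average weekly period and which tbats() and stlf() can use directly:

# A weekly series averages about 52.18 weeks per year; msts() stores that
# non-integer seasonal period for models that can exploit it
ts_total_cases_msts <- msts(dengue_data$total_cases, seasonal.periods = 365.25 / 7)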

4 Exploratory Data Analysis

4.1 Time Series Plot

ggplot(dengue_data, aes(x = week_start_date, y = total_cases)) +
  geom_line(color = "steelblue", linewidth = 0.7) +
  geom_smooth(method = "loess", color = "red", se = TRUE, alpha = 0.2) +
  labs(
    title = "Dengue Cases Over Time (1990-2009)",
    subtitle = "Weekly reported cases in San Juan",
    x = "Date",
    y = "Total Cases"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    plot.subtitle = element_text(size = 12)
  )

Observation: The time series shows seasonal patterns with varying amplitude over the years, with notable epidemic peaks.
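
As a quick check on these epidemic peaks, the following sketch (using dplyr, already attached via tidyverse) pulls the week with the highest case count in each season:

# Week with the maximum reported cases in each season
peak_weeks <- dengue_data %>%
  group_by(season) %>%
  slice_max(total_cases, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  select(season, week_start_date, total_cases)

head(peak_weeks)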

4.2 Seasonal Patterns

ggseasonplot(ts_total_cases, year.labels = FALSE, continuous = TRUE) +
  labs(
    title = "Seasonal Plot of Dengue Cases",
    subtitle = "Pattern across weeks of the year",
    y = "Total Cases"
  ) +
  theme_minimal()

4.3 Subseries Plot

ggsubseriesplot(ts_total_cases) +
  labs(
    title = "Subseries Plot",
    subtitle = "Average cases by week of year"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 6))

4.4 Monthly Aggregation

# Aggregate by month for clearer visualization
dengue_monthly <- dengue_data %>%
  mutate(
    year = year(week_start_date),
    month = month(week_start_date)
  ) %>%
  group_by(year, month) %>%
  summarise(
    total_cases = sum(total_cases),
    .groups = "drop"
  ) %>%
  mutate(date = as.Date(paste(year, month, "01", sep = "-")))

ggplot(dengue_monthly, aes(x = date, y = total_cases)) +
  geom_line(color = "darkgreen", linewidth = 0.8) +
  geom_point(color = "darkgreen", size = 1) +
  labs(
    title = "Monthly Aggregated Dengue Cases",
    x = "Date",
    y = "Total Cases"
  ) +
  theme_minimal()

5 Time Series Decomposition

5.1 STL Decomposition

# STL Decomposition (Seasonal and Trend decomposition using Loess)
stl_decomp <- stl(ts_total_cases, s.window = "periodic")
plot(stl_decomp, main = "STL Decomposition of Dengue Cases")

Interpretation:

  • Trend: Shows long-term changes in dengue incidence
  • Seasonal: Reveals consistent within-year patterns (see the seasonally adjusted series sketched below)
  • Remainder: Captures irregular fluctuations
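
Removing the seasonal component gives the seasonally adjusted series (trend plus remainder); a minimal sketch using forecast::seasadj():

# Seasonally adjusted series: original series minus the STL seasonal component
seasadj_cases <- seasadj(stl_decomp)
autoplot(seasadj_cases) +
  labs(title = "Seasonally Adjusted Dengue Cases", y = "Total Cases") +
  theme_minimal()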

5.2 Classical Decomposition

# Classical decomposition
classical_decomp <- decompose(ts_total_cases, type = "additive")
plot(classical_decomp)

5.3 Seasonal Strength

# Calculate seasonal strength
seasonal_strength <- 1 - var(stl_decomp$time.series[, "remainder"]) / 
                         var(stl_decomp$time.series[, "seasonal"] + 
                             stl_decomp$time.series[, "remainder"])

trend_strength <- 1 - var(stl_decomp$time.series[, "remainder"]) / 
                      var(stl_decomp$time.series[, "trend"] + 
                          stl_decomp$time.series[, "remainder"])

cat("Seasonal Strength:", round(seasonal_strength, 4), "\n")
## Seasonal Strength: 0.2744
cat("Trend Strength:", round(trend_strength, 4), "\n")
## Trend Strength: 0.521

6 Stationarity Analysis

Before we can model a time series, we must understand its underlying properties. A key property is stationarity, which means the statistical properties of the series (like its mean and variance) do not change over time. Most time series models, including ARIMA, assume stationarity. In this section, we will test whether the dengue cases series is stationary using several methods:

  • Visual Inspection: Plotting the original and differenced series to look for obvious trends or changes in variance.
  • Statistical Tests: Applying formal tests such as the Augmented Dickey-Fuller (ADF), KPSS, and Phillips-Perron tests, which have opposing null hypotheses, to assess stationarity rigorously.
  • Correlation Plots: Examining the Autocorrelation Function (ACF) and Partial Autocorrelation Function (PACF) plots to see how observations are related to each other over time.

If the series is non-stationary, we will apply transformations such as differencing to stabilize it (an automated check of the suggested number of differences is sketched below).
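
forecast::ndiffs() and nsdiffs() report how many ordinary and seasonal differences their built-in tests suggest; a minimal sketch:

# Suggested number of ordinary and seasonal differences
ndiffs(ts_total_cases)   # ordinary differencing (KPSS-based by default)
nsdiffs(ts_total_cases)  # seasonal differencing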

6.1 Visual Inspection

par(mfrow = c(2, 1))
plot(ts_total_cases, main = "Original Series", ylab = "Cases", col = "blue")
plot(diff(ts_total_cases), main = "First Differenced Series", 
     ylab = "Differenced Cases", col = "darkgreen")

par(mfrow = c(1, 1))

6.2 Statistical Tests

6.2.1 Augmented Dickey-Fuller Test

adf_test <- adf.test(ts_total_cases)
print(adf_test)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_total_cases
## Dickey-Fuller = -7.0547, Lag order = 9, p-value = 0.01
## alternative hypothesis: stationary
adf_result <- data.frame(
  Test = "ADF Test",
  Statistic = adf_test$statistic,
  P_Value = adf_test$p.value,
  Conclusion = ifelse(adf_test$p.value < 0.05, 
                     "Stationary", "Non-Stationary")
)

adf_result %>%
  kable(caption = "Augmented Dickey-Fuller Test Results", digits = 4) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Augmented Dickey-Fuller Test Results
Test      Statistic   P_Value   Conclusion
ADF Test  -7.0547     0.01      Stationary

6.2.2 KPSS Test

kpss_test <- kpss.test(ts_total_cases)
print(kpss_test)
## 
##  KPSS Test for Level Stationarity
## 
## data:  ts_total_cases
## KPSS Level = 0.78673, Truncation lag parameter = 7, p-value = 0.01
kpss_result <- data.frame(
  Test = "KPSS Test",
  Statistic = kpss_test$statistic,
  P_Value = kpss_test$p.value,
  Conclusion = ifelse(kpss_test$p.value < 0.05, 
                     "Non-Stationary", "Stationary")
)

kpss_result %>%
  kable(caption = "KPSS Test Results", digits = 4) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
KPSS Test Results
Test       Statistic   P_Value   Conclusion
KPSS Test  0.7867      0.01      Non-Stationary

6.2.3 Phillips-Perron Test

pp_test <- pp.test(ts_total_cases)
print(pp_test)
## 
##  Phillips-Perron Unit Root Test
## 
## data:  ts_total_cases
## Dickey-Fuller Z(alpha) = -60.935, Truncation lag parameter = 7, p-value
## = 0.01
## alternative hypothesis: stationary
pp_result <- data.frame(
  Test = "Phillips-Perron Test",
  Statistic = pp_test$statistic,
  P_Value = pp_test$p.value,
  Conclusion = ifelse(pp_test$p.value < 0.05, 
                     "Stationary", "Non-Stationary")
)

pp_result %>%
  kable(caption = "Phillips-Perron Test Results", digits = 4) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Phillips-Perron Test Results
Test                   Statistic   P_Value   Conclusion
Phillips-Perron Test   -60.9353    0.01      Stationary

6.2.4 Summary of Stationarity Tests

stationarity_summary <- rbind(adf_result, kpss_result, pp_result)

stationarity_summary %>%
  kable(caption = "Summary of Stationarity Tests", digits = 4) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
  row_spec(which(stationarity_summary$Conclusion == "Stationary"), 
           background = "#d4edda") %>%
  row_spec(which(stationarity_summary$Conclusion == "Non-Stationary"), 
           background = "#f8d7da")
Summary of Stationarity Tests
Test                   Statistic   P_Value   Conclusion
ADF Test               -7.0547     0.01      Stationary
KPSS Test              0.7867      0.01      Non-Stationary
Phillips-Perron Test   -60.9353    0.01      Stationary

6.3 ACF and PACF Analysis

par(mfrow = c(2, 1))
acf(ts_total_cases, main = "ACF of Total Cases", lag.max = 104)
pacf(ts_total_cases, main = "PACF of Total Cases", lag.max = 104)

par(mfrow = c(1, 1))

Interpretation:

  • The ACF decays slowly, indicating strong persistence in the series (consistent with the KPSS result)
  • Pronounced seasonal peaks are visible around lags 52 and 104
  • The PACF shows significant spikes at the first few lags, suggesting low-order autoregressive structure

6.4 Differencing

# First difference
diff1_cases <- diff(ts_total_cases)

# Test stationarity after differencing
adf_diff1 <- adf.test(diff1_cases)

cat("ADF Test after first differencing:\n")
## ADF Test after first differencing:
print(adf_diff1)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff1_cases
## Dickey-Fuller = -9.5459, Lag order = 9, p-value = 0.01
## alternative hypothesis: stationary
# ACF and PACF of differenced series
par(mfrow = c(2, 1))
acf(diff1_cases, main = "ACF of First Differenced Series", lag.max = 104)
pacf(diff1_cases, main = "PACF of First Differenced Series", lag.max = 104)

par(mfrow = c(1, 1))

7 Model Fitting

With a better understanding of the data’s properties, we can now fit forecasting models. The goal is to find a model that accurately captures the underlying patterns (trend and seasonality) in the data. To evaluate performance objectively, we first split the data into a training set (the first 80% of observations) and a test set (the final 20%), fit each model on the training data, and assess forecast accuracy on the unseen test data. This section explores four time series models:

  • Auto ARIMA: An automated algorithm that selects the best ARIMA model parameters.
  • SARIMA: A manual Seasonal ARIMA model where we specify the parameters based on our exploratory analysis.
  • ETS (Exponential Smoothing): A model that forecasts by smoothing past observations, with components for error, trend, and seasonality.
  • TBATS: A complex model designed to handle intricate seasonal patterns, including multiple seasonalities.

7.1 Train-Test Split

# Split data into training and testing sets (80-20 split)
train_size <- floor(0.8 * length(ts_total_cases))
train_data <- window(ts_total_cases, end = time(ts_total_cases)[train_size])
test_data <- window(ts_total_cases, start = time(ts_total_cases)[train_size + 1])

cat("Training set size:", length(train_data), "observations\n")
## Training set size: 790 observations
cat("Test set size:", length(test_data), "observations\n")
## Test set size: 198 observations
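
Before fitting the candidate models, a seasonal naïve benchmark (each forecast equals the value observed 52 weeks earlier) provides a useful floor for test-set accuracy; a minimal sketch using the split above:

# Seasonal naive benchmark: repeat the most recent seasonal cycle
snaive_forecast <- snaive(train_data, h = length(test_data))
accuracy(snaive_forecast, test_data)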

7.2 Model 1: Auto ARIMA

auto_arima_model <- auto.arima(train_data, seasonal = TRUE, stepwise = FALSE)
summary(auto_arima_model)
## Series: train_data 
## ARIMA(0,1,2)(2,0,1)[52] 
## 
## Coefficients:
##          ma1     ma2    sar1    sar2     sma1
##       0.1716  0.0858  0.3878  0.0492  -0.3614
## s.e.  0.0363  0.0342  0.4663  0.0436   0.4671
## 
## sigma^2 = 184.8:  log likelihood = -3176.28
## AIC=6364.57   AICc=6364.68   BIC=6392.59
## 
## Training set error measures:
##                     ME     RMSE      MAE MPE MAPE      MASE         ACF1
## Training set 0.0150855 13.54193 8.162633 NaN  Inf 0.2141114 -0.003929785
# Forecast
arima_forecast <- forecast(auto_arima_model, h = length(test_data))
autoplot(arima_forecast) +
  autolayer(test_data, series = "Actual", color = "red") +
  labs(
    title = "Auto ARIMA Forecast vs Actual",
    x = "Time",
    y = "Total Cases"
  ) +
  theme_minimal()

7.3 Model 2: SARIMA

sarima_model <- Arima(train_data, 
                      order = c(1, 1, 1), 
                      seasonal = list(order = c(1, 1, 1), period = 52))
summary(sarima_model)
## Series: train_data 
## ARIMA(1,1,1)(1,1,1)[52] 
## 
## Coefficients:
##          ar1      ma1     sar1     sma1
##       0.4514  -0.3089  -0.0150  -0.9047
## s.e.  0.3187   0.3435   0.0467   0.0616
## 
## sigma^2 = 196.7:  log likelihood = -3033.9
## AIC=6077.8   AICc=6077.88   BIC=6100.81
## 
## Training set error measures:
##                       ME    RMSE      MAE MPE MAPE      MASE        ACF1
## Training set -0.03546395 13.5107 8.388034 NaN  Inf 0.2200238 0.003011365
# Forecast
sarima_forecast <- forecast(sarima_model, h = length(test_data))
autoplot(sarima_forecast) +
  autolayer(test_data, series = "Actual", color = "red") +
  labs(
    title = "SARIMA Forecast vs Actual",
    x = "Time",
    y = "Total Cases"
  ) +
  theme_minimal()

7.4 Model 3: ETS (Exponential Smoothing)

ets_model <- ets(train_data)
summary(ets_model)
## ETS(A,Ad,N) 
## 
## Call:
## ets(y = train_data)
## 
##   Smoothing parameters:
##     alpha = 0.9999 
##     beta  = 0.1524 
##     phi   = 0.8 
## 
##   Initial states:
##     l = 4.0358 
##     b = -0.7328 
## 
##   sigma:  13.6366
## 
##      AIC     AICc      BIC 
## 9406.046 9406.153 9434.078 
## 
## Training set error measures:
##                      ME     RMSE      MAE MPE MAPE      MASE       ACF1
## Training set 0.01778684 13.59337 8.075487 NaN  Inf 0.2118255 0.03668071
# Forecast
ets_forecast <- forecast(ets_model, h = length(test_data))
autoplot(ets_forecast) +
  autolayer(test_data, series = "Actual", color = "red") +
  labs(
    title = "ETS Forecast vs Actual",
    x = "Time",
    y = "Total Cases"
  ) +
  theme_minimal()
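
Note that the selected specification, ETS(A,Ad,N), contains no seasonal component: ets() does not fit seasonal models when the seasonal period exceeds 24, as it does here (52). A common workaround, sketched below, is to seasonally adjust with STL and apply ETS to the adjusted series via stlf():

# STL decomposition + ETS on the seasonally adjusted series, reseasonalised for forecasting
stl_ets_forecast <- stlf(train_data, method = "ets", h = length(test_data))
accuracy(stl_ets_forecast, test_data)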

7.5 Model 4: TBATS

tbats_model <- tbats(train_data)
summary(tbats_model)
##                   Length Class  Mode     
## lambda                0  -none- NULL     
## alpha                 1  -none- numeric  
## beta                  1  -none- numeric  
## damping.parameter     1  -none- numeric  
## gamma.one.values      1  -none- numeric  
## gamma.two.values      1  -none- numeric  
## ar.coefficients       0  -none- NULL     
## ma.coefficients       0  -none- NULL     
## likelihood            1  -none- numeric  
## optim.return.code     1  -none- numeric  
## variance              1  -none- numeric  
## AIC                   1  -none- numeric  
## parameters            2  -none- list     
## seed.states          18  -none- numeric  
## fitted.values       790  ts     numeric  
## errors              790  ts     numeric  
## x                 14220  -none- numeric  
## seasonal.periods      1  -none- numeric  
## k.vector              1  -none- numeric  
## y                   790  ts     numeric  
## p                     1  -none- numeric  
## q                     1  -none- numeric  
## call                  2  -none- call     
## series                1  -none- character
## method                1  -none- character
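
Note that summary() on a tbats object merely lists its components; printing the model reports the fitted specification (Box-Cox parameter, damping, ARMA errors, and Fourier terms):

# Print the fitted TBATS specification
print(tbats_model)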
# Forecast
tbats_forecast <- forecast(tbats_model, h = length(test_data))
autoplot(tbats_forecast) +
  autolayer(test_data, series = "Actual", color = "red") +
  labs(
    title = "TBATS Forecast vs Actual",
    x = "Time",
    y = "Total Cases"
  ) +
  theme_minimal()

8 Model Diagnostics

8.1 Residual Analysis for Auto ARIMA

checkresiduals(auto_arima_model)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,2)(2,0,1)[52]
## Q* = 101.92, df = 99, p-value = 0.4003
## 
## Model df: 5.   Total lags used: 104

8.2 Ljung-Box Test

lb_test <- Box.test(residuals(auto_arima_model), lag = 20, type = "Ljung-Box")
print(lb_test)
## 
##  Box-Ljung test
## 
## data:  residuals(auto_arima_model)
## X-squared = 48.636, df = 20, p-value = 0.0003461
lb_result <- data.frame(
  Test = "Ljung-Box",
  Statistic = lb_test$statistic,
  P_Value = lb_test$p.value,
  Conclusion = ifelse(lb_test$p.value > 0.05, 
                     "No autocorrelation (Good)", 
                     "Autocorrelation present")
)

lb_result %>%
  kable(caption = "Ljung-Box Test for Residual Autocorrelation", digits = 4) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Ljung-Box Test for Residual Autocorrelation
Test       Statistic   P_Value   Conclusion
Ljung-Box  48.6362     3e-04     Autocorrelation present

Note: this manual test examines only the first 20 lags (without adjusting the degrees of freedom for the 5 estimated coefficients) and detects remaining short-lag autocorrelation, whereas checkresiduals() above spreads the test over 104 lags with the adjustment and does not reject (p ≈ 0.40). Some short-lag autocorrelation therefore remains in the ARIMA residuals.

8.3 Residual Normality Test

# Using subset for Shapiro test (max 5000 observations)
residuals_subset <- residuals(auto_arima_model)
if(length(residuals_subset) > 5000) {
  residuals_subset <- sample(residuals_subset, 5000)
}
shapiro_test <- shapiro.test(residuals_subset)
print(shapiro_test)
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals_subset
## W = 0.82127, p-value < 2.2e-16
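
The Shapiro-Wilk test strongly rejects normality of the residuals (W = 0.82), so prediction intervals that assume Gaussian errors should be treated with caution. One option, sketched below, is to simulate the intervals from resampled residuals instead:

# Prediction intervals simulated from resampled residuals rather than a normal assumption
arima_forecast_boot <- forecast(auto_arima_model, h = length(test_data),
                                bootstrap = TRUE, npaths = 1000)
autoplot(arima_forecast_boot) +
  labs(title = "Auto ARIMA Forecast with Bootstrap Prediction Intervals") +
  theme_minimal()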

9 Model Comparison

# Calculate accuracy metrics
arima_accuracy <- accuracy(arima_forecast, test_data)
sarima_accuracy <- accuracy(sarima_forecast, test_data)
ets_accuracy <- accuracy(ets_forecast, test_data)
tbats_accuracy <- accuracy(tbats_forecast, test_data)

comparison_df <- data.frame(
  Model = c("Auto ARIMA", "SARIMA", "ETS", "TBATS"),
  RMSE = c(arima_accuracy[2, "RMSE"], 
           sarima_accuracy[2, "RMSE"],
           ets_accuracy[2, "RMSE"],
           tbats_accuracy[2, "RMSE"]),
  MAE = c(arima_accuracy[2, "MAE"],
          sarima_accuracy[2, "MAE"],
          ets_accuracy[2, "MAE"],
          tbats_accuracy[2, "MAE"]),
  MAPE = c(arima_accuracy[2, "MAPE"],
           sarima_accuracy[2, "MAPE"],
           ets_accuracy[2, "MAPE"],
           tbats_accuracy[2, "MAPE"]),
  AIC = c(AIC(auto_arima_model),
          AIC(sarima_model),
          AIC(ets_model),
          NA)  # AIC not extracted for TBATS here; the value is stored in tbats_model$AIC
)

# Highlight best model for each metric
comparison_df %>%
  kable(caption = "Model Comparison - Test Set Performance", digits = 2) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
  column_spec(2, background = ifelse(comparison_df$RMSE == min(comparison_df$RMSE), 
                                     "#d4edda", "white")) %>%
  column_spec(3, background = ifelse(comparison_df$MAE == min(comparison_df$MAE), 
                                     "#d4edda", "white")) %>%
  column_spec(4, background = ifelse(comparison_df$MAPE == min(comparison_df$MAPE), 
                                     "#d4edda", "white"))
Model Comparison - Test Set Performance
Model RMSE MAE MAPE AIC
Auto ARIMA 30.68 19.26 Inf 6364.57
SARIMA 27.54 21.01 Inf 6077.80
ETS 30.65 20.50 Inf 9406.05
TBATS 27.19 19.73 Inf NA

Best Model: MAPE is infinite for every model because the series contains weeks with zero cases, so RMSE and MAE are the usable test-set metrics here. TBATS achieves the lowest RMSE (27.19) and Auto ARIMA the lowest MAE (19.26), with SARIMA close behind on RMSE.

9.1 Visual Comparison

# Create comparison plot
comparison_data <- data.frame(
  Time = time(test_data),
  Actual = as.numeric(test_data),
  Auto_ARIMA = as.numeric(arima_forecast$mean),
  SARIMA = as.numeric(sarima_forecast$mean),
  ETS = as.numeric(ets_forecast$mean),
  TBATS = as.numeric(tbats_forecast$mean)
)

comparison_long <- comparison_data %>%
  pivot_longer(cols = -Time, names_to = "Model", values_to = "Cases")

ggplot(comparison_long, aes(x = Time, y = Cases, color = Model)) +
  geom_line(linewidth = 0.8) +
  scale_color_manual(values = c("Actual" = "black", 
                                 "Auto_ARIMA" = "blue",
                                 "SARIMA" = "red",
                                 "ETS" = "green",
                                 "TBATS" = "purple")) +
  labs(
    title = "Model Comparison: Forecasts vs Actual",
    x = "Time",
    y = "Total Cases"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

10 Final Forecast

10.1 Refit Best Model on Full Data

# Refit a model on the full series (Auto ARIMA with its default stepwise search, shown as an example)
best_model <- auto.arima(ts_total_cases, seasonal = TRUE)
summary(best_model)
## Series: ts_total_cases 
## ARIMA(1,1,1) 
## 
## Coefficients:
##          ar1      ma1
##       0.7202  -0.6081
## s.e.  0.0887   0.1004
## 
## sigma^2 = 175.1:  log likelihood = -3948.49
## AIC=7902.99   AICc=7903.01   BIC=7917.67
## 
## Training set error measures:
##                       ME     RMSE      MAE MPE MAPE      MASE          ACF1
## Training set 0.007622131 13.21081 7.933374 NaN  Inf 0.2182599 -0.0005099813
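
Because this refit used auto.arima() with its default stepwise search, a non-seasonal ARIMA(1,1,1) was selected, and its point forecasts flatten to a constant level almost immediately (visible in the forecast table below). To retain the seasonal structure identified earlier, one could repeat the exhaustive search from Section 7.2 or refit TBATS on the full series; a sketch of both options:

# Option 1: repeat the exhaustive seasonal search on the full series (slower)
best_seasonal_arima <- auto.arima(ts_total_cases, seasonal = TRUE, stepwise = FALSE)

# Option 2: refit TBATS (lowest test-set RMSE) on the full series
best_tbats <- tbats(ts_total_cases)
final_forecast_tbats <- forecast(best_tbats, h = 52)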

10.2 Forecast Next 52 Weeks

# Forecast next 52 weeks (1 year)
final_forecast <- forecast(best_model, h = 52)

autoplot(final_forecast) +
  labs(
    title = "Dengue Cases Forecast for Next Year",
    subtitle = paste("Model:", best_model$method),
    x = "Time",
    y = "Total Cases"
  ) +
  theme_minimal()

10.3 Forecast Table

forecast_df <- data.frame(
  Week = 1:52,
  Point_Forecast = round(as.numeric(final_forecast$mean), 0),
  Lower_80 = round(as.numeric(final_forecast$lower[, 1]), 0),
  Upper_80 = round(as.numeric(final_forecast$upper[, 1]), 0),
  Lower_95 = round(as.numeric(final_forecast$lower[, 2]), 0),
  Upper_95 = round(as.numeric(final_forecast$upper[, 2]), 0)
)

# Display first 12 weeks
head(forecast_df, 12) %>%
  kable(caption = "Forecast for Next 12 Weeks (with confidence intervals)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Forecast for Next 12 Weeks (with confidence intervals)
Week Point_Forecast Lower_80 Upper_80 Lower_95 Upper_95
1 15 -2 32 -11 41
2 15 -11 40 -24 54
3 15 -18 47 -35 64
4 15 -24 53 -45 74
5 15 -30 59 -53 83
6 15 -35 64 -62 91
7 15 -40 69 -69 98
8 15 -45 74 -76 106
9 15 -49 78 -83 112
10 15 -53 83 -89 119
11 15 -57 87 -95 125
12 15 -61 90 -101 130
# Save complete forecast
write.csv(forecast_df, "dengue_forecast.csv", row.names = FALSE)
cat("Complete forecast saved to 'dengue_forecast.csv'\n")
## Complete forecast saved to 'dengue_forecast.csv'

11 Serotype-Specific Analysis

11.1 Time Series by Serotype

# Create time series for each serotype
ts_denv1 <- ts(dengue_data$denv1_cases, start = c(1990, 1), frequency = 52)
ts_denv2 <- ts(dengue_data$denv2_cases, start = c(1990, 1), frequency = 52)
ts_denv3 <- ts(dengue_data$denv3_cases, start = c(1990, 1), frequency = 52)
ts_denv4 <- ts(dengue_data$denv4_cases, start = c(1990, 1), frequency = 52)
par(mfrow = c(4, 1), mar = c(4, 4, 2, 1))
plot(ts_denv1, main = "DENV-1 Cases", ylab = "Cases", col = "red", lwd = 1.5)
plot(ts_denv2, main = "DENV-2 Cases", ylab = "Cases", col = "blue", lwd = 1.5)
plot(ts_denv3, main = "DENV-3 Cases", ylab = "Cases", col = "green", lwd = 1.5)
plot(ts_denv4, main = "DENV-4 Cases", ylab = "Cases", col = "purple", lwd = 1.5)

par(mfrow = c(1, 1))

11.2 Serotype Proportions Over Time

serotype_data <- dengue_data %>%
  mutate(
    year = year(week_start_date),
    total_serotyped = denv1_cases + denv2_cases + denv3_cases + denv4_cases
  ) %>%
  filter(total_serotyped > 0) %>%
  group_by(year) %>%
  summarise(
    DENV1 = sum(denv1_cases) / sum(total_serotyped) * 100,
    DENV2 = sum(denv2_cases) / sum(total_serotyped) * 100,
    DENV3 = sum(denv3_cases) / sum(total_serotyped) * 100,
    DENV4 = sum(denv4_cases) / sum(total_serotyped) * 100,
    .groups = "drop"
  ) %>%
  pivot_longer(cols = starts_with("DENV"), 
               names_to = "Serotype", 
               values_to = "Percentage")

ggplot(serotype_data, aes(x = year, y = Percentage, fill = Serotype)) +
  geom_area(alpha = 0.7) +
  scale_fill_manual(values = c("DENV1" = "red", "DENV2" = "blue", 
                                "DENV3" = "green", "DENV4" = "purple")) +
  labs(
    title = "Dengue Serotype Distribution Over Time",
    subtitle = "Proportion of each serotype among typed cases",
    x = "Year",
    y = "Percentage (%)"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

12 Conclusions

12.1 Key Findings

  1. Seasonality: Strong seasonal patterns with peaks typically occurring in the latter half of the year
  2. Trend: Variable long-term trends with epidemic cycles
  3. Stationarity: The original series is non-stationary but becomes stationary after differencing
  4. Best Model: On the held-out test set, TBATS achieved the lowest RMSE and Auto ARIMA the lowest MAE; MAPE was uninformative because of zero-case weeks
  5. Forecast: The refit non-seasonal ARIMA yields a nearly flat point forecast with rapidly widening intervals; projecting the within-year pattern requires a seasonal model such as TBATS or the seasonal ARIMA from Section 7

12.2 Recommendations

  1. Public Health Planning: Prepare for seasonal peaks, especially during high-risk months
  2. Surveillance: Enhanced monitoring during predicted high-incidence periods
  3. Model Updates: Regularly update models with new data for improved accuracy
  4. Serotype Monitoring: Track serotype distribution for epidemic preparedness

12.3 Limitations

  • Model assumes historical patterns continue
  • Does not account for climate variables or vector control interventions (a sketch of how external regressors could be added follows this list)
  • Uncertainty increases with longer forecast horizons
  • Extreme events may not be well-predicted
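
If weekly climate covariates were available, they could in principle enter an ARIMA model as external regressors. A purely illustrative sketch, assuming hypothetical data frames climate_data and future_climate (not part of this dataset) with temperature and precipitation columns aligned to the weekly series:

# Hypothetical: ARIMA with external climate regressors (climate_data is NOT in this dataset)
xreg_train <- as.matrix(climate_data[, c("temperature", "precipitation")])
arimax_model <- auto.arima(ts_total_cases, xreg = xreg_train, seasonal = TRUE)

# Forecasting requires future values (or forecasts) of the regressors
xreg_future <- as.matrix(future_climate[, c("temperature", "precipitation")])
forecast(arimax_model, xreg = xreg_future, h = nrow(xreg_future))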

13 Session Information

sessionInfo()
## R version 4.5.0 (2025-04-11)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sequoia 15.6.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Puerto_Rico
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] kableExtra_1.4.0 knitr_1.50       gridExtra_2.3    forecast_8.24.0 
##  [5] tseries_0.10-58  lubridate_1.9.4  forcats_1.0.0    stringr_1.5.1   
##  [9] dplyr_1.1.4      purrr_1.0.4      readr_2.1.5      tidyr_1.3.1     
## [13] tibble_3.2.1     ggplot2_3.5.2    tidyverse_2.0.0 
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6      xfun_0.52         bslib_0.9.0       lattice_0.22-6   
##  [5] tzdb_0.5.0        quadprog_1.5-8    vctrs_0.6.5       tools_4.5.0      
##  [9] generics_0.1.3    curl_6.2.2        parallel_4.5.0    xts_0.14.1       
## [13] pkgconfig_2.0.3   Matrix_1.7-3      lifecycle_1.0.4   farver_2.1.2     
## [17] compiler_4.5.0    textshaping_1.0.0 munsell_0.5.1     htmltools_0.5.8.1
## [21] sass_0.4.10       yaml_2.3.10       pillar_1.10.2     jquerylib_0.1.4  
## [25] cachem_1.1.0      nlme_3.1-168      fracdiff_1.5-3    tidyselect_1.2.1 
## [29] digest_0.6.37     stringi_1.8.7     labeling_0.4.3    splines_4.5.0    
## [33] fastmap_1.2.0     grid_4.5.0        colorspace_2.1-1  cli_3.6.4        
## [37] magrittr_2.0.3    withr_3.0.2       scales_1.3.0      timechange_0.3.0 
## [41] TTR_0.24.4        rmarkdown_2.29    quantmod_0.4.28   nnet_7.3-20      
## [45] timeDate_4051.111 zoo_1.8-14        hms_1.1.3         urca_1.3-4       
## [49] evaluate_1.0.3    lmtest_0.9-40     viridisLite_0.4.2 mgcv_1.9-1       
## [53] rlang_1.1.6       Rcpp_1.0.14       glue_1.8.0        xml2_1.3.8       
## [57] svglite_2.2.1     rstudioapi_0.17.1 jsonlite_2.0.0    R6_2.6.1         
## [61] systemfonts_1.3.1

End of Report