Information on Dengue

Data

## 
## Summary Statistics of Variables
## ==============================================================================
## Statistic                               N     Mean    St. Dev.   Min     Max  
## ------------------------------------------------------------------------------
## year                                  1,456 2,001.032  5.408    1,990   2,010 
## weekofyear                            1,456  26.503    15.019     1      53   
## ndvi_ne                               1,262   0.142    0.141   -0.406   0.508 
## ndvi_nw                               1,404   0.131    0.120   -0.456   0.454 
## ndvi_se                               1,434   0.204    0.074   -0.016   0.538 
## ndvi_sw                               1,434   0.202    0.084   -0.063   0.546 
## precipitation_amt_mm                  1,443  45.760    43.716   0.000  390.600
## reanalysis_air_temp_k                 1,446  298.702   1.362   294.636 302.200
## reanalysis_avg_temp_k                 1,446  299.226   1.262   294.893 302.929
## reanalysis_dew_point_temp_k           1,446  295.246   1.528   289.643 298.450
## reanalysis_max_air_temp_k             1,446  303.427   3.235   297.800 314.000
## reanalysis_min_air_temp_k             1,446  295.719   2.565   286.900 299.900
## reanalysis_precip_amt_kg_per_m2       1,446  40.152    43.434   0.000  570.500
## reanalysis_relative_humidity_percent  1,446  82.162    7.154   57.787  98.610 
## reanalysis_sat_precip_amt_mm          1,443  45.760    43.716   0.000  390.600
## reanalysis_specific_humidity_g_per_kg 1,446  16.746    1.542   11.716  20.461 
## reanalysis_tdtr_k                     1,446   4.904    3.546    1.357  16.029 
## station_avg_temp_c                    1,413  27.186    1.292   21.400  30.800 
## station_diur_temp_rng_c               1,413   8.059    2.129    4.529  15.800 
## station_max_temp_c                    1,436  32.452    1.959   26.700  42.200 
## station_min_temp_c                    1,442  22.102    1.574   14.700  25.600 
## station_precip_mm                     1,434  39.326    47.455   0.000  543.300
## ------------------------------------------------------------------------------

Data Manipulation

# Fill missing values with the last observed input 

## Fill missing values with the last observed input of each column

filled_df <- apply(train, 2, function(x) na.locf(x, na.rm = FALSE))

## Convert filled_df back to data frame (apply converts it to a matrix)
dengue_train <- as.data.frame(filled_df)

# Repeat the same process on test data

filled_df <- apply(test, 2, function(x) na.locf(x, na.rm = FALSE))

## Convert filled_df back to data frame (apply converts it to a matrix)
dengue_test <- as.data.frame(filled_df)
# Merge 'total case' df with 'dengue_train' df

df <- cbind(total_case$total_cases, dengue_train)
# Turn "week_start_date" as Date

df$week_start_date <- as.Date(df$week_start_date)

# Change variables from "chr" to "num"

## Function to change variable type by specifying column numbers
change_variable_type <- function(data, col_numbers, new_type) {
  data %>%
    mutate_at(col_numbers, as.factor) %>%
    mutate_at(col_numbers, as.numeric)
}

# Specify the column numbers you want to change

columns <- c(3:4,6:25) 

# Specify the new type

new_type <- "numeric"  

# Change variable type by specifying column numbers

df <- change_variable_type(df, columns, new_type)
# Rename the Y variable

names(df)[1] <- "cases"
# Separate master dataframe by cities

dengue_sj <- df |>
  filter(city == 'sj')

dengue_iq <- df |>
  filter(city == 'iq')
# Make time series object for both dataframes

sj_ts <- ts(dengue_sj, start = c(1990,03), end =c(2008,04), frequency = 52)

iq_ts <- ts(dengue_iq, start = c(2000,07), end =c(2010,06), frequency = 52)
# Train / Test Split (70/30)

## San Jan (937 * 0.7 = 681)

train_sj <- dengue_sj |>
  slice(1:681) |>
  ts(cases, start = c(1990,4), end =c(2003,8), frequency = 52)

test_sj <- dengue_sj |>
  slice(682:937) |>
  ts(cases, start = c(2003,8), end =c(2008,2), frequency = 52)

## Iquitos (520 * 0.7 = 364)

train_iq <- dengue_iq |>
  slice(1:364) |>
  ts(cases, start = c(2000,7), end =c(2007,6), frequency = 52)

test_iq <- dengue_iq |>
  slice(365:520) |>
  ts(cases, start = c(2007,6), end =c(2010,5), frequency = 52)

Data Visualization

Original Data

Decomposition

  • Comment: Additive decomposition performs better than multiplicative method on both cities, less remainder component.

Correlation Matrix

Models

1. ARIMA

# Make model

arima_model_sj <- auto.arima(train_sj[,'cases'], seasonal = T)

# Generate forecast

forecast_sj <- forecast(arima_model_sj, h = 255)

# Plot forecast and actual data
forecast_sj |>
  autoplot() +
  autolayer(sj_ts[,'cases'], series = "Actual Data") +
  labs(y = 'Number of cases', title = "San Juan Forecasts with ARIMA(2,0,1)") +
  theme_minimal()

# Make model

arima_model_iq <- auto.arima(train_iq[,'cases'], seasonal = T)

# Generate forecast

forecast_iq <- forecast(arima_model_iq, h = 156)

# Plot forecast and actual data
forecast_iq |>
  autoplot() +
  autolayer(iq_ts[,'cases'], series = "Actual Data") +
  labs(y = 'Number of cases', title = "Iquitos Forecasts with ARIMA(0,1,2)") +
  theme_minimal()

2. ARIMA w Regressor

# Make model

arima_reg_model_sj <- auto.arima(train_sj[,'cases'], xreg = train_sj[,20] + train_sj[,17])

# Generate forecast

forecast_sj <- forecast(arima_reg_model_sj, xreg = test_sj[,20] + test_sj[,17], h = 255)

# Plot forecast and actual data
forecast_sj |>
  autoplot() +
  autolayer(sj_ts[,'cases'], series = "Actual Data") +
  labs(y = 'Number of cases', title = "San Juan Forecasts with ARIMA(2,0,1)") +
  theme_minimal()

# Make model

arima_reg_model_iq <- auto.arima(train_iq[,'cases'], xreg = train_iq[,13] + train_iq[,17])

# Generate forecast

forecast_iq <- forecast(arima_reg_model_iq, xreg = test_iq[,13] + test_iq[,17], h = 156)

# Plot forecast and actual data
forecast_iq |>
  autoplot() +
  autolayer(iq_ts[,'cases'], series = "Actual Data") +
  labs(y = 'Number of cases', title = "Iquitos Forecasts with ARIMA(2,0,1)") +
  theme_minimal()

4. Neural Network

Regular Neural Network

# Fit the NNETAR model

nnt_model_sj <- nnetar(train_sj[,'cases'])

nnt_result <- forecast(nnt_model_sj, h = 255)

autoplot(sj_ts[,'cases']) +
  autolayer(nnt_result, series = "NNT Model") +
  labs(y = 'Number of cases', title = 'San Juan Forecasts from Neural Network Model') +
  theme_minimal()

# Fit the NNETAR model

nnt_model_iq <- nnetar(train_iq[,'cases'])

nnt_result <- forecast(nnt_model_iq, h = 156)

autoplot(iq_ts[,'cases']) +
  autolayer(nnt_result, series = "NNT Model") +
  labs(y = 'Number of cases', title = 'Iquitos Forecasts from Neural Network Model') +
  theme_minimal()

Neural Network with Regressors

# Build model

nnt_r_model_sj <- nnetar(train_sj[,'cases'], xreg = cbind(train_sj[,14], train_sj[,17])) 
# column 14 = max_temp, 
# column 17 = relative humidity

# Forecast

nnt_r_result_sj <- forecast(nnt_r_model_sj, xreg = cbind(test_sj[,14], test_sj[,17]), h = 255)
## Warning in forecast.nnetar(nnt_r_model_sj, xreg = cbind(test_sj[, 14],
## test_sj[, : xreg contains different column names from the xreg used in
## training. Please check that the regressors are in the same order.
# Graph result

autoplot(sj_ts[,'cases']) +
  autolayer(nnt_r_result_sj, series = "NNT with Regressor") +
  labs(y = 'Number of cases', title = 'San Juan Forecasts from Neural Network Model with Regressors') +
  theme_minimal()

# Build model

nnt_r_model_iq <- nnetar(train_iq[,'cases'], xreg = cbind(train_iq[,6:9],train_iq[,13], train_iq[,19], train_iq[,20])) # column 6-9 = vegetation indexes
                               # column 13 = dew point, 
                               # column 19 = specific humidity
                               # column 20 = daily temp difference

# Forecast

nnt_r_result_iq <- forecast(nnt_r_model_iq, xreg = cbind(test_iq[,6:9],test_iq[,13], test_iq[,19], test_iq[,20]), h = 156)
## Warning in forecast.nnetar(nnt_r_model_iq, xreg = cbind(test_iq[, 6:9], : xreg
## contains different column names from the xreg used in training. Please check
## that the regressors are in the same order.
# Graph result

autoplot(iq_ts[,'cases']) +
  autolayer(nnt_r_result_iq, series = "NNT with Regressor") +
  labs(y = 'Number of cases', title = 'Iquitos Forecasts from Neural Network Model with Regressors') +
  theme_minimal()

Model Evaluation & Selection

# ARIMA SJ
accuracy(arima_model_sj)
##                      ME     RMSE      MAE  MPE MAPE      MASE       ACF1
## Training set 0.01073254 14.09735 8.522365 -Inf  Inf 0.1994185 0.03950335
# ARIMA Reg SJ
accuracy(arima_reg_model_sj)
##                      ME     RMSE      MAE  MPE MAPE      MASE       ACF1
## Training set 0.01809104 14.07827 8.558853 -Inf  Inf 0.2002723 0.04075783
# NNT Model SJ
accuracy(nnt_model_sj)
##                       ME     RMSE      MAE  MPE MAPE      MASE        ACF1
## Training set -0.02307732 6.795765 4.958958 -Inf  Inf 0.1160368 -0.01009775
# NNT Reg SJ
accuracy(nnt_r_model_sj) # Most optimal
##                       ME     RMSE      MAE  MPE MAPE      MASE        ACF1
## Training set -0.03470002 6.391498 4.672788 -Inf  Inf 0.1093406 -0.03596043
# ARIMA IQ
accuracy(arima_model_iq)
##                      ME    RMSE      MAE MPE MAPE      MASE          ACF1
## Training set 0.02696242 7.09392 3.619809 NaN  Inf 0.3894415 -0.0006579031
# ARIMA Reg IQ
accuracy(arima_reg_model_iq)
##                     ME     RMSE      MAE  MPE MAPE      MASE        ACF1
## Training set 0.0370819 6.960542 3.650921 -Inf  Inf 0.3927887 -0.01354936
# NNT Model IQ
accuracy(nnt_model_iq)
##                        ME     RMSE      MAE  MPE MAPE      MASE         ACF1
## Training set -0.004818704 5.496049 3.327064 -Inf  Inf 0.3579462 -0.007487392
# NNT Reg IQ
accuracy(nnt_r_model_iq) # Most optimal
##                        ME     RMSE      MAE MPE MAPE      MASE       ACF1
## Training set -0.003458757 2.740832 2.025392 NaN  Inf 0.2179042 0.02182933

Apply Models

# Modify test set 

test_sj <- dengue_test |>
  filter(city == 'sj') |>
  ts(cases, start = c(2008,4), end =c(2013,4), frequency = 52)
  
test_iq <- dengue_test |>
  filter(city == 'iq')  |>
  ts(cases, start = c(2010,7), end =c(2013,6), frequency = 52)
# Build model

nnt_sj_model <- nnetar(sj_ts[,'cases'], xreg = cbind(sj_ts[,14],sj_ts[,17])) # column 14 = max_temp, 
                                                                               # column 17 = relative humidity

# Forecast
sj_forecast <- forecast(nnt_sj_model, xreg = cbind(test_sj[,14], test_sj[,17]), h = 5)
## Warning in forecast.nnetar(nnt_sj_model, xreg = cbind(test_sj[, 14], test_sj[,
## : xreg contains different column names from the xreg used in training. Please
## check that the regressors are in the same order.
# Graph result
autoplot(sj_ts[,'cases']) +
  autolayer(sj_forecast$fitted, series = 'Fitted values') +
  autolayer(sj_forecast, series = 'Forecasts') +
  labs(y = 'Number of Cases', title = 'Neural Network Model Forecasts (SJ)') +
  theme_minimal()
## Warning: Removed 52 rows containing missing values (`geom_line()`).

## Warning in forecast.nnetar(nnt_iq_model, xreg = cbind(test_iq[, 6:9], test_iq[,
## : xreg contains different column names from the xreg used in training. Please
## check that the regressors are in the same order.
## Warning: Removed 52 rows containing missing values (`geom_line()`).

Extract Results

##    city year weekofyear x
## 1    sj 2008         18 5
## 2    sj 2008         19 5
## 3    sj 2008         20 5
## 4    sj 2008         21 6
## 5    sj 2008         22 7
## 6    iq 2010         26 6
## 7    iq 2010         27 7
## 8    iq 2010         28 8
## 9    iq 2010         29 9
## 10   iq 2010         30 9