# load packages
library(readr)
library(ISOweek)
library(skimr)
library(stringr)
library(zoo)
library(forecast)
library(tidyr)
library(tidyverse)
library(fable)
library(fabletools)
library(dplyr)
library(ggplot2)
library(ggcorrplot)
library(corrplot)
library(kableExtra)
library(tsibbledata)
library(fpp3)
# import datasets 
train_features <- read_csv("/Users/laykhoonyu/Documents/MSAA_AE/predictive_analytics/DengAI/dengue_features_train.csv")
train_labels <- read_csv("/Users/laykhoonyu/Documents/MSAA_AE/predictive_analytics/DengAI/dengue_labels_train.csv")
test_features <- read_csv("/Users/laykhoonyu/Documents/MSAA_AE/predictive_analytics/DengAI/dengue_features_test.csv")
predictions <- read_csv("/Users/laykhoonyu/Documents/MSAA_AE/predictive_analytics/DengAI/submission_format.csv")
head(train_features)
## # A tibble: 6 × 24
##   city   year weekofyear week_start_date ndvi_ne ndvi_nw ndvi_se ndvi_sw
##   <chr> <dbl>      <dbl> <date>            <dbl>   <dbl>   <dbl>   <dbl>
## 1 sj     1990         18 1990-04-30       0.123    0.104   0.198   0.178
## 2 sj     1990         19 1990-05-07       0.170    0.142   0.162   0.155
## 3 sj     1990         20 1990-05-14       0.0322   0.173   0.157   0.171
## 4 sj     1990         21 1990-05-21       0.129    0.245   0.228   0.236
## 5 sj     1990         22 1990-05-28       0.196    0.262   0.251   0.247
## 6 sj     1990         23 1990-06-04      NA        0.175   0.254   0.182
## # ℹ 16 more variables: precipitation_amt_mm <dbl>, reanalysis_air_temp_k <dbl>,
## #   reanalysis_avg_temp_k <dbl>, reanalysis_dew_point_temp_k <dbl>,
## #   reanalysis_max_air_temp_k <dbl>, reanalysis_min_air_temp_k <dbl>,
## #   reanalysis_precip_amt_kg_per_m2 <dbl>,
## #   reanalysis_relative_humidity_percent <dbl>,
## #   reanalysis_sat_precip_amt_mm <dbl>,
## #   reanalysis_specific_humidity_g_per_kg <dbl>, reanalysis_tdtr_k <dbl>, …
head(train_labels)
## # A tibble: 6 × 4
##   city   year weekofyear total_cases
##   <chr> <dbl>      <dbl>       <dbl>
## 1 sj     1990         18           4
## 2 sj     1990         19           5
## 3 sj     1990         20           4
## 4 sj     1990         21           3
## 5 sj     1990         22           6
## 6 sj     1990         23           2
skim(train_features)
Data summary
Name train_features
Number of rows 1456
Number of columns 24
_______________________
Column type frequency:
character 1
Date 1
numeric 22
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
city 0 1 2 2 0 2 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
week_start_date 0 1 1990-04-30 2010-06-25 2002-05-28 1049

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
year 0 1.00 2001.03 5.41 1990.00 1997.00 2002.00 2005.00 2010.00 ▅▃▆▇▅
weekofyear 0 1.00 26.50 15.02 1.00 13.75 26.50 39.25 53.00 ▇▇▇▇▇
ndvi_ne 194 0.87 0.14 0.14 -0.41 0.04 0.13 0.25 0.51 ▁▂▇▇▂
ndvi_nw 52 0.96 0.13 0.12 -0.46 0.05 0.12 0.22 0.45 ▁▁▆▇▂
ndvi_se 22 0.98 0.20 0.07 -0.02 0.16 0.20 0.25 0.54 ▁▇▆▁▁
ndvi_sw 22 0.98 0.20 0.08 -0.06 0.14 0.19 0.25 0.55 ▁▇▇▂▁
precipitation_amt_mm 13 0.99 45.76 43.72 0.00 9.80 38.34 70.24 390.60 ▇▂▁▁▁
reanalysis_air_temp_k 10 0.99 298.70 1.36 294.64 297.66 298.65 299.83 302.20 ▁▅▇▇▂
reanalysis_avg_temp_k 10 0.99 299.23 1.26 294.89 298.26 299.29 300.21 302.93 ▁▃▇▇▁
reanalysis_dew_point_temp_k 10 0.99 295.25 1.53 289.64 294.12 295.64 296.46 298.45 ▁▂▅▇▃
reanalysis_max_air_temp_k 10 0.99 303.43 3.23 297.80 301.00 302.40 305.50 314.00 ▅▇▃▂▁
reanalysis_min_air_temp_k 10 0.99 295.72 2.57 286.90 293.90 296.20 297.90 299.90 ▁▂▆▆▇
reanalysis_precip_amt_kg_per_m2 10 0.99 40.15 43.43 0.00 13.05 27.24 52.20 570.50 ▇▁▁▁▁
reanalysis_relative_humidity_percent 10 0.99 82.16 7.15 57.79 77.18 80.30 86.36 98.61 ▁▁▇▃▃
reanalysis_sat_precip_amt_mm 13 0.99 45.76 43.72 0.00 9.80 38.34 70.24 390.60 ▇▂▁▁▁
reanalysis_specific_humidity_g_per_kg 10 0.99 16.75 1.54 11.72 15.56 17.09 17.98 20.46 ▁▃▅▇▁
reanalysis_tdtr_k 10 0.99 4.90 3.55 1.36 2.33 2.86 7.62 16.03 ▇▁▂▁▁
station_avg_temp_c 43 0.97 27.19 1.29 21.40 26.30 27.41 28.16 30.80 ▁▁▅▇▁
station_diur_temp_rng_c 43 0.97 8.06 2.13 4.53 6.51 7.30 9.57 15.80 ▇▇▃▂▁
station_max_temp_c 20 0.99 32.45 1.96 26.70 31.10 32.80 33.90 42.20 ▂▇▇▁▁
station_min_temp_c 14 0.99 22.10 1.57 14.70 21.10 22.20 23.30 25.60 ▁▁▅▇▃
station_precip_mm 22 0.98 39.33 47.46 0.00 8.70 23.85 53.90 543.30 ▇▁▁▁▁

Data Cleaning

train_labels <- train_labels %>% mutate(beginning = ymd(str_c(year, "-01-01")),
           final_date = beginning + weeks(weekofyear))
train_labels %>%
  ggplot(aes(x = final_date, y = total_cases, color = city)) +
  geom_line() +
  labs(title = "Total Cases by City",
       x = "Date",
       y = "Value",
       color = "Region") +
  theme_minimal()

for (col in names(train_features)) {
  # Check if the column has missing values
  if (any(is.na(train_features[[col]]))) {
    # Apply linear interpolation to the column and overwrite the original dataset
    train_features[[col]] <- na.approx(train_features[[col]])
  }
}
train_merged <- train_features %>%
  left_join(select(train_labels, year, weekofyear, city, total_cases), by = c("year", "weekofyear", "city"))
skim(train_merged)
Data summary
Name train_merged
Number of rows 1456
Number of columns 25
_______________________
Column type frequency:
character 1
Date 1
numeric 23
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
city 0 1 2 2 0 2 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
week_start_date 0 1 1990-04-30 2010-06-25 2002-05-28 1049

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
year 0 1 2001.03 5.41 1990.00 1997.00 2002.00 2005.00 2010.00 ▅▃▆▇▅
weekofyear 0 1 26.50 15.02 1.00 13.75 26.50 39.25 53.00 ▇▇▇▇▇
ndvi_ne 0 1 0.13 0.14 -0.41 0.04 0.12 0.23 0.51 ▁▂▇▆▂
ndvi_nw 0 1 0.13 0.12 -0.46 0.05 0.12 0.21 0.45 ▁▁▇▇▂
ndvi_se 0 1 0.20 0.07 -0.02 0.15 0.20 0.25 0.54 ▁▇▆▁▁
ndvi_sw 0 1 0.20 0.08 -0.06 0.14 0.19 0.25 0.55 ▁▇▇▂▁
precipitation_amt_mm 0 1 45.70 43.65 0.00 9.79 38.32 70.23 390.60 ▇▂▁▁▁
reanalysis_air_temp_k 0 1 298.70 1.36 294.64 297.66 298.64 299.83 302.20 ▁▅▇▇▂
reanalysis_avg_temp_k 0 1 299.22 1.26 294.89 298.26 299.29 300.21 302.93 ▁▃▇▇▁
reanalysis_dew_point_temp_k 0 1 295.24 1.53 289.64 294.12 295.64 296.46 298.45 ▁▂▅▇▃
reanalysis_max_air_temp_k 0 1 303.42 3.23 297.80 301.00 302.40 305.50 314.00 ▅▇▃▂▁
reanalysis_min_air_temp_k 0 1 295.72 2.56 286.90 293.90 296.20 297.90 299.90 ▁▂▆▆▇
reanalysis_precip_amt_kg_per_m2 0 1 40.13 43.31 0.00 13.20 27.30 52.20 570.50 ▇▁▁▁▁
reanalysis_relative_humidity_percent 0 1 82.17 7.15 57.79 77.20 80.29 86.44 98.61 ▁▁▇▃▃
reanalysis_sat_precip_amt_mm 0 1 45.70 43.65 0.00 9.79 38.32 70.23 390.60 ▇▂▁▁▁
reanalysis_specific_humidity_g_per_kg 0 1 16.74 1.54 11.72 15.55 17.08 17.98 20.46 ▁▃▅▇▁
reanalysis_tdtr_k 0 1 4.90 3.54 1.36 2.33 2.86 7.63 16.03 ▇▁▂▁▁
station_avg_temp_c 0 1 27.18 1.28 21.40 26.31 27.40 28.13 30.80 ▁▁▅▇▁
station_diur_temp_rng_c 0 1 8.10 2.13 4.53 6.53 7.35 9.60 15.80 ▇▇▃▂▁
station_max_temp_c 0 1 32.45 1.96 26.70 31.10 32.80 33.90 42.20 ▂▇▇▁▁
station_min_temp_c 0 1 22.10 1.57 14.70 21.10 22.20 23.30 25.60 ▁▁▅▇▃
station_precip_mm 0 1 39.36 47.29 0.00 8.85 24.05 53.90 543.30 ▇▁▁▁▁
total_cases 0 1 24.68 43.60 0.00 5.00 12.00 28.00 461.00 ▇▁▁▁▁
# subset train_merged by city
train_merged_sj <- train_merged %>% 
  filter(city == "sj")

train_merged_iq <- train_merged %>%
  filter(city == "iq")

Feature Selection

# Reorder columns so that 'total_cases' is the first column
train_merged_sj_reordered <- train_merged_sj %>%
  select(total_cases, everything())

# Compute correlation matrix
correlation_matrix_sj <- cor(train_merged_sj_reordered[sapply(train_merged_sj_reordered, is.numeric)])

# Create correlation heatmap
corrplot(correlation_matrix_sj, method = "color", type = "upper", tl.cex = 0.5)

# Convert correlation matrix to a correlation table
correlation_table_sj <- as.data.frame(correlation_matrix_sj)

# Print formatted correlation table
kable(correlation_table_sj, "html") %>%
  kable_styling(full_width = FALSE)
total_cases year weekofyear ndvi_ne ndvi_nw ndvi_se ndvi_sw precipitation_amt_mm reanalysis_air_temp_k reanalysis_avg_temp_k reanalysis_dew_point_temp_k reanalysis_max_air_temp_k reanalysis_min_air_temp_k reanalysis_precip_amt_kg_per_m2 reanalysis_relative_humidity_percent reanalysis_sat_precip_amt_mm reanalysis_specific_humidity_g_per_kg reanalysis_tdtr_k station_avg_temp_c station_diur_temp_rng_c station_max_temp_c station_min_temp_c station_precip_mm
total_cases 1.0000000 -0.2126898 0.2871342 0.0859184 0.0474027 -0.0436799 0.0426704 0.0602955 0.1803106 0.1736703 0.2028068 0.1932326 0.1865386 0.1069389 0.1444042 0.0602955 0.2069423 -0.0676227 0.1948228 0.0353035 0.1884475 0.1752482 0.0518830
year -0.2126898 1.0000000 -0.0731425 -0.4126646 -0.4945010 -0.0066941 -0.0822972 0.0333330 0.1869462 0.1907631 0.0166096 0.1781211 0.1068788 -0.1321410 -0.2862127 0.0333330 0.0192497 0.3244377 -0.0963725 -0.2783312 -0.1722829 -0.0007298 0.0827184
weekofyear 0.2871342 -0.0731425 1.0000000 -0.0308390 -0.0373903 -0.0191420 -0.0715881 0.2317451 0.5676632 0.5536197 0.5721412 0.5112365 0.5689748 0.2522260 0.3059991 0.2317451 0.5791925 -0.1010152 0.4792601 -0.1403304 0.3193425 0.5166650 0.2114750
ndvi_ne 0.0859184 -0.4126646 -0.0308390 1.0000000 0.6326599 0.2193530 0.1810355 -0.0476045 -0.0814177 -0.0800987 -0.0446575 -0.0546411 -0.0904151 0.0120165 0.0422167 -0.0476045 -0.0405857 -0.0338958 0.0655268 0.1810871 0.1118221 0.0114240 -0.0753148
ndvi_nw 0.0474027 -0.4945010 -0.0373903 0.6326599 1.0000000 0.1973110 0.2177400 -0.0397861 -0.0755900 -0.0742216 -0.0248703 -0.0428293 -0.0743365 0.0057100 0.0762943 -0.0397861 -0.0195721 -0.0508752 0.0886967 0.1837657 0.1371997 0.0182586 -0.0874670
ndvi_se -0.0436799 -0.0066941 -0.0191420 0.2193530 0.1973110 1.0000000 0.8146215 -0.1180438 -0.0152739 -0.0128028 -0.0636048 -0.0094001 -0.0468787 -0.1293738 -0.1143034 -0.1180438 -0.0591023 0.0329361 -0.0609560 0.0161918 -0.0703627 -0.0735414 -0.1378234
ndvi_sw 0.0426704 -0.0822972 -0.0715881 0.1810355 0.2177400 0.8146215 1.0000000 -0.1164566 -0.0378028 -0.0307194 -0.0822551 -0.0086754 -0.0672088 -0.1230979 -0.1153108 -0.1164566 -0.0752546 0.0526967 -0.0340891 0.0744763 -0.0088301 -0.0689030 -0.1715959
precipitation_amt_mm 0.0602955 0.0333330 0.2317451 -0.0476045 -0.0397861 -0.1180438 -0.1164566 1.0000000 0.2327357 0.2210826 0.4009756 0.2560687 0.2446586 0.5082735 0.4993110 1.0000000 0.4083665 -0.0937647 0.1956208 -0.1576758 0.1901354 0.2235604 0.5666605
reanalysis_air_temp_k 0.1803106 0.1869462 0.5676632 -0.0814177 -0.0755900 -0.0152739 -0.0378028 0.2327357 1.0000000 0.9975070 0.9034811 0.9353394 0.9422484 0.0804000 0.2979414 0.2327357 0.9050036 0.1782327 0.8808712 0.0417913 0.6987284 0.8331576 0.1132962
reanalysis_avg_temp_k 0.1736703 0.1907631 0.5536197 -0.0800987 -0.0742216 -0.0128028 -0.0307194 0.2210826 0.9975070 1.0000000 0.8953727 0.9392017 0.9391272 0.0626353 0.2840727 0.2210826 0.8964199 0.2012651 0.8791185 0.0563202 0.7041286 0.8274975 0.0974891
reanalysis_dew_point_temp_k 0.2028068 0.0166096 0.5721412 -0.0446575 -0.0248703 -0.0636048 -0.0822551 0.4009756 0.9034811 0.8953727 1.0000000 0.8476544 0.8990078 0.3284607 0.6781165 0.4009756 0.9985332 -0.0329880 0.8688371 -0.0544207 0.6902640 0.8504789 0.2847830
reanalysis_max_air_temp_k 0.1932326 0.1781211 0.5112365 -0.0546411 -0.0428293 -0.0094001 -0.0086754 0.2560687 0.9353394 0.9392017 0.8476544 1.0000000 0.8286651 0.0916054 0.2874617 0.2560687 0.8536287 0.3526397 0.8528307 0.1163216 0.7616858 0.7712400 0.1047280
reanalysis_min_air_temp_k 0.1865386 0.1068788 0.5689748 -0.0904151 -0.0743365 -0.0468787 -0.0672088 0.2446586 0.9422484 0.9391272 0.8990078 0.8286651 1.0000000 0.1327427 0.3849484 0.2446586 0.8963762 -0.0496985 0.8412998 -0.0224385 0.6271057 0.8297925 0.1502997
reanalysis_precip_amt_kg_per_m2 0.1069389 -0.1321410 0.2522260 0.0120165 0.0057100 -0.1293738 -0.1230979 0.5082735 0.0804000 0.0626353 0.3284607 0.0916054 0.1327427 1.0000000 0.6025475 0.5082735 0.3344285 -0.3068675 0.1351272 -0.2508274 0.0809598 0.1985403 0.4779837
reanalysis_relative_humidity_percent 0.1444042 -0.2862127 0.3059991 0.0422167 0.0762943 -0.1143034 -0.1153108 0.4993110 0.2979414 0.2840727 0.6781165 0.2874617 0.3849484 0.6025475 1.0000000 0.4993110 0.6730096 -0.3745471 0.4264236 -0.1931811 0.3419622 0.4660515 0.4444624
reanalysis_sat_precip_amt_mm 0.0602955 0.0333330 0.2317451 -0.0476045 -0.0397861 -0.1180438 -0.1164566 1.0000000 0.2327357 0.2210826 0.4009756 0.2560687 0.2446586 0.5082735 0.4993110 1.0000000 0.4083665 -0.0937647 0.1956208 -0.1576758 0.1901354 0.2235604 0.5666605
reanalysis_specific_humidity_g_per_kg 0.2069423 0.0192497 0.5791925 -0.0405857 -0.0195721 -0.0591023 -0.0752546 0.4083665 0.9050036 0.8964199 0.9985332 0.8536287 0.8963762 0.3344285 0.6730096 0.4083665 1.0000000 -0.0258105 0.8699817 -0.0572908 0.6915627 0.8495735 0.2880637
reanalysis_tdtr_k -0.0676227 0.3244377 -0.1010152 -0.0338958 -0.0508752 0.0329361 0.0526967 -0.0937647 0.1782327 0.2012651 -0.0329880 0.3526397 -0.0496985 -0.3068675 -0.3745471 -0.0937647 -0.0258105 1.0000000 0.1390107 0.3724141 0.2825697 0.0102683 -0.2059301
station_avg_temp_c 0.1948228 -0.0963725 0.4792601 0.0655268 0.0886967 -0.0609560 -0.0340891 0.1956208 0.8808712 0.8791185 0.8688371 0.8528307 0.8412998 0.1351272 0.4264236 0.1956208 0.8699817 0.1390107 1.0000000 0.1863269 0.8652398 0.8985060 0.0308073
station_diur_temp_rng_c 0.0353035 -0.2783312 -0.1403304 0.1810871 0.1837657 0.0161918 0.0744763 -0.1576758 0.0417913 0.0563202 -0.0544207 0.1163216 -0.0224385 -0.2508274 -0.1931811 -0.1576758 -0.0572908 0.3724141 0.1863269 1.0000000 0.4756504 -0.1217538 -0.2645564
station_max_temp_c 0.1884475 -0.1722829 0.3193425 0.1118221 0.1371997 -0.0703627 -0.0088301 0.1901354 0.6987284 0.7041286 0.6902640 0.7616858 0.6271057 0.0809598 0.3419622 0.1901354 0.6915627 0.2825697 0.8652398 0.4756504 1.0000000 0.6740197 0.0060700
station_min_temp_c 0.1752482 -0.0007298 0.5166650 0.0114240 0.0182586 -0.0735414 -0.0689030 0.2235604 0.8331576 0.8274975 0.8504789 0.7712400 0.8297925 0.1985403 0.4660515 0.2235604 0.8495735 0.0102683 0.8985060 -0.1217538 0.6740197 1.0000000 0.0849552
station_precip_mm 0.0518830 0.0827184 0.2114750 -0.0753148 -0.0874670 -0.1378234 -0.1715959 0.5666605 0.1132962 0.0974891 0.2847830 0.1047280 0.1502997 0.4779837 0.4444624 0.5666605 0.2880637 -0.2059301 0.0308073 -0.2645564 0.0060700 0.0849552 1.0000000
# Reorder columns so that 'total_cases' is the first column
train_merged_iq_reordered <- train_merged_iq %>%
  select(total_cases, everything())

# Compute correlation matrix
correlation_matrix_iq <- cor(train_merged_iq_reordered[sapply(train_merged_sj_reordered, is.numeric)])


# Create correlation heatmap
corrplot(correlation_matrix_iq, method = "color", type = "upper", tl.cex = 0.5)

# Convert correlation matrix to a correlation table
correlation_table_iq <- as.data.frame(correlation_matrix_iq)

# Print formatted correlation table
kable(correlation_table_iq, "html") %>%
  kable_styling(full_width = FALSE)
total_cases year weekofyear ndvi_ne ndvi_nw ndvi_se ndvi_sw precipitation_amt_mm reanalysis_air_temp_k reanalysis_avg_temp_k reanalysis_dew_point_temp_k reanalysis_max_air_temp_k reanalysis_min_air_temp_k reanalysis_precip_amt_kg_per_m2 reanalysis_relative_humidity_percent reanalysis_sat_precip_amt_mm reanalysis_specific_humidity_g_per_kg reanalysis_tdtr_k station_avg_temp_c station_diur_temp_rng_c station_max_temp_c station_min_temp_c station_precip_mm
total_cases 1.0000000 0.1794511 -0.0118505 0.0207942 -0.0099516 -0.0406426 0.0328120 0.0906277 0.0963270 0.0795155 0.2291832 -0.0541752 0.2116788 0.1011505 0.1294858 0.0906277 0.2352177 -0.1321909 0.1339888 -0.0361342 0.0766647 0.2068168 0.0445683
year 0.1794511 1.0000000 -0.1368422 -0.0641159 0.0316934 -0.0706738 -0.0167118 0.0839134 0.0265268 0.0087092 0.3175299 -0.1568073 0.2174518 0.1624353 0.2534260 0.0839134 0.3211407 -0.2348002 0.0769614 -0.1145089 0.0224573 0.1280877 -0.0328965
weekofyear -0.0118505 -0.1368422 1.0000000 0.2539798 0.2214391 0.3232486 0.2564013 -0.1112095 0.2923895 0.3054914 -0.0987181 0.4100806 -0.0303174 -0.1498296 -0.2857707 -0.1112095 -0.0885344 0.3521710 0.0808283 0.2295054 0.1953287 -0.0945173 -0.0423159
ndvi_ne 0.0207942 -0.0641159 0.2539798 1.0000000 0.7642003 0.7693605 0.8429220 -0.0096776 0.1532940 0.1690034 -0.0330754 0.2169326 -0.0059333 -0.0829182 -0.1360848 -0.0096776 -0.0322194 0.1700031 0.1171250 0.1430750 0.1413499 -0.0078087 0.0133861
ndvi_nw -0.0099516 0.0316934 0.2214391 0.7642003 1.0000000 0.6445407 0.7653893 -0.0588619 0.1530328 0.1698186 -0.0364774 0.2068618 0.0015648 -0.0768680 -0.1353845 -0.0588619 -0.0323280 0.1689186 0.1272436 0.1988208 0.1528468 -0.0915672 -0.0153423
ndvi_se -0.0406426 -0.0706738 0.3232486 0.7693605 0.6445407 1.0000000 0.7151920 -0.0406126 0.1963635 0.2086462 -0.0632229 0.2624704 -0.0251627 -0.1222016 -0.1905314 -0.0406126 -0.0610467 0.2239094 0.1172213 0.1798671 0.1609706 -0.0520851 0.0146203
ndvi_sw 0.0328120 -0.0167118 0.2564013 0.8429220 0.7653893 0.7151920 1.0000000 -0.0184216 0.1639569 0.1768961 -0.0335247 0.2311766 -0.0028814 -0.0641212 -0.1425473 -0.0184216 -0.0300112 0.1749937 0.1186557 0.1741468 0.1733543 -0.0540292 -0.0016017
precipitation_amt_mm 0.0906277 0.0839134 -0.1112095 -0.0096776 -0.0588619 -0.0406126 -0.0184216 1.0000000 -0.0483261 -0.0537699 0.4719121 -0.2273346 0.3245030 0.3391155 0.4315369 1.0000000 0.4685600 -0.3774718 0.1164326 -0.1815484 -0.0023567 0.3155489 0.3745055
reanalysis_air_temp_k 0.0963270 0.0265268 0.2923895 0.1532940 0.1530328 0.1963635 0.1639569 -0.0483261 1.0000000 0.9733337 0.1513343 0.7512190 0.4121815 -0.0852959 -0.5478146 -0.0483261 0.1779876 0.5522957 0.5963278 0.5066823 0.6470764 0.2386999 -0.1307270
reanalysis_avg_temp_k 0.0795155 0.0087092 0.3054914 0.1690034 0.1698186 0.2086462 0.1768961 -0.0537699 0.9733337 1.0000000 0.1426519 0.7830266 0.3964063 -0.1081496 -0.5397850 -0.0537699 0.1664930 0.6016871 0.5662927 0.5054662 0.6225771 0.2101760 -0.1343537
reanalysis_dew_point_temp_k 0.2291832 0.3175299 -0.0987181 -0.0330754 -0.0364774 -0.0632229 -0.0335247 0.4719121 0.1513343 0.1426519 1.0000000 -0.2541932 0.7549847 0.5709717 0.7417486 0.4719121 0.9977770 -0.6078306 0.3345682 -0.2457371 0.0932149 0.6175029 0.1871635
reanalysis_max_air_temp_k -0.0541752 -0.1568073 0.4100806 0.2169326 0.2068618 0.2624704 0.2311766 -0.2273346 0.7512190 0.7830266 -0.2541932 1.0000000 -0.0494339 -0.2582632 -0.7265173 -0.2273346 -0.2363375 0.8003587 0.3659290 0.5859265 0.5853963 -0.0978262 -0.1977715
reanalysis_min_air_temp_k 0.2116788 0.2174518 -0.0303174 -0.0059333 0.0015648 -0.0251627 -0.0028814 0.3245030 0.4121815 0.3964063 0.7549847 -0.0494339 1.0000000 0.3990866 0.3644196 0.3245030 0.7587888 -0.4033810 0.4083750 -0.0512174 0.2230738 0.5909327 0.1007235
reanalysis_precip_amt_kg_per_m2 0.1011505 0.1624353 -0.1498296 -0.0829182 -0.0768680 -0.1222016 -0.0641212 0.3391155 -0.0852959 -0.1081496 0.5709717 -0.2582632 0.3990866 1.0000000 0.5509296 0.3391155 0.5771415 -0.5389011 0.0583611 -0.2125206 -0.0515601 0.2545844 0.1548380
reanalysis_relative_humidity_percent 0.1294858 0.2534260 -0.2857707 -0.1360848 -0.1353845 -0.1905314 -0.1425473 0.4315369 -0.5478146 -0.5397850 0.7417486 -0.7265173 0.3644196 0.5509296 1.0000000 0.4315369 0.7228807 -0.8959841 -0.1186443 -0.5502336 -0.3568823 0.3620996 0.2485204
reanalysis_sat_precip_amt_mm 0.0906277 0.0839134 -0.1112095 -0.0096776 -0.0588619 -0.0406126 -0.0184216 1.0000000 -0.0483261 -0.0537699 0.4719121 -0.2273346 0.3245030 0.3391155 0.4315369 1.0000000 0.4685600 -0.3774718 0.1164326 -0.1815484 -0.0023567 0.3155489 0.3745055
reanalysis_specific_humidity_g_per_kg 0.2352177 0.3211407 -0.0885344 -0.0322194 -0.0323280 -0.0610467 -0.0300112 0.4685600 0.1779876 0.1664930 0.9977770 -0.2363375 0.7587888 0.5771415 0.7228807 0.4685600 1.0000000 -0.5937305 0.3506903 -0.2312012 0.1104195 0.6149441 0.1793954
reanalysis_tdtr_k -0.1321909 -0.2348002 0.3521710 0.1700031 0.1689186 0.2239094 0.1749937 -0.3774718 0.5522957 0.6016871 -0.6078306 0.8003587 -0.4033810 -0.5389011 -0.8959841 -0.3774718 -0.5937305 1.0000000 0.1517012 0.5570897 0.3692137 -0.3428757 -0.2537170
station_avg_temp_c 0.1339888 0.0769614 0.0808283 0.1171250 0.1272436 0.1172213 0.1186557 0.1164326 0.5963278 0.5662927 0.3345682 0.3659290 0.4083750 0.0583611 -0.1186443 0.1164326 0.3506903 0.1517012 1.0000000 0.5022154 0.6511249 0.4536592 -0.0512563
station_diur_temp_rng_c -0.0361342 -0.1145089 0.2295054 0.1430750 0.1988208 0.1798671 0.1741468 -0.1815484 0.5066823 0.5054662 -0.2457371 0.5859265 -0.0512174 -0.2125206 -0.5502336 -0.1815484 -0.2312012 0.5570897 0.5022154 1.0000000 0.6688533 -0.2535025 -0.2638489
station_max_temp_c 0.0766647 0.0224573 0.1953287 0.1413499 0.1528468 0.1609706 0.1733543 -0.0023567 0.6470764 0.6225771 0.0932149 0.5853963 0.2230738 -0.0515601 -0.3568823 -0.0023567 0.1104195 0.3692137 0.6511249 0.6688533 1.0000000 0.1248263 -0.1402289
station_min_temp_c 0.2068168 0.1280877 -0.0945173 -0.0078087 -0.0915672 -0.0520851 -0.0540292 0.3155489 0.2386999 0.2101760 0.6175029 -0.0978262 0.5909327 0.2545844 0.3620996 0.3155489 0.6149441 -0.3428757 0.4536592 -0.2535025 0.1248263 1.0000000 0.1941040
station_precip_mm 0.0445683 -0.0328965 -0.0423159 0.0133861 -0.0153423 0.0146203 -0.0016017 0.3745055 -0.1307270 -0.1343537 0.1871635 -0.1977715 0.1007235 0.1548380 0.2485204 0.3745055 0.1793954 -0.2537170 -0.0512563 -0.2638489 -0.1402289 0.1941040 1.0000000

San Juan

Split Dataset

# total observations are 936 
# proceeding with 80:20 train test split
train_sj <- train_merged_sj[1:749, ]
test_sj <- train_merged_sj[750:936, ]

train_ts_sj <- ts(train_sj, start = c(1990, 04), frequency = 52)
test_ts_sj <- ts(test_sj, start = c(2004, 09), frequency = 52)

train_full_ts_sj <- ts(train_merged_sj, start = c(1990, 4), frequency = 52)

ARIMA Model

arima_model <- auto.arima(train_ts_sj[,25])
summary(arima_model)
## Series: train_ts_sj[, 25] 
## ARIMA(2,1,1) 
## 
## Coefficients:
##           ar1     ar2     ma1
##       -0.6327  0.2073  0.8079
## s.e.   0.1221  0.0359  0.1214
## 
## sigma^2 = 191.9:  log likelihood = -3026.1
## AIC=6060.2   AICc=6060.25   BIC=6078.67
## 
## Training set error measures:
##                       ME     RMSE      MAE MPE MAPE      MASE          ACF1
## Training set 0.009219299 13.81737 8.366687 NaN  Inf 0.2101244 -3.805921e-05
arima_forecast <- forecast(arima_model, h = 187)
autoplot(arima_forecast)

accuracy(arima_forecast, test_ts_sj[, "total_cases"])
##                        ME     RMSE       MAE  MPE MAPE      MASE          ACF1
## Training set  0.009219299 13.81737  8.366687  NaN  Inf 0.2101244 -3.805921e-05
## Test set     12.981694179 34.94797 19.151819 -Inf  Inf 0.4809865  9.339254e-01
##              Theil's U
## Training set        NA
## Test set             0

Dynamic ARIMA Model

# extract features for sj where correlation with total_cases > 0.2
xreg_train_sj <- as.matrix(train_sj[, c("reanalysis_dew_point_temp_k", "reanalysis_specific_humidity_g_per_kg")])

dynamic_arima <- auto.arima(train_ts_sj[, "total_cases"], xreg = xreg_train_sj)
summary(dynamic_arima)
## Series: train_ts_sj[, "total_cases"] 
## Regression with ARIMA(1,1,2) errors 
## 
## Coefficients:
##           ar1     ma1     ma2  reanalysis_dew_point_temp_k
##       -0.5954  0.7647  0.1906                     -11.0981
## s.e.   0.5368  0.5231  0.0429                       6.1746
##       reanalysis_specific_humidity_g_per_kg
##                                     12.7064
## s.e.                                 6.4982
## 
## sigma^2 = 191.3:  log likelihood = -3023.77
## AIC=6059.54   AICc=6059.66   BIC=6087.25
## 
## Training set error measures:
##                       ME     RMSE      MAE  MPE MAPE      MASE        ACF1
## Training set 0.004909138 13.77507 8.412971 -Inf  Inf 0.2112867 0.002459711
xreg_test_sj <- as.matrix(test_sj[, c("reanalysis_dew_point_temp_k", "reanalysis_specific_humidity_g_per_kg")])

dynamic_arima_fc <- forecast(dynamic_arima, xreg = xreg_test_sj, h = 187)
autoplot(dynamic_arima_fc)

accuracy(dynamic_arima_fc, test_ts_sj[, "total_cases"])
##                        ME     RMSE       MAE  MPE MAPE      MASE        ACF1
## Training set  0.004909138 13.77507  8.412971 -Inf  Inf 0.2112867 0.002459711
## Test set     18.076811151 36.98968 19.854713 -Inf  Inf 0.4986392 0.931029114
##              Theil's U
## Training set        NA
## Test set             0

Dynamic Neural Network Model

nn_model_sj <- nnetar(train_ts_sj[, "total_cases"], xreg = xreg_train_sj)
summary(nn_model_sj)
##           Length Class        Mode     
## x          749   ts           numeric  
## m            1   -none-       numeric  
## p            1   -none-       numeric  
## P            1   -none-       numeric  
## scalex       2   -none-       list     
## scalexreg    2   -none-       list     
## size         1   -none-       numeric  
## xreg      1498   -none-       numeric  
## subset     749   -none-       numeric  
## model       20   nnetarmodels list     
## nnetargs     0   -none-       list     
## fitted     749   ts           numeric  
## residuals  749   ts           numeric  
## lags        15   -none-       numeric  
## series       1   -none-       character
## method       1   -none-       character
## call         3   -none-       call
nn_fc_sj <- forecast(nn_model_sj, xreg = xreg_test_sj, h = 187)
autoplot(nn_fc_sj)

accuracy(nn_fc_sj, test_ts_sj[, "total_cases"])
##                        ME     RMSE       MAE  MPE MAPE      MASE        ACF1
## Training set  -0.01612171  5.80064  4.348197 -Inf  Inf 0.1092024 -0.02628943
## Test set     -11.47230403 52.01948 40.061283 -Inf  Inf 1.0061152  0.96849465
##              Theil's U
## Training set        NA
## Test set             0
# visual comparison
autoplot(train_ts_sj[, "total_cases"]) +
  autolayer(arima_forecast,
    series="ARIMA", PI=FALSE) +
  autolayer(dynamic_arima_fc,
    series="Dynamic ARIMA", PI=FALSE) +
  autolayer(nn_fc_sj,
    series="NN", PI=FALSE) +
  autolayer(test_ts_sj[, "total_cases"],
    series = "Actual") +
  ggtitle("Comparison between Forecasts for San Jose") +
  xlab("Year") + ylab("Total Cases") +
  guides(colour=guide_legend(title="Forecast"))

Iquitos

Split Dataset

train_iq <- train_merged_iq[1:416, ]
test_iq <- train_merged_iq[417:520,]

train_ts_iq <- ts(train_iq, start = c(2000, 07), frequency = 52)
test_ts_iq <- ts(test_iq, start = c(2008, 07), frequency = 52)

train_full_ts_iq <- ts(train_merged_iq, start = c(2000, 7), frequency = 52)

ARIMA Model

arima_mod_iq <- auto.arima(train_ts_iq[,"total_cases"])

summary(arima_mod_iq)
## Series: train_ts_iq[, "total_cases"] 
## ARIMA(2,1,1) 
## 
## Coefficients:
##          ar1      ar2      ma1
##       0.0042  -0.3261  -0.2806
## s.e.  0.1195   0.0555   0.1266
## 
## sigma^2 = 52.81:  log likelihood = -1410.62
## AIC=2829.25   AICc=2829.35   BIC=2845.36
## 
## Training set error measures:
##                       ME     RMSE      MAE MPE MAPE      MASE        ACF1
## Training set 0.002798189 7.232062 3.765364 NaN  Inf 0.4096212 -0.00214506
arima_fc_iq <- forecast(arima_mod_iq, h = 104)

autoplot(arima_fc_iq)

accuracy(arima_fc_iq, test_ts_iq[,"total_cases"])
##                       ME      RMSE      MAE  MPE MAPE      MASE        ACF1
## Training set 0.002798189  7.232062 3.765364  NaN  Inf 0.4096212 -0.00214506
## Test set     8.885144036 14.469655 8.986599 -Inf  Inf 0.9776217  0.84254559
##              Theil's U
## Training set        NA
## Test set             0

Dynamic ARIMA Model

# extract top three features for iq 
xreg_train_iq <- as.matrix(train_iq[, c("reanalysis_dew_point_temp_k", "reanalysis_min_air_temp_k", "reanalysis_specific_humidity_g_per_kg")])

dynamic_arima_mod_iq <- auto.arima(train_ts_iq[, "total_cases"], xreg = xreg_train_iq)

summary(dynamic_arima_mod_iq)
## Series: train_ts_iq[, "total_cases"] 
## Regression with ARIMA(3,0,0) errors 
## 
## Coefficients:
##          ar1      ar2     ar3  reanalysis_dew_point_temp_k
##       0.6914  -0.1772  0.2935                       -0.572
## s.e.  0.0469   0.0573  0.0468                        0.269
##       reanalysis_min_air_temp_k  reanalysis_specific_humidity_g_per_kg
##                          0.6130                                -0.2131
## s.e.                     0.2666                                 0.3423
## 
## sigma^2 = 49.73:  log likelihood = -1400.32
## AIC=2814.64   AICc=2814.91   BIC=2842.85
## 
## Training set error measures:
##                     ME     RMSE      MAE MPE MAPE      MASE        ACF1
## Training set 0.0304088 7.000685 3.923278 NaN  Inf 0.4268001 -0.01360866
xreg_test_iq <- as.matrix(test_iq[, c("reanalysis_dew_point_temp_k", "reanalysis_min_air_temp_k", "reanalysis_specific_humidity_g_per_kg")])

dynamic_arima_fc_iq <- forecast(dynamic_arima_mod_iq, xreg = xreg_test_iq, h = 104)

autoplot(dynamic_arima_fc_iq)

accuracy(dynamic_arima_fc_iq, test_ts_iq[,"total_cases"])
##                     ME      RMSE      MAE  MPE MAPE      MASE        ACF1
## Training set 0.0304088  7.000685 3.923278  NaN  Inf 0.4268001 -0.01360866
## Test set     3.1284068 11.929512 7.146473 -Inf  Inf 0.7774406  0.84296808
##              Theil's U
## Training set        NA
## Test set             0

Dynamic Neural Network Model

nn_model_iq <- nnetar(train_ts_iq[, "total_cases"], xreg = xreg_train_iq)

summary(nn_model_iq)
##           Length Class        Mode     
## x          416   ts           numeric  
## m            1   -none-       numeric  
## p            1   -none-       numeric  
## P            1   -none-       numeric  
## scalex       2   -none-       list     
## scalexreg    2   -none-       list     
## size         1   -none-       numeric  
## xreg      1248   -none-       numeric  
## subset     416   -none-       numeric  
## model       20   nnetarmodels list     
## nnetargs     0   -none-       list     
## fitted     416   ts           numeric  
## residuals  416   ts           numeric  
## lags         4   -none-       numeric  
## series       1   -none-       character
## method       1   -none-       character
## call         3   -none-       call
nn_fc_iq <- forecast(nn_model_iq, xreg = xreg_test_iq, h = 104)

autoplot(nn_fc_iq)

accuracy(nn_fc_iq, test_ts_iq[, "total_cases"])
##                       ME      RMSE      MAE  MPE MAPE      MASE         ACF1
## Training set -0.03591436  4.879157 3.046150 -Inf  Inf 0.3313804 -0.006041115
## Test set      2.98946011 12.423230 7.219992 -Inf  Inf 0.7854385  0.858253602
##              Theil's U
## Training set        NA
## Test set             0
autoplot(train_ts_iq[, "total_cases"]) +
  autolayer(arima_fc_iq,
    series="ARIMA", PI=FALSE) +
  autolayer(dynamic_arima_fc_iq,
    series="Dynamic ARIMA", PI=FALSE) +
  autolayer(nn_fc_iq,
    series="NN", PI=FALSE) +
  autolayer(test_ts_iq[, "total_cases"],
    series = "Actual") +
  ggtitle("Comparison between Forecasts for Iquitos") +
  xlab("Year") + ylab("Total Cases") +
  guides(colour=guide_legend(title="Forecast"))

Predictions

# handling missing data in test features using interpolation
for (col in names(test_features)) {
  # Check if the column has missing values
  if (any(is.na(test_features[[col]]))) {
    # Apply linear interpolation to the column and overwrite the original dataset
    test_features[[col]] <- na.approx(test_features[[col]])
  }
}
skim(test_features)
Data summary
Name test_features
Number of rows 416
Number of columns 24
_______________________
Column type frequency:
character 1
Date 1
numeric 22
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
city 0 1 2 2 0 2 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
week_start_date 0 1 2008-04-29 2013-06-25 2011-05-28 269

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
year 0 1 2010.77 1.43 2008.00 2010.00 2011.00 2012.00 2013.00 ▇▆▇▇▃
weekofyear 0 1 26.44 14.98 1.00 13.75 26.00 39.00 53.00 ▇▇▇▇▇
ndvi_ne 0 1 0.12 0.16 -0.46 0.00 0.09 0.25 0.50 ▁▂▇▆▂
ndvi_nw 0 1 0.12 0.14 -0.21 0.01 0.09 0.24 0.65 ▁▇▅▂▁
ndvi_se 0 1 0.21 0.08 0.01 0.15 0.20 0.25 0.45 ▁▇▇▃▁
ndvi_sw 0 1 0.20 0.09 -0.01 0.13 0.19 0.25 0.53 ▂▇▅▂▁
precipitation_amt_mm 0 1 38.17 35.19 0.00 8.04 31.12 57.72 169.34 ▇▅▂▁▁
reanalysis_air_temp_k 0 1 298.82 1.47 294.55 297.75 298.55 300.24 301.94 ▁▃▇▅▃
reanalysis_avg_temp_k 0 1 299.35 1.30 295.24 298.32 299.33 300.52 303.33 ▁▆▇▇▁
reanalysis_dew_point_temp_k 0 1 295.41 1.53 290.82 294.33 295.82 296.64 297.79 ▁▃▅▆▇
reanalysis_max_air_temp_k 0 1 303.61 3.10 298.20 301.40 302.70 305.80 314.10 ▅▇▃▂▁
reanalysis_min_air_temp_k 0 1 295.74 2.76 286.20 293.50 296.30 298.22 299.70 ▁▂▅▆▇
reanalysis_precip_amt_kg_per_m2 0 1 42.02 48.84 0.00 9.49 25.77 56.23 301.40 ▇▂▁▁▁
reanalysis_relative_humidity_percent 0 1 82.47 7.38 64.92 77.33 80.32 87.98 97.98 ▁▇▇▃▅
reanalysis_sat_precip_amt_mm 0 1 38.17 35.19 0.00 8.04 31.12 57.72 169.34 ▇▅▂▁▁
reanalysis_specific_humidity_g_per_kg 0 1 16.92 1.56 12.54 15.79 17.32 18.17 19.60 ▁▃▅▇▆
reanalysis_tdtr_k 0 1 5.11 3.54 1.49 2.45 2.91 8.17 14.49 ▇▁▂▂▁
station_avg_temp_c 0 1 27.36 1.22 24.16 26.53 27.44 28.28 30.27 ▂▅▇▇▂
station_diur_temp_rng_c 0 1 7.86 2.45 4.04 5.94 6.67 9.88 14.72 ▇▇▃▅▁
station_max_temp_c 0 1 32.53 1.93 27.20 31.10 32.80 33.90 38.40 ▂▃▇▃▁
station_min_temp_c 0 1 22.34 1.73 14.20 21.20 22.20 23.30 26.70 ▁▁▇▇▃
station_precip_mm 0 1 33.94 34.59 0.00 8.97 23.30 47.28 212.00 ▇▂▁▁▁
# split dataset by city
test_features_sj <- test_features %>% 
  filter(city == "sj")

test_features_iq <- test_features %>%
  filter(city == "iq")
# extract external regressors for both cities
xreg_test_sj_features <- as.matrix(test_features_sj[, c("reanalysis_dew_point_temp_k", "reanalysis_specific_humidity_g_per_kg")])

xreg_test_iq_features <- as.matrix(test_features_iq[, c("reanalysis_dew_point_temp_k", "reanalysis_min_air_temp_k", "reanalysis_specific_humidity_g_per_kg")])
# forecast for sj using nn model
xreg_train_sj <- as.matrix(train_merged_sj[, c("reanalysis_dew_point_temp_k", "reanalysis_specific_humidity_g_per_kg")])

sj_model <- nnetar(train_full_ts_sj[, "total_cases"], xreg = xreg_train_sj)

test_sj_predictions <- forecast(sj_model, xreg = xreg_test_sj_features, h = nrow(test_features_sj))
# forecast for iq using nn model
xreg_train_iq <- as.matrix(train_merged_iq[, c("reanalysis_dew_point_temp_k", "reanalysis_min_air_temp_k", "reanalysis_specific_humidity_g_per_kg")])

iq_model <- nnetar(train_full_ts_iq[, "total_cases"], xreg = xreg_train_iq)

test_iq_predictions <- forecast(nn_model_iq, xreg = xreg_test_iq_features, h = nrow(test_features_iq))
# visual predictions for sj
autoplot(test_sj_predictions)

# visualize predictions for iq
autoplot(test_iq_predictions)

# extract predictions and place into submission dataset
sj_predictions <- round(test_sj_predictions$mean)

predictions$total_cases[1:260] <- sj_predictions[1:260]
# extract predictions and place into submission dataset
iq_predictions <- round(test_iq_predictions$mean)

predictions$total_cases[261:416] <- iq_predictions[1:156]
# convert dataset into csv file
write.csv(predictions, "predictions2.csv", row.names = FALSE)