# load packages
library(readr)
library(ISOweek)
library(skimr)
library(stringr)
library(zoo)
library(forecast)
library(tidyr)
library(tidyverse)
library(fable)
library(fabletools)
library(dplyr)
library(ggplot2)
library(ggcorrplot)
library(corrplot)
library(kableExtra)
library(tsibbledata)
library(fpp3)

# import datasets 
train_features <- read_csv("/Users/laykhoonyu/Documents/MSAA_AE/predictive_analytics/DengAI/dengue_features_train.csv")
train_labels <- read_csv("/Users/laykhoonyu/Documents/MSAA_AE/predictive_analytics/DengAI/dengue_labels_train.csv")
test_features <- read_csv("/Users/laykhoonyu/Documents/MSAA_AE/predictive_analytics/DengAI/dengue_features_test.csv")
predictions <- read_csv("/Users/laykhoonyu/Documents/MSAA_AE/predictive_analytics/DengAI/submission_format.csv")

head(train_features)

## # A tibble: 6 × 24
##   city   year weekofyear week_start_date ndvi_ne ndvi_nw ndvi_se ndvi_sw
##   <chr> <dbl>      <dbl> <date>            <dbl>   <dbl>   <dbl>   <dbl>
## 1 sj     1990         18 1990-04-30       0.123    0.104   0.198   0.178
## 2 sj     1990         19 1990-05-07       0.170    0.142   0.162   0.155
## 3 sj     1990         20 1990-05-14       0.0322   0.173   0.157   0.171
## 4 sj     1990         21 1990-05-21       0.129    0.245   0.228   0.236
## 5 sj     1990         22 1990-05-28       0.196    0.262   0.251   0.247
## 6 sj     1990         23 1990-06-04      NA        0.175   0.254   0.182
## # ℹ 16 more variables: precipitation_amt_mm <dbl>, reanalysis_air_temp_k <dbl>,
## #   reanalysis_avg_temp_k <dbl>, reanalysis_dew_point_temp_k <dbl>,
## #   reanalysis_max_air_temp_k <dbl>, reanalysis_min_air_temp_k <dbl>,
## #   reanalysis_precip_amt_kg_per_m2 <dbl>,
## #   reanalysis_relative_humidity_percent <dbl>,
## #   reanalysis_sat_precip_amt_mm <dbl>,
## #   reanalysis_specific_humidity_g_per_kg <dbl>, reanalysis_tdtr_k <dbl>, …

head(train_labels)

## # A tibble: 6 × 4
##   city   year weekofyear total_cases
##   <chr> <dbl>      <dbl>       <dbl>
## 1 sj     1990         18           4
## 2 sj     1990         19           5
## 3 sj     1990         20           4
## 4 sj     1990         21           3
## 5 sj     1990         22           6
## 6 sj     1990         23           2

skim(train_features)

Data summary
Name	train_features
Number of rows	1456
Number of columns	24
_______________________
Column type frequency:
character	1
Date	1
numeric	22
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
city	0	1	2	2	0	2	0

Variable type: Date

skim_variable	n_missing	complete_rate	min	max	median	n_unique
week_start_date	0	1	1990-04-30	2010-06-25	2002-05-28	1049

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
year	0	1.00	2001.03	5.41	1990.00	1997.00	2002.00	2005.00	2010.00	▅▃▆▇▅
weekofyear	0	1.00	26.50	15.02	1.00	13.75	26.50	39.25	53.00	▇▇▇▇▇
ndvi_ne	194	0.87	0.14	0.14	-0.41	0.04	0.13	0.25	0.51	▁▂▇▇▂
ndvi_nw	52	0.96	0.13	0.12	-0.46	0.05	0.12	0.22	0.45	▁▁▆▇▂
ndvi_se	22	0.98	0.20	0.07	-0.02	0.16	0.20	0.25	0.54	▁▇▆▁▁
ndvi_sw	22	0.98	0.20	0.08	-0.06	0.14	0.19	0.25	0.55	▁▇▇▂▁
precipitation_amt_mm	13	0.99	45.76	43.72	0.00	9.80	38.34	70.24	390.60	▇▂▁▁▁
reanalysis_air_temp_k	10	0.99	298.70	1.36	294.64	297.66	298.65	299.83	302.20	▁▅▇▇▂
reanalysis_avg_temp_k	10	0.99	299.23	1.26	294.89	298.26	299.29	300.21	302.93	▁▃▇▇▁
reanalysis_dew_point_temp_k	10	0.99	295.25	1.53	289.64	294.12	295.64	296.46	298.45	▁▂▅▇▃
reanalysis_max_air_temp_k	10	0.99	303.43	3.23	297.80	301.00	302.40	305.50	314.00	▅▇▃▂▁
reanalysis_min_air_temp_k	10	0.99	295.72	2.57	286.90	293.90	296.20	297.90	299.90	▁▂▆▆▇
reanalysis_precip_amt_kg_per_m2	10	0.99	40.15	43.43	0.00	13.05	27.24	52.20	570.50	▇▁▁▁▁
reanalysis_relative_humidity_percent	10	0.99	82.16	7.15	57.79	77.18	80.30	86.36	98.61	▁▁▇▃▃
reanalysis_sat_precip_amt_mm	13	0.99	45.76	43.72	0.00	9.80	38.34	70.24	390.60	▇▂▁▁▁
reanalysis_specific_humidity_g_per_kg	10	0.99	16.75	1.54	11.72	15.56	17.09	17.98	20.46	▁▃▅▇▁
reanalysis_tdtr_k	10	0.99	4.90	3.55	1.36	2.33	2.86	7.62	16.03	▇▁▂▁▁
station_avg_temp_c	43	0.97	27.19	1.29	21.40	26.30	27.41	28.16	30.80	▁▁▅▇▁
station_diur_temp_rng_c	43	0.97	8.06	2.13	4.53	6.51	7.30	9.57	15.80	▇▇▃▂▁
station_max_temp_c	20	0.99	32.45	1.96	26.70	31.10	32.80	33.90	42.20	▂▇▇▁▁
station_min_temp_c	14	0.99	22.10	1.57	14.70	21.10	22.20	23.30	25.60	▁▁▅▇▃
station_precip_mm	22	0.98	39.33	47.46	0.00	8.70	23.85	53.90	543.30	▇▁▁▁▁

Data Cleaning

train_labels <- train_labels %>% mutate(beginning = ymd(str_c(year, "-01-01")),
           final_date = beginning + weeks(weekofyear))

train_labels %>%
  ggplot(aes(x = final_date, y = total_cases, color = city)) +
  geom_line() +
  labs(title = "Total Cases by City",
       x = "Date",
       y = "Value",
       color = "Region") +
  theme_minimal()

for (col in names(train_features)) {
  # Check if the column has missing values
  if (any(is.na(train_features[[col]]))) {
    # Apply linear interpolation to the column and overwrite the original dataset
    train_features[[col]] <- na.approx(train_features[[col]])
  }
}

train_merged <- train_features %>%
  left_join(select(train_labels, year, weekofyear, city, total_cases), by = c("year", "weekofyear", "city"))

skim(train_merged)

Data summary
Name	train_merged
Number of rows	1456
Number of columns	25
_______________________
Column type frequency:
character	1
Date	1
numeric	23
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
city	0	1	2	2	0	2	0

Variable type: Date

skim_variable	n_missing	complete_rate	min	max	median	n_unique
week_start_date	0	1	1990-04-30	2010-06-25	2002-05-28	1049

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
year	1	2001.03	5.41	1990.00	1997.00	2002.00	2005.00	2010.00	▅▃▆▇▅
weekofyear	1	26.50	15.02	1.00	13.75	26.50	39.25	53.00	▇▇▇▇▇
ndvi_ne	1	0.13	0.14	-0.41	0.04	0.12	0.23	0.51	▁▂▇▆▂
ndvi_nw	1	0.13	0.12	-0.46	0.05	0.12	0.21	0.45	▁▁▇▇▂
ndvi_se	1	0.20	0.07	-0.02	0.15	0.20	0.25	0.54	▁▇▆▁▁
ndvi_sw	1	0.20	0.08	-0.06	0.14	0.19	0.25	0.55	▁▇▇▂▁
precipitation_amt_mm	1	45.70	43.65	0.00	9.79	38.32	70.23	390.60	▇▂▁▁▁
reanalysis_air_temp_k	1	298.70	1.36	294.64	297.66	298.64	299.83	302.20	▁▅▇▇▂
reanalysis_avg_temp_k	1	299.22	1.26	294.89	298.26	299.29	300.21	302.93	▁▃▇▇▁
reanalysis_dew_point_temp_k	1	295.24	1.53	289.64	294.12	295.64	296.46	298.45	▁▂▅▇▃
reanalysis_max_air_temp_k	1	303.42	3.23	297.80	301.00	302.40	305.50	314.00	▅▇▃▂▁
reanalysis_min_air_temp_k	1	295.72	2.56	286.90	293.90	296.20	297.90	299.90	▁▂▆▆▇
reanalysis_precip_amt_kg_per_m2	1	40.13	43.31	0.00	13.20	27.30	52.20	570.50	▇▁▁▁▁
reanalysis_relative_humidity_percent	1	82.17	7.15	57.79	77.20	80.29	86.44	98.61	▁▁▇▃▃
reanalysis_sat_precip_amt_mm	1	45.70	43.65	0.00	9.79	38.32	70.23	390.60	▇▂▁▁▁
reanalysis_specific_humidity_g_per_kg	1	16.74	1.54	11.72	15.55	17.08	17.98	20.46	▁▃▅▇▁
reanalysis_tdtr_k	1	4.90	3.54	1.36	2.33	2.86	7.63	16.03	▇▁▂▁▁
station_avg_temp_c	1	27.18	1.28	21.40	26.31	27.40	28.13	30.80	▁▁▅▇▁
station_diur_temp_rng_c	1	8.10	2.13	4.53	6.53	7.35	9.60	15.80	▇▇▃▂▁
station_max_temp_c	1	32.45	1.96	26.70	31.10	32.80	33.90	42.20	▂▇▇▁▁
station_min_temp_c	1	22.10	1.57	14.70	21.10	22.20	23.30	25.60	▁▁▅▇▃
station_precip_mm	1	39.36	47.29	0.00	8.85	24.05	53.90	543.30	▇▁▁▁▁
total_cases	1	24.68	43.60	0.00	5.00	12.00	28.00	461.00	▇▁▁▁▁

# subset train_merged by city
train_merged_sj <- train_merged %>% 
  filter(city == "sj")

train_merged_iq <- train_merged %>%
  filter(city == "iq")

Feature Selection

# Reorder columns so that 'total_cases' is the first column
train_merged_sj_reordered <- train_merged_sj %>%
  select(total_cases, everything())

# Compute correlation matrix
correlation_matrix_sj <- cor(train_merged_sj_reordered[sapply(train_merged_sj_reordered, is.numeric)])

# Create correlation heatmap
corrplot(correlation_matrix_sj, method = "color", type = "upper", tl.cex = 0.5)

# Convert correlation matrix to a correlation table
correlation_table_sj <- as.data.frame(correlation_matrix_sj)

# Print formatted correlation table
kable(correlation_table_sj, "html") %>%
  kable_styling(full_width = FALSE)

	total_cases	year	weekofyear	ndvi_ne	ndvi_nw	ndvi_se	ndvi_sw	precipitation_amt_mm	reanalysis_air_temp_k	reanalysis_avg_temp_k	reanalysis_dew_point_temp_k	reanalysis_max_air_temp_k	reanalysis_min_air_temp_k	reanalysis_precip_amt_kg_per_m2	reanalysis_relative_humidity_percent	reanalysis_sat_precip_amt_mm	reanalysis_specific_humidity_g_per_kg	reanalysis_tdtr_k	station_avg_temp_c	station_diur_temp_rng_c	station_max_temp_c	station_min_temp_c	station_precip_mm
total_cases	1.0000000	-0.2126898	0.2871342	0.0859184	0.0474027	-0.0436799	0.0426704	0.0602955	0.1803106	0.1736703	0.2028068	0.1932326	0.1865386	0.1069389	0.1444042	0.0602955	0.2069423	-0.0676227	0.1948228	0.0353035	0.1884475	0.1752482	0.0518830
year	-0.2126898	1.0000000	-0.0731425	-0.4126646	-0.4945010	-0.0066941	-0.0822972	0.0333330	0.1869462	0.1907631	0.0166096	0.1781211	0.1068788	-0.1321410	-0.2862127	0.0333330	0.0192497	0.3244377	-0.0963725	-0.2783312	-0.1722829	-0.0007298	0.0827184
weekofyear	0.2871342	-0.0731425	1.0000000	-0.0308390	-0.0373903	-0.0191420	-0.0715881	0.2317451	0.5676632	0.5536197	0.5721412	0.5112365	0.5689748	0.2522260	0.3059991	0.2317451	0.5791925	-0.1010152	0.4792601	-0.1403304	0.3193425	0.5166650	0.2114750
ndvi_ne	0.0859184	-0.4126646	-0.0308390	1.0000000	0.6326599	0.2193530	0.1810355	-0.0476045	-0.0814177	-0.0800987	-0.0446575	-0.0546411	-0.0904151	0.0120165	0.0422167	-0.0476045	-0.0405857	-0.0338958	0.0655268	0.1810871	0.1118221	0.0114240	-0.0753148
ndvi_nw	0.0474027	-0.4945010	-0.0373903	0.6326599	1.0000000	0.1973110	0.2177400	-0.0397861	-0.0755900	-0.0742216	-0.0248703	-0.0428293	-0.0743365	0.0057100	0.0762943	-0.0397861	-0.0195721	-0.0508752	0.0886967	0.1837657	0.1371997	0.0182586	-0.0874670
ndvi_se	-0.0436799	-0.0066941	-0.0191420	0.2193530	0.1973110	1.0000000	0.8146215	-0.1180438	-0.0152739	-0.0128028	-0.0636048	-0.0094001	-0.0468787	-0.1293738	-0.1143034	-0.1180438	-0.0591023	0.0329361	-0.0609560	0.0161918	-0.0703627	-0.0735414	-0.1378234
ndvi_sw	0.0426704	-0.0822972	-0.0715881	0.1810355	0.2177400	0.8146215	1.0000000	-0.1164566	-0.0378028	-0.0307194	-0.0822551	-0.0086754	-0.0672088	-0.1230979	-0.1153108	-0.1164566	-0.0752546	0.0526967	-0.0340891	0.0744763	-0.0088301	-0.0689030	-0.1715959
precipitation_amt_mm	0.0602955	0.0333330	0.2317451	-0.0476045	-0.0397861	-0.1180438	-0.1164566	1.0000000	0.2327357	0.2210826	0.4009756	0.2560687	0.2446586	0.5082735	0.4993110	1.0000000	0.4083665	-0.0937647	0.1956208	-0.1576758	0.1901354	0.2235604	0.5666605
reanalysis_air_temp_k	0.1803106	0.1869462	0.5676632	-0.0814177	-0.0755900	-0.0152739	-0.0378028	0.2327357	1.0000000	0.9975070	0.9034811	0.9353394	0.9422484	0.0804000	0.2979414	0.2327357	0.9050036	0.1782327	0.8808712	0.0417913	0.6987284	0.8331576	0.1132962
reanalysis_avg_temp_k	0.1736703	0.1907631	0.5536197	-0.0800987	-0.0742216	-0.0128028	-0.0307194	0.2210826	0.9975070	1.0000000	0.8953727	0.9392017	0.9391272	0.0626353	0.2840727	0.2210826	0.8964199	0.2012651	0.8791185	0.0563202	0.7041286	0.8274975	0.0974891
reanalysis_dew_point_temp_k	0.2028068	0.0166096	0.5721412	-0.0446575	-0.0248703	-0.0636048	-0.0822551	0.4009756	0.9034811	0.8953727	1.0000000	0.8476544	0.8990078	0.3284607	0.6781165	0.4009756	0.9985332	-0.0329880	0.8688371	-0.0544207	0.6902640	0.8504789	0.2847830
reanalysis_max_air_temp_k	0.1932326	0.1781211	0.5112365	-0.0546411	-0.0428293	-0.0094001	-0.0086754	0.2560687	0.9353394	0.9392017	0.8476544	1.0000000	0.8286651	0.0916054	0.2874617	0.2560687	0.8536287	0.3526397	0.8528307	0.1163216	0.7616858	0.7712400	0.1047280
reanalysis_min_air_temp_k	0.1865386	0.1068788	0.5689748	-0.0904151	-0.0743365	-0.0468787	-0.0672088	0.2446586	0.9422484	0.9391272	0.8990078	0.8286651	1.0000000	0.1327427	0.3849484	0.2446586	0.8963762	-0.0496985	0.8412998	-0.0224385	0.6271057	0.8297925	0.1502997
reanalysis_precip_amt_kg_per_m2	0.1069389	-0.1321410	0.2522260	0.0120165	0.0057100	-0.1293738	-0.1230979	0.5082735	0.0804000	0.0626353	0.3284607	0.0916054	0.1327427	1.0000000	0.6025475	0.5082735	0.3344285	-0.3068675	0.1351272	-0.2508274	0.0809598	0.1985403	0.4779837
reanalysis_relative_humidity_percent	0.1444042	-0.2862127	0.3059991	0.0422167	0.0762943	-0.1143034	-0.1153108	0.4993110	0.2979414	0.2840727	0.6781165	0.2874617	0.3849484	0.6025475	1.0000000	0.4993110	0.6730096	-0.3745471	0.4264236	-0.1931811	0.3419622	0.4660515	0.4444624
reanalysis_sat_precip_amt_mm	0.0602955	0.0333330	0.2317451	-0.0476045	-0.0397861	-0.1180438	-0.1164566	1.0000000	0.2327357	0.2210826	0.4009756	0.2560687	0.2446586	0.5082735	0.4993110	1.0000000	0.4083665	-0.0937647	0.1956208	-0.1576758	0.1901354	0.2235604	0.5666605
reanalysis_specific_humidity_g_per_kg	0.2069423	0.0192497	0.5791925	-0.0405857	-0.0195721	-0.0591023	-0.0752546	0.4083665	0.9050036	0.8964199	0.9985332	0.8536287	0.8963762	0.3344285	0.6730096	0.4083665	1.0000000	-0.0258105	0.8699817	-0.0572908	0.6915627	0.8495735	0.2880637
reanalysis_tdtr_k	-0.0676227	0.3244377	-0.1010152	-0.0338958	-0.0508752	0.0329361	0.0526967	-0.0937647	0.1782327	0.2012651	-0.0329880	0.3526397	-0.0496985	-0.3068675	-0.3745471	-0.0937647	-0.0258105	1.0000000	0.1390107	0.3724141	0.2825697	0.0102683	-0.2059301
station_avg_temp_c	0.1948228	-0.0963725	0.4792601	0.0655268	0.0886967	-0.0609560	-0.0340891	0.1956208	0.8808712	0.8791185	0.8688371	0.8528307	0.8412998	0.1351272	0.4264236	0.1956208	0.8699817	0.1390107	1.0000000	0.1863269	0.8652398	0.8985060	0.0308073
station_diur_temp_rng_c	0.0353035	-0.2783312	-0.1403304	0.1810871	0.1837657	0.0161918	0.0744763	-0.1576758	0.0417913	0.0563202	-0.0544207	0.1163216	-0.0224385	-0.2508274	-0.1931811	-0.1576758	-0.0572908	0.3724141	0.1863269	1.0000000	0.4756504	-0.1217538	-0.2645564
station_max_temp_c	0.1884475	-0.1722829	0.3193425	0.1118221	0.1371997	-0.0703627	-0.0088301	0.1901354	0.6987284	0.7041286	0.6902640	0.7616858	0.6271057	0.0809598	0.3419622	0.1901354	0.6915627	0.2825697	0.8652398	0.4756504	1.0000000	0.6740197	0.0060700
station_min_temp_c	0.1752482	-0.0007298	0.5166650	0.0114240	0.0182586	-0.0735414	-0.0689030	0.2235604	0.8331576	0.8274975	0.8504789	0.7712400	0.8297925	0.1985403	0.4660515	0.2235604	0.8495735	0.0102683	0.8985060	-0.1217538	0.6740197	1.0000000	0.0849552
station_precip_mm	0.0518830	0.0827184	0.2114750	-0.0753148	-0.0874670	-0.1378234	-0.1715959	0.5666605	0.1132962	0.0974891	0.2847830	0.1047280	0.1502997	0.4779837	0.4444624	0.5666605	0.2880637	-0.2059301	0.0308073	-0.2645564	0.0060700	0.0849552	1.0000000

# Reorder columns so that 'total_cases' is the first column
train_merged_iq_reordered <- train_merged_iq %>%
  select(total_cases, everything())

# Compute correlation matrix
correlation_matrix_iq <- cor(train_merged_iq_reordered[sapply(train_merged_sj_reordered, is.numeric)])


# Create correlation heatmap
corrplot(correlation_matrix_iq, method = "color", type = "upper", tl.cex = 0.5)

# Convert correlation matrix to a correlation table
correlation_table_iq <- as.data.frame(correlation_matrix_iq)

# Print formatted correlation table
kable(correlation_table_iq, "html") %>%
  kable_styling(full_width = FALSE)

	total_cases	year	weekofyear	ndvi_ne	ndvi_nw	ndvi_se	ndvi_sw	precipitation_amt_mm	reanalysis_air_temp_k	reanalysis_avg_temp_k	reanalysis_dew_point_temp_k	reanalysis_max_air_temp_k	reanalysis_min_air_temp_k	reanalysis_precip_amt_kg_per_m2	reanalysis_relative_humidity_percent	reanalysis_sat_precip_amt_mm	reanalysis_specific_humidity_g_per_kg	reanalysis_tdtr_k	station_avg_temp_c	station_diur_temp_rng_c	station_max_temp_c	station_min_temp_c	station_precip_mm
total_cases	1.0000000	0.1794511	-0.0118505	0.0207942	-0.0099516	-0.0406426	0.0328120	0.0906277	0.0963270	0.0795155	0.2291832	-0.0541752	0.2116788	0.1011505	0.1294858	0.0906277	0.2352177	-0.1321909	0.1339888	-0.0361342	0.0766647	0.2068168	0.0445683
year	0.1794511	1.0000000	-0.1368422	-0.0641159	0.0316934	-0.0706738	-0.0167118	0.0839134	0.0265268	0.0087092	0.3175299	-0.1568073	0.2174518	0.1624353	0.2534260	0.0839134	0.3211407	-0.2348002	0.0769614	-0.1145089	0.0224573	0.1280877	-0.0328965
weekofyear	-0.0118505	-0.1368422	1.0000000	0.2539798	0.2214391	0.3232486	0.2564013	-0.1112095	0.2923895	0.3054914	-0.0987181	0.4100806	-0.0303174	-0.1498296	-0.2857707	-0.1112095	-0.0885344	0.3521710	0.0808283	0.2295054	0.1953287	-0.0945173	-0.0423159
ndvi_ne	0.0207942	-0.0641159	0.2539798	1.0000000	0.7642003	0.7693605	0.8429220	-0.0096776	0.1532940	0.1690034	-0.0330754	0.2169326	-0.0059333	-0.0829182	-0.1360848	-0.0096776	-0.0322194	0.1700031	0.1171250	0.1430750	0.1413499	-0.0078087	0.0133861
ndvi_nw	-0.0099516	0.0316934	0.2214391	0.7642003	1.0000000	0.6445407	0.7653893	-0.0588619	0.1530328	0.1698186	-0.0364774	0.2068618	0.0015648	-0.0768680	-0.1353845	-0.0588619	-0.0323280	0.1689186	0.1272436	0.1988208	0.1528468	-0.0915672	-0.0153423
ndvi_se	-0.0406426	-0.0706738	0.3232486	0.7693605	0.6445407	1.0000000	0.7151920	-0.0406126	0.1963635	0.2086462	-0.0632229	0.2624704	-0.0251627	-0.1222016	-0.1905314	-0.0406126	-0.0610467	0.2239094	0.1172213	0.1798671	0.1609706	-0.0520851	0.0146203
ndvi_sw	0.0328120	-0.0167118	0.2564013	0.8429220	0.7653893	0.7151920	1.0000000	-0.0184216	0.1639569	0.1768961	-0.0335247	0.2311766	-0.0028814	-0.0641212	-0.1425473	-0.0184216	-0.0300112	0.1749937	0.1186557	0.1741468	0.1733543	-0.0540292	-0.0016017
precipitation_amt_mm	0.0906277	0.0839134	-0.1112095	-0.0096776	-0.0588619	-0.0406126	-0.0184216	1.0000000	-0.0483261	-0.0537699	0.4719121	-0.2273346	0.3245030	0.3391155	0.4315369	1.0000000	0.4685600	-0.3774718	0.1164326	-0.1815484	-0.0023567	0.3155489	0.3745055
reanalysis_air_temp_k	0.0963270	0.0265268	0.2923895	0.1532940	0.1530328	0.1963635	0.1639569	-0.0483261	1.0000000	0.9733337	0.1513343	0.7512190	0.4121815	-0.0852959	-0.5478146	-0.0483261	0.1779876	0.5522957	0.5963278	0.5066823	0.6470764	0.2386999	-0.1307270
reanalysis_avg_temp_k	0.0795155	0.0087092	0.3054914	0.1690034	0.1698186	0.2086462	0.1768961	-0.0537699	0.9733337	1.0000000	0.1426519	0.7830266	0.3964063	-0.1081496	-0.5397850	-0.0537699	0.1664930	0.6016871	0.5662927	0.5054662	0.6225771	0.2101760	-0.1343537
reanalysis_dew_point_temp_k	0.2291832	0.3175299	-0.0987181	-0.0330754	-0.0364774	-0.0632229	-0.0335247	0.4719121	0.1513343	0.1426519	1.0000000	-0.2541932	0.7549847	0.5709717	0.7417486	0.4719121	0.9977770	-0.6078306	0.3345682	-0.2457371	0.0932149	0.6175029	0.1871635
reanalysis_max_air_temp_k	-0.0541752	-0.1568073	0.4100806	0.2169326	0.2068618	0.2624704	0.2311766	-0.2273346	0.7512190	0.7830266	-0.2541932	1.0000000	-0.0494339	-0.2582632	-0.7265173	-0.2273346	-0.2363375	0.8003587	0.3659290	0.5859265	0.5853963	-0.0978262	-0.1977715
reanalysis_min_air_temp_k	0.2116788	0.2174518	-0.0303174	-0.0059333	0.0015648	-0.0251627	-0.0028814	0.3245030	0.4121815	0.3964063	0.7549847	-0.0494339	1.0000000	0.3990866	0.3644196	0.3245030	0.7587888	-0.4033810	0.4083750	-0.0512174	0.2230738	0.5909327	0.1007235
reanalysis_precip_amt_kg_per_m2	0.1011505	0.1624353	-0.1498296	-0.0829182	-0.0768680	-0.1222016	-0.0641212	0.3391155	-0.0852959	-0.1081496	0.5709717	-0.2582632	0.3990866	1.0000000	0.5509296	0.3391155	0.5771415	-0.5389011	0.0583611	-0.2125206	-0.0515601	0.2545844	0.1548380
reanalysis_relative_humidity_percent	0.1294858	0.2534260	-0.2857707	-0.1360848	-0.1353845	-0.1905314	-0.1425473	0.4315369	-0.5478146	-0.5397850	0.7417486	-0.7265173	0.3644196	0.5509296	1.0000000	0.4315369	0.7228807	-0.8959841	-0.1186443	-0.5502336	-0.3568823	0.3620996	0.2485204
reanalysis_sat_precip_amt_mm	0.0906277	0.0839134	-0.1112095	-0.0096776	-0.0588619	-0.0406126	-0.0184216	1.0000000	-0.0483261	-0.0537699	0.4719121	-0.2273346	0.3245030	0.3391155	0.4315369	1.0000000	0.4685600	-0.3774718	0.1164326	-0.1815484	-0.0023567	0.3155489	0.3745055
reanalysis_specific_humidity_g_per_kg	0.2352177	0.3211407	-0.0885344	-0.0322194	-0.0323280	-0.0610467	-0.0300112	0.4685600	0.1779876	0.1664930	0.9977770	-0.2363375	0.7587888	0.5771415	0.7228807	0.4685600	1.0000000	-0.5937305	0.3506903	-0.2312012	0.1104195	0.6149441	0.1793954
reanalysis_tdtr_k	-0.1321909	-0.2348002	0.3521710	0.1700031	0.1689186	0.2239094	0.1749937	-0.3774718	0.5522957	0.6016871	-0.6078306	0.8003587	-0.4033810	-0.5389011	-0.8959841	-0.3774718	-0.5937305	1.0000000	0.1517012	0.5570897	0.3692137	-0.3428757	-0.2537170
station_avg_temp_c	0.1339888	0.0769614	0.0808283	0.1171250	0.1272436	0.1172213	0.1186557	0.1164326	0.5963278	0.5662927	0.3345682	0.3659290	0.4083750	0.0583611	-0.1186443	0.1164326	0.3506903	0.1517012	1.0000000	0.5022154	0.6511249	0.4536592	-0.0512563
station_diur_temp_rng_c	-0.0361342	-0.1145089	0.2295054	0.1430750	0.1988208	0.1798671	0.1741468	-0.1815484	0.5066823	0.5054662	-0.2457371	0.5859265	-0.0512174	-0.2125206	-0.5502336	-0.1815484	-0.2312012	0.5570897	0.5022154	1.0000000	0.6688533	-0.2535025	-0.2638489
station_max_temp_c	0.0766647	0.0224573	0.1953287	0.1413499	0.1528468	0.1609706	0.1733543	-0.0023567	0.6470764	0.6225771	0.0932149	0.5853963	0.2230738	-0.0515601	-0.3568823	-0.0023567	0.1104195	0.3692137	0.6511249	0.6688533	1.0000000	0.1248263	-0.1402289
station_min_temp_c	0.2068168	0.1280877	-0.0945173	-0.0078087	-0.0915672	-0.0520851	-0.0540292	0.3155489	0.2386999	0.2101760	0.6175029	-0.0978262	0.5909327	0.2545844	0.3620996	0.3155489	0.6149441	-0.3428757	0.4536592	-0.2535025	0.1248263	1.0000000	0.1941040
station_precip_mm	0.0445683	-0.0328965	-0.0423159	0.0133861	-0.0153423	0.0146203	-0.0016017	0.3745055	-0.1307270	-0.1343537	0.1871635	-0.1977715	0.1007235	0.1548380	0.2485204	0.3745055	0.1793954	-0.2537170	-0.0512563	-0.2638489	-0.1402289	0.1941040	1.0000000

San Juan

Split Dataset

# total observations are 936 
# proceeding with 80:20 train test split
train_sj <- train_merged_sj[1:749, ]
test_sj <- train_merged_sj[750:936, ]

train_ts_sj <- ts(train_sj, start = c(1990, 04), frequency = 52)
test_ts_sj <- ts(test_sj, start = c(2004, 09), frequency = 52)

train_full_ts_sj <- ts(train_merged_sj, start = c(1990, 4), frequency = 52)

ARIMA Model

arima_model <- auto.arima(train_ts_sj[,25])

summary(arima_model)

## Series: train_ts_sj[, 25] 
## ARIMA(2,1,1) 
## 
## Coefficients:
##           ar1     ar2     ma1
##       -0.6327  0.2073  0.8079
## s.e.   0.1221  0.0359  0.1214
## 
## sigma^2 = 191.9:  log likelihood = -3026.1
## AIC=6060.2   AICc=6060.25   BIC=6078.67
## 
## Training set error measures:
##                       ME     RMSE      MAE MPE MAPE      MASE          ACF1
## Training set 0.009219299 13.81737 8.366687 NaN  Inf 0.2101244 -3.805921e-05

arima_forecast <- forecast(arima_model, h = 187)

autoplot(arima_forecast)

accuracy(arima_forecast, test_ts_sj[, "total_cases"])

##                        ME     RMSE       MAE  MPE MAPE      MASE          ACF1
## Training set  0.009219299 13.81737  8.366687  NaN  Inf 0.2101244 -3.805921e-05
## Test set     12.981694179 34.94797 19.151819 -Inf  Inf 0.4809865  9.339254e-01
##              Theil's U
## Training set        NA
## Test set             0

Dynamic ARIMA Model

# extract features for sj where correlation with total_cases > 0.2
xreg_train_sj <- as.matrix(train_sj[, c("reanalysis_dew_point_temp_k", "reanalysis_specific_humidity_g_per_kg")])

dynamic_arima <- auto.arima(train_ts_sj[, "total_cases"], xreg = xreg_train_sj)

summary(dynamic_arima)

## Series: train_ts_sj[, "total_cases"] 
## Regression with ARIMA(1,1,2) errors 
## 
## Coefficients:
##           ar1     ma1     ma2  reanalysis_dew_point_temp_k
##       -0.5954  0.7647  0.1906                     -11.0981
## s.e.   0.5368  0.5231  0.0429                       6.1746
##       reanalysis_specific_humidity_g_per_kg
##                                     12.7064
## s.e.                                 6.4982
## 
## sigma^2 = 191.3:  log likelihood = -3023.77
## AIC=6059.54   AICc=6059.66   BIC=6087.25
## 
## Training set error measures:
##                       ME     RMSE      MAE  MPE MAPE      MASE        ACF1
## Training set 0.004909138 13.77507 8.412971 -Inf  Inf 0.2112867 0.002459711

xreg_test_sj <- as.matrix(test_sj[, c("reanalysis_dew_point_temp_k", "reanalysis_specific_humidity_g_per_kg")])

dynamic_arima_fc <- forecast(dynamic_arima, xreg = xreg_test_sj, h = 187)

autoplot(dynamic_arima_fc)

accuracy(dynamic_arima_fc, test_ts_sj[, "total_cases"])

##                        ME     RMSE       MAE  MPE MAPE      MASE        ACF1
## Training set  0.004909138 13.77507  8.412971 -Inf  Inf 0.2112867 0.002459711
## Test set     18.076811151 36.98968 19.854713 -Inf  Inf 0.4986392 0.931029114
##              Theil's U
## Training set        NA
## Test set             0

Dynamic Neural Network Model

nn_model_sj <- nnetar(train_ts_sj[, "total_cases"], xreg = xreg_train_sj)

summary(nn_model_sj)

##           Length Class        Mode     
## x          749   ts           numeric  
## m            1   -none-       numeric  
## p            1   -none-       numeric  
## P            1   -none-       numeric  
## scalex       2   -none-       list     
## scalexreg    2   -none-       list     
## size         1   -none-       numeric  
## xreg      1498   -none-       numeric  
## subset     749   -none-       numeric  
## model       20   nnetarmodels list     
## nnetargs     0   -none-       list     
## fitted     749   ts           numeric  
## residuals  749   ts           numeric  
## lags        15   -none-       numeric  
## series       1   -none-       character
## method       1   -none-       character
## call         3   -none-       call

nn_fc_sj <- forecast(nn_model_sj, xreg = xreg_test_sj, h = 187)
autoplot(nn_fc_sj)

accuracy(nn_fc_sj, test_ts_sj[, "total_cases"])

##                        ME     RMSE       MAE  MPE MAPE      MASE        ACF1
## Training set  -0.01612171  5.80064  4.348197 -Inf  Inf 0.1092024 -0.02628943
## Test set     -11.47230403 52.01948 40.061283 -Inf  Inf 1.0061152  0.96849465
##              Theil's U
## Training set        NA
## Test set             0

# visual comparison
autoplot(train_ts_sj[, "total_cases"]) +
  autolayer(arima_forecast,
    series="ARIMA", PI=FALSE) +
  autolayer(dynamic_arima_fc,
    series="Dynamic ARIMA", PI=FALSE) +
  autolayer(nn_fc_sj,
    series="NN", PI=FALSE) +
  autolayer(test_ts_sj[, "total_cases"],
    series = "Actual") +
  ggtitle("Comparison between Forecasts for San Jose") +
  xlab("Year") + ylab("Total Cases") +
  guides(colour=guide_legend(title="Forecast"))

Iquitos

Split Dataset

train_iq <- train_merged_iq[1:416, ]
test_iq <- train_merged_iq[417:520,]

train_ts_iq <- ts(train_iq, start = c(2000, 07), frequency = 52)
test_ts_iq <- ts(test_iq, start = c(2008, 07), frequency = 52)

train_full_ts_iq <- ts(train_merged_iq, start = c(2000, 7), frequency = 52)

ARIMA Model

arima_mod_iq <- auto.arima(train_ts_iq[,"total_cases"])

summary(arima_mod_iq)

## Series: train_ts_iq[, "total_cases"] 
## ARIMA(2,1,1) 
## 
## Coefficients:
##          ar1      ar2      ma1
##       0.0042  -0.3261  -0.2806
## s.e.  0.1195   0.0555   0.1266
## 
## sigma^2 = 52.81:  log likelihood = -1410.62
## AIC=2829.25   AICc=2829.35   BIC=2845.36
## 
## Training set error measures:
##                       ME     RMSE      MAE MPE MAPE      MASE        ACF1
## Training set 0.002798189 7.232062 3.765364 NaN  Inf 0.4096212 -0.00214506

arima_fc_iq <- forecast(arima_mod_iq, h = 104)

autoplot(arima_fc_iq)

accuracy(arima_fc_iq, test_ts_iq[,"total_cases"])

##                       ME      RMSE      MAE  MPE MAPE      MASE        ACF1
## Training set 0.002798189  7.232062 3.765364  NaN  Inf 0.4096212 -0.00214506
## Test set     8.885144036 14.469655 8.986599 -Inf  Inf 0.9776217  0.84254559
##              Theil's U
## Training set        NA
## Test set             0

Dynamic ARIMA Model

# extract top three features for iq 
xreg_train_iq <- as.matrix(train_iq[, c("reanalysis_dew_point_temp_k", "reanalysis_min_air_temp_k", "reanalysis_specific_humidity_g_per_kg")])

dynamic_arima_mod_iq <- auto.arima(train_ts_iq[, "total_cases"], xreg = xreg_train_iq)

summary(dynamic_arima_mod_iq)

## Series: train_ts_iq[, "total_cases"] 
## Regression with ARIMA(3,0,0) errors 
## 
## Coefficients:
##          ar1      ar2     ar3  reanalysis_dew_point_temp_k
##       0.6914  -0.1772  0.2935                       -0.572
## s.e.  0.0469   0.0573  0.0468                        0.269
##       reanalysis_min_air_temp_k  reanalysis_specific_humidity_g_per_kg
##                          0.6130                                -0.2131
## s.e.                     0.2666                                 0.3423
## 
## sigma^2 = 49.73:  log likelihood = -1400.32
## AIC=2814.64   AICc=2814.91   BIC=2842.85
## 
## Training set error measures:
##                     ME     RMSE      MAE MPE MAPE      MASE        ACF1
## Training set 0.0304088 7.000685 3.923278 NaN  Inf 0.4268001 -0.01360866

xreg_test_iq <- as.matrix(test_iq[, c("reanalysis_dew_point_temp_k", "reanalysis_min_air_temp_k", "reanalysis_specific_humidity_g_per_kg")])

dynamic_arima_fc_iq <- forecast(dynamic_arima_mod_iq, xreg = xreg_test_iq, h = 104)

autoplot(dynamic_arima_fc_iq)

accuracy(dynamic_arima_fc_iq, test_ts_iq[,"total_cases"])

##                     ME      RMSE      MAE  MPE MAPE      MASE        ACF1
## Training set 0.0304088  7.000685 3.923278  NaN  Inf 0.4268001 -0.01360866
## Test set     3.1284068 11.929512 7.146473 -Inf  Inf 0.7774406  0.84296808
##              Theil's U
## Training set        NA
## Test set             0

Dynamic Neural Network Model

nn_model_iq <- nnetar(train_ts_iq[, "total_cases"], xreg = xreg_train_iq)

summary(nn_model_iq)

##           Length Class        Mode     
## x          416   ts           numeric  
## m            1   -none-       numeric  
## p            1   -none-       numeric  
## P            1   -none-       numeric  
## scalex       2   -none-       list     
## scalexreg    2   -none-       list     
## size         1   -none-       numeric  
## xreg      1248   -none-       numeric  
## subset     416   -none-       numeric  
## model       20   nnetarmodels list     
## nnetargs     0   -none-       list     
## fitted     416   ts           numeric  
## residuals  416   ts           numeric  
## lags         4   -none-       numeric  
## series       1   -none-       character
## method       1   -none-       character
## call         3   -none-       call

nn_fc_iq <- forecast(nn_model_iq, xreg = xreg_test_iq, h = 104)

autoplot(nn_fc_iq)

accuracy(nn_fc_iq, test_ts_iq[, "total_cases"])

##                       ME      RMSE      MAE  MPE MAPE      MASE         ACF1
## Training set -0.03591436  4.879157 3.046150 -Inf  Inf 0.3313804 -0.006041115
## Test set      2.98946011 12.423230 7.219992 -Inf  Inf 0.7854385  0.858253602
##              Theil's U
## Training set        NA
## Test set             0

autoplot(train_ts_iq[, "total_cases"]) +
  autolayer(arima_fc_iq,
    series="ARIMA", PI=FALSE) +
  autolayer(dynamic_arima_fc_iq,
    series="Dynamic ARIMA", PI=FALSE) +
  autolayer(nn_fc_iq,
    series="NN", PI=FALSE) +
  autolayer(test_ts_iq[, "total_cases"],
    series = "Actual") +
  ggtitle("Comparison between Forecasts for Iquitos") +
  xlab("Year") + ylab("Total Cases") +
  guides(colour=guide_legend(title="Forecast"))

Predictions

# handling missing data in test features using interpolation
for (col in names(test_features)) {
  # Check if the column has missing values
  if (any(is.na(test_features[[col]]))) {
    # Apply linear interpolation to the column and overwrite the original dataset
    test_features[[col]] <- na.approx(test_features[[col]])
  }
}

skim(test_features)

Data summary
Name	test_features
Number of rows	416
Number of columns	24
_______________________
Column type frequency:
character	1
Date	1
numeric	22
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
city	0	1	2	2	0	2	0

Variable type: Date

skim_variable	n_missing	complete_rate	min	max	median	n_unique
week_start_date	0	1	2008-04-29	2013-06-25	2011-05-28	269

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
year	1	2010.77	1.43	2008.00	2010.00	2011.00	2012.00	2013.00	▇▆▇▇▃
weekofyear	1	26.44	14.98	1.00	13.75	26.00	39.00	53.00	▇▇▇▇▇
ndvi_ne	1	0.12	0.16	-0.46	0.00	0.09	0.25	0.50	▁▂▇▆▂
ndvi_nw	1	0.12	0.14	-0.21	0.01	0.09	0.24	0.65	▁▇▅▂▁
ndvi_se	1	0.21	0.08	0.01	0.15	0.20	0.25	0.45	▁▇▇▃▁
ndvi_sw	1	0.20	0.09	-0.01	0.13	0.19	0.25	0.53	▂▇▅▂▁
precipitation_amt_mm	1	38.17	35.19	0.00	8.04	31.12	57.72	169.34	▇▅▂▁▁
reanalysis_air_temp_k	1	298.82	1.47	294.55	297.75	298.55	300.24	301.94	▁▃▇▅▃
reanalysis_avg_temp_k	1	299.35	1.30	295.24	298.32	299.33	300.52	303.33	▁▆▇▇▁
reanalysis_dew_point_temp_k	1	295.41	1.53	290.82	294.33	295.82	296.64	297.79	▁▃▅▆▇
reanalysis_max_air_temp_k	1	303.61	3.10	298.20	301.40	302.70	305.80	314.10	▅▇▃▂▁
reanalysis_min_air_temp_k	1	295.74	2.76	286.20	293.50	296.30	298.22	299.70	▁▂▅▆▇
reanalysis_precip_amt_kg_per_m2	1	42.02	48.84	0.00	9.49	25.77	56.23	301.40	▇▂▁▁▁
reanalysis_relative_humidity_percent	1	82.47	7.38	64.92	77.33	80.32	87.98	97.98	▁▇▇▃▅
reanalysis_sat_precip_amt_mm	1	38.17	35.19	0.00	8.04	31.12	57.72	169.34	▇▅▂▁▁
reanalysis_specific_humidity_g_per_kg	1	16.92	1.56	12.54	15.79	17.32	18.17	19.60	▁▃▅▇▆
reanalysis_tdtr_k	1	5.11	3.54	1.49	2.45	2.91	8.17	14.49	▇▁▂▂▁
station_avg_temp_c	1	27.36	1.22	24.16	26.53	27.44	28.28	30.27	▂▅▇▇▂
station_diur_temp_rng_c	1	7.86	2.45	4.04	5.94	6.67	9.88	14.72	▇▇▃▅▁
station_max_temp_c	1	32.53	1.93	27.20	31.10	32.80	33.90	38.40	▂▃▇▃▁
station_min_temp_c	1	22.34	1.73	14.20	21.20	22.20	23.30	26.70	▁▁▇▇▃
station_precip_mm	1	33.94	34.59	0.00	8.97	23.30	47.28	212.00	▇▂▁▁▁

# split dataset by city
test_features_sj <- test_features %>% 
  filter(city == "sj")

test_features_iq <- test_features %>%
  filter(city == "iq")

# extract external regressors for both cities
xreg_test_sj_features <- as.matrix(test_features_sj[, c("reanalysis_dew_point_temp_k", "reanalysis_specific_humidity_g_per_kg")])

xreg_test_iq_features <- as.matrix(test_features_iq[, c("reanalysis_dew_point_temp_k", "reanalysis_min_air_temp_k", "reanalysis_specific_humidity_g_per_kg")])

# forecast for sj using nn model
xreg_train_sj <- as.matrix(train_merged_sj[, c("reanalysis_dew_point_temp_k", "reanalysis_specific_humidity_g_per_kg")])

sj_model <- nnetar(train_full_ts_sj[, "total_cases"], xreg = xreg_train_sj)

test_sj_predictions <- forecast(sj_model, xreg = xreg_test_sj_features, h = nrow(test_features_sj))

# forecast for iq using nn model
xreg_train_iq <- as.matrix(train_merged_iq[, c("reanalysis_dew_point_temp_k", "reanalysis_min_air_temp_k", "reanalysis_specific_humidity_g_per_kg")])

iq_model <- nnetar(train_full_ts_iq[, "total_cases"], xreg = xreg_train_iq)

test_iq_predictions <- forecast(nn_model_iq, xreg = xreg_test_iq_features, h = nrow(test_features_iq))

# visual predictions for sj
autoplot(test_sj_predictions)

# visualize predictions for iq
autoplot(test_iq_predictions)

# extract predictions and place into submission dataset
sj_predictions <- round(test_sj_predictions$mean)

predictions$total_cases[1:260] <- sj_predictions[1:260]

# extract predictions and place into submission dataset
iq_predictions <- round(test_iq_predictions$mean)

predictions$total_cases[261:416] <- iq_predictions[1:156]

# convert dataset into csv file
write.csv(predictions, "predictions2.csv", row.names = FALSE)

DengAI

Lay Khoon Yu

Data Cleaning

Feature Selection

San Juan

Split Dataset

ARIMA Model

Dynamic ARIMA Model

Dynamic Neural Network Model

Iquitos

Split Dataset

ARIMA Model

Dynamic ARIMA Model

Dynamic Neural Network Model

Predictions