# load packages
library(readr)
library(ISOweek)
library(skimr)
library(stringr)
library(zoo)
library(forecast)
library(tidyr)
library(tidyverse)
library(fable)
library(fabletools)
library(dplyr)
library(ggplot2)
library(ggcorrplot)
library(corrplot)
library(kableExtra)
library(tsibbledata)
library(fpp3)
# import datasets
train_features <- read_csv("/Users/laykhoonyu/Documents/MSAA_AE/predictive_analytics/DengAI/dengue_features_train.csv")
train_labels <- read_csv("/Users/laykhoonyu/Documents/MSAA_AE/predictive_analytics/DengAI/dengue_labels_train.csv")
test_features <- read_csv("/Users/laykhoonyu/Documents/MSAA_AE/predictive_analytics/DengAI/dengue_features_test.csv")
predictions <- read_csv("/Users/laykhoonyu/Documents/MSAA_AE/predictive_analytics/DengAI/submission_format.csv")
head(train_features)
## # A tibble: 6 × 24
## city year weekofyear week_start_date ndvi_ne ndvi_nw ndvi_se ndvi_sw
## <chr> <dbl> <dbl> <date> <dbl> <dbl> <dbl> <dbl>
## 1 sj 1990 18 1990-04-30 0.123 0.104 0.198 0.178
## 2 sj 1990 19 1990-05-07 0.170 0.142 0.162 0.155
## 3 sj 1990 20 1990-05-14 0.0322 0.173 0.157 0.171
## 4 sj 1990 21 1990-05-21 0.129 0.245 0.228 0.236
## 5 sj 1990 22 1990-05-28 0.196 0.262 0.251 0.247
## 6 sj 1990 23 1990-06-04 NA 0.175 0.254 0.182
## # ℹ 16 more variables: precipitation_amt_mm <dbl>, reanalysis_air_temp_k <dbl>,
## # reanalysis_avg_temp_k <dbl>, reanalysis_dew_point_temp_k <dbl>,
## # reanalysis_max_air_temp_k <dbl>, reanalysis_min_air_temp_k <dbl>,
## # reanalysis_precip_amt_kg_per_m2 <dbl>,
## # reanalysis_relative_humidity_percent <dbl>,
## # reanalysis_sat_precip_amt_mm <dbl>,
## # reanalysis_specific_humidity_g_per_kg <dbl>, reanalysis_tdtr_k <dbl>, …
head(train_labels)
## # A tibble: 6 × 4
## city year weekofyear total_cases
## <chr> <dbl> <dbl> <dbl>
## 1 sj 1990 18 4
## 2 sj 1990 19 5
## 3 sj 1990 20 4
## 4 sj 1990 21 3
## 5 sj 1990 22 6
## 6 sj 1990 23 2
skim(train_features)
Data summary
| Name |
train_features |
| Number of rows |
1456 |
| Number of columns |
24 |
| _______________________ |
|
| Column type frequency: |
|
| character |
1 |
| Date |
1 |
| numeric |
22 |
| ________________________ |
|
| Group variables |
None |
Variable type: character
Variable type: Date
| week_start_date |
0 |
1 |
1990-04-30 |
2010-06-25 |
2002-05-28 |
1049 |
Variable type: numeric
| year |
0 |
1.00 |
2001.03 |
5.41 |
1990.00 |
1997.00 |
2002.00 |
2005.00 |
2010.00 |
▅▃▆▇▅ |
| weekofyear |
0 |
1.00 |
26.50 |
15.02 |
1.00 |
13.75 |
26.50 |
39.25 |
53.00 |
▇▇▇▇▇ |
| ndvi_ne |
194 |
0.87 |
0.14 |
0.14 |
-0.41 |
0.04 |
0.13 |
0.25 |
0.51 |
▁▂▇▇▂ |
| ndvi_nw |
52 |
0.96 |
0.13 |
0.12 |
-0.46 |
0.05 |
0.12 |
0.22 |
0.45 |
▁▁▆▇▂ |
| ndvi_se |
22 |
0.98 |
0.20 |
0.07 |
-0.02 |
0.16 |
0.20 |
0.25 |
0.54 |
▁▇▆▁▁ |
| ndvi_sw |
22 |
0.98 |
0.20 |
0.08 |
-0.06 |
0.14 |
0.19 |
0.25 |
0.55 |
▁▇▇▂▁ |
| precipitation_amt_mm |
13 |
0.99 |
45.76 |
43.72 |
0.00 |
9.80 |
38.34 |
70.24 |
390.60 |
▇▂▁▁▁ |
| reanalysis_air_temp_k |
10 |
0.99 |
298.70 |
1.36 |
294.64 |
297.66 |
298.65 |
299.83 |
302.20 |
▁▅▇▇▂ |
| reanalysis_avg_temp_k |
10 |
0.99 |
299.23 |
1.26 |
294.89 |
298.26 |
299.29 |
300.21 |
302.93 |
▁▃▇▇▁ |
| reanalysis_dew_point_temp_k |
10 |
0.99 |
295.25 |
1.53 |
289.64 |
294.12 |
295.64 |
296.46 |
298.45 |
▁▂▅▇▃ |
| reanalysis_max_air_temp_k |
10 |
0.99 |
303.43 |
3.23 |
297.80 |
301.00 |
302.40 |
305.50 |
314.00 |
▅▇▃▂▁ |
| reanalysis_min_air_temp_k |
10 |
0.99 |
295.72 |
2.57 |
286.90 |
293.90 |
296.20 |
297.90 |
299.90 |
▁▂▆▆▇ |
| reanalysis_precip_amt_kg_per_m2 |
10 |
0.99 |
40.15 |
43.43 |
0.00 |
13.05 |
27.24 |
52.20 |
570.50 |
▇▁▁▁▁ |
| reanalysis_relative_humidity_percent |
10 |
0.99 |
82.16 |
7.15 |
57.79 |
77.18 |
80.30 |
86.36 |
98.61 |
▁▁▇▃▃ |
| reanalysis_sat_precip_amt_mm |
13 |
0.99 |
45.76 |
43.72 |
0.00 |
9.80 |
38.34 |
70.24 |
390.60 |
▇▂▁▁▁ |
| reanalysis_specific_humidity_g_per_kg |
10 |
0.99 |
16.75 |
1.54 |
11.72 |
15.56 |
17.09 |
17.98 |
20.46 |
▁▃▅▇▁ |
| reanalysis_tdtr_k |
10 |
0.99 |
4.90 |
3.55 |
1.36 |
2.33 |
2.86 |
7.62 |
16.03 |
▇▁▂▁▁ |
| station_avg_temp_c |
43 |
0.97 |
27.19 |
1.29 |
21.40 |
26.30 |
27.41 |
28.16 |
30.80 |
▁▁▅▇▁ |
| station_diur_temp_rng_c |
43 |
0.97 |
8.06 |
2.13 |
4.53 |
6.51 |
7.30 |
9.57 |
15.80 |
▇▇▃▂▁ |
| station_max_temp_c |
20 |
0.99 |
32.45 |
1.96 |
26.70 |
31.10 |
32.80 |
33.90 |
42.20 |
▂▇▇▁▁ |
| station_min_temp_c |
14 |
0.99 |
22.10 |
1.57 |
14.70 |
21.10 |
22.20 |
23.30 |
25.60 |
▁▁▅▇▃ |
| station_precip_mm |
22 |
0.98 |
39.33 |
47.46 |
0.00 |
8.70 |
23.85 |
53.90 |
543.30 |
▇▁▁▁▁ |
Data Cleaning
train_labels <- train_labels %>% mutate(beginning = ymd(str_c(year, "-01-01")),
final_date = beginning + weeks(weekofyear))
train_labels %>%
ggplot(aes(x = final_date, y = total_cases, color = city)) +
geom_line() +
labs(title = "Total Cases by City",
x = "Date",
y = "Value",
color = "Region") +
theme_minimal()

for (col in names(train_features)) {
# Check if the column has missing values
if (any(is.na(train_features[[col]]))) {
# Apply linear interpolation to the column and overwrite the original dataset
train_features[[col]] <- na.approx(train_features[[col]])
}
}
train_merged <- train_features %>%
left_join(select(train_labels, year, weekofyear, city, total_cases), by = c("year", "weekofyear", "city"))
skim(train_merged)
Data summary
| Name |
train_merged |
| Number of rows |
1456 |
| Number of columns |
25 |
| _______________________ |
|
| Column type frequency: |
|
| character |
1 |
| Date |
1 |
| numeric |
23 |
| ________________________ |
|
| Group variables |
None |
Variable type: character
Variable type: Date
| week_start_date |
0 |
1 |
1990-04-30 |
2010-06-25 |
2002-05-28 |
1049 |
Variable type: numeric
| year |
0 |
1 |
2001.03 |
5.41 |
1990.00 |
1997.00 |
2002.00 |
2005.00 |
2010.00 |
▅▃▆▇▅ |
| weekofyear |
0 |
1 |
26.50 |
15.02 |
1.00 |
13.75 |
26.50 |
39.25 |
53.00 |
▇▇▇▇▇ |
| ndvi_ne |
0 |
1 |
0.13 |
0.14 |
-0.41 |
0.04 |
0.12 |
0.23 |
0.51 |
▁▂▇▆▂ |
| ndvi_nw |
0 |
1 |
0.13 |
0.12 |
-0.46 |
0.05 |
0.12 |
0.21 |
0.45 |
▁▁▇▇▂ |
| ndvi_se |
0 |
1 |
0.20 |
0.07 |
-0.02 |
0.15 |
0.20 |
0.25 |
0.54 |
▁▇▆▁▁ |
| ndvi_sw |
0 |
1 |
0.20 |
0.08 |
-0.06 |
0.14 |
0.19 |
0.25 |
0.55 |
▁▇▇▂▁ |
| precipitation_amt_mm |
0 |
1 |
45.70 |
43.65 |
0.00 |
9.79 |
38.32 |
70.23 |
390.60 |
▇▂▁▁▁ |
| reanalysis_air_temp_k |
0 |
1 |
298.70 |
1.36 |
294.64 |
297.66 |
298.64 |
299.83 |
302.20 |
▁▅▇▇▂ |
| reanalysis_avg_temp_k |
0 |
1 |
299.22 |
1.26 |
294.89 |
298.26 |
299.29 |
300.21 |
302.93 |
▁▃▇▇▁ |
| reanalysis_dew_point_temp_k |
0 |
1 |
295.24 |
1.53 |
289.64 |
294.12 |
295.64 |
296.46 |
298.45 |
▁▂▅▇▃ |
| reanalysis_max_air_temp_k |
0 |
1 |
303.42 |
3.23 |
297.80 |
301.00 |
302.40 |
305.50 |
314.00 |
▅▇▃▂▁ |
| reanalysis_min_air_temp_k |
0 |
1 |
295.72 |
2.56 |
286.90 |
293.90 |
296.20 |
297.90 |
299.90 |
▁▂▆▆▇ |
| reanalysis_precip_amt_kg_per_m2 |
0 |
1 |
40.13 |
43.31 |
0.00 |
13.20 |
27.30 |
52.20 |
570.50 |
▇▁▁▁▁ |
| reanalysis_relative_humidity_percent |
0 |
1 |
82.17 |
7.15 |
57.79 |
77.20 |
80.29 |
86.44 |
98.61 |
▁▁▇▃▃ |
| reanalysis_sat_precip_amt_mm |
0 |
1 |
45.70 |
43.65 |
0.00 |
9.79 |
38.32 |
70.23 |
390.60 |
▇▂▁▁▁ |
| reanalysis_specific_humidity_g_per_kg |
0 |
1 |
16.74 |
1.54 |
11.72 |
15.55 |
17.08 |
17.98 |
20.46 |
▁▃▅▇▁ |
| reanalysis_tdtr_k |
0 |
1 |
4.90 |
3.54 |
1.36 |
2.33 |
2.86 |
7.63 |
16.03 |
▇▁▂▁▁ |
| station_avg_temp_c |
0 |
1 |
27.18 |
1.28 |
21.40 |
26.31 |
27.40 |
28.13 |
30.80 |
▁▁▅▇▁ |
| station_diur_temp_rng_c |
0 |
1 |
8.10 |
2.13 |
4.53 |
6.53 |
7.35 |
9.60 |
15.80 |
▇▇▃▂▁ |
| station_max_temp_c |
0 |
1 |
32.45 |
1.96 |
26.70 |
31.10 |
32.80 |
33.90 |
42.20 |
▂▇▇▁▁ |
| station_min_temp_c |
0 |
1 |
22.10 |
1.57 |
14.70 |
21.10 |
22.20 |
23.30 |
25.60 |
▁▁▅▇▃ |
| station_precip_mm |
0 |
1 |
39.36 |
47.29 |
0.00 |
8.85 |
24.05 |
53.90 |
543.30 |
▇▁▁▁▁ |
| total_cases |
0 |
1 |
24.68 |
43.60 |
0.00 |
5.00 |
12.00 |
28.00 |
461.00 |
▇▁▁▁▁ |
# subset train_merged by city
train_merged_sj <- train_merged %>%
filter(city == "sj")
train_merged_iq <- train_merged %>%
filter(city == "iq")
Feature Selection
# Reorder columns so that 'total_cases' is the first column
train_merged_sj_reordered <- train_merged_sj %>%
select(total_cases, everything())
# Compute correlation matrix
correlation_matrix_sj <- cor(train_merged_sj_reordered[sapply(train_merged_sj_reordered, is.numeric)])
# Create correlation heatmap
corrplot(correlation_matrix_sj, method = "color", type = "upper", tl.cex = 0.5)

# Convert correlation matrix to a correlation table
correlation_table_sj <- as.data.frame(correlation_matrix_sj)
# Print formatted correlation table
kable(correlation_table_sj, "html") %>%
kable_styling(full_width = FALSE)
|
|
total_cases
|
year
|
weekofyear
|
ndvi_ne
|
ndvi_nw
|
ndvi_se
|
ndvi_sw
|
precipitation_amt_mm
|
reanalysis_air_temp_k
|
reanalysis_avg_temp_k
|
reanalysis_dew_point_temp_k
|
reanalysis_max_air_temp_k
|
reanalysis_min_air_temp_k
|
reanalysis_precip_amt_kg_per_m2
|
reanalysis_relative_humidity_percent
|
reanalysis_sat_precip_amt_mm
|
reanalysis_specific_humidity_g_per_kg
|
reanalysis_tdtr_k
|
station_avg_temp_c
|
station_diur_temp_rng_c
|
station_max_temp_c
|
station_min_temp_c
|
station_precip_mm
|
|
total_cases
|
1.0000000
|
-0.2126898
|
0.2871342
|
0.0859184
|
0.0474027
|
-0.0436799
|
0.0426704
|
0.0602955
|
0.1803106
|
0.1736703
|
0.2028068
|
0.1932326
|
0.1865386
|
0.1069389
|
0.1444042
|
0.0602955
|
0.2069423
|
-0.0676227
|
0.1948228
|
0.0353035
|
0.1884475
|
0.1752482
|
0.0518830
|
|
year
|
-0.2126898
|
1.0000000
|
-0.0731425
|
-0.4126646
|
-0.4945010
|
-0.0066941
|
-0.0822972
|
0.0333330
|
0.1869462
|
0.1907631
|
0.0166096
|
0.1781211
|
0.1068788
|
-0.1321410
|
-0.2862127
|
0.0333330
|
0.0192497
|
0.3244377
|
-0.0963725
|
-0.2783312
|
-0.1722829
|
-0.0007298
|
0.0827184
|
|
weekofyear
|
0.2871342
|
-0.0731425
|
1.0000000
|
-0.0308390
|
-0.0373903
|
-0.0191420
|
-0.0715881
|
0.2317451
|
0.5676632
|
0.5536197
|
0.5721412
|
0.5112365
|
0.5689748
|
0.2522260
|
0.3059991
|
0.2317451
|
0.5791925
|
-0.1010152
|
0.4792601
|
-0.1403304
|
0.3193425
|
0.5166650
|
0.2114750
|
|
ndvi_ne
|
0.0859184
|
-0.4126646
|
-0.0308390
|
1.0000000
|
0.6326599
|
0.2193530
|
0.1810355
|
-0.0476045
|
-0.0814177
|
-0.0800987
|
-0.0446575
|
-0.0546411
|
-0.0904151
|
0.0120165
|
0.0422167
|
-0.0476045
|
-0.0405857
|
-0.0338958
|
0.0655268
|
0.1810871
|
0.1118221
|
0.0114240
|
-0.0753148
|
|
ndvi_nw
|
0.0474027
|
-0.4945010
|
-0.0373903
|
0.6326599
|
1.0000000
|
0.1973110
|
0.2177400
|
-0.0397861
|
-0.0755900
|
-0.0742216
|
-0.0248703
|
-0.0428293
|
-0.0743365
|
0.0057100
|
0.0762943
|
-0.0397861
|
-0.0195721
|
-0.0508752
|
0.0886967
|
0.1837657
|
0.1371997
|
0.0182586
|
-0.0874670
|
|
ndvi_se
|
-0.0436799
|
-0.0066941
|
-0.0191420
|
0.2193530
|
0.1973110
|
1.0000000
|
0.8146215
|
-0.1180438
|
-0.0152739
|
-0.0128028
|
-0.0636048
|
-0.0094001
|
-0.0468787
|
-0.1293738
|
-0.1143034
|
-0.1180438
|
-0.0591023
|
0.0329361
|
-0.0609560
|
0.0161918
|
-0.0703627
|
-0.0735414
|
-0.1378234
|
|
ndvi_sw
|
0.0426704
|
-0.0822972
|
-0.0715881
|
0.1810355
|
0.2177400
|
0.8146215
|
1.0000000
|
-0.1164566
|
-0.0378028
|
-0.0307194
|
-0.0822551
|
-0.0086754
|
-0.0672088
|
-0.1230979
|
-0.1153108
|
-0.1164566
|
-0.0752546
|
0.0526967
|
-0.0340891
|
0.0744763
|
-0.0088301
|
-0.0689030
|
-0.1715959
|
|
precipitation_amt_mm
|
0.0602955
|
0.0333330
|
0.2317451
|
-0.0476045
|
-0.0397861
|
-0.1180438
|
-0.1164566
|
1.0000000
|
0.2327357
|
0.2210826
|
0.4009756
|
0.2560687
|
0.2446586
|
0.5082735
|
0.4993110
|
1.0000000
|
0.4083665
|
-0.0937647
|
0.1956208
|
-0.1576758
|
0.1901354
|
0.2235604
|
0.5666605
|
|
reanalysis_air_temp_k
|
0.1803106
|
0.1869462
|
0.5676632
|
-0.0814177
|
-0.0755900
|
-0.0152739
|
-0.0378028
|
0.2327357
|
1.0000000
|
0.9975070
|
0.9034811
|
0.9353394
|
0.9422484
|
0.0804000
|
0.2979414
|
0.2327357
|
0.9050036
|
0.1782327
|
0.8808712
|
0.0417913
|
0.6987284
|
0.8331576
|
0.1132962
|
|
reanalysis_avg_temp_k
|
0.1736703
|
0.1907631
|
0.5536197
|
-0.0800987
|
-0.0742216
|
-0.0128028
|
-0.0307194
|
0.2210826
|
0.9975070
|
1.0000000
|
0.8953727
|
0.9392017
|
0.9391272
|
0.0626353
|
0.2840727
|
0.2210826
|
0.8964199
|
0.2012651
|
0.8791185
|
0.0563202
|
0.7041286
|
0.8274975
|
0.0974891
|
|
reanalysis_dew_point_temp_k
|
0.2028068
|
0.0166096
|
0.5721412
|
-0.0446575
|
-0.0248703
|
-0.0636048
|
-0.0822551
|
0.4009756
|
0.9034811
|
0.8953727
|
1.0000000
|
0.8476544
|
0.8990078
|
0.3284607
|
0.6781165
|
0.4009756
|
0.9985332
|
-0.0329880
|
0.8688371
|
-0.0544207
|
0.6902640
|
0.8504789
|
0.2847830
|
|
reanalysis_max_air_temp_k
|
0.1932326
|
0.1781211
|
0.5112365
|
-0.0546411
|
-0.0428293
|
-0.0094001
|
-0.0086754
|
0.2560687
|
0.9353394
|
0.9392017
|
0.8476544
|
1.0000000
|
0.8286651
|
0.0916054
|
0.2874617
|
0.2560687
|
0.8536287
|
0.3526397
|
0.8528307
|
0.1163216
|
0.7616858
|
0.7712400
|
0.1047280
|
|
reanalysis_min_air_temp_k
|
0.1865386
|
0.1068788
|
0.5689748
|
-0.0904151
|
-0.0743365
|
-0.0468787
|
-0.0672088
|
0.2446586
|
0.9422484
|
0.9391272
|
0.8990078
|
0.8286651
|
1.0000000
|
0.1327427
|
0.3849484
|
0.2446586
|
0.8963762
|
-0.0496985
|
0.8412998
|
-0.0224385
|
0.6271057
|
0.8297925
|
0.1502997
|
|
reanalysis_precip_amt_kg_per_m2
|
0.1069389
|
-0.1321410
|
0.2522260
|
0.0120165
|
0.0057100
|
-0.1293738
|
-0.1230979
|
0.5082735
|
0.0804000
|
0.0626353
|
0.3284607
|
0.0916054
|
0.1327427
|
1.0000000
|
0.6025475
|
0.5082735
|
0.3344285
|
-0.3068675
|
0.1351272
|
-0.2508274
|
0.0809598
|
0.1985403
|
0.4779837
|
|
reanalysis_relative_humidity_percent
|
0.1444042
|
-0.2862127
|
0.3059991
|
0.0422167
|
0.0762943
|
-0.1143034
|
-0.1153108
|
0.4993110
|
0.2979414
|
0.2840727
|
0.6781165
|
0.2874617
|
0.3849484
|
0.6025475
|
1.0000000
|
0.4993110
|
0.6730096
|
-0.3745471
|
0.4264236
|
-0.1931811
|
0.3419622
|
0.4660515
|
0.4444624
|
|
reanalysis_sat_precip_amt_mm
|
0.0602955
|
0.0333330
|
0.2317451
|
-0.0476045
|
-0.0397861
|
-0.1180438
|
-0.1164566
|
1.0000000
|
0.2327357
|
0.2210826
|
0.4009756
|
0.2560687
|
0.2446586
|
0.5082735
|
0.4993110
|
1.0000000
|
0.4083665
|
-0.0937647
|
0.1956208
|
-0.1576758
|
0.1901354
|
0.2235604
|
0.5666605
|
|
reanalysis_specific_humidity_g_per_kg
|
0.2069423
|
0.0192497
|
0.5791925
|
-0.0405857
|
-0.0195721
|
-0.0591023
|
-0.0752546
|
0.4083665
|
0.9050036
|
0.8964199
|
0.9985332
|
0.8536287
|
0.8963762
|
0.3344285
|
0.6730096
|
0.4083665
|
1.0000000
|
-0.0258105
|
0.8699817
|
-0.0572908
|
0.6915627
|
0.8495735
|
0.2880637
|
|
reanalysis_tdtr_k
|
-0.0676227
|
0.3244377
|
-0.1010152
|
-0.0338958
|
-0.0508752
|
0.0329361
|
0.0526967
|
-0.0937647
|
0.1782327
|
0.2012651
|
-0.0329880
|
0.3526397
|
-0.0496985
|
-0.3068675
|
-0.3745471
|
-0.0937647
|
-0.0258105
|
1.0000000
|
0.1390107
|
0.3724141
|
0.2825697
|
0.0102683
|
-0.2059301
|
|
station_avg_temp_c
|
0.1948228
|
-0.0963725
|
0.4792601
|
0.0655268
|
0.0886967
|
-0.0609560
|
-0.0340891
|
0.1956208
|
0.8808712
|
0.8791185
|
0.8688371
|
0.8528307
|
0.8412998
|
0.1351272
|
0.4264236
|
0.1956208
|
0.8699817
|
0.1390107
|
1.0000000
|
0.1863269
|
0.8652398
|
0.8985060
|
0.0308073
|
|
station_diur_temp_rng_c
|
0.0353035
|
-0.2783312
|
-0.1403304
|
0.1810871
|
0.1837657
|
0.0161918
|
0.0744763
|
-0.1576758
|
0.0417913
|
0.0563202
|
-0.0544207
|
0.1163216
|
-0.0224385
|
-0.2508274
|
-0.1931811
|
-0.1576758
|
-0.0572908
|
0.3724141
|
0.1863269
|
1.0000000
|
0.4756504
|
-0.1217538
|
-0.2645564
|
|
station_max_temp_c
|
0.1884475
|
-0.1722829
|
0.3193425
|
0.1118221
|
0.1371997
|
-0.0703627
|
-0.0088301
|
0.1901354
|
0.6987284
|
0.7041286
|
0.6902640
|
0.7616858
|
0.6271057
|
0.0809598
|
0.3419622
|
0.1901354
|
0.6915627
|
0.2825697
|
0.8652398
|
0.4756504
|
1.0000000
|
0.6740197
|
0.0060700
|
|
station_min_temp_c
|
0.1752482
|
-0.0007298
|
0.5166650
|
0.0114240
|
0.0182586
|
-0.0735414
|
-0.0689030
|
0.2235604
|
0.8331576
|
0.8274975
|
0.8504789
|
0.7712400
|
0.8297925
|
0.1985403
|
0.4660515
|
0.2235604
|
0.8495735
|
0.0102683
|
0.8985060
|
-0.1217538
|
0.6740197
|
1.0000000
|
0.0849552
|
|
station_precip_mm
|
0.0518830
|
0.0827184
|
0.2114750
|
-0.0753148
|
-0.0874670
|
-0.1378234
|
-0.1715959
|
0.5666605
|
0.1132962
|
0.0974891
|
0.2847830
|
0.1047280
|
0.1502997
|
0.4779837
|
0.4444624
|
0.5666605
|
0.2880637
|
-0.2059301
|
0.0308073
|
-0.2645564
|
0.0060700
|
0.0849552
|
1.0000000
|
# Reorder columns so that 'total_cases' is the first column
train_merged_iq_reordered <- train_merged_iq %>%
select(total_cases, everything())
# Compute correlation matrix
correlation_matrix_iq <- cor(train_merged_iq_reordered[sapply(train_merged_sj_reordered, is.numeric)])
# Create correlation heatmap
corrplot(correlation_matrix_iq, method = "color", type = "upper", tl.cex = 0.5)

# Convert correlation matrix to a correlation table
correlation_table_iq <- as.data.frame(correlation_matrix_iq)
# Print formatted correlation table
kable(correlation_table_iq, "html") %>%
kable_styling(full_width = FALSE)
|
|
total_cases
|
year
|
weekofyear
|
ndvi_ne
|
ndvi_nw
|
ndvi_se
|
ndvi_sw
|
precipitation_amt_mm
|
reanalysis_air_temp_k
|
reanalysis_avg_temp_k
|
reanalysis_dew_point_temp_k
|
reanalysis_max_air_temp_k
|
reanalysis_min_air_temp_k
|
reanalysis_precip_amt_kg_per_m2
|
reanalysis_relative_humidity_percent
|
reanalysis_sat_precip_amt_mm
|
reanalysis_specific_humidity_g_per_kg
|
reanalysis_tdtr_k
|
station_avg_temp_c
|
station_diur_temp_rng_c
|
station_max_temp_c
|
station_min_temp_c
|
station_precip_mm
|
|
total_cases
|
1.0000000
|
0.1794511
|
-0.0118505
|
0.0207942
|
-0.0099516
|
-0.0406426
|
0.0328120
|
0.0906277
|
0.0963270
|
0.0795155
|
0.2291832
|
-0.0541752
|
0.2116788
|
0.1011505
|
0.1294858
|
0.0906277
|
0.2352177
|
-0.1321909
|
0.1339888
|
-0.0361342
|
0.0766647
|
0.2068168
|
0.0445683
|
|
year
|
0.1794511
|
1.0000000
|
-0.1368422
|
-0.0641159
|
0.0316934
|
-0.0706738
|
-0.0167118
|
0.0839134
|
0.0265268
|
0.0087092
|
0.3175299
|
-0.1568073
|
0.2174518
|
0.1624353
|
0.2534260
|
0.0839134
|
0.3211407
|
-0.2348002
|
0.0769614
|
-0.1145089
|
0.0224573
|
0.1280877
|
-0.0328965
|
|
weekofyear
|
-0.0118505
|
-0.1368422
|
1.0000000
|
0.2539798
|
0.2214391
|
0.3232486
|
0.2564013
|
-0.1112095
|
0.2923895
|
0.3054914
|
-0.0987181
|
0.4100806
|
-0.0303174
|
-0.1498296
|
-0.2857707
|
-0.1112095
|
-0.0885344
|
0.3521710
|
0.0808283
|
0.2295054
|
0.1953287
|
-0.0945173
|
-0.0423159
|
|
ndvi_ne
|
0.0207942
|
-0.0641159
|
0.2539798
|
1.0000000
|
0.7642003
|
0.7693605
|
0.8429220
|
-0.0096776
|
0.1532940
|
0.1690034
|
-0.0330754
|
0.2169326
|
-0.0059333
|
-0.0829182
|
-0.1360848
|
-0.0096776
|
-0.0322194
|
0.1700031
|
0.1171250
|
0.1430750
|
0.1413499
|
-0.0078087
|
0.0133861
|
|
ndvi_nw
|
-0.0099516
|
0.0316934
|
0.2214391
|
0.7642003
|
1.0000000
|
0.6445407
|
0.7653893
|
-0.0588619
|
0.1530328
|
0.1698186
|
-0.0364774
|
0.2068618
|
0.0015648
|
-0.0768680
|
-0.1353845
|
-0.0588619
|
-0.0323280
|
0.1689186
|
0.1272436
|
0.1988208
|
0.1528468
|
-0.0915672
|
-0.0153423
|
|
ndvi_se
|
-0.0406426
|
-0.0706738
|
0.3232486
|
0.7693605
|
0.6445407
|
1.0000000
|
0.7151920
|
-0.0406126
|
0.1963635
|
0.2086462
|
-0.0632229
|
0.2624704
|
-0.0251627
|
-0.1222016
|
-0.1905314
|
-0.0406126
|
-0.0610467
|
0.2239094
|
0.1172213
|
0.1798671
|
0.1609706
|
-0.0520851
|
0.0146203
|
|
ndvi_sw
|
0.0328120
|
-0.0167118
|
0.2564013
|
0.8429220
|
0.7653893
|
0.7151920
|
1.0000000
|
-0.0184216
|
0.1639569
|
0.1768961
|
-0.0335247
|
0.2311766
|
-0.0028814
|
-0.0641212
|
-0.1425473
|
-0.0184216
|
-0.0300112
|
0.1749937
|
0.1186557
|
0.1741468
|
0.1733543
|
-0.0540292
|
-0.0016017
|
|
precipitation_amt_mm
|
0.0906277
|
0.0839134
|
-0.1112095
|
-0.0096776
|
-0.0588619
|
-0.0406126
|
-0.0184216
|
1.0000000
|
-0.0483261
|
-0.0537699
|
0.4719121
|
-0.2273346
|
0.3245030
|
0.3391155
|
0.4315369
|
1.0000000
|
0.4685600
|
-0.3774718
|
0.1164326
|
-0.1815484
|
-0.0023567
|
0.3155489
|
0.3745055
|
|
reanalysis_air_temp_k
|
0.0963270
|
0.0265268
|
0.2923895
|
0.1532940
|
0.1530328
|
0.1963635
|
0.1639569
|
-0.0483261
|
1.0000000
|
0.9733337
|
0.1513343
|
0.7512190
|
0.4121815
|
-0.0852959
|
-0.5478146
|
-0.0483261
|
0.1779876
|
0.5522957
|
0.5963278
|
0.5066823
|
0.6470764
|
0.2386999
|
-0.1307270
|
|
reanalysis_avg_temp_k
|
0.0795155
|
0.0087092
|
0.3054914
|
0.1690034
|
0.1698186
|
0.2086462
|
0.1768961
|
-0.0537699
|
0.9733337
|
1.0000000
|
0.1426519
|
0.7830266
|
0.3964063
|
-0.1081496
|
-0.5397850
|
-0.0537699
|
0.1664930
|
0.6016871
|
0.5662927
|
0.5054662
|
0.6225771
|
0.2101760
|
-0.1343537
|
|
reanalysis_dew_point_temp_k
|
0.2291832
|
0.3175299
|
-0.0987181
|
-0.0330754
|
-0.0364774
|
-0.0632229
|
-0.0335247
|
0.4719121
|
0.1513343
|
0.1426519
|
1.0000000
|
-0.2541932
|
0.7549847
|
0.5709717
|
0.7417486
|
0.4719121
|
0.9977770
|
-0.6078306
|
0.3345682
|
-0.2457371
|
0.0932149
|
0.6175029
|
0.1871635
|
|
reanalysis_max_air_temp_k
|
-0.0541752
|
-0.1568073
|
0.4100806
|
0.2169326
|
0.2068618
|
0.2624704
|
0.2311766
|
-0.2273346
|
0.7512190
|
0.7830266
|
-0.2541932
|
1.0000000
|
-0.0494339
|
-0.2582632
|
-0.7265173
|
-0.2273346
|
-0.2363375
|
0.8003587
|
0.3659290
|
0.5859265
|
0.5853963
|
-0.0978262
|
-0.1977715
|
|
reanalysis_min_air_temp_k
|
0.2116788
|
0.2174518
|
-0.0303174
|
-0.0059333
|
0.0015648
|
-0.0251627
|
-0.0028814
|
0.3245030
|
0.4121815
|
0.3964063
|
0.7549847
|
-0.0494339
|
1.0000000
|
0.3990866
|
0.3644196
|
0.3245030
|
0.7587888
|
-0.4033810
|
0.4083750
|
-0.0512174
|
0.2230738
|
0.5909327
|
0.1007235
|
|
reanalysis_precip_amt_kg_per_m2
|
0.1011505
|
0.1624353
|
-0.1498296
|
-0.0829182
|
-0.0768680
|
-0.1222016
|
-0.0641212
|
0.3391155
|
-0.0852959
|
-0.1081496
|
0.5709717
|
-0.2582632
|
0.3990866
|
1.0000000
|
0.5509296
|
0.3391155
|
0.5771415
|
-0.5389011
|
0.0583611
|
-0.2125206
|
-0.0515601
|
0.2545844
|
0.1548380
|
|
reanalysis_relative_humidity_percent
|
0.1294858
|
0.2534260
|
-0.2857707
|
-0.1360848
|
-0.1353845
|
-0.1905314
|
-0.1425473
|
0.4315369
|
-0.5478146
|
-0.5397850
|
0.7417486
|
-0.7265173
|
0.3644196
|
0.5509296
|
1.0000000
|
0.4315369
|
0.7228807
|
-0.8959841
|
-0.1186443
|
-0.5502336
|
-0.3568823
|
0.3620996
|
0.2485204
|
|
reanalysis_sat_precip_amt_mm
|
0.0906277
|
0.0839134
|
-0.1112095
|
-0.0096776
|
-0.0588619
|
-0.0406126
|
-0.0184216
|
1.0000000
|
-0.0483261
|
-0.0537699
|
0.4719121
|
-0.2273346
|
0.3245030
|
0.3391155
|
0.4315369
|
1.0000000
|
0.4685600
|
-0.3774718
|
0.1164326
|
-0.1815484
|
-0.0023567
|
0.3155489
|
0.3745055
|
|
reanalysis_specific_humidity_g_per_kg
|
0.2352177
|
0.3211407
|
-0.0885344
|
-0.0322194
|
-0.0323280
|
-0.0610467
|
-0.0300112
|
0.4685600
|
0.1779876
|
0.1664930
|
0.9977770
|
-0.2363375
|
0.7587888
|
0.5771415
|
0.7228807
|
0.4685600
|
1.0000000
|
-0.5937305
|
0.3506903
|
-0.2312012
|
0.1104195
|
0.6149441
|
0.1793954
|
|
reanalysis_tdtr_k
|
-0.1321909
|
-0.2348002
|
0.3521710
|
0.1700031
|
0.1689186
|
0.2239094
|
0.1749937
|
-0.3774718
|
0.5522957
|
0.6016871
|
-0.6078306
|
0.8003587
|
-0.4033810
|
-0.5389011
|
-0.8959841
|
-0.3774718
|
-0.5937305
|
1.0000000
|
0.1517012
|
0.5570897
|
0.3692137
|
-0.3428757
|
-0.2537170
|
|
station_avg_temp_c
|
0.1339888
|
0.0769614
|
0.0808283
|
0.1171250
|
0.1272436
|
0.1172213
|
0.1186557
|
0.1164326
|
0.5963278
|
0.5662927
|
0.3345682
|
0.3659290
|
0.4083750
|
0.0583611
|
-0.1186443
|
0.1164326
|
0.3506903
|
0.1517012
|
1.0000000
|
0.5022154
|
0.6511249
|
0.4536592
|
-0.0512563
|
|
station_diur_temp_rng_c
|
-0.0361342
|
-0.1145089
|
0.2295054
|
0.1430750
|
0.1988208
|
0.1798671
|
0.1741468
|
-0.1815484
|
0.5066823
|
0.5054662
|
-0.2457371
|
0.5859265
|
-0.0512174
|
-0.2125206
|
-0.5502336
|
-0.1815484
|
-0.2312012
|
0.5570897
|
0.5022154
|
1.0000000
|
0.6688533
|
-0.2535025
|
-0.2638489
|
|
station_max_temp_c
|
0.0766647
|
0.0224573
|
0.1953287
|
0.1413499
|
0.1528468
|
0.1609706
|
0.1733543
|
-0.0023567
|
0.6470764
|
0.6225771
|
0.0932149
|
0.5853963
|
0.2230738
|
-0.0515601
|
-0.3568823
|
-0.0023567
|
0.1104195
|
0.3692137
|
0.6511249
|
0.6688533
|
1.0000000
|
0.1248263
|
-0.1402289
|
|
station_min_temp_c
|
0.2068168
|
0.1280877
|
-0.0945173
|
-0.0078087
|
-0.0915672
|
-0.0520851
|
-0.0540292
|
0.3155489
|
0.2386999
|
0.2101760
|
0.6175029
|
-0.0978262
|
0.5909327
|
0.2545844
|
0.3620996
|
0.3155489
|
0.6149441
|
-0.3428757
|
0.4536592
|
-0.2535025
|
0.1248263
|
1.0000000
|
0.1941040
|
|
station_precip_mm
|
0.0445683
|
-0.0328965
|
-0.0423159
|
0.0133861
|
-0.0153423
|
0.0146203
|
-0.0016017
|
0.3745055
|
-0.1307270
|
-0.1343537
|
0.1871635
|
-0.1977715
|
0.1007235
|
0.1548380
|
0.2485204
|
0.3745055
|
0.1793954
|
-0.2537170
|
-0.0512563
|
-0.2638489
|
-0.1402289
|
0.1941040
|
1.0000000
|
San Juan
Split Dataset
# total observations are 936
# proceeding with 80:20 train test split
train_sj <- train_merged_sj[1:749, ]
test_sj <- train_merged_sj[750:936, ]
train_ts_sj <- ts(train_sj, start = c(1990, 04), frequency = 52)
test_ts_sj <- ts(test_sj, start = c(2004, 09), frequency = 52)
train_full_ts_sj <- ts(train_merged_sj, start = c(1990, 4), frequency = 52)
ARIMA Model
arima_model <- auto.arima(train_ts_sj[,25])
summary(arima_model)
## Series: train_ts_sj[, 25]
## ARIMA(2,1,1)
##
## Coefficients:
## ar1 ar2 ma1
## -0.6327 0.2073 0.8079
## s.e. 0.1221 0.0359 0.1214
##
## sigma^2 = 191.9: log likelihood = -3026.1
## AIC=6060.2 AICc=6060.25 BIC=6078.67
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.009219299 13.81737 8.366687 NaN Inf 0.2101244 -3.805921e-05
arima_forecast <- forecast(arima_model, h = 187)
autoplot(arima_forecast)

accuracy(arima_forecast, test_ts_sj[, "total_cases"])
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.009219299 13.81737 8.366687 NaN Inf 0.2101244 -3.805921e-05
## Test set 12.981694179 34.94797 19.151819 -Inf Inf 0.4809865 9.339254e-01
## Theil's U
## Training set NA
## Test set 0
Dynamic ARIMA Model
# extract features for sj where correlation with total_cases > 0.2
xreg_train_sj <- as.matrix(train_sj[, c("reanalysis_dew_point_temp_k", "reanalysis_specific_humidity_g_per_kg")])
dynamic_arima <- auto.arima(train_ts_sj[, "total_cases"], xreg = xreg_train_sj)
summary(dynamic_arima)
## Series: train_ts_sj[, "total_cases"]
## Regression with ARIMA(1,1,2) errors
##
## Coefficients:
## ar1 ma1 ma2 reanalysis_dew_point_temp_k
## -0.5954 0.7647 0.1906 -11.0981
## s.e. 0.5368 0.5231 0.0429 6.1746
## reanalysis_specific_humidity_g_per_kg
## 12.7064
## s.e. 6.4982
##
## sigma^2 = 191.3: log likelihood = -3023.77
## AIC=6059.54 AICc=6059.66 BIC=6087.25
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.004909138 13.77507 8.412971 -Inf Inf 0.2112867 0.002459711
xreg_test_sj <- as.matrix(test_sj[, c("reanalysis_dew_point_temp_k", "reanalysis_specific_humidity_g_per_kg")])
dynamic_arima_fc <- forecast(dynamic_arima, xreg = xreg_test_sj, h = 187)
autoplot(dynamic_arima_fc)

accuracy(dynamic_arima_fc, test_ts_sj[, "total_cases"])
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.004909138 13.77507 8.412971 -Inf Inf 0.2112867 0.002459711
## Test set 18.076811151 36.98968 19.854713 -Inf Inf 0.4986392 0.931029114
## Theil's U
## Training set NA
## Test set 0
Dynamic Neural Network Model
nn_model_sj <- nnetar(train_ts_sj[, "total_cases"], xreg = xreg_train_sj)
summary(nn_model_sj)
## Length Class Mode
## x 749 ts numeric
## m 1 -none- numeric
## p 1 -none- numeric
## P 1 -none- numeric
## scalex 2 -none- list
## scalexreg 2 -none- list
## size 1 -none- numeric
## xreg 1498 -none- numeric
## subset 749 -none- numeric
## model 20 nnetarmodels list
## nnetargs 0 -none- list
## fitted 749 ts numeric
## residuals 749 ts numeric
## lags 15 -none- numeric
## series 1 -none- character
## method 1 -none- character
## call 3 -none- call
nn_fc_sj <- forecast(nn_model_sj, xreg = xreg_test_sj, h = 187)
autoplot(nn_fc_sj)

accuracy(nn_fc_sj, test_ts_sj[, "total_cases"])
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -0.01612171 5.80064 4.348197 -Inf Inf 0.1092024 -0.02628943
## Test set -11.47230403 52.01948 40.061283 -Inf Inf 1.0061152 0.96849465
## Theil's U
## Training set NA
## Test set 0
# visual comparison
autoplot(train_ts_sj[, "total_cases"]) +
autolayer(arima_forecast,
series="ARIMA", PI=FALSE) +
autolayer(dynamic_arima_fc,
series="Dynamic ARIMA", PI=FALSE) +
autolayer(nn_fc_sj,
series="NN", PI=FALSE) +
autolayer(test_ts_sj[, "total_cases"],
series = "Actual") +
ggtitle("Comparison between Forecasts for San Jose") +
xlab("Year") + ylab("Total Cases") +
guides(colour=guide_legend(title="Forecast"))

Iquitos
Split Dataset
train_iq <- train_merged_iq[1:416, ]
test_iq <- train_merged_iq[417:520,]
train_ts_iq <- ts(train_iq, start = c(2000, 07), frequency = 52)
test_ts_iq <- ts(test_iq, start = c(2008, 07), frequency = 52)
train_full_ts_iq <- ts(train_merged_iq, start = c(2000, 7), frequency = 52)
ARIMA Model
arima_mod_iq <- auto.arima(train_ts_iq[,"total_cases"])
summary(arima_mod_iq)
## Series: train_ts_iq[, "total_cases"]
## ARIMA(2,1,1)
##
## Coefficients:
## ar1 ar2 ma1
## 0.0042 -0.3261 -0.2806
## s.e. 0.1195 0.0555 0.1266
##
## sigma^2 = 52.81: log likelihood = -1410.62
## AIC=2829.25 AICc=2829.35 BIC=2845.36
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.002798189 7.232062 3.765364 NaN Inf 0.4096212 -0.00214506
arima_fc_iq <- forecast(arima_mod_iq, h = 104)
autoplot(arima_fc_iq)

accuracy(arima_fc_iq, test_ts_iq[,"total_cases"])
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.002798189 7.232062 3.765364 NaN Inf 0.4096212 -0.00214506
## Test set 8.885144036 14.469655 8.986599 -Inf Inf 0.9776217 0.84254559
## Theil's U
## Training set NA
## Test set 0
Dynamic ARIMA Model
# extract top three features for iq
xreg_train_iq <- as.matrix(train_iq[, c("reanalysis_dew_point_temp_k", "reanalysis_min_air_temp_k", "reanalysis_specific_humidity_g_per_kg")])
dynamic_arima_mod_iq <- auto.arima(train_ts_iq[, "total_cases"], xreg = xreg_train_iq)
summary(dynamic_arima_mod_iq)
## Series: train_ts_iq[, "total_cases"]
## Regression with ARIMA(3,0,0) errors
##
## Coefficients:
## ar1 ar2 ar3 reanalysis_dew_point_temp_k
## 0.6914 -0.1772 0.2935 -0.572
## s.e. 0.0469 0.0573 0.0468 0.269
## reanalysis_min_air_temp_k reanalysis_specific_humidity_g_per_kg
## 0.6130 -0.2131
## s.e. 0.2666 0.3423
##
## sigma^2 = 49.73: log likelihood = -1400.32
## AIC=2814.64 AICc=2814.91 BIC=2842.85
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.0304088 7.000685 3.923278 NaN Inf 0.4268001 -0.01360866
xreg_test_iq <- as.matrix(test_iq[, c("reanalysis_dew_point_temp_k", "reanalysis_min_air_temp_k", "reanalysis_specific_humidity_g_per_kg")])
dynamic_arima_fc_iq <- forecast(dynamic_arima_mod_iq, xreg = xreg_test_iq, h = 104)
autoplot(dynamic_arima_fc_iq)

accuracy(dynamic_arima_fc_iq, test_ts_iq[,"total_cases"])
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.0304088 7.000685 3.923278 NaN Inf 0.4268001 -0.01360866
## Test set 3.1284068 11.929512 7.146473 -Inf Inf 0.7774406 0.84296808
## Theil's U
## Training set NA
## Test set 0
Dynamic Neural Network Model
nn_model_iq <- nnetar(train_ts_iq[, "total_cases"], xreg = xreg_train_iq)
summary(nn_model_iq)
## Length Class Mode
## x 416 ts numeric
## m 1 -none- numeric
## p 1 -none- numeric
## P 1 -none- numeric
## scalex 2 -none- list
## scalexreg 2 -none- list
## size 1 -none- numeric
## xreg 1248 -none- numeric
## subset 416 -none- numeric
## model 20 nnetarmodels list
## nnetargs 0 -none- list
## fitted 416 ts numeric
## residuals 416 ts numeric
## lags 4 -none- numeric
## series 1 -none- character
## method 1 -none- character
## call 3 -none- call
nn_fc_iq <- forecast(nn_model_iq, xreg = xreg_test_iq, h = 104)
autoplot(nn_fc_iq)

accuracy(nn_fc_iq, test_ts_iq[, "total_cases"])
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -0.03591436 4.879157 3.046150 -Inf Inf 0.3313804 -0.006041115
## Test set 2.98946011 12.423230 7.219992 -Inf Inf 0.7854385 0.858253602
## Theil's U
## Training set NA
## Test set 0
autoplot(train_ts_iq[, "total_cases"]) +
autolayer(arima_fc_iq,
series="ARIMA", PI=FALSE) +
autolayer(dynamic_arima_fc_iq,
series="Dynamic ARIMA", PI=FALSE) +
autolayer(nn_fc_iq,
series="NN", PI=FALSE) +
autolayer(test_ts_iq[, "total_cases"],
series = "Actual") +
ggtitle("Comparison between Forecasts for Iquitos") +
xlab("Year") + ylab("Total Cases") +
guides(colour=guide_legend(title="Forecast"))

Predictions
# handling missing data in test features using interpolation
for (col in names(test_features)) {
# Check if the column has missing values
if (any(is.na(test_features[[col]]))) {
# Apply linear interpolation to the column and overwrite the original dataset
test_features[[col]] <- na.approx(test_features[[col]])
}
}
skim(test_features)
Data summary
| Name |
test_features |
| Number of rows |
416 |
| Number of columns |
24 |
| _______________________ |
|
| Column type frequency: |
|
| character |
1 |
| Date |
1 |
| numeric |
22 |
| ________________________ |
|
| Group variables |
None |
Variable type: character
Variable type: Date
| week_start_date |
0 |
1 |
2008-04-29 |
2013-06-25 |
2011-05-28 |
269 |
Variable type: numeric
| year |
0 |
1 |
2010.77 |
1.43 |
2008.00 |
2010.00 |
2011.00 |
2012.00 |
2013.00 |
▇▆▇▇▃ |
| weekofyear |
0 |
1 |
26.44 |
14.98 |
1.00 |
13.75 |
26.00 |
39.00 |
53.00 |
▇▇▇▇▇ |
| ndvi_ne |
0 |
1 |
0.12 |
0.16 |
-0.46 |
0.00 |
0.09 |
0.25 |
0.50 |
▁▂▇▆▂ |
| ndvi_nw |
0 |
1 |
0.12 |
0.14 |
-0.21 |
0.01 |
0.09 |
0.24 |
0.65 |
▁▇▅▂▁ |
| ndvi_se |
0 |
1 |
0.21 |
0.08 |
0.01 |
0.15 |
0.20 |
0.25 |
0.45 |
▁▇▇▃▁ |
| ndvi_sw |
0 |
1 |
0.20 |
0.09 |
-0.01 |
0.13 |
0.19 |
0.25 |
0.53 |
▂▇▅▂▁ |
| precipitation_amt_mm |
0 |
1 |
38.17 |
35.19 |
0.00 |
8.04 |
31.12 |
57.72 |
169.34 |
▇▅▂▁▁ |
| reanalysis_air_temp_k |
0 |
1 |
298.82 |
1.47 |
294.55 |
297.75 |
298.55 |
300.24 |
301.94 |
▁▃▇▅▃ |
| reanalysis_avg_temp_k |
0 |
1 |
299.35 |
1.30 |
295.24 |
298.32 |
299.33 |
300.52 |
303.33 |
▁▆▇▇▁ |
| reanalysis_dew_point_temp_k |
0 |
1 |
295.41 |
1.53 |
290.82 |
294.33 |
295.82 |
296.64 |
297.79 |
▁▃▅▆▇ |
| reanalysis_max_air_temp_k |
0 |
1 |
303.61 |
3.10 |
298.20 |
301.40 |
302.70 |
305.80 |
314.10 |
▅▇▃▂▁ |
| reanalysis_min_air_temp_k |
0 |
1 |
295.74 |
2.76 |
286.20 |
293.50 |
296.30 |
298.22 |
299.70 |
▁▂▅▆▇ |
| reanalysis_precip_amt_kg_per_m2 |
0 |
1 |
42.02 |
48.84 |
0.00 |
9.49 |
25.77 |
56.23 |
301.40 |
▇▂▁▁▁ |
| reanalysis_relative_humidity_percent |
0 |
1 |
82.47 |
7.38 |
64.92 |
77.33 |
80.32 |
87.98 |
97.98 |
▁▇▇▃▅ |
| reanalysis_sat_precip_amt_mm |
0 |
1 |
38.17 |
35.19 |
0.00 |
8.04 |
31.12 |
57.72 |
169.34 |
▇▅▂▁▁ |
| reanalysis_specific_humidity_g_per_kg |
0 |
1 |
16.92 |
1.56 |
12.54 |
15.79 |
17.32 |
18.17 |
19.60 |
▁▃▅▇▆ |
| reanalysis_tdtr_k |
0 |
1 |
5.11 |
3.54 |
1.49 |
2.45 |
2.91 |
8.17 |
14.49 |
▇▁▂▂▁ |
| station_avg_temp_c |
0 |
1 |
27.36 |
1.22 |
24.16 |
26.53 |
27.44 |
28.28 |
30.27 |
▂▅▇▇▂ |
| station_diur_temp_rng_c |
0 |
1 |
7.86 |
2.45 |
4.04 |
5.94 |
6.67 |
9.88 |
14.72 |
▇▇▃▅▁ |
| station_max_temp_c |
0 |
1 |
32.53 |
1.93 |
27.20 |
31.10 |
32.80 |
33.90 |
38.40 |
▂▃▇▃▁ |
| station_min_temp_c |
0 |
1 |
22.34 |
1.73 |
14.20 |
21.20 |
22.20 |
23.30 |
26.70 |
▁▁▇▇▃ |
| station_precip_mm |
0 |
1 |
33.94 |
34.59 |
0.00 |
8.97 |
23.30 |
47.28 |
212.00 |
▇▂▁▁▁ |
# split dataset by city
test_features_sj <- test_features %>%
filter(city == "sj")
test_features_iq <- test_features %>%
filter(city == "iq")
# extract external regressors for both cities
xreg_test_sj_features <- as.matrix(test_features_sj[, c("reanalysis_dew_point_temp_k", "reanalysis_specific_humidity_g_per_kg")])
xreg_test_iq_features <- as.matrix(test_features_iq[, c("reanalysis_dew_point_temp_k", "reanalysis_min_air_temp_k", "reanalysis_specific_humidity_g_per_kg")])
# forecast for sj using nn model
xreg_train_sj <- as.matrix(train_merged_sj[, c("reanalysis_dew_point_temp_k", "reanalysis_specific_humidity_g_per_kg")])
sj_model <- nnetar(train_full_ts_sj[, "total_cases"], xreg = xreg_train_sj)
test_sj_predictions <- forecast(sj_model, xreg = xreg_test_sj_features, h = nrow(test_features_sj))
# forecast for iq using nn model
xreg_train_iq <- as.matrix(train_merged_iq[, c("reanalysis_dew_point_temp_k", "reanalysis_min_air_temp_k", "reanalysis_specific_humidity_g_per_kg")])
iq_model <- nnetar(train_full_ts_iq[, "total_cases"], xreg = xreg_train_iq)
test_iq_predictions <- forecast(nn_model_iq, xreg = xreg_test_iq_features, h = nrow(test_features_iq))
# visual predictions for sj
autoplot(test_sj_predictions)

# visualize predictions for iq
autoplot(test_iq_predictions)

# extract predictions and place into submission dataset
sj_predictions <- round(test_sj_predictions$mean)
predictions$total_cases[1:260] <- sj_predictions[1:260]
# extract predictions and place into submission dataset
iq_predictions <- round(test_iq_predictions$mean)
predictions$total_cases[261:416] <- iq_predictions[1:156]
# convert dataset into csv file
write.csv(predictions, "predictions2.csv", row.names = FALSE)