library(readr)
library(reshape2)
library(tidyr)
library(h2o)
library(dplyr)
library(ggplot2)
library(caret)
library(lubridate)
library(imputeTS)
library(corrplot)
library(data.table)
library(quantreg)
library(Ecfun)
library(randomForest)
library(forecast)
library(kableExtra)
library(magrittr)
library(zoo)
library(RColorBrewer)
library(tsibble)
library(feasts)
library(gridExtra)
features_tr <- read.csv("C:/Users/hunai/Google Drive/BC_EDU Pred Modeling/ADEC 7640 Final/DATA/dengue_features_train.csv")
#col_types = cols(week_start_date = col_date(format = "%Y-%m-%d")))
labels_tr <- read.csv("C:/Users/hunai/Google Drive/BC_EDU Pred Modeling/ADEC 7640 Final/DATA/dengue_labels_train.csv")
features_te <- read.csv("C:/Users/hunai/Google Drive/BC_EDU Pred Modeling/ADEC 7640 Final/DATA/dengue_features_test.csv")
submission_format <- read.csv("C:/Users/hunai/Google Drive/BC_EDU Pred Modeling/ADEC 7640 Final/DATA/submission_format.csv")
names(features_tr)
## [1] "city"
## [2] "year"
## [3] "weekofyear"
## [4] "week_start_date"
## [5] "ndvi_ne"
## [6] "ndvi_nw"
## [7] "ndvi_se"
## [8] "ndvi_sw"
## [9] "precipitation_amt_mm"
## [10] "reanalysis_air_temp_k"
## [11] "reanalysis_avg_temp_k"
## [12] "reanalysis_dew_point_temp_k"
## [13] "reanalysis_max_air_temp_k"
## [14] "reanalysis_min_air_temp_k"
## [15] "reanalysis_precip_amt_kg_per_m2"
## [16] "reanalysis_relative_humidity_percent"
## [17] "reanalysis_sat_precip_amt_mm"
## [18] "reanalysis_specific_humidity_g_per_kg"
## [19] "reanalysis_tdtr_k"
## [20] "station_avg_temp_c"
## [21] "station_diur_temp_rng_c"
## [22] "station_max_temp_c"
## [23] "station_min_temp_c"
## [24] "station_precip_mm"
##
rbind(data.frame(group = "train", features_tr),
data.frame(group = "test", features_te)) %>%
gather(x, y, precipitation_amt_mm:station_precip_mm) %>%
ggplot(aes(x = y, color = group, fill = group)) +
geom_density(alpha = 0.3) +
facet_wrap( ~ x, scales = "free", ncol = 3)
train <- left_join(x = features_tr, y = labels_tr, by = c("year", "weekofyear", "city"))
test <- left_join(x = features_te, y = submission_format, by = c("year", "weekofyear", "city"))
train$type <- "train"
test$type <- "test"
all <- rbind(train, test)
summary(all[ , c("station_avg_temp_c", "reanalysis_avg_temp_k", "reanalysis_air_temp_k")])
## station_avg_temp_c reanalysis_avg_temp_k reanalysis_air_temp_k
## Min. :21.40 Min. :294.9 Min. :294.6
## 1st Qu.:26.33 1st Qu.:298.3 1st Qu.:297.7
## Median :27.43 Median :299.3 Median :298.6
## Mean :27.23 Mean :299.3 Mean :298.7
## 3rd Qu.:28.20 3rd Qu.:300.3 3rd Qu.:299.9
## Max. :30.80 Max. :303.3 Max. :302.2
## NA's :55 NA's :12 NA's :12
summary(all[ , c("station_precip_mm", "precipitation_amt_mm")])
## station_precip_mm precipitation_amt_mm
## Min. : 0.0 Min. : 0.00
## 1st Qu.: 8.8 1st Qu.: 9.43
## Median : 23.8 Median : 36.64
## Mean : 38.2 Mean : 44.11
## 3rd Qu.: 51.5 3rd Qu.: 67.50
## Max. :543.3 Max. :390.60
## NA's :27 NA's :15
sj_train <- all %>% filter(city == "sj" & type == "train")
summary(sj_train)
## city year weekofyear week_start_date
## Length:936 Min. :1990 Min. : 1.00 Length:936
## Class :character 1st Qu.:1994 1st Qu.:13.75 Class :character
## Mode :character Median :1999 Median :26.50 Mode :character
## Mean :1999 Mean :26.50
## 3rd Qu.:2003 3rd Qu.:39.25
## Max. :2008 Max. :53.00
##
## ndvi_ne ndvi_nw ndvi_se ndvi_sw
## Min. :-0.40625 Min. :-0.45610 Min. :-0.01553 Min. :-0.06346
## 1st Qu.: 0.00450 1st Qu.: 0.01642 1st Qu.: 0.13928 1st Qu.: 0.12916
## Median : 0.05770 Median : 0.06807 Median : 0.17719 Median : 0.16597
## Mean : 0.05792 Mean : 0.06747 Mean : 0.17766 Mean : 0.16596
## 3rd Qu.: 0.11110 3rd Qu.: 0.11520 3rd Qu.: 0.21256 3rd Qu.: 0.20277
## Max. : 0.49340 Max. : 0.43710 Max. : 0.39313 Max. : 0.38142
## NA's :191 NA's :49 NA's :19 NA's :19
## precipitation_amt_mm reanalysis_air_temp_k reanalysis_avg_temp_k
## Min. : 0.00 Min. :295.9 Min. :296.1
## 1st Qu.: 0.00 1st Qu.:298.2 1st Qu.:298.3
## Median : 20.80 Median :299.3 Median :299.4
## Mean : 35.47 Mean :299.2 Mean :299.3
## 3rd Qu.: 52.18 3rd Qu.:300.1 3rd Qu.:300.2
## Max. :390.60 Max. :302.2 Max. :302.2
## NA's :9 NA's :6 NA's :6
## reanalysis_dew_point_temp_k reanalysis_max_air_temp_k
## Min. :289.6 Min. :297.8
## 1st Qu.:293.8 1st Qu.:300.4
## Median :295.5 Median :301.5
## Mean :295.1 Mean :301.4
## 3rd Qu.:296.4 3rd Qu.:302.4
## Max. :297.8 Max. :304.3
## NA's :6 NA's :6
## reanalysis_min_air_temp_k reanalysis_precip_amt_kg_per_m2
## Min. :292.6 Min. : 0.00
## 1st Qu.:296.3 1st Qu.: 10.82
## Median :297.5 Median : 21.30
## Mean :297.3 Mean : 30.47
## 3rd Qu.:298.4 3rd Qu.: 37.00
## Max. :299.9 Max. :570.50
## NA's :6 NA's :6
## reanalysis_relative_humidity_percent reanalysis_sat_precip_amt_mm
## Min. :66.74 Min. : 0.00
## 1st Qu.:76.25 1st Qu.: 0.00
## Median :78.67 Median : 20.80
## Mean :78.57 Mean : 35.47
## 3rd Qu.:80.96 3rd Qu.: 52.18
## Max. :87.58 Max. :390.60
## NA's :6 NA's :9
## reanalysis_specific_humidity_g_per_kg reanalysis_tdtr_k station_avg_temp_c
## Min. :11.72 Min. :1.357 Min. :22.84
## 1st Qu.:15.24 1st Qu.:2.157 1st Qu.:25.84
## Median :16.85 Median :2.457 Median :27.23
## Mean :16.55 Mean :2.516 Mean :27.01
## 3rd Qu.:17.86 3rd Qu.:2.800 3rd Qu.:28.19
## Max. :19.44 Max. :4.429 Max. :30.07
## NA's :6 NA's :6 NA's :6
## station_diur_temp_rng_c station_max_temp_c station_min_temp_c
## Min. :4.529 Min. :26.70 Min. :17.8
## 1st Qu.:6.200 1st Qu.:30.60 1st Qu.:21.7
## Median :6.757 Median :31.70 Median :22.8
## Mean :6.757 Mean :31.61 Mean :22.6
## 3rd Qu.:7.286 3rd Qu.:32.80 3rd Qu.:23.9
## Max. :9.914 Max. :35.60 Max. :25.6
## NA's :6 NA's :6 NA's :6
## station_precip_mm total_cases type
## Min. : 0.000 Min. : 0.00 Length:936
## 1st Qu.: 6.825 1st Qu.: 9.00 Class :character
## Median : 17.750 Median : 19.00 Mode :character
## Mean : 26.785 Mean : 34.18
## 3rd Qu.: 35.450 3rd Qu.: 37.00
## Max. :305.900 Max. :461.00
## NA's :6
# Missing value analysis
# Get percentage of missing value by column
sj_miss<- dplyr::select(sj_train,-c("city","year","weekofyear","week_start_date","total_cases","type"))
sj_miss<- apply(X = sj_miss, 2, function(x) sum(is.na(x))/nrow(sj_miss)) %>%
knitr:: kable() %>% kable_styling(bootstrap_options = "striped", full_width = F)
sj_miss
| x | |
|---|---|
| ndvi_ne | 0.2040598 |
| ndvi_nw | 0.0523504 |
| ndvi_se | 0.0202991 |
| ndvi_sw | 0.0202991 |
| precipitation_amt_mm | 0.0096154 |
| reanalysis_air_temp_k | 0.0064103 |
| reanalysis_avg_temp_k | 0.0064103 |
| reanalysis_dew_point_temp_k | 0.0064103 |
| reanalysis_max_air_temp_k | 0.0064103 |
| reanalysis_min_air_temp_k | 0.0064103 |
| reanalysis_precip_amt_kg_per_m2 | 0.0064103 |
| reanalysis_relative_humidity_percent | 0.0064103 |
| reanalysis_sat_precip_amt_mm | 0.0096154 |
| reanalysis_specific_humidity_g_per_kg | 0.0064103 |
| reanalysis_tdtr_k | 0.0064103 |
| station_avg_temp_c | 0.0064103 |
| station_diur_temp_rng_c | 0.0064103 |
| station_max_temp_c | 0.0064103 |
| station_min_temp_c | 0.0064103 |
| station_precip_mm | 0.0064103 |
sj_train <- cbind(sj_train[ , c("city", "type","week_start_date")] , apply(X = sj_train[, which(sapply(sj_train,is.numeric))], MARGIN = 2, function(x) na.locf(x,fromLast = TRUE)))
# # IQUITO
iq_train <- all %>% filter(city == "iq" & type == "train")
summary(iq_train)
## city year weekofyear week_start_date
## Length:520 Min. :2000 Min. : 1.00 Length:520
## Class :character 1st Qu.:2003 1st Qu.:13.75 Class :character
## Mode :character Median :2005 Median :26.50 Mode :character
## Mean :2005 Mean :26.50
## 3rd Qu.:2007 3rd Qu.:39.25
## Max. :2010 Max. :53.00
##
## ndvi_ne ndvi_nw ndvi_se ndvi_sw
## Min. :0.06173 Min. :0.03586 Min. :0.02988 Min. :0.06418
## 1st Qu.:0.20000 1st Qu.:0.17954 1st Qu.:0.19474 1st Qu.:0.20413
## Median :0.26364 Median :0.23297 Median :0.24980 Median :0.26214
## Mean :0.26387 Mean :0.23878 Mean :0.25013 Mean :0.26678
## 3rd Qu.:0.31997 3rd Qu.:0.29393 3rd Qu.:0.30230 3rd Qu.:0.32515
## Max. :0.50836 Max. :0.45443 Max. :0.53831 Max. :0.54602
## NA's :3 NA's :3 NA's :3 NA's :3
## precipitation_amt_mm reanalysis_air_temp_k reanalysis_avg_temp_k
## Min. : 0.00 Min. :294.6 Min. :294.9
## 1st Qu.: 39.10 1st Qu.:297.1 1st Qu.:298.2
## Median : 60.47 Median :297.8 Median :299.1
## Mean : 64.25 Mean :297.9 Mean :299.1
## 3rd Qu.: 85.76 3rd Qu.:298.6 3rd Qu.:300.1
## Max. :210.83 Max. :301.6 Max. :302.9
## NA's :4 NA's :4 NA's :4
## reanalysis_dew_point_temp_k reanalysis_max_air_temp_k
## Min. :290.1 Min. :300.0
## 1st Qu.:294.6 1st Qu.:305.2
## Median :295.9 Median :307.1
## Mean :295.5 Mean :307.1
## 3rd Qu.:296.5 3rd Qu.:308.7
## Max. :298.4 Max. :314.0
## NA's :4 NA's :4
## reanalysis_min_air_temp_k reanalysis_precip_amt_kg_per_m2
## Min. :286.9 Min. : 0.00
## 1st Qu.:292.0 1st Qu.: 24.07
## Median :293.1 Median : 46.44
## Mean :292.9 Mean : 57.61
## 3rd Qu.:294.2 3rd Qu.: 71.07
## Max. :296.0 Max. :362.03
## NA's :4 NA's :4
## reanalysis_relative_humidity_percent reanalysis_sat_precip_amt_mm
## Min. :57.79 Min. : 0.00
## 1st Qu.:84.30 1st Qu.: 39.10
## Median :90.92 Median : 60.47
## Mean :88.64 Mean : 64.25
## 3rd Qu.:94.56 3rd Qu.: 85.76
## Max. :98.61 Max. :210.83
## NA's :4 NA's :4
## reanalysis_specific_humidity_g_per_kg reanalysis_tdtr_k station_avg_temp_c
## Min. :12.11 Min. : 3.714 Min. :21.40
## 1st Qu.:16.10 1st Qu.: 7.371 1st Qu.:27.00
## Median :17.43 Median : 8.964 Median :27.60
## Mean :17.10 Mean : 9.207 Mean :27.53
## 3rd Qu.:18.18 3rd Qu.:11.014 3rd Qu.:28.10
## Max. :20.46 Max. :16.029 Max. :30.80
## NA's :4 NA's :4 NA's :37
## station_diur_temp_rng_c station_max_temp_c station_min_temp_c
## Min. : 5.20 Min. :30.1 Min. :14.7
## 1st Qu.: 9.50 1st Qu.:33.2 1st Qu.:20.6
## Median :10.62 Median :34.0 Median :21.3
## Mean :10.57 Mean :34.0 Mean :21.2
## 3rd Qu.:11.65 3rd Qu.:34.9 3rd Qu.:22.0
## Max. :15.80 Max. :42.2 Max. :24.2
## NA's :37 NA's :14 NA's :8
## station_precip_mm total_cases type
## Min. : 0.00 Min. : 0.000 Length:520
## 1st Qu.: 17.20 1st Qu.: 1.000 Class :character
## Median : 45.30 Median : 5.000 Mode :character
## Mean : 62.47 Mean : 7.565
## 3rd Qu.: 85.95 3rd Qu.: 9.000
## Max. :543.30 Max. :116.000
## NA's :16
# Missing value analysis
# Get percentage of missing value by column
iq_miss<- dplyr::select(iq_train,-c("city","year","weekofyear","week_start_date","total_cases","type"))
iq_miss<- apply(X = iq_miss, 2, function(x) sum(is.na(x))/nrow(iq_miss)) %>%
knitr:: kable() %>% kable_styling(bootstrap_options = "striped", full_width = F)
iq_miss
| x | |
|---|---|
| ndvi_ne | 0.0057692 |
| ndvi_nw | 0.0057692 |
| ndvi_se | 0.0057692 |
| ndvi_sw | 0.0057692 |
| precipitation_amt_mm | 0.0076923 |
| reanalysis_air_temp_k | 0.0076923 |
| reanalysis_avg_temp_k | 0.0076923 |
| reanalysis_dew_point_temp_k | 0.0076923 |
| reanalysis_max_air_temp_k | 0.0076923 |
| reanalysis_min_air_temp_k | 0.0076923 |
| reanalysis_precip_amt_kg_per_m2 | 0.0076923 |
| reanalysis_relative_humidity_percent | 0.0076923 |
| reanalysis_sat_precip_amt_mm | 0.0076923 |
| reanalysis_specific_humidity_g_per_kg | 0.0076923 |
| reanalysis_tdtr_k | 0.0076923 |
| station_avg_temp_c | 0.0711538 |
| station_diur_temp_rng_c | 0.0711538 |
| station_max_temp_c | 0.0269231 |
| station_min_temp_c | 0.0153846 |
| station_precip_mm | 0.0307692 |
plotNA.distribution(iq_train$ndvi_ne)
plotNA.distributionBar(iq_train$ndvi_ne, breaks = 20)
iq_train <- cbind(iq_train[ , c("city", "type","week_start_date")] , apply(X = iq_train[, which(sapply(iq_train,is.numeric))], MARGIN = 2, function(x) na.locf(x,fromLast = TRUE)))
## Total cases across cities
p1<- ggplot(sj_train, aes(x=as.Date(week_start_date), y = total_cases, group = 1)) + geom_line(color="blue") +
xlab("Week Start Date") + ylab("Total Cases") + ggtitle("Case Trend for San Juan") +
theme(axis.text.x = element_text(),axis.text.y = element_text()) +
theme_minimal()
p2<- ggplot(iq_train, aes(x=as.Date(week_start_date), y = total_cases, group = 1)) + geom_line(color="red") +
xlab("Week Start Date") + ylab("Total Cases") + ggtitle("Case Trend for Iquito") +
theme(axis.text.x = element_text(),axis.text.y = element_text()) +
theme_minimal()
grid.arrange(p1,p2,nrow =2)
## Modelling San Juan
#sj_train$city <- NULL
sj_train$weekofyear <- as.character(sj_train$weekofyear)
sj_train$year <- as.character(sj_train$year)
## train model to predict Test (All train)
# Get the best mtry train a random forest using that mtry
preprocessparams <- preProcess(x = select(sj_train[ , which(sapply(sj_train,is.numeric))], -total_cases) ,method = c("center", "scale"))
sj_train <- predict(preprocessparams, sj_train)
sj_train$type <- NULL
rf_reg <- tuneRF(x = sj_train[ , -which(names(sj_train) == "total_cases")], y = sj_train$total_cases, ntreeTry=100,stepFactor=2,improve=0.05, doBest = T)
## mtry = 8 OOB error = 626.2102
## Searching left ...
## mtry = 4 OOB error = 820.3542
## -0.3100302 0.05
## Searching right ...
## mtry = 16 OOB error = 495.1907
## 0.2092261 0.05
## mtry = 24 OOB error = 538.3619
## -0.08718098 0.05
sj_train$pred <- predict(rf_reg, sj_train)
postResample(sj_train$pred, sj_train$total_cases)
## RMSE Rsquared MAE
## 9.8591277 0.9694185 5.3541332
ggplot(data = sj_train, aes(x = as.Date(week_start_date))) + geom_line(aes(y = total_cases, color = "real")) + geom_line(aes(y = pred, color = "pred" ))
sj_train$pred <- NULL
sj_train$type <- "train"
1 with station_precip lag of 1 RMSE Rsquared MAE 9.510456 0.972371 5.251435
with station_precip + station_avg lag of 1 RMSE Rsquared MAE 9.633140 0.968891 5.086512
with station_precip + station_avg + ndvi_ne lag of 1 RMSE Rsquared MAE 9.8423416 0.9692057 5.2444620
No lagged vars RMSE Rsquared MAE 9.6823447 0.9703827 5.1047858
sj_te <- all %>% filter(city == "sj" & type == "test")
# Missing value analysis
# Get percentage of missing value by column
sjte_miss<- dplyr::select(sj_te,-c("city","year","weekofyear","week_start_date","total_cases","type"))
sjte_miss<- apply(X = sjte_miss, 2, function(x) sum(is.na(x))/nrow(sjte_miss)) %>%
knitr:: kable() %>% kable_styling(bootstrap_options = "striped", full_width = F)
sjte_miss
| x | |
|---|---|
| ndvi_ne | 0.1653846 |
| ndvi_nw | 0.0423077 |
| ndvi_se | 0.0038462 |
| ndvi_sw | 0.0038462 |
| precipitation_amt_mm | 0.0076923 |
| reanalysis_air_temp_k | 0.0076923 |
| reanalysis_avg_temp_k | 0.0076923 |
| reanalysis_dew_point_temp_k | 0.0076923 |
| reanalysis_max_air_temp_k | 0.0076923 |
| reanalysis_min_air_temp_k | 0.0076923 |
| reanalysis_precip_amt_kg_per_m2 | 0.0076923 |
| reanalysis_relative_humidity_percent | 0.0076923 |
| reanalysis_sat_precip_amt_mm | 0.0076923 |
| reanalysis_specific_humidity_g_per_kg | 0.0076923 |
| reanalysis_tdtr_k | 0.0076923 |
| station_avg_temp_c | 0.0076923 |
| station_diur_temp_rng_c | 0.0076923 |
| station_max_temp_c | 0.0076923 |
| station_min_temp_c | 0.0076923 |
| station_precip_mm | 0.0076923 |
##The missing values will be replaced by last values
sj_te <- cbind(sj_te[ , c("city", "type", "week_start_date")] , apply(X = sj_te[, which(sapply(sj_te,is.numeric))], MARGIN = 2, function(x) na.locf(x,fromLast = TRUE)))
## Apply parameters and predict
#sj_te <- predict(preprocessparams, sj_te)
sj_te$total_cases <- predict(rf_reg , sj_te)
ggplot(sj_te, aes(as.Date(week_start_date), total_cases, group=1)) + geom_line()
sj_train$year <- as.numeric(sj_train$year)
sj_train$weekofyear <- as.numeric(sj_train$weekofyear)
ggplot(data = union_all(sj_train, sj_te), aes(x = as.Date(week_start_date), y = total_cases, color = type, group =1)) + geom_line()
iq_train$weekofyear <- as.character(iq_train$weekofyear)
#iq_train$city <- NULL
iq_train$year <- as.character(iq_train$year)
# train model to predict Test (All train)
preprocessparams <- preProcess(x = select(iq_train[ , which(sapply(iq_train,is.numeric))], -total_cases),method = c("center", "scale"))
iq_train <- predict(preprocessparams, iq_train)
rf_reg <- tuneRF(x = iq_train[ , -which(colnames(iq_train)=="total_cases")], iq_train$total_cases, ntreeTry=100,stepFactor=2,improve=0.05,trace=FALSE, doBest = T)
## -0.001493603 0.05
## 0.03296734 0.05
iq_train$pred <- predict(rf_reg, iq_train)
postResample(iq_train$pred, iq_train$total_cases)
## RMSE Rsquared MAE
## 4.1436702 0.9135643 2.1860947
ggplot(data = iq_train, aes(x = as.Date(week_start_date))) + geom_line(aes(y = total_cases, color = "real")) + geom_line(aes(y = pred, color = "pred" ))
iq_train$pred <- NULL
iq_te <- all %>% filter(city == "iq" & type == "test")
# Missing value analysis
# Get percentage of missing value by column
iqte_miss<- dplyr::select(iq_te,-c("city","year","weekofyear","week_start_date","total_cases","type"))
iqte_miss<- apply(X = iqte_miss, 2, function(x) sum(is.na(x))/nrow(iqte_miss)) %>%
knitr:: kable() %>% kable_styling(bootstrap_options = "striped", full_width = F)
iqte_miss
| x | |
|---|---|
| ndvi_ne | 0.0000000 |
| ndvi_nw | 0.0000000 |
| ndvi_se | 0.0000000 |
| ndvi_sw | 0.0000000 |
| precipitation_amt_mm | 0.0000000 |
| reanalysis_air_temp_k | 0.0000000 |
| reanalysis_avg_temp_k | 0.0000000 |
| reanalysis_dew_point_temp_k | 0.0000000 |
| reanalysis_max_air_temp_k | 0.0000000 |
| reanalysis_min_air_temp_k | 0.0000000 |
| reanalysis_precip_amt_kg_per_m2 | 0.0000000 |
| reanalysis_relative_humidity_percent | 0.0000000 |
| reanalysis_sat_precip_amt_mm | 0.0000000 |
| reanalysis_specific_humidity_g_per_kg | 0.0000000 |
| reanalysis_tdtr_k | 0.0000000 |
| station_avg_temp_c | 0.0641026 |
| station_diur_temp_rng_c | 0.0641026 |
| station_max_temp_c | 0.0064103 |
| station_min_temp_c | 0.0448718 |
| station_precip_mm | 0.0192308 |
## The missing values will be replaced by a forecast
iq_te <- cbind(iq_te[ , c("city", "type", "week_start_date")] , apply(X = iq_te[, which(sapply(iq_te,is.numeric))], MARGIN = 2, function(x) na.locf(x,fromLast = TRUE)))
iq_te$weekofyear <- as.character(iq_te$weekofyear)
#iq_te$city <- NULL
iq_te$year <- as.character(iq_te$year)
iq_te <- predict(preprocessparams, iq_te)
iq_te$total_cases <- predict(rf_reg, iq_te)
iq_train$year <- as.numeric(iq_train$year)
iq_train$weekofyear <- as.numeric(iq_train$weekofyear)
iq_te$year <- as.numeric(iq_te$year)
iq_te$weekofyear <- as.numeric(iq_te$weekofyear)
#names(iq_train)
#names(iq_te)
ggplot(data = union_all(iq_train, iq_te), aes(x = as.Date(week_start_date), y = total_cases, color = type)) + geom_line()
iq_te$city <- "iq"
iq_te$type <- "test"
rf_final_df <- rbind(sj_te[ , c("city", "type", "total_cases","weekofyear", "week_start_date", "year")],
iq_te[ , c("city", "type", "total_cases","weekofyear", "week_start_date", "year")])
submission_rf <- full_join(x = submission_format, y = rf_final_df, by = c("year","weekofyear", "city"))
data.table::setnames(submission_rf, old = "total_cases.y", "total_cases")
submission_rf$total_cases <- base::round(submission_rf$total_cases,0) # Round
ggplot(submission_rf, aes(as.Date(week_start_date), total_cases, group=city)) + geom_line() + facet_wrap(city~.)
submission_rf <- submission_rf %>% select(city, year, weekofyear, total_cases)
write.csv(submission_rf, "submission_rf.csv", quote = F, row.names = F)
First Submission with Random Forest:SUBMISSION SCORE: 28.9904
sj_train %>%
dplyr::select(-c( "city","type", "year", "week_start_date")) %>%
cor(use = 'pairwise.complete.obs') -> M1
corrplot(M1, type = "lower", order = "hclust", cl.align = "r", tl.col = "black", addCoef.col = "white", tl.cex = .7, number.cex = .6,col = brewer.pal(n=8, name="RdBu"), insig = "blank",diag = FALSE)
iq_train %>%
dplyr::select(-c("city", "type", "year", "week_start_date")) %>%
cor(use = 'pairwise.complete.obs') -> M2
corrplot(M2, type = "lower", order = "hclust", cl.align = "r", tl.col = "black", addCoef.col = "white", tl.cex = .7, number.cex = .6,col = brewer.pal(n=8, name="RdBu"), insig = "blank",diag = FALSE)
Remove [5] “ndvi_ne”
Remove [6] “ndvi_nw”
Remove [7] “ndvi_se”
Keep [8] “ndvi_sw” Remove [9] “precipitation_amt_mm”
Remove, mutate avg with station_precip_mm
Remove [10] “reanalysis_air_temp_k”
Remove [11] “reanalysis_avg_temp_k”
Remove [12] “reanalysis_dew_point_temp_k” 100% corr with specific_humidity
Remove temps in Kelvin: [13] “reanalysis_max_air_temp_k”
Remove temps in Kelvin: [14] “reanalysis_min_air_temp_k”
Remove: [15] “reanalysis_precip_amt_kg_per_m2”
Remove [16] “reanalysis_relative_humidity_percent” Remove [17] “reanalysis_sat_precip_amt_mm”
Keep [18] “reanalysis_specific_humidity_g_per_kg” Remove [19] “reanalysis_tdtr_k”
Keep [20] “station_avg_temp_c”
Remove [21] “station_diur_temp_rng_c”
Remove [22] “station_max_temp_c”
Remove [23] “station_min_temp_c”
Remove [24] “station_precip_mm” (average w/ precipitation_amt_mm)
sj_train$avg_precipitation<- (sj_train$precipitation_amt_mm+sj_train$station_precip_mm+ sj_train$reanalysis_sat_precip_amt_mm)/3
sj_tr<- sj_train %>% #group_by(city,year, week_start_date) %>%
mutate(avg_temp_c = station_avg_temp_c,
humidity_g_per_kg = reanalysis_specific_humidity_g_per_kg,
ndvi_ne = NULL,
ndvi_nw = NULL,
ndvi_se = NULL,
reanalysis_avg_temp_k = NULL,
reanalysis_air_temp_k = NULL,
station_precip_mm = NULL,
precipitation_amt_mm = NULL,
reanalysis_air_temp_k = NULL,
reanalysis_avg_temp_k = NULL,
reanalysis_dew_point_temp_k = NULL,
reanalysis_max_air_temp_k = NULL,
reanalysis_min_air_temp_k = NULL,
reanalysis_precip_amt_kg_per_m2 = NULL,
reanalysis_relative_humidity_percent = NULL,
reanalysis_sat_precip_amt_mm = NULL,
reanalysis_tdtr_k = NULL,
station_diur_temp_rng_c = NULL,
station_max_temp_c = NULL,
station_min_temp_c = NULL,
station_avg_temp_c = NULL,
reanalysis_specific_humidity_g_per_kg = NULL,
station_precip_mm = NULL)
names(sj_tr)
## [1] "city" "week_start_date" "year"
## [4] "weekofyear" "ndvi_sw" "total_cases"
## [7] "type" "avg_precipitation" "avg_temp_c"
## [10] "humidity_g_per_kg"
iq_train$avg_precipitation<- (iq_train$precipitation_amt_mm+iq_train$station_precip_mm+ iq_train$reanalysis_sat_precip_amt_mm)/3
iq_tr<- iq_train %>% #group_by(city,year, week_start_date) %>%
mutate(avg_temp_c = station_avg_temp_c,
humidity_g_per_kg = reanalysis_specific_humidity_g_per_kg,
ndvi_ne = NULL,
ndvi_nw = NULL,
ndvi_se = NULL,
reanalysis_avg_temp_k = NULL,
reanalysis_air_temp_k = NULL,
station_precip_mm = NULL,
precipitation_amt_mm = NULL,
reanalysis_air_temp_k = NULL,
reanalysis_avg_temp_k = NULL,
reanalysis_dew_point_temp_k = NULL,
reanalysis_max_air_temp_k = NULL,
reanalysis_min_air_temp_k = NULL,
reanalysis_precip_amt_kg_per_m2 = NULL,
reanalysis_relative_humidity_percent = NULL,
reanalysis_sat_precip_amt_mm = NULL,
reanalysis_tdtr_k = NULL,
station_diur_temp_rng_c = NULL,
station_max_temp_c = NULL,
station_min_temp_c = NULL,
station_avg_temp_c = NULL,
reanalysis_specific_humidity_g_per_kg = NULL,
station_precip_mm = NULL)
names(iq_tr)
## [1] "city" "type" "week_start_date"
## [4] "year" "weekofyear" "ndvi_sw"
## [7] "total_cases" "avg_precipitation" "avg_temp_c"
## [10] "humidity_g_per_kg"
sj_tr$city <- NULL
sj_tr$weekofyear <- as.character(sj_tr$weekofyear)
sj_tr$year <- as.character(sj_tr$year)
sj_tr$type <- NULL
## train model to predict Test (All train)
## Get the best mtry train a random forest using that mtry
#sj_preprocparams <- preProcess(x = select(sj_tr[ , which(sapply(sj_tr,is.numeric))], -total_cases) ,method = c("center", "scale"))
#sj_tr <- predict(sj_preprocparams, sj_tr)
sj_tr$avg_precipitation <- dplyr::lag(sj_tr$avg_precipitation, 1)
#Plots of total cases to the temp/hum variables do not show any particular relationship
#plot(sj_tr$avg_precipitation,sj_tr$total_cases)
#plot(sj_tr$avg_ndvi,sj_tr$total_cases)
#plot(sj_tr$avg_temp_c,sj_tr$total_cases)
#plot(sj_tr$humidity_g_per_kg,sj_tr$total_cases)
sj_tr[1,6] = sj_tr[53,6]
#sj_tr <- sj_tr[complete.cases(sj_tr), ]
rf_reg <- tuneRF(x = sj_tr[ , -which(names(sj_tr) == "total_cases")], y = sj_tr$total_cases, ntreeTry=100,stepFactor=2,improve=0.05, doBest = T)
## mtry = 2 OOB error = 507.1013
## Searching left ...
## mtry = 1 OOB error = 824.093
## -0.6251052 0.05
## Searching right ...
## mtry = 4 OOB error = 328.6548
## 0.3518953 0.05
## mtry = 7 OOB error = 347.3086
## -0.05675813 0.05
sj_tr$pred <- predict(rf_reg, sj_tr)
sjtr_pred<- sj_tr
postResample(sj_tr$pred, sj_tr$total_cases)
## RMSE Rsquared MAE
## 8.2003435 0.9806783 4.3406866
ggplot(data = sj_tr, aes(x = as.Date(week_start_date))) + geom_line(aes(y = total_cases, color = "real")) + geom_line(aes(y = pred, color = "pred" ))
sj_tr$pred <- NULL
sj_tr$type <- "train"
sj_te <- all %>% filter(city == "sj" & type == "test")
##The missing values will be replaced by fromLast:
sj_te <- cbind(sj_te[ , c("city", "type", "week_start_date")] , apply(X = sj_te[, which(sapply(sj_te,is.numeric))], MARGIN = 2, function(x) na.locf(x,fromLast = TRUE)))
sj_te$avg_precipitation<- (sj_te$precipitation_amt_mm+sj_te$station_precip_mm+ sj_te$reanalysis_sat_precip_amt_mm)/3
sj_test <- sj_te %>% #group_by(city,year, week_start_date) %>%
mutate(avg_temp_c = station_avg_temp_c,
humidity_g_per_kg = reanalysis_specific_humidity_g_per_kg,
ndvi_ne = NULL,
ndvi_nw = NULL,
ndvi_se = NULL,
reanalysis_avg_temp_k = NULL,
reanalysis_air_temp_k = NULL,
station_precip_mm = NULL,
precipitation_amt_mm = NULL,
reanalysis_air_temp_k = NULL,
reanalysis_avg_temp_k = NULL,
reanalysis_dew_point_temp_k = NULL,
reanalysis_max_air_temp_k = NULL,
reanalysis_min_air_temp_k = NULL,
reanalysis_precip_amt_kg_per_m2 = NULL,
reanalysis_relative_humidity_percent = NULL,
reanalysis_sat_precip_amt_mm = NULL,
reanalysis_tdtr_k = NULL,
station_diur_temp_rng_c = NULL,
station_max_temp_c = NULL,
station_min_temp_c = NULL,
station_avg_temp_c = NULL,
reanalysis_specific_humidity_g_per_kg = NULL,
station_precip_mm = NULL)
## Lag SJ Precipitation
sj_test$avg_precipitation <- dplyr::lag(sj_test$avg_precipitation, 1)
sj_test[1,8] = sj_train[936,27]
#sj_test <- sj_test[complete.cases(sj_test), ]
## Apply parameters and predict
#sj_test <- predict(sj_preprocparams, sj_test)
sj_test$total_cases <- predict(rf_reg , sj_test)
ggplot(sj_test, aes(as.Date(week_start_date), total_cases, group=1)) + geom_line()
sj_tr$year <- as.numeric(sj_tr$year)
sj_tr$weekofyear <- as.numeric(sj_tr$weekofyear)
ggplot(data = union_all(sj_tr, sj_test), aes(x = as.Date(week_start_date), y = total_cases, color = type)) + geom_line()
iq_tr$weekofyear <- as.character(iq_tr$weekofyear)
iq_tr$city <- NULL
iq_tr$type <- NULL
iq_tr$year <- as.character(iq_tr$year)
## Get the best mtry train a random forest using that mtry
#iq_preprocparams <- preProcess(x = select(iq_tr[ , which(sapply(iq_tr,is.numeric))], -total_cases),method = c("center", "scale"))
#iq_tr <- predict(iq_preprocparams, iq_tr)
iq_tr$avg_precipitation <- dplyr::lag(iq_tr$avg_precipitation, 1)
iq_tr[1,6] = iq_tr[53,6]
#iq_tr <- iq_tr[complete.cases(iq_tr), ]
rf_reg <- tuneRF(x = iq_tr[ , -which(names(iq_tr) =="total_cases")], iq_tr$total_cases, ntreeTry=100,stepFactor=2,improve=0.05,trace=FALSE, doBest = T)
## -0.163319 0.05
## -0.009337287 0.05
iq_tr$pred <- predict(rf_reg, iq_tr)
iqtr_pred<- iq_tr
postResample(iq_tr$pred, iq_tr$total_cases)
## RMSE Rsquared MAE
## 4.2498241 0.9085896 2.1890413
ggplot(data = iq_tr, aes(x = as.Date(week_start_date))) + geom_line(aes(y = total_cases, color = "real")) + geom_line(aes(y = pred, color = "pred" ))
iq_tr$pred <- NULL
iq_tr$type <- "train"
## Make final prediction for Iquito (test)
iq_te <- all %>% filter(city == "iq" & type == "test")
## The missing values will be replaced by fromLast
iq_te <- cbind(iq_te[ , c("city", "type", "week_start_date")] , apply(X = iq_te[, which(sapply(iq_te,is.numeric))], MARGIN = 2, function(x) na.locf(x,fromLast = TRUE)))
iq_te$avg_precipitation<- (iq_te$precipitation_amt_mm+iq_te$station_precip_mm+ iq_te$reanalysis_sat_precip_amt_mm)/3
iq_test <- iq_te %>% #group_by(city,year, week_start_date) %>%
mutate(avg_temp_c = station_avg_temp_c,
humidity_g_per_kg = reanalysis_specific_humidity_g_per_kg,
ndvi_ne = NULL,
ndvi_nw = NULL,
ndvi_se = NULL,
reanalysis_avg_temp_k = NULL,
reanalysis_air_temp_k = NULL,
station_precip_mm = NULL,
precipitation_amt_mm = NULL,
reanalysis_air_temp_k = NULL,
reanalysis_avg_temp_k = NULL,
reanalysis_dew_point_temp_k = NULL,
reanalysis_max_air_temp_k = NULL,
reanalysis_min_air_temp_k = NULL,
reanalysis_precip_amt_kg_per_m2 = NULL,
reanalysis_relative_humidity_percent = NULL,
reanalysis_sat_precip_amt_mm = NULL,
reanalysis_tdtr_k = NULL,
station_diur_temp_rng_c = NULL,
station_max_temp_c = NULL,
station_min_temp_c = NULL,
station_avg_temp_c = NULL,
reanalysis_specific_humidity_g_per_kg = NULL,
station_precip_mm = NULL)
## Lag IQ Precipitation
iq_test$avg_precipitation <- dplyr::lag(iq_test$avg_precipitation, 1)
iq_test[1,8] = iq_train[520,27]
#iq_test <- iq_test[complete.cases(iq_test), ]
## Apply parameters and predict
#iq_test <- predict(iq_preprocparams, iq_test)
iq_test$total_cases <- predict(rf_reg, iq_test)
ggplot(iq_test, aes(as.Date(week_start_date), total_cases, group=1)) + geom_line()
iq_tr$year <- as.numeric(iq_tr$year)
iq_tr$weekofyear <- as.numeric(iq_tr$weekofyear)
ggplot(data = union_all(iq_tr, iq_test), aes(x = as.Date(week_start_date), y = total_cases, color = type)) + geom_line()
#iq_test$weekofyear <- as.character(iq_test$weekofyear)
#iq_test$weekofyear <- as.numeric(iq_test$weekofyear)
iq_test$city <- "iq"
iq_test$type <- "test"
rf_final_df2 <- rbind(sj_test[ , c("city", "type", "total_cases","weekofyear", "week_start_date", "year")],
iq_test[ , c("city", "type", "total_cases","weekofyear", "week_start_date", "year")])
submission_rf2 <- full_join(x = submission_format, y = rf_final_df2, by = c("year","weekofyear", "city"))
data.table::setnames(submission_rf2, old = "total_cases.y", "total_cases")
submission_rf2$total_cases <- base::round(submission_rf2$total_cases,0) # Round
ggplot(submission_rf2, aes(as.Date(week_start_date), total_cases, group=city)) + geom_line() + facet_wrap(city~.)
submission_rf2 <- submission_rf2 %>% select(city, year, weekofyear, total_cases)
write.csv(submission_rf2, "submission_rf2.csv", quote = F, row.names = F)
SUBMISSION SCORE #2: 27.1034
sjtot=ts(sj_tr$total_cases, frequency=52, start=c(1990,04,30))
sjtotplot<- window(sjtot, start= 2000, end = 2008)
ggseasonplot(sjtotplot, polar=TRUE) + ylab("Cases") +
ggtitle("Polar seasonal plot: San Juan Dengue Fever Cases") + theme_bw()
iqtot=ts(iq_tr$total_cases, frequency=52, start=c(2000,07,01))
iqtotplot<- window(iqtot, start= 2000, end = 2008)
ggseasonplot(iqtotplot, polar=TRUE) + ylab("Cases") +
ggtitle("Polar seasonal plot: Iquito Dengue Fever Cases") + theme_bw()
autoplot(sjtot) + ylab("Weeks") +
xlab("Cases") + ggtitle("San Juan: Total Cases over Training Period")
autoplot(iqtot) + ylab("Weeks") +
xlab("Cases") + ggtitle("Iquito: Total Cases over Training Period")
acf(sjtot)
pacf(sjtot)
tsdisplay(sjtot)
autoplot(diff(sjtot,1))
acf(iqtot)
pacf(iqtot)
tsdisplay(iqtot)
autoplot(diff(iqtot,1))
exvar_sj<- as.matrix(sj_tr[,c("weekofyear","ndvi_sw","avg_precipitation" , "avg_temp_c" ,"humidity_g_per_kg")])
exvar_sjtest<- as.matrix(sj_test[,c("weekofyear","ndvi_sw","avg_precipitation" , "avg_temp_c" ,"humidity_g_per_kg")])
(f1=auto.arima(sjtot, xreg =exvar_sj))
## Series: sjtot
## Regression with ARIMA(1,1,1) errors
##
## Coefficients:
## ar1 ma1 weekofyear ndvi_sw avg_precipitation avg_temp_c
## 0.7071 -0.5895 -0.0007 -0.2307 0.4248 1.2978
## s.e. 0.0965 0.1093 0.0608 0.3237 0.3701 1.0526
## humidity_g_per_kg
## 1.0069
## s.e. 1.0387
##
## sigma^2 estimated as 180.5: log likelihood=-3752.11
## AIC=7520.21 AICc=7520.37 BIC=7558.94
cbind("Regression Errors" = residuals(f1, type="regression"),
"ARIMA errors" = residuals(f1, type="innovation")) %>%
autoplot(facets=TRUE)
checkresiduals(f1)
##
## Ljung-Box test
##
## data: Residuals from Regression with ARIMA(1,1,1) errors
## Q* = 90.045, df = 97, p-value = 0.6785
##
## Model df: 7. Total lags used: 104
fc_sj=forecast(f1, h = 260, xreg = exvar_sjtest)
autoplot(fc_sj)
exvar_iq<- as.matrix(iq_tr[,c("weekofyear","ndvi_sw","avg_precipitation" , "avg_temp_c" ,"humidity_g_per_kg")])
exvar_iqtest<- as.matrix(iq_test[,c("weekofyear","ndvi_sw","avg_precipitation", "avg_temp_c" ,"humidity_g_per_kg")])
(f2=auto.arima(iqtot,xreg = exvar_iq))
## Series: iqtot
## Regression with ARIMA(3,0,0) errors
##
## Coefficients:
## ar1 ar2 ar3 intercept weekofyear ndvi_sw
## 0.6944 -0.0720 0.1961 9.4552 -0.0766 0.6362
## s.e. 0.0432 0.0535 0.0434 1.9829 0.0415 0.2704
## avg_precipitation avg_temp_c humidity_g_per_kg
## -0.2527 0.3630 -0.4703
## s.e. 0.3343 0.3264 0.4322
##
## sigma^2 estimated as 48.59: log likelihood=-1743.46
## AIC=3506.92 AICc=3507.36 BIC=3549.46
cbind("Regression Errors" = residuals(f2, type="regression"),
"ARIMA errors" = residuals(f2, type="innovation")) %>%
autoplot(facets=TRUE)
checkresiduals(f2)
##
## Ljung-Box test
##
## data: Residuals from Regression with ARIMA(3,0,0) errors
## Q* = 81.259, df = 95, p-value = 0.8415
##
## Model df: 9. Total lags used: 104
fc_iq=forecast(f2, h = 156, xreg = exvar_iqtest)
autoplot(fc_iq)
sj_te <- all %>% filter(city == “sj” & type == “test”)
##The missing values will be replaced by a forecast sj_te <- cbind(sj_te[ , c(“city”, “type”, “week_start_date”)] , apply(X = sj_te[, which(sapply(sj_te,is.numeric))], MARGIN = 2, function(x) na.locf(x,fromLast = TRUE)))
sj_te <- sj_te %>% group_by(city,year, week_start_date) %>%
mutate(avg_precipitation = mean(station_precip_mm, precipitation_amt_mm,reanalysis_sat_precip_amt_mm, na.rm = T, trim = 0), avg_ndvi = mean(ndvi_ne,ndvi_nw, ndvi_se, ndvi_sw, na.rm = T, trim = 0), avg_temp_c = station_avg_temp_c, humidity_g_per_kg = reanalysis_specific_humidity_g_per_kg, ndvi_ne = NULL, ndvi_nw = NULL, ndvi_se = NULL, ndvi_sw = NULL,
reanalysis_avg_temp_k = NULL, reanalysis_air_temp_k = NULL, station_precip_mm = NULL, precipitation_amt_mm = NULL, reanalysis_air_temp_k = NULL, reanalysis_avg_temp_k = NULL, reanalysis_dew_point_temp_k = NULL, reanalysis_max_air_temp_k = NULL, reanalysis_min_air_temp_k = NULL, reanalysis_precip_amt_kg_per_m2 = NULL, reanalysis_relative_humidity_percent = NULL, reanalysis_sat_precip_amt_mm = NULL, reanalysis_tdtr_k = NULL, station_diur_temp_rng_c = NULL, station_max_temp_c = NULL,
station_min_temp_c = NULL, station_avg_temp_c = NULL, reanalysis_specific_humidity_g_per_kg = NULL, station_precip_mm = NULL)
(fit_sj <- Arima(sjtot, order=c(2,0,1), seasonal=c(0,0,0), xreg =exvar_sj ))
## Series: sjtot
## Regression with ARIMA(2,0,1) errors
##
## Coefficients:
## ar1 ar2 ma1 intercept weekofyear ndvi_sw
## 1.8474 -0.8621 -0.7800 34.3293 -0.0165 -0.2214
## s.e. 0.0387 0.0368 0.0504 6.5680 0.0608 0.3240
## avg_precipitation avg_temp_c humidity_g_per_kg
## 0.4180 1.3613 0.8754
## s.e. 0.3702 1.0528 1.0397
##
## sigma^2 estimated as 172.7: log likelihood=-3735.91
## AIC=7491.82 AICc=7492.06 BIC=7540.24
fcast_sj=forecast(fit_sj, h = 260, xreg = exvar_sjtest)
autoplot(fcast_sj) + xlab("Year") +
ylab("Cases")
tsdisplay(residuals(fit_sj))
Box.test(residuals(fcast_sj),fitdf=7, lag=151, type="Ljung")
##
## Box-Ljung test
##
## data: residuals(fcast_sj)
## X-squared = 107.77, df = 144, p-value = 0.9894
(fit_sj2 <- Arima(sjtot, order=c(1,2,1), seasonal=c(0,0,0), xreg =exvar_sj ))
## Series: sjtot
## Regression with ARIMA(1,2,1) errors
##
## Coefficients:
## ar1 ma1 weekofyear ndvi_sw avg_precipitation avg_temp_c
## 0.1359 -1.0000 0.0283 -0.2766 0.3883 1.1983
## s.e. 0.0326 0.0029 0.0610 0.3220 0.3723 1.0615
## humidity_g_per_kg
## 1.0907
## s.e. 1.0465
##
## sigma^2 estimated as 182.3: log likelihood=-3756.15
## AIC=7528.3 AICc=7528.46 BIC=7567.02
fcast_sj2=forecast(fit_sj2, h = 260, xreg = exvar_sjtest)
autoplot(fcast_sj2) + xlab("Year") +
ylab("Cases")
tsdisplay(residuals(fit_sj2))
Box.test(residuals(fcast_sj2),fitdf=7, lag=151, type="Ljung")
##
## Box-Ljung test
##
## data: residuals(fcast_sj2)
## X-squared = 149.65, df = 144, p-value = 0.3565
iq_te <- all %>% filter(city == “iq” & type == “test”)
##The missing values will be replaced by a forecast iq_te <- cbind(iq_te[ , c(“city”, “type”, “week_start_date”)] , apply(X = iq_te[, which(sapply(iq_te,is.numeric))], MARGIN = 2, function(x) na.locf(x,fromLast = TRUE)))
iq_te <- iq_te %>% group_by(city,year, week_start_date) %>%
mutate(avg_precipitation = mean(station_precip_mm, precipitation_amt_mm,reanalysis_sat_precip_amt_mm, na.rm = T, trim = 0), avg_ndvi = mean(ndvi_ne,ndvi_nw, ndvi_se, ndvi_sw, na.rm = T, trim = 0), avg_temp_c = station_avg_temp_c, humidity_g_per_kg = reanalysis_specific_humidity_g_per_kg, ndvi_ne = NULL, ndvi_nw = NULL, ndvi_se = NULL, ndvi_sw = NULL,
reanalysis_avg_temp_k = NULL, reanalysis_air_temp_k = NULL, station_precip_mm = NULL, precipitation_amt_mm = NULL, reanalysis_air_temp_k = NULL, reanalysis_avg_temp_k = NULL, reanalysis_dew_point_temp_k = NULL, reanalysis_max_air_temp_k = NULL, reanalysis_min_air_temp_k = NULL, reanalysis_precip_amt_kg_per_m2 = NULL, reanalysis_relative_humidity_percent = NULL, reanalysis_sat_precip_amt_mm = NULL, reanalysis_tdtr_k = NULL, station_diur_temp_rng_c = NULL, station_max_temp_c = NULL,
station_min_temp_c = NULL, station_avg_temp_c = NULL, reanalysis_specific_humidity_g_per_kg = NULL, station_precip_mm = NULL)
(fit_iq <- Arima(iqtot, order=c(1,0,4), seasonal=c(0,0,0), xreg =exvar_iq ))
## Series: iqtot
## Regression with ARIMA(1,0,4) errors
##
## Coefficients:
## ar1 ma1 ma2 ma3 ma4 intercept weekofyear ndvi_sw
## 0.7795 -0.0838 -0.0967 0.0935 0.1344 9.3419 -0.0706 0.6043
## s.e. 0.0610 0.0731 0.0615 0.0532 0.0542 1.8097 0.0422 0.2716
## avg_precipitation avg_temp_c humidity_g_per_kg
## -0.3761 0.3001 -0.3601
## s.e. 0.3423 0.3238 0.4441
##
## sigma^2 estimated as 48.45: log likelihood=-1741.75
## AIC=3507.51 AICc=3508.12 BIC=3558.55
fcast_iq=forecast(fit_iq, h = 156, xreg = exvar_iqtest)
autoplot(fcast_iq) + xlab("Year") +
ylab("Cases")
tsdisplay(residuals(fit_iq))
Box.test(residuals(fit_iq),fitdf=7, lag=151, type="Ljung")
##
## Box-Ljung test
##
## data: residuals(fit_iq)
## X-squared = 99.443, df = 144, p-value = 0.9982
xreg_arima_sj_sol <- data.frame(submission_format[1:260,-4], total_cases = round(fcast_sj$mean))
xreg_arima_iq_sol <- data.frame(submission_format[261:416,-4], total_cases =round(fcast_iq$mean))
xreg_arima_solution <- bind_rows(xreg_arima_sj_sol,xreg_arima_iq_sol)
xreg_arima_solution$total_cases[xreg_arima_solution$total_cases < 0 ] <- 0
write.csv(xreg_arima_solution, file = 'xreg_arima_submission.csv', row.names = F)
xreg_arima_iq_sol2 <- data.frame(submission_format[261:416,-4], total_cases =round(fc_iq$mean))
xreg_arima_solution2 <- bind_rows(xreg_arima_sj_sol,xreg_arima_iq_sol2)
xreg_arima_solution2$total_cases[xreg_arima_solution2$total_cases < 0 ] <- 0
write.csv(xreg_arima_solution2, file = 'xreg_arima_submission2.csv', row.names = F)
Model 3: SUBMISSION SCORE #3: 41.8606
acf(sjtot)
pacf(sjtot)
tsdisplay(sjtot)
autoplot(diff(sjtot,1))
sj_ar <- Arima(sjtot, order=c(2,0,1), seasonal=c(0,1,0))
sj_fcast<- forecast(sj_ar, h = 260)
autoplot(sj_fcast)
tsdisplay(residuals(sj_ar))
Box.test(residuals(sj_fcast),fitdf=2, lag=151, type="Ljung")
##
## Box-Ljung test
##
## data: residuals(sj_fcast)
## X-squared = 437.35, df = 149, p-value < 2.2e-16
iq_ar <- Arima(iqtot, order=c(1,0,4), seasonal=c(0,1,0))
iq_fcast<- forecast(iq_ar, h = 156)
autoplot(iq_fcast)
tsdisplay(residuals(iq_ar))
Box.test(residuals(iq_fcast),fitdf=3, lag=151, type="Ljung")
##
## Box-Ljung test
##
## data: residuals(iq_fcast)
## X-squared = 308.4, df = 148, p-value = 2.43e-13
arima_sj_sol <- data.frame(submission_format[1:260,-4], total_cases = round(sj_fcast$mean))
arima_iq_sol <- data.frame(submission_format[261:416,-4], total_cases =round(iq_fcast$mean))
arima_solution <- bind_rows(arima_sj_sol,arima_iq_sol)
write.csv(arima_solution, file = 'arima_predicted_Solution.csv', row.names = F)
Your score for this submission is: 29.8413
sjtot=ts(sj_train$total_cases, start=c(1990,4,30), frequency=52)
stl_decomposition = stl(sjtot, t.window=13, s.window="periodic",robust=TRUE)
autoplot(stl_decomposition)
sj_fcast3<- forecast(stl_decomposition,method = "ets",h=260)
accuracy(sj_fcast3)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.006078719 13.26227 7.841533 NaN Inf 0.2146436 0.02412127
autoplot(sj_fcast3)
#mstl
mstl_decomposition = mstl(sjtot)
autoplot(mstl_decomposition)
sj_fcast4<- forecast(mstl_decomposition,method = "ets",h=260)
kable(accuracy(sj_fcast4))
| ME | RMSE | MAE | MPE | MAPE | MASE | ACF1 | |
|---|---|---|---|---|---|---|---|
| Training set | -0.0080337 | 12.20282 | 7.629666 | -Inf | Inf | 0.2088442 | 0.042466 |
autoplot(sj_fcast4) + ylab("Total Cases") + ggtitle("San Juan: MSTL + ETS")
iqtot=ts(iq_train$total_cases, start=c(2000,07,01), frequency=52)
iqstl_decomp = stl(iqtot, t.window=13, s.window="periodic",robust=TRUE)
autoplot(iqstl_decomp)
iq_fcast3<- forecast(iqstl_decomp,method = "ets",h=156)
kable(accuracy(iq_fcast3))
| ME | RMSE | MAE | MPE | MAPE | MASE | ACF1 | |
|---|---|---|---|---|---|---|---|
| Training set | 0.009541 | 7.161697 | 3.675609 | NaN | Inf | 0.3893583 | 0.0698369 |
autoplot(iq_fcast3)
#mstl
iqmstl_decomp = mstl(iqtot)
autoplot(iqmstl_decomp)
iq_fcast4<- forecast(iqmstl_decomp,method = "ets",h=156)
accuracy(iq_fcast4)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.01981199 6.48256 3.828215 NaN Inf 0.4055239 0.06328665
autoplot(iq_fcast4)
ets_sj_sol <- data.frame(submission_format[1:260,-4], total_cases = round(sj_fcast4$mean))
ets_iq_sol <- data.frame(submission_format[261:416,-4], total_cases =round(iq_fcast4$mean))
ets_solution <- bind_rows(ets_sj_sol,ets_iq_sol)
write.csv(ets_solution, file = 'ets_predicted_Solution.csv', row.names = F)
MODEL 5 SUBMISSION SCORE: 26.5192
(fit_nnetar_sj <- nnetar(sjtot))
## Series: sjtot
## Model: NNAR(14,1,8)[52]
## Call: nnetar(y = sjtot)
##
## Average of 20 networks, each of which is
## a 15-8-1 network with 137 weights
## options were - linear output units
##
## sigma^2 estimated as 51.01
#repeats=25, size=12, decay=0.1,linout=TRUE
plot(forecast(fit_nnetar_sj,h=260))
summary(fit_nnetar_sj)
## Length Class Mode
## x 936 ts numeric
## m 1 -none- numeric
## p 1 -none- numeric
## P 1 -none- numeric
## scalex 2 -none- list
## size 1 -none- numeric
## subset 936 -none- numeric
## model 20 nnetarmodels list
## nnetargs 0 -none- list
## fitted 936 ts numeric
## residuals 936 ts numeric
## lags 15 -none- numeric
## series 1 -none- character
## method 1 -none- character
## call 2 -none- call
checkresiduals(fit_nnetar_sj)
a=forecast(fit_nnetar_sj,h=260)
(fit_nnetar_iq <- nnetar(iqtot))
## Series: iqtot
## Model: NNAR(5,1,4)[52]
## Call: nnetar(y = iqtot)
##
## Average of 20 networks, each of which is
## a 6-4-1 network with 33 weights
## options were - linear output units
##
## sigma^2 estimated as 23.08
plot(forecast(fit_nnetar_iq,h=156))
checkresiduals(fit_nnetar_iq)
b=forecast(fit_nnetar_iq,h=156)
nnetar_sj_sol <- data.frame(submission_format[1:260,-4], total_cases = round(a$mean))
nnetar_iq_sol <- data.frame(submission_format[261:416,-4], total_cases =round(b$mean))
nnetar_solution <- bind_rows(nnetar_sj_sol,nnetar_iq_sol)
write.csv(nnetar_solution, file = 'nnetar_predicted_Solution.csv', row.names = F)
MODEL 5 SUBMISSION SCORE: 37.4351