Setup

Loading Libraries

knitr::opts_chunk$set(echo = TRUE)
library(fpp2)
## Warning: package 'fpp2' was built under R version 4.0.5
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## -- Attaching packages ---------------------------------------------- fpp2 2.4 --
## v ggplot2   3.3.3     v fma       2.4  
## v forecast  8.14      v expsmooth 2.3
## Warning: package 'fma' was built under R version 4.0.5
## Warning: package 'expsmooth' was built under R version 4.0.5
## 
library(fastDummies)
## Warning: package 'fastDummies' was built under R version 4.0.5
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.0.5
library(data.table)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(plyr)
## 
## Attaching package: 'plyr'
## The following object is masked from 'package:fma':
## 
##     ozone
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following object is masked from 'package:kableExtra':
## 
##     group_rows
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble  3.0.6     v purrr   0.3.4
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::arrange()         masks plyr::arrange()
## x lubridate::as.difftime() masks base::as.difftime()
## x dplyr::between()         masks data.table::between()
## x purrr::compact()         masks plyr::compact()
## x dplyr::count()           masks plyr::count()
## x lubridate::date()        masks base::date()
## x dplyr::failwith()        masks plyr::failwith()
## x dplyr::filter()          masks stats::filter()
## x dplyr::first()           masks data.table::first()
## x dplyr::group_rows()      masks kableExtra::group_rows()
## x lubridate::hour()        masks data.table::hour()
## x dplyr::id()              masks plyr::id()
## x lubridate::intersect()   masks base::intersect()
## x lubridate::isoweek()     masks data.table::isoweek()
## x dplyr::lag()             masks stats::lag()
## x dplyr::last()            masks data.table::last()
## x lubridate::mday()        masks data.table::mday()
## x lubridate::minute()      masks data.table::minute()
## x lubridate::month()       masks data.table::month()
## x dplyr::mutate()          masks plyr::mutate()
## x lubridate::quarter()     masks data.table::quarter()
## x dplyr::rename()          masks plyr::rename()
## x lubridate::second()      masks data.table::second()
## x lubridate::setdiff()     masks base::setdiff()
## x dplyr::summarise()       masks plyr::summarise()
## x dplyr::summarize()       masks plyr::summarize()
## x purrr::transpose()       masks data.table::transpose()
## x lubridate::union()       masks base::union()
## x lubridate::wday()        masks data.table::wday()
## x lubridate::week()        masks data.table::week()
## x lubridate::yday()        masks data.table::yday()
## x lubridate::year()        masks data.table::year()
library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(grid)
library(seasonal)
## Warning: package 'seasonal' was built under R version 4.0.5
## 
## Attaching package: 'seasonal'
## The following object is masked from 'package:psych':
## 
##     outlier
## The following object is masked from 'package:tibble':
## 
##     view
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.0.5
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
## The following objects are masked from 'package:data.table':
## 
##     dcast, melt

Load in Data

#Dengue Data
d.all <- fread("dengue_labels_train.csv")
d.all.sj <- d.all[city=="sj"]
d.all.iq <- d.all[city=="iq"]
#Additional Variables
d.vars <- fread("dengue_features_train.csv")
d.vars.sj <- d.vars[city=="sj"]
d.vars.iq <- d.vars[city=="iq"]
#Merge
d.sj <- merge(d.all.sj, d.vars.sj, by.x = c("year","weekofyear","city"), 
             by.y = c("year","weekofyear","city"), all.x = TRUE, all.y = TRUE)
d.iq <- merge(d.all.iq, d.vars.iq, by.x = c("year","weekofyear","city"), 
             by.y = c("year","weekofyear","city"), all.x = TRUE, all.y = TRUE)

Covariance and Correlation

#SJ
cor.sj <- data.frame(cor(d.sj[,-c("city","week_start_date")], use = "complete.obs"))
cor.sj.sub <- subset(cor.sj, select = "total_cases")
cor.sj.sub
##                                        total_cases
## year                                  -0.207912900
## weekofyear                             0.289133825
## total_cases                            1.000000000
## ndvi_ne                                0.031533393
## ndvi_nw                                0.083304916
## ndvi_se                               -0.004387998
## ndvi_sw                               -0.002335392
## precipitation_amt_mm                   0.101033458
## reanalysis_air_temp_k                  0.238730865
## reanalysis_avg_temp_k                  0.231786509
## reanalysis_dew_point_temp_k            0.271975113
## reanalysis_max_air_temp_k              0.256103688
## reanalysis_min_air_temp_k              0.243080906
## reanalysis_precip_amt_kg_per_m2        0.154953190
## reanalysis_relative_humidity_percent   0.197329663
## reanalysis_sat_precip_amt_mm           0.101033458
## reanalysis_specific_humidity_g_per_kg  0.280365672
## reanalysis_tdtr_k                     -0.049046214
## station_avg_temp_c                     0.224524237
## station_diur_temp_rng_c                0.010370823
## station_max_temp_c                     0.170781332
## station_min_temp_c                     0.215872005
## station_precip_mm                      0.069941830
cor.sj.sub.sig <- subset(cor.sj.sub, abs(total_cases) >= .15)
cor.sj.sub.sig
##                                       total_cases
## year                                   -0.2079129
## weekofyear                              0.2891338
## total_cases                             1.0000000
## reanalysis_air_temp_k                   0.2387309
## reanalysis_avg_temp_k                   0.2317865
## reanalysis_dew_point_temp_k             0.2719751
## reanalysis_max_air_temp_k               0.2561037
## reanalysis_min_air_temp_k               0.2430809
## reanalysis_precip_amt_kg_per_m2         0.1549532
## reanalysis_relative_humidity_percent    0.1973297
## reanalysis_specific_humidity_g_per_kg   0.2803657
## station_avg_temp_c                      0.2245242
## station_max_temp_c                      0.1707813
## station_min_temp_c                      0.2158720
#IQ
cor.iq <- data.frame(cor(d.iq[,-c("city","week_start_date")], use = "complete.obs"))
cor.iq.sub <- subset(cor.iq, select = "total_cases")
cor.iq.sub
##                                        total_cases
## year                                   0.218685930
## weekofyear                            -0.017409241
## total_cases                            1.000000000
## ndvi_ne                               -0.015188790
## ndvi_nw                               -0.056017695
## ndvi_se                               -0.063522481
## ndvi_sw                                0.001880791
## precipitation_amt_mm                   0.104963004
## reanalysis_air_temp_k                  0.104898299
## reanalysis_avg_temp_k                  0.083218569
## reanalysis_dew_point_temp_k            0.250422175
## reanalysis_max_air_temp_k             -0.051879135
## reanalysis_min_air_temp_k              0.206488508
## reanalysis_precip_amt_kg_per_m2        0.084730868
## reanalysis_relative_humidity_percent   0.139303890
## reanalysis_sat_precip_amt_mm           0.104963004
## reanalysis_specific_humidity_g_per_kg  0.257292843
## reanalysis_tdtr_k                     -0.127453682
## station_avg_temp_c                     0.112726442
## station_diur_temp_rng_c               -0.060938281
## station_max_temp_c                     0.069411063
## station_min_temp_c                     0.230102160
## station_precip_mm                      0.069049546
cor.iq.sub.sig <- subset(cor.iq.sub, abs(total_cases) >= .15)
cor.iq.sub.sig
##                                       total_cases
## year                                    0.2186859
## total_cases                             1.0000000
## reanalysis_dew_point_temp_k             0.2504222
## reanalysis_min_air_temp_k               0.2064885
## reanalysis_specific_humidity_g_per_kg   0.2572928
## station_min_temp_c                      0.2301022

Define Xreg

#SJ
xreg.sj <- cbind(d.sj[,"year"],d.sj[,"weekofyear"],d.sj[,"reanalysis_air_temp_k"],d.sj[,"reanalysis_avg_temp_k"],
                 d.sj[,"reanalysis_dew_point_temp_k"],d.sj[,"reanalysis_max_air_temp_k"],d.sj[,"reanalysis_min_air_temp_k"],
                 d.sj[,"reanalysis_precip_amt_kg_per_m2"],d.sj[,"reanalysis_relative_humidity_percent"],
                 d.sj[,"reanalysis_specific_humidity_g_per_kg"],d.sj[,"station_avg_temp_c"],d.sj[,"station_max_temp_c"],
                 d.sj[,"station_min_temp_c"])
xreg.sj.train <- xreg.sj[1:749]
xreg.sj.test <- xreg.sj[750:936]
#IQ
xreg.iq <- cbind(d.iq[,"year"],d.iq[,"reanalysis_dew_point_temp_k"],d.iq[,"reanalysis_min_air_temp_k"],
                 d.iq[,"reanalysis_specific_humidity_g_per_kg"],d.iq[,"station_min_temp_c"])
xreg.iq.train <- xreg.iq[1:416]
xreg.iq.test <- xreg.iq[417:520]

Format Data to Time Series

#SJ Time series
d.sj.ts <- ts(d.sj$total_cases, start = c(1990, 18), frequency = 52)
#IQ Time Series
d.iq.ts <- ts(d.iq$total_cases, start = c(2000, 26), frequency = 52)

Data Visualization

Correlation Matrix Heatmap

#SJ
melted.cor.sj <- melt(cor(d.sj[,-c("city","week_start_date")], use = "complete.obs"))
ggplot(data = melted.cor.sj, aes(x=Var1, y=Var2, fill=value)) + geom_tile()

#IQ
melted.cor.iq <- melt(cor(d.iq[,-c("city","week_start_date")], use = "complete.obs"))
ggplot(data = melted.cor.iq, aes(x=Var1, y=Var2, fill=value)) + geom_tile()

Basic Time Series Plot by Week

#Plot Time Series
plot(d.sj.ts, main = "Figure 1: SJ Dengue Cases by Week 1990-2008",
     ylab = "Daily Cases")

plot(d.iq.ts, main = "Figure 2: IQ Dengue Cases by Week 2000-2010",
     ylab = "Daily Cases")

Decompositions

#SJ
decomp.add.sj <- decompose(d.sj.ts, type = "additive", filter = NULL)
autoplot(decomp.add.sj) + ggtitle("Additive Decomp. for SJ")

decomp.mult.sj <- decompose(d.sj.ts, type = "multiplicative", filter = NULL)
autoplot(decomp.mult.sj) + ggtitle("Multip. Decomp. for SJ")

#IQ
decomp.add.iq <- decompose(d.iq.ts, type = "additive", filter = NULL)
autoplot(decomp.add.iq) + ggtitle("Additive Decomp. for IQ")

decomp.mult.iq <- decompose(d.iq.ts, type = "multiplicative", filter = NULL)
autoplot(decomp.mult.iq) + ggtitle("Multip. Decomp. for IQ")

Split into Training and Test Data

#SJ Data (80:20 split)
d.sj.end <- length(d.sj.ts)
d.sj.train.cutoff <- round(.8*d.sj.end, 0)
d.sj.train <- d.sj.ts[1:d.sj.train.cutoff]
d.sj.test <- d.sj.ts[(d.sj.train.cutoff+1):d.sj.end]
d.sj.train.ts <- ts(d.sj.train, start = c(1990, 18), frequency = 52)
d.sj.test.ts <- ts(d.sj.test, start = c(2004, 39), frequency = 52)
#IQ Data (80:20 split)
d.iq.end <- length(d.iq.ts)
d.iq.train.cutoff <- round(.8*d.iq.end, 0)
d.iq.train <- d.iq.ts[1:d.iq.train.cutoff]
d.iq.test <- d.iq.ts[(d.iq.train.cutoff+1):d.iq.end]
d.iq.train.ts <- ts(d.iq.train, start = c(2000, 26), frequency = 52)
d.iq.test.ts <- ts(d.iq.test, start = c(2008, 27), frequency = 52)

Naive Models

#SJ Model
naive.sj <- naive(d.sj.train.ts,h=187)
autoplot(naive.sj) + autolayer(d.sj.test.ts)

acc.naive.sj <- accuracy(naive.sj$mean,d.sj.test.ts[1:187])
acc.naive.sj
##                ME     RMSE      MAE  MPE MAPE
## Test set 11.74866 33.38056 17.86631 -Inf  Inf
#IQ Model
naive.iq <- naive(d.iq.train.ts,h=104)
autoplot(naive.iq) + autolayer(d.iq.test.ts)

acc.naive.iq <- accuracy(naive.iq$mean,d.iq.test.ts[1:104])
acc.naive.iq
##                ME     RMSE      MAE  MPE MAPE
## Test set 8.519231 14.24713 8.673077 -Inf  Inf
#SJ Model
snaive.sj <- snaive(d.sj.train.ts,h=187)
autoplot(snaive.sj) + autolayer(d.sj.test.ts)

acc.snaive.sj <- accuracy(snaive.sj$mean,d.sj.test.ts[1:187])
acc.snaive.sj
##                ME     RMSE     MAE  MPE MAPE
## Test set 7.550802 31.57336 17.5615 -Inf  Inf
#IQ Model
snaive.iq <- snaive(d.iq.train.ts,h=104)
autoplot(snaive.iq) + autolayer(d.iq.test.ts)

acc.snaive.iq <- accuracy(snaive.iq$mean,d.iq.test.ts[1:104])
acc.snaive.iq
##                 ME     RMSE      MAE  MPE MAPE
## Test set -1.288462 14.74658 9.288462 -Inf  Inf

STLF Models

STLF ETS Model

#SJ Model
stlf.ets.sj <-stlf(d.sj.train.ts,method="ets",h=187)
autoplot(stlf.ets.sj) + autolayer(d.sj.test.ts)

acc.stlf.ets.sj <- accuracy(stlf.ets.sj$mean,d.sj.test.ts[1:187])
acc.stlf.ets.sj
##                ME     RMSE      MAE MPE MAPE
## Test set 17.46665 32.18493 20.76644 Inf  Inf
#IQ Model
stlf.ets.iq <-stlf(d.iq.train.ts,method="ets",h=104)
autoplot(stlf.ets.iq) + autolayer(d.iq.test.ts)

acc.stlf.ets.iq <- accuracy(stlf.ets.iq$mean,d.iq.test.ts[1:104])
acc.stlf.ets.iq
##                ME     RMSE      MAE  MPE MAPE
## Test set 3.215535 11.72406 6.852651 -Inf  Inf

STLF Naive/RW Model

#SJ Model
stlf.naive.sj <-stlf(d.sj.train.ts,method="naive",h=187)
autoplot(stlf.naive.sj) + autolayer(d.sj.test.ts)

acc.stlf.naive.sj <- accuracy(stlf.naive.sj$mean,d.sj.test.ts[1:187])
acc.stlf.naive.sj
##                ME     RMSE      MAE MPE MAPE
## Test set 17.46305 32.18298 20.76423 Inf  Inf
#IQ Model
stlf.naive.iq <-stlf(d.iq.train.ts,method="naive",h=104)
autoplot(stlf.naive.iq) + autolayer(d.iq.test.ts)

acc.stlf.naive.iq <- accuracy(stlf.naive.iq$mean,d.iq.test.ts[1:104])
acc.stlf.naive.iq
##                ME     RMSE      MAE  MPE MAPE
## Test set 2.481035 11.54424 6.756134 -Inf  Inf

STLF ARIMA Model

#SJ Model
stlf.arima.sj <-stlf(d.sj.train.ts,method="arima",h=187)
autoplot(stlf.arima.sj) + autolayer(d.sj.test.ts)

acc.stlf.arima.sj <- accuracy(stlf.arima.sj$mean,d.sj.test.ts[1:187])
acc.stlf.arima.sj
##                ME     RMSE     MAE MPE MAPE
## Test set 17.44394 32.17409 20.7551 Inf  Inf
#IQ Model
stlf.arima.iq <-stlf(d.iq.train.ts,method="arima",h=104)
autoplot(stlf.arima.iq) + autolayer(d.iq.test.ts)

acc.stlf.arima.iq <- accuracy(stlf.arima.iq$mean,d.iq.test.ts[1:104])
acc.stlf.arima.iq
##                ME     RMSE      MAE  MPE MAPE
## Test set 3.468611 11.79906 6.917627 -Inf  Inf

Xreg STLF ARIMAs

#SJ Model
stlf.arima.xreg.sj <-stlf(d.sj.train.ts,method="arima",h=187,xreg=as.matrix(xreg.sj.train),newxreg=as.matrix(xreg.sj.test))
## Warning in forecast.forecast_ARIMA(fit, h = h, level = level, xreg = newxreg):
## Upper prediction intervals are not finite.
autoplot(stlf.arima.xreg.sj) + autolayer(d.sj.test.ts)

acc.stlf.arima.xreg.sj <- accuracy(stlf.arima.xreg.sj$mean,d.sj.test.ts[1:187])
acc.stlf.arima.xreg.sj
##                ME     RMSE      MAE  MPE MAPE
## Test set 11.68336 28.03475 16.79356 -Inf  Inf
#IQ Model
stlf.arima.xreg.iq <-stlf(d.iq.train.ts,method="arima",h=104,xreg=as.matrix(xreg.iq.train),newxreg=as.matrix(xreg.iq.test))
## Warning in forecast.forecast_ARIMA(fit, h = h, level = level, xreg = newxreg):
## Upper prediction intervals are not finite.
autoplot(stlf.arima.xreg.iq) + autolayer(d.iq.test.ts)

acc.stlf.arima.xreg.iq <- accuracy(stlf.arima.xreg.iq$mean,d.iq.test.ts[1:104])
acc.stlf.arima.xreg.iq
##                 ME     RMSE     MAE  MPE MAPE
## Test set -1.836337 11.92539 8.18647 -Inf  Inf
#SJ Model
tbats.sj <- tbats(d.sj.train.ts)
for.tbats.sj <- forecast(tbats.sj,187)
autoplot(for.tbats.sj) + autolayer(d.sj.test.ts)

acc.tbats.sj <- accuracy(for.tbats.sj,d.sj.test.ts[1:187])
acc.tbats.sj
##                     ME     RMSE       MAE  MPE MAPE      MASE        ACF1
## Training set 0.5644022 15.74908  8.752217 -Inf  Inf 0.9975101 0.006240324
## Test set     2.5404329 27.33667 19.668676 -Inf  Inf 2.2416837          NA
#IQ Model
tbats.iq <- tbats(d.iq.train.ts)
for.tbats.iq <- forecast(tbats.iq,104)
autoplot(for.tbats.iq) + autolayer(d.iq.test.ts)

acc.tbats.iq <- accuracy(for.tbats.iq,d.iq.test.ts[1:104])
acc.tbats.iq
##                       ME      RMSE      MAE  MPE MAPE      MASE        ACF1
## Training set 0.002721555  7.225633 3.799959  NaN  Inf 0.9692581 0.006681934
## Test set     8.838031070 14.440930 8.946828 -Inf  Inf 2.2820735          NA

ARIMA Model

#SJ Model
aarima.sj <- auto.arima(d.sj.train.ts, xreg = as.matrix(xreg.sj.train))
for.aarima.sj <- forecast(aarima.sj,187, xreg = as.matrix(xreg.sj.test))
## Warning in forecast.forecast_ARIMA(aarima.sj, 187, xreg =
## as.matrix(xreg.sj.test)): Upper prediction intervals are not finite.
autoplot(for.aarima.sj) + autolayer(d.sj.test.ts)

acc.aarima.sj <- accuracy(for.aarima.sj,d.sj.test.ts[1:187])
acc.aarima.sj
##                       ME     RMSE       MAE  MPE MAPE      MASE       ACF1
## Training set  0.06477151 15.61662  8.747112 -Inf  Inf 0.9969282 0.07948093
## Test set     11.43920732 30.50516 17.205513 -Inf  Inf 1.9609514         NA
#IQ Model
aarima.iq <- auto.arima(d.iq.train.ts, xreg = as.matrix(xreg.iq.train))
for.aarima.iq <- forecast(aarima.iq,104, xreg = as.matrix(xreg.iq.test))
## Warning in forecast.forecast_ARIMA(aarima.iq, 104, xreg =
## as.matrix(xreg.iq.test)): Upper prediction intervals are not finite.
autoplot(for.aarima.iq) + autolayer(d.iq.test.ts)

acc.aarima.iq <- accuracy(for.aarima.iq,d.iq.test.ts[1:104])
acc.aarima.iq
##                       ME      RMSE      MAE  MPE MAPE      MASE       ACF1
## Training set  0.02127012  5.900787 3.334367  NaN  Inf 0.8504992 0.04751688
## Test set     -1.12419094 11.871375 8.418910 -Inf  Inf 2.1474172         NA

NNAR Model

#SJ Model
#Random Seed
set.seed(1240932)
#Deal with NAs in Xreg Test
xreg.sj.test.na <- data.table(xreg.sj.test)
xreg.sj.test.na <- na.locf(xreg.sj.test.na)
#Model
nnar.sj <- nnetar(d.sj.train.ts, xreg = as.matrix(xreg.sj.train))
## Warning in nnetar(d.sj.train.ts, xreg = as.matrix(xreg.sj.train)): Missing
## values in xreg, omitting rows
for.nnar.sj <- forecast(nnar.sj,187, xreg = as.matrix(xreg.sj.test.na))
autoplot(for.nnar.sj) + autolayer(d.sj.test.ts)

acc.nnar.sj <- accuracy(for.nnar.sj,d.sj.test.ts[1:187])
acc.nnar.sj
##                       ME      RMSE      MAE  MPE MAPE      MASE        ACF1
## Training set -0.01297781  4.081275  2.90118 -Inf  Inf 0.3306541 -0.07797094
## Test set      1.15344162 24.882368 15.01318 -Inf  Inf 1.7110857          NA
#IQ Model
#Random Seed
set.seed(1234216)
#Deal with NAs in Xreg Test
xreg.iq.test.na <- data.table(xreg.iq.test)
xreg.iq.test.na <- na.locf(xreg.iq.test.na)
#Model
nnar.iq <- nnetar(d.iq.train.ts, xreg = as.matrix(xreg.iq.train))
## Warning in nnetar(d.iq.train.ts, xreg = as.matrix(xreg.iq.train)): Missing
## values in xreg, omitting rows
for.nnar.iq <- forecast(nnar.iq,104, xreg = as.matrix(xreg.iq.test.na))
autoplot(for.nnar.iq) + autolayer(d.iq.test.ts)

acc.nnar.iq <- accuracy(for.nnar.iq,d.iq.test.ts[1:104])
acc.nnar.iq
##                        ME      RMSE      MAE  MPE MAPE      MASE        ACF1
## Training set -0.007956413  3.774465 2.484315 -Inf  Inf 0.6336759 -0.02258968
## Test set      3.750152741 11.884336 6.759713 -Inf  Inf 1.7242046          NA

Preparing Submission Data

#Dengue Data
d.sub <- fread("submission_format.csv")
d.sub.sj <- d.sub[city=="sj"]
d.sub.iq <- d.sub[city=="iq"]
#Additional Variables
d.sub.feat <- fread("dengue_features_test.csv")
d.sub.feat.sj <- d.sub.feat[city=="sj"]
d.sub.feat.iq <- d.sub.feat[city=="iq"]
#Merge
d.msub.sj <- merge(d.sub.sj, d.sub.feat.sj, by.x = c("year","weekofyear","city"), 
             by.y = c("year","weekofyear","city"), all.x = TRUE, all.y = TRUE)
d.msub.iq <- merge(d.sub.iq, d.sub.feat.iq, by.x = c("year","weekofyear","city"), 
             by.y = c("year","weekofyear","city"), all.x = TRUE, all.y = TRUE)

Building Xreg for Submission Data

#SJ
sub.xreg.sj <- cbind(d.msub.sj[,"year"],d.msub.sj[,"weekofyear"],d.msub.sj[,"reanalysis_air_temp_k"],d.msub.sj[,"reanalysis_avg_temp_k"],
                 d.msub.sj[,"reanalysis_dew_point_temp_k"],d.msub.sj[,"reanalysis_max_air_temp_k"],
                 d.msub.sj[,"reanalysis_min_air_temp_k"],d.msub.sj[,"reanalysis_precip_amt_kg_per_m2"],
                 d.msub.sj[,"reanalysis_relative_humidity_percent"],d.msub.sj[,"reanalysis_specific_humidity_g_per_kg"],
                 d.msub.sj[,"station_avg_temp_c"],d.msub.sj[,"station_max_temp_c"],d.msub.sj[,"station_min_temp_c"])
sub.xreg.sj.test <- sub.xreg.sj
#IQ
sub.xreg.iq <- cbind(d.msub.iq[,"year"],d.msub.iq[,"reanalysis_dew_point_temp_k"],d.msub.iq[,"reanalysis_min_air_temp_k"],
                 d.msub.iq[,"reanalysis_specific_humidity_g_per_kg"],d.msub.iq[,"station_min_temp_c"])
sub.xreg.iq.test <- sub.xreg.iq

Top Models for Full Training by MAE and RMSE

SJ 1: STLF ARIMA w/ Xreg

ft.stlf.arima.xreg.sj <-stlf(d.sj.ts,method="arima",h=260,xreg=as.matrix(xreg.sj),newxreg=as.matrix(sub.xreg.sj))
## Warning in forecast.forecast_ARIMA(fit, h = h, level = level, xreg = newxreg):
## Upper prediction intervals are not finite.
#Set negative forecasts to 0, carry forward on NAs
num.stax.sj <- as.numeric(ft.stlf.arima.xreg.sj$mean)
num.stax.sj <- na.locf(num.stax.sj)
num.stax.sj[num.stax.sj<0] = 0
ts.num.stax.sj <- ts(num.stax.sj, start = c(2008, 18), frequency = 52)
#Plot and Train Accuracy
autoplot(ft.stlf.arima.xreg.sj) + autolayer(ts.num.stax.sj)

acc.ft.stlf.arima.xreg.sj <- accuracy(stlf.arima.xreg.sj)
acc.ft.stlf.arima.xreg.sj
##                      ME     RMSE      MAE MPE MAPE      MASE        ACF1
## Training set 0.02384946 14.45873 8.507962 NaN  Inf 0.2139962 0.006687136

SJ 2: NNAR w/ Xreg

#SJ Model
#Random Seed
set.seed(1240932)
#Deal with NAs in Xreg Test
sub.xreg.sj.test.na <- data.table(sub.xreg.sj.test)
sub.xreg.sj.test.na <- na.locf(sub.xreg.sj.test.na)
#Model
sub.nnar.sj <- nnetar(d.sj.ts, xreg = as.matrix(xreg.sj))
## Warning in nnetar(d.sj.ts, xreg = as.matrix(xreg.sj)): Missing values in xreg,
## omitting rows
for.sub.nnar.sj <- forecast(sub.nnar.sj,260, xreg = as.matrix(sub.xreg.sj.test.na))
#Set negative forecasts to 0, carry forward on NAs
num.nnar.sj <- as.numeric(for.sub.nnar.sj$mean)
num.nnar.sj <- na.locf(num.nnar.sj)
num.nnar.sj[num.nnar.sj<0] = 0
ts.num.nnar.sj <- ts(num.nnar.sj, start = c(2008, 18), frequency = 52)
#Plot and Train Accuracy
autoplot(for.sub.nnar.sj) + autolayer(ts.num.nnar.sj)

acc.sub.nnar.sj <- accuracy(for.sub.nnar.sj)
acc.sub.nnar.sj
##                       ME     RMSE     MAE  MPE MAPE       MASE        ACF1
## Training set 0.002474817 4.916843 3.44649 -Inf  Inf 0.09452106 -0.08119953

SJ 3: TBATS

sub.tbats.sj <- tbats(d.sj.ts)
for.sub.tbats.sj <- forecast(sub.tbats.sj,260)
#Set negative forecasts to 0, carry forward on NAs
num.tbats.sj <- as.numeric(for.sub.tbats.sj$mean)
num.tbats.sj <- na.locf(num.tbats.sj)
num.tbats.sj[num.tbats.sj<0] = 0
ts.num.tbats.sj <- ts(num.tbats.sj, start = c(2008, 18), frequency = 52)
#Plot and Train Accuracy
autoplot(for.sub.tbats.sj) + autolayer(ts.num.tbats.sj)

acc.sub.tbats.sj <- accuracy(for.sub.tbats.sj)
acc.sub.tbats.sj
##                     ME     RMSE      MAE  MPE MAPE      MASE         ACF1
## Training set 0.0114407 15.00084 8.312624 -Inf  Inf 0.2279763 -0.001365316

IQ 1: STLF Naive/RW

sub.stlf.naive.iq <-stlf(d.iq.ts,method="naive",h=156)
#Set negative forecasts to 0, carry forward on NAs
num.stnrw.iq <- as.numeric(sub.stlf.naive.iq$mean)
num.stnrw.iq <- na.locf(num.stnrw.iq)
num.stnrw.iq[num.stnrw.iq<0] = 0
ts.num.stnrw.iq <- ts(num.stnrw.iq, start = c(2010, 26), frequency = 52)
#Plot and Train Accuracy
autoplot(sub.stlf.naive.iq) + autolayer(ts.num.stnrw.iq)

acc.sub.stlf.naive.iq <- accuracy(sub.stlf.naive.iq)
acc.sub.stlf.naive.iq
##                       ME     RMSE      MAE MPE MAPE      MASE       ACF1
## Training set 0.008697743 6.952827 4.180876 NaN  Inf 0.4438861 -0.2026068

IQ 2: NNAR w/ Xreg

set.seed(4632905)
#Deal with NAs
sub.xreg.iq.test.na <- data.table(sub.xreg.iq.test)
sub.xreg.iq.test.na <- na.locf(sub.xreg.iq.test.na)
#Model
sub.nnar.iq <- nnetar(d.iq.ts, xreg = as.matrix(xreg.iq))
## Warning in nnetar(d.iq.ts, xreg = as.matrix(xreg.iq)): Missing values in xreg,
## omitting rows
for.sub.nnar.iq <- forecast(sub.nnar.iq,156, xreg = as.matrix(sub.xreg.iq.test.na))
#Set negative forecasts to 0, carry forward on NAs
num.nnar.iq <- as.numeric(for.sub.nnar.iq$mean)
num.nnar.iq <- na.locf(num.nnar.iq)
num.nnar.iq[num.nnar.iq<0] = 0
ts.num.nnar.iq <- ts(num.nnar.iq, start = c(2010, 26), frequency = 52)
#Plot and Train Accuracy
autoplot(for.sub.nnar.iq) + autolayer(ts.num.nnar.iq)

acc.sub.nnar.iq <- accuracy(for.sub.nnar.iq)
acc.sub.nnar.iq
##                        ME     RMSE      MAE  MPE MAPE      MASE         ACF1
## Training set -0.009030504 3.333286 2.368186 -Inf  Inf 0.2514317 0.0002581162

IQ 3: STLF ETS

sub.stlf.ets.iq <-stlf(d.iq.ts,method="ets",h=156)
#Set negative forecasts to 0, carry forward on NAs
num.ets.iq <- as.numeric(sub.stlf.ets.iq$mean)
num.ets.iq <- na.locf(num.ets.iq)
num.ets.iq[num.ets.iq<0] = 0
ts.num.ets.iq <- ts(num.ets.iq, start = c(2010, 26), frequency = 52)
#Plot and Train Accuracy
autoplot(sub.stlf.ets.iq) + autolayer(ts.num.ets.iq)

acc.sub.stlf.ets.iq <- accuracy(sub.stlf.ets.iq)
acc.sub.stlf.ets.iq
##                      ME     RMSE      MAE MPE MAPE      MASE     ACF1
## Training set 0.01465928 6.681039 3.888844 NaN  Inf 0.4128809 0.062921

Submission Output 1

d.sub1 <- d.sub
output1.ts <- append(ts.num.stax.sj, ts.num.stnrw.iq)
d.sub1$total_cases <- round(as.numeric(output1.ts), 0)
write.csv(d.sub1,"submission1.csv", row.names = FALSE)

MAE on unseen test set of 30.4663.

Submission Output 2

d.sub2 <- d.sub
output2.ts <- append(ts.num.nnar.sj, ts.num.nnar.iq)
d.sub2$total_cases <- round(as.numeric(output2.ts), 0)
write.csv(d.sub2,"submission2.csv", row.names = FALSE)

MAE on unseen test set of 29.4904.

Submission Output 3

d.sub3 <- d.sub
output3.ts <- append(ts.num.tbats.sj, ts.num.ets.iq)
d.sub3$total_cases <- round(as.numeric(output3.ts), 0)
write.csv(d.sub3,"submission3.csv", row.names = FALSE)

MAE on unseen test set of 25.8654,

Submission 4: Combo of SJ TBATS and IQ STLF RW

d.sub4 <- d.sub
output4.ts <- append(ts.num.tbats.sj, ts.num.stnrw.iq)
d.sub4$total_cases <- round(as.numeric(output4.ts), 0)
write.csv(d.sub4,"submission4.csv", row.names = FALSE)

MAE on unseen test set of 25.7909.