true
library(readr)
library(reshape2)
library(tidyr)
library(h2o)
library(dplyr)
library(ggplot2) 
library(caret)
library(lubridate)
library(imputeTS)
library(corrplot)
library(data.table)
library(quantreg)
library(Ecfun)
library(randomForest)
library(forecast)
library(kableExtra)
library(magrittr)
library(zoo)
library(RColorBrewer)
library(tsibble)
library(feasts)
library(gridExtra)
features_tr <- read.csv("C:/Users/hunai/Google Drive/BC_EDU Pred Modeling/ADEC 7640 Final/DATA/dengue_features_train.csv")
                        #col_types = cols(week_start_date = col_date(format = "%Y-%m-%d")))

labels_tr <- read.csv("C:/Users/hunai/Google Drive/BC_EDU Pred Modeling/ADEC 7640 Final/DATA/dengue_labels_train.csv")

features_te <- read.csv("C:/Users/hunai/Google Drive/BC_EDU Pred Modeling/ADEC 7640 Final/DATA/dengue_features_test.csv")

submission_format <- read.csv("C:/Users/hunai/Google Drive/BC_EDU Pred Modeling/ADEC 7640 Final/DATA/submission_format.csv")
names(features_tr)
##  [1] "city"                                 
##  [2] "year"                                 
##  [3] "weekofyear"                           
##  [4] "week_start_date"                      
##  [5] "ndvi_ne"                              
##  [6] "ndvi_nw"                              
##  [7] "ndvi_se"                              
##  [8] "ndvi_sw"                              
##  [9] "precipitation_amt_mm"                 
## [10] "reanalysis_air_temp_k"                
## [11] "reanalysis_avg_temp_k"                
## [12] "reanalysis_dew_point_temp_k"          
## [13] "reanalysis_max_air_temp_k"            
## [14] "reanalysis_min_air_temp_k"            
## [15] "reanalysis_precip_amt_kg_per_m2"      
## [16] "reanalysis_relative_humidity_percent" 
## [17] "reanalysis_sat_precip_amt_mm"         
## [18] "reanalysis_specific_humidity_g_per_kg"
## [19] "reanalysis_tdtr_k"                    
## [20] "station_avg_temp_c"                   
## [21] "station_diur_temp_rng_c"              
## [22] "station_max_temp_c"                   
## [23] "station_min_temp_c"                   
## [24] "station_precip_mm"

Checking distributions

##
rbind(data.frame(group = "train", features_tr),
      data.frame(group = "test", features_te)) %>%
  gather(x, y, precipitation_amt_mm:station_precip_mm) %>%
  ggplot(aes(x = y, color = group, fill = group)) +
  geom_density(alpha = 0.3) +
  facet_wrap( ~ x, scales = "free", ncol = 3)

Create Train and Test dataset.

train <- left_join(x = features_tr, y = labels_tr, by = c("year", "weekofyear", "city"))

test <- left_join(x = features_te, y = submission_format, by = c("year", "weekofyear", "city"))

train$type <- "train"

test$type <- "test"

Combine Train+ Test for EDA and Viz

all <- rbind(train, test)

Summary of data

summary(all[ , c("station_avg_temp_c", "reanalysis_avg_temp_k", "reanalysis_air_temp_k")])
##  station_avg_temp_c reanalysis_avg_temp_k reanalysis_air_temp_k
##  Min.   :21.40      Min.   :294.9         Min.   :294.6        
##  1st Qu.:26.33      1st Qu.:298.3         1st Qu.:297.7        
##  Median :27.43      Median :299.3         Median :298.6        
##  Mean   :27.23      Mean   :299.3         Mean   :298.7        
##  3rd Qu.:28.20      3rd Qu.:300.3         3rd Qu.:299.9        
##  Max.   :30.80      Max.   :303.3         Max.   :302.2        
##  NA's   :55         NA's   :12            NA's   :12
summary(all[ , c("station_precip_mm", "precipitation_amt_mm")])
##  station_precip_mm precipitation_amt_mm
##  Min.   :  0.0     Min.   :  0.00      
##  1st Qu.:  8.8     1st Qu.:  9.43      
##  Median : 23.8     Median : 36.64      
##  Mean   : 38.2     Mean   : 44.11      
##  3rd Qu.: 51.5     3rd Qu.: 67.50      
##  Max.   :543.3     Max.   :390.60      
##  NA's   :27        NA's   :15

Split data by city and treat missing values

SAN JUAN

sj_train <-  all %>% filter(city == "sj" & type == "train")

summary(sj_train) 
##      city                year        weekofyear    week_start_date   
##  Length:936         Min.   :1990   Min.   : 1.00   Length:936        
##  Class :character   1st Qu.:1994   1st Qu.:13.75   Class :character  
##  Mode  :character   Median :1999   Median :26.50   Mode  :character  
##                     Mean   :1999   Mean   :26.50                     
##                     3rd Qu.:2003   3rd Qu.:39.25                     
##                     Max.   :2008   Max.   :53.00                     
##                                                                      
##     ndvi_ne            ndvi_nw            ndvi_se            ndvi_sw        
##  Min.   :-0.40625   Min.   :-0.45610   Min.   :-0.01553   Min.   :-0.06346  
##  1st Qu.: 0.00450   1st Qu.: 0.01642   1st Qu.: 0.13928   1st Qu.: 0.12916  
##  Median : 0.05770   Median : 0.06807   Median : 0.17719   Median : 0.16597  
##  Mean   : 0.05792   Mean   : 0.06747   Mean   : 0.17766   Mean   : 0.16596  
##  3rd Qu.: 0.11110   3rd Qu.: 0.11520   3rd Qu.: 0.21256   3rd Qu.: 0.20277  
##  Max.   : 0.49340   Max.   : 0.43710   Max.   : 0.39313   Max.   : 0.38142  
##  NA's   :191        NA's   :49         NA's   :19         NA's   :19        
##  precipitation_amt_mm reanalysis_air_temp_k reanalysis_avg_temp_k
##  Min.   :  0.00       Min.   :295.9         Min.   :296.1        
##  1st Qu.:  0.00       1st Qu.:298.2         1st Qu.:298.3        
##  Median : 20.80       Median :299.3         Median :299.4        
##  Mean   : 35.47       Mean   :299.2         Mean   :299.3        
##  3rd Qu.: 52.18       3rd Qu.:300.1         3rd Qu.:300.2        
##  Max.   :390.60       Max.   :302.2         Max.   :302.2        
##  NA's   :9            NA's   :6             NA's   :6            
##  reanalysis_dew_point_temp_k reanalysis_max_air_temp_k
##  Min.   :289.6               Min.   :297.8            
##  1st Qu.:293.8               1st Qu.:300.4            
##  Median :295.5               Median :301.5            
##  Mean   :295.1               Mean   :301.4            
##  3rd Qu.:296.4               3rd Qu.:302.4            
##  Max.   :297.8               Max.   :304.3            
##  NA's   :6                   NA's   :6                
##  reanalysis_min_air_temp_k reanalysis_precip_amt_kg_per_m2
##  Min.   :292.6             Min.   :  0.00                 
##  1st Qu.:296.3             1st Qu.: 10.82                 
##  Median :297.5             Median : 21.30                 
##  Mean   :297.3             Mean   : 30.47                 
##  3rd Qu.:298.4             3rd Qu.: 37.00                 
##  Max.   :299.9             Max.   :570.50                 
##  NA's   :6                 NA's   :6                      
##  reanalysis_relative_humidity_percent reanalysis_sat_precip_amt_mm
##  Min.   :66.74                        Min.   :  0.00              
##  1st Qu.:76.25                        1st Qu.:  0.00              
##  Median :78.67                        Median : 20.80              
##  Mean   :78.57                        Mean   : 35.47              
##  3rd Qu.:80.96                        3rd Qu.: 52.18              
##  Max.   :87.58                        Max.   :390.60              
##  NA's   :6                            NA's   :9                   
##  reanalysis_specific_humidity_g_per_kg reanalysis_tdtr_k station_avg_temp_c
##  Min.   :11.72                         Min.   :1.357     Min.   :22.84     
##  1st Qu.:15.24                         1st Qu.:2.157     1st Qu.:25.84     
##  Median :16.85                         Median :2.457     Median :27.23     
##  Mean   :16.55                         Mean   :2.516     Mean   :27.01     
##  3rd Qu.:17.86                         3rd Qu.:2.800     3rd Qu.:28.19     
##  Max.   :19.44                         Max.   :4.429     Max.   :30.07     
##  NA's   :6                             NA's   :6         NA's   :6         
##  station_diur_temp_rng_c station_max_temp_c station_min_temp_c
##  Min.   :4.529           Min.   :26.70      Min.   :17.8      
##  1st Qu.:6.200           1st Qu.:30.60      1st Qu.:21.7      
##  Median :6.757           Median :31.70      Median :22.8      
##  Mean   :6.757           Mean   :31.61      Mean   :22.6      
##  3rd Qu.:7.286           3rd Qu.:32.80      3rd Qu.:23.9      
##  Max.   :9.914           Max.   :35.60      Max.   :25.6      
##  NA's   :6               NA's   :6          NA's   :6         
##  station_precip_mm  total_cases         type          
##  Min.   :  0.000   Min.   :  0.00   Length:936        
##  1st Qu.:  6.825   1st Qu.:  9.00   Class :character  
##  Median : 17.750   Median : 19.00   Mode  :character  
##  Mean   : 26.785   Mean   : 34.18                     
##  3rd Qu.: 35.450   3rd Qu.: 37.00                     
##  Max.   :305.900   Max.   :461.00                     
##  NA's   :6
# Missing value analysis
# Get percentage of missing value by column

sj_miss<- dplyr::select(sj_train,-c("city","year","weekofyear","week_start_date","total_cases","type")) 

sj_miss<- apply(X = sj_miss, 2, function(x) sum(is.na(x))/nrow(sj_miss)) %>% 
  knitr:: kable() %>%  kable_styling(bootstrap_options = "striped", full_width = F) 

sj_miss
x
ndvi_ne 0.2040598
ndvi_nw 0.0523504
ndvi_se 0.0202991
ndvi_sw 0.0202991
precipitation_amt_mm 0.0096154
reanalysis_air_temp_k 0.0064103
reanalysis_avg_temp_k 0.0064103
reanalysis_dew_point_temp_k 0.0064103
reanalysis_max_air_temp_k 0.0064103
reanalysis_min_air_temp_k 0.0064103
reanalysis_precip_amt_kg_per_m2 0.0064103
reanalysis_relative_humidity_percent 0.0064103
reanalysis_sat_precip_amt_mm 0.0096154
reanalysis_specific_humidity_g_per_kg 0.0064103
reanalysis_tdtr_k 0.0064103
station_avg_temp_c 0.0064103
station_diur_temp_rng_c 0.0064103
station_max_temp_c 0.0064103
station_min_temp_c 0.0064103
station_precip_mm 0.0064103

Impute missing values Using LOCF from ‘zoo’

sj_train <- cbind(sj_train[ , c("city", "type","week_start_date")] ,  apply(X = sj_train[, which(sapply(sj_train,is.numeric))], MARGIN = 2, function(x) na.locf(x,fromLast = TRUE)))

IQUITO

# # IQUITO
iq_train <-  all %>% filter(city == "iq" & type == "train") 

summary(iq_train)
##      city                year        weekofyear    week_start_date   
##  Length:520         Min.   :2000   Min.   : 1.00   Length:520        
##  Class :character   1st Qu.:2003   1st Qu.:13.75   Class :character  
##  Mode  :character   Median :2005   Median :26.50   Mode  :character  
##                     Mean   :2005   Mean   :26.50                     
##                     3rd Qu.:2007   3rd Qu.:39.25                     
##                     Max.   :2010   Max.   :53.00                     
##                                                                      
##     ndvi_ne           ndvi_nw           ndvi_se           ndvi_sw       
##  Min.   :0.06173   Min.   :0.03586   Min.   :0.02988   Min.   :0.06418  
##  1st Qu.:0.20000   1st Qu.:0.17954   1st Qu.:0.19474   1st Qu.:0.20413  
##  Median :0.26364   Median :0.23297   Median :0.24980   Median :0.26214  
##  Mean   :0.26387   Mean   :0.23878   Mean   :0.25013   Mean   :0.26678  
##  3rd Qu.:0.31997   3rd Qu.:0.29393   3rd Qu.:0.30230   3rd Qu.:0.32515  
##  Max.   :0.50836   Max.   :0.45443   Max.   :0.53831   Max.   :0.54602  
##  NA's   :3         NA's   :3         NA's   :3         NA's   :3        
##  precipitation_amt_mm reanalysis_air_temp_k reanalysis_avg_temp_k
##  Min.   :  0.00       Min.   :294.6         Min.   :294.9        
##  1st Qu.: 39.10       1st Qu.:297.1         1st Qu.:298.2        
##  Median : 60.47       Median :297.8         Median :299.1        
##  Mean   : 64.25       Mean   :297.9         Mean   :299.1        
##  3rd Qu.: 85.76       3rd Qu.:298.6         3rd Qu.:300.1        
##  Max.   :210.83       Max.   :301.6         Max.   :302.9        
##  NA's   :4            NA's   :4             NA's   :4            
##  reanalysis_dew_point_temp_k reanalysis_max_air_temp_k
##  Min.   :290.1               Min.   :300.0            
##  1st Qu.:294.6               1st Qu.:305.2            
##  Median :295.9               Median :307.1            
##  Mean   :295.5               Mean   :307.1            
##  3rd Qu.:296.5               3rd Qu.:308.7            
##  Max.   :298.4               Max.   :314.0            
##  NA's   :4                   NA's   :4                
##  reanalysis_min_air_temp_k reanalysis_precip_amt_kg_per_m2
##  Min.   :286.9             Min.   :  0.00                 
##  1st Qu.:292.0             1st Qu.: 24.07                 
##  Median :293.1             Median : 46.44                 
##  Mean   :292.9             Mean   : 57.61                 
##  3rd Qu.:294.2             3rd Qu.: 71.07                 
##  Max.   :296.0             Max.   :362.03                 
##  NA's   :4                 NA's   :4                      
##  reanalysis_relative_humidity_percent reanalysis_sat_precip_amt_mm
##  Min.   :57.79                        Min.   :  0.00              
##  1st Qu.:84.30                        1st Qu.: 39.10              
##  Median :90.92                        Median : 60.47              
##  Mean   :88.64                        Mean   : 64.25              
##  3rd Qu.:94.56                        3rd Qu.: 85.76              
##  Max.   :98.61                        Max.   :210.83              
##  NA's   :4                            NA's   :4                   
##  reanalysis_specific_humidity_g_per_kg reanalysis_tdtr_k station_avg_temp_c
##  Min.   :12.11                         Min.   : 3.714    Min.   :21.40     
##  1st Qu.:16.10                         1st Qu.: 7.371    1st Qu.:27.00     
##  Median :17.43                         Median : 8.964    Median :27.60     
##  Mean   :17.10                         Mean   : 9.207    Mean   :27.53     
##  3rd Qu.:18.18                         3rd Qu.:11.014    3rd Qu.:28.10     
##  Max.   :20.46                         Max.   :16.029    Max.   :30.80     
##  NA's   :4                             NA's   :4         NA's   :37        
##  station_diur_temp_rng_c station_max_temp_c station_min_temp_c
##  Min.   : 5.20           Min.   :30.1       Min.   :14.7      
##  1st Qu.: 9.50           1st Qu.:33.2       1st Qu.:20.6      
##  Median :10.62           Median :34.0       Median :21.3      
##  Mean   :10.57           Mean   :34.0       Mean   :21.2      
##  3rd Qu.:11.65           3rd Qu.:34.9       3rd Qu.:22.0      
##  Max.   :15.80           Max.   :42.2       Max.   :24.2      
##  NA's   :37              NA's   :14         NA's   :8         
##  station_precip_mm  total_cases          type          
##  Min.   :  0.00    Min.   :  0.000   Length:520        
##  1st Qu.: 17.20    1st Qu.:  1.000   Class :character  
##  Median : 45.30    Median :  5.000   Mode  :character  
##  Mean   : 62.47    Mean   :  7.565                     
##  3rd Qu.: 85.95    3rd Qu.:  9.000                     
##  Max.   :543.30    Max.   :116.000                     
##  NA's   :16
# Missing value analysis
# Get percentage of missing value by column

iq_miss<- dplyr::select(iq_train,-c("city","year","weekofyear","week_start_date","total_cases","type"))
                        
iq_miss<- apply(X = iq_miss, 2, function(x) sum(is.na(x))/nrow(iq_miss)) %>% 
  knitr:: kable() %>%  kable_styling(bootstrap_options = "striped", full_width = F) 

iq_miss
x
ndvi_ne 0.0057692
ndvi_nw 0.0057692
ndvi_se 0.0057692
ndvi_sw 0.0057692
precipitation_amt_mm 0.0076923
reanalysis_air_temp_k 0.0076923
reanalysis_avg_temp_k 0.0076923
reanalysis_dew_point_temp_k 0.0076923
reanalysis_max_air_temp_k 0.0076923
reanalysis_min_air_temp_k 0.0076923
reanalysis_precip_amt_kg_per_m2 0.0076923
reanalysis_relative_humidity_percent 0.0076923
reanalysis_sat_precip_amt_mm 0.0076923
reanalysis_specific_humidity_g_per_kg 0.0076923
reanalysis_tdtr_k 0.0076923
station_avg_temp_c 0.0711538
station_diur_temp_rng_c 0.0711538
station_max_temp_c 0.0269231
station_min_temp_c 0.0153846
station_precip_mm 0.0307692
plotNA.distribution(iq_train$ndvi_ne)

plotNA.distributionBar(iq_train$ndvi_ne, breaks = 20)

Impute missing values Using LOCF from ‘zoo’

iq_train <- cbind(iq_train[ , c("city", "type","week_start_date")] ,  apply(X = iq_train[, which(sapply(iq_train,is.numeric))], MARGIN = 2, function(x) na.locf(x,fromLast = TRUE)))
## Total cases across cities

p1<- ggplot(sj_train, aes(x=as.Date(week_start_date), y = total_cases, group = 1)) + geom_line(color="blue") +
  xlab("Week Start Date") + ylab("Total Cases") + ggtitle("Case Trend for San Juan") + 
  theme(axis.text.x = element_text(),axis.text.y = element_text()) +
    theme_minimal()

p2<- ggplot(iq_train, aes(x=as.Date(week_start_date), y = total_cases, group = 1)) + geom_line(color="red") +
  xlab("Week Start Date") + ylab("Total Cases") +  ggtitle("Case Trend for Iquito") +
  theme(axis.text.x = element_text(),axis.text.y = element_text()) +
    theme_minimal()

grid.arrange(p1,p2,nrow =2)

Model 1: Random Forest San Juan (all features)

## Modelling San Juan
#sj_train$city <- NULL
sj_train$weekofyear <- as.character(sj_train$weekofyear) 
sj_train$year <- as.character(sj_train$year)

## train model to predict Test (All train)
# Get the best mtry train a random forest using that mtry

preprocessparams <- preProcess(x = select(sj_train[ , which(sapply(sj_train,is.numeric))], -total_cases) ,method = c("center", "scale"))

sj_train <- predict(preprocessparams, sj_train)

sj_train$type <- NULL

rf_reg <- tuneRF(x = sj_train[ , -which(names(sj_train) == "total_cases")], y = sj_train$total_cases, ntreeTry=100,stepFactor=2,improve=0.05, doBest = T) 
## mtry = 8  OOB error = 626.2102 
## Searching left ...
## mtry = 4     OOB error = 820.3542 
## -0.3100302 0.05 
## Searching right ...
## mtry = 16    OOB error = 495.1907 
## 0.2092261 0.05 
## mtry = 24    OOB error = 538.3619 
## -0.08718098 0.05

sj_train$pred <- predict(rf_reg, sj_train)

postResample(sj_train$pred, sj_train$total_cases)
##      RMSE  Rsquared       MAE 
## 9.8591277 0.9694185 5.3541332
ggplot(data = sj_train, aes(x = as.Date(week_start_date))) + geom_line(aes(y = total_cases, color = "real")) + geom_line(aes(y = pred, color = "pred" )) 

sj_train$pred <- NULL
sj_train$type <- "train"

1 with station_precip lag of 1 RMSE Rsquared MAE 9.510456 0.972371 5.251435

  1. with station_precip + station_avg lag of 1 RMSE Rsquared MAE 9.633140 0.968891 5.086512

  2. with station_precip + station_avg + ndvi_ne lag of 1 RMSE Rsquared MAE 9.8423416 0.9692057 5.2444620

  3. No lagged vars RMSE Rsquared MAE 9.6823447 0.9703827 5.1047858

Model 1: Make final prediction for San Juan (test)

sj_te <- all %>% filter(city == "sj" & type == "test")

# Missing value analysis
# Get percentage of missing value by column

sjte_miss<- dplyr::select(sj_te,-c("city","year","weekofyear","week_start_date","total_cases","type")) 

sjte_miss<- apply(X = sjte_miss, 2, function(x) sum(is.na(x))/nrow(sjte_miss)) %>% 
  knitr:: kable() %>%  kable_styling(bootstrap_options = "striped", full_width = F) 

sjte_miss
x
ndvi_ne 0.1653846
ndvi_nw 0.0423077
ndvi_se 0.0038462
ndvi_sw 0.0038462
precipitation_amt_mm 0.0076923
reanalysis_air_temp_k 0.0076923
reanalysis_avg_temp_k 0.0076923
reanalysis_dew_point_temp_k 0.0076923
reanalysis_max_air_temp_k 0.0076923
reanalysis_min_air_temp_k 0.0076923
reanalysis_precip_amt_kg_per_m2 0.0076923
reanalysis_relative_humidity_percent 0.0076923
reanalysis_sat_precip_amt_mm 0.0076923
reanalysis_specific_humidity_g_per_kg 0.0076923
reanalysis_tdtr_k 0.0076923
station_avg_temp_c 0.0076923
station_diur_temp_rng_c 0.0076923
station_max_temp_c 0.0076923
station_min_temp_c 0.0076923
station_precip_mm 0.0076923
##The missing values will be replaced by last values
sj_te <- cbind(sj_te[ , c("city", "type", "week_start_date")] ,  apply(X = sj_te[, which(sapply(sj_te,is.numeric))], MARGIN = 2, function(x) na.locf(x,fromLast = TRUE)))

## Apply parameters and predict
#sj_te <- predict(preprocessparams, sj_te)

sj_te$total_cases <- predict(rf_reg , sj_te)

ggplot(sj_te, aes(as.Date(week_start_date), total_cases, group=1)) + geom_line()

sj_train$year <- as.numeric(sj_train$year)
sj_train$weekofyear <- as.numeric(sj_train$weekofyear)

ggplot(data = union_all(sj_train, sj_te), aes(x = as.Date(week_start_date), y = total_cases, color = type, group =1)) + geom_line() 

Model 1: Iquito (all features)

iq_train$weekofyear <- as.character(iq_train$weekofyear) 
#iq_train$city <- NULL
iq_train$year <- as.character(iq_train$year)

# train model to predict Test (All train)
preprocessparams <- preProcess(x = select(iq_train[ , which(sapply(iq_train,is.numeric))], -total_cases),method = c("center", "scale"))

iq_train <- predict(preprocessparams, iq_train)

rf_reg <- tuneRF(x = iq_train[ , -which(colnames(iq_train)=="total_cases")], iq_train$total_cases, ntreeTry=100,stepFactor=2,improve=0.05,trace=FALSE, doBest = T) 
## -0.001493603 0.05 
## 0.03296734 0.05

iq_train$pred <- predict(rf_reg, iq_train)

postResample(iq_train$pred, iq_train$total_cases)
##      RMSE  Rsquared       MAE 
## 4.1436702 0.9135643 2.1860947
ggplot(data = iq_train, aes(x = as.Date(week_start_date))) + geom_line(aes(y = total_cases, color = "real")) + geom_line(aes(y = pred, color = "pred" )) 

iq_train$pred <- NULL

Model 1: Make final prediction for Iquito (test)

iq_te <- all %>% filter(city == "iq" & type == "test")

# Missing value analysis
# Get percentage of missing value by column

iqte_miss<- dplyr::select(iq_te,-c("city","year","weekofyear","week_start_date","total_cases","type")) 

iqte_miss<- apply(X = iqte_miss, 2, function(x) sum(is.na(x))/nrow(iqte_miss)) %>% 
  knitr:: kable() %>%  kable_styling(bootstrap_options = "striped", full_width = F) 

iqte_miss
x
ndvi_ne 0.0000000
ndvi_nw 0.0000000
ndvi_se 0.0000000
ndvi_sw 0.0000000
precipitation_amt_mm 0.0000000
reanalysis_air_temp_k 0.0000000
reanalysis_avg_temp_k 0.0000000
reanalysis_dew_point_temp_k 0.0000000
reanalysis_max_air_temp_k 0.0000000
reanalysis_min_air_temp_k 0.0000000
reanalysis_precip_amt_kg_per_m2 0.0000000
reanalysis_relative_humidity_percent 0.0000000
reanalysis_sat_precip_amt_mm 0.0000000
reanalysis_specific_humidity_g_per_kg 0.0000000
reanalysis_tdtr_k 0.0000000
station_avg_temp_c 0.0641026
station_diur_temp_rng_c 0.0641026
station_max_temp_c 0.0064103
station_min_temp_c 0.0448718
station_precip_mm 0.0192308
## The missing values will be replaced by a forecast
iq_te <- cbind(iq_te[ , c("city", "type", "week_start_date")] ,  apply(X = iq_te[, which(sapply(iq_te,is.numeric))], MARGIN = 2, function(x) na.locf(x,fromLast = TRUE)))

iq_te$weekofyear <- as.character(iq_te$weekofyear) 
#iq_te$city <- NULL
iq_te$year <- as.character(iq_te$year)


iq_te <- predict(preprocessparams, iq_te)
iq_te$total_cases <- predict(rf_reg, iq_te)


iq_train$year <- as.numeric(iq_train$year)
iq_train$weekofyear <- as.numeric(iq_train$weekofyear)

iq_te$year <- as.numeric(iq_te$year)
iq_te$weekofyear <- as.numeric(iq_te$weekofyear)

#names(iq_train)
#names(iq_te)

ggplot(data = union_all(iq_train, iq_te), aes(x = as.Date(week_start_date), y = total_cases, color = type)) + geom_line()

Model 1: First Submission with Random Forest:

iq_te$city <- "iq" 

iq_te$type <- "test"

rf_final_df <- rbind(sj_te[ , c("city", "type", "total_cases","weekofyear", "week_start_date", "year")], 
                     iq_te[ , c("city", "type", "total_cases","weekofyear", "week_start_date", "year")])

submission_rf <- full_join(x = submission_format, y = rf_final_df, by = c("year","weekofyear", "city"))

data.table::setnames(submission_rf, old = "total_cases.y", "total_cases")

submission_rf$total_cases <- base::round(submission_rf$total_cases,0) # Round

ggplot(submission_rf, aes(as.Date(week_start_date), total_cases, group=city)) + geom_line() + facet_wrap(city~.)

submission_rf <- submission_rf %>% select(city, year, weekofyear, total_cases) 

write.csv(submission_rf, "submission_rf.csv", quote = F, row.names = F)

First Submission with Random Forest:SUBMISSION SCORE: 28.9904

Correlation Matrix

Corr SJ

sj_train %>% 
  dplyr::select(-c( "city","type", "year", "week_start_date")) %>%
  cor(use = 'pairwise.complete.obs') -> M1

corrplot(M1, type = "lower", order = "hclust",  cl.align = "r", tl.col = "black", addCoef.col = "white", tl.cex = .7, number.cex = .6,col = brewer.pal(n=8, name="RdBu"), insig = "blank",diag = FALSE)

Corr IQ

iq_train %>% 
  dplyr::select(-c("city", "type", "year", "week_start_date")) %>%
  cor(use = 'pairwise.complete.obs') -> M2

corrplot(M2, type = "lower", order = "hclust",  cl.align = "r", tl.col = "black", addCoef.col = "white", tl.cex = .7, number.cex = .6,col = brewer.pal(n=8, name="RdBu"), insig = "blank",diag = FALSE)

Feature Engineering

Remove [5] “ndvi_ne”
Remove [6] “ndvi_nw”
Remove [7] “ndvi_se”
Keep [8] “ndvi_sw” Remove [9] “precipitation_amt_mm”
Remove, mutate avg with station_precip_mm
Remove [10] “reanalysis_air_temp_k”
Remove [11] “reanalysis_avg_temp_k”
Remove [12] “reanalysis_dew_point_temp_k” 100% corr with specific_humidity
Remove temps in Kelvin: [13] “reanalysis_max_air_temp_k”
Remove temps in Kelvin: [14] “reanalysis_min_air_temp_k”
Remove: [15] “reanalysis_precip_amt_kg_per_m2”
Remove [16] “reanalysis_relative_humidity_percent” Remove [17] “reanalysis_sat_precip_amt_mm”
Keep [18] “reanalysis_specific_humidity_g_per_kg” Remove [19] “reanalysis_tdtr_k”
Keep [20] “station_avg_temp_c”
Remove [21] “station_diur_temp_rng_c”
Remove [22] “station_max_temp_c”
Remove [23] “station_min_temp_c”
Remove [24] “station_precip_mm” (average w/ precipitation_amt_mm)

sj_train$avg_precipitation<- (sj_train$precipitation_amt_mm+sj_train$station_precip_mm+ sj_train$reanalysis_sat_precip_amt_mm)/3

sj_tr<- sj_train %>% #group_by(city,year, week_start_date) %>%  
  mutate(avg_temp_c = station_avg_temp_c,
         humidity_g_per_kg = reanalysis_specific_humidity_g_per_kg,
         ndvi_ne = NULL,
         ndvi_nw = NULL, 
         ndvi_se = NULL,
         reanalysis_avg_temp_k = NULL, 
         reanalysis_air_temp_k = NULL, 
         station_precip_mm = NULL, 
         precipitation_amt_mm = NULL,
         reanalysis_air_temp_k = NULL,
         reanalysis_avg_temp_k = NULL,
         reanalysis_dew_point_temp_k = NULL,
         reanalysis_max_air_temp_k = NULL,
         reanalysis_min_air_temp_k = NULL,
         reanalysis_precip_amt_kg_per_m2 = NULL,
         reanalysis_relative_humidity_percent = NULL,
         reanalysis_sat_precip_amt_mm = NULL,
         reanalysis_tdtr_k = NULL,
         station_diur_temp_rng_c = NULL,
         station_max_temp_c = NULL,                   
         station_min_temp_c = NULL,
         station_avg_temp_c = NULL,
        reanalysis_specific_humidity_g_per_kg   = NULL,
         station_precip_mm = NULL) 

names(sj_tr)
##  [1] "city"              "week_start_date"   "year"             
##  [4] "weekofyear"        "ndvi_sw"           "total_cases"      
##  [7] "type"              "avg_precipitation" "avg_temp_c"       
## [10] "humidity_g_per_kg"
iq_train$avg_precipitation<- (iq_train$precipitation_amt_mm+iq_train$station_precip_mm+ iq_train$reanalysis_sat_precip_amt_mm)/3

iq_tr<- iq_train %>% #group_by(city,year, week_start_date) %>%  
  mutate(avg_temp_c = station_avg_temp_c,
         humidity_g_per_kg = reanalysis_specific_humidity_g_per_kg,
         ndvi_ne = NULL,
         ndvi_nw = NULL, 
         ndvi_se = NULL,
         reanalysis_avg_temp_k = NULL, 
         reanalysis_air_temp_k = NULL, 
         station_precip_mm = NULL, 
         precipitation_amt_mm = NULL,
         reanalysis_air_temp_k = NULL,
         reanalysis_avg_temp_k = NULL,
         reanalysis_dew_point_temp_k = NULL,
         reanalysis_max_air_temp_k = NULL,
         reanalysis_min_air_temp_k = NULL,
         reanalysis_precip_amt_kg_per_m2 = NULL,
         reanalysis_relative_humidity_percent = NULL,
         reanalysis_sat_precip_amt_mm = NULL,
         reanalysis_tdtr_k = NULL,
         station_diur_temp_rng_c = NULL,
         station_max_temp_c = NULL,                   
         station_min_temp_c = NULL,
         station_avg_temp_c = NULL,
        reanalysis_specific_humidity_g_per_kg   = NULL,
         station_precip_mm = NULL) 


names(iq_tr)
##  [1] "city"              "type"              "week_start_date"  
##  [4] "year"              "weekofyear"        "ndvi_sw"          
##  [7] "total_cases"       "avg_precipitation" "avg_temp_c"       
## [10] "humidity_g_per_kg"

MODEL 2: Random Forest San Juan (new features)

sj_tr$city <- NULL
sj_tr$weekofyear <- as.character(sj_tr$weekofyear) 
sj_tr$year <- as.character(sj_tr$year)
sj_tr$type <- NULL


## train model to predict Test (All train)
## Get the best mtry train a random forest using that mtry

#sj_preprocparams <- preProcess(x = select(sj_tr[ , which(sapply(sj_tr,is.numeric))], -total_cases) ,method = c("center", "scale"))

#sj_tr <- predict(sj_preprocparams, sj_tr)

sj_tr$avg_precipitation <- dplyr::lag(sj_tr$avg_precipitation, 1)

#Plots of total cases to the temp/hum variables do not show any particular relationship
#plot(sj_tr$avg_precipitation,sj_tr$total_cases)
#plot(sj_tr$avg_ndvi,sj_tr$total_cases)
#plot(sj_tr$avg_temp_c,sj_tr$total_cases)
#plot(sj_tr$humidity_g_per_kg,sj_tr$total_cases)

sj_tr[1,6] = sj_tr[53,6]

#sj_tr <- sj_tr[complete.cases(sj_tr), ]

rf_reg <- tuneRF(x = sj_tr[ , -which(names(sj_tr) == "total_cases")], y = sj_tr$total_cases, ntreeTry=100,stepFactor=2,improve=0.05, doBest = T) 
## mtry = 2  OOB error = 507.1013 
## Searching left ...
## mtry = 1     OOB error = 824.093 
## -0.6251052 0.05 
## Searching right ...
## mtry = 4     OOB error = 328.6548 
## 0.3518953 0.05 
## mtry = 7     OOB error = 347.3086 
## -0.05675813 0.05

sj_tr$pred <- predict(rf_reg, sj_tr)

sjtr_pred<- sj_tr

postResample(sj_tr$pred, sj_tr$total_cases)
##      RMSE  Rsquared       MAE 
## 8.2003435 0.9806783 4.3406866
ggplot(data = sj_tr, aes(x = as.Date(week_start_date))) + geom_line(aes(y = total_cases, color = "real")) + geom_line(aes(y = pred, color = "pred" )) 

sj_tr$pred <- NULL
sj_tr$type <- "train"

Model 2: Make final prediction for San Juan (test)

sj_te <- all %>% filter(city == "sj" & type == "test")

##The missing values will be replaced by fromLast:
sj_te <- cbind(sj_te[ , c("city", "type", "week_start_date")] ,  apply(X = sj_te[, which(sapply(sj_te,is.numeric))], MARGIN = 2, function(x) na.locf(x,fromLast = TRUE)))

Feature Engineer San Juan

sj_te$avg_precipitation<- (sj_te$precipitation_amt_mm+sj_te$station_precip_mm+ sj_te$reanalysis_sat_precip_amt_mm)/3

sj_test <- sj_te %>% #group_by(city,year, week_start_date) %>%  
  mutate(avg_temp_c = station_avg_temp_c,
         humidity_g_per_kg = reanalysis_specific_humidity_g_per_kg,
         ndvi_ne = NULL,
         ndvi_nw = NULL, 
         ndvi_se = NULL,
         reanalysis_avg_temp_k = NULL, 
         reanalysis_air_temp_k = NULL, 
         station_precip_mm = NULL, 
         precipitation_amt_mm = NULL,
         reanalysis_air_temp_k = NULL,
         reanalysis_avg_temp_k = NULL,
         reanalysis_dew_point_temp_k = NULL,
         reanalysis_max_air_temp_k = NULL,
         reanalysis_min_air_temp_k = NULL,
         reanalysis_precip_amt_kg_per_m2 = NULL,
         reanalysis_relative_humidity_percent = NULL,
         reanalysis_sat_precip_amt_mm = NULL,
         reanalysis_tdtr_k = NULL,
         station_diur_temp_rng_c = NULL,
         station_max_temp_c = NULL,                   
         station_min_temp_c = NULL,
         station_avg_temp_c = NULL,
        reanalysis_specific_humidity_g_per_kg   = NULL,
         station_precip_mm = NULL) 

## Lag SJ Precipitation
sj_test$avg_precipitation <- dplyr::lag(sj_test$avg_precipitation, 1)

sj_test[1,8] = sj_train[936,27]

#sj_test <- sj_test[complete.cases(sj_test), ]

## Apply parameters and predict
#sj_test <- predict(sj_preprocparams, sj_test)

sj_test$total_cases <- predict(rf_reg , sj_test)

ggplot(sj_test, aes(as.Date(week_start_date), total_cases, group=1)) + geom_line()

sj_tr$year <- as.numeric(sj_tr$year)
sj_tr$weekofyear <- as.numeric(sj_tr$weekofyear)

ggplot(data = union_all(sj_tr, sj_test), aes(x = as.Date(week_start_date), y = total_cases, color = type)) + geom_line() 

Modeling Iquito (new features)

iq_tr$weekofyear <- as.character(iq_tr$weekofyear) 
iq_tr$city <- NULL
iq_tr$type <- NULL
iq_tr$year <- as.character(iq_tr$year)



## Get the best mtry train a random forest using that mtry

#iq_preprocparams <- preProcess(x = select(iq_tr[ , which(sapply(iq_tr,is.numeric))], -total_cases),method = c("center", "scale"))

#iq_tr <- predict(iq_preprocparams, iq_tr)

iq_tr$avg_precipitation <- dplyr::lag(iq_tr$avg_precipitation, 1)

iq_tr[1,6] = iq_tr[53,6]

#iq_tr <- iq_tr[complete.cases(iq_tr), ]

rf_reg <- tuneRF(x = iq_tr[ , -which(names(iq_tr) =="total_cases")], iq_tr$total_cases, ntreeTry=100,stepFactor=2,improve=0.05,trace=FALSE, doBest = T) 
## -0.163319 0.05 
## -0.009337287 0.05

iq_tr$pred <- predict(rf_reg, iq_tr)

iqtr_pred<- iq_tr

postResample(iq_tr$pred, iq_tr$total_cases)
##      RMSE  Rsquared       MAE 
## 4.2498241 0.9085896 2.1890413
ggplot(data = iq_tr, aes(x = as.Date(week_start_date))) + geom_line(aes(y = total_cases, color = "real")) + geom_line(aes(y = pred, color = "pred" )) 

iq_tr$pred <- NULL
iq_tr$type <- "train"

Make final prediction for Iquito (test): Model 2

## Make final prediction for Iquito (test)
iq_te <- all %>% filter(city == "iq" & type == "test")

## The missing values will be replaced by fromLast
iq_te <- cbind(iq_te[ , c("city", "type", "week_start_date")] ,  apply(X = iq_te[, which(sapply(iq_te,is.numeric))], MARGIN = 2, function(x) na.locf(x,fromLast = TRUE)))

Feature Engineer IQUITO Test

iq_te$avg_precipitation<- (iq_te$precipitation_amt_mm+iq_te$station_precip_mm+ iq_te$reanalysis_sat_precip_amt_mm)/3

iq_test <- iq_te %>% #group_by(city,year, week_start_date) %>%  
  mutate(avg_temp_c = station_avg_temp_c,
         humidity_g_per_kg = reanalysis_specific_humidity_g_per_kg,
         ndvi_ne = NULL,
         ndvi_nw = NULL, 
         ndvi_se = NULL,
         reanalysis_avg_temp_k = NULL, 
         reanalysis_air_temp_k = NULL, 
         station_precip_mm = NULL, 
         precipitation_amt_mm = NULL,
         reanalysis_air_temp_k = NULL,
         reanalysis_avg_temp_k = NULL,
         reanalysis_dew_point_temp_k = NULL,
         reanalysis_max_air_temp_k = NULL,
         reanalysis_min_air_temp_k = NULL,
         reanalysis_precip_amt_kg_per_m2 = NULL,
         reanalysis_relative_humidity_percent = NULL,
         reanalysis_sat_precip_amt_mm = NULL,
         reanalysis_tdtr_k = NULL,
         station_diur_temp_rng_c = NULL,
         station_max_temp_c = NULL,                   
         station_min_temp_c = NULL,
         station_avg_temp_c = NULL,
        reanalysis_specific_humidity_g_per_kg   = NULL,
         station_precip_mm = NULL) 


## Lag IQ Precipitation
iq_test$avg_precipitation <- dplyr::lag(iq_test$avg_precipitation, 1)

iq_test[1,8] = iq_train[520,27]

#iq_test <- iq_test[complete.cases(iq_test), ]


## Apply parameters and predict
#iq_test <- predict(iq_preprocparams, iq_test)

iq_test$total_cases <- predict(rf_reg, iq_test)

ggplot(iq_test, aes(as.Date(week_start_date), total_cases, group=1)) + geom_line()

iq_tr$year <- as.numeric(iq_tr$year)
iq_tr$weekofyear <- as.numeric(iq_tr$weekofyear)

ggplot(data = union_all(iq_tr, iq_test), aes(x = as.Date(week_start_date), y = total_cases, color = type)) + geom_line()

#iq_test$weekofyear <- as.character(iq_test$weekofyear)

#iq_test$weekofyear <- as.numeric(iq_test$weekofyear)

Model 2: Second Submission with Random Forest:

iq_test$city <- "iq" 

iq_test$type <- "test"

rf_final_df2 <- rbind(sj_test[ , c("city", "type", "total_cases","weekofyear", "week_start_date", "year")], 
                     iq_test[ , c("city", "type", "total_cases","weekofyear", "week_start_date", "year")])

submission_rf2 <- full_join(x = submission_format, y = rf_final_df2, by = c("year","weekofyear", "city"))

data.table::setnames(submission_rf2, old = "total_cases.y", "total_cases")

submission_rf2$total_cases <- base::round(submission_rf2$total_cases,0) # Round

ggplot(submission_rf2, aes(as.Date(week_start_date), total_cases, group=city)) + geom_line() + facet_wrap(city~.)

submission_rf2 <- submission_rf2 %>% select(city, year, weekofyear, total_cases) 

write.csv(submission_rf2, "submission_rf2.csv", quote = F, row.names = F)

SUBMISSION SCORE #2: 27.1034

MODEL 3: Auto.ARIMA with Exogenous Variables

sjtot=ts(sj_tr$total_cases, frequency=52, start=c(1990,04,30)) 

sjtotplot<- window(sjtot, start= 2000, end = 2008)

ggseasonplot(sjtotplot, polar=TRUE) + ylab("Cases") +
  ggtitle("Polar seasonal plot: San Juan Dengue Fever Cases") + theme_bw()

iqtot=ts(iq_tr$total_cases, frequency=52, start=c(2000,07,01)) 

iqtotplot<- window(iqtot, start= 2000, end = 2008)

ggseasonplot(iqtotplot, polar=TRUE) + ylab("Cases") +
  ggtitle("Polar seasonal plot: Iquito Dengue Fever Cases") + theme_bw()

autoplot(sjtot) + ylab("Weeks") +
  xlab("Cases") + ggtitle("San Juan: Total Cases over Training Period")

autoplot(iqtot) + ylab("Weeks") +
  xlab("Cases") + ggtitle("Iquito: Total Cases over Training Period")

acf(sjtot)

pacf(sjtot)

tsdisplay(sjtot)

autoplot(diff(sjtot,1))

acf(iqtot)

pacf(iqtot)

tsdisplay(iqtot)

autoplot(diff(iqtot,1))

exvar_sj<- as.matrix(sj_tr[,c("weekofyear","ndvi_sw","avg_precipitation" , "avg_temp_c"  ,"humidity_g_per_kg")])

exvar_sjtest<- as.matrix(sj_test[,c("weekofyear","ndvi_sw","avg_precipitation" , "avg_temp_c"  ,"humidity_g_per_kg")])

(f1=auto.arima(sjtot, xreg =exvar_sj))
## Series: sjtot 
## Regression with ARIMA(1,1,1) errors 
## 
## Coefficients:
##          ar1      ma1  weekofyear  ndvi_sw  avg_precipitation  avg_temp_c
##       0.7071  -0.5895     -0.0007  -0.2307             0.4248      1.2978
## s.e.  0.0965   0.1093      0.0608   0.3237             0.3701      1.0526
##       humidity_g_per_kg
##                  1.0069
## s.e.             1.0387
## 
## sigma^2 estimated as 180.5:  log likelihood=-3752.11
## AIC=7520.21   AICc=7520.37   BIC=7558.94
cbind("Regression Errors" = residuals(f1, type="regression"),
      "ARIMA errors" = residuals(f1, type="innovation")) %>%
  autoplot(facets=TRUE)

checkresiduals(f1)

## 
##  Ljung-Box test
## 
## data:  Residuals from Regression with ARIMA(1,1,1) errors
## Q* = 90.045, df = 97, p-value = 0.6785
## 
## Model df: 7.   Total lags used: 104
fc_sj=forecast(f1, h = 260, xreg = exvar_sjtest)

autoplot(fc_sj)

exvar_iq<- as.matrix(iq_tr[,c("weekofyear","ndvi_sw","avg_precipitation" , "avg_temp_c"  ,"humidity_g_per_kg")])

exvar_iqtest<- as.matrix(iq_test[,c("weekofyear","ndvi_sw","avg_precipitation", "avg_temp_c"  ,"humidity_g_per_kg")])

(f2=auto.arima(iqtot,xreg = exvar_iq))
## Series: iqtot 
## Regression with ARIMA(3,0,0) errors 
## 
## Coefficients:
##          ar1      ar2     ar3  intercept  weekofyear  ndvi_sw
##       0.6944  -0.0720  0.1961     9.4552     -0.0766   0.6362
## s.e.  0.0432   0.0535  0.0434     1.9829      0.0415   0.2704
##       avg_precipitation  avg_temp_c  humidity_g_per_kg
##                 -0.2527      0.3630            -0.4703
## s.e.             0.3343      0.3264             0.4322
## 
## sigma^2 estimated as 48.59:  log likelihood=-1743.46
## AIC=3506.92   AICc=3507.36   BIC=3549.46
cbind("Regression Errors" = residuals(f2, type="regression"),
      "ARIMA errors" = residuals(f2, type="innovation")) %>%
  autoplot(facets=TRUE)

checkresiduals(f2)

## 
##  Ljung-Box test
## 
## data:  Residuals from Regression with ARIMA(3,0,0) errors
## Q* = 81.259, df = 95, p-value = 0.8415
## 
## Model df: 9.   Total lags used: 104
fc_iq=forecast(f2, h = 156, xreg = exvar_iqtest)

autoplot(fc_iq)

Arima

sj_te <- all %>% filter(city == “sj” & type == “test”)

##The missing values will be replaced by a forecast sj_te <- cbind(sj_te[ , c(“city”, “type”, “week_start_date”)] , apply(X = sj_te[, which(sapply(sj_te,is.numeric))], MARGIN = 2, function(x) na.locf(x,fromLast = TRUE)))

Feature Engineer San Juan

sj_te <- sj_te %>% group_by(city,year, week_start_date) %>%
mutate(avg_precipitation = mean(station_precip_mm, precipitation_amt_mm,reanalysis_sat_precip_amt_mm, na.rm = T, trim = 0), avg_ndvi = mean(ndvi_ne,ndvi_nw, ndvi_se, ndvi_sw, na.rm = T, trim = 0), avg_temp_c = station_avg_temp_c, humidity_g_per_kg = reanalysis_specific_humidity_g_per_kg, ndvi_ne = NULL, ndvi_nw = NULL, ndvi_se = NULL, ndvi_sw = NULL,
reanalysis_avg_temp_k = NULL, reanalysis_air_temp_k = NULL, station_precip_mm = NULL, precipitation_amt_mm = NULL, reanalysis_air_temp_k = NULL, reanalysis_avg_temp_k = NULL, reanalysis_dew_point_temp_k = NULL, reanalysis_max_air_temp_k = NULL, reanalysis_min_air_temp_k = NULL, reanalysis_precip_amt_kg_per_m2 = NULL, reanalysis_relative_humidity_percent = NULL, reanalysis_sat_precip_amt_mm = NULL, reanalysis_tdtr_k = NULL, station_diur_temp_rng_c = NULL, station_max_temp_c = NULL,
station_min_temp_c = NULL, station_avg_temp_c = NULL, reanalysis_specific_humidity_g_per_kg = NULL, station_precip_mm = NULL)

(fit_sj <- Arima(sjtot, order=c(2,0,1), seasonal=c(0,0,0), xreg =exvar_sj )) 
## Series: sjtot 
## Regression with ARIMA(2,0,1) errors 
## 
## Coefficients:
##          ar1      ar2      ma1  intercept  weekofyear  ndvi_sw
##       1.8474  -0.8621  -0.7800    34.3293     -0.0165  -0.2214
## s.e.  0.0387   0.0368   0.0504     6.5680      0.0608   0.3240
##       avg_precipitation  avg_temp_c  humidity_g_per_kg
##                  0.4180      1.3613             0.8754
## s.e.             0.3702      1.0528             1.0397
## 
## sigma^2 estimated as 172.7:  log likelihood=-3735.91
## AIC=7491.82   AICc=7492.06   BIC=7540.24
fcast_sj=forecast(fit_sj, h = 260, xreg = exvar_sjtest)

autoplot(fcast_sj) + xlab("Year") +
  ylab("Cases")

tsdisplay(residuals(fit_sj))

Box.test(residuals(fcast_sj),fitdf=7, lag=151, type="Ljung")  
## 
##  Box-Ljung test
## 
## data:  residuals(fcast_sj)
## X-squared = 107.77, df = 144, p-value = 0.9894
(fit_sj2 <- Arima(sjtot, order=c(1,2,1), seasonal=c(0,0,0), xreg =exvar_sj )) 
## Series: sjtot 
## Regression with ARIMA(1,2,1) errors 
## 
## Coefficients:
##          ar1      ma1  weekofyear  ndvi_sw  avg_precipitation  avg_temp_c
##       0.1359  -1.0000      0.0283  -0.2766             0.3883      1.1983
## s.e.  0.0326   0.0029      0.0610   0.3220             0.3723      1.0615
##       humidity_g_per_kg
##                  1.0907
## s.e.             1.0465
## 
## sigma^2 estimated as 182.3:  log likelihood=-3756.15
## AIC=7528.3   AICc=7528.46   BIC=7567.02
fcast_sj2=forecast(fit_sj2, h = 260, xreg = exvar_sjtest)

autoplot(fcast_sj2) + xlab("Year") +
  ylab("Cases")

tsdisplay(residuals(fit_sj2))

Box.test(residuals(fcast_sj2),fitdf=7, lag=151, type="Ljung")  
## 
##  Box-Ljung test
## 
## data:  residuals(fcast_sj2)
## X-squared = 149.65, df = 144, p-value = 0.3565

iq_te <- all %>% filter(city == “iq” & type == “test”)

##The missing values will be replaced by a forecast iq_te <- cbind(iq_te[ , c(“city”, “type”, “week_start_date”)] , apply(X = iq_te[, which(sapply(iq_te,is.numeric))], MARGIN = 2, function(x) na.locf(x,fromLast = TRUE)))

Feature Engineer San Juan

iq_te <- iq_te %>% group_by(city,year, week_start_date) %>%
mutate(avg_precipitation = mean(station_precip_mm, precipitation_amt_mm,reanalysis_sat_precip_amt_mm, na.rm = T, trim = 0), avg_ndvi = mean(ndvi_ne,ndvi_nw, ndvi_se, ndvi_sw, na.rm = T, trim = 0), avg_temp_c = station_avg_temp_c, humidity_g_per_kg = reanalysis_specific_humidity_g_per_kg, ndvi_ne = NULL, ndvi_nw = NULL, ndvi_se = NULL, ndvi_sw = NULL,
reanalysis_avg_temp_k = NULL, reanalysis_air_temp_k = NULL, station_precip_mm = NULL, precipitation_amt_mm = NULL, reanalysis_air_temp_k = NULL, reanalysis_avg_temp_k = NULL, reanalysis_dew_point_temp_k = NULL, reanalysis_max_air_temp_k = NULL, reanalysis_min_air_temp_k = NULL, reanalysis_precip_amt_kg_per_m2 = NULL, reanalysis_relative_humidity_percent = NULL, reanalysis_sat_precip_amt_mm = NULL, reanalysis_tdtr_k = NULL, station_diur_temp_rng_c = NULL, station_max_temp_c = NULL,
station_min_temp_c = NULL, station_avg_temp_c = NULL, reanalysis_specific_humidity_g_per_kg = NULL, station_precip_mm = NULL)

(fit_iq <- Arima(iqtot, order=c(1,0,4), seasonal=c(0,0,0), xreg =exvar_iq ))
## Series: iqtot 
## Regression with ARIMA(1,0,4) errors 
## 
## Coefficients:
##          ar1      ma1      ma2     ma3     ma4  intercept  weekofyear  ndvi_sw
##       0.7795  -0.0838  -0.0967  0.0935  0.1344     9.3419     -0.0706   0.6043
## s.e.  0.0610   0.0731   0.0615  0.0532  0.0542     1.8097      0.0422   0.2716
##       avg_precipitation  avg_temp_c  humidity_g_per_kg
##                 -0.3761      0.3001            -0.3601
## s.e.             0.3423      0.3238             0.4441
## 
## sigma^2 estimated as 48.45:  log likelihood=-1741.75
## AIC=3507.51   AICc=3508.12   BIC=3558.55
fcast_iq=forecast(fit_iq, h = 156, xreg = exvar_iqtest)

autoplot(fcast_iq) + xlab("Year") +
  ylab("Cases")

tsdisplay(residuals(fit_iq))

Box.test(residuals(fit_iq),fitdf=7, lag=151, type="Ljung")  
## 
##  Box-Ljung test
## 
## data:  residuals(fit_iq)
## X-squared = 99.443, df = 144, p-value = 0.9982

Model 3: Third Submission with ARIMA xreg

xreg_arima_sj_sol <- data.frame(submission_format[1:260,-4], total_cases = round(fcast_sj$mean))

xreg_arima_iq_sol <- data.frame(submission_format[261:416,-4], total_cases =round(fcast_iq$mean))

xreg_arima_solution <- bind_rows(xreg_arima_sj_sol,xreg_arima_iq_sol)

xreg_arima_solution$total_cases[xreg_arima_solution$total_cases < 0 ] <- 0

write.csv(xreg_arima_solution, file = 'xreg_arima_submission.csv', row.names = F)
xreg_arima_iq_sol2 <- data.frame(submission_format[261:416,-4], total_cases =round(fc_iq$mean))

xreg_arima_solution2 <- bind_rows(xreg_arima_sj_sol,xreg_arima_iq_sol2)

xreg_arima_solution2$total_cases[xreg_arima_solution2$total_cases < 0 ] <- 0

write.csv(xreg_arima_solution2, file = 'xreg_arima_submission2.csv', row.names = F)

Model 3: SUBMISSION SCORE #3: 41.8606

MODEL 4: Seasonal ARIMA

ARIMA: San Juan

acf(sjtot)

pacf(sjtot)

tsdisplay(sjtot)

autoplot(diff(sjtot,1))

sj_ar <- Arima(sjtot, order=c(2,0,1), seasonal=c(0,1,0)) 

sj_fcast<- forecast(sj_ar, h = 260)

autoplot(sj_fcast)

tsdisplay(residuals(sj_ar))

Box.test(residuals(sj_fcast),fitdf=2, lag=151, type="Ljung")  
## 
##  Box-Ljung test
## 
## data:  residuals(sj_fcast)
## X-squared = 437.35, df = 149, p-value < 2.2e-16

ARIMA: Iquito

iq_ar <- Arima(iqtot, order=c(1,0,4), seasonal=c(0,1,0)) 

iq_fcast<- forecast(iq_ar, h = 156)

autoplot(iq_fcast)

tsdisplay(residuals(iq_ar))

Box.test(residuals(iq_fcast),fitdf=3, lag=151, type="Ljung")  
## 
##  Box-Ljung test
## 
## data:  residuals(iq_fcast)
## X-squared = 308.4, df = 148, p-value = 2.43e-13

Model 4: Fourth Submission with Seasonal ARIMA:

arima_sj_sol <- data.frame(submission_format[1:260,-4], total_cases = round(sj_fcast$mean))

arima_iq_sol <- data.frame(submission_format[261:416,-4], total_cases =round(iq_fcast$mean))

arima_solution <- bind_rows(arima_sj_sol,arima_iq_sol)

write.csv(arima_solution, file = 'arima_predicted_Solution.csv', row.names = F)

Your score for this submission is: 29.8413

MODEL 5: ETS with Decomposition

sjtot=ts(sj_train$total_cases, start=c(1990,4,30), frequency=52) 

stl_decomposition = stl(sjtot, t.window=13, s.window="periodic",robust=TRUE)

autoplot(stl_decomposition)

sj_fcast3<- forecast(stl_decomposition,method = "ets",h=260)

accuracy(sj_fcast3)
##                       ME     RMSE      MAE MPE MAPE      MASE       ACF1
## Training set 0.006078719 13.26227 7.841533 NaN  Inf 0.2146436 0.02412127
autoplot(sj_fcast3)

#mstl

mstl_decomposition = mstl(sjtot)

autoplot(mstl_decomposition)

sj_fcast4<- forecast(mstl_decomposition,method = "ets",h=260)

kable(accuracy(sj_fcast4))
ME RMSE MAE MPE MAPE MASE ACF1
Training set -0.0080337 12.20282 7.629666 -Inf Inf 0.2088442 0.042466
autoplot(sj_fcast4) + ylab("Total Cases") + ggtitle("San Juan: MSTL + ETS")

iqtot=ts(iq_train$total_cases, start=c(2000,07,01), frequency=52) 

iqstl_decomp = stl(iqtot, t.window=13, s.window="periodic",robust=TRUE)

autoplot(iqstl_decomp)

iq_fcast3<- forecast(iqstl_decomp,method = "ets",h=156)

kable(accuracy(iq_fcast3))
ME RMSE MAE MPE MAPE MASE ACF1
Training set 0.009541 7.161697 3.675609 NaN Inf 0.3893583 0.0698369
autoplot(iq_fcast3)

#mstl

iqmstl_decomp = mstl(iqtot)

autoplot(iqmstl_decomp)

iq_fcast4<- forecast(iqmstl_decomp,method = "ets",h=156)

accuracy(iq_fcast4)
##                      ME    RMSE      MAE MPE MAPE      MASE       ACF1
## Training set 0.01981199 6.48256 3.828215 NaN  Inf 0.4055239 0.06328665
autoplot(iq_fcast4)


ets_sj_sol <- data.frame(submission_format[1:260,-4], total_cases = round(sj_fcast4$mean))

ets_iq_sol <- data.frame(submission_format[261:416,-4], total_cases =round(iq_fcast4$mean))

ets_solution <- bind_rows(ets_sj_sol,ets_iq_sol)

write.csv(ets_solution, file = 'ets_predicted_Solution.csv', row.names = F)

MODEL 5 SUBMISSION SCORE: 26.5192

MODEL 6: NEURAL NET

(fit_nnetar_sj <- nnetar(sjtot))
## Series: sjtot 
## Model:  NNAR(14,1,8)[52] 
## Call:   nnetar(y = sjtot)
## 
## Average of 20 networks, each of which is
## a 15-8-1 network with 137 weights
## options were - linear output units 
## 
## sigma^2 estimated as 51.01
#repeats=25, size=12, decay=0.1,linout=TRUE
plot(forecast(fit_nnetar_sj,h=260))

summary(fit_nnetar_sj)
##           Length Class        Mode     
## x         936    ts           numeric  
## m           1    -none-       numeric  
## p           1    -none-       numeric  
## P           1    -none-       numeric  
## scalex      2    -none-       list     
## size        1    -none-       numeric  
## subset    936    -none-       numeric  
## model      20    nnetarmodels list     
## nnetargs    0    -none-       list     
## fitted    936    ts           numeric  
## residuals 936    ts           numeric  
## lags       15    -none-       numeric  
## series      1    -none-       character
## method      1    -none-       character
## call        2    -none-       call
checkresiduals(fit_nnetar_sj)

a=forecast(fit_nnetar_sj,h=260)
(fit_nnetar_iq <- nnetar(iqtot))
## Series: iqtot 
## Model:  NNAR(5,1,4)[52] 
## Call:   nnetar(y = iqtot)
## 
## Average of 20 networks, each of which is
## a 6-4-1 network with 33 weights
## options were - linear output units 
## 
## sigma^2 estimated as 23.08
plot(forecast(fit_nnetar_iq,h=156))

checkresiduals(fit_nnetar_iq)

b=forecast(fit_nnetar_iq,h=156)
nnetar_sj_sol <- data.frame(submission_format[1:260,-4], total_cases = round(a$mean))

nnetar_iq_sol <- data.frame(submission_format[261:416,-4], total_cases =round(b$mean))

nnetar_solution <- bind_rows(nnetar_sj_sol,nnetar_iq_sol)

write.csv(nnetar_solution, file = 'nnetar_predicted_Solution.csv', row.names = F)

MODEL 5 SUBMISSION SCORE: 37.4351