true

library(readr)
library(reshape2)
library(tidyr)
library(h2o)
library(dplyr)
library(ggplot2) 
library(caret)
library(lubridate)
library(imputeTS)
library(corrplot)
library(data.table)
library(quantreg)
library(Ecfun)
library(randomForest)
library(forecast)
library(kableExtra)
library(magrittr)
library(zoo)
library(RColorBrewer)
library(tsibble)
library(feasts)
library(gridExtra)

features_tr <- read.csv("C:/Users/hunai/Google Drive/BC_EDU Pred Modeling/ADEC 7640 Final/DATA/dengue_features_train.csv")
                        #col_types = cols(week_start_date = col_date(format = "%Y-%m-%d")))

labels_tr <- read.csv("C:/Users/hunai/Google Drive/BC_EDU Pred Modeling/ADEC 7640 Final/DATA/dengue_labels_train.csv")

features_te <- read.csv("C:/Users/hunai/Google Drive/BC_EDU Pred Modeling/ADEC 7640 Final/DATA/dengue_features_test.csv")

submission_format <- read.csv("C:/Users/hunai/Google Drive/BC_EDU Pred Modeling/ADEC 7640 Final/DATA/submission_format.csv")

names(features_tr)

##  [1] "city"                                 
##  [2] "year"                                 
##  [3] "weekofyear"                           
##  [4] "week_start_date"                      
##  [5] "ndvi_ne"                              
##  [6] "ndvi_nw"                              
##  [7] "ndvi_se"                              
##  [8] "ndvi_sw"                              
##  [9] "precipitation_amt_mm"                 
## [10] "reanalysis_air_temp_k"                
## [11] "reanalysis_avg_temp_k"                
## [12] "reanalysis_dew_point_temp_k"          
## [13] "reanalysis_max_air_temp_k"            
## [14] "reanalysis_min_air_temp_k"            
## [15] "reanalysis_precip_amt_kg_per_m2"      
## [16] "reanalysis_relative_humidity_percent" 
## [17] "reanalysis_sat_precip_amt_mm"         
## [18] "reanalysis_specific_humidity_g_per_kg"
## [19] "reanalysis_tdtr_k"                    
## [20] "station_avg_temp_c"                   
## [21] "station_diur_temp_rng_c"              
## [22] "station_max_temp_c"                   
## [23] "station_min_temp_c"                   
## [24] "station_precip_mm"

Checking distributions

##
rbind(data.frame(group = "train", features_tr),
      data.frame(group = "test", features_te)) %>%
  gather(x, y, precipitation_amt_mm:station_precip_mm) %>%
  ggplot(aes(x = y, color = group, fill = group)) +
  geom_density(alpha = 0.3) +
  facet_wrap( ~ x, scales = "free", ncol = 3)

Create Train and Test dataset.

train <- left_join(x = features_tr, y = labels_tr, by = c("year", "weekofyear", "city"))

test <- left_join(x = features_te, y = submission_format, by = c("year", "weekofyear", "city"))

train$type <- "train"

test$type <- "test"

Combine Train+ Test for EDA and Viz

all <- rbind(train, test)

Summary of data

summary(all[ , c("station_avg_temp_c", "reanalysis_avg_temp_k", "reanalysis_air_temp_k")])

##  station_avg_temp_c reanalysis_avg_temp_k reanalysis_air_temp_k
##  Min.   :21.40      Min.   :294.9         Min.   :294.6        
##  1st Qu.:26.33      1st Qu.:298.3         1st Qu.:297.7        
##  Median :27.43      Median :299.3         Median :298.6        
##  Mean   :27.23      Mean   :299.3         Mean   :298.7        
##  3rd Qu.:28.20      3rd Qu.:300.3         3rd Qu.:299.9        
##  Max.   :30.80      Max.   :303.3         Max.   :302.2        
##  NA's   :55         NA's   :12            NA's   :12

summary(all[ , c("station_precip_mm", "precipitation_amt_mm")])

##  station_precip_mm precipitation_amt_mm
##  Min.   :  0.0     Min.   :  0.00      
##  1st Qu.:  8.8     1st Qu.:  9.43      
##  Median : 23.8     Median : 36.64      
##  Mean   : 38.2     Mean   : 44.11      
##  3rd Qu.: 51.5     3rd Qu.: 67.50      
##  Max.   :543.3     Max.   :390.60      
##  NA's   :27        NA's   :15

Split data by city and treat missing values

SAN JUAN

sj_train <-  all %>% filter(city == "sj" & type == "train")

summary(sj_train)

##      city                year        weekofyear    week_start_date   
##  Length:936         Min.   :1990   Min.   : 1.00   Length:936        
##  Class :character   1st Qu.:1994   1st Qu.:13.75   Class :character  
##  Mode  :character   Median :1999   Median :26.50   Mode  :character  
##                     Mean   :1999   Mean   :26.50                     
##                     3rd Qu.:2003   3rd Qu.:39.25                     
##                     Max.   :2008   Max.   :53.00                     
##                                                                      
##     ndvi_ne            ndvi_nw            ndvi_se            ndvi_sw        
##  Min.   :-0.40625   Min.   :-0.45610   Min.   :-0.01553   Min.   :-0.06346  
##  1st Qu.: 0.00450   1st Qu.: 0.01642   1st Qu.: 0.13928   1st Qu.: 0.12916  
##  Median : 0.05770   Median : 0.06807   Median : 0.17719   Median : 0.16597  
##  Mean   : 0.05792   Mean   : 0.06747   Mean   : 0.17766   Mean   : 0.16596  
##  3rd Qu.: 0.11110   3rd Qu.: 0.11520   3rd Qu.: 0.21256   3rd Qu.: 0.20277  
##  Max.   : 0.49340   Max.   : 0.43710   Max.   : 0.39313   Max.   : 0.38142  
##  NA's   :191        NA's   :49         NA's   :19         NA's   :19        
##  precipitation_amt_mm reanalysis_air_temp_k reanalysis_avg_temp_k
##  Min.   :  0.00       Min.   :295.9         Min.   :296.1        
##  1st Qu.:  0.00       1st Qu.:298.2         1st Qu.:298.3        
##  Median : 20.80       Median :299.3         Median :299.4        
##  Mean   : 35.47       Mean   :299.2         Mean   :299.3        
##  3rd Qu.: 52.18       3rd Qu.:300.1         3rd Qu.:300.2        
##  Max.   :390.60       Max.   :302.2         Max.   :302.2        
##  NA's   :9            NA's   :6             NA's   :6            
##  reanalysis_dew_point_temp_k reanalysis_max_air_temp_k
##  Min.   :289.6               Min.   :297.8            
##  1st Qu.:293.8               1st Qu.:300.4            
##  Median :295.5               Median :301.5            
##  Mean   :295.1               Mean   :301.4            
##  3rd Qu.:296.4               3rd Qu.:302.4            
##  Max.   :297.8               Max.   :304.3            
##  NA's   :6                   NA's   :6                
##  reanalysis_min_air_temp_k reanalysis_precip_amt_kg_per_m2
##  Min.   :292.6             Min.   :  0.00                 
##  1st Qu.:296.3             1st Qu.: 10.82                 
##  Median :297.5             Median : 21.30                 
##  Mean   :297.3             Mean   : 30.47                 
##  3rd Qu.:298.4             3rd Qu.: 37.00                 
##  Max.   :299.9             Max.   :570.50                 
##  NA's   :6                 NA's   :6                      
##  reanalysis_relative_humidity_percent reanalysis_sat_precip_amt_mm
##  Min.   :66.74                        Min.   :  0.00              
##  1st Qu.:76.25                        1st Qu.:  0.00              
##  Median :78.67                        Median : 20.80              
##  Mean   :78.57                        Mean   : 35.47              
##  3rd Qu.:80.96                        3rd Qu.: 52.18              
##  Max.   :87.58                        Max.   :390.60              
##  NA's   :6                            NA's   :9                   
##  reanalysis_specific_humidity_g_per_kg reanalysis_tdtr_k station_avg_temp_c
##  Min.   :11.72                         Min.   :1.357     Min.   :22.84     
##  1st Qu.:15.24                         1st Qu.:2.157     1st Qu.:25.84     
##  Median :16.85                         Median :2.457     Median :27.23     
##  Mean   :16.55                         Mean   :2.516     Mean   :27.01     
##  3rd Qu.:17.86                         3rd Qu.:2.800     3rd Qu.:28.19     
##  Max.   :19.44                         Max.   :4.429     Max.   :30.07     
##  NA's   :6                             NA's   :6         NA's   :6         
##  station_diur_temp_rng_c station_max_temp_c station_min_temp_c
##  Min.   :4.529           Min.   :26.70      Min.   :17.8      
##  1st Qu.:6.200           1st Qu.:30.60      1st Qu.:21.7      
##  Median :6.757           Median :31.70      Median :22.8      
##  Mean   :6.757           Mean   :31.61      Mean   :22.6      
##  3rd Qu.:7.286           3rd Qu.:32.80      3rd Qu.:23.9      
##  Max.   :9.914           Max.   :35.60      Max.   :25.6      
##  NA's   :6               NA's   :6          NA's   :6         
##  station_precip_mm  total_cases         type          
##  Min.   :  0.000   Min.   :  0.00   Length:936        
##  1st Qu.:  6.825   1st Qu.:  9.00   Class :character  
##  Median : 17.750   Median : 19.00   Mode  :character  
##  Mean   : 26.785   Mean   : 34.18                     
##  3rd Qu.: 35.450   3rd Qu.: 37.00                     
##  Max.   :305.900   Max.   :461.00                     
##  NA's   :6

# Missing value analysis
# Get percentage of missing value by column

sj_miss<- dplyr::select(sj_train,-c("city","year","weekofyear","week_start_date","total_cases","type")) 

sj_miss<- apply(X = sj_miss, 2, function(x) sum(is.na(x))/nrow(sj_miss)) %>% 
  knitr:: kable() %>%  kable_styling(bootstrap_options = "striped", full_width = F) 

sj_miss

	x
ndvi_ne	0.2040598
ndvi_nw	0.0523504
ndvi_se	0.0202991
ndvi_sw	0.0202991
precipitation_amt_mm	0.0096154
reanalysis_air_temp_k	0.0064103
reanalysis_avg_temp_k	0.0064103
reanalysis_dew_point_temp_k	0.0064103
reanalysis_max_air_temp_k	0.0064103
reanalysis_min_air_temp_k	0.0064103
reanalysis_precip_amt_kg_per_m2	0.0064103
reanalysis_relative_humidity_percent	0.0064103
reanalysis_sat_precip_amt_mm	0.0096154
reanalysis_specific_humidity_g_per_kg	0.0064103
reanalysis_tdtr_k	0.0064103
station_avg_temp_c	0.0064103
station_diur_temp_rng_c	0.0064103
station_max_temp_c	0.0064103
station_min_temp_c	0.0064103
station_precip_mm	0.0064103

Impute missing values Using LOCF from ‘zoo’

sj_train <- cbind(sj_train[ , c("city", "type","week_start_date")] ,  apply(X = sj_train[, which(sapply(sj_train,is.numeric))], MARGIN = 2, function(x) na.locf(x,fromLast = TRUE)))

IQUITO

# # IQUITO
iq_train <-  all %>% filter(city == "iq" & type == "train") 

summary(iq_train)

##      city                year        weekofyear    week_start_date   
##  Length:520         Min.   :2000   Min.   : 1.00   Length:520        
##  Class :character   1st Qu.:2003   1st Qu.:13.75   Class :character  
##  Mode  :character   Median :2005   Median :26.50   Mode  :character  
##                     Mean   :2005   Mean   :26.50                     
##                     3rd Qu.:2007   3rd Qu.:39.25                     
##                     Max.   :2010   Max.   :53.00                     
##                                                                      
##     ndvi_ne           ndvi_nw           ndvi_se           ndvi_sw       
##  Min.   :0.06173   Min.   :0.03586   Min.   :0.02988   Min.   :0.06418  
##  1st Qu.:0.20000   1st Qu.:0.17954   1st Qu.:0.19474   1st Qu.:0.20413  
##  Median :0.26364   Median :0.23297   Median :0.24980   Median :0.26214  
##  Mean   :0.26387   Mean   :0.23878   Mean   :0.25013   Mean   :0.26678  
##  3rd Qu.:0.31997   3rd Qu.:0.29393   3rd Qu.:0.30230   3rd Qu.:0.32515  
##  Max.   :0.50836   Max.   :0.45443   Max.   :0.53831   Max.   :0.54602  
##  NA's   :3         NA's   :3         NA's   :3         NA's   :3        
##  precipitation_amt_mm reanalysis_air_temp_k reanalysis_avg_temp_k
##  Min.   :  0.00       Min.   :294.6         Min.   :294.9        
##  1st Qu.: 39.10       1st Qu.:297.1         1st Qu.:298.2        
##  Median : 60.47       Median :297.8         Median :299.1        
##  Mean   : 64.25       Mean   :297.9         Mean   :299.1        
##  3rd Qu.: 85.76       3rd Qu.:298.6         3rd Qu.:300.1        
##  Max.   :210.83       Max.   :301.6         Max.   :302.9        
##  NA's   :4            NA's   :4             NA's   :4            
##  reanalysis_dew_point_temp_k reanalysis_max_air_temp_k
##  Min.   :290.1               Min.   :300.0            
##  1st Qu.:294.6               1st Qu.:305.2            
##  Median :295.9               Median :307.1            
##  Mean   :295.5               Mean   :307.1            
##  3rd Qu.:296.5               3rd Qu.:308.7            
##  Max.   :298.4               Max.   :314.0            
##  NA's   :4                   NA's   :4                
##  reanalysis_min_air_temp_k reanalysis_precip_amt_kg_per_m2
##  Min.   :286.9             Min.   :  0.00                 
##  1st Qu.:292.0             1st Qu.: 24.07                 
##  Median :293.1             Median : 46.44                 
##  Mean   :292.9             Mean   : 57.61                 
##  3rd Qu.:294.2             3rd Qu.: 71.07                 
##  Max.   :296.0             Max.   :362.03                 
##  NA's   :4                 NA's   :4                      
##  reanalysis_relative_humidity_percent reanalysis_sat_precip_amt_mm
##  Min.   :57.79                        Min.   :  0.00              
##  1st Qu.:84.30                        1st Qu.: 39.10              
##  Median :90.92                        Median : 60.47              
##  Mean   :88.64                        Mean   : 64.25              
##  3rd Qu.:94.56                        3rd Qu.: 85.76              
##  Max.   :98.61                        Max.   :210.83              
##  NA's   :4                            NA's   :4                   
##  reanalysis_specific_humidity_g_per_kg reanalysis_tdtr_k station_avg_temp_c
##  Min.   :12.11                         Min.   : 3.714    Min.   :21.40     
##  1st Qu.:16.10                         1st Qu.: 7.371    1st Qu.:27.00     
##  Median :17.43                         Median : 8.964    Median :27.60     
##  Mean   :17.10                         Mean   : 9.207    Mean   :27.53     
##  3rd Qu.:18.18                         3rd Qu.:11.014    3rd Qu.:28.10     
##  Max.   :20.46                         Max.   :16.029    Max.   :30.80     
##  NA's   :4                             NA's   :4         NA's   :37        
##  station_diur_temp_rng_c station_max_temp_c station_min_temp_c
##  Min.   : 5.20           Min.   :30.1       Min.   :14.7      
##  1st Qu.: 9.50           1st Qu.:33.2       1st Qu.:20.6      
##  Median :10.62           Median :34.0       Median :21.3      
##  Mean   :10.57           Mean   :34.0       Mean   :21.2      
##  3rd Qu.:11.65           3rd Qu.:34.9       3rd Qu.:22.0      
##  Max.   :15.80           Max.   :42.2       Max.   :24.2      
##  NA's   :37              NA's   :14         NA's   :8         
##  station_precip_mm  total_cases          type          
##  Min.   :  0.00    Min.   :  0.000   Length:520        
##  1st Qu.: 17.20    1st Qu.:  1.000   Class :character  
##  Median : 45.30    Median :  5.000   Mode  :character  
##  Mean   : 62.47    Mean   :  7.565                     
##  3rd Qu.: 85.95    3rd Qu.:  9.000                     
##  Max.   :543.30    Max.   :116.000                     
##  NA's   :16

# Missing value analysis
# Get percentage of missing value by column

iq_miss<- dplyr::select(iq_train,-c("city","year","weekofyear","week_start_date","total_cases","type"))
                        
iq_miss<- apply(X = iq_miss, 2, function(x) sum(is.na(x))/nrow(iq_miss)) %>% 
  knitr:: kable() %>%  kable_styling(bootstrap_options = "striped", full_width = F) 

iq_miss

	x
ndvi_ne	0.0057692
ndvi_nw	0.0057692
ndvi_se	0.0057692
ndvi_sw	0.0057692
precipitation_amt_mm	0.0076923
reanalysis_air_temp_k	0.0076923
reanalysis_avg_temp_k	0.0076923
reanalysis_dew_point_temp_k	0.0076923
reanalysis_max_air_temp_k	0.0076923
reanalysis_min_air_temp_k	0.0076923
reanalysis_precip_amt_kg_per_m2	0.0076923
reanalysis_relative_humidity_percent	0.0076923
reanalysis_sat_precip_amt_mm	0.0076923
reanalysis_specific_humidity_g_per_kg	0.0076923
reanalysis_tdtr_k	0.0076923
station_avg_temp_c	0.0711538
station_diur_temp_rng_c	0.0711538
station_max_temp_c	0.0269231
station_min_temp_c	0.0153846
station_precip_mm	0.0307692

plotNA.distribution(iq_train$ndvi_ne)

plotNA.distributionBar(iq_train$ndvi_ne, breaks = 20)

Impute missing values Using LOCF from ‘zoo’

iq_train <- cbind(iq_train[ , c("city", "type","week_start_date")] ,  apply(X = iq_train[, which(sapply(iq_train,is.numeric))], MARGIN = 2, function(x) na.locf(x,fromLast = TRUE)))

## Total cases across cities

p1<- ggplot(sj_train, aes(x=as.Date(week_start_date), y = total_cases, group = 1)) + geom_line(color="blue") +
  xlab("Week Start Date") + ylab("Total Cases") + ggtitle("Case Trend for San Juan") + 
  theme(axis.text.x = element_text(),axis.text.y = element_text()) +
    theme_minimal()

p2<- ggplot(iq_train, aes(x=as.Date(week_start_date), y = total_cases, group = 1)) + geom_line(color="red") +
  xlab("Week Start Date") + ylab("Total Cases") +  ggtitle("Case Trend for Iquito") +
  theme(axis.text.x = element_text(),axis.text.y = element_text()) +
    theme_minimal()

grid.arrange(p1,p2,nrow =2)

Model 1: Random Forest San Juan (all features)

## Modelling San Juan
#sj_train$city <- NULL
sj_train$weekofyear <- as.character(sj_train$weekofyear) 
sj_train$year <- as.character(sj_train$year)

## train model to predict Test (All train)
# Get the best mtry train a random forest using that mtry

preprocessparams <- preProcess(x = select(sj_train[ , which(sapply(sj_train,is.numeric))], -total_cases) ,method = c("center", "scale"))

sj_train <- predict(preprocessparams, sj_train)

sj_train$type <- NULL

rf_reg <- tuneRF(x = sj_train[ , -which(names(sj_train) == "total_cases")], y = sj_train$total_cases, ntreeTry=100,stepFactor=2,improve=0.05, doBest = T)

## mtry = 8  OOB error = 626.2102 
## Searching left ...
## mtry = 4     OOB error = 820.3542 
## -0.3100302 0.05 
## Searching right ...
## mtry = 16    OOB error = 495.1907 
## 0.2092261 0.05 
## mtry = 24    OOB error = 538.3619 
## -0.08718098 0.05

sj_train$pred <- predict(rf_reg, sj_train)

postResample(sj_train$pred, sj_train$total_cases)

##      RMSE  Rsquared       MAE 
## 9.8591277 0.9694185 5.3541332

ggplot(data = sj_train, aes(x = as.Date(week_start_date))) + geom_line(aes(y = total_cases, color = "real")) + geom_line(aes(y = pred, color = "pred" ))

sj_train$pred <- NULL
sj_train$type <- "train"

1 with station_precip lag of 1 RMSE Rsquared MAE 9.510456 0.972371 5.251435

with station_precip + station_avg lag of 1 RMSE Rsquared MAE 9.633140 0.968891 5.086512
with station_precip + station_avg + ndvi_ne lag of 1 RMSE Rsquared MAE 9.8423416 0.9692057 5.2444620
No lagged vars RMSE Rsquared MAE 9.6823447 0.9703827 5.1047858

Model 1: Make final prediction for San Juan (test)

sj_te <- all %>% filter(city == "sj" & type == "test")

# Missing value analysis
# Get percentage of missing value by column

sjte_miss<- dplyr::select(sj_te,-c("city","year","weekofyear","week_start_date","total_cases","type")) 

sjte_miss<- apply(X = sjte_miss, 2, function(x) sum(is.na(x))/nrow(sjte_miss)) %>% 
  knitr:: kable() %>%  kable_styling(bootstrap_options = "striped", full_width = F) 

sjte_miss

	x
ndvi_ne	0.1653846
ndvi_nw	0.0423077
ndvi_se	0.0038462
ndvi_sw	0.0038462
precipitation_amt_mm	0.0076923
reanalysis_air_temp_k	0.0076923
reanalysis_avg_temp_k	0.0076923
reanalysis_dew_point_temp_k	0.0076923
reanalysis_max_air_temp_k	0.0076923
reanalysis_min_air_temp_k	0.0076923
reanalysis_precip_amt_kg_per_m2	0.0076923
reanalysis_relative_humidity_percent	0.0076923
reanalysis_sat_precip_amt_mm	0.0076923
reanalysis_specific_humidity_g_per_kg	0.0076923
reanalysis_tdtr_k	0.0076923
station_avg_temp_c	0.0076923
station_diur_temp_rng_c	0.0076923
station_max_temp_c	0.0076923
station_min_temp_c	0.0076923
station_precip_mm	0.0076923

##The missing values will be replaced by last values
sj_te <- cbind(sj_te[ , c("city", "type", "week_start_date")] ,  apply(X = sj_te[, which(sapply(sj_te,is.numeric))], MARGIN = 2, function(x) na.locf(x,fromLast = TRUE)))

## Apply parameters and predict
#sj_te <- predict(preprocessparams, sj_te)

sj_te$total_cases <- predict(rf_reg , sj_te)

ggplot(sj_te, aes(as.Date(week_start_date), total_cases, group=1)) + geom_line()

sj_train$year <- as.numeric(sj_train$year)
sj_train$weekofyear <- as.numeric(sj_train$weekofyear)

ggplot(data = union_all(sj_train, sj_te), aes(x = as.Date(week_start_date), y = total_cases, color = type, group =1)) + geom_line()

Model 1: Iquito (all features)

iq_train$weekofyear <- as.character(iq_train$weekofyear) 
#iq_train$city <- NULL
iq_train$year <- as.character(iq_train$year)

# train model to predict Test (All train)
preprocessparams <- preProcess(x = select(iq_train[ , which(sapply(iq_train,is.numeric))], -total_cases),method = c("center", "scale"))

iq_train <- predict(preprocessparams, iq_train)

rf_reg <- tuneRF(x = iq_train[ , -which(colnames(iq_train)=="total_cases")], iq_train$total_cases, ntreeTry=100,stepFactor=2,improve=0.05,trace=FALSE, doBest = T)

## -0.001493603 0.05 
## 0.03296734 0.05

iq_train$pred <- predict(rf_reg, iq_train)

postResample(iq_train$pred, iq_train$total_cases)

##      RMSE  Rsquared       MAE 
## 4.1436702 0.9135643 2.1860947

ggplot(data = iq_train, aes(x = as.Date(week_start_date))) + geom_line(aes(y = total_cases, color = "real")) + geom_line(aes(y = pred, color = "pred" ))

iq_train$pred <- NULL

Model 1: Make final prediction for Iquito (test)

iq_te <- all %>% filter(city == "iq" & type == "test")

# Missing value analysis
# Get percentage of missing value by column

iqte_miss<- dplyr::select(iq_te,-c("city","year","weekofyear","week_start_date","total_cases","type")) 

iqte_miss<- apply(X = iqte_miss, 2, function(x) sum(is.na(x))/nrow(iqte_miss)) %>% 
  knitr:: kable() %>%  kable_styling(bootstrap_options = "striped", full_width = F) 

iqte_miss

	x
ndvi_ne	0.0000000
ndvi_nw	0.0000000
ndvi_se	0.0000000
ndvi_sw	0.0000000
precipitation_amt_mm	0.0000000
reanalysis_air_temp_k	0.0000000
reanalysis_avg_temp_k	0.0000000
reanalysis_dew_point_temp_k	0.0000000
reanalysis_max_air_temp_k	0.0000000
reanalysis_min_air_temp_k	0.0000000
reanalysis_precip_amt_kg_per_m2	0.0000000
reanalysis_relative_humidity_percent	0.0000000
reanalysis_sat_precip_amt_mm	0.0000000
reanalysis_specific_humidity_g_per_kg	0.0000000
reanalysis_tdtr_k	0.0000000
station_avg_temp_c	0.0641026
station_diur_temp_rng_c	0.0641026
station_max_temp_c	0.0064103
station_min_temp_c	0.0448718
station_precip_mm	0.0192308

## The missing values will be replaced by a forecast
iq_te <- cbind(iq_te[ , c("city", "type", "week_start_date")] ,  apply(X = iq_te[, which(sapply(iq_te,is.numeric))], MARGIN = 2, function(x) na.locf(x,fromLast = TRUE)))

iq_te$weekofyear <- as.character(iq_te$weekofyear) 
#iq_te$city <- NULL
iq_te$year <- as.character(iq_te$year)


iq_te <- predict(preprocessparams, iq_te)
iq_te$total_cases <- predict(rf_reg, iq_te)


iq_train$year <- as.numeric(iq_train$year)
iq_train$weekofyear <- as.numeric(iq_train$weekofyear)

iq_te$year <- as.numeric(iq_te$year)
iq_te$weekofyear <- as.numeric(iq_te$weekofyear)

#names(iq_train)
#names(iq_te)

ggplot(data = union_all(iq_train, iq_te), aes(x = as.Date(week_start_date), y = total_cases, color = type)) + geom_line()

Model 1: First Submission with Random Forest:

iq_te$city <- "iq" 

iq_te$type <- "test"

rf_final_df <- rbind(sj_te[ , c("city", "type", "total_cases","weekofyear", "week_start_date", "year")], 
                     iq_te[ , c("city", "type", "total_cases","weekofyear", "week_start_date", "year")])

submission_rf <- full_join(x = submission_format, y = rf_final_df, by = c("year","weekofyear", "city"))

data.table::setnames(submission_rf, old = "total_cases.y", "total_cases")

submission_rf$total_cases <- base::round(submission_rf$total_cases,0) # Round

ggplot(submission_rf, aes(as.Date(week_start_date), total_cases, group=city)) + geom_line() + facet_wrap(city~.)

submission_rf <- submission_rf %>% select(city, year, weekofyear, total_cases) 

write.csv(submission_rf, "submission_rf.csv", quote = F, row.names = F)

First Submission with Random Forest:SUBMISSION SCORE: 28.9904

Correlation Matrix

Corr SJ

sj_train %>% 
  dplyr::select(-c( "city","type", "year", "week_start_date")) %>%
  cor(use = 'pairwise.complete.obs') -> M1

corrplot(M1, type = "lower", order = "hclust",  cl.align = "r", tl.col = "black", addCoef.col = "white", tl.cex = .7, number.cex = .6,col = brewer.pal(n=8, name="RdBu"), insig = "blank",diag = FALSE)

Corr IQ

iq_train %>% 
  dplyr::select(-c("city", "type", "year", "week_start_date")) %>%
  cor(use = 'pairwise.complete.obs') -> M2

corrplot(M2, type = "lower", order = "hclust",  cl.align = "r", tl.col = "black", addCoef.col = "white", tl.cex = .7, number.cex = .6,col = brewer.pal(n=8, name="RdBu"), insig = "blank",diag = FALSE)

Feature Engineering

Remove [5] “ndvi_ne”
Remove [6] “ndvi_nw”
Remove [7] “ndvi_se”
Keep [8] “ndvi_sw” Remove [9] “precipitation_amt_mm”
Remove, mutate avg with station_precip_mm
Remove [10] “reanalysis_air_temp_k”
Remove [11] “reanalysis_avg_temp_k”
Remove [12] “reanalysis_dew_point_temp_k” 100% corr with specific_humidity
Remove temps in Kelvin: [13] “reanalysis_max_air_temp_k”
Remove temps in Kelvin: [14] “reanalysis_min_air_temp_k”
Remove: [15] “reanalysis_precip_amt_kg_per_m2”
Remove [16] “reanalysis_relative_humidity_percent” Remove [17] “reanalysis_sat_precip_amt_mm”
Keep [18] “reanalysis_specific_humidity_g_per_kg” Remove [19] “reanalysis_tdtr_k”
Keep [20] “station_avg_temp_c”
Remove [21] “station_diur_temp_rng_c”
Remove [22] “station_max_temp_c”
Remove [23] “station_min_temp_c”
Remove [24] “station_precip_mm” (average w/ precipitation_amt_mm)

sj_train$avg_precipitation<- (sj_train$precipitation_amt_mm+sj_train$station_precip_mm+ sj_train$reanalysis_sat_precip_amt_mm)/3

sj_tr<- sj_train %>% #group_by(city,year, week_start_date) %>%  
  mutate(avg_temp_c = station_avg_temp_c,
         humidity_g_per_kg = reanalysis_specific_humidity_g_per_kg,
         ndvi_ne = NULL,
         ndvi_nw = NULL, 
         ndvi_se = NULL,
         reanalysis_avg_temp_k = NULL, 
         reanalysis_air_temp_k = NULL, 
         station_precip_mm = NULL, 
         precipitation_amt_mm = NULL,
         reanalysis_air_temp_k = NULL,
         reanalysis_avg_temp_k = NULL,
         reanalysis_dew_point_temp_k = NULL,
         reanalysis_max_air_temp_k = NULL,
         reanalysis_min_air_temp_k = NULL,
         reanalysis_precip_amt_kg_per_m2 = NULL,
         reanalysis_relative_humidity_percent = NULL,
         reanalysis_sat_precip_amt_mm = NULL,
         reanalysis_tdtr_k = NULL,
         station_diur_temp_rng_c = NULL,
         station_max_temp_c = NULL,                   
         station_min_temp_c = NULL,
         station_avg_temp_c = NULL,
        reanalysis_specific_humidity_g_per_kg   = NULL,
         station_precip_mm = NULL) 

names(sj_tr)

##  [1] "city"              "week_start_date"   "year"             
##  [4] "weekofyear"        "ndvi_sw"           "total_cases"      
##  [7] "type"              "avg_precipitation" "avg_temp_c"       
## [10] "humidity_g_per_kg"

iq_train$avg_precipitation<- (iq_train$precipitation_amt_mm+iq_train$station_precip_mm+ iq_train$reanalysis_sat_precip_amt_mm)/3

iq_tr<- iq_train %>% #group_by(city,year, week_start_date) %>%  
  mutate(avg_temp_c = station_avg_temp_c,
         humidity_g_per_kg = reanalysis_specific_humidity_g_per_kg,
         ndvi_ne = NULL,
         ndvi_nw = NULL, 
         ndvi_se = NULL,
         reanalysis_avg_temp_k = NULL, 
         reanalysis_air_temp_k = NULL, 
         station_precip_mm = NULL, 
         precipitation_amt_mm = NULL,
         reanalysis_air_temp_k = NULL,
         reanalysis_avg_temp_k = NULL,
         reanalysis_dew_point_temp_k = NULL,
         reanalysis_max_air_temp_k = NULL,
         reanalysis_min_air_temp_k = NULL,
         reanalysis_precip_amt_kg_per_m2 = NULL,
         reanalysis_relative_humidity_percent = NULL,
         reanalysis_sat_precip_amt_mm = NULL,
         reanalysis_tdtr_k = NULL,
         station_diur_temp_rng_c = NULL,
         station_max_temp_c = NULL,                   
         station_min_temp_c = NULL,
         station_avg_temp_c = NULL,
        reanalysis_specific_humidity_g_per_kg   = NULL,
         station_precip_mm = NULL) 


names(iq_tr)

##  [1] "city"              "type"              "week_start_date"  
##  [4] "year"              "weekofyear"        "ndvi_sw"          
##  [7] "total_cases"       "avg_precipitation" "avg_temp_c"       
## [10] "humidity_g_per_kg"

MODEL 2: Random Forest San Juan (new features)

sj_tr$city <- NULL
sj_tr$weekofyear <- as.character(sj_tr$weekofyear) 
sj_tr$year <- as.character(sj_tr$year)
sj_tr$type <- NULL


## train model to predict Test (All train)
## Get the best mtry train a random forest using that mtry

#sj_preprocparams <- preProcess(x = select(sj_tr[ , which(sapply(sj_tr,is.numeric))], -total_cases) ,method = c("center", "scale"))

#sj_tr <- predict(sj_preprocparams, sj_tr)

sj_tr$avg_precipitation <- dplyr::lag(sj_tr$avg_precipitation, 1)

#Plots of total cases to the temp/hum variables do not show any particular relationship
#plot(sj_tr$avg_precipitation,sj_tr$total_cases)
#plot(sj_tr$avg_ndvi,sj_tr$total_cases)
#plot(sj_tr$avg_temp_c,sj_tr$total_cases)
#plot(sj_tr$humidity_g_per_kg,sj_tr$total_cases)

sj_tr[1,6] = sj_tr[53,6]

#sj_tr <- sj_tr[complete.cases(sj_tr), ]

rf_reg <- tuneRF(x = sj_tr[ , -which(names(sj_tr) == "total_cases")], y = sj_tr$total_cases, ntreeTry=100,stepFactor=2,improve=0.05, doBest = T)

## mtry = 2  OOB error = 507.1013 
## Searching left ...
## mtry = 1     OOB error = 824.093 
## -0.6251052 0.05 
## Searching right ...
## mtry = 4     OOB error = 328.6548 
## 0.3518953 0.05 
## mtry = 7     OOB error = 347.3086 
## -0.05675813 0.05

sj_tr$pred <- predict(rf_reg, sj_tr)

sjtr_pred<- sj_tr

postResample(sj_tr$pred, sj_tr$total_cases)

##      RMSE  Rsquared       MAE 
## 8.2003435 0.9806783 4.3406866

ggplot(data = sj_tr, aes(x = as.Date(week_start_date))) + geom_line(aes(y = total_cases, color = "real")) + geom_line(aes(y = pred, color = "pred" ))

sj_tr$pred <- NULL
sj_tr$type <- "train"

Model 2: Make final prediction for San Juan (test)

sj_te <- all %>% filter(city == "sj" & type == "test")

##The missing values will be replaced by fromLast:
sj_te <- cbind(sj_te[ , c("city", "type", "week_start_date")] ,  apply(X = sj_te[, which(sapply(sj_te,is.numeric))], MARGIN = 2, function(x) na.locf(x,fromLast = TRUE)))

Feature Engineer San Juan

sj_te$avg_precipitation<- (sj_te$precipitation_amt_mm+sj_te$station_precip_mm+ sj_te$reanalysis_sat_precip_amt_mm)/3

sj_test <- sj_te %>% #group_by(city,year, week_start_date) %>%  
  mutate(avg_temp_c = station_avg_temp_c,
         humidity_g_per_kg = reanalysis_specific_humidity_g_per_kg,
         ndvi_ne = NULL,
         ndvi_nw = NULL, 
         ndvi_se = NULL,
         reanalysis_avg_temp_k = NULL, 
         reanalysis_air_temp_k = NULL, 
         station_precip_mm = NULL, 
         precipitation_amt_mm = NULL,
         reanalysis_air_temp_k = NULL,
         reanalysis_avg_temp_k = NULL,
         reanalysis_dew_point_temp_k = NULL,
         reanalysis_max_air_temp_k = NULL,
         reanalysis_min_air_temp_k = NULL,
         reanalysis_precip_amt_kg_per_m2 = NULL,
         reanalysis_relative_humidity_percent = NULL,
         reanalysis_sat_precip_amt_mm = NULL,
         reanalysis_tdtr_k = NULL,
         station_diur_temp_rng_c = NULL,
         station_max_temp_c = NULL,                   
         station_min_temp_c = NULL,
         station_avg_temp_c = NULL,
        reanalysis_specific_humidity_g_per_kg   = NULL,
         station_precip_mm = NULL) 

## Lag SJ Precipitation
sj_test$avg_precipitation <- dplyr::lag(sj_test$avg_precipitation, 1)

sj_test[1,8] = sj_train[936,27]

#sj_test <- sj_test[complete.cases(sj_test), ]

## Apply parameters and predict
#sj_test <- predict(sj_preprocparams, sj_test)

sj_test$total_cases <- predict(rf_reg , sj_test)

ggplot(sj_test, aes(as.Date(week_start_date), total_cases, group=1)) + geom_line()

sj_tr$year <- as.numeric(sj_tr$year)
sj_tr$weekofyear <- as.numeric(sj_tr$weekofyear)

ggplot(data = union_all(sj_tr, sj_test), aes(x = as.Date(week_start_date), y = total_cases, color = type)) + geom_line()

Modeling Iquito (new features)

iq_tr$weekofyear <- as.character(iq_tr$weekofyear) 
iq_tr$city <- NULL
iq_tr$type <- NULL
iq_tr$year <- as.character(iq_tr$year)



## Get the best mtry train a random forest using that mtry

#iq_preprocparams <- preProcess(x = select(iq_tr[ , which(sapply(iq_tr,is.numeric))], -total_cases),method = c("center", "scale"))

#iq_tr <- predict(iq_preprocparams, iq_tr)

iq_tr$avg_precipitation <- dplyr::lag(iq_tr$avg_precipitation, 1)

iq_tr[1,6] = iq_tr[53,6]

#iq_tr <- iq_tr[complete.cases(iq_tr), ]

rf_reg <- tuneRF(x = iq_tr[ , -which(names(iq_tr) =="total_cases")], iq_tr$total_cases, ntreeTry=100,stepFactor=2,improve=0.05,trace=FALSE, doBest = T)

## -0.163319 0.05 
## -0.009337287 0.05

iq_tr$pred <- predict(rf_reg, iq_tr)

iqtr_pred<- iq_tr

postResample(iq_tr$pred, iq_tr$total_cases)

##      RMSE  Rsquared       MAE 
## 4.2498241 0.9085896 2.1890413

ggplot(data = iq_tr, aes(x = as.Date(week_start_date))) + geom_line(aes(y = total_cases, color = "real")) + geom_line(aes(y = pred, color = "pred" ))

iq_tr$pred <- NULL
iq_tr$type <- "train"

Make final prediction for Iquito (test): Model 2

## Make final prediction for Iquito (test)
iq_te <- all %>% filter(city == "iq" & type == "test")

## The missing values will be replaced by fromLast
iq_te <- cbind(iq_te[ , c("city", "type", "week_start_date")] ,  apply(X = iq_te[, which(sapply(iq_te,is.numeric))], MARGIN = 2, function(x) na.locf(x,fromLast = TRUE)))

Feature Engineer IQUITO Test

iq_te$avg_precipitation<- (iq_te$precipitation_amt_mm+iq_te$station_precip_mm+ iq_te$reanalysis_sat_precip_amt_mm)/3

iq_test <- iq_te %>% #group_by(city,year, week_start_date) %>%  
  mutate(avg_temp_c = station_avg_temp_c,
         humidity_g_per_kg = reanalysis_specific_humidity_g_per_kg,
         ndvi_ne = NULL,
         ndvi_nw = NULL, 
         ndvi_se = NULL,
         reanalysis_avg_temp_k = NULL, 
         reanalysis_air_temp_k = NULL, 
         station_precip_mm = NULL, 
         precipitation_amt_mm = NULL,
         reanalysis_air_temp_k = NULL,
         reanalysis_avg_temp_k = NULL,
         reanalysis_dew_point_temp_k = NULL,
         reanalysis_max_air_temp_k = NULL,
         reanalysis_min_air_temp_k = NULL,
         reanalysis_precip_amt_kg_per_m2 = NULL,
         reanalysis_relative_humidity_percent = NULL,
         reanalysis_sat_precip_amt_mm = NULL,
         reanalysis_tdtr_k = NULL,
         station_diur_temp_rng_c = NULL,
         station_max_temp_c = NULL,                   
         station_min_temp_c = NULL,
         station_avg_temp_c = NULL,
        reanalysis_specific_humidity_g_per_kg   = NULL,
         station_precip_mm = NULL) 


## Lag IQ Precipitation
iq_test$avg_precipitation <- dplyr::lag(iq_test$avg_precipitation, 1)

iq_test[1,8] = iq_train[520,27]

#iq_test <- iq_test[complete.cases(iq_test), ]


## Apply parameters and predict
#iq_test <- predict(iq_preprocparams, iq_test)

iq_test$total_cases <- predict(rf_reg, iq_test)

ggplot(iq_test, aes(as.Date(week_start_date), total_cases, group=1)) + geom_line()

iq_tr$year <- as.numeric(iq_tr$year)
iq_tr$weekofyear <- as.numeric(iq_tr$weekofyear)

ggplot(data = union_all(iq_tr, iq_test), aes(x = as.Date(week_start_date), y = total_cases, color = type)) + geom_line()

#iq_test$weekofyear <- as.character(iq_test$weekofyear)

#iq_test$weekofyear <- as.numeric(iq_test$weekofyear)

Model 2: Second Submission with Random Forest:

iq_test$city <- "iq" 

iq_test$type <- "test"

rf_final_df2 <- rbind(sj_test[ , c("city", "type", "total_cases","weekofyear", "week_start_date", "year")], 
                     iq_test[ , c("city", "type", "total_cases","weekofyear", "week_start_date", "year")])

submission_rf2 <- full_join(x = submission_format, y = rf_final_df2, by = c("year","weekofyear", "city"))

data.table::setnames(submission_rf2, old = "total_cases.y", "total_cases")

submission_rf2$total_cases <- base::round(submission_rf2$total_cases,0) # Round

ggplot(submission_rf2, aes(as.Date(week_start_date), total_cases, group=city)) + geom_line() + facet_wrap(city~.)

submission_rf2 <- submission_rf2 %>% select(city, year, weekofyear, total_cases) 

write.csv(submission_rf2, "submission_rf2.csv", quote = F, row.names = F)

SUBMISSION SCORE #2: 27.1034

MODEL 3: Auto.ARIMA with Exogenous Variables

sjtot=ts(sj_tr$total_cases, frequency=52, start=c(1990,04,30)) 

sjtotplot<- window(sjtot, start= 2000, end = 2008)

ggseasonplot(sjtotplot, polar=TRUE) + ylab("Cases") +
  ggtitle("Polar seasonal plot: San Juan Dengue Fever Cases") + theme_bw()

iqtot=ts(iq_tr$total_cases, frequency=52, start=c(2000,07,01)) 

iqtotplot<- window(iqtot, start= 2000, end = 2008)

ggseasonplot(iqtotplot, polar=TRUE) + ylab("Cases") +
  ggtitle("Polar seasonal plot: Iquito Dengue Fever Cases") + theme_bw()

autoplot(sjtot) + ylab("Weeks") +
  xlab("Cases") + ggtitle("San Juan: Total Cases over Training Period")

autoplot(iqtot) + ylab("Weeks") +
  xlab("Cases") + ggtitle("Iquito: Total Cases over Training Period")

acf(sjtot)

pacf(sjtot)

tsdisplay(sjtot)

autoplot(diff(sjtot,1))

acf(iqtot)

pacf(iqtot)

tsdisplay(iqtot)

autoplot(diff(iqtot,1))

exvar_sj<- as.matrix(sj_tr[,c("weekofyear","ndvi_sw","avg_precipitation" , "avg_temp_c"  ,"humidity_g_per_kg")])

exvar_sjtest<- as.matrix(sj_test[,c("weekofyear","ndvi_sw","avg_precipitation" , "avg_temp_c"  ,"humidity_g_per_kg")])

(f1=auto.arima(sjtot, xreg =exvar_sj))

## Series: sjtot 
## Regression with ARIMA(1,1,1) errors 
## 
## Coefficients:
##          ar1      ma1  weekofyear  ndvi_sw  avg_precipitation  avg_temp_c
##       0.7071  -0.5895     -0.0007  -0.2307             0.4248      1.2978
## s.e.  0.0965   0.1093      0.0608   0.3237             0.3701      1.0526
##       humidity_g_per_kg
##                  1.0069
## s.e.             1.0387
## 
## sigma^2 estimated as 180.5:  log likelihood=-3752.11
## AIC=7520.21   AICc=7520.37   BIC=7558.94

cbind("Regression Errors" = residuals(f1, type="regression"),
      "ARIMA errors" = residuals(f1, type="innovation")) %>%
  autoplot(facets=TRUE)

checkresiduals(f1)

## 
##  Ljung-Box test
## 
## data:  Residuals from Regression with ARIMA(1,1,1) errors
## Q* = 90.045, df = 97, p-value = 0.6785
## 
## Model df: 7.   Total lags used: 104

fc_sj=forecast(f1, h = 260, xreg = exvar_sjtest)

autoplot(fc_sj)

exvar_iq<- as.matrix(iq_tr[,c("weekofyear","ndvi_sw","avg_precipitation" , "avg_temp_c"  ,"humidity_g_per_kg")])

exvar_iqtest<- as.matrix(iq_test[,c("weekofyear","ndvi_sw","avg_precipitation", "avg_temp_c"  ,"humidity_g_per_kg")])

(f2=auto.arima(iqtot,xreg = exvar_iq))

## Series: iqtot 
## Regression with ARIMA(3,0,0) errors 
## 
## Coefficients:
##          ar1      ar2     ar3  intercept  weekofyear  ndvi_sw
##       0.6944  -0.0720  0.1961     9.4552     -0.0766   0.6362
## s.e.  0.0432   0.0535  0.0434     1.9829      0.0415   0.2704
##       avg_precipitation  avg_temp_c  humidity_g_per_kg
##                 -0.2527      0.3630            -0.4703
## s.e.             0.3343      0.3264             0.4322
## 
## sigma^2 estimated as 48.59:  log likelihood=-1743.46
## AIC=3506.92   AICc=3507.36   BIC=3549.46

cbind("Regression Errors" = residuals(f2, type="regression"),
      "ARIMA errors" = residuals(f2, type="innovation")) %>%
  autoplot(facets=TRUE)

checkresiduals(f2)

## 
##  Ljung-Box test
## 
## data:  Residuals from Regression with ARIMA(3,0,0) errors
## Q* = 81.259, df = 95, p-value = 0.8415
## 
## Model df: 9.   Total lags used: 104

fc_iq=forecast(f2, h = 156, xreg = exvar_iqtest)

autoplot(fc_iq)

Arima

sj_te <- all %>% filter(city == “sj” & type == “test”)

##The missing values will be replaced by a forecast sj_te <- cbind(sj_te[ , c(“city”, “type”, “week_start_date”)] , apply(X = sj_te[, which(sapply(sj_te,is.numeric))], MARGIN = 2, function(x) na.locf(x,fromLast = TRUE)))

Feature Engineer San Juan

sj_te <- sj_te %>% group_by(city,year, week_start_date) %>%
mutate(avg_precipitation = mean(station_precip_mm, precipitation_amt_mm,reanalysis_sat_precip_amt_mm, na.rm = T, trim = 0), avg_ndvi = mean(ndvi_ne,ndvi_nw, ndvi_se, ndvi_sw, na.rm = T, trim = 0), avg_temp_c = station_avg_temp_c, humidity_g_per_kg = reanalysis_specific_humidity_g_per_kg, ndvi_ne = NULL, ndvi_nw = NULL, ndvi_se = NULL, ndvi_sw = NULL,
reanalysis_avg_temp_k = NULL, reanalysis_air_temp_k = NULL, station_precip_mm = NULL, precipitation_amt_mm = NULL, reanalysis_air_temp_k = NULL, reanalysis_avg_temp_k = NULL, reanalysis_dew_point_temp_k = NULL, reanalysis_max_air_temp_k = NULL, reanalysis_min_air_temp_k = NULL, reanalysis_precip_amt_kg_per_m2 = NULL, reanalysis_relative_humidity_percent = NULL, reanalysis_sat_precip_amt_mm = NULL, reanalysis_tdtr_k = NULL, station_diur_temp_rng_c = NULL, station_max_temp_c = NULL,
station_min_temp_c = NULL, station_avg_temp_c = NULL, reanalysis_specific_humidity_g_per_kg = NULL, station_precip_mm = NULL)

(fit_sj <- Arima(sjtot, order=c(2,0,1), seasonal=c(0,0,0), xreg =exvar_sj ))

## Series: sjtot 
## Regression with ARIMA(2,0,1) errors 
## 
## Coefficients:
##          ar1      ar2      ma1  intercept  weekofyear  ndvi_sw
##       1.8474  -0.8621  -0.7800    34.3293     -0.0165  -0.2214
## s.e.  0.0387   0.0368   0.0504     6.5680      0.0608   0.3240
##       avg_precipitation  avg_temp_c  humidity_g_per_kg
##                  0.4180      1.3613             0.8754
## s.e.             0.3702      1.0528             1.0397
## 
## sigma^2 estimated as 172.7:  log likelihood=-3735.91
## AIC=7491.82   AICc=7492.06   BIC=7540.24

fcast_sj=forecast(fit_sj, h = 260, xreg = exvar_sjtest)

autoplot(fcast_sj) + xlab("Year") +
  ylab("Cases")

tsdisplay(residuals(fit_sj))

Box.test(residuals(fcast_sj),fitdf=7, lag=151, type="Ljung")

## 
##  Box-Ljung test
## 
## data:  residuals(fcast_sj)
## X-squared = 107.77, df = 144, p-value = 0.9894

(fit_sj2 <- Arima(sjtot, order=c(1,2,1), seasonal=c(0,0,0), xreg =exvar_sj ))

## Series: sjtot 
## Regression with ARIMA(1,2,1) errors 
## 
## Coefficients:
##          ar1      ma1  weekofyear  ndvi_sw  avg_precipitation  avg_temp_c
##       0.1359  -1.0000      0.0283  -0.2766             0.3883      1.1983
## s.e.  0.0326   0.0029      0.0610   0.3220             0.3723      1.0615
##       humidity_g_per_kg
##                  1.0907
## s.e.             1.0465
## 
## sigma^2 estimated as 182.3:  log likelihood=-3756.15
## AIC=7528.3   AICc=7528.46   BIC=7567.02

fcast_sj2=forecast(fit_sj2, h = 260, xreg = exvar_sjtest)

autoplot(fcast_sj2) + xlab("Year") +
  ylab("Cases")

tsdisplay(residuals(fit_sj2))

Box.test(residuals(fcast_sj2),fitdf=7, lag=151, type="Ljung")

## 
##  Box-Ljung test
## 
## data:  residuals(fcast_sj2)
## X-squared = 149.65, df = 144, p-value = 0.3565

iq_te <- all %>% filter(city == “iq” & type == “test”)

##The missing values will be replaced by a forecast iq_te <- cbind(iq_te[ , c(“city”, “type”, “week_start_date”)] , apply(X = iq_te[, which(sapply(iq_te,is.numeric))], MARGIN = 2, function(x) na.locf(x,fromLast = TRUE)))

Feature Engineer San Juan

iq_te <- iq_te %>% group_by(city,year, week_start_date) %>%
mutate(avg_precipitation = mean(station_precip_mm, precipitation_amt_mm,reanalysis_sat_precip_amt_mm, na.rm = T, trim = 0), avg_ndvi = mean(ndvi_ne,ndvi_nw, ndvi_se, ndvi_sw, na.rm = T, trim = 0), avg_temp_c = station_avg_temp_c, humidity_g_per_kg = reanalysis_specific_humidity_g_per_kg, ndvi_ne = NULL, ndvi_nw = NULL, ndvi_se = NULL, ndvi_sw = NULL,
reanalysis_avg_temp_k = NULL, reanalysis_air_temp_k = NULL, station_precip_mm = NULL, precipitation_amt_mm = NULL, reanalysis_air_temp_k = NULL, reanalysis_avg_temp_k = NULL, reanalysis_dew_point_temp_k = NULL, reanalysis_max_air_temp_k = NULL, reanalysis_min_air_temp_k = NULL, reanalysis_precip_amt_kg_per_m2 = NULL, reanalysis_relative_humidity_percent = NULL, reanalysis_sat_precip_amt_mm = NULL, reanalysis_tdtr_k = NULL, station_diur_temp_rng_c = NULL, station_max_temp_c = NULL,
station_min_temp_c = NULL, station_avg_temp_c = NULL, reanalysis_specific_humidity_g_per_kg = NULL, station_precip_mm = NULL)

(fit_iq <- Arima(iqtot, order=c(1,0,4), seasonal=c(0,0,0), xreg =exvar_iq ))

## Series: iqtot 
## Regression with ARIMA(1,0,4) errors 
## 
## Coefficients:
##          ar1      ma1      ma2     ma3     ma4  intercept  weekofyear  ndvi_sw
##       0.7795  -0.0838  -0.0967  0.0935  0.1344     9.3419     -0.0706   0.6043
## s.e.  0.0610   0.0731   0.0615  0.0532  0.0542     1.8097      0.0422   0.2716
##       avg_precipitation  avg_temp_c  humidity_g_per_kg
##                 -0.3761      0.3001            -0.3601
## s.e.             0.3423      0.3238             0.4441
## 
## sigma^2 estimated as 48.45:  log likelihood=-1741.75
## AIC=3507.51   AICc=3508.12   BIC=3558.55

fcast_iq=forecast(fit_iq, h = 156, xreg = exvar_iqtest)

autoplot(fcast_iq) + xlab("Year") +
  ylab("Cases")

tsdisplay(residuals(fit_iq))

Box.test(residuals(fit_iq),fitdf=7, lag=151, type="Ljung")

## 
##  Box-Ljung test
## 
## data:  residuals(fit_iq)
## X-squared = 99.443, df = 144, p-value = 0.9982

Model 3: Third Submission with ARIMA xreg

xreg_arima_sj_sol <- data.frame(submission_format[1:260,-4], total_cases = round(fcast_sj$mean))

xreg_arima_iq_sol <- data.frame(submission_format[261:416,-4], total_cases =round(fcast_iq$mean))

xreg_arima_solution <- bind_rows(xreg_arima_sj_sol,xreg_arima_iq_sol)

xreg_arima_solution$total_cases[xreg_arima_solution$total_cases < 0 ] <- 0

write.csv(xreg_arima_solution, file = 'xreg_arima_submission.csv', row.names = F)

xreg_arima_iq_sol2 <- data.frame(submission_format[261:416,-4], total_cases =round(fc_iq$mean))

xreg_arima_solution2 <- bind_rows(xreg_arima_sj_sol,xreg_arima_iq_sol2)

xreg_arima_solution2$total_cases[xreg_arima_solution2$total_cases < 0 ] <- 0

write.csv(xreg_arima_solution2, file = 'xreg_arima_submission2.csv', row.names = F)

Model 3: SUBMISSION SCORE #3: 41.8606

MODEL 4: Seasonal ARIMA

ARIMA: San Juan

acf(sjtot)

pacf(sjtot)

tsdisplay(sjtot)

autoplot(diff(sjtot,1))

sj_ar <- Arima(sjtot, order=c(2,0,1), seasonal=c(0,1,0)) 

sj_fcast<- forecast(sj_ar, h = 260)

autoplot(sj_fcast)

tsdisplay(residuals(sj_ar))

Box.test(residuals(sj_fcast),fitdf=2, lag=151, type="Ljung")

## 
##  Box-Ljung test
## 
## data:  residuals(sj_fcast)
## X-squared = 437.35, df = 149, p-value < 2.2e-16

ARIMA: Iquito

iq_ar <- Arima(iqtot, order=c(1,0,4), seasonal=c(0,1,0)) 

iq_fcast<- forecast(iq_ar, h = 156)

autoplot(iq_fcast)

tsdisplay(residuals(iq_ar))

Box.test(residuals(iq_fcast),fitdf=3, lag=151, type="Ljung")

## 
##  Box-Ljung test
## 
## data:  residuals(iq_fcast)
## X-squared = 308.4, df = 148, p-value = 2.43e-13

Model 4: Fourth Submission with Seasonal ARIMA:

arima_sj_sol <- data.frame(submission_format[1:260,-4], total_cases = round(sj_fcast$mean))

arima_iq_sol <- data.frame(submission_format[261:416,-4], total_cases =round(iq_fcast$mean))

arima_solution <- bind_rows(arima_sj_sol,arima_iq_sol)

write.csv(arima_solution, file = 'arima_predicted_Solution.csv', row.names = F)

Your score for this submission is: 29.8413

MODEL 5: ETS with Decomposition

sjtot=ts(sj_train$total_cases, start=c(1990,4,30), frequency=52) 

stl_decomposition = stl(sjtot, t.window=13, s.window="periodic",robust=TRUE)

autoplot(stl_decomposition)

sj_fcast3<- forecast(stl_decomposition,method = "ets",h=260)

accuracy(sj_fcast3)

##                       ME     RMSE      MAE MPE MAPE      MASE       ACF1
## Training set 0.006078719 13.26227 7.841533 NaN  Inf 0.2146436 0.02412127

autoplot(sj_fcast3)

#mstl

mstl_decomposition = mstl(sjtot)

autoplot(mstl_decomposition)

sj_fcast4<- forecast(mstl_decomposition,method = "ets",h=260)

kable(accuracy(sj_fcast4))

	ME	RMSE	MAE	MPE	MAPE	MASE	ACF1
Training set	-0.0080337	12.20282	7.629666	-Inf	Inf	0.2088442	0.042466

autoplot(sj_fcast4) + ylab("Total Cases") + ggtitle("San Juan: MSTL + ETS")

iqtot=ts(iq_train$total_cases, start=c(2000,07,01), frequency=52) 

iqstl_decomp = stl(iqtot, t.window=13, s.window="periodic",robust=TRUE)

autoplot(iqstl_decomp)

iq_fcast3<- forecast(iqstl_decomp,method = "ets",h=156)

kable(accuracy(iq_fcast3))

	ME	RMSE	MAE	MPE	MAPE	MASE	ACF1
Training set	0.009541	7.161697	3.675609	NaN	Inf	0.3893583	0.0698369

autoplot(iq_fcast3)

#mstl

iqmstl_decomp = mstl(iqtot)

autoplot(iqmstl_decomp)

iq_fcast4<- forecast(iqmstl_decomp,method = "ets",h=156)

accuracy(iq_fcast4)

##                      ME    RMSE      MAE MPE MAPE      MASE       ACF1
## Training set 0.01981199 6.48256 3.828215 NaN  Inf 0.4055239 0.06328665

autoplot(iq_fcast4)


ets_sj_sol <- data.frame(submission_format[1:260,-4], total_cases = round(sj_fcast4$mean))

ets_iq_sol <- data.frame(submission_format[261:416,-4], total_cases =round(iq_fcast4$mean))

ets_solution <- bind_rows(ets_sj_sol,ets_iq_sol)

write.csv(ets_solution, file = 'ets_predicted_Solution.csv', row.names = F)

MODEL 5 SUBMISSION SCORE: 26.5192

MODEL 6: NEURAL NET

(fit_nnetar_sj <- nnetar(sjtot))

## Series: sjtot 
## Model:  NNAR(14,1,8)[52] 
## Call:   nnetar(y = sjtot)
## 
## Average of 20 networks, each of which is
## a 15-8-1 network with 137 weights
## options were - linear output units 
## 
## sigma^2 estimated as 51.01

#repeats=25, size=12, decay=0.1,linout=TRUE
plot(forecast(fit_nnetar_sj,h=260))

summary(fit_nnetar_sj)

##           Length Class        Mode     
## x         936    ts           numeric  
## m           1    -none-       numeric  
## p           1    -none-       numeric  
## P           1    -none-       numeric  
## scalex      2    -none-       list     
## size        1    -none-       numeric  
## subset    936    -none-       numeric  
## model      20    nnetarmodels list     
## nnetargs    0    -none-       list     
## fitted    936    ts           numeric  
## residuals 936    ts           numeric  
## lags       15    -none-       numeric  
## series      1    -none-       character
## method      1    -none-       character
## call        2    -none-       call

checkresiduals(fit_nnetar_sj)

a=forecast(fit_nnetar_sj,h=260)

(fit_nnetar_iq <- nnetar(iqtot))

## Series: iqtot 
## Model:  NNAR(5,1,4)[52] 
## Call:   nnetar(y = iqtot)
## 
## Average of 20 networks, each of which is
## a 6-4-1 network with 33 weights
## options were - linear output units 
## 
## sigma^2 estimated as 23.08

plot(forecast(fit_nnetar_iq,h=156))

checkresiduals(fit_nnetar_iq)

b=forecast(fit_nnetar_iq,h=156)

nnetar_sj_sol <- data.frame(submission_format[1:260,-4], total_cases = round(a$mean))

nnetar_iq_sol <- data.frame(submission_format[261:416,-4], total_cases =round(b$mean))

nnetar_solution <- bind_rows(nnetar_sj_sol,nnetar_iq_sol)

write.csv(nnetar_solution, file = 'nnetar_predicted_Solution.csv', row.names = F)

MODEL 5 SUBMISSION SCORE: 37.4351

ZEngineer ADEC 7460 - Final

Zille Engineer

8/2/2020

Checking distributions

Create Train and Test dataset.

Combine Train+ Test for EDA and Viz

Summary of data

Split data by city and treat missing values

SAN JUAN

Impute missing values Using LOCF from ‘zoo’

IQUITO

Impute missing values Using LOCF from ‘zoo’

Model 1: Random Forest San Juan (all features)

Model 1: Make final prediction for San Juan (test)

Model 1: Iquito (all features)

Model 1: Make final prediction for Iquito (test)

Model 1: First Submission with Random Forest:

Correlation Matrix

Corr SJ

Corr IQ

Feature Engineering

MODEL 2: Random Forest San Juan (new features)

Model 2: Make final prediction for San Juan (test)

Feature Engineer San Juan

Modeling Iquito (new features)

Make final prediction for Iquito (test): Model 2

Feature Engineer IQUITO Test

Model 2: Second Submission with Random Forest:

MODEL 3: Auto.ARIMA with Exogenous Variables

Arima

Feature Engineer San Juan

Feature Engineer San Juan

Model 3: Third Submission with ARIMA xreg

MODEL 4: Seasonal ARIMA

ARIMA: San Juan

ARIMA: Iquito

Model 4: Fourth Submission with Seasonal ARIMA:

MODEL 5: ETS with Decomposition

MODEL 6: NEURAL NET