###########################################################################################################
rm(list = ls())

Load useful libraries

library(plyr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(lubridate)
library(zoo)
library(data.table)
library(forecast)
library(fpp2)
library(rugarch)
library(fGarch)
library(ggcorrplot)
library(corrplot)
library(stats)

Import Data

train.y <- read.csv("/Users/yannos/Documents/PredictiveAnalytics/Final/dengue_labels_train.csv")
train.x <- read.csv("/Users/yannos/Documents/PredictiveAnalytics/Final/dengue_features_train.csv")
test.x <- read.csv("/Users/yannos/Documents/PredictiveAnalytics/Final/dengue_features_test.csv")
test.y <- read.csv("/Users/yannos/Documents/PredictiveAnalytics/Final/submission_format.csv")
submission <- read.csv("/Users/yannos/Documents/PredictiveAnalytics/Final/submission_format.csv")

Data Preparation - Part 1

Split by City

train <- train.y %>% left_join(train.x, by = c("city","year","weekofyear"))
test <- test.y %>% left_join(test.x, by = c("city","year","weekofyear"))

train.sj <- dplyr::filter(train, city == "sj") 
train.iq <- dplyr::filter(train, city == "iq") 

test.sj <- dplyr::filter(test, city == "sj") 
test.iq <- dplyr::filter(test, city == "iq") 

train.sj.y <- dplyr::filter(train.y, city == "sj") 
train.iq.y <- dplyr::filter(train.y, city == "iq") 

cases.sj <- train.sj.y %>% dplyr::select("total_cases")
cases.iq <- train.iq.y %>% dplyr::select("total_cases")

Data Exploration

dim(train.y)
## [1] 1456    4
any(is.na(train.y))
## [1] FALSE
train.y %>% summary()
##  city          year        weekofyear     total_cases    
##  iq:520   Min.   :1990   Min.   : 1.00   Min.   :  0.00  
##  sj:936   1st Qu.:1997   1st Qu.:13.75   1st Qu.:  5.00  
##           Median :2002   Median :26.50   Median : 12.00  
##           Mean   :2001   Mean   :26.50   Mean   : 24.68  
##           3rd Qu.:2005   3rd Qu.:39.25   3rd Qu.: 28.00  
##           Max.   :2010   Max.   :53.00   Max.   :461.00
dim(train.x)
## [1] 1456   24
any(is.na(train.x))
## [1] TRUE
train.x %>% summary()
##  city          year        weekofyear      week_start_date    ndvi_ne        
##  iq:520   Min.   :1990   Min.   : 1.00   2000-07-01:   2   Min.   :-0.40625  
##  sj:936   1st Qu.:1997   1st Qu.:13.75   2000-07-08:   2   1st Qu.: 0.04495  
##           Median :2002   Median :26.50   2000-07-15:   2   Median : 0.12882  
##           Mean   :2001   Mean   :26.50   2000-07-22:   2   Mean   : 0.14229  
##           3rd Qu.:2005   3rd Qu.:39.25   2000-07-29:   2   3rd Qu.: 0.24848  
##           Max.   :2010   Max.   :53.00   2000-08-05:   2   Max.   : 0.50836  
##                                          (Other)   :1444   NA's   :194       
##     ndvi_nw            ndvi_se            ndvi_sw         precipitation_amt_mm
##  Min.   :-0.45610   Min.   :-0.01553   Min.   :-0.06346   Min.   :  0.00      
##  1st Qu.: 0.04922   1st Qu.: 0.15509   1st Qu.: 0.14421   1st Qu.:  9.80      
##  Median : 0.12143   Median : 0.19605   Median : 0.18945   Median : 38.34      
##  Mean   : 0.13055   Mean   : 0.20378   Mean   : 0.20231   Mean   : 45.76      
##  3rd Qu.: 0.21660   3rd Qu.: 0.24885   3rd Qu.: 0.24698   3rd Qu.: 70.23      
##  Max.   : 0.45443   Max.   : 0.53831   Max.   : 0.54602   Max.   :390.60      
##  NA's   :52         NA's   :22         NA's   :22         NA's   :13          
##  reanalysis_air_temp_k reanalysis_avg_temp_k reanalysis_dew_point_temp_k
##  Min.   :294.6         Min.   :294.9         Min.   :289.6              
##  1st Qu.:297.7         1st Qu.:298.3         1st Qu.:294.1              
##  Median :298.6         Median :299.3         Median :295.6              
##  Mean   :298.7         Mean   :299.2         Mean   :295.2              
##  3rd Qu.:299.8         3rd Qu.:300.2         3rd Qu.:296.5              
##  Max.   :302.2         Max.   :302.9         Max.   :298.4              
##  NA's   :10            NA's   :10            NA's   :10                 
##  reanalysis_max_air_temp_k reanalysis_min_air_temp_k
##  Min.   :297.8             Min.   :286.9            
##  1st Qu.:301.0             1st Qu.:293.9            
##  Median :302.4             Median :296.2            
##  Mean   :303.4             Mean   :295.7            
##  3rd Qu.:305.5             3rd Qu.:297.9            
##  Max.   :314.0             Max.   :299.9            
##  NA's   :10                NA's   :10               
##  reanalysis_precip_amt_kg_per_m2 reanalysis_relative_humidity_percent
##  Min.   :  0.00                  Min.   :57.79                       
##  1st Qu.: 13.05                  1st Qu.:77.18                       
##  Median : 27.25                  Median :80.30                       
##  Mean   : 40.15                  Mean   :82.16                       
##  3rd Qu.: 52.20                  3rd Qu.:86.36                       
##  Max.   :570.50                  Max.   :98.61                       
##  NA's   :10                      NA's   :10                          
##  reanalysis_sat_precip_amt_mm reanalysis_specific_humidity_g_per_kg
##  Min.   :  0.00               Min.   :11.72                        
##  1st Qu.:  9.80               1st Qu.:15.56                        
##  Median : 38.34               Median :17.09                        
##  Mean   : 45.76               Mean   :16.75                        
##  3rd Qu.: 70.23               3rd Qu.:17.98                        
##  Max.   :390.60               Max.   :20.46                        
##  NA's   :13                   NA's   :10                           
##  reanalysis_tdtr_k station_avg_temp_c station_diur_temp_rng_c
##  Min.   : 1.357    Min.   :21.40      Min.   : 4.529         
##  1st Qu.: 2.329    1st Qu.:26.30      1st Qu.: 6.514         
##  Median : 2.857    Median :27.41      Median : 7.300         
##  Mean   : 4.904    Mean   :27.19      Mean   : 8.059         
##  3rd Qu.: 7.625    3rd Qu.:28.16      3rd Qu.: 9.567         
##  Max.   :16.029    Max.   :30.80      Max.   :15.800         
##  NA's   :10        NA's   :43         NA's   :43             
##  station_max_temp_c station_min_temp_c station_precip_mm
##  Min.   :26.70      Min.   :14.7       Min.   :  0.00   
##  1st Qu.:31.10      1st Qu.:21.1       1st Qu.:  8.70   
##  Median :32.80      Median :22.2       Median : 23.85   
##  Mean   :32.45      Mean   :22.1       Mean   : 39.33   
##  3rd Qu.:33.90      3rd Qu.:23.3       3rd Qu.: 53.90   
##  Max.   :42.20      Max.   :25.6       Max.   :543.30   
##  NA's   :20         NA's   :14         NA's   :22
dim(test.x)
## [1] 416  24
any(is.na(test.x))
## [1] TRUE
test.x %>% summary()
##  city          year        weekofyear      week_start_date    ndvi_ne       
##  iq:156   Min.   :2008   Min.   : 1.00   2010-07-02:  2    Min.   :-0.4634  
##  sj:260   1st Qu.:2010   1st Qu.:13.75   2010-07-09:  2    1st Qu.:-0.0015  
##           Median :2011   Median :26.00   2010-07-16:  2    Median : 0.1101  
##           Mean   :2011   Mean   :26.44   2010-07-23:  2    Mean   : 0.1260  
##           3rd Qu.:2012   3rd Qu.:39.00   2010-07-30:  2    3rd Qu.: 0.2633  
##           Max.   :2013   Max.   :53.00   2010-08-06:  2    Max.   : 0.5004  
##                                          (Other)   :404    NA's   :43       
##     ndvi_nw            ndvi_se          ndvi_sw         precipitation_amt_mm
##  Min.   :-0.21180   Min.   :0.0062   Min.   :-0.01467   Min.   :  0.000     
##  1st Qu.: 0.01597   1st Qu.:0.1487   1st Qu.: 0.13408   1st Qu.:  8.175     
##  Median : 0.08870   Median :0.2042   Median : 0.18647   Median : 31.455     
##  Mean   : 0.12680   Mean   :0.2077   Mean   : 0.20172   Mean   : 38.354     
##  3rd Qu.: 0.24240   3rd Qu.:0.2549   3rd Qu.: 0.25324   3rd Qu.: 57.773     
##  Max.   : 0.64900   Max.   :0.4530   Max.   : 0.52904   Max.   :169.340     
##  NA's   :11         NA's   :1        NA's   :1          NA's   :2           
##  reanalysis_air_temp_k reanalysis_avg_temp_k reanalysis_dew_point_temp_k
##  Min.   :294.6         Min.   :295.2         Min.   :290.8              
##  1st Qu.:297.8         1st Qu.:298.3         1st Qu.:294.3              
##  Median :298.5         Median :299.3         Median :295.8              
##  Mean   :298.8         Mean   :299.4         Mean   :295.4              
##  3rd Qu.:300.2         3rd Qu.:300.5         3rd Qu.:296.6              
##  Max.   :301.9         Max.   :303.3         Max.   :297.8              
##  NA's   :2             NA's   :2             NA's   :2                  
##  reanalysis_max_air_temp_k reanalysis_min_air_temp_k
##  Min.   :298.2             Min.   :286.2            
##  1st Qu.:301.4             1st Qu.:293.5            
##  Median :302.8             Median :296.3            
##  Mean   :303.6             Mean   :295.7            
##  3rd Qu.:305.8             3rd Qu.:298.3            
##  Max.   :314.1             Max.   :299.7            
##  NA's   :2                 NA's   :2                
##  reanalysis_precip_amt_kg_per_m2 reanalysis_relative_humidity_percent
##  Min.   :  0.00                  Min.   :64.92                       
##  1st Qu.:  9.43                  1st Qu.:77.40                       
##  Median : 25.85                  Median :80.33                       
##  Mean   : 42.17                  Mean   :82.50                       
##  3rd Qu.: 56.48                  3rd Qu.:88.33                       
##  Max.   :301.40                  Max.   :97.98                       
##  NA's   :2                       NA's   :2                           
##  reanalysis_sat_precip_amt_mm reanalysis_specific_humidity_g_per_kg
##  Min.   :  0.000              Min.   :12.54                        
##  1st Qu.:  8.175              1st Qu.:15.79                        
##  Median : 31.455              Median :17.34                        
##  Mean   : 38.354              Mean   :16.93                        
##  3rd Qu.: 57.773              3rd Qu.:18.17                        
##  Max.   :169.340              Max.   :19.60                        
##  NA's   :2                    NA's   :2                            
##  reanalysis_tdtr_k station_avg_temp_c station_diur_temp_rng_c
##  Min.   : 1.486    Min.   :24.16      Min.   : 4.043         
##  1st Qu.: 2.446    1st Qu.:26.51      1st Qu.: 5.929         
##  Median : 2.914    Median :27.48      Median : 6.643         
##  Mean   : 5.125    Mean   :27.37      Mean   : 7.811         
##  3rd Qu.: 8.171    3rd Qu.:28.32      3rd Qu.: 9.812         
##  Max.   :14.486    Max.   :30.27      Max.   :14.725         
##  NA's   :2         NA's   :12         NA's   :12             
##  station_max_temp_c station_min_temp_c station_precip_mm
##  Min.   :27.20      Min.   :14.20      Min.   :  0.00   
##  1st Qu.:31.10      1st Qu.:21.20      1st Qu.:  9.10   
##  Median :32.80      Median :22.20      Median : 23.60   
##  Mean   :32.53      Mean   :22.37      Mean   : 34.28   
##  3rd Qu.:33.90      3rd Qu.:23.30      3rd Qu.: 47.75   
##  Max.   :38.40      Max.   :26.70      Max.   :212.00   
##  NA's   :3          NA's   :9          NA's   :5

From above we can observe the following:

  1. Our train data has has no missing values as far as our taget variable - total cases - is concerned

  2. Both our train and test data have some missing values for some of the environment related features given

# Look at min, max in train data for each city
train.sj.y %>% head()
##   city year weekofyear total_cases
## 1   sj 1990         18           4
## 2   sj 1990         19           5
## 3   sj 1990         20           4
## 4   sj 1990         21           3
## 5   sj 1990         22           6
## 6   sj 1990         23           2
train.sj.y %>% tail()
##     city year weekofyear total_cases
## 931   sj 2008         12           3
## 932   sj 2008         13           4
## 933   sj 2008         14           3
## 934   sj 2008         15           1
## 935   sj 2008         16           3
## 936   sj 2008         17           5
train.iq.y %>% head()
##   city year weekofyear total_cases
## 1   iq 2000         26           0
## 2   iq 2000         27           0
## 3   iq 2000         28           0
## 4   iq 2000         29           0
## 5   iq 2000         30           0
## 6   iq 2000         31           0
train.iq.y %>% tail()
##     city year weekofyear total_cases
## 515   iq 2010         20           6
## 516   iq 2010         21           5
## 517   iq 2010         22           8
## 518   iq 2010         23           1
## 519   iq 2010         24           1
## 520   iq 2010         25           4

Data Preparation - Part 2

As we saw above many of the environment related variables have some missing values. We will replace those with the previous observation (assume variables approximately stay the same if missing). Then redefine dfs with filled in data.

# replace missing with previous observations
train.x <- fill(train.x, ndvi_ne:station_precip_mm)
test.x <- fill(test.x, ndvi_ne:station_precip_mm)

# redefine working dataframes with filled values
train <- train.y %>% left_join(train.x, by = c("city","year","weekofyear"))
test <- test.y %>% left_join(test.x, by = c("city","year","weekofyear"))

train.sj <- dplyr::filter(train, city == "sj") 
train.iq <- dplyr::filter(train, city == "iq") 

test.sj <- dplyr::filter(test, city == "sj") 
test.iq <- dplyr::filter(test, city == "iq") 

train.sj.y <- dplyr::filter(train.y, city == "sj") 
train.iq.y <- dplyr::filter(train.y, city == "iq") 

cases.sj <- train.sj.y %>% dplyr::select("total_cases")
cases.iq <- train.iq.y %>% dplyr::select("total_cases")

Data Visualization

We can now create time series object to possibly use for investigation and modeling and plot our series:

cases.sj.ts <- ts(cases.sj, frequency = 52, start=1)
cases.sj.ts %>% autoplot(alpha=0.6, col="red")  +
  ggtitle("Total Dengue Fever Cases in San Juan") +
  ylab("Cases") +
  xlab("Week")

cases.iq.ts <- ts(cases.iq, frequency = 52, start=1)
cases.iq.ts %>% autoplot(alpha=0.6, col="red")  +
  ggtitle("Total Dengue Fever Cases in Iquitos") +
  ylab("Cases") +
  xlab("Week")

From above we observe that there is no clear seasonality and trend and that data is characterized in both cases by some very low values and sudden peaks that vary in severity / size.

Plot Overall Distribution

train %>%
  ggplot(aes(x=total_cases, fill=city)) +
  geom_density(alpha=.3) +
  scale_x_continuous(limits = c(0, 150)) +
  ggtitle("Total Case Distributions by City") +
  ylab("Density") +
  xlab("Cases") +
  labs(fill = "City")
## Warning: Removed 25 rows containing non-finite values (stat_density).

Above we can observe that in both cities total cases distributions are right skewed with long tails of high values and the majority of cases falling within the very low quantiles and with many zeros. Indication for count data, possibly negative binomial distrubuted.

Plot Cases Distributions By Year

train.sj %>% 
  ggplot(aes(x=year,y=total_cases, group=year, fill=year, alpha=0.8)) +
  geom_boxplot() +
  geom_jitter(width=0.05,alpha=0.1) +
  theme(legend.position = "none")  +
  ggtitle("Total Case Distributions by Year in San Juan") +
  ylab("Cases") +
  xlab("Year")

train.iq %>% 
  ggplot(aes(x=year,y=total_cases, group=year, fill=year, alpha=0.8)) +
  geom_boxplot() +
  geom_jitter(width=0.05,alpha=0.1) +
  theme(legend.position = "none")  +
  ggtitle("Total Case Distributions by Year in Iquitos") +
  ylab("Cases") +
  xlab("Year") +
  scale_x_continuous(breaks=c(2000, 2002, 2004, 2006, 2008, 2010))

We do not observe any particular or consistent yearly variation.

Plot Cases Distribution by Week of Year

train.sj %>% 
  ggplot(aes(x=weekofyear,y=total_cases, group=weekofyear, fill=weekofyear, alpha=0.8)) +
  geom_boxplot() +
  geom_jitter(width=0.05,alpha=0.1) +
  theme(legend.position = "none")  +
  ggtitle("Total Case Distributions by Week in San Juan") +
  ylab("Cases") +
  xlab("Week")

train.iq %>% 
  ggplot(aes(x=weekofyear,y=total_cases, group=weekofyear, fill=weekofyear, alpha=0.8)) +
  geom_boxplot() +
  geom_jitter(width=0.05,alpha=0.1) +
  theme(legend.position = "none")  +
  ggtitle("Total Case Distributions by Week in Iquitos") +
  ylab("Cases") +
  xlab("Week") +
  scale_x_continuous(breaks=c(2000, 2002, 2004, 2006, 2008, 2010))

Similarly, there is no great variation in mean/median weekly cases. However, we do observe slightly differing overall levels of total cases, especially to some extent for San Juan.

Calculate and Plot Correlations

# Calculate Correlations for all variables for each city
corr.sj <- cor(train.sj %>% dplyr::select(-city, -week_start_date, -year), use = "pairwise.complete.obs")
corr.iq <- cor(train.iq %>% dplyr::select(-city, -week_start_date, -year), use = "pairwise.complete.obs")
# Plot Matrices
corrplot(corr.sj, type="lower", tl.col="black", tl.cex = 0.45, tl.srt = 35)

corrplot(corr.iq, type="lower", tl.col="black", tl.cex = 0.45, tl.srt = 35)

We can observe the following:

  1. Surprisingly vegetation indeces do not seem to be correlated with total cases in either city.

  2. On the other hand we observe moderate positive correlations between the specific humidity variable and total cases as well as between dew point temperature and total cases.

Investigate and Plot Total Cases vs Specific Humidity Variable

train.sj %>% ggplot(aes(x=reanalysis_specific_humidity_g_per_kg, y=total_cases,
                        color=city, alpha=0.3)) + geom_point() + 
  geom_smooth(method=lm, col="black", alpha=0.5) +
  theme(legend.position = "none")  +
  ggtitle("Total Cases vs Specific Humidity in San Juan") +
  ylab("Total Cases") +
  xlab("Specific Humidity (g/kg)")
## `geom_smooth()` using formula 'y ~ x'

train.iq %>% ggplot(aes(x=reanalysis_specific_humidity_g_per_kg, y=total_cases,
                        color=city, alpha=0.3)) + geom_point() + 
  geom_smooth(method=lm, col="black", alpha=0.5) +
  theme(legend.position = "none")  +
  ggtitle("Total Cases vs Specific Humidity in Iquitos") +
  ylab("Total Cases") +
  xlab("Specific Humidity (g/kg)")
## `geom_smooth()` using formula 'y ~ x'

As expected there is a positive relationship. However, one could argue that it is not linear as variation in total cases increases dramatically as specific humidity rises to the extent that the overall trend could be exponential for high cases.

Feature Engineering

Adjust dtypes in case categoricals are used somewhere

train.sj <- train.sj %>% mutate(Year=as.factor(year), Weekofyear=as.factor(weekofyear))
train.iq <- train.iq %>% mutate(Year=as.factor(year), Weekofyear=as.factor(weekofyear))

test.sj <- test.sj %>% mutate(Year=as.factor(year), Weekofyear=as.factor(weekofyear))
test.iq <- test.iq %>% mutate(Year=as.factor(year), Weekofyear=as.factor(weekofyear))

Split Training set to Generate a Hold-Out set for Model Evaluation

train2.sj <- train.sj %>% dplyr::filter(year<2006)
test2.sj <- train.sj %>% dplyr::filter(year>=2006)

train2.iq <- train.iq %>% dplyr::filter(year<2009)
test2.iq <- train.iq %>% dplyr::filter(year>=2009)

Models - Part 1 (Simple and Robust)

In this section I am fitting and evaluating the first 2 simple models. After each model its MAE metric is computed on the hold out set and printed for comparison at the end.

Simple Historical Mean by City

# San Juan

# get mean
mean.cases.sj <- mean(train2.sj$total_cases)
# predict
test2.sj <- test2.sj %>% dplyr::mutate(MeanCases = mean.cases.sj)
# evaluate
simple.mean.sj.eval <- mean(abs(test2.sj$total_cases - test2.sj$MeanCases))
simple.mean.sj.eval
## [1] 27.32279
# Iquitos

# get mean
mean.cases.iq <- mean(train2.iq$total_cases)
# predict
test2.iq <- test2.iq %>% dplyr::mutate(MeanCases = mean.cases.iq)
# evaluate
simple.mean.iq.eval <- mean(abs(test2.iq$total_cases - test2.iq$MeanCases))
simple.mean.iq.eval
## [1] 5.404919

Simple Historical Mean by Week of Year and City

# San Juan

# get mean
mean.weekly.cases.sj <- train2.sj %>% group_by(weekofyear) %>% summarise(MeanWeeklyCases=mean(total_cases))
# predict
test2.sj <- test2.sj %>% left_join(mean.weekly.cases.sj, by=c("weekofyear"))
# evaluate
simple.weekly.mean.sj.eval <- mean(abs(test2.sj$total_cases - test2.sj$MeanWeeklyCases))
simple.weekly.mean.sj.eval
## [1] 21.21503
# Iquitos

# get mean
mean.weekly.cases.iq <- train2.iq %>% group_by(weekofyear) %>% summarise(MeanWeeklyCases=mean(total_cases))
# predict
test2.iq <- test2.iq %>% left_join(mean.weekly.cases.iq, by=c("weekofyear"))
# evaluate
simple.weekly.mean.iq.eval <- mean(abs(test2.iq$total_cases - test2.iq$MeanWeeklyCases))
simple.weekly.mean.iq.eval
## [1] 4.702635

Models - Part 2 (Time Series methods only using total cases historical data)

In this section, I am fitting time series models using the forecast package only using the single target variable. Similarly evaluation on the hold out set is computed and printed after each model.

train2.sj.ts <- ts(train2.sj$total_cases, frequency = 52, start=1)
train2.iq.ts <- ts(train2.iq$total_cases, frequency = 52, start=1)

test2.sj.ts <- ts(test2.sj$total_cases, frequency = 52, start=1)
test2.iq.ts <- ts(test2.iq$total_cases, frequency = 52, start=1)

STL Decomposition + ETS

# San Juan

# fit model and plot
fit.sj.ets <- stlf(train2.sj.ts, etsmodel="ANN", h=121)
fit.sj.ets %>% autoplot() +
  ggtitle("Forecasts from STL + ETS(A,N,N) for San Juan") +
  ylab("Cases") +
  xlab("Time")

# predict
test2.sj <- test2.sj %>% mutate(stl.ets.ann = fit.sj.ets$mean) %>%
  mutate(stl.ets.ann = ifelse(stl.ets.ann>0, stl.ets.ann, ifelse(stl.ets.ann<0, 0, NA)))
# evaluate
stl.ets.sj.eval <- mean(abs(test2.sj$total_cases - test2.sj$stl.ets.ann))
stl.ets.sj.eval
## [1] 13.84629
# Iquitos

# fit model and plot
fit.iq.ets <- stlf(train2.iq.ts, etsmodel="ANN", h=78)
fit.iq.ets %>% autoplot() +
  ggtitle("Forecasts from STL + ETS(A,N,N) for Iquitos") +
  ylab("Cases") +
  xlab("Time")

# predict
test2.iq <- test2.iq %>% mutate(stl.ets.ann = fit.iq.ets$mean) %>%
  mutate(stl.ets.ann = ifelse(stl.ets.ann>0, stl.ets.ann, ifelse(stl.ets.ann<0, 0, NA)))
# evaluate
stl.ets.iq.eval <- mean(abs(test2.iq$total_cases - test2.iq$stl.ets.ann))
stl.ets.iq.eval
## [1] 5.32278

Neural Network Autoregression

# San Juan

# fit model and plot
fit.sj.nnar <- nnetar(train2.sj.ts)
# predict
forecast.sj.nnar <- fit.sj.nnar %>% forecast(h=121, PI=TRUE, npaths=100)
autoplot(forecast.sj.nnar)

test2.sj <- test2.sj %>% mutate(nnar = forecast.sj.nnar$mean) %>%
  mutate(nnar = ifelse(nnar>0, nnar, ifelse(nnar<0, 0, NA)))
# evaluate
nnar.sj.eval <- mean(abs(test2.sj$total_cases - test2.sj$nnar))
nnar.sj.eval
## [1] 28.05033
# Iquitos

# fit model and plot
fit.iq.nnar <- nnetar(train2.iq.ts)
# predict
forecast.iq.nnar <- fit.iq.nnar %>% forecast(h=78, PI=TRUE, npaths=100)
autoplot(forecast.iq.nnar)

test2.iq <- test2.iq %>% mutate(nnar = forecast.iq.nnar$mean) %>%
  mutate(nnar = ifelse(nnar>0, nnar, ifelse(nnar<0, 0, NA)))
# evaluate
nnar.iq.eval <- mean(abs(test2.iq$total_cases - test2.iq$nnar))
nnar.iq.eval
## [1] 6.356278

Model Combination

We decided trying out averaging two models that seem to have reasonable predictions: the STL+ETS and the Regression with Lagged Predictors. We are hoping that this ensemble will outperform every single model structure tried out above.

# San Juan

# Lets use the average from the best two models, the stl+ets and the arima with lag predictors
# taking into account both info

mean(abs(test2.sj$total_cases - (test2.sj$arima.lag.sj + test2.sj$stl.ets.ann)/2))
## [1] 13.31814

Best so far for San Juan, great!

# Iquitos

mean(abs(test2.iq$total_cases - (test2.iq$stl.ets.ann + test2.iq$arima.lag.iq)/2))
## [1] 4.480311

Also best so far for Iquitos!

As a result, we will be using the combined model shown above to generate final predictions.

Final Training and Prediction

Fit on the whole training set and predict on the submission (official test set)

# GENERATE FEATURES ON TEST SET AS WELL (NEED TO COMBINE TO GENERATE LAGS)

full.sj <- rbind(dplyr::filter(train, city == "sj"), dplyr::filter(test, city == "sj") )
full.iq <- rbind(dplyr::filter(train, city == "iq"), dplyr::filter(test, city == "iq") )


# Lagged predictors.
full.sj <- cbind(
  full.sj,
  HumLag1 = dplyr::lag(full.sj$reanalysis_specific_humidity_g_per_kg,1),
  HumLag2 = dplyr::lag(full.sj$reanalysis_specific_humidity_g_per_kg,2),
  HumLag3 = dplyr::lag(full.sj$reanalysis_specific_humidity_g_per_kg,3),
  HumLag4 = dplyr::lag(full.sj$reanalysis_specific_humidity_g_per_kg,4),
  PrecLag1 = dplyr::lag(full.sj$reanalysis_precip_amt_kg_per_m2,1),
  PrecLag2 = dplyr::lag(full.sj$reanalysis_precip_amt_kg_per_m2,2),
  PrecLag3 = dplyr::lag(full.sj$reanalysis_precip_amt_kg_per_m2,3),
  PrecLag4 = dplyr::lag(full.sj$reanalysis_precip_amt_kg_per_m2,4),
  DewLag1 = dplyr::lag(full.sj$reanalysis_dew_point_temp_k,1),
  DewLag2 = dplyr::lag(full.sj$reanalysis_dew_point_temp_k,2),
  DewLag3 = dplyr::lag(full.sj$reanalysis_dew_point_temp_k,3),
  DewLag4 = dplyr::lag(full.sj$reanalysis_dew_point_temp_k,4),
  TempLag1 = dplyr::lag(full.sj$reanalysis_air_temp_k,1),
  TempLag2 = dplyr::lag(full.sj$reanalysis_air_temp_k,2),
  TempLag3 = dplyr::lag(full.sj$reanalysis_air_temp_k,3),
  TempLag4 = dplyr::lag(full.sj$reanalysis_air_temp_k,4)
)

full.iq <- cbind(
  full.iq,
  HumLag1 = dplyr::lag(full.iq$reanalysis_specific_humidity_g_per_kg,1),
  HumLag2 = dplyr::lag(full.iq$reanalysis_specific_humidity_g_per_kg,2),
  HumLag3 = dplyr::lag(full.iq$reanalysis_specific_humidity_g_per_kg,3),
  HumLag4 = dplyr::lag(full.iq$reanalysis_specific_humidity_g_per_kg,4),
  PrecLag1 = dplyr::lag(full.iq$reanalysis_precip_amt_kg_per_m2,1),
  PrecLag2 = dplyr::lag(full.iq$reanalysis_precip_amt_kg_per_m2,2),
  PrecLag3 = dplyr::lag(full.iq$reanalysis_precip_amt_kg_per_m2,3),
  PrecLag4 = dplyr::lag(full.iq$reanalysis_precip_amt_kg_per_m2,4),
  DewLag1 = dplyr::lag(full.iq$reanalysis_dew_point_temp_k,1),
  DewLag2 = dplyr::lag(full.iq$reanalysis_dew_point_temp_k,2),
  DewLag3 = dplyr::lag(full.iq$reanalysis_dew_point_temp_k,3),
  DewLag4 = dplyr::lag(full.iq$reanalysis_dew_point_temp_k,4),
  TempLag1 = dplyr::lag(full.iq$reanalysis_air_temp_k,1),
  TempLag2 = dplyr::lag(full.iq$reanalysis_air_temp_k,2),
  TempLag3 = dplyr::lag(full.iq$reanalysis_air_temp_k,3),
  TempLag4 = dplyr::lag(full.iq$reanalysis_air_temp_k,4)
)



# get full train data

train.ind.sj <- train.sj %>% dplyr::select(year, weekofyear)
train.sj <- train.ind.sj %>%
  dplyr::inner_join(full.sj, by=c("year", "weekofyear"))

train.ind.iq <- train.iq %>% dplyr::select(year, weekofyear)
train.iq <- train.ind.iq %>%
  dplyr::inner_join(full.iq, by=c("year", "weekofyear"))

train.sj.ts <- ts(train.sj, frequency = 52, start=1)
train.iq.ts <- ts(train.iq, frequency = 52, start=1)


# get final test data

test.ind.sj <- test.sj %>% dplyr::select(year, weekofyear)
test.sj <- test.ind.sj %>%
  dplyr::inner_join(full.sj, by=c("year", "weekofyear"))

test.ind.iq <- test.iq %>% dplyr::select(year, weekofyear)
test.iq <- test.ind.iq %>%
  dplyr::inner_join(full.iq, by=c("year", "weekofyear"))

test.sj.ts <- ts(test.sj, frequency = 52, start=1)
test.iq.ts <- ts(test.iq, frequency = 52, start=1)

STL + ETS Models

# San Juan

# fit model and plot
fit.sj.ets <- stlf(train.sj.ts[,'total_cases'], etsmodel="ANN", h=260)

fit.sj.ets %>% autoplot() +
  ggtitle("Forecasts from STL + ETS(A,N,N) for San Juan") +
  ylab("Cases") +
  xlab("Time")

# predict
test.sj <- test.sj %>% mutate(stl.ets.ann = fit.sj.ets$mean) %>%
  mutate(stl.ets.ann = ifelse(stl.ets.ann>0, stl.ets.ann, ifelse(stl.ets.ann<0, 0, NA)))
# Iquitos

# fit model and plot
fit.iq.ets <- stlf(train.iq.ts[,'total_cases'], etsmodel="ANN", h=156)

fit.iq.ets %>% autoplot() +
  ggtitle("Forecasts from STL + ETS(A,N,N) for Iquitos") +
  ylab("Cases") +
  xlab("Time")

# predict
test.iq <- test.iq %>% mutate(stl.ets.ann = fit.iq.ets$mean) %>%
  mutate(stl.ets.ann = ifelse(stl.ets.ann>0, stl.ets.ann, ifelse(stl.ets.ann<0, 0, NA)))

Regression with Lagged Predictors and ARIMA errors

# San Juan

fit.sj.arima.lag <- auto.arima(train.sj.ts[,"total_cases"],
                               xreg=cbind(train.sj.ts[,"reanalysis_specific_humidity_g_per_kg"],
                                          train.sj.ts[,"HumLag1"],
                                          train.sj.ts[,"HumLag2"],
                                          train.sj.ts[,"reanalysis_precip_amt_kg_per_m2"],
                                          train.sj.ts[,"PrecLag1"],
                                          train.sj.ts[,"PrecLag2"],
                                          train.sj.ts[,"reanalysis_dew_point_temp_k"],
                                          train.sj.ts[,"DewLag1"],
                                          train.sj.ts[,"DewLag2"],
                                          train.sj.ts[,"reanalysis_air_temp_k"],
                                          train.sj.ts[,"TempLag1"],
                                          train.sj.ts[,"TempLag2"]
                               ))

forecast.sj.arima.lag <- fit.sj.arima.lag %>%
  forecast(xreg=cbind(test.sj.ts[,"reanalysis_specific_humidity_g_per_kg"],
                      test.sj.ts[,"HumLag1"],
                      test.sj.ts[,"HumLag2"],
                      test.sj.ts[,"reanalysis_precip_amt_kg_per_m2"],
                      test.sj.ts[,"PrecLag1"],
                      test.sj.ts[,"PrecLag2"],
                      test.sj.ts[,"reanalysis_dew_point_temp_k"],
                      test.sj.ts[,"DewLag1"],
                      test.sj.ts[,"DewLag2"],
                      test.sj.ts[,"reanalysis_air_temp_k"],
                      test.sj.ts[,"TempLag1"],
                      test.sj.ts[,"TempLag2"]
  ))
## Warning in forecast.forecast_ARIMA(., xreg = cbind(test.sj.ts[,
## "reanalysis_specific_humidity_g_per_kg"], : xreg contains different column names
## from the xreg used in training. Please check that the regressors are in the same
## order.
forecast.sj.arima.lag %>% autoplot()

test.sj <- test.sj %>% mutate(arima.lag.sj = forecast.sj.arima.lag$mean)
# Iquitos

fit.iq.arima.lag <- auto.arima(train.iq.ts[,"total_cases"],
                               xreg=cbind(train.iq.ts[,"reanalysis_specific_humidity_g_per_kg"],
                                          train.iq.ts[,"HumLag1"],
                                          train.iq.ts[,"HumLag2"],
                                          train.iq.ts[,"reanalysis_precip_amt_kg_per_m2"],
                                          train.iq.ts[,"PrecLag1"],
                                          train.iq.ts[,"PrecLag2"],
                                          train.iq.ts[,"reanalysis_dew_point_temp_k"],
                                          train.iq.ts[,"DewLag1"],
                                          train.iq.ts[,"DewLag2"],
                                          train.iq.ts[,"reanalysis_air_temp_k"],
                                          train.iq.ts[,"TempLag1"],
                                          train.iq.ts[,"TempLag2"]
                               ))

forecast.iq.arima.lag <- fit.iq.arima.lag %>%
  forecast(xreg=cbind(test.iq.ts[,"reanalysis_specific_humidity_g_per_kg"],
                      test.iq.ts[,"HumLag1"],
                      test.iq.ts[,"HumLag2"],
                      test.iq.ts[,"reanalysis_precip_amt_kg_per_m2"],
                      test.iq.ts[,"PrecLag1"],
                      test.iq.ts[,"PrecLag2"],
                      test.iq.ts[,"reanalysis_dew_point_temp_k"],
                      test.iq.ts[,"DewLag1"],
                      test.iq.ts[,"DewLag2"],
                      test.iq.ts[,"reanalysis_air_temp_k"],
                      test.iq.ts[,"TempLag1"],
                      test.iq.ts[,"TempLag2"]
  ))
## Warning in forecast.forecast_ARIMA(., xreg = cbind(test.iq.ts[,
## "reanalysis_specific_humidity_g_per_kg"], : xreg contains different column names
## from the xreg used in training. Please check that the regressors are in the same
## order.
forecast.iq.arima.lag %>% autoplot()

test.iq <- test.iq %>% mutate(arima.lag.iq = forecast.iq.arima.lag$mean)

Combine / Average Forecasts

test.sj <- test.sj %>% mutate(total_cases = (stl.ets.ann+arima.lag.sj)/2)
test.iq <- test.iq %>% mutate(total_cases = (stl.ets.ann+arima.lag.iq)/2)
# Plot final forecasts
train.sj <- train.sj %>% mutate(enum=seq(1:936))
test.sj <- test.sj %>% mutate(enum=seq(937:1196)+936)
train.iq <- train.iq %>% mutate(enum=seq(1:520))
test.iq <- test.iq %>% mutate(enum=seq(521:676)+520)

ggplot(train.sj, aes(y=total_cases, x=enum)) +
  geom_line() +
  geom_line(test.sj,mapping = aes(y=total_cases, x=enum), color="red") +
  ggtitle("Dengue Fever Cases Forecasts in San Juan") +
  xlab("Time") +
  ylab("Total Cases")

ggplot(train.iq, aes(y=total_cases, x=enum)) +
  geom_line() +
  geom_line(test.iq, mapping = aes(y=total_cases, x=enum), col="red") +
  ggtitle("Dengue Fever Cases Forecasts in Iquitos") +
  xlab("Time") +
  ylab("Total Cases")

Generate and write Submission File

a <- test.sj %>% dplyr::select(city, year, weekofyear, total_cases) %>%
  mutate(total_cases = as.numeric(total_cases))
b <- test.iq %>% dplyr::select(city, year, weekofyear, total_cases) %>%
  mutate(total_cases = as.numeric(total_cases))

final.submission = rbind(a,b)
final.submission <- final.submission %>%
  mutate(total_cases = as.integer(floor(total_cases))+1)

write.csv(final.submission, 'final_sub_dengai_michailidis.csv', row.names = FALSE)