Import Data

setwd('/Users/brendanobrien/Downloads')
train_features = read.csv('Training_Data_Features.csv')
train_labels   = read.csv('Training_Data_Labels.csv')
densubformat = read.csv('Submission_Format.csv')

Load Libraries

require(tidyverse)
## Loading required package: tidyverse
## ── Attaching packages ───────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1     ✔ purrr   0.2.4
## ✔ tibble  1.4.2     ✔ dplyr   0.7.4
## ✔ tidyr   0.8.0     ✔ stringr 1.2.0
## ✔ readr   1.2.1     ✔ forcats 0.3.0
## Warning: package 'readr' was built under R version 3.4.4
## ── Conflicts ──────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(tidyverse)
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(forecast)
## Warning: package 'forecast' was built under R version 3.4.4
library(fpp)
## Loading required package: fma
## 
## Attaching package: 'fma'
## The following objects are masked from 'package:MASS':
## 
##     cement, housing, petrol
## Loading required package: expsmooth
## Loading required package: lmtest
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: tseries
## Warning: package 'tseries' was built under R version 3.4.4
library(corrplot)
## corrplot 0.84 loaded
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library(zoo)
library(RColorBrewer)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(MASS)
require(Amelia)
## Loading required package: Amelia
## Warning: package 'Amelia' was built under R version 3.4.4
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.5, built: 2018-05-07)
## ## Copyright (C) 2005-2018 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(Amelia)
library('broom') 
library('purrr')
library('lubridate')
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library('timeDate') 
library('tseries') 
library('forecast') 
library('prophet')
## Warning: package 'prophet' was built under R version 3.4.4
library('ggplot2') 
library('scales') 
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
library('grid') 
library('gridExtra') 
library('RColorBrewer') 
library('corrplot') 
library('ggridges')
## Warning: package 'ggridges' was built under R version 3.4.4
library('dplyr') 
library('readr') 
library('data.table')
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday,
##     week, yday, year
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
library('tibble')
library('tidyr')
library('stringr')
library('forcats') 
library('lazyeval') 
## 
## Attaching package: 'lazyeval'
## The following objects are masked from 'package:purrr':
## 
##     is_atomic, is_formula

In viewing the data, I found there to to be data on two different cities: San Juan and Iquitos. Therefore, I will seperate the data based on their city.

sj_train_features = train_features %>% filter(city == 'sj')
sj_train_labels   = train_labels   %>% filter(city == 'sj')

iq_train_features = train_features %>% filter(city == 'iq')
iq_train_labels   = train_labels   %>% filter(city == 'iq')
cat('\nSan Juan\n',
    '\t features: ', sj_train_features %>% ncol, 
    '\t entries: ' , sj_train_features %>% nrow,
    '\t labels: '  , sj_train_labels %>% nrow)
## 
## San Juan
##       features:  24   entries:  936   labels:  936
cat('\nIquitos\n',
    '\t features: ', iq_train_features %>% ncol, 
    '\t entries: ' , iq_train_features %>% nrow,
    '\t labels: '  , iq_train_labels %>% nrow)
## 
## Iquitos
##       features:  24   entries:  520   labels:  520

Descriptive Statistics

summary(sj_train_features)
##  city          year        weekofyear      week_start_date
##  iq:  0   Min.   :1990   Min.   : 1.00   1990-04-30:  1   
##  sj:936   1st Qu.:1994   1st Qu.:13.75   1990-05-07:  1   
##           Median :1999   Median :26.50   1990-05-14:  1   
##           Mean   :1999   Mean   :26.50   1990-05-21:  1   
##           3rd Qu.:2003   3rd Qu.:39.25   1990-05-28:  1   
##           Max.   :2008   Max.   :53.00   1990-06-04:  1   
##                                          (Other)   :930   
##     ndvi_ne            ndvi_nw            ndvi_se        
##  Min.   :-0.40625   Min.   :-0.45610   Min.   :-0.01553  
##  1st Qu.: 0.00450   1st Qu.: 0.01642   1st Qu.: 0.13928  
##  Median : 0.05770   Median : 0.06808   Median : 0.17719  
##  Mean   : 0.05792   Mean   : 0.06747   Mean   : 0.17766  
##  3rd Qu.: 0.11110   3rd Qu.: 0.11520   3rd Qu.: 0.21256  
##  Max.   : 0.49340   Max.   : 0.43710   Max.   : 0.39313  
##  NA's   :191        NA's   :49         NA's   :19        
##     ndvi_sw         precipitation_amt_mm reanalysis_air_temp_k
##  Min.   :-0.06346   Min.   :  0.00       Min.   :295.9        
##  1st Qu.: 0.12916   1st Qu.:  0.00       1st Qu.:298.2        
##  Median : 0.16597   Median : 20.80       Median :299.3        
##  Mean   : 0.16596   Mean   : 35.47       Mean   :299.2        
##  3rd Qu.: 0.20277   3rd Qu.: 52.18       3rd Qu.:300.1        
##  Max.   : 0.38142   Max.   :390.60       Max.   :302.2        
##  NA's   :19         NA's   :9            NA's   :6            
##  reanalysis_avg_temp_k reanalysis_dew_point_temp_k
##  Min.   :296.1         Min.   :289.6              
##  1st Qu.:298.3         1st Qu.:293.8              
##  Median :299.4         Median :295.5              
##  Mean   :299.3         Mean   :295.1              
##  3rd Qu.:300.2         3rd Qu.:296.4              
##  Max.   :302.2         Max.   :297.8              
##  NA's   :6             NA's   :6                  
##  reanalysis_max_air_temp_k reanalysis_min_air_temp_k
##  Min.   :297.8             Min.   :292.6            
##  1st Qu.:300.4             1st Qu.:296.3            
##  Median :301.5             Median :297.5            
##  Mean   :301.4             Mean   :297.3            
##  3rd Qu.:302.4             3rd Qu.:298.4            
##  Max.   :304.3             Max.   :299.9            
##  NA's   :6                 NA's   :6                
##  reanalysis_precip_amt_kg_per_m2 reanalysis_relative_humidity_percent
##  Min.   :  0.00                  Min.   :66.74                       
##  1st Qu.: 10.82                  1st Qu.:76.25                       
##  Median : 21.30                  Median :78.67                       
##  Mean   : 30.47                  Mean   :78.57                       
##  3rd Qu.: 37.00                  3rd Qu.:80.96                       
##  Max.   :570.50                  Max.   :87.58                       
##  NA's   :6                       NA's   :6                           
##  reanalysis_sat_precip_amt_mm reanalysis_specific_humidity_g_per_kg
##  Min.   :  0.00               Min.   :11.72                        
##  1st Qu.:  0.00               1st Qu.:15.24                        
##  Median : 20.80               Median :16.85                        
##  Mean   : 35.47               Mean   :16.55                        
##  3rd Qu.: 52.18               3rd Qu.:17.86                        
##  Max.   :390.60               Max.   :19.44                        
##  NA's   :9                    NA's   :6                            
##  reanalysis_tdtr_k station_avg_temp_c station_diur_temp_rng_c
##  Min.   :1.357     Min.   :22.84      Min.   :4.529          
##  1st Qu.:2.157     1st Qu.:25.84      1st Qu.:6.200          
##  Median :2.457     Median :27.23      Median :6.757          
##  Mean   :2.516     Mean   :27.01      Mean   :6.757          
##  3rd Qu.:2.800     3rd Qu.:28.19      3rd Qu.:7.286          
##  Max.   :4.429     Max.   :30.07      Max.   :9.914          
##  NA's   :6         NA's   :6          NA's   :6              
##  station_max_temp_c station_min_temp_c station_precip_mm
##  Min.   :26.70      Min.   :17.8       Min.   :  0.000  
##  1st Qu.:30.60      1st Qu.:21.7       1st Qu.:  6.825  
##  Median :31.70      Median :22.8       Median : 17.750  
##  Mean   :31.61      Mean   :22.6       Mean   : 26.785  
##  3rd Qu.:32.80      3rd Qu.:23.9       3rd Qu.: 35.450  
##  Max.   :35.60      Max.   :25.6       Max.   :305.900  
##  NA's   :6          NA's   :6          NA's   :6
summary(iq_train_features)
##  city          year        weekofyear      week_start_date
##  iq:520   Min.   :2000   Min.   : 1.00   2000-07-01:  1   
##  sj:  0   1st Qu.:2003   1st Qu.:13.75   2000-07-08:  1   
##           Median :2005   Median :26.50   2000-07-15:  1   
##           Mean   :2005   Mean   :26.50   2000-07-22:  1   
##           3rd Qu.:2007   3rd Qu.:39.25   2000-07-29:  1   
##           Max.   :2010   Max.   :53.00   2000-08-05:  1   
##                                          (Other)   :514   
##     ndvi_ne           ndvi_nw           ndvi_se           ndvi_sw       
##  Min.   :0.06173   Min.   :0.03586   Min.   :0.02988   Min.   :0.06418  
##  1st Qu.:0.20000   1st Qu.:0.17954   1st Qu.:0.19474   1st Qu.:0.20413  
##  Median :0.26364   Median :0.23297   Median :0.24980   Median :0.26214  
##  Mean   :0.26387   Mean   :0.23878   Mean   :0.25013   Mean   :0.26678  
##  3rd Qu.:0.31997   3rd Qu.:0.29393   3rd Qu.:0.30230   3rd Qu.:0.32515  
##  Max.   :0.50836   Max.   :0.45443   Max.   :0.53831   Max.   :0.54602  
##  NA's   :3         NA's   :3         NA's   :3         NA's   :3        
##  precipitation_amt_mm reanalysis_air_temp_k reanalysis_avg_temp_k
##  Min.   :  0.00       Min.   :294.6         Min.   :294.9        
##  1st Qu.: 39.10       1st Qu.:297.1         1st Qu.:298.2        
##  Median : 60.47       Median :297.8         Median :299.1        
##  Mean   : 64.25       Mean   :297.9         Mean   :299.1        
##  3rd Qu.: 85.76       3rd Qu.:298.6         3rd Qu.:300.1        
##  Max.   :210.83       Max.   :301.6         Max.   :302.9        
##  NA's   :4            NA's   :4             NA's   :4            
##  reanalysis_dew_point_temp_k reanalysis_max_air_temp_k
##  Min.   :290.1               Min.   :300.0            
##  1st Qu.:294.6               1st Qu.:305.2            
##  Median :295.9               Median :307.1            
##  Mean   :295.5               Mean   :307.1            
##  3rd Qu.:296.5               3rd Qu.:308.7            
##  Max.   :298.4               Max.   :314.0            
##  NA's   :4                   NA's   :4                
##  reanalysis_min_air_temp_k reanalysis_precip_amt_kg_per_m2
##  Min.   :286.9             Min.   :  0.00                 
##  1st Qu.:292.0             1st Qu.: 24.07                 
##  Median :293.1             Median : 46.44                 
##  Mean   :292.9             Mean   : 57.61                 
##  3rd Qu.:294.2             3rd Qu.: 71.07                 
##  Max.   :296.0             Max.   :362.03                 
##  NA's   :4                 NA's   :4                      
##  reanalysis_relative_humidity_percent reanalysis_sat_precip_amt_mm
##  Min.   :57.79                        Min.   :  0.00              
##  1st Qu.:84.30                        1st Qu.: 39.10              
##  Median :90.92                        Median : 60.47              
##  Mean   :88.64                        Mean   : 64.25              
##  3rd Qu.:94.56                        3rd Qu.: 85.76              
##  Max.   :98.61                        Max.   :210.83              
##  NA's   :4                            NA's   :4                   
##  reanalysis_specific_humidity_g_per_kg reanalysis_tdtr_k
##  Min.   :12.11                         Min.   : 3.714   
##  1st Qu.:16.10                         1st Qu.: 7.371   
##  Median :17.43                         Median : 8.964   
##  Mean   :17.10                         Mean   : 9.207   
##  3rd Qu.:18.18                         3rd Qu.:11.014   
##  Max.   :20.46                         Max.   :16.029   
##  NA's   :4                             NA's   :4        
##  station_avg_temp_c station_diur_temp_rng_c station_max_temp_c
##  Min.   :21.40      Min.   : 5.20           Min.   :30.1      
##  1st Qu.:27.00      1st Qu.: 9.50           1st Qu.:33.2      
##  Median :27.60      Median :10.62           Median :34.0      
##  Mean   :27.53      Mean   :10.57           Mean   :34.0      
##  3rd Qu.:28.10      3rd Qu.:11.65           3rd Qu.:34.9      
##  Max.   :30.80      Max.   :15.80           Max.   :42.2      
##  NA's   :37         NA's   :37              NA's   :14        
##  station_min_temp_c station_precip_mm
##  Min.   :14.7       Min.   :  0.00   
##  1st Qu.:20.6       1st Qu.: 17.20   
##  Median :21.3       Median : 45.30   
##  Mean   :21.2       Mean   : 62.47   
##  3rd Qu.:22.0       3rd Qu.: 85.95   
##  Max.   :24.2       Max.   :543.30   
##  NA's   :8          NA's   :16

There are NAs in many of the observations, particularly in the week_start_data variable. Therefore, I chose to remove this variable.

Data Preperation

sj_train_features %<>% dplyr::select(-week_start_date)
iq_train_features %<>% dplyr::select(-week_start_date)

Fill in remainder of NAs with the most recent value

sj_train_features$ndvi_ne %<>% na.locf(fromLast = TRUE)
iq_train_features$ndvi_ne %<>% na.locf(fromLast = TRUE)

Now, the labels must be applied to the features of the data set.

cat('\nSan Juan\n',
    '\t total cases mean: ',      sj_train_labels$total_cases %>% mean(), 
    '\t total cases variance: ' , sj_train_labels$total_cases %>% var() )
## 
## San Juan
##       total cases mean:  34.18056     total cases variance:  2640.045
cat('\nIquitos\n',
    '\t total cases mean: ',      iq_train_labels$total_cases %>% mean(), 
    '\t total cases variance: ' , iq_train_labels$total_cases %>% var() )
## 
## Iquitos
##       total cases mean:  7.565385     total cases variance:  115.8955

Complete data frames/structure

preprocessData <- function(data_path, labels_path = NULL)
{
  df <- read.csv(data_path)
  features = c("reanalysis_specific_humidity_g_per_kg", "reanalysis_dew_point_temp_k",
               "station_avg_temp_c", "station_min_temp_c", "ndvi_ne", "ndvi_se", "ndvi_nw","ndvi_sw", "station_precip_mm" )
  df[features] %<>% na.locf(fromLast = TRUE) 
  if (is.null(labels_path)) features %<>% c("city", "year", "weekofyear")
  df <- df[features]
  if (!is.null(labels_path)) df %<>% cbind(read.csv(labels_path))
  df_sj <- filter(df, city == 'sj')
  df_iq <- filter(df, city == 'iq')
  return(list(df_sj, df_iq))
}
preprocessData(data_path = 'Training_Data_Features.csv', labels_path = 'Training_Data_Labels.csv') -> trains
sj_train <- trains[[1]]; iq_train <- as.data.frame(trains[2])

Data Exploration

rbind(iq_train_labels, sj_train_labels) %>% 
  ggplot(aes(x = total_cases, fill = ..count..)) + 
  geom_histogram(bins = 20, colour = 'black') + ggtitle('Total Cases of Dengue') +scale_y_continuous(breaks = seq(0,700,100)) + facet_wrap(~city)

A look at the total cases of Dengue in each city.

sj_train_features %<>% mutate('total_cases' = sj_train_labels$total_cases)
iq_train_features %<>% mutate('total_cases' = iq_train_labels$total_cases)
Adding the total number of cases in each city to the data
sj_train_features %<>% mutate('total_cases' = sj_train_labels$total_cases)
iq_train_features %<>% mutate('total_cases' = iq_train_labels$total_cases)

Create Time Series

sj.ts<-ts(sj_train_features$total_cases, frequency=52, start=c(1990, 18))
iq.ts<-ts(iq_train_features$total_cases, frequency=52, start=c(2000, 26))
plot(sj.ts)

plot(iq.ts)

Create Training and Test Set for Time Series

sj.train.ts<-window(sj.ts, end=c(2004,37))
sj.test.ts<-window(sj.ts,start=c(2004,38))

iq.train.ts<-window(iq.ts, end=c(2008,27))
iq.test.ts<-window(iq.ts,start=c(2008,28))

Build Models

Auto.ARIMA Models

San Juan
sj.arima=auto.arima(sj.train.ts)
sj.forecast1=forecast(sj.arima,h=260)
autoplot(sj.forecast1)

Iquitos
iq.arima=auto.arima(iq.train.ts)
iq.forecast1=forecast(iq.arima,h=156)
autoplot(iq.forecast1)

Neural Network Models

San Juan
sj.nnetar<-nnetar(sj.train.ts)
sj.forecast2<-forecast(sj.nnetar,h=260)
autoplot(sj.forecast2) 

#####Iquitos

iq.nnetar<-nnetar(iq.train.ts)
iq.forecast2<-forecast(iq.nnetar,h=156)
autoplot(iq.forecast2) 

Random Forest

San Juan
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.4.4
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
sj.rF<-randomForest(total_cases ~ ndvi_ne + ndvi_nw + ndvi_se + ndvi_sw +  reanalysis_specific_humidity_g_per_kg + station_avg_temp_c + reanalysis_dew_point_temp_k + station_min_temp_c + station_precip_mm , data = sj_train)
sj.forecast3<- predict(object=sj.rF, h=260)
plot(sj.forecast3) 

Iquitos
iq.rF<-randomForest(total_cases ~ ndvi_ne + ndvi_nw + ndvi_se + ndvi_sw +  reanalysis_specific_humidity_g_per_kg + station_avg_temp_c + reanalysis_dew_point_temp_k + station_min_temp_c + station_precip_mm , data = iq_train)
iq.forecast3<- predict(object=iq.rF, h=156)
plot(iq.forecast3) 

Model Selection

Arima Accuracy and Residuals
acc.sj.arima <- accuracy(sj.forecast1, sj.test.ts)

acc.iq.arima <- accuracy(iq.forecast1,iq.test.ts)

tsdisplay(residuals(sj.forecast1))

acc.sj.arima
##                        ME     RMSE       MAE  MPE MAPE      MASE
## Training set  0.006577048 13.83894  8.379156  NaN  Inf 0.2102492
## Test set     14.740977080 34.48529 18.234742 -Inf  Inf 0.4575449
##                     ACF1 Theil's U
## Training set 0.007353076        NA
## Test set     0.932173942         0
tsdisplay(residuals(iq.forecast1))

acc.iq.arima
##                       ME     RMSE      MAE  MPE MAPE      MASE        ACF1
## Training set 0.008331804  7.16080 3.773915  NaN  Inf 0.4123143 0.006181246
## Test set     7.882537945 13.92229 8.265254 -Inf  Inf 0.9030098 0.842766026
##              Theil's U
## Training set        NA
## Test set             0
Neural Network Accuracy and Residuals
acc.sj.nnetar <- accuracy(sj.forecast2, sj.test.ts)

acc.iq.nnetar <- accuracy(iq.forecast2,iq.test.ts)

tsdisplay(residuals(sj.forecast2))

acc.sj.nnetar
##                       ME      RMSE       MAE  MPE MAPE      MASE
## Training set -0.01880461  6.504068  4.777888 -Inf  Inf 0.1198864
## Test set      6.05422937 32.475855 19.514153 -Inf  Inf 0.4896478
##                     ACF1 Theil's U
## Training set -0.01533117        NA
## Test set      0.93554975         0
tsdisplay(residuals(iq.forecast2))

acc.iq.nnetar
##                       ME      RMSE      MAE  MPE MAPE      MASE
## Training set -0.01342703  5.887483 3.558980 -Inf  Inf 0.3888319
## Test set      5.08383902 12.626048 7.267046 -Inf  Inf 0.7939519
##                     ACF1 Theil's U
## Training set 0.002369741        NA
## Test set     0.846631790         0
RandomForest Accuracy and Residuals
acc.sj.rF <- accuracy(sj.forecast3, sj.test.ts)

acc.iq.rF <- accuracy(iq.forecast3,iq.test.ts)


acc.sj.rF
##                 ME     RMSE      MAE  MPE MAPE      ACF1 Theil's U
## Test set -7.436914 36.15461 28.11977 -Inf  Inf 0.8541588         0
acc.iq.rF
##                ME     RMSE      MAE  MPE MAPE      ACF1 Theil's U
## Test set 3.222535 12.99857 7.777463 -Inf  Inf 0.8037225         0

Neural Network is the best forecasting model according to test metrics

Submit to Competition

NN_sj_final=data.frame(densubformat[1:260,-4],total_cases=round(sj.forecast2$mean))
NN_iq_final=data.frame(densubformat[261:416,-4],total_cases=round(iq.forecast2$mean))
Dengue_Arima_solution=bind_rows(NN_sj_final,NN_iq_final)
## Warning in bind_rows_(x, .id): Vectorizing 'ts' elements may not preserve
## their attributes

## Warning in bind_rows_(x, .id): Vectorizing 'ts' elements may not preserve
## their attributes
write.csv(Dengue_Arima_solution,file="Dengue_Arima_predictions.csv",row.names=F)

I recieved a final score of 28.64 from my Neural Networking model