Final Project

Import Data

setwd('/Users/brendanobrien/Downloads')
train_features = read.csv('Training_Data_Features.csv')
train_labels   = read.csv('Training_Data_Labels.csv')
densubformat = read.csv('Submission_Format.csv')

Load Libraries

require(tidyverse)

## Loading required package: tidyverse

## ── Attaching packages ───────────────────────── tidyverse 1.2.1 ──

## ✔ ggplot2 2.2.1     ✔ purrr   0.2.4
## ✔ tibble  1.4.2     ✔ dplyr   0.7.4
## ✔ tidyr   0.8.0     ✔ stringr 1.2.0
## ✔ readr   1.2.1     ✔ forcats 0.3.0

## Warning: package 'readr' was built under R version 3.4.4

## ── Conflicts ──────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(tidyverse)
library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

library(forecast)

## Warning: package 'forecast' was built under R version 3.4.4

library(fpp)

## Loading required package: fma

## 
## Attaching package: 'fma'

## The following objects are masked from 'package:MASS':
## 
##     cement, housing, petrol

## Loading required package: expsmooth

## Loading required package: lmtest

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## Loading required package: tseries

## Warning: package 'tseries' was built under R version 3.4.4

library(corrplot)

## corrplot 0.84 loaded

library(magrittr)

## 
## Attaching package: 'magrittr'

## The following object is masked from 'package:purrr':
## 
##     set_names

## The following object is masked from 'package:tidyr':
## 
##     extract

library(zoo)
library(RColorBrewer)
library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(MASS)
require(Amelia)

## Loading required package: Amelia

## Warning: package 'Amelia' was built under R version 3.4.4

## Loading required package: Rcpp

## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.5, built: 2018-05-07)
## ## Copyright (C) 2005-2018 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##

library(Amelia)
library('broom') 
library('purrr')
library('lubridate')

## 
## Attaching package: 'lubridate'

## The following object is masked from 'package:base':
## 
##     date

library('timeDate') 
library('tseries') 
library('forecast') 
library('prophet')

## Warning: package 'prophet' was built under R version 3.4.4

library('ggplot2') 
library('scales')

## 
## Attaching package: 'scales'

## The following object is masked from 'package:purrr':
## 
##     discard

## The following object is masked from 'package:readr':
## 
##     col_factor

library('grid') 
library('gridExtra') 
library('RColorBrewer') 
library('corrplot') 
library('ggridges')

## Warning: package 'ggridges' was built under R version 3.4.4

library('dplyr') 
library('readr') 
library('data.table')

## 
## Attaching package: 'data.table'

## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday,
##     week, yday, year

## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

## The following object is masked from 'package:purrr':
## 
##     transpose

library('tibble')
library('tidyr')
library('stringr')
library('forcats') 
library('lazyeval')

## 
## Attaching package: 'lazyeval'

## The following objects are masked from 'package:purrr':
## 
##     is_atomic, is_formula

In viewing the data, I found there to to be data on two different cities: San Juan and Iquitos. Therefore, I will seperate the data based on their city.

sj_train_features = train_features %>% filter(city == 'sj')
sj_train_labels   = train_labels   %>% filter(city == 'sj')

iq_train_features = train_features %>% filter(city == 'iq')
iq_train_labels   = train_labels   %>% filter(city == 'iq')

cat('\nSan Juan\n',
    '\t features: ', sj_train_features %>% ncol, 
    '\t entries: ' , sj_train_features %>% nrow,
    '\t labels: '  , sj_train_labels %>% nrow)

## 
## San Juan
##       features:  24   entries:  936   labels:  936

cat('\nIquitos\n',
    '\t features: ', iq_train_features %>% ncol, 
    '\t entries: ' , iq_train_features %>% nrow,
    '\t labels: '  , iq_train_labels %>% nrow)

## 
## Iquitos
##       features:  24   entries:  520   labels:  520

Descriptive Statistics

summary(sj_train_features)

##  city          year        weekofyear      week_start_date
##  iq:  0   Min.   :1990   Min.   : 1.00   1990-04-30:  1   
##  sj:936   1st Qu.:1994   1st Qu.:13.75   1990-05-07:  1   
##           Median :1999   Median :26.50   1990-05-14:  1   
##           Mean   :1999   Mean   :26.50   1990-05-21:  1   
##           3rd Qu.:2003   3rd Qu.:39.25   1990-05-28:  1   
##           Max.   :2008   Max.   :53.00   1990-06-04:  1   
##                                          (Other)   :930   
##     ndvi_ne            ndvi_nw            ndvi_se        
##  Min.   :-0.40625   Min.   :-0.45610   Min.   :-0.01553  
##  1st Qu.: 0.00450   1st Qu.: 0.01642   1st Qu.: 0.13928  
##  Median : 0.05770   Median : 0.06808   Median : 0.17719  
##  Mean   : 0.05792   Mean   : 0.06747   Mean   : 0.17766  
##  3rd Qu.: 0.11110   3rd Qu.: 0.11520   3rd Qu.: 0.21256  
##  Max.   : 0.49340   Max.   : 0.43710   Max.   : 0.39313  
##  NA's   :191        NA's   :49         NA's   :19        
##     ndvi_sw         precipitation_amt_mm reanalysis_air_temp_k
##  Min.   :-0.06346   Min.   :  0.00       Min.   :295.9        
##  1st Qu.: 0.12916   1st Qu.:  0.00       1st Qu.:298.2        
##  Median : 0.16597   Median : 20.80       Median :299.3        
##  Mean   : 0.16596   Mean   : 35.47       Mean   :299.2        
##  3rd Qu.: 0.20277   3rd Qu.: 52.18       3rd Qu.:300.1        
##  Max.   : 0.38142   Max.   :390.60       Max.   :302.2        
##  NA's   :19         NA's   :9            NA's   :6            
##  reanalysis_avg_temp_k reanalysis_dew_point_temp_k
##  Min.   :296.1         Min.   :289.6              
##  1st Qu.:298.3         1st Qu.:293.8              
##  Median :299.4         Median :295.5              
##  Mean   :299.3         Mean   :295.1              
##  3rd Qu.:300.2         3rd Qu.:296.4              
##  Max.   :302.2         Max.   :297.8              
##  NA's   :6             NA's   :6                  
##  reanalysis_max_air_temp_k reanalysis_min_air_temp_k
##  Min.   :297.8             Min.   :292.6            
##  1st Qu.:300.4             1st Qu.:296.3            
##  Median :301.5             Median :297.5            
##  Mean   :301.4             Mean   :297.3            
##  3rd Qu.:302.4             3rd Qu.:298.4            
##  Max.   :304.3             Max.   :299.9            
##  NA's   :6                 NA's   :6                
##  reanalysis_precip_amt_kg_per_m2 reanalysis_relative_humidity_percent
##  Min.   :  0.00                  Min.   :66.74                       
##  1st Qu.: 10.82                  1st Qu.:76.25                       
##  Median : 21.30                  Median :78.67                       
##  Mean   : 30.47                  Mean   :78.57                       
##  3rd Qu.: 37.00                  3rd Qu.:80.96                       
##  Max.   :570.50                  Max.   :87.58                       
##  NA's   :6                       NA's   :6                           
##  reanalysis_sat_precip_amt_mm reanalysis_specific_humidity_g_per_kg
##  Min.   :  0.00               Min.   :11.72                        
##  1st Qu.:  0.00               1st Qu.:15.24                        
##  Median : 20.80               Median :16.85                        
##  Mean   : 35.47               Mean   :16.55                        
##  3rd Qu.: 52.18               3rd Qu.:17.86                        
##  Max.   :390.60               Max.   :19.44                        
##  NA's   :9                    NA's   :6                            
##  reanalysis_tdtr_k station_avg_temp_c station_diur_temp_rng_c
##  Min.   :1.357     Min.   :22.84      Min.   :4.529          
##  1st Qu.:2.157     1st Qu.:25.84      1st Qu.:6.200          
##  Median :2.457     Median :27.23      Median :6.757          
##  Mean   :2.516     Mean   :27.01      Mean   :6.757          
##  3rd Qu.:2.800     3rd Qu.:28.19      3rd Qu.:7.286          
##  Max.   :4.429     Max.   :30.07      Max.   :9.914          
##  NA's   :6         NA's   :6          NA's   :6              
##  station_max_temp_c station_min_temp_c station_precip_mm
##  Min.   :26.70      Min.   :17.8       Min.   :  0.000  
##  1st Qu.:30.60      1st Qu.:21.7       1st Qu.:  6.825  
##  Median :31.70      Median :22.8       Median : 17.750  
##  Mean   :31.61      Mean   :22.6       Mean   : 26.785  
##  3rd Qu.:32.80      3rd Qu.:23.9       3rd Qu.: 35.450  
##  Max.   :35.60      Max.   :25.6       Max.   :305.900  
##  NA's   :6          NA's   :6          NA's   :6

summary(iq_train_features)

##  city          year        weekofyear      week_start_date
##  iq:520   Min.   :2000   Min.   : 1.00   2000-07-01:  1   
##  sj:  0   1st Qu.:2003   1st Qu.:13.75   2000-07-08:  1   
##           Median :2005   Median :26.50   2000-07-15:  1   
##           Mean   :2005   Mean   :26.50   2000-07-22:  1   
##           3rd Qu.:2007   3rd Qu.:39.25   2000-07-29:  1   
##           Max.   :2010   Max.   :53.00   2000-08-05:  1   
##                                          (Other)   :514   
##     ndvi_ne           ndvi_nw           ndvi_se           ndvi_sw       
##  Min.   :0.06173   Min.   :0.03586   Min.   :0.02988   Min.   :0.06418  
##  1st Qu.:0.20000   1st Qu.:0.17954   1st Qu.:0.19474   1st Qu.:0.20413  
##  Median :0.26364   Median :0.23297   Median :0.24980   Median :0.26214  
##  Mean   :0.26387   Mean   :0.23878   Mean   :0.25013   Mean   :0.26678  
##  3rd Qu.:0.31997   3rd Qu.:0.29393   3rd Qu.:0.30230   3rd Qu.:0.32515  
##  Max.   :0.50836   Max.   :0.45443   Max.   :0.53831   Max.   :0.54602  
##  NA's   :3         NA's   :3         NA's   :3         NA's   :3        
##  precipitation_amt_mm reanalysis_air_temp_k reanalysis_avg_temp_k
##  Min.   :  0.00       Min.   :294.6         Min.   :294.9        
##  1st Qu.: 39.10       1st Qu.:297.1         1st Qu.:298.2        
##  Median : 60.47       Median :297.8         Median :299.1        
##  Mean   : 64.25       Mean   :297.9         Mean   :299.1        
##  3rd Qu.: 85.76       3rd Qu.:298.6         3rd Qu.:300.1        
##  Max.   :210.83       Max.   :301.6         Max.   :302.9        
##  NA's   :4            NA's   :4             NA's   :4            
##  reanalysis_dew_point_temp_k reanalysis_max_air_temp_k
##  Min.   :290.1               Min.   :300.0            
##  1st Qu.:294.6               1st Qu.:305.2            
##  Median :295.9               Median :307.1            
##  Mean   :295.5               Mean   :307.1            
##  3rd Qu.:296.5               3rd Qu.:308.7            
##  Max.   :298.4               Max.   :314.0            
##  NA's   :4                   NA's   :4                
##  reanalysis_min_air_temp_k reanalysis_precip_amt_kg_per_m2
##  Min.   :286.9             Min.   :  0.00                 
##  1st Qu.:292.0             1st Qu.: 24.07                 
##  Median :293.1             Median : 46.44                 
##  Mean   :292.9             Mean   : 57.61                 
##  3rd Qu.:294.2             3rd Qu.: 71.07                 
##  Max.   :296.0             Max.   :362.03                 
##  NA's   :4                 NA's   :4                      
##  reanalysis_relative_humidity_percent reanalysis_sat_precip_amt_mm
##  Min.   :57.79                        Min.   :  0.00              
##  1st Qu.:84.30                        1st Qu.: 39.10              
##  Median :90.92                        Median : 60.47              
##  Mean   :88.64                        Mean   : 64.25              
##  3rd Qu.:94.56                        3rd Qu.: 85.76              
##  Max.   :98.61                        Max.   :210.83              
##  NA's   :4                            NA's   :4                   
##  reanalysis_specific_humidity_g_per_kg reanalysis_tdtr_k
##  Min.   :12.11                         Min.   : 3.714   
##  1st Qu.:16.10                         1st Qu.: 7.371   
##  Median :17.43                         Median : 8.964   
##  Mean   :17.10                         Mean   : 9.207   
##  3rd Qu.:18.18                         3rd Qu.:11.014   
##  Max.   :20.46                         Max.   :16.029   
##  NA's   :4                             NA's   :4        
##  station_avg_temp_c station_diur_temp_rng_c station_max_temp_c
##  Min.   :21.40      Min.   : 5.20           Min.   :30.1      
##  1st Qu.:27.00      1st Qu.: 9.50           1st Qu.:33.2      
##  Median :27.60      Median :10.62           Median :34.0      
##  Mean   :27.53      Mean   :10.57           Mean   :34.0      
##  3rd Qu.:28.10      3rd Qu.:11.65           3rd Qu.:34.9      
##  Max.   :30.80      Max.   :15.80           Max.   :42.2      
##  NA's   :37         NA's   :37              NA's   :14        
##  station_min_temp_c station_precip_mm
##  Min.   :14.7       Min.   :  0.00   
##  1st Qu.:20.6       1st Qu.: 17.20   
##  Median :21.3       Median : 45.30   
##  Mean   :21.2       Mean   : 62.47   
##  3rd Qu.:22.0       3rd Qu.: 85.95   
##  Max.   :24.2       Max.   :543.30   
##  NA's   :8          NA's   :16

There are NAs in many of the observations, particularly in the week_start_data variable. Therefore, I chose to remove this variable.

Data Preperation

sj_train_features %<>% dplyr::select(-week_start_date)
iq_train_features %<>% dplyr::select(-week_start_date)

Fill in remainder of NAs with the most recent value

sj_train_features$ndvi_ne %<>% na.locf(fromLast = TRUE)
iq_train_features$ndvi_ne %<>% na.locf(fromLast = TRUE)

Now, the labels must be applied to the features of the data set.

cat('\nSan Juan\n',
    '\t total cases mean: ',      sj_train_labels$total_cases %>% mean(), 
    '\t total cases variance: ' , sj_train_labels$total_cases %>% var() )

## 
## San Juan
##       total cases mean:  34.18056     total cases variance:  2640.045

cat('\nIquitos\n',
    '\t total cases mean: ',      iq_train_labels$total_cases %>% mean(), 
    '\t total cases variance: ' , iq_train_labels$total_cases %>% var() )

## 
## Iquitos
##       total cases mean:  7.565385     total cases variance:  115.8955

Complete data frames/structure

preprocessData <- function(data_path, labels_path = NULL)
{
  df <- read.csv(data_path)
  features = c("reanalysis_specific_humidity_g_per_kg", "reanalysis_dew_point_temp_k",
               "station_avg_temp_c", "station_min_temp_c", "ndvi_ne", "ndvi_se", "ndvi_nw","ndvi_sw", "station_precip_mm" )
  df[features] %<>% na.locf(fromLast = TRUE) 
  if (is.null(labels_path)) features %<>% c("city", "year", "weekofyear")
  df <- df[features]
  if (!is.null(labels_path)) df %<>% cbind(read.csv(labels_path))
  df_sj <- filter(df, city == 'sj')
  df_iq <- filter(df, city == 'iq')
  return(list(df_sj, df_iq))
}
preprocessData(data_path = 'Training_Data_Features.csv', labels_path = 'Training_Data_Labels.csv') -> trains
sj_train <- trains[[1]]; iq_train <- as.data.frame(trains[2])

Data Exploration

rbind(iq_train_labels, sj_train_labels) %>% 
  ggplot(aes(x = total_cases, fill = ..count..)) + 
  geom_histogram(bins = 20, colour = 'black') + ggtitle('Total Cases of Dengue') +scale_y_continuous(breaks = seq(0,700,100)) + facet_wrap(~city)

A look at the total cases of Dengue in each city.

sj_train_features %<>% mutate('total_cases' = sj_train_labels$total_cases)
iq_train_features %<>% mutate('total_cases' = iq_train_labels$total_cases)

Adding the total number of cases in each city to the data

sj_train_features %<>% mutate('total_cases' = sj_train_labels$total_cases)
iq_train_features %<>% mutate('total_cases' = iq_train_labels$total_cases)

Create Time Series

sj.ts<-ts(sj_train_features$total_cases, frequency=52, start=c(1990, 18))
iq.ts<-ts(iq_train_features$total_cases, frequency=52, start=c(2000, 26))
plot(sj.ts)

plot(iq.ts)

Create Training and Test Set for Time Series

sj.train.ts<-window(sj.ts, end=c(2004,37))
sj.test.ts<-window(sj.ts,start=c(2004,38))

iq.train.ts<-window(iq.ts, end=c(2008,27))
iq.test.ts<-window(iq.ts,start=c(2008,28))

Build Models

Auto.ARIMA Models

San Juan

sj.arima=auto.arima(sj.train.ts)

sj.forecast1=forecast(sj.arima,h=260)
autoplot(sj.forecast1)

Iquitos

iq.arima=auto.arima(iq.train.ts)

iq.forecast1=forecast(iq.arima,h=156)
autoplot(iq.forecast1)

Neural Network Models

San Juan

sj.nnetar<-nnetar(sj.train.ts)

sj.forecast2<-forecast(sj.nnetar,h=260)
autoplot(sj.forecast2)

#####Iquitos

iq.nnetar<-nnetar(iq.train.ts)

iq.forecast2<-forecast(iq.nnetar,h=156)
autoplot(iq.forecast2)

Random Forest

San Juan

library(randomForest)

## Warning: package 'randomForest' was built under R version 3.4.4

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:gridExtra':
## 
##     combine

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

sj.rF<-randomForest(total_cases ~ ndvi_ne + ndvi_nw + ndvi_se + ndvi_sw +  reanalysis_specific_humidity_g_per_kg + station_avg_temp_c + reanalysis_dew_point_temp_k + station_min_temp_c + station_precip_mm , data = sj_train)

sj.forecast3<- predict(object=sj.rF, h=260)
plot(sj.forecast3)

Iquitos

iq.rF<-randomForest(total_cases ~ ndvi_ne + ndvi_nw + ndvi_se + ndvi_sw +  reanalysis_specific_humidity_g_per_kg + station_avg_temp_c + reanalysis_dew_point_temp_k + station_min_temp_c + station_precip_mm , data = iq_train)

iq.forecast3<- predict(object=iq.rF, h=156)
plot(iq.forecast3)

Model Selection

Arima Accuracy and Residuals

acc.sj.arima <- accuracy(sj.forecast1, sj.test.ts)

acc.iq.arima <- accuracy(iq.forecast1,iq.test.ts)

tsdisplay(residuals(sj.forecast1))

acc.sj.arima

##                        ME     RMSE       MAE  MPE MAPE      MASE
## Training set  0.006577048 13.83894  8.379156  NaN  Inf 0.2102492
## Test set     14.740977080 34.48529 18.234742 -Inf  Inf 0.4575449
##                     ACF1 Theil's U
## Training set 0.007353076        NA
## Test set     0.932173942         0

tsdisplay(residuals(iq.forecast1))

acc.iq.arima

##                       ME     RMSE      MAE  MPE MAPE      MASE        ACF1
## Training set 0.008331804  7.16080 3.773915  NaN  Inf 0.4123143 0.006181246
## Test set     7.882537945 13.92229 8.265254 -Inf  Inf 0.9030098 0.842766026
##              Theil's U
## Training set        NA
## Test set             0

Neural Network Accuracy and Residuals

acc.sj.nnetar <- accuracy(sj.forecast2, sj.test.ts)

acc.iq.nnetar <- accuracy(iq.forecast2,iq.test.ts)

tsdisplay(residuals(sj.forecast2))

acc.sj.nnetar

##                       ME      RMSE       MAE  MPE MAPE      MASE
## Training set -0.01880461  6.504068  4.777888 -Inf  Inf 0.1198864
## Test set      6.05422937 32.475855 19.514153 -Inf  Inf 0.4896478
##                     ACF1 Theil's U
## Training set -0.01533117        NA
## Test set      0.93554975         0

tsdisplay(residuals(iq.forecast2))

acc.iq.nnetar

##                       ME      RMSE      MAE  MPE MAPE      MASE
## Training set -0.01342703  5.887483 3.558980 -Inf  Inf 0.3888319
## Test set      5.08383902 12.626048 7.267046 -Inf  Inf 0.7939519
##                     ACF1 Theil's U
## Training set 0.002369741        NA
## Test set     0.846631790         0

RandomForest Accuracy and Residuals

acc.sj.rF <- accuracy(sj.forecast3, sj.test.ts)

acc.iq.rF <- accuracy(iq.forecast3,iq.test.ts)


acc.sj.rF

##                 ME     RMSE      MAE  MPE MAPE      ACF1 Theil's U
## Test set -7.436914 36.15461 28.11977 -Inf  Inf 0.8541588         0

acc.iq.rF

##                ME     RMSE      MAE  MPE MAPE      ACF1 Theil's U
## Test set 3.222535 12.99857 7.777463 -Inf  Inf 0.8037225         0

Neural Network is the best forecasting model according to test metrics

Submit to Competition

NN_sj_final=data.frame(densubformat[1:260,-4],total_cases=round(sj.forecast2$mean))

NN_iq_final=data.frame(densubformat[261:416,-4],total_cases=round(iq.forecast2$mean))

Dengue_Arima_solution=bind_rows(NN_sj_final,NN_iq_final)

## Warning in bind_rows_(x, .id): Vectorizing 'ts' elements may not preserve
## their attributes

## Warning in bind_rows_(x, .id): Vectorizing 'ts' elements may not preserve
## their attributes

write.csv(Dengue_Arima_solution,file="Dengue_Arima_predictions.csv",row.names=F)

I recieved a final score of 28.64 from my Neural Networking model

Final Project

Brendan O’Brien

12/11/2018

Import Data

Load Libraries

Descriptive Statistics

Data Preperation

Data Exploration

Adding the total number of cases in each city to the data

Create Time Series

Create Training and Test Set for Time Series

Build Models

Auto.ARIMA Models

San Juan

Iquitos

Neural Network Models

San Juan

Random Forest

San Juan

Iquitos

Model Selection

Arima Accuracy and Residuals

Neural Network Accuracy and Residuals

RandomForest Accuracy and Residuals

Submit to Competition