library(forecast)
## Warning: package 'forecast' was built under R version 3.4.4
library(fpp)
## Loading required package: fma
## Loading required package: expsmooth
## Loading required package: lmtest
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.4.3
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: tseries
## Warning: package 'tseries' was built under R version 3.4.3
library(caret)
## Warning: package 'caret' was built under R version 3.4.3
## Loading required package: lattice
## Loading required package: ggplot2
library(neuralnet)
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(psych)
## 
## Attaching package: 'psych'
## The following object is masked from 'package:randomForest':
## 
##     outlier
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
## Warning: package 'data.table' was built under R version 3.4.2
## VIM is ready to use. 
##  Since version 4.0.0 the GUI is in its own package VIMGUI.
## 
##           Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
library(mice)
## Warning: package 'mice' was built under R version 3.4.2
library(ResourceSelection)
## ResourceSelection 0.3-2   2017-02-28
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.4.2
## corrplot 0.84 loaded

Load Data

library(readr)
mydata <- read_csv("~/Dropbox/Boston College/Predictive Analytics/combined_train.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   city = col_character(),
##   year = col_integer(),
##   weekofyear = col_integer(),
##   week_start_date = col_character(),
##   total_cases = col_integer()
## )
## See spec(...) for full column specifications.
labels <- read_csv("~/Dropbox/Boston College/Predictive Analytics/Final/dengue_labels_train.csv")
## Parsed with column specification:
## cols(
##   city = col_character(),
##   year = col_integer(),
##   weekofyear = col_integer(),
##   total_cases = col_integer()
## )
test <- read_csv("~/Dropbox/Boston College/Predictive Analytics/Final/dengue_features_test.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   city = col_character(),
##   year = col_integer(),
##   weekofyear = col_integer(),
##   week_start_date = col_date(format = "")
## )
## See spec(...) for full column specifications.
submission <- read_csv("~/Dropbox/Boston College/Predictive Analytics/Final/submission_format.csv")
## Parsed with column specification:
## cols(
##   city = col_character(),
##   year = col_integer(),
##   weekofyear = col_integer(),
##   total_cases = col_integer()
## )

Starter Code:

aggr(mydata, prop=T,numbers=T)
## Warning in plot.aggr(res, ...): not enough horizontal space to display
## frequencies

mydata$total_cases<-as.numeric(mydata$total_cases)
head(mydata)
## # A tibble: 6 x 25
##   city   year weekofyear week_start_date ndvi_ne ndvi_nw ndvi_se ndvi_sw
##   <chr> <int>      <int> <chr>             <dbl>   <dbl>   <dbl>   <dbl>
## 1 sj     1990         18 4/30/90          0.123    0.104   0.198   0.178
## 2 sj     1990         19 5/7/90           0.170    0.142   0.162   0.155
## 3 sj     1990         20 5/14/90          0.0322   0.173   0.157   0.171
## 4 sj     1990         21 5/21/90          0.129    0.245   0.228   0.236
## 5 sj     1990         22 5/28/90          0.196    0.262   0.251   0.247
## 6 sj     1990         23 6/4/90          NA        0.175   0.254   0.182
## # ... with 17 more variables: precipitation_amt_mm <dbl>,
## #   reanalysis_air_temp_k <dbl>, reanalysis_avg_temp_k <dbl>,
## #   reanalysis_dew_point_temp_k <dbl>, reanalysis_max_air_temp_k <dbl>,
## #   reanalysis_min_air_temp_k <dbl>,
## #   reanalysis_precip_amt_kg_per_m2 <dbl>,
## #   reanalysis_relative_humidity_percent <dbl>,
## #   reanalysis_sat_precip_amt_mm <dbl>,
## #   reanalysis_specific_humidity_g_per_kg <dbl>, reanalysis_tdtr_k <dbl>,
## #   station_avg_temp_c <dbl>, station_diur_temp_rng_c <dbl>,
## #   station_max_temp_c <dbl>, station_min_temp_c <dbl>,
## #   station_precip_mm <dbl>, total_cases <dbl>
str(mydata)
## Classes 'tbl_df', 'tbl' and 'data.frame':    1456 obs. of  25 variables:
##  $ city                                 : chr  "sj" "sj" "sj" "sj" ...
##  $ year                                 : int  1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
##  $ weekofyear                           : int  18 19 20 21 22 23 24 25 26 27 ...
##  $ week_start_date                      : chr  "4/30/90" "5/7/90" "5/14/90" "5/21/90" ...
##  $ ndvi_ne                              : num  0.1226 0.1699 0.0323 0.1286 0.1962 ...
##  $ ndvi_nw                              : num  0.104 0.142 0.173 0.245 0.262 ...
##  $ ndvi_se                              : num  0.198 0.162 0.157 0.228 0.251 ...
##  $ ndvi_sw                              : num  0.178 0.155 0.171 0.236 0.247 ...
##  $ precipitation_amt_mm                 : num  12.42 22.82 34.54 15.36 7.52 ...
##  $ reanalysis_air_temp_k                : num  298 298 299 299 300 ...
##  $ reanalysis_avg_temp_k                : num  298 298 299 299 300 ...
##  $ reanalysis_dew_point_temp_k          : num  292 294 295 295 296 ...
##  $ reanalysis_max_air_temp_k            : num  300 301 300 301 302 ...
##  $ reanalysis_min_air_temp_k            : num  296 296 297 297 298 ...
##  $ reanalysis_precip_amt_kg_per_m2      : num  32 17.9 26.1 13.9 12.2 ...
##  $ reanalysis_relative_humidity_percent : num  73.4 77.4 82.1 80.3 80.5 ...
##  $ reanalysis_sat_precip_amt_mm         : num  12.42 22.82 34.54 15.36 7.52 ...
##  $ reanalysis_specific_humidity_g_per_kg: num  14 15.4 16.8 16.7 17.2 ...
##  $ reanalysis_tdtr_k                    : num  2.63 2.37 2.3 2.43 3.01 ...
##  $ station_avg_temp_c                   : num  25.4 26.7 26.7 27.5 28.9 ...
##  $ station_diur_temp_rng_c              : num  6.9 6.37 6.49 6.77 9.37 ...
##  $ station_max_temp_c                   : num  29.4 31.7 32.2 33.3 35 34.4 32.2 33.9 33.9 33.9 ...
##  $ station_min_temp_c                   : num  20 22.2 22.8 23.3 23.9 23.9 23.3 22.8 22.8 24.4 ...
##  $ station_precip_mm                    : num  16 8.6 41.4 4 5.8 39.1 29.7 21.1 21.1 1.1 ...
##  $ total_cases                          : num  4 5 4 3 6 2 4 5 10 6 ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 25
##   .. ..$ city                                 : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ year                                 : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ weekofyear                           : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ week_start_date                      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ ndvi_ne                              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ ndvi_nw                              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ ndvi_se                              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ ndvi_sw                              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ precipitation_amt_mm                 : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ reanalysis_air_temp_k                : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ reanalysis_avg_temp_k                : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ reanalysis_dew_point_temp_k          : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ reanalysis_max_air_temp_k            : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ reanalysis_min_air_temp_k            : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ reanalysis_precip_amt_kg_per_m2      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ reanalysis_relative_humidity_percent : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ reanalysis_sat_precip_amt_mm         : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ reanalysis_specific_humidity_g_per_kg: list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ reanalysis_tdtr_k                    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ station_avg_temp_c                   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ station_diur_temp_rng_c              : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ station_max_temp_c                   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ station_min_temp_c                   : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ station_precip_mm                    : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ total_cases                          : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"
#combine train and test sets
alldata<-rbind(mydata[,-25],test)

#replace NA values with median

anyNA(alldata$ndvi_ne)
## [1] TRUE
anyNA(alldata$ndvi_nw)
## [1] TRUE
anyNA(mydata$total_cases) #False
## [1] FALSE
alldata$ndvi_ne[is.na(alldata$ndvi_ne)]<-median(alldata$ndvi_ne, na.rm=TRUE)
alldata$ndvi_nw[is.na(alldata$ndvi_nw)]<-median(alldata$ndvi_nw, na.rm=TRUE)
alldata$ndvi_se[is.na(alldata$ndvi_se)]<-median(alldata$ndvi_se, na.rm=TRUE)
alldata$ndvi_sw[is.na(alldata$ndvi_sw)]<-median(alldata$ndvi_sw, na.rm=TRUE)
alldata$precipitation_amt_mm[is.na(alldata$precipitation_amt_mm)]<-median(alldata$precipitation_amt_mm, na.rm=TRUE)

alldata$reanalysis_air_temp_k[is.na(alldata$reanalysis_air_temp_k)]<-median(alldata$reanalysis_air_temp_k, na.rm=TRUE)
alldata$reanalysis_avg_temp_k[is.na(alldata$reanalysis_avg_temp_k)]<-median(alldata$reanalysis_avg_temp_k, na.rm=TRUE)
alldata$reanalysis_dew_point_temp_k[is.na(alldata$reanalysis_dew_point_temp_k)]<-median(alldata$reanalysis_dew_point_temp_k, na.rm=TRUE)
alldata$reanalysis_max_air_temp_k[is.na(alldata$reanalysis_max_air_temp_k)]<-median(alldata$reanalysis_max_air_temp_k, na.rm=TRUE)
alldata$reanalysis_min_air_temp_k[is.na(alldata$reanalysis_min_air_temp_k)]<-median(alldata$reanalysis_min_air_temp_k, na.rm=TRUE)
alldata$reanalysis_precip_amt_kg_per_m2[is.na(alldata$reanalysis_precip_amt_kg_per_m2)]<-median(alldata$reanalysis_precip_amt_kg_per_m2, na.rm=TRUE)
alldata$reanalysis_relative_humidity_percent[is.na(alldata$reanalysis_relative_humidity_percent)]<-median(alldata$reanalysis_relative_humidity_percent, na.rm=TRUE)
alldata$reanalysis_sat_precip_amt_mm[is.na(alldata$reanalysis_sat_precip_amt_mm)]<-median(alldata$reanalysis_sat_precip_amt_mm, na.rm=TRUE)
alldata$reanalysis_specific_humidity_g_per_kg[is.na(alldata$reanalysis_specific_humidity_g_per_kg)]<-median(alldata$reanalysis_specific_humidity_g_per_kg, na.rm=TRUE)
alldata$reanalysis_tdtr_k[is.na(alldata$reanalysis_tdtr_k)]<-median(alldata$reanalysis_tdtr_k, na.rm=TRUE)

alldata$station_avg_temp_c[is.na(alldata$station_avg_temp_c)]<-median(alldata$station_avg_temp_c, na.rm=TRUE)
alldata$station_diur_temp_rng_c[is.na(alldata$station_diur_temp_rng_c)]<-median(alldata$station_diur_temp_rng_c, na.rm=TRUE)
alldata$station_max_temp_c[is.na(alldata$station_max_temp_c)]<-median(alldata$station_max_temp_c, na.rm=TRUE)
alldata$station_min_temp_c[is.na(alldata$station_min_temp_c)]<-median(alldata$station_min_temp_c, na.rm=TRUE)
alldata$station_precip_mm[is.na(alldata$station_precip_mm)]<-median(alldata$station_precip_mm, na.rm=TRUE)

anyNA(alldata) #False
## [1] FALSE
#putting back in the "total_cases" variable
alldata$total_cases<-0
alldata$total_cases[1:1456]<-mydata$total_cases

#separate into sj and iq train and test

sj<-alldata[1:936,]
iq<-alldata[937:1456,]

sj.test<-alldata[1457:1716, -25]
iq.test<-alldata[1717:1872, -25]

head(sj)
## # A tibble: 6 x 25
##   city   year weekofyear week_start_date ndvi_ne ndvi_nw ndvi_se ndvi_sw
##   <chr> <int>      <int> <chr>             <dbl>   <dbl>   <dbl>   <dbl>
## 1 sj     1990         18 4/30/90          0.123    0.104   0.198   0.178
## 2 sj     1990         19 5/7/90           0.170    0.142   0.162   0.155
## 3 sj     1990         20 5/14/90          0.0322   0.173   0.157   0.171
## 4 sj     1990         21 5/21/90          0.129    0.245   0.228   0.236
## 5 sj     1990         22 5/28/90          0.196    0.262   0.251   0.247
## 6 sj     1990         23 6/4/90           0.127    0.175   0.254   0.182
## # ... with 17 more variables: precipitation_amt_mm <dbl>,
## #   reanalysis_air_temp_k <dbl>, reanalysis_avg_temp_k <dbl>,
## #   reanalysis_dew_point_temp_k <dbl>, reanalysis_max_air_temp_k <dbl>,
## #   reanalysis_min_air_temp_k <dbl>,
## #   reanalysis_precip_amt_kg_per_m2 <dbl>,
## #   reanalysis_relative_humidity_percent <dbl>,
## #   reanalysis_sat_precip_amt_mm <dbl>,
## #   reanalysis_specific_humidity_g_per_kg <dbl>, reanalysis_tdtr_k <dbl>,
## #   station_avg_temp_c <dbl>, station_diur_temp_rng_c <dbl>,
## #   station_max_temp_c <dbl>, station_min_temp_c <dbl>,
## #   station_precip_mm <dbl>, total_cases <dbl>
mycor=cor(sj[,5:25])
corrplot(mycor, method="circle", type="lower")

sjtot=ts(sj$total_cases, frequency=52, start=c(1990,04,30))
plot(sjtot)

acf(sjtot)

pacf(sjtot)

plot(decompose(sjtot))

mycor=cor(iq[,5:25])
corrplot(mycor, method="circle", type="lower")

iqtot=ts(iq$total_cases, frequency=52, start=c(2000,07,01))
plot(iqtot)

acf(iqtot)

pacf(iqtot)

plot(decompose(iqtot))

ARIMA Model

#(1,1,1)(0,0,0)
myarima1<-auto.arima(sjtot) 
myarima1
## Series: sjtot 
## ARIMA(1,1,1) 
## 
## Coefficients:
##          ar1      ma1
##       0.7116  -0.5929
## s.e.  0.0948   0.1078
## 
## sigma^2 estimated as 180.9:  log likelihood=-3755.85
## AIC=7517.71   AICc=7517.73   BIC=7532.23
forecast.arima1<-forecast(myarima1, h=260)
plot(forecast.arima1)

plot(residuals(myarima1))

#(0,1,2)(0,0,1)
myarima2<-auto.arima(iqtot)
myarima2
## Series: iqtot 
## ARIMA(0,1,2)(0,0,1)[52] 
## 
## Coefficients:
##           ma1      ma2    sma1
##       -0.2508  -0.2322  0.0568
## s.e.   0.0430   0.0449  0.0375
## 
## sigma^2 estimated as 52.12:  log likelihood=-1761.09
## AIC=3530.18   AICc=3530.26   BIC=3547.19
forecast.arima2<-forecast(myarima2, h=156)
plot(forecast.arima2)

plot(residuals(myarima2))

submission$total_cases[1:260]<-round(forecast.arima1$mean)
results.arima1<-as.data.frame(submission[1:260,])

submission$total_cases[261:416]<-round(forecast.arima2$mean)
results.arima2<-as.data.frame(submission[261:416,])

arima.solve<-rbind(results.arima1, results.arima2)

write.csv(arima.solve, file="~/Dropbox/Boston College/Predictive Analytics/Final/Submissions/Arima submission.csv", row.names = FALSE)

NNETAR Model

mynnetar1<-nnetar(sjtot, repeats=25, size=12, decay=0.1, linout=TRUE)
mynnetar1
## Series: sjtot 
## Model:  NNAR(14,1,12)[52] 
## Call:   nnetar(y = sjtot, size = 12, repeats = 25, decay = 0.1, linout = TRUE)
## 
## Average of 25 networks, each of which is
## a 15-12-1 network with 205 weights
## options were - linear output units  decay=0.1
## 
## sigma^2 estimated as 63.08
forecast.nnetar1<-forecast(mynnetar1, h=260)
plot(forecast.nnetar1)

plot(residuals(mynnetar1))

mynnetar2<-nnetar(iqtot, repeats=25, size=18, decay=0.1, lineout=TRUE)
mynnetar2
## Series: iqtot 
## Model:  NNAR(5,1,18)[52] 
## Call:   nnetar(y = iqtot, size = 18, repeats = 25, decay = 0.1, lineout = TRUE)
## 
## Average of 25 networks, each of which is
## a 6-18-1 network with 145 weights
## options were - linear output units  decay=0.1
## 
## sigma^2 estimated as 13.42
forecast.nnetar2<-forecast(mynnetar2, h=156)
plot(forecast.nnetar2)

plot(residuals(mynnetar2))

submission$total_cases[1:260]<-round(forecast.nnetar1$mean)
results.nnetar1<-as.data.frame(submission[1:260,])

submission$total_cases[261:416]<-round(forecast.nnetar2$mean)
results.nnetar2<-as.data.frame(submission[261:416,])

nnetar.solve<-rbind(results.nnetar1, results.nnetar2)

write.csv(nnetar.solve, file="~/Dropbox/Boston College/Predictive Analytics/Final/Submissions/Submission Neural.csv", row.names = FALSE)

ETS Model

myets1<-ets(sjtot, model="ZZZ")
## Warning in ets(sjtot, model = "ZZZ"): I can't handle data with frequency
## greater than 24. Seasonality will be ignored. Try stlf() if you need
## seasonal forecasts.
myets1
## ETS(A,Ad,N) 
## 
## Call:
##  ets(y = sjtot, model = "ZZZ") 
## 
##   Smoothing parameters:
##     alpha = 0.9999 
##     beta  = 0.1355 
##     phi   = 0.8 
## 
##   Initial states:
##     l = 3.0767 
##     b = 0.2624 
## 
##   sigma:  13.4753
## 
##      AIC     AICc      BIC 
## 11279.55 11279.64 11308.60
forecast.ets1<-forecast(myets1, h=260)
plot(forecast.ets1)

plot(residuals(myets1))

myets2<-ets(iqtot, model="ZZZ")
## Warning in ets(iqtot, model = "ZZZ"): I can't handle data with frequency
## greater than 24. Seasonality will be ignored. Try stlf() if you need
## seasonal forecasts.
myets2
## ETS(A,N,N) 
## 
## Call:
##  ets(y = iqtot, model = "ZZZ") 
## 
##   Smoothing parameters:
##     alpha = 0.6543 
## 
##   Initial states:
##     l = 0.0744 
## 
##   sigma:  7.4063
## 
##      AIC     AICc      BIC 
## 5338.417 5338.463 5351.178
forecast.ets2<-forecast(myets2, h=156)
plot(forecast.ets2)

plot(residuals(myets2))

submission$total_cases[1:260]<-round(forecast.ets1$mean)
results.ets1<-as.data.frame(submission[1:260,])

submission$total_cases[261:416]<-round(forecast.ets2$mean)
results.ets2<-as.data.frame(submission[261:416,])

nnetar.solve<-rbind(results.ets1, results.ets2)

write.csv(nnetar.solve, file="~/Dropbox/Boston College/Predictive Analytics/Final/Submissions/ETS submit.csv", row.names = FALSE)