library(forecast)
## Warning: package 'forecast' was built under R version 3.4.4
library(fpp)
## Loading required package: fma
## Loading required package: expsmooth
## Loading required package: lmtest
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.4.3
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: tseries
## Warning: package 'tseries' was built under R version 3.4.3
library(caret)
## Warning: package 'caret' was built under R version 3.4.3
## Loading required package: lattice
## Loading required package: ggplot2
library(neuralnet)
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(psych)
##
## Attaching package: 'psych'
## The following object is masked from 'package:randomForest':
##
## outlier
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
## Warning: package 'data.table' was built under R version 3.4.2
## VIM is ready to use.
## Since version 4.0.0 the GUI is in its own package VIMGUI.
##
## Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(mice)
## Warning: package 'mice' was built under R version 3.4.2
library(ResourceSelection)
## ResourceSelection 0.3-2 2017-02-28
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.4.2
## corrplot 0.84 loaded
Load Data
library(readr)
mydata <- read_csv("~/Dropbox/Boston College/Predictive Analytics/combined_train.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## city = col_character(),
## year = col_integer(),
## weekofyear = col_integer(),
## week_start_date = col_character(),
## total_cases = col_integer()
## )
## See spec(...) for full column specifications.
labels <- read_csv("~/Dropbox/Boston College/Predictive Analytics/Final/dengue_labels_train.csv")
## Parsed with column specification:
## cols(
## city = col_character(),
## year = col_integer(),
## weekofyear = col_integer(),
## total_cases = col_integer()
## )
test <- read_csv("~/Dropbox/Boston College/Predictive Analytics/Final/dengue_features_test.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## city = col_character(),
## year = col_integer(),
## weekofyear = col_integer(),
## week_start_date = col_date(format = "")
## )
## See spec(...) for full column specifications.
submission <- read_csv("~/Dropbox/Boston College/Predictive Analytics/Final/submission_format.csv")
## Parsed with column specification:
## cols(
## city = col_character(),
## year = col_integer(),
## weekofyear = col_integer(),
## total_cases = col_integer()
## )
Starter Code:
aggr(mydata, prop=T,numbers=T)
## Warning in plot.aggr(res, ...): not enough horizontal space to display
## frequencies
mydata$total_cases<-as.numeric(mydata$total_cases)
head(mydata)
## # A tibble: 6 x 25
## city year weekofyear week_start_date ndvi_ne ndvi_nw ndvi_se ndvi_sw
## <chr> <int> <int> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 sj 1990 18 4/30/90 0.123 0.104 0.198 0.178
## 2 sj 1990 19 5/7/90 0.170 0.142 0.162 0.155
## 3 sj 1990 20 5/14/90 0.0322 0.173 0.157 0.171
## 4 sj 1990 21 5/21/90 0.129 0.245 0.228 0.236
## 5 sj 1990 22 5/28/90 0.196 0.262 0.251 0.247
## 6 sj 1990 23 6/4/90 NA 0.175 0.254 0.182
## # ... with 17 more variables: precipitation_amt_mm <dbl>,
## # reanalysis_air_temp_k <dbl>, reanalysis_avg_temp_k <dbl>,
## # reanalysis_dew_point_temp_k <dbl>, reanalysis_max_air_temp_k <dbl>,
## # reanalysis_min_air_temp_k <dbl>,
## # reanalysis_precip_amt_kg_per_m2 <dbl>,
## # reanalysis_relative_humidity_percent <dbl>,
## # reanalysis_sat_precip_amt_mm <dbl>,
## # reanalysis_specific_humidity_g_per_kg <dbl>, reanalysis_tdtr_k <dbl>,
## # station_avg_temp_c <dbl>, station_diur_temp_rng_c <dbl>,
## # station_max_temp_c <dbl>, station_min_temp_c <dbl>,
## # station_precip_mm <dbl>, total_cases <dbl>
str(mydata)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1456 obs. of 25 variables:
## $ city : chr "sj" "sj" "sj" "sj" ...
## $ year : int 1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
## $ weekofyear : int 18 19 20 21 22 23 24 25 26 27 ...
## $ week_start_date : chr "4/30/90" "5/7/90" "5/14/90" "5/21/90" ...
## $ ndvi_ne : num 0.1226 0.1699 0.0323 0.1286 0.1962 ...
## $ ndvi_nw : num 0.104 0.142 0.173 0.245 0.262 ...
## $ ndvi_se : num 0.198 0.162 0.157 0.228 0.251 ...
## $ ndvi_sw : num 0.178 0.155 0.171 0.236 0.247 ...
## $ precipitation_amt_mm : num 12.42 22.82 34.54 15.36 7.52 ...
## $ reanalysis_air_temp_k : num 298 298 299 299 300 ...
## $ reanalysis_avg_temp_k : num 298 298 299 299 300 ...
## $ reanalysis_dew_point_temp_k : num 292 294 295 295 296 ...
## $ reanalysis_max_air_temp_k : num 300 301 300 301 302 ...
## $ reanalysis_min_air_temp_k : num 296 296 297 297 298 ...
## $ reanalysis_precip_amt_kg_per_m2 : num 32 17.9 26.1 13.9 12.2 ...
## $ reanalysis_relative_humidity_percent : num 73.4 77.4 82.1 80.3 80.5 ...
## $ reanalysis_sat_precip_amt_mm : num 12.42 22.82 34.54 15.36 7.52 ...
## $ reanalysis_specific_humidity_g_per_kg: num 14 15.4 16.8 16.7 17.2 ...
## $ reanalysis_tdtr_k : num 2.63 2.37 2.3 2.43 3.01 ...
## $ station_avg_temp_c : num 25.4 26.7 26.7 27.5 28.9 ...
## $ station_diur_temp_rng_c : num 6.9 6.37 6.49 6.77 9.37 ...
## $ station_max_temp_c : num 29.4 31.7 32.2 33.3 35 34.4 32.2 33.9 33.9 33.9 ...
## $ station_min_temp_c : num 20 22.2 22.8 23.3 23.9 23.9 23.3 22.8 22.8 24.4 ...
## $ station_precip_mm : num 16 8.6 41.4 4 5.8 39.1 29.7 21.1 21.1 1.1 ...
## $ total_cases : num 4 5 4 3 6 2 4 5 10 6 ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 25
## .. ..$ city : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ year : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ weekofyear : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ week_start_date : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ ndvi_ne : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ ndvi_nw : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ ndvi_se : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ ndvi_sw : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ precipitation_amt_mm : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ reanalysis_air_temp_k : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ reanalysis_avg_temp_k : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ reanalysis_dew_point_temp_k : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ reanalysis_max_air_temp_k : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ reanalysis_min_air_temp_k : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ reanalysis_precip_amt_kg_per_m2 : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ reanalysis_relative_humidity_percent : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ reanalysis_sat_precip_amt_mm : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ reanalysis_specific_humidity_g_per_kg: list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ reanalysis_tdtr_k : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ station_avg_temp_c : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ station_diur_temp_rng_c : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ station_max_temp_c : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ station_min_temp_c : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ station_precip_mm : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ total_cases : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
#combine train and test sets
alldata<-rbind(mydata[,-25],test)
#replace NA values with median
anyNA(alldata$ndvi_ne)
## [1] TRUE
anyNA(alldata$ndvi_nw)
## [1] TRUE
anyNA(mydata$total_cases) #False
## [1] FALSE
alldata$ndvi_ne[is.na(alldata$ndvi_ne)]<-median(alldata$ndvi_ne, na.rm=TRUE)
alldata$ndvi_nw[is.na(alldata$ndvi_nw)]<-median(alldata$ndvi_nw, na.rm=TRUE)
alldata$ndvi_se[is.na(alldata$ndvi_se)]<-median(alldata$ndvi_se, na.rm=TRUE)
alldata$ndvi_sw[is.na(alldata$ndvi_sw)]<-median(alldata$ndvi_sw, na.rm=TRUE)
alldata$precipitation_amt_mm[is.na(alldata$precipitation_amt_mm)]<-median(alldata$precipitation_amt_mm, na.rm=TRUE)
alldata$reanalysis_air_temp_k[is.na(alldata$reanalysis_air_temp_k)]<-median(alldata$reanalysis_air_temp_k, na.rm=TRUE)
alldata$reanalysis_avg_temp_k[is.na(alldata$reanalysis_avg_temp_k)]<-median(alldata$reanalysis_avg_temp_k, na.rm=TRUE)
alldata$reanalysis_dew_point_temp_k[is.na(alldata$reanalysis_dew_point_temp_k)]<-median(alldata$reanalysis_dew_point_temp_k, na.rm=TRUE)
alldata$reanalysis_max_air_temp_k[is.na(alldata$reanalysis_max_air_temp_k)]<-median(alldata$reanalysis_max_air_temp_k, na.rm=TRUE)
alldata$reanalysis_min_air_temp_k[is.na(alldata$reanalysis_min_air_temp_k)]<-median(alldata$reanalysis_min_air_temp_k, na.rm=TRUE)
alldata$reanalysis_precip_amt_kg_per_m2[is.na(alldata$reanalysis_precip_amt_kg_per_m2)]<-median(alldata$reanalysis_precip_amt_kg_per_m2, na.rm=TRUE)
alldata$reanalysis_relative_humidity_percent[is.na(alldata$reanalysis_relative_humidity_percent)]<-median(alldata$reanalysis_relative_humidity_percent, na.rm=TRUE)
alldata$reanalysis_sat_precip_amt_mm[is.na(alldata$reanalysis_sat_precip_amt_mm)]<-median(alldata$reanalysis_sat_precip_amt_mm, na.rm=TRUE)
alldata$reanalysis_specific_humidity_g_per_kg[is.na(alldata$reanalysis_specific_humidity_g_per_kg)]<-median(alldata$reanalysis_specific_humidity_g_per_kg, na.rm=TRUE)
alldata$reanalysis_tdtr_k[is.na(alldata$reanalysis_tdtr_k)]<-median(alldata$reanalysis_tdtr_k, na.rm=TRUE)
alldata$station_avg_temp_c[is.na(alldata$station_avg_temp_c)]<-median(alldata$station_avg_temp_c, na.rm=TRUE)
alldata$station_diur_temp_rng_c[is.na(alldata$station_diur_temp_rng_c)]<-median(alldata$station_diur_temp_rng_c, na.rm=TRUE)
alldata$station_max_temp_c[is.na(alldata$station_max_temp_c)]<-median(alldata$station_max_temp_c, na.rm=TRUE)
alldata$station_min_temp_c[is.na(alldata$station_min_temp_c)]<-median(alldata$station_min_temp_c, na.rm=TRUE)
alldata$station_precip_mm[is.na(alldata$station_precip_mm)]<-median(alldata$station_precip_mm, na.rm=TRUE)
anyNA(alldata) #False
## [1] FALSE
#putting back in the "total_cases" variable
alldata$total_cases<-0
alldata$total_cases[1:1456]<-mydata$total_cases
#separate into sj and iq train and test
sj<-alldata[1:936,]
iq<-alldata[937:1456,]
sj.test<-alldata[1457:1716, -25]
iq.test<-alldata[1717:1872, -25]
head(sj)
## # A tibble: 6 x 25
## city year weekofyear week_start_date ndvi_ne ndvi_nw ndvi_se ndvi_sw
## <chr> <int> <int> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 sj 1990 18 4/30/90 0.123 0.104 0.198 0.178
## 2 sj 1990 19 5/7/90 0.170 0.142 0.162 0.155
## 3 sj 1990 20 5/14/90 0.0322 0.173 0.157 0.171
## 4 sj 1990 21 5/21/90 0.129 0.245 0.228 0.236
## 5 sj 1990 22 5/28/90 0.196 0.262 0.251 0.247
## 6 sj 1990 23 6/4/90 0.127 0.175 0.254 0.182
## # ... with 17 more variables: precipitation_amt_mm <dbl>,
## # reanalysis_air_temp_k <dbl>, reanalysis_avg_temp_k <dbl>,
## # reanalysis_dew_point_temp_k <dbl>, reanalysis_max_air_temp_k <dbl>,
## # reanalysis_min_air_temp_k <dbl>,
## # reanalysis_precip_amt_kg_per_m2 <dbl>,
## # reanalysis_relative_humidity_percent <dbl>,
## # reanalysis_sat_precip_amt_mm <dbl>,
## # reanalysis_specific_humidity_g_per_kg <dbl>, reanalysis_tdtr_k <dbl>,
## # station_avg_temp_c <dbl>, station_diur_temp_rng_c <dbl>,
## # station_max_temp_c <dbl>, station_min_temp_c <dbl>,
## # station_precip_mm <dbl>, total_cases <dbl>
mycor=cor(sj[,5:25])
corrplot(mycor, method="circle", type="lower")
sjtot=ts(sj$total_cases, frequency=52, start=c(1990,04,30))
plot(sjtot)
acf(sjtot)
pacf(sjtot)
plot(decompose(sjtot))
mycor=cor(iq[,5:25])
corrplot(mycor, method="circle", type="lower")
iqtot=ts(iq$total_cases, frequency=52, start=c(2000,07,01))
plot(iqtot)
acf(iqtot)
pacf(iqtot)
plot(decompose(iqtot))
ARIMA Model
#(1,1,1)(0,0,0)
myarima1<-auto.arima(sjtot)
myarima1
## Series: sjtot
## ARIMA(1,1,1)
##
## Coefficients:
## ar1 ma1
## 0.7116 -0.5929
## s.e. 0.0948 0.1078
##
## sigma^2 estimated as 180.9: log likelihood=-3755.85
## AIC=7517.71 AICc=7517.73 BIC=7532.23
forecast.arima1<-forecast(myarima1, h=260)
plot(forecast.arima1)
plot(residuals(myarima1))
#(0,1,2)(0,0,1)
myarima2<-auto.arima(iqtot)
myarima2
## Series: iqtot
## ARIMA(0,1,2)(0,0,1)[52]
##
## Coefficients:
## ma1 ma2 sma1
## -0.2508 -0.2322 0.0568
## s.e. 0.0430 0.0449 0.0375
##
## sigma^2 estimated as 52.12: log likelihood=-1761.09
## AIC=3530.18 AICc=3530.26 BIC=3547.19
forecast.arima2<-forecast(myarima2, h=156)
plot(forecast.arima2)
plot(residuals(myarima2))
submission$total_cases[1:260]<-round(forecast.arima1$mean)
results.arima1<-as.data.frame(submission[1:260,])
submission$total_cases[261:416]<-round(forecast.arima2$mean)
results.arima2<-as.data.frame(submission[261:416,])
arima.solve<-rbind(results.arima1, results.arima2)
write.csv(arima.solve, file="~/Dropbox/Boston College/Predictive Analytics/Final/Submissions/Arima submission.csv", row.names = FALSE)
NNETAR Model
mynnetar1<-nnetar(sjtot, repeats=25, size=12, decay=0.1, linout=TRUE)
mynnetar1
## Series: sjtot
## Model: NNAR(14,1,12)[52]
## Call: nnetar(y = sjtot, size = 12, repeats = 25, decay = 0.1, linout = TRUE)
##
## Average of 25 networks, each of which is
## a 15-12-1 network with 205 weights
## options were - linear output units decay=0.1
##
## sigma^2 estimated as 63.08
forecast.nnetar1<-forecast(mynnetar1, h=260)
plot(forecast.nnetar1)
plot(residuals(mynnetar1))
mynnetar2<-nnetar(iqtot, repeats=25, size=18, decay=0.1, lineout=TRUE)
mynnetar2
## Series: iqtot
## Model: NNAR(5,1,18)[52]
## Call: nnetar(y = iqtot, size = 18, repeats = 25, decay = 0.1, lineout = TRUE)
##
## Average of 25 networks, each of which is
## a 6-18-1 network with 145 weights
## options were - linear output units decay=0.1
##
## sigma^2 estimated as 13.42
forecast.nnetar2<-forecast(mynnetar2, h=156)
plot(forecast.nnetar2)
plot(residuals(mynnetar2))
submission$total_cases[1:260]<-round(forecast.nnetar1$mean)
results.nnetar1<-as.data.frame(submission[1:260,])
submission$total_cases[261:416]<-round(forecast.nnetar2$mean)
results.nnetar2<-as.data.frame(submission[261:416,])
nnetar.solve<-rbind(results.nnetar1, results.nnetar2)
write.csv(nnetar.solve, file="~/Dropbox/Boston College/Predictive Analytics/Final/Submissions/Submission Neural.csv", row.names = FALSE)
ETS Model
myets1<-ets(sjtot, model="ZZZ")
## Warning in ets(sjtot, model = "ZZZ"): I can't handle data with frequency
## greater than 24. Seasonality will be ignored. Try stlf() if you need
## seasonal forecasts.
myets1
## ETS(A,Ad,N)
##
## Call:
## ets(y = sjtot, model = "ZZZ")
##
## Smoothing parameters:
## alpha = 0.9999
## beta = 0.1355
## phi = 0.8
##
## Initial states:
## l = 3.0767
## b = 0.2624
##
## sigma: 13.4753
##
## AIC AICc BIC
## 11279.55 11279.64 11308.60
forecast.ets1<-forecast(myets1, h=260)
plot(forecast.ets1)
plot(residuals(myets1))
myets2<-ets(iqtot, model="ZZZ")
## Warning in ets(iqtot, model = "ZZZ"): I can't handle data with frequency
## greater than 24. Seasonality will be ignored. Try stlf() if you need
## seasonal forecasts.
myets2
## ETS(A,N,N)
##
## Call:
## ets(y = iqtot, model = "ZZZ")
##
## Smoothing parameters:
## alpha = 0.6543
##
## Initial states:
## l = 0.0744
##
## sigma: 7.4063
##
## AIC AICc BIC
## 5338.417 5338.463 5351.178
forecast.ets2<-forecast(myets2, h=156)
plot(forecast.ets2)
plot(residuals(myets2))
submission$total_cases[1:260]<-round(forecast.ets1$mean)
results.ets1<-as.data.frame(submission[1:260,])
submission$total_cases[261:416]<-round(forecast.ets2$mean)
results.ets2<-as.data.frame(submission[261:416,])
nnetar.solve<-rbind(results.ets1, results.ets2)
write.csv(nnetar.solve, file="~/Dropbox/Boston College/Predictive Analytics/Final/Submissions/ETS submit.csv", row.names = FALSE)