setwd('/Users/brendanobrien/Downloads')
train_features = read.csv('Training_Data_Features.csv')
train_labels = read.csv('Training_Data_Labels.csv')
densubformat = read.csv('Submission_Format.csv')
require(tidyverse)
## Loading required package: tidyverse
## ── Attaching packages ───────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1 ✔ purrr 0.2.4
## ✔ tibble 1.4.2 ✔ dplyr 0.7.4
## ✔ tidyr 0.8.0 ✔ stringr 1.2.0
## ✔ readr 1.2.1 ✔ forcats 0.3.0
## Warning: package 'readr' was built under R version 3.4.4
## ── Conflicts ──────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(tidyverse)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(forecast)
## Warning: package 'forecast' was built under R version 3.4.4
library(fpp)
## Loading required package: fma
##
## Attaching package: 'fma'
## The following objects are masked from 'package:MASS':
##
## cement, housing, petrol
## Loading required package: expsmooth
## Loading required package: lmtest
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: tseries
## Warning: package 'tseries' was built under R version 3.4.4
library(corrplot)
## corrplot 0.84 loaded
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(zoo)
library(RColorBrewer)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(MASS)
require(Amelia)
## Loading required package: Amelia
## Warning: package 'Amelia' was built under R version 3.4.4
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.5, built: 2018-05-07)
## ## Copyright (C) 2005-2018 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(Amelia)
library('broom')
library('purrr')
library('lubridate')
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library('timeDate')
library('tseries')
library('forecast')
library('prophet')
## Warning: package 'prophet' was built under R version 3.4.4
library('ggplot2')
library('scales')
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library('grid')
library('gridExtra')
library('RColorBrewer')
library('corrplot')
library('ggridges')
## Warning: package 'ggridges' was built under R version 3.4.4
library('dplyr')
library('readr')
library('data.table')
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday,
## week, yday, year
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
library('tibble')
library('tidyr')
library('stringr')
library('forcats')
library('lazyeval')
##
## Attaching package: 'lazyeval'
## The following objects are masked from 'package:purrr':
##
## is_atomic, is_formula
In viewing the data, I found there to to be data on two different cities: San Juan and Iquitos. Therefore, I will seperate the data based on their city.
sj_train_features = train_features %>% filter(city == 'sj')
sj_train_labels = train_labels %>% filter(city == 'sj')
iq_train_features = train_features %>% filter(city == 'iq')
iq_train_labels = train_labels %>% filter(city == 'iq')
cat('\nSan Juan\n',
'\t features: ', sj_train_features %>% ncol,
'\t entries: ' , sj_train_features %>% nrow,
'\t labels: ' , sj_train_labels %>% nrow)
##
## San Juan
## features: 24 entries: 936 labels: 936
cat('\nIquitos\n',
'\t features: ', iq_train_features %>% ncol,
'\t entries: ' , iq_train_features %>% nrow,
'\t labels: ' , iq_train_labels %>% nrow)
##
## Iquitos
## features: 24 entries: 520 labels: 520
summary(sj_train_features)
## city year weekofyear week_start_date
## iq: 0 Min. :1990 Min. : 1.00 1990-04-30: 1
## sj:936 1st Qu.:1994 1st Qu.:13.75 1990-05-07: 1
## Median :1999 Median :26.50 1990-05-14: 1
## Mean :1999 Mean :26.50 1990-05-21: 1
## 3rd Qu.:2003 3rd Qu.:39.25 1990-05-28: 1
## Max. :2008 Max. :53.00 1990-06-04: 1
## (Other) :930
## ndvi_ne ndvi_nw ndvi_se
## Min. :-0.40625 Min. :-0.45610 Min. :-0.01553
## 1st Qu.: 0.00450 1st Qu.: 0.01642 1st Qu.: 0.13928
## Median : 0.05770 Median : 0.06808 Median : 0.17719
## Mean : 0.05792 Mean : 0.06747 Mean : 0.17766
## 3rd Qu.: 0.11110 3rd Qu.: 0.11520 3rd Qu.: 0.21256
## Max. : 0.49340 Max. : 0.43710 Max. : 0.39313
## NA's :191 NA's :49 NA's :19
## ndvi_sw precipitation_amt_mm reanalysis_air_temp_k
## Min. :-0.06346 Min. : 0.00 Min. :295.9
## 1st Qu.: 0.12916 1st Qu.: 0.00 1st Qu.:298.2
## Median : 0.16597 Median : 20.80 Median :299.3
## Mean : 0.16596 Mean : 35.47 Mean :299.2
## 3rd Qu.: 0.20277 3rd Qu.: 52.18 3rd Qu.:300.1
## Max. : 0.38142 Max. :390.60 Max. :302.2
## NA's :19 NA's :9 NA's :6
## reanalysis_avg_temp_k reanalysis_dew_point_temp_k
## Min. :296.1 Min. :289.6
## 1st Qu.:298.3 1st Qu.:293.8
## Median :299.4 Median :295.5
## Mean :299.3 Mean :295.1
## 3rd Qu.:300.2 3rd Qu.:296.4
## Max. :302.2 Max. :297.8
## NA's :6 NA's :6
## reanalysis_max_air_temp_k reanalysis_min_air_temp_k
## Min. :297.8 Min. :292.6
## 1st Qu.:300.4 1st Qu.:296.3
## Median :301.5 Median :297.5
## Mean :301.4 Mean :297.3
## 3rd Qu.:302.4 3rd Qu.:298.4
## Max. :304.3 Max. :299.9
## NA's :6 NA's :6
## reanalysis_precip_amt_kg_per_m2 reanalysis_relative_humidity_percent
## Min. : 0.00 Min. :66.74
## 1st Qu.: 10.82 1st Qu.:76.25
## Median : 21.30 Median :78.67
## Mean : 30.47 Mean :78.57
## 3rd Qu.: 37.00 3rd Qu.:80.96
## Max. :570.50 Max. :87.58
## NA's :6 NA's :6
## reanalysis_sat_precip_amt_mm reanalysis_specific_humidity_g_per_kg
## Min. : 0.00 Min. :11.72
## 1st Qu.: 0.00 1st Qu.:15.24
## Median : 20.80 Median :16.85
## Mean : 35.47 Mean :16.55
## 3rd Qu.: 52.18 3rd Qu.:17.86
## Max. :390.60 Max. :19.44
## NA's :9 NA's :6
## reanalysis_tdtr_k station_avg_temp_c station_diur_temp_rng_c
## Min. :1.357 Min. :22.84 Min. :4.529
## 1st Qu.:2.157 1st Qu.:25.84 1st Qu.:6.200
## Median :2.457 Median :27.23 Median :6.757
## Mean :2.516 Mean :27.01 Mean :6.757
## 3rd Qu.:2.800 3rd Qu.:28.19 3rd Qu.:7.286
## Max. :4.429 Max. :30.07 Max. :9.914
## NA's :6 NA's :6 NA's :6
## station_max_temp_c station_min_temp_c station_precip_mm
## Min. :26.70 Min. :17.8 Min. : 0.000
## 1st Qu.:30.60 1st Qu.:21.7 1st Qu.: 6.825
## Median :31.70 Median :22.8 Median : 17.750
## Mean :31.61 Mean :22.6 Mean : 26.785
## 3rd Qu.:32.80 3rd Qu.:23.9 3rd Qu.: 35.450
## Max. :35.60 Max. :25.6 Max. :305.900
## NA's :6 NA's :6 NA's :6
summary(iq_train_features)
## city year weekofyear week_start_date
## iq:520 Min. :2000 Min. : 1.00 2000-07-01: 1
## sj: 0 1st Qu.:2003 1st Qu.:13.75 2000-07-08: 1
## Median :2005 Median :26.50 2000-07-15: 1
## Mean :2005 Mean :26.50 2000-07-22: 1
## 3rd Qu.:2007 3rd Qu.:39.25 2000-07-29: 1
## Max. :2010 Max. :53.00 2000-08-05: 1
## (Other) :514
## ndvi_ne ndvi_nw ndvi_se ndvi_sw
## Min. :0.06173 Min. :0.03586 Min. :0.02988 Min. :0.06418
## 1st Qu.:0.20000 1st Qu.:0.17954 1st Qu.:0.19474 1st Qu.:0.20413
## Median :0.26364 Median :0.23297 Median :0.24980 Median :0.26214
## Mean :0.26387 Mean :0.23878 Mean :0.25013 Mean :0.26678
## 3rd Qu.:0.31997 3rd Qu.:0.29393 3rd Qu.:0.30230 3rd Qu.:0.32515
## Max. :0.50836 Max. :0.45443 Max. :0.53831 Max. :0.54602
## NA's :3 NA's :3 NA's :3 NA's :3
## precipitation_amt_mm reanalysis_air_temp_k reanalysis_avg_temp_k
## Min. : 0.00 Min. :294.6 Min. :294.9
## 1st Qu.: 39.10 1st Qu.:297.1 1st Qu.:298.2
## Median : 60.47 Median :297.8 Median :299.1
## Mean : 64.25 Mean :297.9 Mean :299.1
## 3rd Qu.: 85.76 3rd Qu.:298.6 3rd Qu.:300.1
## Max. :210.83 Max. :301.6 Max. :302.9
## NA's :4 NA's :4 NA's :4
## reanalysis_dew_point_temp_k reanalysis_max_air_temp_k
## Min. :290.1 Min. :300.0
## 1st Qu.:294.6 1st Qu.:305.2
## Median :295.9 Median :307.1
## Mean :295.5 Mean :307.1
## 3rd Qu.:296.5 3rd Qu.:308.7
## Max. :298.4 Max. :314.0
## NA's :4 NA's :4
## reanalysis_min_air_temp_k reanalysis_precip_amt_kg_per_m2
## Min. :286.9 Min. : 0.00
## 1st Qu.:292.0 1st Qu.: 24.07
## Median :293.1 Median : 46.44
## Mean :292.9 Mean : 57.61
## 3rd Qu.:294.2 3rd Qu.: 71.07
## Max. :296.0 Max. :362.03
## NA's :4 NA's :4
## reanalysis_relative_humidity_percent reanalysis_sat_precip_amt_mm
## Min. :57.79 Min. : 0.00
## 1st Qu.:84.30 1st Qu.: 39.10
## Median :90.92 Median : 60.47
## Mean :88.64 Mean : 64.25
## 3rd Qu.:94.56 3rd Qu.: 85.76
## Max. :98.61 Max. :210.83
## NA's :4 NA's :4
## reanalysis_specific_humidity_g_per_kg reanalysis_tdtr_k
## Min. :12.11 Min. : 3.714
## 1st Qu.:16.10 1st Qu.: 7.371
## Median :17.43 Median : 8.964
## Mean :17.10 Mean : 9.207
## 3rd Qu.:18.18 3rd Qu.:11.014
## Max. :20.46 Max. :16.029
## NA's :4 NA's :4
## station_avg_temp_c station_diur_temp_rng_c station_max_temp_c
## Min. :21.40 Min. : 5.20 Min. :30.1
## 1st Qu.:27.00 1st Qu.: 9.50 1st Qu.:33.2
## Median :27.60 Median :10.62 Median :34.0
## Mean :27.53 Mean :10.57 Mean :34.0
## 3rd Qu.:28.10 3rd Qu.:11.65 3rd Qu.:34.9
## Max. :30.80 Max. :15.80 Max. :42.2
## NA's :37 NA's :37 NA's :14
## station_min_temp_c station_precip_mm
## Min. :14.7 Min. : 0.00
## 1st Qu.:20.6 1st Qu.: 17.20
## Median :21.3 Median : 45.30
## Mean :21.2 Mean : 62.47
## 3rd Qu.:22.0 3rd Qu.: 85.95
## Max. :24.2 Max. :543.30
## NA's :8 NA's :16
There are NAs in many of the observations, particularly in the week_start_data variable. Therefore, I chose to remove this variable.
sj_train_features %<>% dplyr::select(-week_start_date)
iq_train_features %<>% dplyr::select(-week_start_date)
Fill in remainder of NAs with the most recent value
sj_train_features$ndvi_ne %<>% na.locf(fromLast = TRUE)
iq_train_features$ndvi_ne %<>% na.locf(fromLast = TRUE)
Now, the labels must be applied to the features of the data set.
cat('\nSan Juan\n',
'\t total cases mean: ', sj_train_labels$total_cases %>% mean(),
'\t total cases variance: ' , sj_train_labels$total_cases %>% var() )
##
## San Juan
## total cases mean: 34.18056 total cases variance: 2640.045
cat('\nIquitos\n',
'\t total cases mean: ', iq_train_labels$total_cases %>% mean(),
'\t total cases variance: ' , iq_train_labels$total_cases %>% var() )
##
## Iquitos
## total cases mean: 7.565385 total cases variance: 115.8955
Complete data frames/structure
preprocessData <- function(data_path, labels_path = NULL)
{
df <- read.csv(data_path)
features = c("reanalysis_specific_humidity_g_per_kg", "reanalysis_dew_point_temp_k",
"station_avg_temp_c", "station_min_temp_c", "ndvi_ne", "ndvi_se", "ndvi_nw","ndvi_sw", "station_precip_mm" )
df[features] %<>% na.locf(fromLast = TRUE)
if (is.null(labels_path)) features %<>% c("city", "year", "weekofyear")
df <- df[features]
if (!is.null(labels_path)) df %<>% cbind(read.csv(labels_path))
df_sj <- filter(df, city == 'sj')
df_iq <- filter(df, city == 'iq')
return(list(df_sj, df_iq))
}
preprocessData(data_path = 'Training_Data_Features.csv', labels_path = 'Training_Data_Labels.csv') -> trains
sj_train <- trains[[1]]; iq_train <- as.data.frame(trains[2])
rbind(iq_train_labels, sj_train_labels) %>%
ggplot(aes(x = total_cases, fill = ..count..)) +
geom_histogram(bins = 20, colour = 'black') + ggtitle('Total Cases of Dengue') +scale_y_continuous(breaks = seq(0,700,100)) + facet_wrap(~city)
A look at the total cases of Dengue in each city.
sj_train_features %<>% mutate('total_cases' = sj_train_labels$total_cases)
iq_train_features %<>% mutate('total_cases' = iq_train_labels$total_cases)
sj_train_features %<>% mutate('total_cases' = sj_train_labels$total_cases)
iq_train_features %<>% mutate('total_cases' = iq_train_labels$total_cases)
sj.ts<-ts(sj_train_features$total_cases, frequency=52, start=c(1990, 18))
iq.ts<-ts(iq_train_features$total_cases, frequency=52, start=c(2000, 26))
plot(sj.ts)
plot(iq.ts)
sj.train.ts<-window(sj.ts, end=c(2004,37))
sj.test.ts<-window(sj.ts,start=c(2004,38))
iq.train.ts<-window(iq.ts, end=c(2008,27))
iq.test.ts<-window(iq.ts,start=c(2008,28))
sj.arima=auto.arima(sj.train.ts)
sj.forecast1=forecast(sj.arima,h=260)
autoplot(sj.forecast1)
iq.arima=auto.arima(iq.train.ts)
iq.forecast1=forecast(iq.arima,h=156)
autoplot(iq.forecast1)
sj.nnetar<-nnetar(sj.train.ts)
sj.forecast2<-forecast(sj.nnetar,h=260)
autoplot(sj.forecast2)
#####Iquitos
iq.nnetar<-nnetar(iq.train.ts)
iq.forecast2<-forecast(iq.nnetar,h=156)
autoplot(iq.forecast2)
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.4.4
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
sj.rF<-randomForest(total_cases ~ ndvi_ne + ndvi_nw + ndvi_se + ndvi_sw + reanalysis_specific_humidity_g_per_kg + station_avg_temp_c + reanalysis_dew_point_temp_k + station_min_temp_c + station_precip_mm , data = sj_train)
sj.forecast3<- predict(object=sj.rF, h=260)
plot(sj.forecast3)
iq.rF<-randomForest(total_cases ~ ndvi_ne + ndvi_nw + ndvi_se + ndvi_sw + reanalysis_specific_humidity_g_per_kg + station_avg_temp_c + reanalysis_dew_point_temp_k + station_min_temp_c + station_precip_mm , data = iq_train)
iq.forecast3<- predict(object=iq.rF, h=156)
plot(iq.forecast3)
acc.sj.arima <- accuracy(sj.forecast1, sj.test.ts)
acc.iq.arima <- accuracy(iq.forecast1,iq.test.ts)
tsdisplay(residuals(sj.forecast1))
acc.sj.arima
## ME RMSE MAE MPE MAPE MASE
## Training set 0.006577048 13.83894 8.379156 NaN Inf 0.2102492
## Test set 14.740977080 34.48529 18.234742 -Inf Inf 0.4575449
## ACF1 Theil's U
## Training set 0.007353076 NA
## Test set 0.932173942 0
tsdisplay(residuals(iq.forecast1))
acc.iq.arima
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.008331804 7.16080 3.773915 NaN Inf 0.4123143 0.006181246
## Test set 7.882537945 13.92229 8.265254 -Inf Inf 0.9030098 0.842766026
## Theil's U
## Training set NA
## Test set 0
acc.sj.nnetar <- accuracy(sj.forecast2, sj.test.ts)
acc.iq.nnetar <- accuracy(iq.forecast2,iq.test.ts)
tsdisplay(residuals(sj.forecast2))
acc.sj.nnetar
## ME RMSE MAE MPE MAPE MASE
## Training set -0.01880461 6.504068 4.777888 -Inf Inf 0.1198864
## Test set 6.05422937 32.475855 19.514153 -Inf Inf 0.4896478
## ACF1 Theil's U
## Training set -0.01533117 NA
## Test set 0.93554975 0
tsdisplay(residuals(iq.forecast2))
acc.iq.nnetar
## ME RMSE MAE MPE MAPE MASE
## Training set -0.01342703 5.887483 3.558980 -Inf Inf 0.3888319
## Test set 5.08383902 12.626048 7.267046 -Inf Inf 0.7939519
## ACF1 Theil's U
## Training set 0.002369741 NA
## Test set 0.846631790 0
acc.sj.rF <- accuracy(sj.forecast3, sj.test.ts)
acc.iq.rF <- accuracy(iq.forecast3,iq.test.ts)
acc.sj.rF
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set -7.436914 36.15461 28.11977 -Inf Inf 0.8541588 0
acc.iq.rF
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set 3.222535 12.99857 7.777463 -Inf Inf 0.8037225 0
Neural Network is the best forecasting model according to test metrics
NN_sj_final=data.frame(densubformat[1:260,-4],total_cases=round(sj.forecast2$mean))
NN_iq_final=data.frame(densubformat[261:416,-4],total_cases=round(iq.forecast2$mean))
Dengue_Arima_solution=bind_rows(NN_sj_final,NN_iq_final)
## Warning in bind_rows_(x, .id): Vectorizing 'ts' elements may not preserve
## their attributes
## Warning in bind_rows_(x, .id): Vectorizing 'ts' elements may not preserve
## their attributes
write.csv(Dengue_Arima_solution,file="Dengue_Arima_predictions.csv",row.names=F)
I recieved a final score of 28.64 from my Neural Networking model