###########################################################################################################
rm(list = ls())
library(plyr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(lubridate)
library(zoo)
library(data.table)
library(forecast)
library(fpp2)
library(rugarch)
library(fGarch)
library(ggcorrplot)
library(corrplot)
library(stats)
train.y <- read.csv("/Users/yannos/Documents/PredictiveAnalytics/Final/dengue_labels_train.csv")
train.x <- read.csv("/Users/yannos/Documents/PredictiveAnalytics/Final/dengue_features_train.csv")
test.x <- read.csv("/Users/yannos/Documents/PredictiveAnalytics/Final/dengue_features_test.csv")
test.y <- read.csv("/Users/yannos/Documents/PredictiveAnalytics/Final/submission_format.csv")
submission <- read.csv("/Users/yannos/Documents/PredictiveAnalytics/Final/submission_format.csv")
train <- train.y %>% left_join(train.x, by = c("city","year","weekofyear"))
test <- test.y %>% left_join(test.x, by = c("city","year","weekofyear"))
train.sj <- dplyr::filter(train, city == "sj")
train.iq <- dplyr::filter(train, city == "iq")
test.sj <- dplyr::filter(test, city == "sj")
test.iq <- dplyr::filter(test, city == "iq")
train.sj.y <- dplyr::filter(train.y, city == "sj")
train.iq.y <- dplyr::filter(train.y, city == "iq")
cases.sj <- train.sj.y %>% dplyr::select("total_cases")
cases.iq <- train.iq.y %>% dplyr::select("total_cases")
dim(train.y)
## [1] 1456 4
any(is.na(train.y))
## [1] FALSE
train.y %>% summary()
## city year weekofyear total_cases
## iq:520 Min. :1990 Min. : 1.00 Min. : 0.00
## sj:936 1st Qu.:1997 1st Qu.:13.75 1st Qu.: 5.00
## Median :2002 Median :26.50 Median : 12.00
## Mean :2001 Mean :26.50 Mean : 24.68
## 3rd Qu.:2005 3rd Qu.:39.25 3rd Qu.: 28.00
## Max. :2010 Max. :53.00 Max. :461.00
dim(train.x)
## [1] 1456 24
any(is.na(train.x))
## [1] TRUE
train.x %>% summary()
## city year weekofyear week_start_date ndvi_ne
## iq:520 Min. :1990 Min. : 1.00 2000-07-01: 2 Min. :-0.40625
## sj:936 1st Qu.:1997 1st Qu.:13.75 2000-07-08: 2 1st Qu.: 0.04495
## Median :2002 Median :26.50 2000-07-15: 2 Median : 0.12882
## Mean :2001 Mean :26.50 2000-07-22: 2 Mean : 0.14229
## 3rd Qu.:2005 3rd Qu.:39.25 2000-07-29: 2 3rd Qu.: 0.24848
## Max. :2010 Max. :53.00 2000-08-05: 2 Max. : 0.50836
## (Other) :1444 NA's :194
## ndvi_nw ndvi_se ndvi_sw precipitation_amt_mm
## Min. :-0.45610 Min. :-0.01553 Min. :-0.06346 Min. : 0.00
## 1st Qu.: 0.04922 1st Qu.: 0.15509 1st Qu.: 0.14421 1st Qu.: 9.80
## Median : 0.12143 Median : 0.19605 Median : 0.18945 Median : 38.34
## Mean : 0.13055 Mean : 0.20378 Mean : 0.20231 Mean : 45.76
## 3rd Qu.: 0.21660 3rd Qu.: 0.24885 3rd Qu.: 0.24698 3rd Qu.: 70.23
## Max. : 0.45443 Max. : 0.53831 Max. : 0.54602 Max. :390.60
## NA's :52 NA's :22 NA's :22 NA's :13
## reanalysis_air_temp_k reanalysis_avg_temp_k reanalysis_dew_point_temp_k
## Min. :294.6 Min. :294.9 Min. :289.6
## 1st Qu.:297.7 1st Qu.:298.3 1st Qu.:294.1
## Median :298.6 Median :299.3 Median :295.6
## Mean :298.7 Mean :299.2 Mean :295.2
## 3rd Qu.:299.8 3rd Qu.:300.2 3rd Qu.:296.5
## Max. :302.2 Max. :302.9 Max. :298.4
## NA's :10 NA's :10 NA's :10
## reanalysis_max_air_temp_k reanalysis_min_air_temp_k
## Min. :297.8 Min. :286.9
## 1st Qu.:301.0 1st Qu.:293.9
## Median :302.4 Median :296.2
## Mean :303.4 Mean :295.7
## 3rd Qu.:305.5 3rd Qu.:297.9
## Max. :314.0 Max. :299.9
## NA's :10 NA's :10
## reanalysis_precip_amt_kg_per_m2 reanalysis_relative_humidity_percent
## Min. : 0.00 Min. :57.79
## 1st Qu.: 13.05 1st Qu.:77.18
## Median : 27.25 Median :80.30
## Mean : 40.15 Mean :82.16
## 3rd Qu.: 52.20 3rd Qu.:86.36
## Max. :570.50 Max. :98.61
## NA's :10 NA's :10
## reanalysis_sat_precip_amt_mm reanalysis_specific_humidity_g_per_kg
## Min. : 0.00 Min. :11.72
## 1st Qu.: 9.80 1st Qu.:15.56
## Median : 38.34 Median :17.09
## Mean : 45.76 Mean :16.75
## 3rd Qu.: 70.23 3rd Qu.:17.98
## Max. :390.60 Max. :20.46
## NA's :13 NA's :10
## reanalysis_tdtr_k station_avg_temp_c station_diur_temp_rng_c
## Min. : 1.357 Min. :21.40 Min. : 4.529
## 1st Qu.: 2.329 1st Qu.:26.30 1st Qu.: 6.514
## Median : 2.857 Median :27.41 Median : 7.300
## Mean : 4.904 Mean :27.19 Mean : 8.059
## 3rd Qu.: 7.625 3rd Qu.:28.16 3rd Qu.: 9.567
## Max. :16.029 Max. :30.80 Max. :15.800
## NA's :10 NA's :43 NA's :43
## station_max_temp_c station_min_temp_c station_precip_mm
## Min. :26.70 Min. :14.7 Min. : 0.00
## 1st Qu.:31.10 1st Qu.:21.1 1st Qu.: 8.70
## Median :32.80 Median :22.2 Median : 23.85
## Mean :32.45 Mean :22.1 Mean : 39.33
## 3rd Qu.:33.90 3rd Qu.:23.3 3rd Qu.: 53.90
## Max. :42.20 Max. :25.6 Max. :543.30
## NA's :20 NA's :14 NA's :22
dim(test.x)
## [1] 416 24
any(is.na(test.x))
## [1] TRUE
test.x %>% summary()
## city year weekofyear week_start_date ndvi_ne
## iq:156 Min. :2008 Min. : 1.00 2010-07-02: 2 Min. :-0.4634
## sj:260 1st Qu.:2010 1st Qu.:13.75 2010-07-09: 2 1st Qu.:-0.0015
## Median :2011 Median :26.00 2010-07-16: 2 Median : 0.1101
## Mean :2011 Mean :26.44 2010-07-23: 2 Mean : 0.1260
## 3rd Qu.:2012 3rd Qu.:39.00 2010-07-30: 2 3rd Qu.: 0.2633
## Max. :2013 Max. :53.00 2010-08-06: 2 Max. : 0.5004
## (Other) :404 NA's :43
## ndvi_nw ndvi_se ndvi_sw precipitation_amt_mm
## Min. :-0.21180 Min. :0.0062 Min. :-0.01467 Min. : 0.000
## 1st Qu.: 0.01597 1st Qu.:0.1487 1st Qu.: 0.13408 1st Qu.: 8.175
## Median : 0.08870 Median :0.2042 Median : 0.18647 Median : 31.455
## Mean : 0.12680 Mean :0.2077 Mean : 0.20172 Mean : 38.354
## 3rd Qu.: 0.24240 3rd Qu.:0.2549 3rd Qu.: 0.25324 3rd Qu.: 57.773
## Max. : 0.64900 Max. :0.4530 Max. : 0.52904 Max. :169.340
## NA's :11 NA's :1 NA's :1 NA's :2
## reanalysis_air_temp_k reanalysis_avg_temp_k reanalysis_dew_point_temp_k
## Min. :294.6 Min. :295.2 Min. :290.8
## 1st Qu.:297.8 1st Qu.:298.3 1st Qu.:294.3
## Median :298.5 Median :299.3 Median :295.8
## Mean :298.8 Mean :299.4 Mean :295.4
## 3rd Qu.:300.2 3rd Qu.:300.5 3rd Qu.:296.6
## Max. :301.9 Max. :303.3 Max. :297.8
## NA's :2 NA's :2 NA's :2
## reanalysis_max_air_temp_k reanalysis_min_air_temp_k
## Min. :298.2 Min. :286.2
## 1st Qu.:301.4 1st Qu.:293.5
## Median :302.8 Median :296.3
## Mean :303.6 Mean :295.7
## 3rd Qu.:305.8 3rd Qu.:298.3
## Max. :314.1 Max. :299.7
## NA's :2 NA's :2
## reanalysis_precip_amt_kg_per_m2 reanalysis_relative_humidity_percent
## Min. : 0.00 Min. :64.92
## 1st Qu.: 9.43 1st Qu.:77.40
## Median : 25.85 Median :80.33
## Mean : 42.17 Mean :82.50
## 3rd Qu.: 56.48 3rd Qu.:88.33
## Max. :301.40 Max. :97.98
## NA's :2 NA's :2
## reanalysis_sat_precip_amt_mm reanalysis_specific_humidity_g_per_kg
## Min. : 0.000 Min. :12.54
## 1st Qu.: 8.175 1st Qu.:15.79
## Median : 31.455 Median :17.34
## Mean : 38.354 Mean :16.93
## 3rd Qu.: 57.773 3rd Qu.:18.17
## Max. :169.340 Max. :19.60
## NA's :2 NA's :2
## reanalysis_tdtr_k station_avg_temp_c station_diur_temp_rng_c
## Min. : 1.486 Min. :24.16 Min. : 4.043
## 1st Qu.: 2.446 1st Qu.:26.51 1st Qu.: 5.929
## Median : 2.914 Median :27.48 Median : 6.643
## Mean : 5.125 Mean :27.37 Mean : 7.811
## 3rd Qu.: 8.171 3rd Qu.:28.32 3rd Qu.: 9.812
## Max. :14.486 Max. :30.27 Max. :14.725
## NA's :2 NA's :12 NA's :12
## station_max_temp_c station_min_temp_c station_precip_mm
## Min. :27.20 Min. :14.20 Min. : 0.00
## 1st Qu.:31.10 1st Qu.:21.20 1st Qu.: 9.10
## Median :32.80 Median :22.20 Median : 23.60
## Mean :32.53 Mean :22.37 Mean : 34.28
## 3rd Qu.:33.90 3rd Qu.:23.30 3rd Qu.: 47.75
## Max. :38.40 Max. :26.70 Max. :212.00
## NA's :3 NA's :9 NA's :5
From above we can observe the following:
Our train data has has no missing values as far as our taget variable - total cases - is concerned
Both our train and test data have some missing values for some of the environment related features given
# Look at min, max in train data for each city
train.sj.y %>% head()
## city year weekofyear total_cases
## 1 sj 1990 18 4
## 2 sj 1990 19 5
## 3 sj 1990 20 4
## 4 sj 1990 21 3
## 5 sj 1990 22 6
## 6 sj 1990 23 2
train.sj.y %>% tail()
## city year weekofyear total_cases
## 931 sj 2008 12 3
## 932 sj 2008 13 4
## 933 sj 2008 14 3
## 934 sj 2008 15 1
## 935 sj 2008 16 3
## 936 sj 2008 17 5
train.iq.y %>% head()
## city year weekofyear total_cases
## 1 iq 2000 26 0
## 2 iq 2000 27 0
## 3 iq 2000 28 0
## 4 iq 2000 29 0
## 5 iq 2000 30 0
## 6 iq 2000 31 0
train.iq.y %>% tail()
## city year weekofyear total_cases
## 515 iq 2010 20 6
## 516 iq 2010 21 5
## 517 iq 2010 22 8
## 518 iq 2010 23 1
## 519 iq 2010 24 1
## 520 iq 2010 25 4
As we saw above many of the environment related variables have some missing values. We will replace those with the previous observation (assume variables approximately stay the same if missing). Then redefine dfs with filled in data.
# replace missing with previous observations
train.x <- fill(train.x, ndvi_ne:station_precip_mm)
test.x <- fill(test.x, ndvi_ne:station_precip_mm)
# redefine working dataframes with filled values
train <- train.y %>% left_join(train.x, by = c("city","year","weekofyear"))
test <- test.y %>% left_join(test.x, by = c("city","year","weekofyear"))
train.sj <- dplyr::filter(train, city == "sj")
train.iq <- dplyr::filter(train, city == "iq")
test.sj <- dplyr::filter(test, city == "sj")
test.iq <- dplyr::filter(test, city == "iq")
train.sj.y <- dplyr::filter(train.y, city == "sj")
train.iq.y <- dplyr::filter(train.y, city == "iq")
cases.sj <- train.sj.y %>% dplyr::select("total_cases")
cases.iq <- train.iq.y %>% dplyr::select("total_cases")
We can now create time series object to possibly use for investigation and modeling and plot our series:
cases.sj.ts <- ts(cases.sj, frequency = 52, start=1)
cases.sj.ts %>% autoplot(alpha=0.6, col="red") +
ggtitle("Total Dengue Fever Cases in San Juan") +
ylab("Cases") +
xlab("Week")
cases.iq.ts <- ts(cases.iq, frequency = 52, start=1)
cases.iq.ts %>% autoplot(alpha=0.6, col="red") +
ggtitle("Total Dengue Fever Cases in Iquitos") +
ylab("Cases") +
xlab("Week")
From above we observe that there is no clear seasonality and trend and that data is characterized in both cases by some very low values and sudden peaks that vary in severity / size.
train %>%
ggplot(aes(x=total_cases, fill=city)) +
geom_density(alpha=.3) +
scale_x_continuous(limits = c(0, 150)) +
ggtitle("Total Case Distributions by City") +
ylab("Density") +
xlab("Cases") +
labs(fill = "City")
## Warning: Removed 25 rows containing non-finite values (stat_density).
Above we can observe that in both cities total cases distributions are right skewed with long tails of high values and the majority of cases falling within the very low quantiles and with many zeros. Indication for count data, possibly negative binomial distrubuted.
train.sj %>%
ggplot(aes(x=year,y=total_cases, group=year, fill=year, alpha=0.8)) +
geom_boxplot() +
geom_jitter(width=0.05,alpha=0.1) +
theme(legend.position = "none") +
ggtitle("Total Case Distributions by Year in San Juan") +
ylab("Cases") +
xlab("Year")
train.iq %>%
ggplot(aes(x=year,y=total_cases, group=year, fill=year, alpha=0.8)) +
geom_boxplot() +
geom_jitter(width=0.05,alpha=0.1) +
theme(legend.position = "none") +
ggtitle("Total Case Distributions by Year in Iquitos") +
ylab("Cases") +
xlab("Year") +
scale_x_continuous(breaks=c(2000, 2002, 2004, 2006, 2008, 2010))
We do not observe any particular or consistent yearly variation.
train.sj %>%
ggplot(aes(x=weekofyear,y=total_cases, group=weekofyear, fill=weekofyear, alpha=0.8)) +
geom_boxplot() +
geom_jitter(width=0.05,alpha=0.1) +
theme(legend.position = "none") +
ggtitle("Total Case Distributions by Week in San Juan") +
ylab("Cases") +
xlab("Week")
train.iq %>%
ggplot(aes(x=weekofyear,y=total_cases, group=weekofyear, fill=weekofyear, alpha=0.8)) +
geom_boxplot() +
geom_jitter(width=0.05,alpha=0.1) +
theme(legend.position = "none") +
ggtitle("Total Case Distributions by Week in Iquitos") +
ylab("Cases") +
xlab("Week") +
scale_x_continuous(breaks=c(2000, 2002, 2004, 2006, 2008, 2010))
Similarly, there is no great variation in mean/median weekly cases. However, we do observe slightly differing overall levels of total cases, especially to some extent for San Juan.
# Calculate Correlations for all variables for each city
corr.sj <- cor(train.sj %>% dplyr::select(-city, -week_start_date, -year), use = "pairwise.complete.obs")
corr.iq <- cor(train.iq %>% dplyr::select(-city, -week_start_date, -year), use = "pairwise.complete.obs")
# Plot Matrices
corrplot(corr.sj, type="lower", tl.col="black", tl.cex = 0.45, tl.srt = 35)
corrplot(corr.iq, type="lower", tl.col="black", tl.cex = 0.45, tl.srt = 35)
We can observe the following:
Surprisingly vegetation indeces do not seem to be correlated with total cases in either city.
On the other hand we observe moderate positive correlations between the specific humidity variable and total cases as well as between dew point temperature and total cases.
train.sj %>% ggplot(aes(x=reanalysis_specific_humidity_g_per_kg, y=total_cases,
color=city, alpha=0.3)) + geom_point() +
geom_smooth(method=lm, col="black", alpha=0.5) +
theme(legend.position = "none") +
ggtitle("Total Cases vs Specific Humidity in San Juan") +
ylab("Total Cases") +
xlab("Specific Humidity (g/kg)")
## `geom_smooth()` using formula 'y ~ x'
train.iq %>% ggplot(aes(x=reanalysis_specific_humidity_g_per_kg, y=total_cases,
color=city, alpha=0.3)) + geom_point() +
geom_smooth(method=lm, col="black", alpha=0.5) +
theme(legend.position = "none") +
ggtitle("Total Cases vs Specific Humidity in Iquitos") +
ylab("Total Cases") +
xlab("Specific Humidity (g/kg)")
## `geom_smooth()` using formula 'y ~ x'
As expected there is a positive relationship. However, one could argue that it is not linear as variation in total cases increases dramatically as specific humidity rises to the extent that the overall trend could be exponential for high cases.
train.sj <- train.sj %>% mutate(Year=as.factor(year), Weekofyear=as.factor(weekofyear))
train.iq <- train.iq %>% mutate(Year=as.factor(year), Weekofyear=as.factor(weekofyear))
test.sj <- test.sj %>% mutate(Year=as.factor(year), Weekofyear=as.factor(weekofyear))
test.iq <- test.iq %>% mutate(Year=as.factor(year), Weekofyear=as.factor(weekofyear))
train2.sj <- train.sj %>% dplyr::filter(year<2006)
test2.sj <- train.sj %>% dplyr::filter(year>=2006)
train2.iq <- train.iq %>% dplyr::filter(year<2009)
test2.iq <- train.iq %>% dplyr::filter(year>=2009)
In this section I am fitting and evaluating the first 2 simple models. After each model its MAE metric is computed on the hold out set and printed for comparison at the end.
# San Juan
# get mean
mean.cases.sj <- mean(train2.sj$total_cases)
# predict
test2.sj <- test2.sj %>% dplyr::mutate(MeanCases = mean.cases.sj)
# evaluate
simple.mean.sj.eval <- mean(abs(test2.sj$total_cases - test2.sj$MeanCases))
simple.mean.sj.eval
## [1] 27.32279
# Iquitos
# get mean
mean.cases.iq <- mean(train2.iq$total_cases)
# predict
test2.iq <- test2.iq %>% dplyr::mutate(MeanCases = mean.cases.iq)
# evaluate
simple.mean.iq.eval <- mean(abs(test2.iq$total_cases - test2.iq$MeanCases))
simple.mean.iq.eval
## [1] 5.404919
# San Juan
# get mean
mean.weekly.cases.sj <- train2.sj %>% group_by(weekofyear) %>% summarise(MeanWeeklyCases=mean(total_cases))
# predict
test2.sj <- test2.sj %>% left_join(mean.weekly.cases.sj, by=c("weekofyear"))
# evaluate
simple.weekly.mean.sj.eval <- mean(abs(test2.sj$total_cases - test2.sj$MeanWeeklyCases))
simple.weekly.mean.sj.eval
## [1] 21.21503
# Iquitos
# get mean
mean.weekly.cases.iq <- train2.iq %>% group_by(weekofyear) %>% summarise(MeanWeeklyCases=mean(total_cases))
# predict
test2.iq <- test2.iq %>% left_join(mean.weekly.cases.iq, by=c("weekofyear"))
# evaluate
simple.weekly.mean.iq.eval <- mean(abs(test2.iq$total_cases - test2.iq$MeanWeeklyCases))
simple.weekly.mean.iq.eval
## [1] 4.702635
In this section, I am fitting time series models using the forecast package only using the single target variable. Similarly evaluation on the hold out set is computed and printed after each model.
train2.sj.ts <- ts(train2.sj$total_cases, frequency = 52, start=1)
train2.iq.ts <- ts(train2.iq$total_cases, frequency = 52, start=1)
test2.sj.ts <- ts(test2.sj$total_cases, frequency = 52, start=1)
test2.iq.ts <- ts(test2.iq$total_cases, frequency = 52, start=1)
# San Juan
# fit model and plot
fit.sj.ets <- stlf(train2.sj.ts, etsmodel="ANN", h=121)
fit.sj.ets %>% autoplot() +
ggtitle("Forecasts from STL + ETS(A,N,N) for San Juan") +
ylab("Cases") +
xlab("Time")
# predict
test2.sj <- test2.sj %>% mutate(stl.ets.ann = fit.sj.ets$mean) %>%
mutate(stl.ets.ann = ifelse(stl.ets.ann>0, stl.ets.ann, ifelse(stl.ets.ann<0, 0, NA)))
# evaluate
stl.ets.sj.eval <- mean(abs(test2.sj$total_cases - test2.sj$stl.ets.ann))
stl.ets.sj.eval
## [1] 13.84629
# Iquitos
# fit model and plot
fit.iq.ets <- stlf(train2.iq.ts, etsmodel="ANN", h=78)
fit.iq.ets %>% autoplot() +
ggtitle("Forecasts from STL + ETS(A,N,N) for Iquitos") +
ylab("Cases") +
xlab("Time")
# predict
test2.iq <- test2.iq %>% mutate(stl.ets.ann = fit.iq.ets$mean) %>%
mutate(stl.ets.ann = ifelse(stl.ets.ann>0, stl.ets.ann, ifelse(stl.ets.ann<0, 0, NA)))
# evaluate
stl.ets.iq.eval <- mean(abs(test2.iq$total_cases - test2.iq$stl.ets.ann))
stl.ets.iq.eval
## [1] 5.32278
# San Juan
# fit model and plot
fit.sj.nnar <- nnetar(train2.sj.ts)
# predict
forecast.sj.nnar <- fit.sj.nnar %>% forecast(h=121, PI=TRUE, npaths=100)
autoplot(forecast.sj.nnar)
test2.sj <- test2.sj %>% mutate(nnar = forecast.sj.nnar$mean) %>%
mutate(nnar = ifelse(nnar>0, nnar, ifelse(nnar<0, 0, NA)))
# evaluate
nnar.sj.eval <- mean(abs(test2.sj$total_cases - test2.sj$nnar))
nnar.sj.eval
## [1] 28.05033
# Iquitos
# fit model and plot
fit.iq.nnar <- nnetar(train2.iq.ts)
# predict
forecast.iq.nnar <- fit.iq.nnar %>% forecast(h=78, PI=TRUE, npaths=100)
autoplot(forecast.iq.nnar)
test2.iq <- test2.iq %>% mutate(nnar = forecast.iq.nnar$mean) %>%
mutate(nnar = ifelse(nnar>0, nnar, ifelse(nnar<0, 0, NA)))
# evaluate
nnar.iq.eval <- mean(abs(test2.iq$total_cases - test2.iq$nnar))
nnar.iq.eval
## [1] 6.356278
We decided trying out averaging two models that seem to have reasonable predictions: the STL+ETS and the Regression with Lagged Predictors. We are hoping that this ensemble will outperform every single model structure tried out above.
# San Juan
# Lets use the average from the best two models, the stl+ets and the arima with lag predictors
# taking into account both info
mean(abs(test2.sj$total_cases - (test2.sj$arima.lag.sj + test2.sj$stl.ets.ann)/2))
## [1] 13.31814
Best so far for San Juan, great!
# Iquitos
mean(abs(test2.iq$total_cases - (test2.iq$stl.ets.ann + test2.iq$arima.lag.iq)/2))
## [1] 4.480311
Also best so far for Iquitos!
As a result, we will be using the combined model shown above to generate final predictions.
Fit on the whole training set and predict on the submission (official test set)
# GENERATE FEATURES ON TEST SET AS WELL (NEED TO COMBINE TO GENERATE LAGS)
full.sj <- rbind(dplyr::filter(train, city == "sj"), dplyr::filter(test, city == "sj") )
full.iq <- rbind(dplyr::filter(train, city == "iq"), dplyr::filter(test, city == "iq") )
# Lagged predictors.
full.sj <- cbind(
full.sj,
HumLag1 = dplyr::lag(full.sj$reanalysis_specific_humidity_g_per_kg,1),
HumLag2 = dplyr::lag(full.sj$reanalysis_specific_humidity_g_per_kg,2),
HumLag3 = dplyr::lag(full.sj$reanalysis_specific_humidity_g_per_kg,3),
HumLag4 = dplyr::lag(full.sj$reanalysis_specific_humidity_g_per_kg,4),
PrecLag1 = dplyr::lag(full.sj$reanalysis_precip_amt_kg_per_m2,1),
PrecLag2 = dplyr::lag(full.sj$reanalysis_precip_amt_kg_per_m2,2),
PrecLag3 = dplyr::lag(full.sj$reanalysis_precip_amt_kg_per_m2,3),
PrecLag4 = dplyr::lag(full.sj$reanalysis_precip_amt_kg_per_m2,4),
DewLag1 = dplyr::lag(full.sj$reanalysis_dew_point_temp_k,1),
DewLag2 = dplyr::lag(full.sj$reanalysis_dew_point_temp_k,2),
DewLag3 = dplyr::lag(full.sj$reanalysis_dew_point_temp_k,3),
DewLag4 = dplyr::lag(full.sj$reanalysis_dew_point_temp_k,4),
TempLag1 = dplyr::lag(full.sj$reanalysis_air_temp_k,1),
TempLag2 = dplyr::lag(full.sj$reanalysis_air_temp_k,2),
TempLag3 = dplyr::lag(full.sj$reanalysis_air_temp_k,3),
TempLag4 = dplyr::lag(full.sj$reanalysis_air_temp_k,4)
)
full.iq <- cbind(
full.iq,
HumLag1 = dplyr::lag(full.iq$reanalysis_specific_humidity_g_per_kg,1),
HumLag2 = dplyr::lag(full.iq$reanalysis_specific_humidity_g_per_kg,2),
HumLag3 = dplyr::lag(full.iq$reanalysis_specific_humidity_g_per_kg,3),
HumLag4 = dplyr::lag(full.iq$reanalysis_specific_humidity_g_per_kg,4),
PrecLag1 = dplyr::lag(full.iq$reanalysis_precip_amt_kg_per_m2,1),
PrecLag2 = dplyr::lag(full.iq$reanalysis_precip_amt_kg_per_m2,2),
PrecLag3 = dplyr::lag(full.iq$reanalysis_precip_amt_kg_per_m2,3),
PrecLag4 = dplyr::lag(full.iq$reanalysis_precip_amt_kg_per_m2,4),
DewLag1 = dplyr::lag(full.iq$reanalysis_dew_point_temp_k,1),
DewLag2 = dplyr::lag(full.iq$reanalysis_dew_point_temp_k,2),
DewLag3 = dplyr::lag(full.iq$reanalysis_dew_point_temp_k,3),
DewLag4 = dplyr::lag(full.iq$reanalysis_dew_point_temp_k,4),
TempLag1 = dplyr::lag(full.iq$reanalysis_air_temp_k,1),
TempLag2 = dplyr::lag(full.iq$reanalysis_air_temp_k,2),
TempLag3 = dplyr::lag(full.iq$reanalysis_air_temp_k,3),
TempLag4 = dplyr::lag(full.iq$reanalysis_air_temp_k,4)
)
# get full train data
train.ind.sj <- train.sj %>% dplyr::select(year, weekofyear)
train.sj <- train.ind.sj %>%
dplyr::inner_join(full.sj, by=c("year", "weekofyear"))
train.ind.iq <- train.iq %>% dplyr::select(year, weekofyear)
train.iq <- train.ind.iq %>%
dplyr::inner_join(full.iq, by=c("year", "weekofyear"))
train.sj.ts <- ts(train.sj, frequency = 52, start=1)
train.iq.ts <- ts(train.iq, frequency = 52, start=1)
# get final test data
test.ind.sj <- test.sj %>% dplyr::select(year, weekofyear)
test.sj <- test.ind.sj %>%
dplyr::inner_join(full.sj, by=c("year", "weekofyear"))
test.ind.iq <- test.iq %>% dplyr::select(year, weekofyear)
test.iq <- test.ind.iq %>%
dplyr::inner_join(full.iq, by=c("year", "weekofyear"))
test.sj.ts <- ts(test.sj, frequency = 52, start=1)
test.iq.ts <- ts(test.iq, frequency = 52, start=1)
# San Juan
# fit model and plot
fit.sj.ets <- stlf(train.sj.ts[,'total_cases'], etsmodel="ANN", h=260)
fit.sj.ets %>% autoplot() +
ggtitle("Forecasts from STL + ETS(A,N,N) for San Juan") +
ylab("Cases") +
xlab("Time")
# predict
test.sj <- test.sj %>% mutate(stl.ets.ann = fit.sj.ets$mean) %>%
mutate(stl.ets.ann = ifelse(stl.ets.ann>0, stl.ets.ann, ifelse(stl.ets.ann<0, 0, NA)))
# Iquitos
# fit model and plot
fit.iq.ets <- stlf(train.iq.ts[,'total_cases'], etsmodel="ANN", h=156)
fit.iq.ets %>% autoplot() +
ggtitle("Forecasts from STL + ETS(A,N,N) for Iquitos") +
ylab("Cases") +
xlab("Time")
# predict
test.iq <- test.iq %>% mutate(stl.ets.ann = fit.iq.ets$mean) %>%
mutate(stl.ets.ann = ifelse(stl.ets.ann>0, stl.ets.ann, ifelse(stl.ets.ann<0, 0, NA)))
# San Juan
fit.sj.arima.lag <- auto.arima(train.sj.ts[,"total_cases"],
xreg=cbind(train.sj.ts[,"reanalysis_specific_humidity_g_per_kg"],
train.sj.ts[,"HumLag1"],
train.sj.ts[,"HumLag2"],
train.sj.ts[,"reanalysis_precip_amt_kg_per_m2"],
train.sj.ts[,"PrecLag1"],
train.sj.ts[,"PrecLag2"],
train.sj.ts[,"reanalysis_dew_point_temp_k"],
train.sj.ts[,"DewLag1"],
train.sj.ts[,"DewLag2"],
train.sj.ts[,"reanalysis_air_temp_k"],
train.sj.ts[,"TempLag1"],
train.sj.ts[,"TempLag2"]
))
forecast.sj.arima.lag <- fit.sj.arima.lag %>%
forecast(xreg=cbind(test.sj.ts[,"reanalysis_specific_humidity_g_per_kg"],
test.sj.ts[,"HumLag1"],
test.sj.ts[,"HumLag2"],
test.sj.ts[,"reanalysis_precip_amt_kg_per_m2"],
test.sj.ts[,"PrecLag1"],
test.sj.ts[,"PrecLag2"],
test.sj.ts[,"reanalysis_dew_point_temp_k"],
test.sj.ts[,"DewLag1"],
test.sj.ts[,"DewLag2"],
test.sj.ts[,"reanalysis_air_temp_k"],
test.sj.ts[,"TempLag1"],
test.sj.ts[,"TempLag2"]
))
## Warning in forecast.forecast_ARIMA(., xreg = cbind(test.sj.ts[,
## "reanalysis_specific_humidity_g_per_kg"], : xreg contains different column names
## from the xreg used in training. Please check that the regressors are in the same
## order.
forecast.sj.arima.lag %>% autoplot()
test.sj <- test.sj %>% mutate(arima.lag.sj = forecast.sj.arima.lag$mean)
# Iquitos
fit.iq.arima.lag <- auto.arima(train.iq.ts[,"total_cases"],
xreg=cbind(train.iq.ts[,"reanalysis_specific_humidity_g_per_kg"],
train.iq.ts[,"HumLag1"],
train.iq.ts[,"HumLag2"],
train.iq.ts[,"reanalysis_precip_amt_kg_per_m2"],
train.iq.ts[,"PrecLag1"],
train.iq.ts[,"PrecLag2"],
train.iq.ts[,"reanalysis_dew_point_temp_k"],
train.iq.ts[,"DewLag1"],
train.iq.ts[,"DewLag2"],
train.iq.ts[,"reanalysis_air_temp_k"],
train.iq.ts[,"TempLag1"],
train.iq.ts[,"TempLag2"]
))
forecast.iq.arima.lag <- fit.iq.arima.lag %>%
forecast(xreg=cbind(test.iq.ts[,"reanalysis_specific_humidity_g_per_kg"],
test.iq.ts[,"HumLag1"],
test.iq.ts[,"HumLag2"],
test.iq.ts[,"reanalysis_precip_amt_kg_per_m2"],
test.iq.ts[,"PrecLag1"],
test.iq.ts[,"PrecLag2"],
test.iq.ts[,"reanalysis_dew_point_temp_k"],
test.iq.ts[,"DewLag1"],
test.iq.ts[,"DewLag2"],
test.iq.ts[,"reanalysis_air_temp_k"],
test.iq.ts[,"TempLag1"],
test.iq.ts[,"TempLag2"]
))
## Warning in forecast.forecast_ARIMA(., xreg = cbind(test.iq.ts[,
## "reanalysis_specific_humidity_g_per_kg"], : xreg contains different column names
## from the xreg used in training. Please check that the regressors are in the same
## order.
forecast.iq.arima.lag %>% autoplot()
test.iq <- test.iq %>% mutate(arima.lag.iq = forecast.iq.arima.lag$mean)
test.sj <- test.sj %>% mutate(total_cases = (stl.ets.ann+arima.lag.sj)/2)
test.iq <- test.iq %>% mutate(total_cases = (stl.ets.ann+arima.lag.iq)/2)
# Plot final forecasts
train.sj <- train.sj %>% mutate(enum=seq(1:936))
test.sj <- test.sj %>% mutate(enum=seq(937:1196)+936)
train.iq <- train.iq %>% mutate(enum=seq(1:520))
test.iq <- test.iq %>% mutate(enum=seq(521:676)+520)
ggplot(train.sj, aes(y=total_cases, x=enum)) +
geom_line() +
geom_line(test.sj,mapping = aes(y=total_cases, x=enum), color="red") +
ggtitle("Dengue Fever Cases Forecasts in San Juan") +
xlab("Time") +
ylab("Total Cases")
ggplot(train.iq, aes(y=total_cases, x=enum)) +
geom_line() +
geom_line(test.iq, mapping = aes(y=total_cases, x=enum), col="red") +
ggtitle("Dengue Fever Cases Forecasts in Iquitos") +
xlab("Time") +
ylab("Total Cases")
a <- test.sj %>% dplyr::select(city, year, weekofyear, total_cases) %>%
mutate(total_cases = as.numeric(total_cases))
b <- test.iq %>% dplyr::select(city, year, weekofyear, total_cases) %>%
mutate(total_cases = as.numeric(total_cases))
final.submission = rbind(a,b)
final.submission <- final.submission %>%
mutate(total_cases = as.integer(floor(total_cases))+1)
write.csv(final.submission, 'final_sub_dengai_michailidis.csv', row.names = FALSE)