The goal of this project is to predict local epidemics of dengue fever. So, let’s get started and see what we can find!
library(psych) #describe
library(forecast) #time series
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(ggfortify) # neural nets
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
## Registered S3 methods overwritten by 'ggfortify':
## method from
## autoplot.Arima forecast
## autoplot.acf forecast
## autoplot.ar forecast
## autoplot.bats forecast
## autoplot.decomposed.ts forecast
## autoplot.ets forecast
## autoplot.forecast forecast
## autoplot.stl forecast
## autoplot.ts forecast
## fitted.ar forecast
## fortify.ts forecast
## residuals.ar forecast
library(ggplot2) # visualizations
library(dplyr) # %>%
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stargazer) # tables
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(corrplot) #correlation plots
## corrplot 0.84 loaded
library(vars) #VAR modelling
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: urca
## Loading required package: lmtest
library(neuralnet) #neural networks
##
## Attaching package: 'neuralnet'
## The following object is masked from 'package:dplyr':
##
## compute
#import training data
labels <- read.csv("C:/Users/andre/Documents/Boston College/2. Summer 2020/Forecasting/final/dengue_labels_train.csv")
features <- read.csv("C:/Users/andre/Documents/Boston College/2. Summer 2020/Forecasting/final/dengue_features_train.csv")
test <- read.csv("C:/Users/andre/Documents/Boston College/2. Summer 2020/Forecasting/final/dengue_features_test.csv")
labels and features make up the training data for this project, while the aptly named test data will be the test set.
#descriptive stats training data
str(labels)
## 'data.frame': 1456 obs. of 4 variables:
## $ city : chr "sj" "sj" "sj" "sj" ...
## $ year : int 1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
## $ weekofyear : int 18 19 20 21 22 23 24 25 26 27 ...
## $ total_cases: int 4 5 4 3 6 2 4 5 10 6 ...
summary(labels)
## city year weekofyear total_cases
## Length:1456 Min. :1990 Min. : 1.00 Min. : 0.00
## Class :character 1st Qu.:1997 1st Qu.:13.75 1st Qu.: 5.00
## Mode :character Median :2002 Median :26.50 Median : 12.00
## Mean :2001 Mean :26.50 Mean : 24.68
## 3rd Qu.:2005 3rd Qu.:39.25 3rd Qu.: 28.00
## Max. :2010 Max. :53.00 Max. :461.00
str(features)
## 'data.frame': 1456 obs. of 24 variables:
## $ city : chr "sj" "sj" "sj" "sj" ...
## $ year : int 1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
## $ weekofyear : int 18 19 20 21 22 23 24 25 26 27 ...
## $ week_start_date : chr "4/30/1990" "5/7/1990" "5/14/1990" "5/21/1990" ...
## $ ndvi_ne : num 0.1226 0.1699 0.0323 0.1286 0.1962 ...
## $ ndvi_nw : num 0.104 0.142 0.173 0.245 0.262 ...
## $ ndvi_se : num 0.198 0.162 0.157 0.228 0.251 ...
## $ ndvi_sw : num 0.178 0.155 0.171 0.236 0.247 ...
## $ precipitation_amt_mm : num 12.42 22.82 34.54 15.36 7.52 ...
## $ reanalysis_air_temp_k : num 298 298 299 299 300 ...
## $ reanalysis_avg_temp_k : num 298 298 299 299 300 ...
## $ reanalysis_dew_point_temp_k : num 292 294 295 295 296 ...
## $ reanalysis_max_air_temp_k : num 300 301 300 301 302 ...
## $ reanalysis_min_air_temp_k : num 296 296 297 297 298 ...
## $ reanalysis_precip_amt_kg_per_m2 : num 32 17.9 26.1 13.9 12.2 ...
## $ reanalysis_relative_humidity_percent : num 73.4 77.4 82.1 80.3 80.5 ...
## $ reanalysis_sat_precip_amt_mm : num 12.42 22.82 34.54 15.36 7.52 ...
## $ reanalysis_specific_humidity_g_per_kg: num 14 15.4 16.8 16.7 17.2 ...
## $ reanalysis_tdtr_k : num 2.63 2.37 2.3 2.43 3.01 ...
## $ station_avg_temp_c : num 25.4 26.7 26.7 27.5 28.9 ...
## $ station_diur_temp_rng_c : num 6.9 6.37 6.49 6.77 9.37 ...
## $ station_max_temp_c : num 29.4 31.7 32.2 33.3 35 34.4 32.2 33.9 33.9 33.9 ...
## $ station_min_temp_c : num 20 22.2 22.8 23.3 23.9 23.9 23.3 22.8 22.8 24.4 ...
## $ station_precip_mm : num 16 8.6 41.4 4 5.8 39.1 29.7 21.1 21.1 1.1 ...
summary(features)
## city year weekofyear week_start_date
## Length:1456 Min. :1990 Min. : 1.00 Length:1456
## Class :character 1st Qu.:1997 1st Qu.:13.75 Class :character
## Mode :character Median :2002 Median :26.50 Mode :character
## Mean :2001 Mean :26.50
## 3rd Qu.:2005 3rd Qu.:39.25
## Max. :2010 Max. :53.00
##
## ndvi_ne ndvi_nw ndvi_se ndvi_sw
## Min. :-0.40625 Min. :-0.45610 Min. :-0.01553 Min. :-0.06346
## 1st Qu.: 0.04495 1st Qu.: 0.04922 1st Qu.: 0.15509 1st Qu.: 0.14421
## Median : 0.12882 Median : 0.12143 Median : 0.19605 Median : 0.18945
## Mean : 0.14229 Mean : 0.13055 Mean : 0.20378 Mean : 0.20231
## 3rd Qu.: 0.24848 3rd Qu.: 0.21660 3rd Qu.: 0.24885 3rd Qu.: 0.24698
## Max. : 0.50836 Max. : 0.45443 Max. : 0.53831 Max. : 0.54602
## NA's :194 NA's :52 NA's :22 NA's :22
## precipitation_amt_mm reanalysis_air_temp_k reanalysis_avg_temp_k
## Min. : 0.00 Min. :294.6 Min. :294.9
## 1st Qu.: 9.80 1st Qu.:297.7 1st Qu.:298.3
## Median : 38.34 Median :298.6 Median :299.3
## Mean : 45.76 Mean :298.7 Mean :299.2
## 3rd Qu.: 70.23 3rd Qu.:299.8 3rd Qu.:300.2
## Max. :390.60 Max. :302.2 Max. :302.9
## NA's :13 NA's :10 NA's :10
## reanalysis_dew_point_temp_k reanalysis_max_air_temp_k
## Min. :289.6 Min. :297.8
## 1st Qu.:294.1 1st Qu.:301.0
## Median :295.6 Median :302.4
## Mean :295.2 Mean :303.4
## 3rd Qu.:296.5 3rd Qu.:305.5
## Max. :298.4 Max. :314.0
## NA's :10 NA's :10
## reanalysis_min_air_temp_k reanalysis_precip_amt_kg_per_m2
## Min. :286.9 Min. : 0.00
## 1st Qu.:293.9 1st Qu.: 13.05
## Median :296.2 Median : 27.25
## Mean :295.7 Mean : 40.15
## 3rd Qu.:297.9 3rd Qu.: 52.20
## Max. :299.9 Max. :570.50
## NA's :10 NA's :10
## reanalysis_relative_humidity_percent reanalysis_sat_precip_amt_mm
## Min. :57.79 Min. : 0.00
## 1st Qu.:77.18 1st Qu.: 9.80
## Median :80.30 Median : 38.34
## Mean :82.16 Mean : 45.76
## 3rd Qu.:86.36 3rd Qu.: 70.23
## Max. :98.61 Max. :390.60
## NA's :10 NA's :13
## reanalysis_specific_humidity_g_per_kg reanalysis_tdtr_k station_avg_temp_c
## Min. :11.72 Min. : 1.357 Min. :21.40
## 1st Qu.:15.56 1st Qu.: 2.329 1st Qu.:26.30
## Median :17.09 Median : 2.857 Median :27.41
## Mean :16.75 Mean : 4.904 Mean :27.19
## 3rd Qu.:17.98 3rd Qu.: 7.625 3rd Qu.:28.16
## Max. :20.46 Max. :16.029 Max. :30.80
## NA's :10 NA's :10 NA's :43
## station_diur_temp_rng_c station_max_temp_c station_min_temp_c
## Min. : 4.529 Min. :26.70 Min. :14.7
## 1st Qu.: 6.514 1st Qu.:31.10 1st Qu.:21.1
## Median : 7.300 Median :32.80 Median :22.2
## Mean : 8.059 Mean :32.45 Mean :22.1
## 3rd Qu.: 9.567 3rd Qu.:33.90 3rd Qu.:23.3
## Max. :15.800 Max. :42.20 Max. :25.6
## NA's :43 NA's :20 NA's :14
## station_precip_mm
## Min. : 0.00
## 1st Qu.: 8.70
## Median : 23.85
## Mean : 39.33
## 3rd Qu.: 53.90
## Max. :543.30
## NA's :22
While this gives me some nice descriptive statistics, it is rather unweildy. However, there is a fix to this! Both of these files have the same number of observations, and also have three of the same variables: city, year, and week of the year. This means that we can add these variables together by those three variables. Using the dplyer function left_join() is something new that I learned with this project.
train <- left_join(x = labels, y = features, by = c("year", "weekofyear", "city"))
From the summary of the two datasets before they were joined, we know that there are in fact NAs in the data. So, we need to get rid of those.
#descriptive stats
names(train)
## [1] "city"
## [2] "year"
## [3] "weekofyear"
## [4] "total_cases"
## [5] "week_start_date"
## [6] "ndvi_ne"
## [7] "ndvi_nw"
## [8] "ndvi_se"
## [9] "ndvi_sw"
## [10] "precipitation_amt_mm"
## [11] "reanalysis_air_temp_k"
## [12] "reanalysis_avg_temp_k"
## [13] "reanalysis_dew_point_temp_k"
## [14] "reanalysis_max_air_temp_k"
## [15] "reanalysis_min_air_temp_k"
## [16] "reanalysis_precip_amt_kg_per_m2"
## [17] "reanalysis_relative_humidity_percent"
## [18] "reanalysis_sat_precip_amt_mm"
## [19] "reanalysis_specific_humidity_g_per_kg"
## [20] "reanalysis_tdtr_k"
## [21] "station_avg_temp_c"
## [22] "station_diur_temp_rng_c"
## [23] "station_max_temp_c"
## [24] "station_min_temp_c"
## [25] "station_precip_mm"
summary(train)
## city year weekofyear total_cases
## Length:1456 Min. :1990 Min. : 1.00 Min. : 0.00
## Class :character 1st Qu.:1997 1st Qu.:13.75 1st Qu.: 5.00
## Mode :character Median :2002 Median :26.50 Median : 12.00
## Mean :2001 Mean :26.50 Mean : 24.68
## 3rd Qu.:2005 3rd Qu.:39.25 3rd Qu.: 28.00
## Max. :2010 Max. :53.00 Max. :461.00
##
## week_start_date ndvi_ne ndvi_nw ndvi_se
## Length:1456 Min. :-0.40625 Min. :-0.45610 Min. :-0.01553
## Class :character 1st Qu.: 0.04495 1st Qu.: 0.04922 1st Qu.: 0.15509
## Mode :character Median : 0.12882 Median : 0.12143 Median : 0.19605
## Mean : 0.14229 Mean : 0.13055 Mean : 0.20378
## 3rd Qu.: 0.24848 3rd Qu.: 0.21660 3rd Qu.: 0.24885
## Max. : 0.50836 Max. : 0.45443 Max. : 0.53831
## NA's :194 NA's :52 NA's :22
## ndvi_sw precipitation_amt_mm reanalysis_air_temp_k
## Min. :-0.06346 Min. : 0.00 Min. :294.6
## 1st Qu.: 0.14421 1st Qu.: 9.80 1st Qu.:297.7
## Median : 0.18945 Median : 38.34 Median :298.6
## Mean : 0.20231 Mean : 45.76 Mean :298.7
## 3rd Qu.: 0.24698 3rd Qu.: 70.23 3rd Qu.:299.8
## Max. : 0.54602 Max. :390.60 Max. :302.2
## NA's :22 NA's :13 NA's :10
## reanalysis_avg_temp_k reanalysis_dew_point_temp_k reanalysis_max_air_temp_k
## Min. :294.9 Min. :289.6 Min. :297.8
## 1st Qu.:298.3 1st Qu.:294.1 1st Qu.:301.0
## Median :299.3 Median :295.6 Median :302.4
## Mean :299.2 Mean :295.2 Mean :303.4
## 3rd Qu.:300.2 3rd Qu.:296.5 3rd Qu.:305.5
## Max. :302.9 Max. :298.4 Max. :314.0
## NA's :10 NA's :10 NA's :10
## reanalysis_min_air_temp_k reanalysis_precip_amt_kg_per_m2
## Min. :286.9 Min. : 0.00
## 1st Qu.:293.9 1st Qu.: 13.05
## Median :296.2 Median : 27.25
## Mean :295.7 Mean : 40.15
## 3rd Qu.:297.9 3rd Qu.: 52.20
## Max. :299.9 Max. :570.50
## NA's :10 NA's :10
## reanalysis_relative_humidity_percent reanalysis_sat_precip_amt_mm
## Min. :57.79 Min. : 0.00
## 1st Qu.:77.18 1st Qu.: 9.80
## Median :80.30 Median : 38.34
## Mean :82.16 Mean : 45.76
## 3rd Qu.:86.36 3rd Qu.: 70.23
## Max. :98.61 Max. :390.60
## NA's :10 NA's :13
## reanalysis_specific_humidity_g_per_kg reanalysis_tdtr_k station_avg_temp_c
## Min. :11.72 Min. : 1.357 Min. :21.40
## 1st Qu.:15.56 1st Qu.: 2.329 1st Qu.:26.30
## Median :17.09 Median : 2.857 Median :27.41
## Mean :16.75 Mean : 4.904 Mean :27.19
## 3rd Qu.:17.98 3rd Qu.: 7.625 3rd Qu.:28.16
## Max. :20.46 Max. :16.029 Max. :30.80
## NA's :10 NA's :10 NA's :43
## station_diur_temp_rng_c station_max_temp_c station_min_temp_c
## Min. : 4.529 Min. :26.70 Min. :14.7
## 1st Qu.: 6.514 1st Qu.:31.10 1st Qu.:21.1
## Median : 7.300 Median :32.80 Median :22.2
## Mean : 8.059 Mean :32.45 Mean :22.1
## 3rd Qu.: 9.567 3rd Qu.:33.90 3rd Qu.:23.3
## Max. :15.800 Max. :42.20 Max. :25.6
## NA's :43 NA's :20 NA's :14
## station_precip_mm
## Min. : 0.00
## 1st Qu.: 8.70
## Median : 23.85
## Mean : 39.33
## 3rd Qu.: 53.90
## Max. :543.30
## NA's :22
describe(train) ## what might be wrong with this?
## Warning in describe(train): NAs introduced by coercion
## Warning in describe(train): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## vars n mean sd median trimmed
## city* 1 1456 NaN NA NA NaN
## year 2 1456 2001.03 5.41 2002.00 2001.30
## weekofyear 3 1456 26.50 15.02 26.50 26.50
## total_cases 4 1456 24.68 43.60 12.00 15.97
## week_start_date* 5 1456 NaN NA NA NaN
## ndvi_ne 6 1262 0.14 0.14 0.13 0.14
## ndvi_nw 7 1404 0.13 0.12 0.12 0.13
## ndvi_se 8 1434 0.20 0.07 0.20 0.20
## ndvi_sw 9 1434 0.20 0.08 0.19 0.20
## precipitation_amt_mm 10 1443 45.76 43.72 38.34 40.16
## reanalysis_air_temp_k 11 1446 298.70 1.36 298.65 298.72
## reanalysis_avg_temp_k 12 1446 299.23 1.26 299.29 299.25
## reanalysis_dew_point_temp_k 13 1446 295.25 1.53 295.64 295.37
## reanalysis_max_air_temp_k 14 1446 303.43 3.23 302.40 303.10
## reanalysis_min_air_temp_k 15 1446 295.72 2.57 296.20 295.93
## reanalysis_precip_amt_kg_per_m2 16 1446 40.15 43.43 27.24 32.38
## reanalysis_relative_humidity_percent 17 1446 82.16 7.15 80.30 81.69
## reanalysis_sat_precip_amt_mm 18 1443 45.76 43.72 38.34 40.16
## reanalysis_specific_humidity_g_per_kg 19 1446 16.75 1.54 17.09 16.85
## reanalysis_tdtr_k 20 1446 4.90 3.55 2.86 4.36
## station_avg_temp_c 21 1413 27.19 1.29 27.41 27.27
## station_diur_temp_rng_c 22 1413 8.06 2.13 7.30 7.85
## station_max_temp_c 23 1436 32.45 1.96 32.80 32.52
## station_min_temp_c 24 1442 22.10 1.57 22.20 22.14
## station_precip_mm 25 1434 39.33 47.46 23.85 30.46
## mad min max range skew
## city* NA Inf -Inf -Inf NA
## year 5.93 1990.00 2010.00 20.00 -0.40
## weekofyear 19.27 1.00 53.00 52.00 0.00
## total_cases 13.34 0.00 461.00 461.00 5.26
## week_start_date* NA Inf -Inf -Inf NA
## ndvi_ne 0.15 -0.41 0.51 0.91 -0.11
## ndvi_nw 0.12 -0.46 0.45 0.91 -0.01
## ndvi_se 0.07 -0.02 0.54 0.55 0.57
## ndvi_sw 0.07 -0.06 0.55 0.61 0.75
## precipitation_amt_mm 44.23 0.00 390.60 390.60 1.73
## reanalysis_air_temp_k 1.57 294.64 302.20 7.56 -0.08
## reanalysis_avg_temp_k 1.43 294.89 302.93 8.04 -0.19
## reanalysis_dew_point_temp_k 1.52 289.64 298.45 8.81 -0.72
## reanalysis_max_air_temp_k 2.67 297.80 314.00 16.20 0.85
## reanalysis_min_air_temp_k 2.82 286.90 299.90 13.00 -0.67
## reanalysis_precip_amt_kg_per_m2 25.12 0.00 570.50 570.50 3.38
## reanalysis_relative_humidity_percent 5.68 57.79 98.61 40.82 0.57
## reanalysis_sat_precip_amt_mm 44.23 0.00 390.60 390.60 1.73
## reanalysis_specific_humidity_g_per_kg 1.63 11.72 20.46 8.75 -0.54
## reanalysis_tdtr_k 1.16 1.36 16.03 14.67 1.07
## station_avg_temp_c 1.28 21.40 30.80 9.40 -0.57
## station_diur_temp_rng_c 1.63 4.53 15.80 11.27 0.84
## station_max_temp_c 1.63 26.70 42.20 15.50 -0.26
## station_min_temp_c 1.63 14.70 25.60 10.90 -0.31
## station_precip_mm 26.76 0.00 543.30 543.30 2.98
## kurtosis se
## city* NA NA
## year -0.89 0.14
## weekofyear -1.20 0.39
## total_cases 36.33 1.14
## week_start_date* NA NA
## ndvi_ne -0.14 0.00
## ndvi_nw 0.05 0.00
## ndvi_se 0.56 0.00
## ndvi_sw 0.70 0.00
## precipitation_amt_mm 6.74 1.15
## reanalysis_air_temp_k -0.69 0.04
## reanalysis_avg_temp_k -0.54 0.03
## reanalysis_dew_point_temp_k -0.12 0.04
## reanalysis_max_air_temp_k -0.19 0.09
## reanalysis_min_air_temp_k -0.22 0.07
## reanalysis_precip_amt_kg_per_m2 22.12 1.14
## reanalysis_relative_humidity_percent -0.40 0.19
## reanalysis_sat_precip_amt_mm 6.74 1.15
## reanalysis_specific_humidity_g_per_kg -0.49 0.04
## reanalysis_tdtr_k -0.21 0.09
## station_avg_temp_c -0.16 0.03
## station_diur_temp_rng_c -0.26 0.06
## station_max_temp_c 0.21 0.05
## station_min_temp_c 0.21 0.04
## station_precip_mm 15.29 1.25
Next part of my EDA, I want to look at how the data might be correlated (or not).
library(corrplot)
co <- cor(train[6:25])
co <- round(co,2)
corrplot(co, tl.cex = 0.5)
There are a fair amount of strong positive correlations and less negative correlations.
Next I want to look at the data for each month. To do this, I first convert the start_of_week variable, which is given as chr, and convert that to a date format. Once in the date format, I can split this into “%m”, or change in months, and finally convert the character that this gives me into numbers.
train$week_start_date <- as.Date(train$week_start_date, "%m/%d/%Y")
train$Month <- format(train$week_start_date, "%m")
train$Month <- as.numeric(train$Month)
This, along with the already present weekofyear and year variables, enables me to make some visuals to explore.
#cases by week
fig3 <- ggplot(data = train, aes(x=weekofyear, y=total_cases, fill = city)) +
geom_bar(stat = "identity") +
facet_wrap(.~city) +
ggtitle("Cases of Dengue by Week in Iquitos (IQ) and San Juan (SJ)")
fig3
#cases by month
fig1 <- ggplot(data = train, aes(x=Month, y=total_cases, fill = city)) +
geom_bar(stat = "identity") +
facet_wrap(.~city) +
ggtitle("Cases of Dengue by Month in Iquitos (IQ) and San Juan (SJ)")
fig1
#cases by year
fig2 <- ggplot(data = train, aes(x = year, y = total_cases, fill = city)) +
geom_bar(stat = "identity") +
facet_wrap(.~city) +
ggtitle("Cases of Dengue by Year in Iquitos (IQ) and San Juan (SJ)")
fig2
These visuals show me the the dips and spikes for cases is different between the two cities. This makes sense from the literature review, that while similar types of models will be good for forecasting this data, the specific area will cause the parameters of the models to diverge. So, I want to split the data into an Iquitos set and a San Juan set. From now on I will most likely try the same models on both sets but I’m predicting that different parameters will fit each city.
train.sj <- train[1:936,]
train.iq <- train[937:1456,]
Now that these have been split up, let’s do some descriptive statistics for these separately now and see what kind of picture that paints.
summary(train.sj)
## city year weekofyear total_cases
## Length:936 Min. :1990 Min. : 1.00 Min. : 0.00
## Class :character 1st Qu.:1994 1st Qu.:13.75 1st Qu.: 9.00
## Mode :character Median :1999 Median :26.50 Median : 19.00
## Mean :1999 Mean :26.50 Mean : 34.18
## 3rd Qu.:2003 3rd Qu.:39.25 3rd Qu.: 37.00
## Max. :2008 Max. :53.00 Max. :461.00
##
## week_start_date ndvi_ne ndvi_nw ndvi_se
## Min. :1990-04-30 Min. :-0.40625 Min. :-0.45610 Min. :-0.01553
## 1st Qu.:1994-10-27 1st Qu.: 0.00450 1st Qu.: 0.01642 1st Qu.: 0.13928
## Median :1999-04-26 Median : 0.05770 Median : 0.06807 Median : 0.17719
## Mean :1999-04-26 Mean : 0.05792 Mean : 0.06747 Mean : 0.17766
## 3rd Qu.:2003-10-23 3rd Qu.: 0.11110 3rd Qu.: 0.11520 3rd Qu.: 0.21256
## Max. :2008-04-22 Max. : 0.49340 Max. : 0.43710 Max. : 0.39313
## NA's :191 NA's :49 NA's :19
## ndvi_sw precipitation_amt_mm reanalysis_air_temp_k
## Min. :-0.06346 Min. : 0.00 Min. :295.9
## 1st Qu.: 0.12916 1st Qu.: 0.00 1st Qu.:298.2
## Median : 0.16597 Median : 20.80 Median :299.3
## Mean : 0.16596 Mean : 35.47 Mean :299.2
## 3rd Qu.: 0.20277 3rd Qu.: 52.18 3rd Qu.:300.1
## Max. : 0.38142 Max. :390.60 Max. :302.2
## NA's :19 NA's :9 NA's :6
## reanalysis_avg_temp_k reanalysis_dew_point_temp_k reanalysis_max_air_temp_k
## Min. :296.1 Min. :289.6 Min. :297.8
## 1st Qu.:298.3 1st Qu.:293.8 1st Qu.:300.4
## Median :299.4 Median :295.5 Median :301.5
## Mean :299.3 Mean :295.1 Mean :301.4
## 3rd Qu.:300.2 3rd Qu.:296.4 3rd Qu.:302.4
## Max. :302.2 Max. :297.8 Max. :304.3
## NA's :6 NA's :6 NA's :6
## reanalysis_min_air_temp_k reanalysis_precip_amt_kg_per_m2
## Min. :292.6 Min. : 0.00
## 1st Qu.:296.3 1st Qu.: 10.82
## Median :297.5 Median : 21.30
## Mean :297.3 Mean : 30.47
## 3rd Qu.:298.4 3rd Qu.: 37.00
## Max. :299.9 Max. :570.50
## NA's :6 NA's :6
## reanalysis_relative_humidity_percent reanalysis_sat_precip_amt_mm
## Min. :66.74 Min. : 0.00
## 1st Qu.:76.25 1st Qu.: 0.00
## Median :78.67 Median : 20.80
## Mean :78.57 Mean : 35.47
## 3rd Qu.:80.96 3rd Qu.: 52.18
## Max. :87.58 Max. :390.60
## NA's :6 NA's :9
## reanalysis_specific_humidity_g_per_kg reanalysis_tdtr_k station_avg_temp_c
## Min. :11.72 Min. :1.357 Min. :22.84
## 1st Qu.:15.24 1st Qu.:2.157 1st Qu.:25.84
## Median :16.85 Median :2.457 Median :27.23
## Mean :16.55 Mean :2.516 Mean :27.01
## 3rd Qu.:17.86 3rd Qu.:2.800 3rd Qu.:28.19
## Max. :19.44 Max. :4.429 Max. :30.07
## NA's :6 NA's :6 NA's :6
## station_diur_temp_rng_c station_max_temp_c station_min_temp_c
## Min. :4.529 Min. :26.70 Min. :17.8
## 1st Qu.:6.200 1st Qu.:30.60 1st Qu.:21.7
## Median :6.757 Median :31.70 Median :22.8
## Mean :6.757 Mean :31.61 Mean :22.6
## 3rd Qu.:7.286 3rd Qu.:32.80 3rd Qu.:23.9
## Max. :9.914 Max. :35.60 Max. :25.6
## NA's :6 NA's :6 NA's :6
## station_precip_mm Month
## Min. : 0.000 Min. : 1.000
## 1st Qu.: 6.825 1st Qu.: 3.750
## Median : 17.750 Median : 6.500
## Mean : 26.785 Mean : 6.419
## 3rd Qu.: 35.450 3rd Qu.: 9.000
## Max. :305.900 Max. :12.000
## NA's :6
summary(train.iq)
## city year weekofyear total_cases
## Length:520 Min. :2000 Min. : 1.00 Min. : 0.000
## Class :character 1st Qu.:2003 1st Qu.:13.75 1st Qu.: 1.000
## Mode :character Median :2005 Median :26.50 Median : 5.000
## Mean :2005 Mean :26.50 Mean : 7.565
## 3rd Qu.:2007 3rd Qu.:39.25 3rd Qu.: 9.000
## Max. :2010 Max. :53.00 Max. :116.000
##
## week_start_date ndvi_ne ndvi_nw ndvi_se
## Min. :2000-07-01 Min. :0.06173 Min. :0.03586 Min. :0.02988
## 1st Qu.:2002-12-30 1st Qu.:0.20000 1st Qu.:0.17954 1st Qu.:0.19474
## Median :2005-06-28 Median :0.26364 Median :0.23297 Median :0.24980
## Mean :2005-06-28 Mean :0.26387 Mean :0.23878 Mean :0.25013
## 3rd Qu.:2007-12-26 3rd Qu.:0.31997 3rd Qu.:0.29393 3rd Qu.:0.30230
## Max. :2010-06-25 Max. :0.50836 Max. :0.45443 Max. :0.53831
## NA's :3 NA's :3 NA's :3
## ndvi_sw precipitation_amt_mm reanalysis_air_temp_k
## Min. :0.06418 Min. : 0.00 Min. :294.6
## 1st Qu.:0.20413 1st Qu.: 39.10 1st Qu.:297.1
## Median :0.26214 Median : 60.47 Median :297.8
## Mean :0.26678 Mean : 64.25 Mean :297.9
## 3rd Qu.:0.32515 3rd Qu.: 85.76 3rd Qu.:298.6
## Max. :0.54602 Max. :210.83 Max. :301.6
## NA's :3 NA's :4 NA's :4
## reanalysis_avg_temp_k reanalysis_dew_point_temp_k reanalysis_max_air_temp_k
## Min. :294.9 Min. :290.1 Min. :300.0
## 1st Qu.:298.2 1st Qu.:294.6 1st Qu.:305.2
## Median :299.1 Median :295.9 Median :307.1
## Mean :299.1 Mean :295.5 Mean :307.1
## 3rd Qu.:300.1 3rd Qu.:296.5 3rd Qu.:308.7
## Max. :302.9 Max. :298.4 Max. :314.0
## NA's :4 NA's :4 NA's :4
## reanalysis_min_air_temp_k reanalysis_precip_amt_kg_per_m2
## Min. :286.9 Min. : 0.00
## 1st Qu.:292.0 1st Qu.: 24.07
## Median :293.1 Median : 46.44
## Mean :292.9 Mean : 57.61
## 3rd Qu.:294.2 3rd Qu.: 71.07
## Max. :296.0 Max. :362.03
## NA's :4 NA's :4
## reanalysis_relative_humidity_percent reanalysis_sat_precip_amt_mm
## Min. :57.79 Min. : 0.00
## 1st Qu.:84.30 1st Qu.: 39.10
## Median :90.92 Median : 60.47
## Mean :88.64 Mean : 64.25
## 3rd Qu.:94.56 3rd Qu.: 85.76
## Max. :98.61 Max. :210.83
## NA's :4 NA's :4
## reanalysis_specific_humidity_g_per_kg reanalysis_tdtr_k station_avg_temp_c
## Min. :12.11 Min. : 3.714 Min. :21.40
## 1st Qu.:16.10 1st Qu.: 7.371 1st Qu.:27.00
## Median :17.43 Median : 8.964 Median :27.60
## Mean :17.10 Mean : 9.207 Mean :27.53
## 3rd Qu.:18.18 3rd Qu.:11.014 3rd Qu.:28.10
## Max. :20.46 Max. :16.029 Max. :30.80
## NA's :4 NA's :4 NA's :37
## station_diur_temp_rng_c station_max_temp_c station_min_temp_c
## Min. : 5.20 Min. :30.1 Min. :14.7
## 1st Qu.: 9.50 1st Qu.:33.2 1st Qu.:20.6
## Median :10.62 Median :34.0 Median :21.3
## Mean :10.57 Mean :34.0 Mean :21.2
## 3rd Qu.:11.65 3rd Qu.:34.9 3rd Qu.:22.0
## Max. :15.80 Max. :42.2 Max. :24.2
## NA's :37 NA's :14 NA's :8
## station_precip_mm Month
## Min. : 0.00 Min. : 1.000
## 1st Qu.: 17.20 1st Qu.: 3.750
## Median : 45.30 Median : 6.500
## Mean : 62.47 Mean : 6.417
## 3rd Qu.: 85.95 3rd Qu.: 9.000
## Max. :543.30 Max. :12.000
## NA's :16
fig4 <- ggplot(data = train.sj,
aes(total_cases)) +
geom_histogram(fill = "orange", bins = 30) +
ggtitle("Histogram of Cases in San Juan")
fig4
fig5 <- ggplot(data = train.iq,
aes(total_cases)) +
geom_histogram(fill = "pink", bins = 30) +
ggtitle("Histogram of Cases in Iquitos")
fig5
And now create some time plots for each city. I can’t overlay them because they do happen at different times, which is why it doesn’t make much sense to even really do a single time series for the train data.
ts.sj <- ts(train.sj, start = c(1990, 4, 30), end = c(2004, 10, 28), frequency = 52)
ts.iq <- ts(train.iq, start = c(2005, 11, 11), end = c(2010, 6, 25), frequency = 52)
autoplot(ts.sj[,4], main = "Total Cases San Juan")
autoplot(ts.iq[,4], main = "Total Cases Iquitos")
Now that I have these set up as time series, I want to decompose both of these to get an idea of what the trend and seasonality may look like before building any models for them.
ts.sj[,4] %>%
decompose(type = c("additive")) %>%
autoplot() +
ggtitle("Classical Additive Decomposition of San Jose Train Data")
## Warning: attributes are not identical across measure variables;
## they will be dropped
## Warning: Removed 104 row(s) containing missing values (geom_path).
ts.iq[,4] %>%
decompose(type = c("additive")) %>%
autoplot() +
ggtitle("Classical Additive Decomposition of Iquitos Train Data")
## Warning: attributes are not identical across measure variables;
## they will be dropped
## Warning: Removed 104 row(s) containing missing values (geom_path).
This doesn’t seem very clear to me, so when I build an ETS model, I will be interested to see what it chooses (additive vs. multiplicative). What this does make clear for me is that there is for sure a strong seasonality but not a strong trend in both cities’ data.
Also, while it’s important to know what the total cases were over time, it’s also important to understand the climate and weather throughout the years as well. I’ll now try to focus on visualizing climate variables before jumping in to modelling data. I’m going to focus on the NOAA’s NCEP Climate Forecast System Reanalysis measurements for my climate variables.
autoplot(ts.sj[,11], main = "Temperature in San Juan", series = "Mean Air Temp") + #average, max, and min air temp sj
autolayer(ts.sj[,14], series = "Max Air Temp") +
autolayer(ts.sj[,15], series = "Min Air Temp")
autoplot(ts.iq[,11], main = "Temperature in Iquitos", series = "Mean Air Temp") + #average, max, and min air temp iq
autolayer(ts.iq[,14], series = "Max Air Temp") +
autolayer(ts.iq[,15], series = "Min Air Temp")
Here I am comparing the differences in the temperatures between the two cities. I find it notable that the maximum and minimum are much closer to the average in San Juan than they are in Iquitos.
round(mean(ts.sj[,20]),2) #average diurnal temp range
## [1] NA
round(mean(ts.iq[,20]),2) #average diurnal temp range
## [1] NA
Additionally, San Juan has a large spike in max and large dip in min towards the end of the data set. I’m not sure wha the explanation for that is right now.
Next, I move on to looking at precipitation.
autoplot(ts.sj[,18], #total precipitation in mm
main = "Total Precipitation in San Juan",
xlab = "Time",
ylab = "Precipitation (mm)")
autoplot(ts.iq[,18], #average precipitation in mm iq
main = "Total Precipitation in Iquitos",
xlab = "Time",
ylab = "Precipitation (mm)")
The final variable I’m interested in is humidity.
autoplot(ts.sj[,17], #relative humidity sj
main = "Relative Humidity in San Juan",
xlab = "Time",
ylab = "Relative Humidity (%)")
autoplot(ts.iq[,17], #relative humidity iq
main = "Relative Humidity in Iquitos",
xlab = "Time",
ylab = "Relative Humidity (%)")
I’m realizing now, however, that before I start building any models, that I probably want to make sure that my test data is good to go so that I can also look at the accuracy of the forecasts going forward.
However, I’m not really sure how I messed up splitting the cities data sets in the beginning, but I’m having a hard time findind everything in one place. I’m going to try to just use the first training set to try to set myself straight. However, I’m realizing that it most likely have to do with the fact that I just omitted NA values, which is now messing with my dates. I’m not really sure of how to deal with this, either, so I’m going to see what I can do with the data that I have
#using window to separate original time series
sj.train <- window(ts.sj, end = c(1999,12))
sj.test <- window(ts.sj, start = c(1999,13))
autoplot(sj.train[,4]) + autolayer(sj.test[,4])
autoplot(ts.sj[,4])
iq.train <- window(ts.iq, end = c(2008,27))
iq.test <- window(ts.iq, start = c(2008,28))
autoplot(iq.train[,4]) + autolayer(iq.test[,4])
autoplot(ts.iq[,4])
It’s a little bit rough, but fingers crossed it will give me something to work with going forward.
Since the literature review just loved SARIMA, I’m going to start there with auto.arima and see what I get back
iq.mod1 <- auto.arima(iq.train[,4]) #auto arima
summary(iq.mod1)
## Series: iq.train[, 4]
## ARIMA(0,1,2)
##
## Coefficients:
## ma1 ma2
## -0.5516 0.2021
## s.e. 0.0728 0.0720
##
## sigma^2 estimated as 15.38: log likelihood=-478.26
## AIC=962.51 AICc=962.66 BIC=971.96
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.01768816 3.887614 2.114454 NaN Inf 0.226415 -0.01503691
fc_iqmod1 <- forecast(iq.mod1, h = 5)
autoplot(fc_iqmod1)
## Warning: `filter_()` is deprecated as of dplyr 0.7.0.
## Please use `filter()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
accuracy(fc_iqmod1,iq.test[1:5,4])
## ME RMSE MAE MPE MAPE MASE
## Training set 0.01768816 3.8876143 2.1144544 NaN Inf 0.8827334
## Test set 0.45847457 0.9584038 0.8546684 2.107743 41.72712 0.3568033
## ACF1
## Training set -0.01503691
## Test set NA
My auto.arima did find some seasonality in the total cases in Iquitos, which agrees with the literature review. The model fits the training data slightly better than the test data, but we won’t really know what any of this means until we have other models to compare this to.
The auto.arima didn’t have any seasonal integration, so I want to try to add that as well.
iq.mod2 <- arima(iq.train[,4], order = c(2,0,2), seasonal = list(order = c(1,0,1))) #arima
fc_iqmod2 <- forecast(iq.mod2, h = 5)
autoplot(fc_iqmod2)
accuracy(fc_iqmod2, iq.test[1:5,4])
## ME RMSE MAE MPE MAPE MASE
## Training set 0.04732523 3.8207655 2.238910 -Inf Inf 0.9346905
## Test set -0.17454568 0.9890804 0.887075 -29.89234 53.64332 0.3703323
## ACF1
## Training set -0.0003072499
## Test set NA
This model fits thr training data a little bit better than the last model, but fits the testing data worse. I think between these two models, I would want to go with something that fit the test data better. So, I think that I would stick with the auto.arima model.
Let’s try some models that weren’t talked about in the literature review. In particular, I want to try neural networks and VAR models. I’m going to start with a neural network and see how that goes.
For some reason rmarkdown really doesn’t like this code, even though it is fully functional in my regular script. So, here is the code:
iq.mod3 <- nnetar(iq.train[,4]) #neural network fc_iqmod3 <- forecast(iq.mod3, h = 5) autoplot(fc_iqmod3) accuracy(fc_iqmod3, iq.test[1:5,4])
and here are the accuracy results:
ME RMSE MAE MPE MAPE MASE ACF1
Training set -0.03066757 5.061519 3.378104 -Inf Inf 0.6580225 0.006958621 Test set 5.93589429 8.310183 5.935894 37.79275 37.79275 1.1562557 NA
the neural network does the best so far at fitting the training data. However, it doesn’t do any better than the auto.arima model at fitting the test data, so the auto.arima model still sticks out as the best model.
Finally, I’m going to try doing a VAR model to see how adding climate variables improves (or not) the forecast of total cases. Before running a VAR model, I’m actually going to run a regression model first. I’m a little bit unsure if running a VAR model is even really necessary, since I’m not concern about how much the cases of dengue affects climate because, well, it won’t. So here’s a regression model real quick. It will be with average air temp [,12], total precipitation [,18], diurnal temp range [,20], and relative humidity [,17].
iq.mod4 <- tslm(iq.train[,4] ~ iq.train[,12] + iq.train[,17] + iq.train[,18] + iq.train[,20])
summary(iq.mod4)
##
## Call:
## tslm(formula = iq.train[, 4] ~ iq.train[, 12] + iq.train[, 17] +
## iq.train[, 18] + iq.train[, 20])
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.490 -4.500 -2.651 2.694 30.480
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -460.21258 156.48833 -2.941 0.00373 **
## iq.train[, 12] 1.53858 0.51916 2.964 0.00348 **
## iq.train[, 17] 0.14026 0.14954 0.938 0.34965
## iq.train[, 18] -0.02481 0.01968 -1.261 0.20921
## iq.train[, 20] -0.59963 0.47698 -1.257 0.21045
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.326 on 168 degrees of freedom
## Multiple R-squared: 0.0686, Adjusted R-squared: 0.04642
## F-statistic: 3.093 on 4 and 168 DF, p-value: 0.01729
The only two results that have any statistical significance are average air temperature and relative humidity. HOwever, the greatest magnitude effects were from average temperature and diurnal temperature range. Here I think we see clearly the difference between economic significance and statistical significance. I’m still not sure what I really want to do with this except maybe look at how these results compare to San Juan. However, since I have identified at least one variable that is statistically significant and economically significant, I will use the average temperature to build a VAR model.
iq.cases <- iq.train[,4]
iq.avg.temp <- iq.train[,12]
iq.for.VAR <- ts.intersect(iq.cases, iq.avg.temp, dframe = FALSE)
VARselect(iq.for.VAR, type = c("const"))
## $selection
## AIC(n) HQ(n) SC(n) FPE(n)
## 2 2 2 2
##
## $criteria
## 1 2 3 4 5 6 7
## AIC(n) 3.282055 3.065013 3.076177 3.085667 3.125644 3.159825 3.174662
## HQ(n) 3.328289 3.142070 3.184057 3.224369 3.295170 3.360173 3.405833
## SC(n) 3.395935 3.254814 3.341898 3.427308 3.543206 3.653307 3.744064
## FPE(n) 26.630656 21.435571 21.677673 21.886969 22.783925 23.582487 23.943739
## 8 9 10
## AIC(n) 3.219592 3.262189 3.223388
## HQ(n) 3.481586 3.555006 3.547028
## SC(n) 3.864914 3.983432 4.020551
## FPE(n) 25.056032 26.162272 25.185499
I created a new time series just to make it easier on myself. I used VARselect to determine the number of lags that I want to use. Looks like I need to build a model with 3 lags as well as a model with 2 lags. Since there really isn’t a trend in this data, I decided that I would just have my type be constant.
iq.mod5 <- VAR(iq.for.VAR, p = 2, type = c("const")) #VAR model with lag 2
summary(iq.mod5)
##
## VAR Estimation Results:
## =========================
## Endogenous variables: iq.cases, iq.avg.temp
## Deterministic variables: const
## Sample size: 171
## Log Likelihood: -732.171
## Roots of the characteristic polynomial:
## 0.9296 0.7614 0.4937 0.1942
## Call:
## VAR(y = iq.for.VAR, p = 2, type = c("const"))
##
##
## Estimation results for equation iq.cases:
## =========================================
## iq.cases = iq.cases.l1 + iq.avg.temp.l1 + iq.cases.l2 + iq.avg.temp.l2 + const
##
## Estimate Std. Error t value Pr(>|t|)
## iq.cases.l1 0.43039 0.06954 6.189 4.57e-09 ***
## iq.avg.temp.l1 0.22954 0.27188 0.844 0.400
## iq.cases.l2 0.46789 0.06961 6.721 2.76e-10 ***
## iq.avg.temp.l2 -0.11090 0.27061 -0.410 0.682
## const -34.97444 66.42434 -0.527 0.599
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
##
## Residual standard error: 3.922 on 166 degrees of freedom
## Multiple R-Squared: 0.735, Adjusted R-squared: 0.7286
## F-statistic: 115.1 on 4 and 166 DF, p-value: < 2.2e-16
##
##
## Estimation results for equation iq.avg.temp:
## ============================================
## iq.avg.temp = iq.cases.l1 + iq.avg.temp.l1 + iq.cases.l2 + iq.avg.temp.l2 + const
##
## Estimate Std. Error t value Pr(>|t|)
## iq.cases.l1 -0.01905 0.01993 -0.956 0.340
## iq.avg.temp.l1 0.57280 0.07791 7.352 8.46e-12 ***
## iq.cases.l2 0.01069 0.01995 0.536 0.593
## iq.avg.temp.l2 0.14246 0.07754 1.837 0.068 .
## const 85.18752 19.03405 4.476 1.41e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
##
## Residual standard error: 1.124 on 166 degrees of freedom
## Multiple R-Squared: 0.4487, Adjusted R-squared: 0.4354
## F-statistic: 33.77 on 4 and 166 DF, p-value: < 2.2e-16
##
##
##
## Covariance matrix of residuals:
## iq.cases iq.avg.temp
## iq.cases 15.3823 0.6166
## iq.avg.temp 0.6166 1.2631
##
## Correlation matrix of residuals:
## iq.cases iq.avg.temp
## iq.cases 1.0000 0.1399
## iq.avg.temp 0.1399 1.0000
fc_iqmod5 <- forecast(iq.mod5, h = 5)
autoplot(fc_iqmod5)
accuracy(fc_iqmod5$forecast$iq.cases, iq.test[1:5,4])
## ME RMSE MAE MPE MAPE MASE
## Training set 5.978297e-17 3.864257 2.3036659 NaN Inf 0.9617246
## Test set -5.004923e-01 1.096881 0.8154455 -48.17555 58.67399 0.3404287
## ACF1
## Training set -0.04739087
## Test set NA
Doesn’t really perform a whole lot different than my auto.arima model. However, I still need to try again with a lag = 3
iq.mod6 <- VAR(iq.for.VAR, p = 3, type = c("const")) #VAR model with lag 3
summary(iq.mod6)
##
## VAR Estimation Results:
## =========================
## Endogenous variables: iq.cases, iq.avg.temp
## Deterministic variables: const
## Sample size: 170
## Log Likelihood: -725.329
## Roots of the characteristic polynomial:
## 0.9372 0.8388 0.4218 0.4218 0.3131 0.3131
## Call:
## VAR(y = iq.for.VAR, p = 3, type = c("const"))
##
##
## Estimation results for equation iq.cases:
## =========================================
## iq.cases = iq.cases.l1 + iq.avg.temp.l1 + iq.cases.l2 + iq.avg.temp.l2 + iq.cases.l3 + iq.avg.temp.l3 + const
##
## Estimate Std. Error t value Pr(>|t|)
## iq.cases.l1 0.38399 0.07854 4.889 2.41e-06 ***
## iq.avg.temp.l1 0.20406 0.27527 0.741 0.460
## iq.cases.l2 0.42327 0.07797 5.429 2.02e-07 ***
## iq.avg.temp.l2 -0.18902 0.31436 -0.601 0.548
## iq.cases.l3 0.09932 0.07879 1.261 0.209
## iq.avg.temp.l3 0.18785 0.27446 0.684 0.495
## const -60.19705 70.86262 -0.849 0.397
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
##
## Residual standard error: 3.93 on 163 degrees of freedom
## Multiple R-Squared: 0.7381, Adjusted R-squared: 0.7285
## F-statistic: 76.58 on 6 and 163 DF, p-value: < 2.2e-16
##
##
## Estimation results for equation iq.avg.temp:
## ============================================
## iq.avg.temp = iq.cases.l1 + iq.avg.temp.l1 + iq.cases.l2 + iq.avg.temp.l2 + iq.cases.l3 + iq.avg.temp.l3 + const
##
## Estimate Std. Error t value Pr(>|t|)
## iq.cases.l1 -0.021799 0.022328 -0.976 0.330368
## iq.avg.temp.l1 0.550733 0.078254 7.038 5.15e-11 ***
## iq.cases.l2 0.007957 0.022164 0.359 0.720075
## iq.avg.temp.l2 0.060518 0.089368 0.677 0.499251
## iq.cases.l3 0.004462 0.022399 0.199 0.842330
## iq.avg.temp.l3 0.146484 0.078025 1.877 0.062250 .
## const 72.499565 20.144902 3.599 0.000424 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
##
## Residual standard error: 1.117 on 163 degrees of freedom
## Multiple R-Squared: 0.4614, Adjusted R-squared: 0.4416
## F-statistic: 23.27 on 6 and 163 DF, p-value: < 2.2e-16
##
##
##
## Covariance matrix of residuals:
## iq.cases iq.avg.temp
## iq.cases 15.4438 0.5734
## iq.avg.temp 0.5734 1.2481
##
## Correlation matrix of residuals:
## iq.cases iq.avg.temp
## iq.cases 1.0000 0.1306
## iq.avg.temp 0.1306 1.0000
fc_iqmod6 <- forecast(iq.mod6, h = 5)
autoplot(fc_iqmod6)
accuracy(fc_iqmod6$forecast$iq.cases, iq.test[1:5,4])
## ME RMSE MAE MPE MAPE MASE
## Training set 1.743474e-16 3.848102 2.286984 NaN Inf 0.9547601
## Test set -4.631348e-01 1.150401 0.893939 -47.80478 62.16492 0.3731978
## ACF1
## Training set 0.01410457
## Test set NA
I think that this may in fact have been the closest to modelling the train and test data that any of my models have actually come, which is very exciting.
I know that they’re already above, but I’m going to put all of my accuracy “tables” in one place right here:
accuracy(fc_iqmod1,iq.test[1:5,4]) #auto.arima
## ME RMSE MAE MPE MAPE MASE
## Training set 0.01768816 3.8876143 2.1144544 NaN Inf 0.8827334
## Test set 0.45847457 0.9584038 0.8546684 2.107743 41.72712 0.3568033
## ACF1
## Training set -0.01503691
## Test set NA
accuracy(fc_iqmod2, iq.test[1:5,4]) #SARIMA
## ME RMSE MAE MPE MAPE MASE
## Training set 0.04732523 3.8207655 2.238910 -Inf Inf 0.9346905
## Test set -0.17454568 0.9890804 0.887075 -29.89234 53.64332 0.3703323
## ACF1
## Training set -0.0003072499
## Test set NA
accuracy(fc_iqmod5$forecast$iq.cases, iq.test[1:5,4]) #VAR lag 2
## ME RMSE MAE MPE MAPE MASE
## Training set 5.978297e-17 3.864257 2.3036659 NaN Inf 0.9617246
## Test set -5.004923e-01 1.096881 0.8154455 -48.17555 58.67399 0.3404287
## ACF1
## Training set -0.04739087
## Test set NA
accuracy(fc_iqmod6$forecast$iq.cases, iq.test[1:5,4]) #VAR lag 3
## ME RMSE MAE MPE MAPE MASE
## Training set 1.743474e-16 3.848102 2.286984 NaN Inf 0.9547601
## Test set -4.631348e-01 1.150401 0.893939 -47.80478 62.16492 0.3731978
## ACF1
## Training set 0.01410457
## Test set NA
ME RMSE MAE MPE MAPE MASE ACF1
Training set -0.03066757 5.061519 3.378104 -Inf Inf 0.6580225 0.006958621 Test set 5.93589429 8.310183 5.935894 37.79275 37.79275 1.1562557 NA
the last set being my neural net which wouldn’t go through markdown. I think that, looking at all of these metrics together, that the best fit model for Iquitos is, like the literature review said, an ARIMA model. My neural network does have the lowest RMSE for training data, but the auto.arima model has the lowest RMSE for the test set, and like it says in the instructions on datadriven, RMSE is the mettric by which we are judging.
Following the same procedure now for San Juan. Starting with an ARIMA model
sj.mod1 <- auto.arima(sj.train[,4])
summary(sj.mod1)
## Series: sj.train[, 4]
## ARIMA(2,0,1) with non-zero mean
##
## Coefficients:
## ar1 ar2 ma1 mean
## 1.8413 -0.8584 -0.7471 46.7680
## s.e. 0.0569 0.0539 0.0793 10.7979
##
## sigma^2 estimated as 262.7: log likelihood=-2005.02
## AIC=4020.04 AICc=4020.16 BIC=4040.87
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.04574525 16.14103 9.884084 -15.96648 34.09738 0.2007904
## ACF1
## Training set 0.05126605
checkresiduals(sj.mod1)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,0,1) with non-zero mean
## Q* = 67.835, df = 91, p-value = 0.967
##
## Model df: 4. Total lags used: 95
fc_sjmod1 <- forecast(sj.mod1, h = 5)
autoplot(fc_sjmod1)
accuracy(fc_sjmod1, sj.test[1:5,4])
## ME RMSE MAE MPE MAPE MASE
## Training set 0.04574525 16.14103 9.884084 -15.96648 34.09738 0.9787443
## Test set 4.47025234 6.53345 4.792105 21.12432 23.30603 0.4745250
## ACF1
## Training set 0.05126605
## Test set NA
Interestingly, auto.arima didn’t include any seasonality. This might be something that I want to try, especially given the support of the literature that I found. Interestingly enough, this fits the test data better than it fits the training data. Good sign? The other positive about this forecast is that the ARIMA model showed only one spike in the ACF out of the permissible range, and the residuals (while there were outliers) are centered around 0 and normally distributed.
I’m going to take another crack at an ARIMA model and this time add some seasonality to it.
sj.mod2 <- arima(sj.train[,4], order = c(2,1,2), seasonal = list(order = c(1,1,1)))
checkresiduals(sj.mod2)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,1,2)(1,1,1)[52]
## Q* = 91.256, df = 89, p-value = 0.4139
##
## Model df: 6. Total lags used: 95
fc_sjmod2 <- forecast(sj.mod2, h = 5)
autoplot(fc_sjmod2)
accuracy(fc_sjmod2, sj.test[1:5,4])
## ME RMSE MAE MPE MAPE MASE
## Training set -0.1038928 16.314622 10.149330 -6.390530 32.23370 1.005010
## Test set -1.7616603 7.823223 7.066139 -5.245001 38.78016 0.699705
## ACF1
## Training set 0.04332763
## Test set NA
My RMSE is better in this model for the training data but worse for the test data. And again, but MASE is also better for the training data, but not for the testing data. For the training data, my ACF is better for this model than my other ARIMA model by auto.arima.
sj.mod3 <- nnetar(sj.train[,4]) #neural network
fc_sjmod3 <- forecast(sj.mod3, h = 5)
accuracy(fc_sjmod3, sj.test[1:5,4])
## ME RMSE MAE MPE MAPE MASE
## Training set -0.04771595 6.639574 4.894112 -9.539623 23.13390 0.4846260
## Test set -0.46913342 3.536070 3.067352 -7.452337 17.86926 0.3037361
## ACF1
## Training set -0.02839626
## Test set NA
Instead of comparing metrics as I go I’m going to wait until the end.
markdown didn’t like my code again, so I have the text here:
sj.mod4 <- tslm(sj.train[,4] ~ sj.train[,12] + sj.train[,17] + sj.train[,18] + sj.train[,20]) summary(sj.mod4)
Coefficients: Estimate Std. Error t value Pr(>|t|)
(Intercept) -2.332e+03 5.035e+02 -4.633 4.67e-06 sj.train[, 12] 7.725e+00 1.768e+00 4.370 1.53e-05 sj.train[, 17] 8.845e-01 7.077e-01 1.250 0.212
sj.train[, 18] -9.629e-03 4.996e-02 -0.193 0.847
sj.train[, 20] -4.932e+00 4.158e+00 -1.186 0.236
Interestingly, the same two variables are significant in San Juan as were in the Iquitos data set: temperature and relative humidity. In this case, however, it is relative humidity and total precipitation that have the greatest magnitude effects. So, as relative humidity is both statistically and economically significant, I will use this climate variable within my VAR model.
I’m having a lot of trouble with rmarkdown, and things that worked in the console do not work here. here it is pasted
sj.cases <- sj.train[,4] sj.rel.hum <- sj.train[,17] sj.for.VAR <- ts.intersect(sj.cases, sj.rel.hum, dframe = FALSE) sj.for.VAR <- (sj.for.VAR)
VARselect(sj.for.VAR, type = c(“const”))
Again, I created a di-variate time series to make it easier on myself. I chose to run my VARselect with constant since there wasn’t much of a trend when I was doing my EDA. So, I will now be building two different VAR models, one with lag 2 and one with lag 5.
Here is my lag = 2
sj.mod5 <- VAR(sj.for.VAR, p = 2, type = c(“const”)) # VAR lag 2 fc_sjmod5 <- forecast(sj.mod5, h = 5) accuracy(fc_sjmod5\(forecast\)sj.cases, sj.test[1:5,4])
and here is my lag = 5
sj.mod6 <- VAR(sj.for.VAR, p = 5, type = c(“const”)) #VAR = 5 fc_sjmod6 <- forecast(sj.mod6, h = 5) accuracy(fc_sjmod6\(forecast\)sj.cases, sj.test[1:5,4])
and let’s rope them together and check it out!
accuracy(fc_sjmod1, sj.test[1:5,4]) #auto.arima
## ME RMSE MAE MPE MAPE MASE
## Training set 0.04574525 16.14103 9.884084 -15.96648 34.09738 0.9787443
## Test set 4.47025234 6.53345 4.792105 21.12432 23.30603 0.4745250
## ACF1
## Training set 0.05126605
## Test set NA
accuracy(fc_sjmod2, sj.test[1:5,4]) #arima
## ME RMSE MAE MPE MAPE MASE
## Training set -0.1038928 16.314622 10.149330 -6.390530 32.23370 1.005010
## Test set -1.7616603 7.823223 7.066139 -5.245001 38.78016 0.699705
## ACF1
## Training set 0.04332763
## Test set NA
accuracy(fc_sjmod3, sj.test[1:5,4]) #neural network
## ME RMSE MAE MPE MAPE MASE
## Training set -0.04771595 6.639574 4.894112 -9.539623 23.13390 0.4846260
## Test set -0.46913342 3.536070 3.067352 -7.452337 17.86926 0.3037361
## ACF1
## Training set -0.02839626
## Test set NA
accuracy(fc_sjmod5\(forecast\)sj.cases, sj.test[1:5,4]) #lag 2 accuracy(fc_sjmod6\(forecast\)sj.cases, sj.test[1:5,4]) #lag 5
ME RMSE MAE MPE MAPE MASE ACF1
Training set -1.136746e-16 13.076325 8.555603 -Inf Inf 0.9925584 -0.008135356 Test set -5.838463e+00 7.499002 6.884578 -347.0961 362.0406 0.7986983 NA
ME RMSE MAE MPE MAPE MASE ACF1
Training set -1.911844e-16 12.83867 8.520497 -Inf Inf 0.9884856 0.007418085 Test set -6.424397e+00 7.79104 6.869190 -364.298 370.6522 0.7969131 NA
RMSE is smallest for the training and testing data in my neural network. I find this to be interesting, given that for Iquitos and the literature review, ARIMA performed this best. .