Load the data
library(fpp2)
## Warning: package 'fpp2' was built under R version 4.0.3
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## -- Attaching packages ----------------------------------------- fpp2 2.4 --
## v ggplot2 3.3.0 v fma 2.4
## v forecast 8.12 v expsmooth 2.3
## Warning: package 'forecast' was built under R version 4.0.2
## Warning: package 'fma' was built under R version 4.0.3
## Warning: package 'expsmooth' was built under R version 4.0.3
##
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.2
## -- Attaching packages ---------------------------------- tidyverse 1.3.0 --
## v tibble 3.0.1 v dplyr 1.0.0
## v tidyr 1.1.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## v purrr 0.3.4
## Warning: package 'forcats' was built under R version 4.0.2
## -- Conflicts ------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(readr)
library(psych)
## Warning: package 'psych' was built under R version 4.0.2
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(vars)
## Warning: package 'vars' was built under R version 4.0.3
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## The following objects are masked from 'package:fma':
##
## cement, housing, petrol
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 4.0.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.0.2
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 4.0.2
##
## Attaching package: 'strucchange'
## The following object is masked from 'package:stringr':
##
## boundary
## Loading required package: urca
## Warning: package 'urca' was built under R version 4.0.2
## Loading required package: lmtest
## Warning: package 'lmtest' was built under R version 4.0.2
library(PolyPatEx)
## Warning: package 'PolyPatEx' was built under R version 4.0.3
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
setwd("C:/Users/burtkb/Downloads/")
submission = read.csv("submission_format.csv")
df_test = read_csv("dengue_features_test.csv",col_types = cols(week_start_date = col_date(format = "%Y-%m-%d")))
df_train = read_csv("dengue_features_train.csv",col_types = cols(week_start_date = col_date(format = "%Y-%m-%d")))
dl_train = read_csv("dengue_labels_train.csv")
## Parsed with column specification:
## cols(
## city = col_character(),
## year = col_double(),
## weekofyear = col_double(),
## total_cases = col_double()
## )
Create train and test sets
sum(is.na(df_train)) #548 na
## [1] 548
sum(is.na(dl_train)) #0 na
## [1] 0
sum(is.na(df_test)) #119
## [1] 119
test = left_join(x = df_test, y = submission, by = c("year", "weekofyear", "city"))
train = left_join(x = dl_train, y = df_train, by = c("year", "weekofyear", "city"))
sum(is.na(train))
## [1] 548
#548 na - we will omit na's
train = na.omit(train)
#double check
sum(is.na(train))
## [1] 0
describe(test)
## Warning in describe(test): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## vars n mean sd median trimmed
## city* 1 416 NaN NA NA NaN
## year 2 416 2010.77 1.43 2011.00 2010.81
## weekofyear 3 416 26.44 14.98 26.00 26.44
## week_start_date 4 416 NaN NA NA NaN
## ndvi_ne 5 373 0.13 0.16 0.11 0.13
## ndvi_nw 6 405 0.13 0.14 0.09 0.12
## ndvi_se 7 415 0.21 0.08 0.20 0.20
## ndvi_sw 8 415 0.20 0.09 0.19 0.19
## precipitation_amt_mm 9 414 38.35 35.17 31.45 33.87
## reanalysis_air_temp_k 10 414 298.82 1.47 298.55 298.84
## reanalysis_avg_temp_k 11 414 299.35 1.31 299.33 299.39
## reanalysis_dew_point_temp_k 12 414 295.42 1.52 295.82 295.54
## reanalysis_max_air_temp_k 13 414 303.62 3.10 302.75 303.36
## reanalysis_min_air_temp_k 14 414 295.74 2.76 296.30 295.93
## reanalysis_precip_amt_kg_per_m2 15 414 42.17 48.91 25.85 32.62
## reanalysis_relative_humidity_percent 16 414 82.50 7.38 80.33 82.09
## reanalysis_sat_precip_amt_mm 17 414 38.35 35.17 31.45 33.87
## reanalysis_specific_humidity_g_per_kg 18 414 16.93 1.56 17.34 17.03
## reanalysis_tdtr_k 19 414 5.12 3.54 2.91 4.65
## station_avg_temp_c 20 404 27.37 1.23 27.48 27.40
## station_diur_temp_rng_c 21 404 7.81 2.45 6.64 7.55
## station_max_temp_c 22 413 32.53 1.92 32.80 32.60
## station_min_temp_c 23 407 22.37 1.73 22.20 22.36
## station_precip_mm 24 411 34.28 34.66 23.60 28.62
## total_cases 25 416 0.00 0.00 0.00 0.00
## mad min max range skew
## city* NA Inf -Inf -Inf NA
## year 1.48 2008.00 2013.00 5.00 -0.32
## weekofyear 19.27 1.00 53.00 52.00 0.00
## week_start_date NA Inf -Inf -Inf NA
## ndvi_ne 0.19 -0.46 0.50 0.96 -0.17
## ndvi_nw 0.15 -0.21 0.65 0.86 0.40
## ndvi_se 0.08 0.01 0.45 0.45 0.38
## ndvi_sw 0.09 -0.01 0.53 0.54 0.69
## precipitation_amt_mm 35.77 0.00 169.34 169.34 1.01
## reanalysis_air_temp_k 1.73 294.55 301.94 7.38 -0.03
## reanalysis_avg_temp_k 1.61 295.24 303.33 8.09 -0.15
## reanalysis_dew_point_temp_k 1.57 290.82 297.79 6.98 -0.65
## reanalysis_max_air_temp_k 3.19 298.20 314.10 15.90 0.70
## reanalysis_min_air_temp_k 3.41 286.20 299.70 13.50 -0.50
## reanalysis_precip_amt_kg_per_m2 27.58 0.00 301.40 301.40 2.32
## reanalysis_relative_humidity_percent 6.10 64.92 97.98 33.06 0.53
## reanalysis_sat_precip_amt_mm 35.77 0.00 169.34 169.34 1.01
## reanalysis_specific_humidity_g_per_kg 1.66 12.54 19.60 7.06 -0.54
## reanalysis_tdtr_k 1.14 1.49 14.49 13.00 0.88
## station_avg_temp_c 1.34 24.16 30.27 6.11 -0.23
## station_diur_temp_rng_c 1.58 4.04 14.72 10.68 0.82
## station_max_temp_c 1.78 27.20 38.40 11.20 -0.28
## station_min_temp_c 1.63 14.20 26.70 12.50 -0.22
## station_precip_mm 24.76 0.00 212.00 212.00 1.99
## total_cases 0.00 0.00 0.00 0.00 NaN
## kurtosis se
## city* NA NA
## year -0.79 0.07
## weekofyear -1.20 0.73
## week_start_date NA NA
## ndvi_ne -0.42 0.01
## ndvi_nw -0.49 0.01
## ndvi_se -0.05 0.00
## ndvi_sw 0.11 0.00
## precipitation_amt_mm 0.65 1.73
## reanalysis_air_temp_k -0.90 0.07
## reanalysis_avg_temp_k -0.70 0.06
## reanalysis_dew_point_temp_k -0.41 0.07
## reanalysis_max_air_temp_k -0.30 0.15
## reanalysis_min_air_temp_k -0.65 0.14
## reanalysis_precip_amt_kg_per_m2 6.59 2.40
## reanalysis_relative_humidity_percent -0.75 0.36
## reanalysis_sat_precip_amt_mm 0.65 1.73
## reanalysis_specific_humidity_g_per_kg -0.67 0.08
## reanalysis_tdtr_k -0.71 0.17
## station_avg_temp_c -0.67 0.06
## station_diur_temp_rng_c -0.62 0.12
## station_max_temp_c -0.36 0.09
## station_min_temp_c 0.98 0.09
## station_precip_mm 5.55 1.71
## total_cases NaN 0.00
describe(train)
## Warning in describe(train): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## vars n mean sd median trimmed
## city* 1 1199 NaN NA NA NaN
## year 2 1199 2001.30 5.35 2002.00 2001.62
## weekofyear 3 1199 26.50 14.90 26.00 26.49
## total_cases 4 1199 21.20 30.86 11.00 14.78
## week_start_date 5 1199 NaN NA NA NaN
## ndvi_ne 6 1199 0.14 0.14 0.12 0.14
## ndvi_nw 7 1199 0.13 0.12 0.13 0.13
## ndvi_se 8 1199 0.21 0.07 0.20 0.20
## ndvi_sw 9 1199 0.21 0.09 0.19 0.20
## precipitation_amt_mm 10 1199 47.58 43.18 41.41 42.52
## reanalysis_air_temp_k 11 1199 298.68 1.36 298.62 298.69
## reanalysis_avg_temp_k 12 1199 299.24 1.26 299.32 299.28
## reanalysis_dew_point_temp_k 13 1199 295.30 1.50 295.68 295.44
## reanalysis_max_air_temp_k 14 1199 303.66 3.30 302.60 303.37
## reanalysis_min_air_temp_k 15 1199 295.58 2.59 296.00 295.77
## reanalysis_precip_amt_kg_per_m2 16 1199 41.55 44.49 28.60 33.67
## reanalysis_relative_humidity_percent 17 1199 82.63 7.30 80.80 82.23
## reanalysis_sat_precip_amt_mm 18 1199 47.58 43.18 41.41 42.52
## reanalysis_specific_humidity_g_per_kg 19 1199 16.81 1.52 17.17 16.91
## reanalysis_tdtr_k 20 1199 5.14 3.59 3.00 4.64
## station_avg_temp_c 21 1199 27.23 1.27 27.45 27.31
## station_diur_temp_rng_c 22 1199 8.25 2.18 7.47 8.06
## station_max_temp_c 23 1199 32.57 1.95 32.80 32.64
## station_min_temp_c 24 1199 22.08 1.55 22.10 22.11
## station_precip_mm 25 1199 40.92 49.00 24.70 31.74
## mad min max range skew
## city* NA Inf -Inf -Inf NA
## year 5.93 1990.00 2010.00 20.00 -0.48
## weekofyear 19.27 1.00 52.00 51.00 0.00
## total_cases 13.34 0.00 329.00 329.00 3.80
## week_start_date NA Inf -Inf -Inf NA
## ndvi_ne 0.15 -0.41 0.51 0.91 -0.08
## ndvi_nw 0.12 -0.46 0.45 0.91 -0.06
## ndvi_se 0.07 -0.02 0.54 0.55 0.57
## ndvi_sw 0.08 -0.06 0.55 0.61 0.72
## precipitation_amt_mm 43.90 0.00 390.60 390.60 1.73
## reanalysis_air_temp_k 1.55 294.64 302.20 7.56 -0.07
## reanalysis_avg_temp_k 1.43 294.89 302.61 7.72 -0.23
## reanalysis_dew_point_temp_k 1.49 289.64 298.45 8.81 -0.77
## reanalysis_max_air_temp_k 2.97 297.80 313.20 15.40 0.74
## reanalysis_min_air_temp_k 2.97 286.90 299.90 13.00 -0.57
## reanalysis_precip_amt_kg_per_m2 26.24 0.00 570.50 570.50 3.41
## reanalysis_relative_humidity_percent 6.17 57.79 98.61 40.82 0.49
## reanalysis_sat_precip_amt_mm 43.90 0.00 390.60 390.60 1.73
## reanalysis_specific_humidity_g_per_kg 1.55 11.72 20.46 8.75 -0.58
## reanalysis_tdtr_k 1.42 1.36 16.03 14.67 0.92
## station_avg_temp_c 1.24 21.40 30.80 9.40 -0.62
## station_diur_temp_rng_c 1.87 4.53 15.80 11.27 0.70
## station_max_temp_c 1.63 26.70 42.20 15.50 -0.27
## station_min_temp_c 1.63 14.70 25.60 10.90 -0.28
## station_precip_mm 27.43 0.00 543.30 543.30 2.94
## kurtosis se
## city* NA NA
## year -0.75 0.15
## weekofyear -1.20 0.43
## total_cases 21.13 0.89
## week_start_date NA NA
## ndvi_ne -0.14 0.00
## ndvi_nw 0.13 0.00
## ndvi_se 0.44 0.00
## ndvi_sw 0.58 0.00
## precipitation_amt_mm 7.45 1.25
## reanalysis_air_temp_k -0.67 0.04
## reanalysis_avg_temp_k -0.49 0.04
## reanalysis_dew_point_temp_k 0.03 0.04
## reanalysis_max_air_temp_k -0.44 0.10
## reanalysis_min_air_temp_k -0.39 0.07
## reanalysis_precip_amt_kg_per_m2 22.70 1.28
## reanalysis_relative_humidity_percent -0.59 0.21
## reanalysis_sat_precip_amt_mm 7.45 1.25
## reanalysis_specific_humidity_g_per_kg -0.39 0.04
## reanalysis_tdtr_k -0.52 0.10
## station_avg_temp_c 0.03 0.04
## station_diur_temp_rng_c -0.54 0.06
## station_max_temp_c 0.30 0.06
## station_min_temp_c 0.29 0.04
## station_precip_mm 14.89 1.41
Separate train set into cities
#san juan train set
sj.train = train %>% filter(city == 'sj')
summary(sj.train)
## city year weekofyear total_cases
## Length:727 Min. :1990 Min. : 1.00 Min. : 0.00
## Class :character 1st Qu.:1994 1st Qu.:14.00 1st Qu.: 9.00
## Mode :character Median :1999 Median :27.00 Median : 18.00
## Mean :1999 Mean :26.46 Mean : 30.21
## 3rd Qu.:2003 3rd Qu.:39.00 3rd Qu.: 36.00
## Max. :2008 Max. :52.00 Max. :329.00
## week_start_date ndvi_ne ndvi_nw ndvi_se
## Min. :1990-04-30 Min. :-0.406250 Min. :-0.45610 Min. :-0.01553
## 1st Qu.:1994-11-05 1st Qu.: 0.004667 1st Qu.: 0.01642 1st Qu.: 0.13874
## Median :1999-09-10 Median : 0.057700 Median : 0.06822 Median : 0.17662
## Mean :1999-06-05 Mean : 0.058806 Mean : 0.06691 Mean : 0.17677
## 3rd Qu.:2003-11-22 3rd Qu.: 0.112050 3rd Qu.: 0.11493 3rd Qu.: 0.20996
## Max. :2008-04-22 Max. : 0.493400 Max. : 0.43710 Max. : 0.39313
## ndvi_sw precipitation_amt_mm reanalysis_air_temp_k
## Min. :-0.06346 Min. : 0.000 Min. :295.9
## 1st Qu.: 0.12811 1st Qu.: 1.045 1st Qu.:298.2
## Median : 0.16588 Median : 22.490 Median :299.3
## Mean : 0.16584 Mean : 35.938 Mean :299.2
## 3rd Qu.: 0.20290 3rd Qu.: 52.795 3rd Qu.:300.2
## Max. : 0.38142 Max. :390.600 Max. :302.2
## reanalysis_avg_temp_k reanalysis_dew_point_temp_k reanalysis_max_air_temp_k
## Min. :296.1 Min. :289.6 Min. :297.8
## 1st Qu.:298.3 1st Qu.:293.9 1st Qu.:300.5
## Median :299.4 Median :295.5 Median :301.5
## Mean :299.3 Mean :295.1 Mean :301.4
## 3rd Qu.:300.2 3rd Qu.:296.4 3rd Qu.:302.4
## Max. :302.2 Max. :297.8 Max. :304.3
## reanalysis_min_air_temp_k reanalysis_precip_amt_kg_per_m2
## Min. :292.6 Min. : 0.20
## 1st Qu.:296.4 1st Qu.: 10.50
## Median :297.5 Median : 21.40
## Mean :297.3 Mean : 30.17
## 3rd Qu.:298.4 3rd Qu.: 36.80
## Max. :299.9 Max. :570.50
## reanalysis_relative_humidity_percent reanalysis_sat_precip_amt_mm
## Min. :66.74 Min. : 0.000
## 1st Qu.:76.29 1st Qu.: 1.045
## Median :78.75 Median : 22.490
## Mean :78.58 Mean : 35.938
## 3rd Qu.:81.07 3rd Qu.: 52.795
## Max. :87.58 Max. :390.600
## reanalysis_specific_humidity_g_per_kg reanalysis_tdtr_k station_avg_temp_c
## Min. :11.72 Min. :1.357 Min. :22.84
## 1st Qu.:15.30 1st Qu.:2.157 1st Qu.:25.84
## Median :16.86 Median :2.471 Median :27.30
## Mean :16.58 Mean :2.529 Mean :27.04
## 3rd Qu.:17.90 3rd Qu.:2.814 3rd Qu.:28.21
## Max. :19.44 Max. :4.429 Max. :30.07
## station_diur_temp_rng_c station_max_temp_c station_min_temp_c
## Min. :4.529 Min. :26.70 Min. :17.80
## 1st Qu.:6.257 1st Qu.:30.60 1st Qu.:21.70
## Median :6.771 Median :31.70 Median :22.80
## Mean :6.762 Mean :31.62 Mean :22.62
## 3rd Qu.:7.279 3rd Qu.:32.80 3rd Qu.:23.90
## Max. :9.914 Max. :35.60 Max. :25.60
## station_precip_mm
## Min. : 0.00
## 1st Qu.: 7.05
## Median : 18.10
## Mean : 26.09
## 3rd Qu.: 34.05
## Max. :163.10
head(sj.train)
## # A tibble: 6 x 25
## city year weekofyear total_cases week_start_date ndvi_ne ndvi_nw ndvi_se
## <chr> <dbl> <dbl> <dbl> <date> <dbl> <dbl> <dbl>
## 1 sj 1990 18 4 1990-04-30 0.123 0.104 0.198
## 2 sj 1990 19 5 1990-05-07 0.170 0.142 0.162
## 3 sj 1990 20 4 1990-05-14 0.0322 0.173 0.157
## 4 sj 1990 21 3 1990-05-21 0.129 0.245 0.228
## 5 sj 1990 22 6 1990-05-28 0.196 0.262 0.251
## 6 sj 1990 24 4 1990-06-11 0.113 0.0928 0.205
## # ... with 17 more variables: ndvi_sw <dbl>, precipitation_amt_mm <dbl>,
## # reanalysis_air_temp_k <dbl>, reanalysis_avg_temp_k <dbl>,
## # reanalysis_dew_point_temp_k <dbl>, reanalysis_max_air_temp_k <dbl>,
## # reanalysis_min_air_temp_k <dbl>, reanalysis_precip_amt_kg_per_m2 <dbl>,
## # reanalysis_relative_humidity_percent <dbl>,
## # reanalysis_sat_precip_amt_mm <dbl>,
## # reanalysis_specific_humidity_g_per_kg <dbl>, reanalysis_tdtr_k <dbl>,
## # station_avg_temp_c <dbl>, station_diur_temp_rng_c <dbl>,
## # station_max_temp_c <dbl>, station_min_temp_c <dbl>, station_precip_mm <dbl>
#Iquitos train set
iq.train = train %>% filter(city == 'iq')
summary(iq.train)
## city year weekofyear total_cases
## Length:472 Min. :2000 Min. : 1.00 Min. : 0.000
## Class :character 1st Qu.:2002 1st Qu.:13.00 1st Qu.: 1.000
## Mode :character Median :2005 Median :26.00 Median : 5.000
## Mean :2005 Mean :26.55 Mean : 7.309
## 3rd Qu.:2007 3rd Qu.:40.00 3rd Qu.: 9.000
## Max. :2010 Max. :52.00 Max. :83.000
## week_start_date ndvi_ne ndvi_nw ndvi_se
## Min. :2000-07-01 Min. :0.06173 Min. :0.03586 Min. :0.02988
## 1st Qu.:2002-12-08 1st Qu.:0.20154 1st Qu.:0.18026 1st Qu.:0.19461
## Median :2005-06-04 Median :0.26538 Median :0.23094 Median :0.25070
## Mean :2005-06-12 Mean :0.26434 Mean :0.23826 Mean :0.24991
## 3rd Qu.:2007-11-27 3rd Qu.:0.31923 3rd Qu.:0.29296 3rd Qu.:0.30213
## Max. :2010-06-25 Max. :0.50836 Max. :0.45443 Max. :0.53831
## ndvi_sw precipitation_amt_mm reanalysis_air_temp_k
## Min. :0.06418 Min. : 0.00 Min. :294.6
## 1st Qu.:0.20556 1st Qu.: 41.89 1st Qu.:297.1
## Median :0.26243 Median : 61.13 Median :297.8
## Mean :0.26711 Mean : 65.51 Mean :297.9
## 3rd Qu.:0.32523 3rd Qu.: 85.80 3rd Qu.:298.7
## Max. :0.54602 Max. :210.83 Max. :301.6
## reanalysis_avg_temp_k reanalysis_dew_point_temp_k reanalysis_max_air_temp_k
## Min. :294.9 Min. :290.1 Min. :300.0
## 1st Qu.:298.2 1st Qu.:294.7 1st Qu.:305.2
## Median :299.2 Median :295.9 Median :307.1
## Mean :299.1 Mean :295.6 Mean :307.1
## 3rd Qu.:300.1 3rd Qu.:296.6 3rd Qu.:308.8
## Max. :302.6 Max. :298.4 Max. :313.2
## reanalysis_min_air_temp_k reanalysis_precip_amt_kg_per_m2
## Min. :286.9 Min. : 0.00
## 1st Qu.:292.0 1st Qu.: 24.91
## Median :293.1 Median : 47.05
## Mean :292.9 Mean : 59.07
## 3rd Qu.:294.2 3rd Qu.: 72.95
## Max. :296.0 Max. :362.03
## reanalysis_relative_humidity_percent reanalysis_sat_precip_amt_mm
## Min. :57.79 Min. : 0.00
## 1st Qu.:84.62 1st Qu.: 41.89
## Median :91.10 Median : 61.13
## Mean :88.87 Mean : 65.51
## 3rd Qu.:94.60 3rd Qu.: 85.80
## Max. :98.61 Max. :210.83
## reanalysis_specific_humidity_g_per_kg reanalysis_tdtr_k station_avg_temp_c
## Min. :12.11 Min. : 3.714 Min. :21.40
## 1st Qu.:16.18 1st Qu.: 7.371 1st Qu.:27.00
## Median :17.48 Median : 8.914 Median :27.60
## Mean :17.16 Mean : 9.153 Mean :27.53
## 3rd Qu.:18.20 3rd Qu.:10.943 3rd Qu.:28.10
## Max. :20.46 Max. :16.029 Max. :30.80
## station_diur_temp_rng_c station_max_temp_c station_min_temp_c
## Min. : 5.20 Min. :30.10 Min. :14.70
## 1st Qu.: 9.50 1st Qu.:33.20 1st Qu.:20.68
## Median :10.57 Median :34.00 Median :21.40
## Mean :10.54 Mean :34.02 Mean :21.24
## 3rd Qu.:11.63 3rd Qu.:35.00 3rd Qu.:22.00
## Max. :15.80 Max. :42.20 Max. :24.20
## station_precip_mm
## Min. : 0.00
## 1st Qu.: 18.07
## Median : 46.10
## Mean : 63.75
## 3rd Qu.: 86.67
## Max. :543.30
head(iq.train)
## # A tibble: 6 x 25
## city year weekofyear total_cases week_start_date ndvi_ne ndvi_nw ndvi_se
## <chr> <dbl> <dbl> <dbl> <date> <dbl> <dbl> <dbl>
## 1 iq 2000 26 0 2000-07-01 0.193 0.132 0.341
## 2 iq 2000 27 0 2000-07-08 0.217 0.276 0.289
## 3 iq 2000 28 0 2000-07-15 0.177 0.173 0.204
## 4 iq 2000 29 0 2000-07-22 0.228 0.145 0.254
## 5 iq 2000 30 0 2000-07-29 0.329 0.322 0.254
## 6 iq 2000 31 0 2000-08-05 0.206 0.191 0.232
## # ... with 17 more variables: ndvi_sw <dbl>, precipitation_amt_mm <dbl>,
## # reanalysis_air_temp_k <dbl>, reanalysis_avg_temp_k <dbl>,
## # reanalysis_dew_point_temp_k <dbl>, reanalysis_max_air_temp_k <dbl>,
## # reanalysis_min_air_temp_k <dbl>, reanalysis_precip_amt_kg_per_m2 <dbl>,
## # reanalysis_relative_humidity_percent <dbl>,
## # reanalysis_sat_precip_amt_mm <dbl>,
## # reanalysis_specific_humidity_g_per_kg <dbl>, reanalysis_tdtr_k <dbl>,
## # station_avg_temp_c <dbl>, station_diur_temp_rng_c <dbl>,
## # station_max_temp_c <dbl>, station_min_temp_c <dbl>, station_precip_mm <dbl>
Make each city into a time series
sj.ts = ts(sj.train$total_cases, frequency=52)
iq.ts = ts(iq.train$total_cases, frequency=52)
autoplot(sj.ts)+ggtitle("San Juan - Train")
autoplot(iq.ts)+ggtitle("Iquitos - Train")
ggseasonplot(sj.ts)
ggseasonplot(iq.ts)
#it doesn't really seem like there is any seasonality
summary(sj.ts)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 9.00 18.00 30.21 36.00 329.00
summary(iq.ts)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.000 5.000 7.309 9.000 83.000
San Juan Models
#ets
sj.ets = ets(sj.ts)
## Warning in ets(sj.ts): I can't handle data with frequency greater than 24.
## Seasonality will be ignored. Try stlf() if you need seasonal forecasts.
summary(sj.ets)
## ETS(A,N,N)
##
## Call:
## ets(y = sj.ts)
##
## Smoothing parameters:
## alpha = 0.9999
##
## Initial states:
## l = 4.1836
##
## sigma: 12.4615
##
## AIC AICc BIC
## 8462.075 8462.108 8475.842
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.001122585 12.44438 7.75398 -Inf Inf 0.2243436 0.09199171
f.sj.ets = forecast(sj.ets, h=260)
autoplot(f.sj.ets)
checkresiduals(f.sj.ets)
##
## Ljung-Box test
##
## data: Residuals from ETS(A,N,N)
## Q* = 106.72, df = 102, p-value = 0.3549
##
## Model df: 2. Total lags used: 104
#arima
sj.arima = auto.arima(sj.ts)
summary(sj.arima)
## Series: sj.ts
## ARIMA(2,1,2)
##
## Coefficients:
## ar1 ar2 ma1 ma2
## -1.0539 -0.4930 1.1509 0.6432
## s.e. 0.1614 0.1615 0.1429 0.1423
##
## sigma^2 estimated as 151.3: log likelihood=-2850.29
## AIC=5710.58 AICc=5710.67 BIC=5733.52
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.001702639 12.25979 7.879365 -Inf Inf 0.2279713 0.007108063
f.sj.arima = forecast(sj.arima, h=260)
autoplot(f.sj.arima)
checkresiduals(f.sj.arima)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,1,2)
## Q* = 77.52, df = 100, p-value = 0.9534
##
## Model df: 4. Total lags used: 104
#stl
sj.stl = stl(sj.ts, t.window = 13, s.window = "periodic", robust=TRUE)
f.sj.stl = forecast(sj.stl, h=260)
autoplot(f.sj.stl)
checkresiduals(f.sj.stl)
## Warning in checkresiduals(f.sj.stl): The fitted degrees of freedom is based on
## the model used for the seasonally adjusted data.
##
## Ljung-Box test
##
## data: Residuals from STL + ETS(A,N,N)
## Q* = 106.88, df = 102, p-value = 0.351
##
## Model df: 2. Total lags used: 104
#try with differenced data to remove the trends
sj.diff = diff(sj.ts)
autoplot(sj.diff)
#ets2
sj.ets2 = ets(sj.diff)
## Warning in ets(sj.diff): I can't handle data with frequency greater than 24.
## Seasonality will be ignored. Try stlf() if you need seasonal forecasts.
summary(sj.ets2)
## ETS(A,N,N)
##
## Call:
## ets(y = sj.diff)
##
## Smoothing parameters:
## alpha = 1e-04
##
## Initial states:
## l = -0.0706
##
## sigma: 12.4708
##
## AIC AICc BIC
## 8450.525 8450.558 8464.288
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.06699624 12.45365 7.770152 Inf Inf 0.6486354 0.09188973
f.sj.ets2 = forecast(sj.ets2, h=260)
autoplot(f.sj.ets2)
checkresiduals(f.sj.ets2)
##
## Ljung-Box test
##
## data: Residuals from ETS(A,N,N)
## Q* = 106.55, df = 102, p-value = 0.3592
##
## Model df: 2. Total lags used: 104
#this did not perform as well as the 1st ets model
#arima2
sj.arima2 = auto.arima(sj.diff)
summary(sj.arima2)
## Series: sj.diff
## ARIMA(2,0,2) with zero mean
##
## Coefficients:
## ar1 ar2 ma1 ma2
## -1.0539 -0.4930 1.1509 0.6432
## s.e. 0.1614 0.1615 0.1429 0.1423
##
## sigma^2 estimated as 151.3: log likelihood=-2850.29
## AIC=5710.58 AICc=5710.67 BIC=5733.52
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.001699475 12.26823 7.890212 NaN Inf 0.6586578 0.007108043
f.sj.arima2 = forecast(sj.arima2, h=260)
autoplot(f.sj.arima2)
checkresiduals(f.sj.arima2)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,0,2) with zero mean
## Q* = 77.421, df = 100, p-value = 0.9542
##
## Model df: 4. Total lags used: 104
#the original arima model was better
#stl2
sj.stl2 = stl(sj.diff, t.window = 13, s.window = "periodic", robust=TRUE)
f.sj.stl2 = forecast(sj.stl2, h=260)
autoplot(f.sj.stl2)
checkresiduals(f.sj.stl2)
## Warning in checkresiduals(f.sj.stl2): The fitted degrees of freedom is based on
## the model used for the seasonally adjusted data.
##
## Ljung-Box test
##
## data: Residuals from STL + ETS(A,N,N)
## Q* = 106.21, df = 102, p-value = 0.3679
##
## Model df: 2. Total lags used: 104
accuracy(f.sj.ets)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.001122585 12.44438 7.75398 -Inf Inf 0.2243436 0.09199171
accuracy(f.sj.ets2)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.06699624 12.45365 7.770152 Inf Inf 0.6486354 0.09188973
accuracy(f.sj.arima)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.001702639 12.25979 7.879365 -Inf Inf 0.2279713 0.007108063
accuracy(f.sj.arima2)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.001699475 12.26823 7.890212 NaN Inf 0.6586578 0.007108043
accuracy(f.sj.stl)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.003182592 12.33912 7.610565 NaN Inf 0.2201942 0.1071663
accuracy(f.sj.stl2)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -0.001628082 12.3095 7.567774 NaN Inf 0.6317414 0.1110626
#the best model for San Juan was the first arima model
tsdisplay(residuals(f.sj.arima))
autoplot(f.sj.arima)
Iquitos Models
#ets
iq.ets = ets(iq.ts)
## Warning in ets(iq.ts): I can't handle data with frequency greater than 24.
## Seasonality will be ignored. Try stlf() if you need seasonal forecasts.
summary(iq.ets)
## ETS(A,N,N)
##
## Call:
## ets(y = iq.ts)
##
## Smoothing parameters:
## alpha = 0.5619
##
## Initial states:
## l = 0.1055
##
## sigma: 6.5424
##
## AIC AICc BIC
## 4683.212 4683.263 4695.683
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.01157704 6.528546 3.593558 -Inf Inf 0.3895958 -0.01911295
f.iq.ets = forecast(iq.ets, h=156)
autoplot(f.iq.ets)
checkresiduals(f.iq.ets)
##
## Ljung-Box test
##
## data: Residuals from ETS(A,N,N)
## Q* = 71.21, df = 92, p-value = 0.947
##
## Model df: 2. Total lags used: 94
#arima
iq.arima = auto.arima(iq.ts)
summary(iq.arima)
## Series: iq.ts
## ARIMA(0,1,1)
##
## Coefficients:
## ma1
## -0.4370
## s.e. 0.0407
##
## sigma^2 estimated as 42.8: log likelihood=-1552.61
## AIC=3109.22 AICc=3109.24 BIC=3117.53
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.01195121 6.528547 3.59285 -Inf Inf 0.3895191 -0.02024583
f.iq.arima = forecast(iq.arima, h=156)
autoplot(f.iq.arima)
checkresiduals(f.iq.arima)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,1)
## Q* = 71.108, df = 93, p-value = 0.9556
##
## Model df: 1. Total lags used: 94
#stl
iq.stl = stl(iq.ts, t.window = 13, s.window = "periodic", robust=TRUE)
f.iq.stl = forecast(iq.stl, h=156)
autoplot(f.iq.stl)
checkresiduals(f.iq.stl)
## Warning in checkresiduals(f.iq.stl): The fitted degrees of freedom is based on
## the model used for the seasonally adjusted data.
##
## Ljung-Box test
##
## data: Residuals from STL + ETS(A,N,N)
## Q* = 66.411, df = 92, p-value = 0.9796
##
## Model df: 2. Total lags used: 94
#try with differenced data to remove the trends
iq.diff = diff(iq.ts)
autoplot(iq.diff)
#ets2
iq.ets2 = ets(iq.diff)
## Warning in ets(iq.diff): I can't handle data with frequency greater than 24.
## Seasonality will be ignored. Try stlf() if you need seasonal forecasts.
summary(iq.ets2)
## ETS(A,N,N)
##
## Call:
## ets(y = iq.diff)
##
## Smoothing parameters:
## alpha = 1e-04
##
## Initial states:
## l = 0.0101
##
## sigma: 7.2005
##
## AIC AICc BIC
## 4762.585 4762.636 4775.049
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -0.002111055 7.185205 3.849579 -Inf Inf 0.5687494 -0.391545
f.iq.ets2 = forecast(iq.ets2, h=156)
autoplot(f.iq.ets2)
checkresiduals(f.iq.ets2)
##
## Ljung-Box test
##
## data: Residuals from ETS(A,N,N)
## Q* = 147.46, df = 92, p-value = 0.000216
##
## Model df: 2. Total lags used: 94
#this did not perform as well as the 1st ets model
#arima2
iq.arima2 = auto.arima(iq.diff)
summary(iq.arima2)
## Series: iq.diff
## ARIMA(0,0,1) with zero mean
##
## Coefficients:
## ma1
## -0.4370
## s.e. 0.0407
##
## sigma^2 estimated as 42.8: log likelihood=-1552.61
## AIC=3109.22 AICc=3109.24 BIC=3117.53
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.01197658 6.535474 3.600478 NaN Inf 0.5319465 -0.02024583
f.iq.arima2 = forecast(iq.arima2, h=156)
autoplot(f.iq.arima2)
checkresiduals(f.iq.arima2)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,1) with zero mean
## Q* = 70.971, df = 93, p-value = 0.9567
##
## Model df: 1. Total lags used: 94
#stl2
iq.stl2 = stl(iq.diff, t.window = 13, s.window = "periodic", robust=TRUE)
f.iq.stl2 = forecast(iq.stl2, h=156)
autoplot(f.iq.stl2)
checkresiduals(f.iq.stl2)
## Warning in checkresiduals(f.iq.stl2): The fitted degrees of freedom is based on
## the model used for the seasonally adjusted data.
##
## Ljung-Box test
##
## data: Residuals from STL + ETS(A,N,N)
## Q* = 138, df = 92, p-value = 0.001366
##
## Model df: 2. Total lags used: 94
accuracy(f.iq.ets)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.01157704 6.528546 3.593558 -Inf Inf 0.3895958 -0.01911295
accuracy(f.iq.ets2)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -0.002111055 7.185205 3.849579 -Inf Inf 0.5687494 -0.391545
accuracy(f.iq.arima)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.01195121 6.528547 3.59285 -Inf Inf 0.3895191 -0.02024583
accuracy(f.iq.arima2)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.01197658 6.535474 3.600478 NaN Inf 0.5319465 -0.02024583
accuracy(f.iq.stl)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.01405503 6.511848 3.640696 NaN Inf 0.3947063 -0.01166279
accuracy(f.iq.stl2)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -0.0008546277 7.076454 3.886606 NaN Inf 0.57422 -0.3818434
#The best model for Iquitos was the first arima model
tsdisplay(residuals(f.iq.arima))
autoplot(f.iq.arima)
VAR Model
#var
ccf(sj.ts,iq.ts, type = c("correlation","covariance"))
d1= cbind(sj.ts, iq.ts)
d1 = na.omit(d1)
autoplot(d1)
VARselect(d1,lag.max=10,type="const")[["selection"]]
## AIC(n) HQ(n) SC(n) FPE(n)
## 5 2 2 5
#var model 1
var1 = VAR(d1, p=1, type="const")
serial.test(var1, lags.pt=10, type="PT.asymptotic")
##
## Portmanteau Test (asymptotic)
##
## data: Residuals of VAR object var1
## Chi-squared = 94.287, df = 36, p-value = 4.072e-07
#var model 2
var2 = VAR(d1, p=2, type="const")
serial.test(var2, lags.pt=10, type="PT.asymptotic")
##
## Portmanteau Test (asymptotic)
##
## data: Residuals of VAR object var2
## Chi-squared = 53.17, df = 32, p-value = 0.01078
#var model 3
var3 = VAR(d1, p=10, type="const")
serial.test(var3, lags.pt=10, type="PT.asymptotic")
##
## Portmanteau Test (asymptotic)
##
## data: Residuals of VAR object var3
## Chi-squared = 2.9304, df = 0, p-value < 2.2e-16
#model 3 appears to be the best of the var models
f.var1 = forecast(var1)
f.var2 = forecast(var2)
f.var3 = forecast(var3)
autoplot(f.var1)
autoplot(f.var2)
autoplot(f.var3)
tsdisplay(residuals(var1))
tsdisplay(residuals(var2))
tsdisplay(residuals(var3))
#the 3rd var model seems to be the best