Load the data

library(fpp2)
## Warning: package 'fpp2' was built under R version 4.0.3
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## -- Attaching packages ----------------------------------------- fpp2 2.4 --
## v ggplot2   3.3.0     v fma       2.4  
## v forecast  8.12      v expsmooth 2.3
## Warning: package 'forecast' was built under R version 4.0.2
## Warning: package 'fma' was built under R version 4.0.3
## Warning: package 'expsmooth' was built under R version 4.0.3
## 
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.2
## -- Attaching packages ---------------------------------- tidyverse 1.3.0 --
## v tibble  3.0.1     v dplyr   1.0.0
## v tidyr   1.1.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## v purrr   0.3.4
## Warning: package 'forcats' was built under R version 4.0.2
## -- Conflicts ------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(readr)
library(psych)
## Warning: package 'psych' was built under R version 4.0.2
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(vars)
## Warning: package 'vars' was built under R version 4.0.3
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## The following objects are masked from 'package:fma':
## 
##     cement, housing, petrol
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 4.0.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.0.2
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 4.0.2
## 
## Attaching package: 'strucchange'
## The following object is masked from 'package:stringr':
## 
##     boundary
## Loading required package: urca
## Warning: package 'urca' was built under R version 4.0.2
## Loading required package: lmtest
## Warning: package 'lmtest' was built under R version 4.0.2
library(PolyPatEx)
## Warning: package 'PolyPatEx' was built under R version 4.0.3
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
setwd("C:/Users/burtkb/Downloads/")

submission = read.csv("submission_format.csv")

df_test = read_csv("dengue_features_test.csv",col_types = cols(week_start_date = col_date(format = "%Y-%m-%d")))

df_train = read_csv("dengue_features_train.csv",col_types = cols(week_start_date = col_date(format = "%Y-%m-%d")))

dl_train = read_csv("dengue_labels_train.csv")
## Parsed with column specification:
## cols(
##   city = col_character(),
##   year = col_double(),
##   weekofyear = col_double(),
##   total_cases = col_double()
## )

Create train and test sets

sum(is.na(df_train)) #548 na
## [1] 548
sum(is.na(dl_train)) #0 na
## [1] 0
sum(is.na(df_test)) #119
## [1] 119
test = left_join(x = df_test, y = submission, by = c("year", "weekofyear", "city"))

train = left_join(x = dl_train, y = df_train, by = c("year", "weekofyear", "city"))
sum(is.na(train))
## [1] 548
#548 na - we will omit na's
train = na.omit(train)
#double check
sum(is.na(train))
## [1] 0
describe(test)
## Warning in describe(test): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
##                                       vars   n    mean    sd  median trimmed
## city*                                    1 416     NaN    NA      NA     NaN
## year                                     2 416 2010.77  1.43 2011.00 2010.81
## weekofyear                               3 416   26.44 14.98   26.00   26.44
## week_start_date                          4 416     NaN    NA      NA     NaN
## ndvi_ne                                  5 373    0.13  0.16    0.11    0.13
## ndvi_nw                                  6 405    0.13  0.14    0.09    0.12
## ndvi_se                                  7 415    0.21  0.08    0.20    0.20
## ndvi_sw                                  8 415    0.20  0.09    0.19    0.19
## precipitation_amt_mm                     9 414   38.35 35.17   31.45   33.87
## reanalysis_air_temp_k                   10 414  298.82  1.47  298.55  298.84
## reanalysis_avg_temp_k                   11 414  299.35  1.31  299.33  299.39
## reanalysis_dew_point_temp_k             12 414  295.42  1.52  295.82  295.54
## reanalysis_max_air_temp_k               13 414  303.62  3.10  302.75  303.36
## reanalysis_min_air_temp_k               14 414  295.74  2.76  296.30  295.93
## reanalysis_precip_amt_kg_per_m2         15 414   42.17 48.91   25.85   32.62
## reanalysis_relative_humidity_percent    16 414   82.50  7.38   80.33   82.09
## reanalysis_sat_precip_amt_mm            17 414   38.35 35.17   31.45   33.87
## reanalysis_specific_humidity_g_per_kg   18 414   16.93  1.56   17.34   17.03
## reanalysis_tdtr_k                       19 414    5.12  3.54    2.91    4.65
## station_avg_temp_c                      20 404   27.37  1.23   27.48   27.40
## station_diur_temp_rng_c                 21 404    7.81  2.45    6.64    7.55
## station_max_temp_c                      22 413   32.53  1.92   32.80   32.60
## station_min_temp_c                      23 407   22.37  1.73   22.20   22.36
## station_precip_mm                       24 411   34.28 34.66   23.60   28.62
## total_cases                             25 416    0.00  0.00    0.00    0.00
##                                         mad     min     max  range  skew
## city*                                    NA     Inf    -Inf   -Inf    NA
## year                                   1.48 2008.00 2013.00   5.00 -0.32
## weekofyear                            19.27    1.00   53.00  52.00  0.00
## week_start_date                          NA     Inf    -Inf   -Inf    NA
## ndvi_ne                                0.19   -0.46    0.50   0.96 -0.17
## ndvi_nw                                0.15   -0.21    0.65   0.86  0.40
## ndvi_se                                0.08    0.01    0.45   0.45  0.38
## ndvi_sw                                0.09   -0.01    0.53   0.54  0.69
## precipitation_amt_mm                  35.77    0.00  169.34 169.34  1.01
## reanalysis_air_temp_k                  1.73  294.55  301.94   7.38 -0.03
## reanalysis_avg_temp_k                  1.61  295.24  303.33   8.09 -0.15
## reanalysis_dew_point_temp_k            1.57  290.82  297.79   6.98 -0.65
## reanalysis_max_air_temp_k              3.19  298.20  314.10  15.90  0.70
## reanalysis_min_air_temp_k              3.41  286.20  299.70  13.50 -0.50
## reanalysis_precip_amt_kg_per_m2       27.58    0.00  301.40 301.40  2.32
## reanalysis_relative_humidity_percent   6.10   64.92   97.98  33.06  0.53
## reanalysis_sat_precip_amt_mm          35.77    0.00  169.34 169.34  1.01
## reanalysis_specific_humidity_g_per_kg  1.66   12.54   19.60   7.06 -0.54
## reanalysis_tdtr_k                      1.14    1.49   14.49  13.00  0.88
## station_avg_temp_c                     1.34   24.16   30.27   6.11 -0.23
## station_diur_temp_rng_c                1.58    4.04   14.72  10.68  0.82
## station_max_temp_c                     1.78   27.20   38.40  11.20 -0.28
## station_min_temp_c                     1.63   14.20   26.70  12.50 -0.22
## station_precip_mm                     24.76    0.00  212.00 212.00  1.99
## total_cases                            0.00    0.00    0.00   0.00   NaN
##                                       kurtosis   se
## city*                                       NA   NA
## year                                     -0.79 0.07
## weekofyear                               -1.20 0.73
## week_start_date                             NA   NA
## ndvi_ne                                  -0.42 0.01
## ndvi_nw                                  -0.49 0.01
## ndvi_se                                  -0.05 0.00
## ndvi_sw                                   0.11 0.00
## precipitation_amt_mm                      0.65 1.73
## reanalysis_air_temp_k                    -0.90 0.07
## reanalysis_avg_temp_k                    -0.70 0.06
## reanalysis_dew_point_temp_k              -0.41 0.07
## reanalysis_max_air_temp_k                -0.30 0.15
## reanalysis_min_air_temp_k                -0.65 0.14
## reanalysis_precip_amt_kg_per_m2           6.59 2.40
## reanalysis_relative_humidity_percent     -0.75 0.36
## reanalysis_sat_precip_amt_mm              0.65 1.73
## reanalysis_specific_humidity_g_per_kg    -0.67 0.08
## reanalysis_tdtr_k                        -0.71 0.17
## station_avg_temp_c                       -0.67 0.06
## station_diur_temp_rng_c                  -0.62 0.12
## station_max_temp_c                       -0.36 0.09
## station_min_temp_c                        0.98 0.09
## station_precip_mm                         5.55 1.71
## total_cases                                NaN 0.00
describe(train)
## Warning in describe(train): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
##                                       vars    n    mean    sd  median trimmed
## city*                                    1 1199     NaN    NA      NA     NaN
## year                                     2 1199 2001.30  5.35 2002.00 2001.62
## weekofyear                               3 1199   26.50 14.90   26.00   26.49
## total_cases                              4 1199   21.20 30.86   11.00   14.78
## week_start_date                          5 1199     NaN    NA      NA     NaN
## ndvi_ne                                  6 1199    0.14  0.14    0.12    0.14
## ndvi_nw                                  7 1199    0.13  0.12    0.13    0.13
## ndvi_se                                  8 1199    0.21  0.07    0.20    0.20
## ndvi_sw                                  9 1199    0.21  0.09    0.19    0.20
## precipitation_amt_mm                    10 1199   47.58 43.18   41.41   42.52
## reanalysis_air_temp_k                   11 1199  298.68  1.36  298.62  298.69
## reanalysis_avg_temp_k                   12 1199  299.24  1.26  299.32  299.28
## reanalysis_dew_point_temp_k             13 1199  295.30  1.50  295.68  295.44
## reanalysis_max_air_temp_k               14 1199  303.66  3.30  302.60  303.37
## reanalysis_min_air_temp_k               15 1199  295.58  2.59  296.00  295.77
## reanalysis_precip_amt_kg_per_m2         16 1199   41.55 44.49   28.60   33.67
## reanalysis_relative_humidity_percent    17 1199   82.63  7.30   80.80   82.23
## reanalysis_sat_precip_amt_mm            18 1199   47.58 43.18   41.41   42.52
## reanalysis_specific_humidity_g_per_kg   19 1199   16.81  1.52   17.17   16.91
## reanalysis_tdtr_k                       20 1199    5.14  3.59    3.00    4.64
## station_avg_temp_c                      21 1199   27.23  1.27   27.45   27.31
## station_diur_temp_rng_c                 22 1199    8.25  2.18    7.47    8.06
## station_max_temp_c                      23 1199   32.57  1.95   32.80   32.64
## station_min_temp_c                      24 1199   22.08  1.55   22.10   22.11
## station_precip_mm                       25 1199   40.92 49.00   24.70   31.74
##                                         mad     min     max  range  skew
## city*                                    NA     Inf    -Inf   -Inf    NA
## year                                   5.93 1990.00 2010.00  20.00 -0.48
## weekofyear                            19.27    1.00   52.00  51.00  0.00
## total_cases                           13.34    0.00  329.00 329.00  3.80
## week_start_date                          NA     Inf    -Inf   -Inf    NA
## ndvi_ne                                0.15   -0.41    0.51   0.91 -0.08
## ndvi_nw                                0.12   -0.46    0.45   0.91 -0.06
## ndvi_se                                0.07   -0.02    0.54   0.55  0.57
## ndvi_sw                                0.08   -0.06    0.55   0.61  0.72
## precipitation_amt_mm                  43.90    0.00  390.60 390.60  1.73
## reanalysis_air_temp_k                  1.55  294.64  302.20   7.56 -0.07
## reanalysis_avg_temp_k                  1.43  294.89  302.61   7.72 -0.23
## reanalysis_dew_point_temp_k            1.49  289.64  298.45   8.81 -0.77
## reanalysis_max_air_temp_k              2.97  297.80  313.20  15.40  0.74
## reanalysis_min_air_temp_k              2.97  286.90  299.90  13.00 -0.57
## reanalysis_precip_amt_kg_per_m2       26.24    0.00  570.50 570.50  3.41
## reanalysis_relative_humidity_percent   6.17   57.79   98.61  40.82  0.49
## reanalysis_sat_precip_amt_mm          43.90    0.00  390.60 390.60  1.73
## reanalysis_specific_humidity_g_per_kg  1.55   11.72   20.46   8.75 -0.58
## reanalysis_tdtr_k                      1.42    1.36   16.03  14.67  0.92
## station_avg_temp_c                     1.24   21.40   30.80   9.40 -0.62
## station_diur_temp_rng_c                1.87    4.53   15.80  11.27  0.70
## station_max_temp_c                     1.63   26.70   42.20  15.50 -0.27
## station_min_temp_c                     1.63   14.70   25.60  10.90 -0.28
## station_precip_mm                     27.43    0.00  543.30 543.30  2.94
##                                       kurtosis   se
## city*                                       NA   NA
## year                                     -0.75 0.15
## weekofyear                               -1.20 0.43
## total_cases                              21.13 0.89
## week_start_date                             NA   NA
## ndvi_ne                                  -0.14 0.00
## ndvi_nw                                   0.13 0.00
## ndvi_se                                   0.44 0.00
## ndvi_sw                                   0.58 0.00
## precipitation_amt_mm                      7.45 1.25
## reanalysis_air_temp_k                    -0.67 0.04
## reanalysis_avg_temp_k                    -0.49 0.04
## reanalysis_dew_point_temp_k               0.03 0.04
## reanalysis_max_air_temp_k                -0.44 0.10
## reanalysis_min_air_temp_k                -0.39 0.07
## reanalysis_precip_amt_kg_per_m2          22.70 1.28
## reanalysis_relative_humidity_percent     -0.59 0.21
## reanalysis_sat_precip_amt_mm              7.45 1.25
## reanalysis_specific_humidity_g_per_kg    -0.39 0.04
## reanalysis_tdtr_k                        -0.52 0.10
## station_avg_temp_c                        0.03 0.04
## station_diur_temp_rng_c                  -0.54 0.06
## station_max_temp_c                        0.30 0.06
## station_min_temp_c                        0.29 0.04
## station_precip_mm                        14.89 1.41

Separate train set into cities

#san juan train set
sj.train = train %>% filter(city == 'sj')
summary(sj.train)
##      city                year        weekofyear     total_cases    
##  Length:727         Min.   :1990   Min.   : 1.00   Min.   :  0.00  
##  Class :character   1st Qu.:1994   1st Qu.:14.00   1st Qu.:  9.00  
##  Mode  :character   Median :1999   Median :27.00   Median : 18.00  
##                     Mean   :1999   Mean   :26.46   Mean   : 30.21  
##                     3rd Qu.:2003   3rd Qu.:39.00   3rd Qu.: 36.00  
##                     Max.   :2008   Max.   :52.00   Max.   :329.00  
##  week_start_date         ndvi_ne             ndvi_nw            ndvi_se        
##  Min.   :1990-04-30   Min.   :-0.406250   Min.   :-0.45610   Min.   :-0.01553  
##  1st Qu.:1994-11-05   1st Qu.: 0.004667   1st Qu.: 0.01642   1st Qu.: 0.13874  
##  Median :1999-09-10   Median : 0.057700   Median : 0.06822   Median : 0.17662  
##  Mean   :1999-06-05   Mean   : 0.058806   Mean   : 0.06691   Mean   : 0.17677  
##  3rd Qu.:2003-11-22   3rd Qu.: 0.112050   3rd Qu.: 0.11493   3rd Qu.: 0.20996  
##  Max.   :2008-04-22   Max.   : 0.493400   Max.   : 0.43710   Max.   : 0.39313  
##     ndvi_sw         precipitation_amt_mm reanalysis_air_temp_k
##  Min.   :-0.06346   Min.   :  0.000      Min.   :295.9        
##  1st Qu.: 0.12811   1st Qu.:  1.045      1st Qu.:298.2        
##  Median : 0.16588   Median : 22.490      Median :299.3        
##  Mean   : 0.16584   Mean   : 35.938      Mean   :299.2        
##  3rd Qu.: 0.20290   3rd Qu.: 52.795      3rd Qu.:300.2        
##  Max.   : 0.38142   Max.   :390.600      Max.   :302.2        
##  reanalysis_avg_temp_k reanalysis_dew_point_temp_k reanalysis_max_air_temp_k
##  Min.   :296.1         Min.   :289.6               Min.   :297.8            
##  1st Qu.:298.3         1st Qu.:293.9               1st Qu.:300.5            
##  Median :299.4         Median :295.5               Median :301.5            
##  Mean   :299.3         Mean   :295.1               Mean   :301.4            
##  3rd Qu.:300.2         3rd Qu.:296.4               3rd Qu.:302.4            
##  Max.   :302.2         Max.   :297.8               Max.   :304.3            
##  reanalysis_min_air_temp_k reanalysis_precip_amt_kg_per_m2
##  Min.   :292.6             Min.   :  0.20                 
##  1st Qu.:296.4             1st Qu.: 10.50                 
##  Median :297.5             Median : 21.40                 
##  Mean   :297.3             Mean   : 30.17                 
##  3rd Qu.:298.4             3rd Qu.: 36.80                 
##  Max.   :299.9             Max.   :570.50                 
##  reanalysis_relative_humidity_percent reanalysis_sat_precip_amt_mm
##  Min.   :66.74                        Min.   :  0.000             
##  1st Qu.:76.29                        1st Qu.:  1.045             
##  Median :78.75                        Median : 22.490             
##  Mean   :78.58                        Mean   : 35.938             
##  3rd Qu.:81.07                        3rd Qu.: 52.795             
##  Max.   :87.58                        Max.   :390.600             
##  reanalysis_specific_humidity_g_per_kg reanalysis_tdtr_k station_avg_temp_c
##  Min.   :11.72                         Min.   :1.357     Min.   :22.84     
##  1st Qu.:15.30                         1st Qu.:2.157     1st Qu.:25.84     
##  Median :16.86                         Median :2.471     Median :27.30     
##  Mean   :16.58                         Mean   :2.529     Mean   :27.04     
##  3rd Qu.:17.90                         3rd Qu.:2.814     3rd Qu.:28.21     
##  Max.   :19.44                         Max.   :4.429     Max.   :30.07     
##  station_diur_temp_rng_c station_max_temp_c station_min_temp_c
##  Min.   :4.529           Min.   :26.70      Min.   :17.80     
##  1st Qu.:6.257           1st Qu.:30.60      1st Qu.:21.70     
##  Median :6.771           Median :31.70      Median :22.80     
##  Mean   :6.762           Mean   :31.62      Mean   :22.62     
##  3rd Qu.:7.279           3rd Qu.:32.80      3rd Qu.:23.90     
##  Max.   :9.914           Max.   :35.60      Max.   :25.60     
##  station_precip_mm
##  Min.   :  0.00   
##  1st Qu.:  7.05   
##  Median : 18.10   
##  Mean   : 26.09   
##  3rd Qu.: 34.05   
##  Max.   :163.10
head(sj.train)
## # A tibble: 6 x 25
##   city   year weekofyear total_cases week_start_date ndvi_ne ndvi_nw ndvi_se
##   <chr> <dbl>      <dbl>       <dbl> <date>            <dbl>   <dbl>   <dbl>
## 1 sj     1990         18           4 1990-04-30       0.123   0.104    0.198
## 2 sj     1990         19           5 1990-05-07       0.170   0.142    0.162
## 3 sj     1990         20           4 1990-05-14       0.0322  0.173    0.157
## 4 sj     1990         21           3 1990-05-21       0.129   0.245    0.228
## 5 sj     1990         22           6 1990-05-28       0.196   0.262    0.251
## 6 sj     1990         24           4 1990-06-11       0.113   0.0928   0.205
## # ... with 17 more variables: ndvi_sw <dbl>, precipitation_amt_mm <dbl>,
## #   reanalysis_air_temp_k <dbl>, reanalysis_avg_temp_k <dbl>,
## #   reanalysis_dew_point_temp_k <dbl>, reanalysis_max_air_temp_k <dbl>,
## #   reanalysis_min_air_temp_k <dbl>, reanalysis_precip_amt_kg_per_m2 <dbl>,
## #   reanalysis_relative_humidity_percent <dbl>,
## #   reanalysis_sat_precip_amt_mm <dbl>,
## #   reanalysis_specific_humidity_g_per_kg <dbl>, reanalysis_tdtr_k <dbl>,
## #   station_avg_temp_c <dbl>, station_diur_temp_rng_c <dbl>,
## #   station_max_temp_c <dbl>, station_min_temp_c <dbl>, station_precip_mm <dbl>
#Iquitos train set
iq.train = train %>% filter(city == 'iq')
summary(iq.train)
##      city                year        weekofyear     total_cases    
##  Length:472         Min.   :2000   Min.   : 1.00   Min.   : 0.000  
##  Class :character   1st Qu.:2002   1st Qu.:13.00   1st Qu.: 1.000  
##  Mode  :character   Median :2005   Median :26.00   Median : 5.000  
##                     Mean   :2005   Mean   :26.55   Mean   : 7.309  
##                     3rd Qu.:2007   3rd Qu.:40.00   3rd Qu.: 9.000  
##                     Max.   :2010   Max.   :52.00   Max.   :83.000  
##  week_start_date         ndvi_ne           ndvi_nw           ndvi_se       
##  Min.   :2000-07-01   Min.   :0.06173   Min.   :0.03586   Min.   :0.02988  
##  1st Qu.:2002-12-08   1st Qu.:0.20154   1st Qu.:0.18026   1st Qu.:0.19461  
##  Median :2005-06-04   Median :0.26538   Median :0.23094   Median :0.25070  
##  Mean   :2005-06-12   Mean   :0.26434   Mean   :0.23826   Mean   :0.24991  
##  3rd Qu.:2007-11-27   3rd Qu.:0.31923   3rd Qu.:0.29296   3rd Qu.:0.30213  
##  Max.   :2010-06-25   Max.   :0.50836   Max.   :0.45443   Max.   :0.53831  
##     ndvi_sw        precipitation_amt_mm reanalysis_air_temp_k
##  Min.   :0.06418   Min.   :  0.00       Min.   :294.6        
##  1st Qu.:0.20556   1st Qu.: 41.89       1st Qu.:297.1        
##  Median :0.26243   Median : 61.13       Median :297.8        
##  Mean   :0.26711   Mean   : 65.51       Mean   :297.9        
##  3rd Qu.:0.32523   3rd Qu.: 85.80       3rd Qu.:298.7        
##  Max.   :0.54602   Max.   :210.83       Max.   :301.6        
##  reanalysis_avg_temp_k reanalysis_dew_point_temp_k reanalysis_max_air_temp_k
##  Min.   :294.9         Min.   :290.1               Min.   :300.0            
##  1st Qu.:298.2         1st Qu.:294.7               1st Qu.:305.2            
##  Median :299.2         Median :295.9               Median :307.1            
##  Mean   :299.1         Mean   :295.6               Mean   :307.1            
##  3rd Qu.:300.1         3rd Qu.:296.6               3rd Qu.:308.8            
##  Max.   :302.6         Max.   :298.4               Max.   :313.2            
##  reanalysis_min_air_temp_k reanalysis_precip_amt_kg_per_m2
##  Min.   :286.9             Min.   :  0.00                 
##  1st Qu.:292.0             1st Qu.: 24.91                 
##  Median :293.1             Median : 47.05                 
##  Mean   :292.9             Mean   : 59.07                 
##  3rd Qu.:294.2             3rd Qu.: 72.95                 
##  Max.   :296.0             Max.   :362.03                 
##  reanalysis_relative_humidity_percent reanalysis_sat_precip_amt_mm
##  Min.   :57.79                        Min.   :  0.00              
##  1st Qu.:84.62                        1st Qu.: 41.89              
##  Median :91.10                        Median : 61.13              
##  Mean   :88.87                        Mean   : 65.51              
##  3rd Qu.:94.60                        3rd Qu.: 85.80              
##  Max.   :98.61                        Max.   :210.83              
##  reanalysis_specific_humidity_g_per_kg reanalysis_tdtr_k station_avg_temp_c
##  Min.   :12.11                         Min.   : 3.714    Min.   :21.40     
##  1st Qu.:16.18                         1st Qu.: 7.371    1st Qu.:27.00     
##  Median :17.48                         Median : 8.914    Median :27.60     
##  Mean   :17.16                         Mean   : 9.153    Mean   :27.53     
##  3rd Qu.:18.20                         3rd Qu.:10.943    3rd Qu.:28.10     
##  Max.   :20.46                         Max.   :16.029    Max.   :30.80     
##  station_diur_temp_rng_c station_max_temp_c station_min_temp_c
##  Min.   : 5.20           Min.   :30.10      Min.   :14.70     
##  1st Qu.: 9.50           1st Qu.:33.20      1st Qu.:20.68     
##  Median :10.57           Median :34.00      Median :21.40     
##  Mean   :10.54           Mean   :34.02      Mean   :21.24     
##  3rd Qu.:11.63           3rd Qu.:35.00      3rd Qu.:22.00     
##  Max.   :15.80           Max.   :42.20      Max.   :24.20     
##  station_precip_mm
##  Min.   :  0.00   
##  1st Qu.: 18.07   
##  Median : 46.10   
##  Mean   : 63.75   
##  3rd Qu.: 86.67   
##  Max.   :543.30
head(iq.train)
## # A tibble: 6 x 25
##   city   year weekofyear total_cases week_start_date ndvi_ne ndvi_nw ndvi_se
##   <chr> <dbl>      <dbl>       <dbl> <date>            <dbl>   <dbl>   <dbl>
## 1 iq     2000         26           0 2000-07-01        0.193   0.132   0.341
## 2 iq     2000         27           0 2000-07-08        0.217   0.276   0.289
## 3 iq     2000         28           0 2000-07-15        0.177   0.173   0.204
## 4 iq     2000         29           0 2000-07-22        0.228   0.145   0.254
## 5 iq     2000         30           0 2000-07-29        0.329   0.322   0.254
## 6 iq     2000         31           0 2000-08-05        0.206   0.191   0.232
## # ... with 17 more variables: ndvi_sw <dbl>, precipitation_amt_mm <dbl>,
## #   reanalysis_air_temp_k <dbl>, reanalysis_avg_temp_k <dbl>,
## #   reanalysis_dew_point_temp_k <dbl>, reanalysis_max_air_temp_k <dbl>,
## #   reanalysis_min_air_temp_k <dbl>, reanalysis_precip_amt_kg_per_m2 <dbl>,
## #   reanalysis_relative_humidity_percent <dbl>,
## #   reanalysis_sat_precip_amt_mm <dbl>,
## #   reanalysis_specific_humidity_g_per_kg <dbl>, reanalysis_tdtr_k <dbl>,
## #   station_avg_temp_c <dbl>, station_diur_temp_rng_c <dbl>,
## #   station_max_temp_c <dbl>, station_min_temp_c <dbl>, station_precip_mm <dbl>

Make each city into a time series

sj.ts = ts(sj.train$total_cases, frequency=52)
iq.ts = ts(iq.train$total_cases, frequency=52)
autoplot(sj.ts)+ggtitle("San Juan - Train")

autoplot(iq.ts)+ggtitle("Iquitos - Train")

ggseasonplot(sj.ts)

ggseasonplot(iq.ts)

#it doesn't really seem like there is any seasonality
summary(sj.ts)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    9.00   18.00   30.21   36.00  329.00
summary(iq.ts)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.000   5.000   7.309   9.000  83.000

San Juan Models

#ets
sj.ets = ets(sj.ts)
## Warning in ets(sj.ts): I can't handle data with frequency greater than 24.
## Seasonality will be ignored. Try stlf() if you need seasonal forecasts.
summary(sj.ets)
## ETS(A,N,N) 
## 
## Call:
##  ets(y = sj.ts) 
## 
##   Smoothing parameters:
##     alpha = 0.9999 
## 
##   Initial states:
##     l = 4.1836 
## 
##   sigma:  12.4615
## 
##      AIC     AICc      BIC 
## 8462.075 8462.108 8475.842 
## 
## Training set error measures:
##                       ME     RMSE     MAE  MPE MAPE      MASE       ACF1
## Training set 0.001122585 12.44438 7.75398 -Inf  Inf 0.2243436 0.09199171
f.sj.ets = forecast(sj.ets, h=260)
autoplot(f.sj.ets)

checkresiduals(f.sj.ets)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(A,N,N)
## Q* = 106.72, df = 102, p-value = 0.3549
## 
## Model df: 2.   Total lags used: 104
#arima
sj.arima = auto.arima(sj.ts)
summary(sj.arima)
## Series: sj.ts 
## ARIMA(2,1,2) 
## 
## Coefficients:
##           ar1      ar2     ma1     ma2
##       -1.0539  -0.4930  1.1509  0.6432
## s.e.   0.1614   0.1615  0.1429  0.1423
## 
## sigma^2 estimated as 151.3:  log likelihood=-2850.29
## AIC=5710.58   AICc=5710.67   BIC=5733.52
## 
## Training set error measures:
##                       ME     RMSE      MAE  MPE MAPE      MASE        ACF1
## Training set 0.001702639 12.25979 7.879365 -Inf  Inf 0.2279713 0.007108063
f.sj.arima = forecast(sj.arima, h=260)
autoplot(f.sj.arima)

checkresiduals(f.sj.arima)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,1,2)
## Q* = 77.52, df = 100, p-value = 0.9534
## 
## Model df: 4.   Total lags used: 104
#stl
sj.stl = stl(sj.ts, t.window = 13, s.window = "periodic", robust=TRUE)
f.sj.stl = forecast(sj.stl, h=260)
autoplot(f.sj.stl)

checkresiduals(f.sj.stl)
## Warning in checkresiduals(f.sj.stl): The fitted degrees of freedom is based on
## the model used for the seasonally adjusted data.

## 
##  Ljung-Box test
## 
## data:  Residuals from STL +  ETS(A,N,N)
## Q* = 106.88, df = 102, p-value = 0.351
## 
## Model df: 2.   Total lags used: 104
#try with differenced data to remove the trends
sj.diff = diff(sj.ts)
autoplot(sj.diff)

#ets2
sj.ets2 = ets(sj.diff)
## Warning in ets(sj.diff): I can't handle data with frequency greater than 24.
## Seasonality will be ignored. Try stlf() if you need seasonal forecasts.
summary(sj.ets2)
## ETS(A,N,N) 
## 
## Call:
##  ets(y = sj.diff) 
## 
##   Smoothing parameters:
##     alpha = 1e-04 
## 
##   Initial states:
##     l = -0.0706 
## 
##   sigma:  12.4708
## 
##      AIC     AICc      BIC 
## 8450.525 8450.558 8464.288 
## 
## Training set error measures:
##                      ME     RMSE      MAE MPE MAPE      MASE       ACF1
## Training set 0.06699624 12.45365 7.770152 Inf  Inf 0.6486354 0.09188973
f.sj.ets2 = forecast(sj.ets2, h=260)
autoplot(f.sj.ets2)

checkresiduals(f.sj.ets2)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(A,N,N)
## Q* = 106.55, df = 102, p-value = 0.3592
## 
## Model df: 2.   Total lags used: 104
#this did not perform as well as the 1st ets model

#arima2
sj.arima2 = auto.arima(sj.diff)
summary(sj.arima2)
## Series: sj.diff 
## ARIMA(2,0,2) with zero mean 
## 
## Coefficients:
##           ar1      ar2     ma1     ma2
##       -1.0539  -0.4930  1.1509  0.6432
## s.e.   0.1614   0.1615  0.1429  0.1423
## 
## sigma^2 estimated as 151.3:  log likelihood=-2850.29
## AIC=5710.58   AICc=5710.67   BIC=5733.52
## 
## Training set error measures:
##                       ME     RMSE      MAE MPE MAPE      MASE        ACF1
## Training set 0.001699475 12.26823 7.890212 NaN  Inf 0.6586578 0.007108043
f.sj.arima2 = forecast(sj.arima2, h=260)
autoplot(f.sj.arima2)

checkresiduals(f.sj.arima2)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,0,2) with zero mean
## Q* = 77.421, df = 100, p-value = 0.9542
## 
## Model df: 4.   Total lags used: 104
#the original arima model was better

#stl2
sj.stl2 = stl(sj.diff, t.window = 13, s.window = "periodic", robust=TRUE)
f.sj.stl2 = forecast(sj.stl2, h=260)
autoplot(f.sj.stl2)

checkresiduals(f.sj.stl2)
## Warning in checkresiduals(f.sj.stl2): The fitted degrees of freedom is based on
## the model used for the seasonally adjusted data.

## 
##  Ljung-Box test
## 
## data:  Residuals from STL +  ETS(A,N,N)
## Q* = 106.21, df = 102, p-value = 0.3679
## 
## Model df: 2.   Total lags used: 104
accuracy(f.sj.ets)
##                       ME     RMSE     MAE  MPE MAPE      MASE       ACF1
## Training set 0.001122585 12.44438 7.75398 -Inf  Inf 0.2243436 0.09199171
accuracy(f.sj.ets2)
##                      ME     RMSE      MAE MPE MAPE      MASE       ACF1
## Training set 0.06699624 12.45365 7.770152 Inf  Inf 0.6486354 0.09188973
accuracy(f.sj.arima)
##                       ME     RMSE      MAE  MPE MAPE      MASE        ACF1
## Training set 0.001702639 12.25979 7.879365 -Inf  Inf 0.2279713 0.007108063
accuracy(f.sj.arima2)
##                       ME     RMSE      MAE MPE MAPE      MASE        ACF1
## Training set 0.001699475 12.26823 7.890212 NaN  Inf 0.6586578 0.007108043
accuracy(f.sj.stl)
##                       ME     RMSE      MAE MPE MAPE      MASE      ACF1
## Training set 0.003182592 12.33912 7.610565 NaN  Inf 0.2201942 0.1071663
accuracy(f.sj.stl2)
##                        ME    RMSE      MAE MPE MAPE      MASE      ACF1
## Training set -0.001628082 12.3095 7.567774 NaN  Inf 0.6317414 0.1110626
#the best model for San Juan was the first arima model
tsdisplay(residuals(f.sj.arima))

autoplot(f.sj.arima)

Iquitos Models

#ets
iq.ets = ets(iq.ts)
## Warning in ets(iq.ts): I can't handle data with frequency greater than 24.
## Seasonality will be ignored. Try stlf() if you need seasonal forecasts.
summary(iq.ets)
## ETS(A,N,N) 
## 
## Call:
##  ets(y = iq.ts) 
## 
##   Smoothing parameters:
##     alpha = 0.5619 
## 
##   Initial states:
##     l = 0.1055 
## 
##   sigma:  6.5424
## 
##      AIC     AICc      BIC 
## 4683.212 4683.263 4695.683 
## 
## Training set error measures:
##                      ME     RMSE      MAE  MPE MAPE      MASE        ACF1
## Training set 0.01157704 6.528546 3.593558 -Inf  Inf 0.3895958 -0.01911295
f.iq.ets = forecast(iq.ets, h=156)
autoplot(f.iq.ets)

checkresiduals(f.iq.ets)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(A,N,N)
## Q* = 71.21, df = 92, p-value = 0.947
## 
## Model df: 2.   Total lags used: 94
#arima
iq.arima = auto.arima(iq.ts)
summary(iq.arima)
## Series: iq.ts 
## ARIMA(0,1,1) 
## 
## Coefficients:
##           ma1
##       -0.4370
## s.e.   0.0407
## 
## sigma^2 estimated as 42.8:  log likelihood=-1552.61
## AIC=3109.22   AICc=3109.24   BIC=3117.53
## 
## Training set error measures:
##                      ME     RMSE     MAE  MPE MAPE      MASE        ACF1
## Training set 0.01195121 6.528547 3.59285 -Inf  Inf 0.3895191 -0.02024583
f.iq.arima = forecast(iq.arima, h=156)
autoplot(f.iq.arima)

checkresiduals(f.iq.arima)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,1)
## Q* = 71.108, df = 93, p-value = 0.9556
## 
## Model df: 1.   Total lags used: 94
#stl
iq.stl = stl(iq.ts, t.window = 13, s.window = "periodic", robust=TRUE)
f.iq.stl = forecast(iq.stl, h=156)
autoplot(f.iq.stl)

checkresiduals(f.iq.stl)
## Warning in checkresiduals(f.iq.stl): The fitted degrees of freedom is based on
## the model used for the seasonally adjusted data.

## 
##  Ljung-Box test
## 
## data:  Residuals from STL +  ETS(A,N,N)
## Q* = 66.411, df = 92, p-value = 0.9796
## 
## Model df: 2.   Total lags used: 94
#try with differenced data to remove the trends
iq.diff = diff(iq.ts)
autoplot(iq.diff)

#ets2
iq.ets2 = ets(iq.diff)
## Warning in ets(iq.diff): I can't handle data with frequency greater than 24.
## Seasonality will be ignored. Try stlf() if you need seasonal forecasts.
summary(iq.ets2)
## ETS(A,N,N) 
## 
## Call:
##  ets(y = iq.diff) 
## 
##   Smoothing parameters:
##     alpha = 1e-04 
## 
##   Initial states:
##     l = 0.0101 
## 
##   sigma:  7.2005
## 
##      AIC     AICc      BIC 
## 4762.585 4762.636 4775.049 
## 
## Training set error measures:
##                        ME     RMSE      MAE  MPE MAPE      MASE      ACF1
## Training set -0.002111055 7.185205 3.849579 -Inf  Inf 0.5687494 -0.391545
f.iq.ets2 = forecast(iq.ets2, h=156)
autoplot(f.iq.ets2)

checkresiduals(f.iq.ets2)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(A,N,N)
## Q* = 147.46, df = 92, p-value = 0.000216
## 
## Model df: 2.   Total lags used: 94
#this did not perform as well as the 1st ets model

#arima2
iq.arima2 = auto.arima(iq.diff)
summary(iq.arima2)
## Series: iq.diff 
## ARIMA(0,0,1) with zero mean 
## 
## Coefficients:
##           ma1
##       -0.4370
## s.e.   0.0407
## 
## sigma^2 estimated as 42.8:  log likelihood=-1552.61
## AIC=3109.22   AICc=3109.24   BIC=3117.53
## 
## Training set error measures:
##                      ME     RMSE      MAE MPE MAPE      MASE        ACF1
## Training set 0.01197658 6.535474 3.600478 NaN  Inf 0.5319465 -0.02024583
f.iq.arima2 = forecast(iq.arima2, h=156)
autoplot(f.iq.arima2)

checkresiduals(f.iq.arima2)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,0,1) with zero mean
## Q* = 70.971, df = 93, p-value = 0.9567
## 
## Model df: 1.   Total lags used: 94
#stl2
iq.stl2 = stl(iq.diff, t.window = 13, s.window = "periodic", robust=TRUE)
f.iq.stl2 = forecast(iq.stl2, h=156)
autoplot(f.iq.stl2)

checkresiduals(f.iq.stl2)
## Warning in checkresiduals(f.iq.stl2): The fitted degrees of freedom is based on
## the model used for the seasonally adjusted data.

## 
##  Ljung-Box test
## 
## data:  Residuals from STL +  ETS(A,N,N)
## Q* = 138, df = 92, p-value = 0.001366
## 
## Model df: 2.   Total lags used: 94
accuracy(f.iq.ets)
##                      ME     RMSE      MAE  MPE MAPE      MASE        ACF1
## Training set 0.01157704 6.528546 3.593558 -Inf  Inf 0.3895958 -0.01911295
accuracy(f.iq.ets2)
##                        ME     RMSE      MAE  MPE MAPE      MASE      ACF1
## Training set -0.002111055 7.185205 3.849579 -Inf  Inf 0.5687494 -0.391545
accuracy(f.iq.arima)
##                      ME     RMSE     MAE  MPE MAPE      MASE        ACF1
## Training set 0.01195121 6.528547 3.59285 -Inf  Inf 0.3895191 -0.02024583
accuracy(f.iq.arima2)
##                      ME     RMSE      MAE MPE MAPE      MASE        ACF1
## Training set 0.01197658 6.535474 3.600478 NaN  Inf 0.5319465 -0.02024583
accuracy(f.iq.stl)
##                      ME     RMSE      MAE MPE MAPE      MASE        ACF1
## Training set 0.01405503 6.511848 3.640696 NaN  Inf 0.3947063 -0.01166279
accuracy(f.iq.stl2)
##                         ME     RMSE      MAE MPE MAPE    MASE       ACF1
## Training set -0.0008546277 7.076454 3.886606 NaN  Inf 0.57422 -0.3818434
#The best model for Iquitos was the first arima model
tsdisplay(residuals(f.iq.arima))

autoplot(f.iq.arima)

VAR Model

#var
ccf(sj.ts,iq.ts, type = c("correlation","covariance"))

d1= cbind(sj.ts, iq.ts)
d1 = na.omit(d1)
autoplot(d1)

VARselect(d1,lag.max=10,type="const")[["selection"]]
## AIC(n)  HQ(n)  SC(n) FPE(n) 
##      5      2      2      5
#var model 1
var1 = VAR(d1, p=1, type="const")
serial.test(var1, lags.pt=10, type="PT.asymptotic")
## 
##  Portmanteau Test (asymptotic)
## 
## data:  Residuals of VAR object var1
## Chi-squared = 94.287, df = 36, p-value = 4.072e-07
#var model 2
var2 = VAR(d1, p=2, type="const")
serial.test(var2, lags.pt=10, type="PT.asymptotic")
## 
##  Portmanteau Test (asymptotic)
## 
## data:  Residuals of VAR object var2
## Chi-squared = 53.17, df = 32, p-value = 0.01078
#var model 3
var3 = VAR(d1, p=10, type="const")
serial.test(var3, lags.pt=10, type="PT.asymptotic")
## 
##  Portmanteau Test (asymptotic)
## 
## data:  Residuals of VAR object var3
## Chi-squared = 2.9304, df = 0, p-value < 2.2e-16
#model 3 appears to be the best of the var models
f.var1 = forecast(var1)
f.var2 = forecast(var2)
f.var3 = forecast(var3)

autoplot(f.var1)

autoplot(f.var2)

autoplot(f.var3)

tsdisplay(residuals(var1))

tsdisplay(residuals(var2))

tsdisplay(residuals(var3))

#the 3rd var model seems to be the best