DengAI: Predicting Disease Spread

Predictive Analytics Final

The goal of this project is to predict local epidemics of dengue fever. So, let’s get started and see what we can find!

library(psych) #describe
library(forecast) #time series

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(ggfortify) # neural nets

## Loading required package: ggplot2

## 
## Attaching package: 'ggplot2'

## The following objects are masked from 'package:psych':
## 
##     %+%, alpha

## Registered S3 methods overwritten by 'ggfortify':
##   method                 from    
##   autoplot.Arima         forecast
##   autoplot.acf           forecast
##   autoplot.ar            forecast
##   autoplot.bats          forecast
##   autoplot.decomposed.ts forecast
##   autoplot.ets           forecast
##   autoplot.forecast      forecast
##   autoplot.stl           forecast
##   autoplot.ts            forecast
##   fitted.ar              forecast
##   fortify.ts             forecast
##   residuals.ar           forecast

library(ggplot2) # visualizations
library(dplyr) # %>%

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(stargazer) # tables

## 
## Please cite as:

##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer

library(corrplot) #correlation plots

## corrplot 0.84 loaded

library(vars) #VAR modelling

## Loading required package: MASS

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

## Loading required package: strucchange

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## Loading required package: sandwich

## Loading required package: urca

## Loading required package: lmtest

library(neuralnet) #neural networks

## 
## Attaching package: 'neuralnet'

## The following object is masked from 'package:dplyr':
## 
##     compute

#import training data
labels <- read.csv("C:/Users/andre/Documents/Boston College/2. Summer 2020/Forecasting/final/dengue_labels_train.csv")
features <- read.csv("C:/Users/andre/Documents/Boston College/2. Summer 2020/Forecasting/final/dengue_features_train.csv")
test <- read.csv("C:/Users/andre/Documents/Boston College/2. Summer 2020/Forecasting/final/dengue_features_test.csv")

labels and features make up the training data for this project, while the aptly named test data will be the test set.

Describing the Data: What’ve we Got to Work With?

#descriptive stats training data
str(labels)

## 'data.frame':    1456 obs. of  4 variables:
##  $ city       : chr  "sj" "sj" "sj" "sj" ...
##  $ year       : int  1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
##  $ weekofyear : int  18 19 20 21 22 23 24 25 26 27 ...
##  $ total_cases: int  4 5 4 3 6 2 4 5 10 6 ...

summary(labels)

##      city                year        weekofyear     total_cases    
##  Length:1456        Min.   :1990   Min.   : 1.00   Min.   :  0.00  
##  Class :character   1st Qu.:1997   1st Qu.:13.75   1st Qu.:  5.00  
##  Mode  :character   Median :2002   Median :26.50   Median : 12.00  
##                     Mean   :2001   Mean   :26.50   Mean   : 24.68  
##                     3rd Qu.:2005   3rd Qu.:39.25   3rd Qu.: 28.00  
##                     Max.   :2010   Max.   :53.00   Max.   :461.00

str(features)

## 'data.frame':    1456 obs. of  24 variables:
##  $ city                                 : chr  "sj" "sj" "sj" "sj" ...
##  $ year                                 : int  1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
##  $ weekofyear                           : int  18 19 20 21 22 23 24 25 26 27 ...
##  $ week_start_date                      : chr  "4/30/1990" "5/7/1990" "5/14/1990" "5/21/1990" ...
##  $ ndvi_ne                              : num  0.1226 0.1699 0.0323 0.1286 0.1962 ...
##  $ ndvi_nw                              : num  0.104 0.142 0.173 0.245 0.262 ...
##  $ ndvi_se                              : num  0.198 0.162 0.157 0.228 0.251 ...
##  $ ndvi_sw                              : num  0.178 0.155 0.171 0.236 0.247 ...
##  $ precipitation_amt_mm                 : num  12.42 22.82 34.54 15.36 7.52 ...
##  $ reanalysis_air_temp_k                : num  298 298 299 299 300 ...
##  $ reanalysis_avg_temp_k                : num  298 298 299 299 300 ...
##  $ reanalysis_dew_point_temp_k          : num  292 294 295 295 296 ...
##  $ reanalysis_max_air_temp_k            : num  300 301 300 301 302 ...
##  $ reanalysis_min_air_temp_k            : num  296 296 297 297 298 ...
##  $ reanalysis_precip_amt_kg_per_m2      : num  32 17.9 26.1 13.9 12.2 ...
##  $ reanalysis_relative_humidity_percent : num  73.4 77.4 82.1 80.3 80.5 ...
##  $ reanalysis_sat_precip_amt_mm         : num  12.42 22.82 34.54 15.36 7.52 ...
##  $ reanalysis_specific_humidity_g_per_kg: num  14 15.4 16.8 16.7 17.2 ...
##  $ reanalysis_tdtr_k                    : num  2.63 2.37 2.3 2.43 3.01 ...
##  $ station_avg_temp_c                   : num  25.4 26.7 26.7 27.5 28.9 ...
##  $ station_diur_temp_rng_c              : num  6.9 6.37 6.49 6.77 9.37 ...
##  $ station_max_temp_c                   : num  29.4 31.7 32.2 33.3 35 34.4 32.2 33.9 33.9 33.9 ...
##  $ station_min_temp_c                   : num  20 22.2 22.8 23.3 23.9 23.9 23.3 22.8 22.8 24.4 ...
##  $ station_precip_mm                    : num  16 8.6 41.4 4 5.8 39.1 29.7 21.1 21.1 1.1 ...

summary(features)

##      city                year        weekofyear    week_start_date   
##  Length:1456        Min.   :1990   Min.   : 1.00   Length:1456       
##  Class :character   1st Qu.:1997   1st Qu.:13.75   Class :character  
##  Mode  :character   Median :2002   Median :26.50   Mode  :character  
##                     Mean   :2001   Mean   :26.50                     
##                     3rd Qu.:2005   3rd Qu.:39.25                     
##                     Max.   :2010   Max.   :53.00                     
##                                                                      
##     ndvi_ne            ndvi_nw            ndvi_se            ndvi_sw        
##  Min.   :-0.40625   Min.   :-0.45610   Min.   :-0.01553   Min.   :-0.06346  
##  1st Qu.: 0.04495   1st Qu.: 0.04922   1st Qu.: 0.15509   1st Qu.: 0.14421  
##  Median : 0.12882   Median : 0.12143   Median : 0.19605   Median : 0.18945  
##  Mean   : 0.14229   Mean   : 0.13055   Mean   : 0.20378   Mean   : 0.20231  
##  3rd Qu.: 0.24848   3rd Qu.: 0.21660   3rd Qu.: 0.24885   3rd Qu.: 0.24698  
##  Max.   : 0.50836   Max.   : 0.45443   Max.   : 0.53831   Max.   : 0.54602  
##  NA's   :194        NA's   :52         NA's   :22         NA's   :22        
##  precipitation_amt_mm reanalysis_air_temp_k reanalysis_avg_temp_k
##  Min.   :  0.00       Min.   :294.6         Min.   :294.9        
##  1st Qu.:  9.80       1st Qu.:297.7         1st Qu.:298.3        
##  Median : 38.34       Median :298.6         Median :299.3        
##  Mean   : 45.76       Mean   :298.7         Mean   :299.2        
##  3rd Qu.: 70.23       3rd Qu.:299.8         3rd Qu.:300.2        
##  Max.   :390.60       Max.   :302.2         Max.   :302.9        
##  NA's   :13           NA's   :10            NA's   :10           
##  reanalysis_dew_point_temp_k reanalysis_max_air_temp_k
##  Min.   :289.6               Min.   :297.8            
##  1st Qu.:294.1               1st Qu.:301.0            
##  Median :295.6               Median :302.4            
##  Mean   :295.2               Mean   :303.4            
##  3rd Qu.:296.5               3rd Qu.:305.5            
##  Max.   :298.4               Max.   :314.0            
##  NA's   :10                  NA's   :10               
##  reanalysis_min_air_temp_k reanalysis_precip_amt_kg_per_m2
##  Min.   :286.9             Min.   :  0.00                 
##  1st Qu.:293.9             1st Qu.: 13.05                 
##  Median :296.2             Median : 27.25                 
##  Mean   :295.7             Mean   : 40.15                 
##  3rd Qu.:297.9             3rd Qu.: 52.20                 
##  Max.   :299.9             Max.   :570.50                 
##  NA's   :10                NA's   :10                     
##  reanalysis_relative_humidity_percent reanalysis_sat_precip_amt_mm
##  Min.   :57.79                        Min.   :  0.00              
##  1st Qu.:77.18                        1st Qu.:  9.80              
##  Median :80.30                        Median : 38.34              
##  Mean   :82.16                        Mean   : 45.76              
##  3rd Qu.:86.36                        3rd Qu.: 70.23              
##  Max.   :98.61                        Max.   :390.60              
##  NA's   :10                           NA's   :13                  
##  reanalysis_specific_humidity_g_per_kg reanalysis_tdtr_k station_avg_temp_c
##  Min.   :11.72                         Min.   : 1.357    Min.   :21.40     
##  1st Qu.:15.56                         1st Qu.: 2.329    1st Qu.:26.30     
##  Median :17.09                         Median : 2.857    Median :27.41     
##  Mean   :16.75                         Mean   : 4.904    Mean   :27.19     
##  3rd Qu.:17.98                         3rd Qu.: 7.625    3rd Qu.:28.16     
##  Max.   :20.46                         Max.   :16.029    Max.   :30.80     
##  NA's   :10                            NA's   :10        NA's   :43        
##  station_diur_temp_rng_c station_max_temp_c station_min_temp_c
##  Min.   : 4.529          Min.   :26.70      Min.   :14.7      
##  1st Qu.: 6.514          1st Qu.:31.10      1st Qu.:21.1      
##  Median : 7.300          Median :32.80      Median :22.2      
##  Mean   : 8.059          Mean   :32.45      Mean   :22.1      
##  3rd Qu.: 9.567          3rd Qu.:33.90      3rd Qu.:23.3      
##  Max.   :15.800          Max.   :42.20      Max.   :25.6      
##  NA's   :43              NA's   :20         NA's   :14        
##  station_precip_mm
##  Min.   :  0.00   
##  1st Qu.:  8.70   
##  Median : 23.85   
##  Mean   : 39.33   
##  3rd Qu.: 53.90   
##  Max.   :543.30   
##  NA's   :22

While this gives me some nice descriptive statistics, it is rather unweildy. However, there is a fix to this! Both of these files have the same number of observations, and also have three of the same variables: city, year, and week of the year. This means that we can add these variables together by those three variables. Using the dplyer function left_join() is something new that I learned with this project.

train <- left_join(x = labels, y = features, by = c("year", "weekofyear", "city"))

From the summary of the two datasets before they were joined, we know that there are in fact NAs in the data. So, we need to get rid of those.

#descriptive stats
names(train)

##  [1] "city"                                 
##  [2] "year"                                 
##  [3] "weekofyear"                           
##  [4] "total_cases"                          
##  [5] "week_start_date"                      
##  [6] "ndvi_ne"                              
##  [7] "ndvi_nw"                              
##  [8] "ndvi_se"                              
##  [9] "ndvi_sw"                              
## [10] "precipitation_amt_mm"                 
## [11] "reanalysis_air_temp_k"                
## [12] "reanalysis_avg_temp_k"                
## [13] "reanalysis_dew_point_temp_k"          
## [14] "reanalysis_max_air_temp_k"            
## [15] "reanalysis_min_air_temp_k"            
## [16] "reanalysis_precip_amt_kg_per_m2"      
## [17] "reanalysis_relative_humidity_percent" 
## [18] "reanalysis_sat_precip_amt_mm"         
## [19] "reanalysis_specific_humidity_g_per_kg"
## [20] "reanalysis_tdtr_k"                    
## [21] "station_avg_temp_c"                   
## [22] "station_diur_temp_rng_c"              
## [23] "station_max_temp_c"                   
## [24] "station_min_temp_c"                   
## [25] "station_precip_mm"

summary(train)

##      city                year        weekofyear     total_cases    
##  Length:1456        Min.   :1990   Min.   : 1.00   Min.   :  0.00  
##  Class :character   1st Qu.:1997   1st Qu.:13.75   1st Qu.:  5.00  
##  Mode  :character   Median :2002   Median :26.50   Median : 12.00  
##                     Mean   :2001   Mean   :26.50   Mean   : 24.68  
##                     3rd Qu.:2005   3rd Qu.:39.25   3rd Qu.: 28.00  
##                     Max.   :2010   Max.   :53.00   Max.   :461.00  
##                                                                    
##  week_start_date       ndvi_ne            ndvi_nw            ndvi_se        
##  Length:1456        Min.   :-0.40625   Min.   :-0.45610   Min.   :-0.01553  
##  Class :character   1st Qu.: 0.04495   1st Qu.: 0.04922   1st Qu.: 0.15509  
##  Mode  :character   Median : 0.12882   Median : 0.12143   Median : 0.19605  
##                     Mean   : 0.14229   Mean   : 0.13055   Mean   : 0.20378  
##                     3rd Qu.: 0.24848   3rd Qu.: 0.21660   3rd Qu.: 0.24885  
##                     Max.   : 0.50836   Max.   : 0.45443   Max.   : 0.53831  
##                     NA's   :194        NA's   :52         NA's   :22        
##     ndvi_sw         precipitation_amt_mm reanalysis_air_temp_k
##  Min.   :-0.06346   Min.   :  0.00       Min.   :294.6        
##  1st Qu.: 0.14421   1st Qu.:  9.80       1st Qu.:297.7        
##  Median : 0.18945   Median : 38.34       Median :298.6        
##  Mean   : 0.20231   Mean   : 45.76       Mean   :298.7        
##  3rd Qu.: 0.24698   3rd Qu.: 70.23       3rd Qu.:299.8        
##  Max.   : 0.54602   Max.   :390.60       Max.   :302.2        
##  NA's   :22         NA's   :13           NA's   :10           
##  reanalysis_avg_temp_k reanalysis_dew_point_temp_k reanalysis_max_air_temp_k
##  Min.   :294.9         Min.   :289.6               Min.   :297.8            
##  1st Qu.:298.3         1st Qu.:294.1               1st Qu.:301.0            
##  Median :299.3         Median :295.6               Median :302.4            
##  Mean   :299.2         Mean   :295.2               Mean   :303.4            
##  3rd Qu.:300.2         3rd Qu.:296.5               3rd Qu.:305.5            
##  Max.   :302.9         Max.   :298.4               Max.   :314.0            
##  NA's   :10            NA's   :10                  NA's   :10               
##  reanalysis_min_air_temp_k reanalysis_precip_amt_kg_per_m2
##  Min.   :286.9             Min.   :  0.00                 
##  1st Qu.:293.9             1st Qu.: 13.05                 
##  Median :296.2             Median : 27.25                 
##  Mean   :295.7             Mean   : 40.15                 
##  3rd Qu.:297.9             3rd Qu.: 52.20                 
##  Max.   :299.9             Max.   :570.50                 
##  NA's   :10                NA's   :10                     
##  reanalysis_relative_humidity_percent reanalysis_sat_precip_amt_mm
##  Min.   :57.79                        Min.   :  0.00              
##  1st Qu.:77.18                        1st Qu.:  9.80              
##  Median :80.30                        Median : 38.34              
##  Mean   :82.16                        Mean   : 45.76              
##  3rd Qu.:86.36                        3rd Qu.: 70.23              
##  Max.   :98.61                        Max.   :390.60              
##  NA's   :10                           NA's   :13                  
##  reanalysis_specific_humidity_g_per_kg reanalysis_tdtr_k station_avg_temp_c
##  Min.   :11.72                         Min.   : 1.357    Min.   :21.40     
##  1st Qu.:15.56                         1st Qu.: 2.329    1st Qu.:26.30     
##  Median :17.09                         Median : 2.857    Median :27.41     
##  Mean   :16.75                         Mean   : 4.904    Mean   :27.19     
##  3rd Qu.:17.98                         3rd Qu.: 7.625    3rd Qu.:28.16     
##  Max.   :20.46                         Max.   :16.029    Max.   :30.80     
##  NA's   :10                            NA's   :10        NA's   :43        
##  station_diur_temp_rng_c station_max_temp_c station_min_temp_c
##  Min.   : 4.529          Min.   :26.70      Min.   :14.7      
##  1st Qu.: 6.514          1st Qu.:31.10      1st Qu.:21.1      
##  Median : 7.300          Median :32.80      Median :22.2      
##  Mean   : 8.059          Mean   :32.45      Mean   :22.1      
##  3rd Qu.: 9.567          3rd Qu.:33.90      3rd Qu.:23.3      
##  Max.   :15.800          Max.   :42.20      Max.   :25.6      
##  NA's   :43              NA's   :20         NA's   :14        
##  station_precip_mm
##  Min.   :  0.00   
##  1st Qu.:  8.70   
##  Median : 23.85   
##  Mean   : 39.33   
##  3rd Qu.: 53.90   
##  Max.   :543.30   
##  NA's   :22

describe(train) ## what might be wrong with this?

## Warning in describe(train): NAs introduced by coercion

## Warning in describe(train): NAs introduced by coercion

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

##                                       vars    n    mean    sd  median trimmed
## city*                                    1 1456     NaN    NA      NA     NaN
## year                                     2 1456 2001.03  5.41 2002.00 2001.30
## weekofyear                               3 1456   26.50 15.02   26.50   26.50
## total_cases                              4 1456   24.68 43.60   12.00   15.97
## week_start_date*                         5 1456     NaN    NA      NA     NaN
## ndvi_ne                                  6 1262    0.14  0.14    0.13    0.14
## ndvi_nw                                  7 1404    0.13  0.12    0.12    0.13
## ndvi_se                                  8 1434    0.20  0.07    0.20    0.20
## ndvi_sw                                  9 1434    0.20  0.08    0.19    0.20
## precipitation_amt_mm                    10 1443   45.76 43.72   38.34   40.16
## reanalysis_air_temp_k                   11 1446  298.70  1.36  298.65  298.72
## reanalysis_avg_temp_k                   12 1446  299.23  1.26  299.29  299.25
## reanalysis_dew_point_temp_k             13 1446  295.25  1.53  295.64  295.37
## reanalysis_max_air_temp_k               14 1446  303.43  3.23  302.40  303.10
## reanalysis_min_air_temp_k               15 1446  295.72  2.57  296.20  295.93
## reanalysis_precip_amt_kg_per_m2         16 1446   40.15 43.43   27.24   32.38
## reanalysis_relative_humidity_percent    17 1446   82.16  7.15   80.30   81.69
## reanalysis_sat_precip_amt_mm            18 1443   45.76 43.72   38.34   40.16
## reanalysis_specific_humidity_g_per_kg   19 1446   16.75  1.54   17.09   16.85
## reanalysis_tdtr_k                       20 1446    4.90  3.55    2.86    4.36
## station_avg_temp_c                      21 1413   27.19  1.29   27.41   27.27
## station_diur_temp_rng_c                 22 1413    8.06  2.13    7.30    7.85
## station_max_temp_c                      23 1436   32.45  1.96   32.80   32.52
## station_min_temp_c                      24 1442   22.10  1.57   22.20   22.14
## station_precip_mm                       25 1434   39.33 47.46   23.85   30.46
##                                         mad     min     max  range  skew
## city*                                    NA     Inf    -Inf   -Inf    NA
## year                                   5.93 1990.00 2010.00  20.00 -0.40
## weekofyear                            19.27    1.00   53.00  52.00  0.00
## total_cases                           13.34    0.00  461.00 461.00  5.26
## week_start_date*                         NA     Inf    -Inf   -Inf    NA
## ndvi_ne                                0.15   -0.41    0.51   0.91 -0.11
## ndvi_nw                                0.12   -0.46    0.45   0.91 -0.01
## ndvi_se                                0.07   -0.02    0.54   0.55  0.57
## ndvi_sw                                0.07   -0.06    0.55   0.61  0.75
## precipitation_amt_mm                  44.23    0.00  390.60 390.60  1.73
## reanalysis_air_temp_k                  1.57  294.64  302.20   7.56 -0.08
## reanalysis_avg_temp_k                  1.43  294.89  302.93   8.04 -0.19
## reanalysis_dew_point_temp_k            1.52  289.64  298.45   8.81 -0.72
## reanalysis_max_air_temp_k              2.67  297.80  314.00  16.20  0.85
## reanalysis_min_air_temp_k              2.82  286.90  299.90  13.00 -0.67
## reanalysis_precip_amt_kg_per_m2       25.12    0.00  570.50 570.50  3.38
## reanalysis_relative_humidity_percent   5.68   57.79   98.61  40.82  0.57
## reanalysis_sat_precip_amt_mm          44.23    0.00  390.60 390.60  1.73
## reanalysis_specific_humidity_g_per_kg  1.63   11.72   20.46   8.75 -0.54
## reanalysis_tdtr_k                      1.16    1.36   16.03  14.67  1.07
## station_avg_temp_c                     1.28   21.40   30.80   9.40 -0.57
## station_diur_temp_rng_c                1.63    4.53   15.80  11.27  0.84
## station_max_temp_c                     1.63   26.70   42.20  15.50 -0.26
## station_min_temp_c                     1.63   14.70   25.60  10.90 -0.31
## station_precip_mm                     26.76    0.00  543.30 543.30  2.98
##                                       kurtosis   se
## city*                                       NA   NA
## year                                     -0.89 0.14
## weekofyear                               -1.20 0.39
## total_cases                              36.33 1.14
## week_start_date*                            NA   NA
## ndvi_ne                                  -0.14 0.00
## ndvi_nw                                   0.05 0.00
## ndvi_se                                   0.56 0.00
## ndvi_sw                                   0.70 0.00
## precipitation_amt_mm                      6.74 1.15
## reanalysis_air_temp_k                    -0.69 0.04
## reanalysis_avg_temp_k                    -0.54 0.03
## reanalysis_dew_point_temp_k              -0.12 0.04
## reanalysis_max_air_temp_k                -0.19 0.09
## reanalysis_min_air_temp_k                -0.22 0.07
## reanalysis_precip_amt_kg_per_m2          22.12 1.14
## reanalysis_relative_humidity_percent     -0.40 0.19
## reanalysis_sat_precip_amt_mm              6.74 1.15
## reanalysis_specific_humidity_g_per_kg    -0.49 0.04
## reanalysis_tdtr_k                        -0.21 0.09
## station_avg_temp_c                       -0.16 0.03
## station_diur_temp_rng_c                  -0.26 0.06
## station_max_temp_c                        0.21 0.05
## station_min_temp_c                        0.21 0.04
## station_precip_mm                        15.29 1.25

Next part of my EDA, I want to look at how the data might be correlated (or not).

library(corrplot)
co <- cor(train[6:25])
co <- round(co,2)
corrplot(co, tl.cex = 0.5)

There are a fair amount of strong positive correlations and less negative correlations.

Next I want to look at the data for each month. To do this, I first convert the start_of_week variable, which is given as chr, and convert that to a date format. Once in the date format, I can split this into “%m”, or change in months, and finally convert the character that this gives me into numbers.

train$week_start_date <- as.Date(train$week_start_date, "%m/%d/%Y")
train$Month <- format(train$week_start_date, "%m")
train$Month <- as.numeric(train$Month)

This, along with the already present weekofyear and year variables, enables me to make some visuals to explore.

#cases by week
fig3 <- ggplot(data = train, aes(x=weekofyear, y=total_cases, fill = city)) + 
  geom_bar(stat = "identity") + 
  facet_wrap(.~city) + 
  ggtitle("Cases of Dengue by Week in Iquitos (IQ) and San Juan (SJ)")
fig3

#cases by month 
fig1 <- ggplot(data = train, aes(x=Month, y=total_cases, fill = city)) +
  geom_bar(stat = "identity") + 
  facet_wrap(.~city) + 
  ggtitle("Cases of Dengue by Month in Iquitos (IQ) and San Juan (SJ)")
fig1

#cases by year
fig2 <- ggplot(data = train, aes(x = year, y = total_cases, fill = city)) + 
  geom_bar(stat = "identity") + 
  facet_wrap(.~city) + 
  ggtitle("Cases of Dengue by Year in Iquitos (IQ) and San Juan (SJ)")
fig2

These visuals show me the the dips and spikes for cases is different between the two cities. This makes sense from the literature review, that while similar types of models will be good for forecasting this data, the specific area will cause the parameters of the models to diverge. So, I want to split the data into an Iquitos set and a San Juan set. From now on I will most likely try the same models on both sets but I’m predicting that different parameters will fit each city.

train.sj <- train[1:936,]
train.iq <- train[937:1456,]

Now that these have been split up, let’s do some descriptive statistics for these separately now and see what kind of picture that paints.

summary(train.sj)

##      city                year        weekofyear     total_cases    
##  Length:936         Min.   :1990   Min.   : 1.00   Min.   :  0.00  
##  Class :character   1st Qu.:1994   1st Qu.:13.75   1st Qu.:  9.00  
##  Mode  :character   Median :1999   Median :26.50   Median : 19.00  
##                     Mean   :1999   Mean   :26.50   Mean   : 34.18  
##                     3rd Qu.:2003   3rd Qu.:39.25   3rd Qu.: 37.00  
##                     Max.   :2008   Max.   :53.00   Max.   :461.00  
##                                                                    
##  week_start_date         ndvi_ne            ndvi_nw            ndvi_se        
##  Min.   :1990-04-30   Min.   :-0.40625   Min.   :-0.45610   Min.   :-0.01553  
##  1st Qu.:1994-10-27   1st Qu.: 0.00450   1st Qu.: 0.01642   1st Qu.: 0.13928  
##  Median :1999-04-26   Median : 0.05770   Median : 0.06807   Median : 0.17719  
##  Mean   :1999-04-26   Mean   : 0.05792   Mean   : 0.06747   Mean   : 0.17766  
##  3rd Qu.:2003-10-23   3rd Qu.: 0.11110   3rd Qu.: 0.11520   3rd Qu.: 0.21256  
##  Max.   :2008-04-22   Max.   : 0.49340   Max.   : 0.43710   Max.   : 0.39313  
##                       NA's   :191        NA's   :49         NA's   :19        
##     ndvi_sw         precipitation_amt_mm reanalysis_air_temp_k
##  Min.   :-0.06346   Min.   :  0.00       Min.   :295.9        
##  1st Qu.: 0.12916   1st Qu.:  0.00       1st Qu.:298.2        
##  Median : 0.16597   Median : 20.80       Median :299.3        
##  Mean   : 0.16596   Mean   : 35.47       Mean   :299.2        
##  3rd Qu.: 0.20277   3rd Qu.: 52.18       3rd Qu.:300.1        
##  Max.   : 0.38142   Max.   :390.60       Max.   :302.2        
##  NA's   :19         NA's   :9            NA's   :6            
##  reanalysis_avg_temp_k reanalysis_dew_point_temp_k reanalysis_max_air_temp_k
##  Min.   :296.1         Min.   :289.6               Min.   :297.8            
##  1st Qu.:298.3         1st Qu.:293.8               1st Qu.:300.4            
##  Median :299.4         Median :295.5               Median :301.5            
##  Mean   :299.3         Mean   :295.1               Mean   :301.4            
##  3rd Qu.:300.2         3rd Qu.:296.4               3rd Qu.:302.4            
##  Max.   :302.2         Max.   :297.8               Max.   :304.3            
##  NA's   :6             NA's   :6                   NA's   :6                
##  reanalysis_min_air_temp_k reanalysis_precip_amt_kg_per_m2
##  Min.   :292.6             Min.   :  0.00                 
##  1st Qu.:296.3             1st Qu.: 10.82                 
##  Median :297.5             Median : 21.30                 
##  Mean   :297.3             Mean   : 30.47                 
##  3rd Qu.:298.4             3rd Qu.: 37.00                 
##  Max.   :299.9             Max.   :570.50                 
##  NA's   :6                 NA's   :6                      
##  reanalysis_relative_humidity_percent reanalysis_sat_precip_amt_mm
##  Min.   :66.74                        Min.   :  0.00              
##  1st Qu.:76.25                        1st Qu.:  0.00              
##  Median :78.67                        Median : 20.80              
##  Mean   :78.57                        Mean   : 35.47              
##  3rd Qu.:80.96                        3rd Qu.: 52.18              
##  Max.   :87.58                        Max.   :390.60              
##  NA's   :6                            NA's   :9                   
##  reanalysis_specific_humidity_g_per_kg reanalysis_tdtr_k station_avg_temp_c
##  Min.   :11.72                         Min.   :1.357     Min.   :22.84     
##  1st Qu.:15.24                         1st Qu.:2.157     1st Qu.:25.84     
##  Median :16.85                         Median :2.457     Median :27.23     
##  Mean   :16.55                         Mean   :2.516     Mean   :27.01     
##  3rd Qu.:17.86                         3rd Qu.:2.800     3rd Qu.:28.19     
##  Max.   :19.44                         Max.   :4.429     Max.   :30.07     
##  NA's   :6                             NA's   :6         NA's   :6         
##  station_diur_temp_rng_c station_max_temp_c station_min_temp_c
##  Min.   :4.529           Min.   :26.70      Min.   :17.8      
##  1st Qu.:6.200           1st Qu.:30.60      1st Qu.:21.7      
##  Median :6.757           Median :31.70      Median :22.8      
##  Mean   :6.757           Mean   :31.61      Mean   :22.6      
##  3rd Qu.:7.286           3rd Qu.:32.80      3rd Qu.:23.9      
##  Max.   :9.914           Max.   :35.60      Max.   :25.6      
##  NA's   :6               NA's   :6          NA's   :6         
##  station_precip_mm     Month       
##  Min.   :  0.000   Min.   : 1.000  
##  1st Qu.:  6.825   1st Qu.: 3.750  
##  Median : 17.750   Median : 6.500  
##  Mean   : 26.785   Mean   : 6.419  
##  3rd Qu.: 35.450   3rd Qu.: 9.000  
##  Max.   :305.900   Max.   :12.000  
##  NA's   :6

summary(train.iq)

##      city                year        weekofyear     total_cases     
##  Length:520         Min.   :2000   Min.   : 1.00   Min.   :  0.000  
##  Class :character   1st Qu.:2003   1st Qu.:13.75   1st Qu.:  1.000  
##  Mode  :character   Median :2005   Median :26.50   Median :  5.000  
##                     Mean   :2005   Mean   :26.50   Mean   :  7.565  
##                     3rd Qu.:2007   3rd Qu.:39.25   3rd Qu.:  9.000  
##                     Max.   :2010   Max.   :53.00   Max.   :116.000  
##                                                                     
##  week_start_date         ndvi_ne           ndvi_nw           ndvi_se       
##  Min.   :2000-07-01   Min.   :0.06173   Min.   :0.03586   Min.   :0.02988  
##  1st Qu.:2002-12-30   1st Qu.:0.20000   1st Qu.:0.17954   1st Qu.:0.19474  
##  Median :2005-06-28   Median :0.26364   Median :0.23297   Median :0.24980  
##  Mean   :2005-06-28   Mean   :0.26387   Mean   :0.23878   Mean   :0.25013  
##  3rd Qu.:2007-12-26   3rd Qu.:0.31997   3rd Qu.:0.29393   3rd Qu.:0.30230  
##  Max.   :2010-06-25   Max.   :0.50836   Max.   :0.45443   Max.   :0.53831  
##                       NA's   :3         NA's   :3         NA's   :3        
##     ndvi_sw        precipitation_amt_mm reanalysis_air_temp_k
##  Min.   :0.06418   Min.   :  0.00       Min.   :294.6        
##  1st Qu.:0.20413   1st Qu.: 39.10       1st Qu.:297.1        
##  Median :0.26214   Median : 60.47       Median :297.8        
##  Mean   :0.26678   Mean   : 64.25       Mean   :297.9        
##  3rd Qu.:0.32515   3rd Qu.: 85.76       3rd Qu.:298.6        
##  Max.   :0.54602   Max.   :210.83       Max.   :301.6        
##  NA's   :3         NA's   :4            NA's   :4            
##  reanalysis_avg_temp_k reanalysis_dew_point_temp_k reanalysis_max_air_temp_k
##  Min.   :294.9         Min.   :290.1               Min.   :300.0            
##  1st Qu.:298.2         1st Qu.:294.6               1st Qu.:305.2            
##  Median :299.1         Median :295.9               Median :307.1            
##  Mean   :299.1         Mean   :295.5               Mean   :307.1            
##  3rd Qu.:300.1         3rd Qu.:296.5               3rd Qu.:308.7            
##  Max.   :302.9         Max.   :298.4               Max.   :314.0            
##  NA's   :4             NA's   :4                   NA's   :4                
##  reanalysis_min_air_temp_k reanalysis_precip_amt_kg_per_m2
##  Min.   :286.9             Min.   :  0.00                 
##  1st Qu.:292.0             1st Qu.: 24.07                 
##  Median :293.1             Median : 46.44                 
##  Mean   :292.9             Mean   : 57.61                 
##  3rd Qu.:294.2             3rd Qu.: 71.07                 
##  Max.   :296.0             Max.   :362.03                 
##  NA's   :4                 NA's   :4                      
##  reanalysis_relative_humidity_percent reanalysis_sat_precip_amt_mm
##  Min.   :57.79                        Min.   :  0.00              
##  1st Qu.:84.30                        1st Qu.: 39.10              
##  Median :90.92                        Median : 60.47              
##  Mean   :88.64                        Mean   : 64.25              
##  3rd Qu.:94.56                        3rd Qu.: 85.76              
##  Max.   :98.61                        Max.   :210.83              
##  NA's   :4                            NA's   :4                   
##  reanalysis_specific_humidity_g_per_kg reanalysis_tdtr_k station_avg_temp_c
##  Min.   :12.11                         Min.   : 3.714    Min.   :21.40     
##  1st Qu.:16.10                         1st Qu.: 7.371    1st Qu.:27.00     
##  Median :17.43                         Median : 8.964    Median :27.60     
##  Mean   :17.10                         Mean   : 9.207    Mean   :27.53     
##  3rd Qu.:18.18                         3rd Qu.:11.014    3rd Qu.:28.10     
##  Max.   :20.46                         Max.   :16.029    Max.   :30.80     
##  NA's   :4                             NA's   :4         NA's   :37        
##  station_diur_temp_rng_c station_max_temp_c station_min_temp_c
##  Min.   : 5.20           Min.   :30.1       Min.   :14.7      
##  1st Qu.: 9.50           1st Qu.:33.2       1st Qu.:20.6      
##  Median :10.62           Median :34.0       Median :21.3      
##  Mean   :10.57           Mean   :34.0       Mean   :21.2      
##  3rd Qu.:11.65           3rd Qu.:34.9       3rd Qu.:22.0      
##  Max.   :15.80           Max.   :42.2       Max.   :24.2      
##  NA's   :37              NA's   :14         NA's   :8         
##  station_precip_mm     Month       
##  Min.   :  0.00    Min.   : 1.000  
##  1st Qu.: 17.20    1st Qu.: 3.750  
##  Median : 45.30    Median : 6.500  
##  Mean   : 62.47    Mean   : 6.417  
##  3rd Qu.: 85.95    3rd Qu.: 9.000  
##  Max.   :543.30    Max.   :12.000  
##  NA's   :16

fig4 <- ggplot(data = train.sj, 
               aes(total_cases)) + 
              geom_histogram(fill = "orange", bins = 30) + 
              ggtitle("Histogram of Cases in San Juan")
fig4

fig5 <- ggplot(data = train.iq, 
               aes(total_cases)) + 
              geom_histogram(fill = "pink", bins = 30) + 
              ggtitle("Histogram of Cases in Iquitos")
fig5

And now create some time plots for each city. I can’t overlay them because they do happen at different times, which is why it doesn’t make much sense to even really do a single time series for the train data.

ts.sj <- ts(train.sj, start = c(1990, 4, 30), end = c(2004, 10, 28), frequency = 52)
ts.iq <- ts(train.iq, start = c(2005, 11, 11), end = c(2010, 6, 25), frequency = 52)
autoplot(ts.sj[,4], main = "Total Cases San Juan")

autoplot(ts.iq[,4], main = "Total Cases Iquitos")

Now that I have these set up as time series, I want to decompose both of these to get an idea of what the trend and seasonality may look like before building any models for them.

ts.sj[,4] %>% 
  decompose(type = c("additive")) %>% 
  autoplot() + 
  ggtitle("Classical Additive Decomposition of San Jose Train Data")

## Warning: attributes are not identical across measure variables;
## they will be dropped

## Warning: Removed 104 row(s) containing missing values (geom_path).

ts.iq[,4] %>% 
  decompose(type = c("additive")) %>%
  autoplot() + 
  ggtitle("Classical Additive Decomposition of Iquitos Train Data")

## Warning: attributes are not identical across measure variables;
## they will be dropped

## Warning: Removed 104 row(s) containing missing values (geom_path).

This doesn’t seem very clear to me, so when I build an ETS model, I will be interested to see what it chooses (additive vs. multiplicative). What this does make clear for me is that there is for sure a strong seasonality but not a strong trend in both cities’ data.

Also, while it’s important to know what the total cases were over time, it’s also important to understand the climate and weather throughout the years as well. I’ll now try to focus on visualizing climate variables before jumping in to modelling data. I’m going to focus on the NOAA’s NCEP Climate Forecast System Reanalysis measurements for my climate variables.

autoplot(ts.sj[,11], main = "Temperature in San Juan", series = "Mean Air Temp") + #average, max, and min air temp sj
  autolayer(ts.sj[,14], series = "Max Air Temp") + 
  autolayer(ts.sj[,15], series = "Min Air Temp")

autoplot(ts.iq[,11], main = "Temperature in Iquitos", series = "Mean Air Temp") + #average, max, and min air temp iq
  autolayer(ts.iq[,14], series = "Max Air Temp") + 
  autolayer(ts.iq[,15], series = "Min Air Temp")

Here I am comparing the differences in the temperatures between the two cities. I find it notable that the maximum and minimum are much closer to the average in San Juan than they are in Iquitos.

round(mean(ts.sj[,20]),2) #average diurnal temp range

## [1] NA

round(mean(ts.iq[,20]),2) #average diurnal temp range

## [1] NA

Additionally, San Juan has a large spike in max and large dip in min towards the end of the data set. I’m not sure wha the explanation for that is right now.

Next, I move on to looking at precipitation.

autoplot(ts.sj[,18], #total precipitation in mm
          main = "Total Precipitation in San Juan", 
          xlab = "Time", 
          ylab = "Precipitation (mm)")

autoplot(ts.iq[,18], #average precipitation in mm iq
         main = "Total Precipitation in Iquitos", 
         xlab = "Time", 
         ylab = "Precipitation (mm)")

The final variable I’m interested in is humidity.

autoplot(ts.sj[,17], #relative humidity sj
         main = "Relative Humidity in San Juan", 
         xlab = "Time", 
         ylab = "Relative Humidity (%)")

autoplot(ts.iq[,17], #relative humidity iq
         main = "Relative Humidity in Iquitos", 
         xlab = "Time", 
         ylab = "Relative Humidity (%)")

Modelling the Data

I’m realizing now, however, that before I start building any models, that I probably want to make sure that my test data is good to go so that I can also look at the accuracy of the forecasts going forward.

However, I’m not really sure how I messed up splitting the cities data sets in the beginning, but I’m having a hard time findind everything in one place. I’m going to try to just use the first training set to try to set myself straight. However, I’m realizing that it most likely have to do with the fact that I just omitted NA values, which is now messing with my dates. I’m not really sure of how to deal with this, either, so I’m going to see what I can do with the data that I have

#using window to separate original time series
sj.train <- window(ts.sj, end = c(1999,12))
sj.test <- window(ts.sj, start = c(1999,13)) 
autoplot(sj.train[,4]) + autolayer(sj.test[,4])

autoplot(ts.sj[,4])

iq.train <- window(ts.iq, end = c(2008,27))
iq.test <- window(ts.iq, start = c(2008,28))
autoplot(iq.train[,4]) + autolayer(iq.test[,4])

autoplot(ts.iq[,4])

It’s a little bit rough, but fingers crossed it will give me something to work with going forward.

Iquitos

ARIMA

Since the literature review just loved SARIMA, I’m going to start there with auto.arima and see what I get back

iq.mod1 <- auto.arima(iq.train[,4]) #auto arima
summary(iq.mod1)

## Series: iq.train[, 4] 
## ARIMA(0,1,2) 
## 
## Coefficients:
##           ma1     ma2
##       -0.5516  0.2021
## s.e.   0.0728  0.0720
## 
## sigma^2 estimated as 15.38:  log likelihood=-478.26
## AIC=962.51   AICc=962.66   BIC=971.96
## 
## Training set error measures:
##                      ME     RMSE      MAE MPE MAPE     MASE        ACF1
## Training set 0.01768816 3.887614 2.114454 NaN  Inf 0.226415 -0.01503691

fc_iqmod1 <- forecast(iq.mod1, h = 5)
autoplot(fc_iqmod1)

## Warning: `filter_()` is deprecated as of dplyr 0.7.0.
## Please use `filter()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

accuracy(fc_iqmod1,iq.test[1:5,4])

##                      ME      RMSE       MAE      MPE     MAPE      MASE
## Training set 0.01768816 3.8876143 2.1144544      NaN      Inf 0.8827334
## Test set     0.45847457 0.9584038 0.8546684 2.107743 41.72712 0.3568033
##                     ACF1
## Training set -0.01503691
## Test set              NA

My auto.arima did find some seasonality in the total cases in Iquitos, which agrees with the literature review. The model fits the training data slightly better than the test data, but we won’t really know what any of this means until we have other models to compare this to.

The auto.arima didn’t have any seasonal integration, so I want to try to add that as well.

iq.mod2 <- arima(iq.train[,4], order = c(2,0,2), seasonal = list(order = c(1,0,1))) #arima
fc_iqmod2 <- forecast(iq.mod2, h = 5)
autoplot(fc_iqmod2)

accuracy(fc_iqmod2, iq.test[1:5,4])

##                       ME      RMSE      MAE       MPE     MAPE      MASE
## Training set  0.04732523 3.8207655 2.238910      -Inf      Inf 0.9346905
## Test set     -0.17454568 0.9890804 0.887075 -29.89234 53.64332 0.3703323
##                       ACF1
## Training set -0.0003072499
## Test set                NA

This model fits thr training data a little bit better than the last model, but fits the testing data worse. I think between these two models, I would want to go with something that fit the test data better. So, I think that I would stick with the auto.arima model.

Neural Network

Let’s try some models that weren’t talked about in the literature review. In particular, I want to try neural networks and VAR models. I’m going to start with a neural network and see how that goes.

For some reason rmarkdown really doesn’t like this code, even though it is fully functional in my regular script. So, here is the code:

iq.mod3 <- nnetar(iq.train[,4]) #neural network fc_iqmod3 <- forecast(iq.mod3, h = 5) autoplot(fc_iqmod3) accuracy(fc_iqmod3, iq.test[1:5,4])

and here are the accuracy results:

                 ME     RMSE      MAE      MPE     MAPE      MASE        ACF1

Training set -0.03066757 5.061519 3.378104 -Inf Inf 0.6580225 0.006958621 Test set 5.93589429 8.310183 5.935894 37.79275 37.79275 1.1562557 NA

the neural network does the best so far at fitting the training data. However, it doesn’t do any better than the auto.arima model at fitting the test data, so the auto.arima model still sticks out as the best model.

VAR and Regression Analysis

Finally, I’m going to try doing a VAR model to see how adding climate variables improves (or not) the forecast of total cases. Before running a VAR model, I’m actually going to run a regression model first. I’m a little bit unsure if running a VAR model is even really necessary, since I’m not concern about how much the cases of dengue affects climate because, well, it won’t. So here’s a regression model real quick. It will be with average air temp [,12], total precipitation [,18], diurnal temp range [,20], and relative humidity [,17].

iq.mod4 <- tslm(iq.train[,4] ~ iq.train[,12] + iq.train[,17] + iq.train[,18] + iq.train[,20])
summary(iq.mod4)

## 
## Call:
## tslm(formula = iq.train[, 4] ~ iq.train[, 12] + iq.train[, 17] + 
##     iq.train[, 18] + iq.train[, 20])
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -7.490 -4.500 -2.651  2.694 30.480 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)   
## (Intercept)    -460.21258  156.48833  -2.941  0.00373 **
## iq.train[, 12]    1.53858    0.51916   2.964  0.00348 **
## iq.train[, 17]    0.14026    0.14954   0.938  0.34965   
## iq.train[, 18]   -0.02481    0.01968  -1.261  0.20921   
## iq.train[, 20]   -0.59963    0.47698  -1.257  0.21045   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.326 on 168 degrees of freedom
## Multiple R-squared:  0.0686, Adjusted R-squared:  0.04642 
## F-statistic: 3.093 on 4 and 168 DF,  p-value: 0.01729

The only two results that have any statistical significance are average air temperature and relative humidity. HOwever, the greatest magnitude effects were from average temperature and diurnal temperature range. Here I think we see clearly the difference between economic significance and statistical significance. I’m still not sure what I really want to do with this except maybe look at how these results compare to San Juan. However, since I have identified at least one variable that is statistically significant and economically significant, I will use the average temperature to build a VAR model.

iq.cases <- iq.train[,4]
iq.avg.temp <- iq.train[,12]
iq.for.VAR <- ts.intersect(iq.cases, iq.avg.temp, dframe = FALSE)

VARselect(iq.for.VAR, type = c("const"))

## $selection
## AIC(n)  HQ(n)  SC(n) FPE(n) 
##      2      2      2      2 
## 
## $criteria
##                1         2         3         4         5         6         7
## AIC(n)  3.282055  3.065013  3.076177  3.085667  3.125644  3.159825  3.174662
## HQ(n)   3.328289  3.142070  3.184057  3.224369  3.295170  3.360173  3.405833
## SC(n)   3.395935  3.254814  3.341898  3.427308  3.543206  3.653307  3.744064
## FPE(n) 26.630656 21.435571 21.677673 21.886969 22.783925 23.582487 23.943739
##                8         9        10
## AIC(n)  3.219592  3.262189  3.223388
## HQ(n)   3.481586  3.555006  3.547028
## SC(n)   3.864914  3.983432  4.020551
## FPE(n) 25.056032 26.162272 25.185499

I created a new time series just to make it easier on myself. I used VARselect to determine the number of lags that I want to use. Looks like I need to build a model with 3 lags as well as a model with 2 lags. Since there really isn’t a trend in this data, I decided that I would just have my type be constant.

iq.mod5 <- VAR(iq.for.VAR, p = 2, type = c("const")) #VAR model with lag 2
summary(iq.mod5)

## 
## VAR Estimation Results:
## ========================= 
## Endogenous variables: iq.cases, iq.avg.temp 
## Deterministic variables: const 
## Sample size: 171 
## Log Likelihood: -732.171 
## Roots of the characteristic polynomial:
## 0.9296 0.7614 0.4937 0.1942
## Call:
## VAR(y = iq.for.VAR, p = 2, type = c("const"))
## 
## 
## Estimation results for equation iq.cases: 
## ========================================= 
## iq.cases = iq.cases.l1 + iq.avg.temp.l1 + iq.cases.l2 + iq.avg.temp.l2 + const 
## 
##                 Estimate Std. Error t value Pr(>|t|)    
## iq.cases.l1      0.43039    0.06954   6.189 4.57e-09 ***
## iq.avg.temp.l1   0.22954    0.27188   0.844    0.400    
## iq.cases.l2      0.46789    0.06961   6.721 2.76e-10 ***
## iq.avg.temp.l2  -0.11090    0.27061  -0.410    0.682    
## const          -34.97444   66.42434  -0.527    0.599    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## Residual standard error: 3.922 on 166 degrees of freedom
## Multiple R-Squared: 0.735,   Adjusted R-squared: 0.7286 
## F-statistic: 115.1 on 4 and 166 DF,  p-value: < 2.2e-16 
## 
## 
## Estimation results for equation iq.avg.temp: 
## ============================================ 
## iq.avg.temp = iq.cases.l1 + iq.avg.temp.l1 + iq.cases.l2 + iq.avg.temp.l2 + const 
## 
##                Estimate Std. Error t value Pr(>|t|)    
## iq.cases.l1    -0.01905    0.01993  -0.956    0.340    
## iq.avg.temp.l1  0.57280    0.07791   7.352 8.46e-12 ***
## iq.cases.l2     0.01069    0.01995   0.536    0.593    
## iq.avg.temp.l2  0.14246    0.07754   1.837    0.068 .  
## const          85.18752   19.03405   4.476 1.41e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## Residual standard error: 1.124 on 166 degrees of freedom
## Multiple R-Squared: 0.4487,  Adjusted R-squared: 0.4354 
## F-statistic: 33.77 on 4 and 166 DF,  p-value: < 2.2e-16 
## 
## 
## 
## Covariance matrix of residuals:
##             iq.cases iq.avg.temp
## iq.cases     15.3823      0.6166
## iq.avg.temp   0.6166      1.2631
## 
## Correlation matrix of residuals:
##             iq.cases iq.avg.temp
## iq.cases      1.0000      0.1399
## iq.avg.temp   0.1399      1.0000

fc_iqmod5 <- forecast(iq.mod5, h = 5)
autoplot(fc_iqmod5)

accuracy(fc_iqmod5$forecast$iq.cases, iq.test[1:5,4])

##                         ME     RMSE       MAE       MPE     MAPE      MASE
## Training set  5.978297e-17 3.864257 2.3036659       NaN      Inf 0.9617246
## Test set     -5.004923e-01 1.096881 0.8154455 -48.17555 58.67399 0.3404287
##                     ACF1
## Training set -0.04739087
## Test set              NA

Doesn’t really perform a whole lot different than my auto.arima model. However, I still need to try again with a lag = 3

iq.mod6 <- VAR(iq.for.VAR, p = 3, type = c("const")) #VAR model with lag 3
summary(iq.mod6)

## 
## VAR Estimation Results:
## ========================= 
## Endogenous variables: iq.cases, iq.avg.temp 
## Deterministic variables: const 
## Sample size: 170 
## Log Likelihood: -725.329 
## Roots of the characteristic polynomial:
## 0.9372 0.8388 0.4218 0.4218 0.3131 0.3131
## Call:
## VAR(y = iq.for.VAR, p = 3, type = c("const"))
## 
## 
## Estimation results for equation iq.cases: 
## ========================================= 
## iq.cases = iq.cases.l1 + iq.avg.temp.l1 + iq.cases.l2 + iq.avg.temp.l2 + iq.cases.l3 + iq.avg.temp.l3 + const 
## 
##                 Estimate Std. Error t value Pr(>|t|)    
## iq.cases.l1      0.38399    0.07854   4.889 2.41e-06 ***
## iq.avg.temp.l1   0.20406    0.27527   0.741    0.460    
## iq.cases.l2      0.42327    0.07797   5.429 2.02e-07 ***
## iq.avg.temp.l2  -0.18902    0.31436  -0.601    0.548    
## iq.cases.l3      0.09932    0.07879   1.261    0.209    
## iq.avg.temp.l3   0.18785    0.27446   0.684    0.495    
## const          -60.19705   70.86262  -0.849    0.397    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## Residual standard error: 3.93 on 163 degrees of freedom
## Multiple R-Squared: 0.7381,  Adjusted R-squared: 0.7285 
## F-statistic: 76.58 on 6 and 163 DF,  p-value: < 2.2e-16 
## 
## 
## Estimation results for equation iq.avg.temp: 
## ============================================ 
## iq.avg.temp = iq.cases.l1 + iq.avg.temp.l1 + iq.cases.l2 + iq.avg.temp.l2 + iq.cases.l3 + iq.avg.temp.l3 + const 
## 
##                 Estimate Std. Error t value Pr(>|t|)    
## iq.cases.l1    -0.021799   0.022328  -0.976 0.330368    
## iq.avg.temp.l1  0.550733   0.078254   7.038 5.15e-11 ***
## iq.cases.l2     0.007957   0.022164   0.359 0.720075    
## iq.avg.temp.l2  0.060518   0.089368   0.677 0.499251    
## iq.cases.l3     0.004462   0.022399   0.199 0.842330    
## iq.avg.temp.l3  0.146484   0.078025   1.877 0.062250 .  
## const          72.499565  20.144902   3.599 0.000424 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## Residual standard error: 1.117 on 163 degrees of freedom
## Multiple R-Squared: 0.4614,  Adjusted R-squared: 0.4416 
## F-statistic: 23.27 on 6 and 163 DF,  p-value: < 2.2e-16 
## 
## 
## 
## Covariance matrix of residuals:
##             iq.cases iq.avg.temp
## iq.cases     15.4438      0.5734
## iq.avg.temp   0.5734      1.2481
## 
## Correlation matrix of residuals:
##             iq.cases iq.avg.temp
## iq.cases      1.0000      0.1306
## iq.avg.temp   0.1306      1.0000

fc_iqmod6 <- forecast(iq.mod6, h = 5)
autoplot(fc_iqmod6)

accuracy(fc_iqmod6$forecast$iq.cases, iq.test[1:5,4])

##                         ME     RMSE      MAE       MPE     MAPE      MASE
## Training set  1.743474e-16 3.848102 2.286984       NaN      Inf 0.9547601
## Test set     -4.631348e-01 1.150401 0.893939 -47.80478 62.16492 0.3731978
##                    ACF1
## Training set 0.01410457
## Test set             NA

I think that this may in fact have been the closest to modelling the train and test data that any of my models have actually come, which is very exciting.

I know that they’re already above, but I’m going to put all of my accuracy “tables” in one place right here:

accuracy(fc_iqmod1,iq.test[1:5,4]) #auto.arima

##                      ME      RMSE       MAE      MPE     MAPE      MASE
## Training set 0.01768816 3.8876143 2.1144544      NaN      Inf 0.8827334
## Test set     0.45847457 0.9584038 0.8546684 2.107743 41.72712 0.3568033
##                     ACF1
## Training set -0.01503691
## Test set              NA

accuracy(fc_iqmod2, iq.test[1:5,4]) #SARIMA

##                       ME      RMSE      MAE       MPE     MAPE      MASE
## Training set  0.04732523 3.8207655 2.238910      -Inf      Inf 0.9346905
## Test set     -0.17454568 0.9890804 0.887075 -29.89234 53.64332 0.3703323
##                       ACF1
## Training set -0.0003072499
## Test set                NA

accuracy(fc_iqmod5$forecast$iq.cases, iq.test[1:5,4]) #VAR lag 2

##                         ME     RMSE       MAE       MPE     MAPE      MASE
## Training set  5.978297e-17 3.864257 2.3036659       NaN      Inf 0.9617246
## Test set     -5.004923e-01 1.096881 0.8154455 -48.17555 58.67399 0.3404287
##                     ACF1
## Training set -0.04739087
## Test set              NA

accuracy(fc_iqmod6$forecast$iq.cases, iq.test[1:5,4]) #VAR lag 3

##                         ME     RMSE      MAE       MPE     MAPE      MASE
## Training set  1.743474e-16 3.848102 2.286984       NaN      Inf 0.9547601
## Test set     -4.631348e-01 1.150401 0.893939 -47.80478 62.16492 0.3731978
##                    ACF1
## Training set 0.01410457
## Test set             NA

                 ME     RMSE      MAE      MPE     MAPE      MASE        ACF1

Training set -0.03066757 5.061519 3.378104 -Inf Inf 0.6580225 0.006958621 Test set 5.93589429 8.310183 5.935894 37.79275 37.79275 1.1562557 NA

the last set being my neural net which wouldn’t go through markdown. I think that, looking at all of these metrics together, that the best fit model for Iquitos is, like the literature review said, an ARIMA model. My neural network does have the lowest RMSE for training data, but the auto.arima model has the lowest RMSE for the test set, and like it says in the instructions on datadriven, RMSE is the mettric by which we are judging.

San Juan

ARIMA

Following the same procedure now for San Juan. Starting with an ARIMA model

sj.mod1 <- auto.arima(sj.train[,4])
summary(sj.mod1)

## Series: sj.train[, 4] 
## ARIMA(2,0,1) with non-zero mean 
## 
## Coefficients:
##          ar1      ar2      ma1     mean
##       1.8413  -0.8584  -0.7471  46.7680
## s.e.  0.0569   0.0539   0.0793  10.7979
## 
## sigma^2 estimated as 262.7:  log likelihood=-2005.02
## AIC=4020.04   AICc=4020.16   BIC=4040.87
## 
## Training set error measures:
##                      ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 0.04574525 16.14103 9.884084 -15.96648 34.09738 0.2007904
##                    ACF1
## Training set 0.05126605

checkresiduals(sj.mod1)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,0,1) with non-zero mean
## Q* = 67.835, df = 91, p-value = 0.967
## 
## Model df: 4.   Total lags used: 95

fc_sjmod1 <- forecast(sj.mod1, h = 5)
autoplot(fc_sjmod1)

accuracy(fc_sjmod1, sj.test[1:5,4])

##                      ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 0.04574525 16.14103 9.884084 -15.96648 34.09738 0.9787443
## Test set     4.47025234  6.53345 4.792105  21.12432 23.30603 0.4745250
##                    ACF1
## Training set 0.05126605
## Test set             NA

Interestingly, auto.arima didn’t include any seasonality. This might be something that I want to try, especially given the support of the literature that I found. Interestingly enough, this fits the test data better than it fits the training data. Good sign? The other positive about this forecast is that the ARIMA model showed only one spike in the ACF out of the permissible range, and the residuals (while there were outliers) are centered around 0 and normally distributed.

I’m going to take another crack at an ARIMA model and this time add some seasonality to it.

sj.mod2 <- arima(sj.train[,4], order = c(2,1,2), seasonal = list(order = c(1,1,1)))
checkresiduals(sj.mod2)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,1,2)(1,1,1)[52]
## Q* = 91.256, df = 89, p-value = 0.4139
## 
## Model df: 6.   Total lags used: 95

fc_sjmod2 <- forecast(sj.mod2, h = 5)
autoplot(fc_sjmod2)

accuracy(fc_sjmod2, sj.test[1:5,4])

##                      ME      RMSE       MAE       MPE     MAPE     MASE
## Training set -0.1038928 16.314622 10.149330 -6.390530 32.23370 1.005010
## Test set     -1.7616603  7.823223  7.066139 -5.245001 38.78016 0.699705
##                    ACF1
## Training set 0.04332763
## Test set             NA

My RMSE is better in this model for the training data but worse for the test data. And again, but MASE is also better for the training data, but not for the testing data. For the training data, my ACF is better for this model than my other ARIMA model by auto.arima.

Neural Network

sj.mod3 <- nnetar(sj.train[,4]) #neural network 
fc_sjmod3 <- forecast(sj.mod3, h = 5)
accuracy(fc_sjmod3, sj.test[1:5,4])

##                       ME     RMSE      MAE       MPE     MAPE      MASE
## Training set -0.04771595 6.639574 4.894112 -9.539623 23.13390 0.4846260
## Test set     -0.46913342 3.536070 3.067352 -7.452337 17.86926 0.3037361
##                     ACF1
## Training set -0.02839626
## Test set              NA

Instead of comparing metrics as I go I’m going to wait until the end.

Regression and VAR

markdown didn’t like my code again, so I have the text here:

sj.mod4 <- tslm(sj.train[,4] ~ sj.train[,12] + sj.train[,17] + sj.train[,18] + sj.train[,20]) summary(sj.mod4)

Coefficients: Estimate Std. Error t value Pr(>|t|)
(Intercept) -2.332e+03 5.035e+02 -4.633 4.67e-06 sj.train[, 12] 7.725e+00 1.768e+00 4.370 1.53e-05 sj.train[, 17] 8.845e-01 7.077e-01 1.250 0.212
sj.train[, 18] -9.629e-03 4.996e-02 -0.193 0.847
sj.train[, 20] -4.932e+00 4.158e+00 -1.186 0.236

Interestingly, the same two variables are significant in San Juan as were in the Iquitos data set: temperature and relative humidity. In this case, however, it is relative humidity and total precipitation that have the greatest magnitude effects. So, as relative humidity is both statistically and economically significant, I will use this climate variable within my VAR model.

I’m having a lot of trouble with rmarkdown, and things that worked in the console do not work here. here it is pasted

sj.cases <- sj.train[,4] sj.rel.hum <- sj.train[,17] sj.for.VAR <- ts.intersect(sj.cases, sj.rel.hum, dframe = FALSE) sj.for.VAR <- (sj.for.VAR)

VARselect(sj.for.VAR, type = c(“const”))

Again, I created a di-variate time series to make it easier on myself. I chose to run my VARselect with constant since there wasn’t much of a trend when I was doing my EDA. So, I will now be building two different VAR models, one with lag 2 and one with lag 5.

Here is my lag = 2

sj.mod5 <- VAR(sj.for.VAR, p = 2, type = c(“const”)) # VAR lag 2 fc_sjmod5 <- forecast(sj.mod5, h = 5) accuracy(fc_sjmod5\(forecast\)sj.cases, sj.test[1:5,4])

and here is my lag = 5

sj.mod6 <- VAR(sj.for.VAR, p = 5, type = c(“const”)) #VAR = 5 fc_sjmod6 <- forecast(sj.mod6, h = 5) accuracy(fc_sjmod6\(forecast\)sj.cases, sj.test[1:5,4])

and let’s rope them together and check it out!

accuracy(fc_sjmod1, sj.test[1:5,4]) #auto.arima

##                      ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 0.04574525 16.14103 9.884084 -15.96648 34.09738 0.9787443
## Test set     4.47025234  6.53345 4.792105  21.12432 23.30603 0.4745250
##                    ACF1
## Training set 0.05126605
## Test set             NA

accuracy(fc_sjmod2, sj.test[1:5,4]) #arima

##                      ME      RMSE       MAE       MPE     MAPE     MASE
## Training set -0.1038928 16.314622 10.149330 -6.390530 32.23370 1.005010
## Test set     -1.7616603  7.823223  7.066139 -5.245001 38.78016 0.699705
##                    ACF1
## Training set 0.04332763
## Test set             NA

accuracy(fc_sjmod3, sj.test[1:5,4]) #neural network

##                       ME     RMSE      MAE       MPE     MAPE      MASE
## Training set -0.04771595 6.639574 4.894112 -9.539623 23.13390 0.4846260
## Test set     -0.46913342 3.536070 3.067352 -7.452337 17.86926 0.3037361
##                     ACF1
## Training set -0.02839626
## Test set              NA

accuracy(fc_sjmod5\(forecast\)sj.cases, sj.test[1:5,4]) #lag 2 accuracy(fc_sjmod6\(forecast\)sj.cases, sj.test[1:5,4]) #lag 5

                    ME      RMSE      MAE       MPE     MAPE      MASE         ACF1

Training set -1.136746e-16 13.076325 8.555603 -Inf Inf 0.9925584 -0.008135356 Test set -5.838463e+00 7.499002 6.884578 -347.0961 362.0406 0.7986983 NA

                    ME     RMSE      MAE      MPE     MAPE      MASE        ACF1

Training set -1.911844e-16 12.83867 8.520497 -Inf Inf 0.9884856 0.007418085 Test set -6.424397e+00 7.79104 6.869190 -364.298 370.6522 0.7969131 NA

RMSE is smallest for the training and testing data in my neural network. I find this to be interesting, given that for Iquitos and the literature review, ARIMA performed this best. .

Final Process

Andrea Bert

8/12/2020

DengAI: Predicting Disease Spread

Predictive Analytics Final

Describing the Data: What’ve we Got to Work With?

Modelling the Data

Iquitos

ARIMA

Neural Network

VAR and Regression Analysis

San Juan

ARIMA

Neural Network

Regression and VAR