_R Packages__

library(readr)
library(ggplot2)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(haven)
library(tseries)
library(urca)
library(fUnitRoots)
## Loading required package: timeDate
## Loading required package: timeSeries
## 
## Attaching package: 'timeSeries'
## The following object is masked from 'package:zoo':
## 
##     time<-
## Loading required package: fBasics
## 
## Attaching package: 'fUnitRoots'
## The following objects are masked from 'package:urca':
## 
##     punitroot, qunitroot, unitrootTable
library(reshape)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following objects are masked from 'package:reshape':
## 
##     colsplit, melt, recast
library(quantmod)
## Loading required package: xts
## Loading required package: TTR
## 
## Attaching package: 'TTR'
## The following object is masked from 'package:fBasics':
## 
##     volatility
## Version 0.4-0 included new data defaults. See ?getSymbols.
library(plyr)
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:reshape':
## 
##     rename, round_any
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
## 
##     col_factor
library(fpp2)
## Loading required package: fma
## 
## Attaching package: 'fma'
## The following object is masked from 'package:plyr':
## 
##     ozone
## Loading required package: expsmooth
library(hts)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:reshape':
## 
##     stamp
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## The following objects are masked from 'package:xts':
## 
##     first, last
## The following objects are masked from 'package:reshape2':
## 
##     dcast, melt
## The following object is masked from 'package:reshape':
## 
##     melt
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## 
## Attaching package: 'GGally'
## The following object is masked from 'package:fma':
## 
##     pigs

Importing datasets

##The features for the training dataset
train <- read_csv("/Users/dorothymensah/Desktop/Training_Data_Features.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   city = col_character(),
##   week_start_date = col_date(format = "")
## )
## See spec(...) for full column specifications.
# The features for the testing dataset
test <- read_csv("/Users/dorothymensah/Desktop/Test_Data_Features.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   city = col_character(),
##   week_start_date = col_date(format = "")
## )
## See spec(...) for full column specifications.
#The number of dengue cases for each row in the training dataset.

label <- read_csv("/Users/dorothymensah/Desktop/Training_Data_Labels.csv")
## Parsed with column specification:
## cols(
##   city = col_character(),
##   year = col_double(),
##   weekofyear = col_double(),
##   total_cases = col_double()
## )
##Competition submission format 
submission  <- read_csv("/Users/dorothymensah/Desktop//Submission_Format.csv")
## Parsed with column specification:
## cols(
##   city = col_character(),
##   year = col_double(),
##   weekofyear = col_double(),
##   total_cases = col_double()
## )
head(train)
## # A tibble: 6 x 24
##   city   year weekofyear week_start_date ndvi_ne ndvi_nw ndvi_se ndvi_sw
##   <chr> <dbl>      <dbl> <date>            <dbl>   <dbl>   <dbl>   <dbl>
## 1 sj     1990         18 1990-04-30       0.123    0.104   0.198   0.178
## 2 sj     1990         19 1990-05-07       0.170    0.142   0.162   0.155
## 3 sj     1990         20 1990-05-14       0.0322   0.173   0.157   0.171
## 4 sj     1990         21 1990-05-21       0.129    0.245   0.228   0.236
## 5 sj     1990         22 1990-05-28       0.196    0.262   0.251   0.247
## 6 sj     1990         23 1990-06-04      NA        0.175   0.254   0.182
## # … with 16 more variables: precipitation_amt_mm <dbl>,
## #   reanalysis_air_temp_k <dbl>, reanalysis_avg_temp_k <dbl>,
## #   reanalysis_dew_point_temp_k <dbl>, reanalysis_max_air_temp_k <dbl>,
## #   reanalysis_min_air_temp_k <dbl>, reanalysis_precip_amt_kg_per_m2 <dbl>,
## #   reanalysis_relative_humidity_percent <dbl>,
## #   reanalysis_sat_precip_amt_mm <dbl>,
## #   reanalysis_specific_humidity_g_per_kg <dbl>, reanalysis_tdtr_k <dbl>,
## #   station_avg_temp_c <dbl>, station_diur_temp_rng_c <dbl>,
## #   station_max_temp_c <dbl>, station_min_temp_c <dbl>, station_precip_mm <dbl>
head(label)
## # A tibble: 6 x 4
##   city   year weekofyear total_cases
##   <chr> <dbl>      <dbl>       <dbl>
## 1 sj     1990         18           4
## 2 sj     1990         19           5
## 3 sj     1990         20           4
## 4 sj     1990         21           3
## 5 sj     1990         22           6
## 6 sj     1990         23           2
##Merging the label data set with the training dataset

train1 <- merge(label,train, by=c("city", "year","weekofyear"),all.x =T)
nrow(train1) == nrow(label) #This is true 
## [1] TRUE
#Before splitting the train1 data set by city, i am going to do an overall EDA of the data set by variable 

#Plotting total cases by time 
plot(total_cases ~ year,data=train1, col="darkolivegreen4",main="Cases of Dengue Fever per Year", xlab="Year", ylab = "Total Cases of Dengue Fever")

plot(total_cases~weekofyear, data = train1, col="darkred", main="Cases of Dengue Fever per Week of the Year", xlab="Week of the Year", ylab="Total Cases of Dengue Fever")

plot(total_cases ~ week_start_date, data = train1, col="coral",main="Cases of Dengue Fever per Week Start Date",xlab="Week Start Date", ylab="Total Cases of Dengue Fever")

#Plotting Total cases per Variable
plot(total_cases~reanalysis_avg_temp_k, data = train1, col="blue1", main="Cases of Dengue Fevr per Average Temperature", xlab="Average Temperature", ylab = "Total Cases of Dengue Fever")

plot(total_cases~reanalysis_dew_point_temp_k, data = train1, col="cyan4", main="Cases of Dengue Fever per Average Dewpoint Temperature", xlab="Average Dewpoint Temperature", ylab="Total Cases of Dengue Fever")

plot(total_cases~reanalysis_specific_humidity_g_per_kg, data = train1, col="chocolate", main="Cases of Dengue Fever per Specific Humidity", xlab="Specific Humidity(g per kg)", ylab = "Total Cases of Dengue Fever")

plot(total_cases~reanalysis_max_air_temp_k, data=train1,col="cornsilk4", main="Cases of Dengue Fever per Maximum Air Temperature", xlab="Maximum Air Temperature",ylab = "Total Cases of Dengue Fever")

EDA VISUALIZATION PER CITY

##looking at cases per month in the two cities (Iquitos and San Juan)
##Transforming start_of_week variable form a character to date format,then split into months 

train1$week_start_date <- as.Date(train1$week_start_date, "%m/%d/%Y")
train1$Month <- format(train1$week_start_date,"%m")
train1$Month <- as.numeric(train1$Month)

##Creating graphics
#by week 
g1<- ggplot(data = train1, aes(x=weekofyear, y=total_cases, fill = city)) + 
  geom_bar(stat = "identity") + 
  facet_wrap(.~city) + 
  ggtitle("Total Cases of Dengue per Week in Iquitos and San Juan")
g1

##As we can see there are higher cases of Dengue Fever per week in San Juan than in Iquitos. However, there is a decrease around 20 weeks then it steadily increases 


#By month
g2 <- ggplot(data = train1, aes(x=Month, y=total_cases, fill=city)) + geom_bar(stat = "identity") + facet_wrap(.~city)+ggtitle("Total Cases of Dengue per Month in Iquitos and San Juan")
g2

##As we can see there are higher cases of Dengue Fever per month in San Juan than in Iquitos 


#By year 

g3<- ggplot(data = train1, aes(x=year, y=total_cases, fill=city)) + geom_bar(stat = "identity") + facet_wrap(.~city)+ggtitle("Total Cases of Dengue per year in Iquitos and San Juan")
g3

##There seems to be no record of dengue fever in Iquitos till the the early 2000's. From these specific fraphics, i think it might be best to split the training set by city

Seperate the two cities

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:xts':
## 
##     first, last
## The following object is masked from 'package:reshape':
## 
##     rename
## The following objects are masked from 'package:timeSeries':
## 
##     filter, lag
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#San Juan 
sj<- train1 %>% filter(city=="sj")
str(sj)
## 'data.frame':    936 obs. of  26 variables:
##  $ city                                 : chr  "sj" "sj" "sj" "sj" ...
##  $ year                                 : num  1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
##  $ weekofyear                           : num  18 19 20 21 22 23 24 25 26 27 ...
##  $ total_cases                          : num  4 5 4 3 6 2 4 5 10 6 ...
##  $ week_start_date                      : Date, format: "1990-04-30" "1990-05-07" ...
##  $ ndvi_ne                              : num  0.1226 0.1699 0.0323 0.1286 0.1962 ...
##  $ ndvi_nw                              : num  0.104 0.142 0.173 0.245 0.262 ...
##  $ ndvi_se                              : num  0.198 0.162 0.157 0.228 0.251 ...
##  $ ndvi_sw                              : num  0.178 0.155 0.171 0.236 0.247 ...
##  $ precipitation_amt_mm                 : num  12.42 22.82 34.54 15.36 7.52 ...
##  $ reanalysis_air_temp_k                : num  298 298 299 299 300 ...
##  $ reanalysis_avg_temp_k                : num  298 298 299 299 300 ...
##  $ reanalysis_dew_point_temp_k          : num  292 294 295 295 296 ...
##  $ reanalysis_max_air_temp_k            : num  300 301 300 301 302 ...
##  $ reanalysis_min_air_temp_k            : num  296 296 297 297 298 ...
##  $ reanalysis_precip_amt_kg_per_m2      : num  32 17.9 26.1 13.9 12.2 ...
##  $ reanalysis_relative_humidity_percent : num  73.4 77.4 82.1 80.3 80.5 ...
##  $ reanalysis_sat_precip_amt_mm         : num  12.42 22.82 34.54 15.36 7.52 ...
##  $ reanalysis_specific_humidity_g_per_kg: num  14 15.4 16.8 16.7 17.2 ...
##  $ reanalysis_tdtr_k                    : num  2.63 2.37 2.3 2.43 3.01 ...
##  $ station_avg_temp_c                   : num  25.4 26.7 26.7 27.5 28.9 ...
##  $ station_diur_temp_rng_c              : num  6.9 6.37 6.49 6.77 9.37 ...
##  $ station_max_temp_c                   : num  29.4 31.7 32.2 33.3 35 34.4 32.2 33.9 33.9 33.9 ...
##  $ station_min_temp_c                   : num  20 22.2 22.8 23.3 23.9 23.9 23.3 22.8 22.8 24.4 ...
##  $ station_precip_mm                    : num  16 8.6 41.4 4 5.8 39.1 29.7 21.1 21.1 1.1 ...
##  $ Month                                : num  4 5 5 5 5 6 6 6 6 7 ...
summary(sj)
##      city                year        weekofyear     total_cases    
##  Length:936         Min.   :1990   Min.   : 1.00   Min.   :  0.00  
##  Class :character   1st Qu.:1994   1st Qu.:13.75   1st Qu.:  9.00  
##  Mode  :character   Median :1999   Median :26.50   Median : 19.00  
##                     Mean   :1999   Mean   :26.50   Mean   : 34.18  
##                     3rd Qu.:2003   3rd Qu.:39.25   3rd Qu.: 37.00  
##                     Max.   :2008   Max.   :53.00   Max.   :461.00  
##                                                                    
##  week_start_date         ndvi_ne            ndvi_nw            ndvi_se        
##  Min.   :1990-04-30   Min.   :-0.40625   Min.   :-0.45610   Min.   :-0.01553  
##  1st Qu.:1994-10-27   1st Qu.: 0.00450   1st Qu.: 0.01642   1st Qu.: 0.13928  
##  Median :1999-04-26   Median : 0.05770   Median : 0.06808   Median : 0.17719  
##  Mean   :1999-04-26   Mean   : 0.05792   Mean   : 0.06747   Mean   : 0.17766  
##  3rd Qu.:2003-10-23   3rd Qu.: 0.11110   3rd Qu.: 0.11520   3rd Qu.: 0.21256  
##  Max.   :2008-04-22   Max.   : 0.49340   Max.   : 0.43710   Max.   : 0.39313  
##                       NA's   :191        NA's   :49         NA's   :19        
##     ndvi_sw         precipitation_amt_mm reanalysis_air_temp_k
##  Min.   :-0.06346   Min.   :  0.00       Min.   :295.9        
##  1st Qu.: 0.12916   1st Qu.:  0.00       1st Qu.:298.2        
##  Median : 0.16597   Median : 20.80       Median :299.3        
##  Mean   : 0.16596   Mean   : 35.47       Mean   :299.2        
##  3rd Qu.: 0.20277   3rd Qu.: 52.18       3rd Qu.:300.1        
##  Max.   : 0.38142   Max.   :390.60       Max.   :302.2        
##  NA's   :19         NA's   :9            NA's   :6            
##  reanalysis_avg_temp_k reanalysis_dew_point_temp_k reanalysis_max_air_temp_k
##  Min.   :296.1         Min.   :289.6               Min.   :297.8            
##  1st Qu.:298.3         1st Qu.:293.8               1st Qu.:300.4            
##  Median :299.4         Median :295.5               Median :301.5            
##  Mean   :299.3         Mean   :295.1               Mean   :301.4            
##  3rd Qu.:300.2         3rd Qu.:296.4               3rd Qu.:302.4            
##  Max.   :302.2         Max.   :297.8               Max.   :304.3            
##  NA's   :6             NA's   :6                   NA's   :6                
##  reanalysis_min_air_temp_k reanalysis_precip_amt_kg_per_m2
##  Min.   :292.6             Min.   :  0.00                 
##  1st Qu.:296.3             1st Qu.: 10.82                 
##  Median :297.5             Median : 21.30                 
##  Mean   :297.3             Mean   : 30.47                 
##  3rd Qu.:298.4             3rd Qu.: 37.00                 
##  Max.   :299.9             Max.   :570.50                 
##  NA's   :6                 NA's   :6                      
##  reanalysis_relative_humidity_percent reanalysis_sat_precip_amt_mm
##  Min.   :66.74                        Min.   :  0.00              
##  1st Qu.:76.25                        1st Qu.:  0.00              
##  Median :78.67                        Median : 20.80              
##  Mean   :78.57                        Mean   : 35.47              
##  3rd Qu.:80.96                        3rd Qu.: 52.18              
##  Max.   :87.58                        Max.   :390.60              
##  NA's   :6                            NA's   :9                   
##  reanalysis_specific_humidity_g_per_kg reanalysis_tdtr_k station_avg_temp_c
##  Min.   :11.72                         Min.   :1.357     Min.   :22.84     
##  1st Qu.:15.24                         1st Qu.:2.157     1st Qu.:25.84     
##  Median :16.85                         Median :2.457     Median :27.23     
##  Mean   :16.55                         Mean   :2.516     Mean   :27.01     
##  3rd Qu.:17.86                         3rd Qu.:2.800     3rd Qu.:28.19     
##  Max.   :19.44                         Max.   :4.429     Max.   :30.07     
##  NA's   :6                             NA's   :6         NA's   :6         
##  station_diur_temp_rng_c station_max_temp_c station_min_temp_c
##  Min.   :4.529           Min.   :26.70      Min.   :17.8      
##  1st Qu.:6.200           1st Qu.:30.60      1st Qu.:21.7      
##  Median :6.757           Median :31.70      Median :22.8      
##  Mean   :6.757           Mean   :31.61      Mean   :22.6      
##  3rd Qu.:7.286           3rd Qu.:32.80      3rd Qu.:23.9      
##  Max.   :9.914           Max.   :35.60      Max.   :25.6      
##  NA's   :6               NA's   :6          NA's   :6         
##  station_precip_mm     Month       
##  Min.   :  0.000   Min.   : 1.000  
##  1st Qu.:  6.825   1st Qu.: 3.750  
##  Median : 17.750   Median : 6.500  
##  Mean   : 26.785   Mean   : 6.419  
##  3rd Qu.: 35.450   3rd Qu.: 9.000  
##  Max.   :305.900   Max.   :12.000  
##  NA's   :6
#Correlation plots 

ggcorr(sj[, 5:24], label = TRUE, hjust = 0.85, size = 2,color = "azure3",
        label_size = 2) +
   ggplot2::labs(title = "Correlation Plot (San Juan)")
## Warning in ggcorr(sj[, 5:24], label = TRUE, hjust = 0.85, size = 2, color =
## "azure3", : data in column(s) 'week_start_date' are not numeric and were ignored

plot(total_cases ~ station_max_temp_c, data=sj
      ,main = "Scatter Plot of Total Cases and Maximum Temperature in San Juan")

 plot(total_cases ~ reanalysis_specific_humidity_g_per_kg, data=sj,
      main = "Scatter Plot of Total Cases and Humidity in San Juan")

  # Omit ndvi_ variables because of the large amount of missing data and 0 correllation
# sj.train[,grep("^ndvi", colnames(sj.train)) := NULL]
# build a function to impute data with the most recent values
 
na.locf.data.frame <- 
function(object, ...) replace(object, TRUE, lapply(object, na.locf, ...))


#fill in NAs
sj.im <- na.locf.data.frame(sj)
  summary(sj.im)
##      city                year        weekofyear     total_cases    
##  Length:936         Min.   :1990   Min.   : 1.00   Min.   :  0.00  
##  Class :character   1st Qu.:1994   1st Qu.:13.75   1st Qu.:  9.00  
##  Mode  :character   Median :1999   Median :26.50   Median : 19.00  
##                     Mean   :1999   Mean   :26.50   Mean   : 34.18  
##                     3rd Qu.:2003   3rd Qu.:39.25   3rd Qu.: 37.00  
##                     Max.   :2008   Max.   :53.00   Max.   :461.00  
##  week_start_date         ndvi_ne             ndvi_nw            ndvi_se        
##  Min.   :1990-04-30   Min.   :-0.406250   Min.   :-0.45610   Min.   :-0.01553  
##  1st Qu.:1994-10-27   1st Qu.: 0.005146   1st Qu.: 0.01414   1st Qu.: 0.13936  
##  Median :1999-04-26   Median : 0.058775   Median : 0.06699   Median : 0.17670  
##  Mean   :1999-04-26   Mean   : 0.058174   Mean   : 0.06528   Mean   : 0.17755  
##  3rd Qu.:2003-10-23   3rd Qu.: 0.111325   3rd Qu.: 0.11490   3rd Qu.: 0.21256  
##  Max.   :2008-04-22   Max.   : 0.493400   Max.   : 0.43710   Max.   : 0.39313  
##     ndvi_sw         precipitation_amt_mm reanalysis_air_temp_k
##  Min.   :-0.06346   Min.   :  0.00       Min.   :295.9        
##  1st Qu.: 0.12978   1st Qu.:  0.00       1st Qu.:298.2        
##  Median : 0.16776   Median : 20.45       Median :299.2        
##  Mean   : 0.16657   Mean   : 35.21       Mean   :299.2        
##  3rd Qu.: 0.20284   3rd Qu.: 51.66       3rd Qu.:300.1        
##  Max.   : 0.38142   Max.   :390.60       Max.   :302.2        
##  reanalysis_avg_temp_k reanalysis_dew_point_temp_k reanalysis_max_air_temp_k
##  Min.   :296.1         Min.   :289.6               Min.   :297.8            
##  1st Qu.:298.3         1st Qu.:293.8               1st Qu.:300.4            
##  Median :299.4         Median :295.4               Median :301.5            
##  Mean   :299.3         Mean   :295.1               Mean   :301.4            
##  3rd Qu.:300.2         3rd Qu.:296.4               3rd Qu.:302.4            
##  Max.   :302.2         Max.   :297.8               Max.   :304.3            
##  reanalysis_min_air_temp_k reanalysis_precip_amt_kg_per_m2
##  Min.   :292.6             Min.   :  0.00                 
##  1st Qu.:296.3             1st Qu.: 10.88                 
##  Median :297.5             Median : 21.28                 
##  Mean   :297.3             Mean   : 30.39                 
##  3rd Qu.:298.4             3rd Qu.: 36.92                 
##  Max.   :299.9             Max.   :570.50                 
##  reanalysis_relative_humidity_percent reanalysis_sat_precip_amt_mm
##  Min.   :66.74                        Min.   :  0.00              
##  1st Qu.:76.24                        1st Qu.:  0.00              
##  Median :78.64                        Median : 20.45              
##  Mean   :78.56                        Mean   : 35.21              
##  3rd Qu.:80.95                        3rd Qu.: 51.66              
##  Max.   :87.58                        Max.   :390.60              
##  reanalysis_specific_humidity_g_per_kg reanalysis_tdtr_k station_avg_temp_c
##  Min.   :11.72                         Min.   :1.357     Min.   :22.84     
##  1st Qu.:15.22                         1st Qu.:2.157     1st Qu.:25.81     
##  Median :16.83                         Median :2.457     Median :27.21     
##  Mean   :16.54                         Mean   :2.515     Mean   :26.99     
##  3rd Qu.:17.85                         3rd Qu.:2.789     3rd Qu.:28.18     
##  Max.   :19.44                         Max.   :4.429     Max.   :30.07     
##  station_diur_temp_rng_c station_max_temp_c station_min_temp_c
##  Min.   :4.529           Min.   :26.70      Min.   :17.80     
##  1st Qu.:6.200           1st Qu.:30.60      1st Qu.:21.70     
##  Median :6.757           Median :31.70      Median :22.80     
##  Mean   :6.754           Mean   :31.59      Mean   :22.59     
##  3rd Qu.:7.286           3rd Qu.:32.80      3rd Qu.:23.90     
##  Max.   :9.914           Max.   :35.60      Max.   :25.60     
##  station_precip_mm     Month       
##  Min.   :  0.00    Min.   : 1.000  
##  1st Qu.:  6.90    1st Qu.: 3.750  
##  Median : 17.80    Median : 6.500  
##  Mean   : 26.77    Mean   : 6.419  
##  3rd Qu.: 35.35    3rd Qu.: 9.000  
##  Max.   :305.90    Max.   :12.000
 #Iquitos 
iq<- train1 %>% filter(city=="iq")
str(iq)
## 'data.frame':    520 obs. of  26 variables:
##  $ city                                 : chr  "iq" "iq" "iq" "iq" ...
##  $ year                                 : num  2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 ...
##  $ weekofyear                           : num  26 27 28 29 30 31 32 33 34 35 ...
##  $ total_cases                          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ week_start_date                      : Date, format: "2000-07-01" "2000-07-08" ...
##  $ ndvi_ne                              : num  0.193 0.217 0.177 0.228 0.329 ...
##  $ ndvi_nw                              : num  0.132 0.276 0.173 0.145 0.322 ...
##  $ ndvi_se                              : num  0.341 0.289 0.204 0.254 0.254 ...
##  $ ndvi_sw                              : num  0.247 0.242 0.128 0.2 0.361 ...
##  $ precipitation_amt_mm                 : num  25.4 60.6 55.5 5.6 62.8 ...
##  $ reanalysis_air_temp_k                : num  297 297 296 295 296 ...
##  $ reanalysis_avg_temp_k                : num  298 298 297 296 298 ...
##  $ reanalysis_dew_point_temp_k          : num  295 295 296 293 294 ...
##  $ reanalysis_max_air_temp_k            : num  307 307 304 304 307 ...
##  $ reanalysis_min_air_temp_k            : num  293 291 293 289 292 ...
##  $ reanalysis_precip_amt_kg_per_m2      : num  43.2 46 64.8 24 31.8 ...
##  $ reanalysis_relative_humidity_percent : num  92.4 93.6 95.8 87.2 88.2 ...
##  $ reanalysis_sat_precip_amt_mm         : num  25.4 60.6 55.5 5.6 62.8 ...
##  $ reanalysis_specific_humidity_g_per_kg: num  16.7 16.9 17.1 14.4 15.4 ...
##  $ reanalysis_tdtr_k                    : num  8.93 10.31 7.39 9.11 9.5 ...
##  $ station_avg_temp_c                   : num  26.4 26.9 26.8 25.8 26.6 ...
##  $ station_diur_temp_rng_c              : num  10.8 11.6 11.5 10.5 11.5 ...
##  $ station_max_temp_c                   : num  32.5 34 33 31.5 33.3 32 34 33 34 34 ...
##  $ station_min_temp_c                   : num  20.7 20.8 20.7 14.7 19.1 17 19.9 20.5 19 20 ...
##  $ station_precip_mm                    : num  3 55.6 38.1 30 4 11.5 72.9 50.1 89.2 78 ...
##  $ Month                                : num  7 7 7 7 7 8 8 8 8 9 ...
summary(iq)
##      city                year        weekofyear     total_cases     
##  Length:520         Min.   :2000   Min.   : 1.00   Min.   :  0.000  
##  Class :character   1st Qu.:2003   1st Qu.:13.75   1st Qu.:  1.000  
##  Mode  :character   Median :2005   Median :26.50   Median :  5.000  
##                     Mean   :2005   Mean   :26.50   Mean   :  7.565  
##                     3rd Qu.:2007   3rd Qu.:39.25   3rd Qu.:  9.000  
##                     Max.   :2010   Max.   :53.00   Max.   :116.000  
##                                                                     
##  week_start_date         ndvi_ne           ndvi_nw           ndvi_se       
##  Min.   :2000-07-01   Min.   :0.06173   Min.   :0.03586   Min.   :0.02988  
##  1st Qu.:2002-12-30   1st Qu.:0.20000   1st Qu.:0.17954   1st Qu.:0.19474  
##  Median :2005-06-28   Median :0.26364   Median :0.23297   Median :0.24980  
##  Mean   :2005-06-28   Mean   :0.26387   Mean   :0.23878   Mean   :0.25013  
##  3rd Qu.:2007-12-26   3rd Qu.:0.31997   3rd Qu.:0.29393   3rd Qu.:0.30230  
##  Max.   :2010-06-25   Max.   :0.50836   Max.   :0.45443   Max.   :0.53831  
##                       NA's   :3         NA's   :3         NA's   :3        
##     ndvi_sw        precipitation_amt_mm reanalysis_air_temp_k
##  Min.   :0.06418   Min.   :  0.00       Min.   :294.6        
##  1st Qu.:0.20413   1st Qu.: 39.10       1st Qu.:297.1        
##  Median :0.26214   Median : 60.47       Median :297.8        
##  Mean   :0.26678   Mean   : 64.25       Mean   :297.9        
##  3rd Qu.:0.32515   3rd Qu.: 85.76       3rd Qu.:298.6        
##  Max.   :0.54602   Max.   :210.83       Max.   :301.6        
##  NA's   :3         NA's   :4            NA's   :4            
##  reanalysis_avg_temp_k reanalysis_dew_point_temp_k reanalysis_max_air_temp_k
##  Min.   :294.9         Min.   :290.1               Min.   :300.0            
##  1st Qu.:298.2         1st Qu.:294.6               1st Qu.:305.2            
##  Median :299.1         Median :295.9               Median :307.1            
##  Mean   :299.1         Mean   :295.5               Mean   :307.1            
##  3rd Qu.:300.1         3rd Qu.:296.5               3rd Qu.:308.7            
##  Max.   :302.9         Max.   :298.4               Max.   :314.0            
##  NA's   :4             NA's   :4                   NA's   :4                
##  reanalysis_min_air_temp_k reanalysis_precip_amt_kg_per_m2
##  Min.   :286.9             Min.   :  0.00                 
##  1st Qu.:292.0             1st Qu.: 24.07                 
##  Median :293.1             Median : 46.44                 
##  Mean   :292.9             Mean   : 57.61                 
##  3rd Qu.:294.2             3rd Qu.: 71.07                 
##  Max.   :296.0             Max.   :362.03                 
##  NA's   :4                 NA's   :4                      
##  reanalysis_relative_humidity_percent reanalysis_sat_precip_amt_mm
##  Min.   :57.79                        Min.   :  0.00              
##  1st Qu.:84.30                        1st Qu.: 39.10              
##  Median :90.92                        Median : 60.47              
##  Mean   :88.64                        Mean   : 64.25              
##  3rd Qu.:94.56                        3rd Qu.: 85.76              
##  Max.   :98.61                        Max.   :210.83              
##  NA's   :4                            NA's   :4                   
##  reanalysis_specific_humidity_g_per_kg reanalysis_tdtr_k station_avg_temp_c
##  Min.   :12.11                         Min.   : 3.714    Min.   :21.40     
##  1st Qu.:16.10                         1st Qu.: 7.371    1st Qu.:27.00     
##  Median :17.43                         Median : 8.964    Median :27.60     
##  Mean   :17.10                         Mean   : 9.207    Mean   :27.53     
##  3rd Qu.:18.18                         3rd Qu.:11.014    3rd Qu.:28.10     
##  Max.   :20.46                         Max.   :16.029    Max.   :30.80     
##  NA's   :4                             NA's   :4         NA's   :37        
##  station_diur_temp_rng_c station_max_temp_c station_min_temp_c
##  Min.   : 5.20           Min.   :30.1       Min.   :14.7      
##  1st Qu.: 9.50           1st Qu.:33.2       1st Qu.:20.6      
##  Median :10.62           Median :34.0       Median :21.3      
##  Mean   :10.57           Mean   :34.0       Mean   :21.2      
##  3rd Qu.:11.65           3rd Qu.:34.9       3rd Qu.:22.0      
##  Max.   :15.80           Max.   :42.2       Max.   :24.2      
##  NA's   :37              NA's   :14         NA's   :8         
##  station_precip_mm     Month       
##  Min.   :  0.00    Min.   : 1.000  
##  1st Qu.: 17.20    1st Qu.: 3.750  
##  Median : 45.30    Median : 6.500  
##  Mean   : 62.47    Mean   : 6.417  
##  3rd Qu.: 85.95    3rd Qu.: 9.000  
##  Max.   :543.30    Max.   :12.000  
##  NA's   :16
##Correlation plot 
 ggcorr(iq[, 5:24], label = TRUE, hjust = 0.85, size = 2,color = "azure3",
        label_size = 2) +
   ggplot2::labs(title = "Correlation Plot (Iquitos)")
## Warning in ggcorr(iq[, 5:24], label = TRUE, hjust = 0.85, size = 2, color =
## "azure3", : data in column(s) 'week_start_date' are not numeric and were ignored

plot(total_cases ~ station_max_temp_c, data=iq
      ,main = "Scatter Plot of Total Cases and Maximum Temperature in Iquitos")

 plot(total_cases ~ reanalysis_specific_humidity_g_per_kg, data=iq,
      main = "Scatter Plot of Total Cases and Humidity in Iquitos")

#fill in NAs
iq.im <- na.locf.data.frame(iq)
  summary(iq.im)
##      city                year        weekofyear     total_cases     
##  Length:520         Min.   :2000   Min.   : 1.00   Min.   :  0.000  
##  Class :character   1st Qu.:2003   1st Qu.:13.75   1st Qu.:  1.000  
##  Mode  :character   Median :2005   Median :26.50   Median :  5.000  
##                     Mean   :2005   Mean   :26.50   Mean   :  7.565  
##                     3rd Qu.:2007   3rd Qu.:39.25   3rd Qu.:  9.000  
##                     Max.   :2010   Max.   :53.00   Max.   :116.000  
##  week_start_date         ndvi_ne           ndvi_nw           ndvi_se       
##  Min.   :2000-07-01   Min.   :0.06173   Min.   :0.03586   Min.   :0.02988  
##  1st Qu.:2002-12-30   1st Qu.:0.19951   1st Qu.:0.17947   1st Qu.:0.19461  
##  Median :2005-06-28   Median :0.26353   Median :0.23285   Median :0.24985  
##  Mean   :2005-06-28   Mean   :0.26360   Mean   :0.23844   Mean   :0.25006  
##  3rd Qu.:2007-12-26   3rd Qu.:0.31962   3rd Qu.:0.29383   3rd Qu.:0.30255  
##  Max.   :2010-06-25   Max.   :0.50836   Max.   :0.45443   Max.   :0.53831  
##     ndvi_sw        precipitation_amt_mm reanalysis_air_temp_k
##  Min.   :0.06418   Min.   :  0.00       Min.   :294.6        
##  1st Qu.:0.20284   1st Qu.: 39.15       1st Qu.:297.1        
##  Median :0.26187   Median : 60.49       Median :297.8        
##  Mean   :0.26644   Mean   : 64.58       Mean   :297.9        
##  3rd Qu.:0.32522   3rd Qu.: 86.22       3rd Qu.:298.7        
##  Max.   :0.54602   Max.   :210.83       Max.   :301.6        
##  reanalysis_avg_temp_k reanalysis_dew_point_temp_k reanalysis_max_air_temp_k
##  Min.   :294.9         Min.   :290.1               Min.   :300.0            
##  1st Qu.:298.2         1st Qu.:294.6               1st Qu.:305.2            
##  Median :299.1         Median :295.9               Median :307.0            
##  Mean   :299.1         Mean   :295.5               Mean   :307.1            
##  3rd Qu.:300.1         3rd Qu.:296.6               3rd Qu.:308.7            
##  Max.   :302.9         Max.   :298.4               Max.   :314.0            
##  reanalysis_min_air_temp_k reanalysis_precip_amt_kg_per_m2
##  Min.   :286.9             Min.   :  0.00                 
##  1st Qu.:292.0             1st Qu.: 24.21                 
##  Median :293.1             Median : 46.68                 
##  Mean   :292.9             Mean   : 57.81                 
##  3rd Qu.:294.2             3rd Qu.: 71.90                 
##  Max.   :296.0             Max.   :362.03                 
##  reanalysis_relative_humidity_percent reanalysis_sat_precip_amt_mm
##  Min.   :57.79                        Min.   :  0.00              
##  1st Qu.:84.39                        1st Qu.: 39.15              
##  Median :90.92                        Median : 60.49              
##  Mean   :88.67                        Mean   : 64.58              
##  3rd Qu.:94.59                        3rd Qu.: 86.22              
##  Max.   :98.61                        Max.   :210.83              
##  reanalysis_specific_humidity_g_per_kg reanalysis_tdtr_k station_avg_temp_c
##  Min.   :12.11                         Min.   : 3.714    Min.   :21.40     
##  1st Qu.:16.12                         1st Qu.: 7.371    1st Qu.:26.96     
##  Median :17.44                         Median : 8.957    Median :27.60     
##  Mean   :17.11                         Mean   : 9.192    Mean   :27.52     
##  3rd Qu.:18.18                         3rd Qu.:11.004    3rd Qu.:28.10     
##  Max.   :20.46                         Max.   :16.029    Max.   :30.80     
##  station_diur_temp_rng_c station_max_temp_c station_min_temp_c
##  Min.   : 5.20           Min.   :30.10      Min.   :14.70     
##  1st Qu.: 9.40           1st Qu.:33.20      1st Qu.:20.60     
##  Median :10.52           Median :34.00      Median :21.40     
##  Mean   :10.49           Mean   :33.97      Mean   :21.21     
##  3rd Qu.:11.64           3rd Qu.:34.92      3rd Qu.:22.00     
##  Max.   :15.80           Max.   :42.20      Max.   :24.20     
##  station_precip_mm     Month       
##  Min.   :  0.00    Min.   : 1.000  
##  1st Qu.: 17.00    1st Qu.: 3.750  
##  Median : 45.10    Median : 6.500  
##  Mean   : 62.01    Mean   : 6.417  
##  3rd Qu.: 85.67    3rd Qu.: 9.000  
##  Max.   :543.30    Max.   :12.000

time series

sj.ts <-ts(sj, start = c(1990, 4, 30), end = c(2004, 10, 28), frequency = 52)
## Warning in data.matrix(data): NAs introduced by coercion
autoplot(sj.ts[,4], main = "Total Cases in San Juan ")

#IQ
iq.ts <-ts(iq, start = c(2005, 11, 11), end = c(2010, 6, 25), frequency = 52)
## Warning in data.matrix(data): NAs introduced by coercion
autoplot(iq.ts[,4], main = "Total Cases in Iquitos")

Train and Test sets

#Creating a test and train set from time series above before moving into modelling

#San Juan 
sj.train <- window(sj.ts, end = c(1999,12))
sj.test <- window(sj.ts, start = c(1999,13)) 
##Plot of total cases in the train and test sets
autoplot(sj.train[,4]) + autolayer(sj.test[,4]) 

#Iquitos
iq.train <- window(iq.ts, end = c(2008,27))
iq.test <- window(iq.ts, start = c(2008,28))



##Plot of total cases in the train and test sets
autoplot(iq.train[,4]) + autolayer(iq.test[,4])

Data Modeling

##MOdel1 -SARIMA 
#san Juan 
sj.m1 <- auto.arima(sj.train[,4])
summary(sj.m1)
## Series: sj.train[, 4] 
## ARIMA(2,0,2)(0,0,1)[52] with non-zero mean 
## 
## Coefficients:
##          ar1      ar2      ma1     ma2     sma1     mean
##       1.8194  -0.8408  -1.3424  0.4940  -0.0673  47.3764
## s.e.  0.0676   0.0633   0.0692  0.0468   0.0434   9.7257
## 
## sigma^2 estimated as 1043:  log likelihood=-2332.45
## AIC=4678.91   AICc=4679.15   BIC=4708.08
## 
## Training set error measures:
##                      ME     RMSE      MAE       MPE    MAPE      MASE
## Training set 0.06277537 32.09904 16.10851 -42.57498 61.7512 0.3092194
##                    ACF1
## Training set 0.02667101
##Forecast and plot 
sj.m1.fcast <- forecast(sj.m1,h=5)
autoplot(sj.m1.fcast)

# Model 2:Arima 
sj.m2 <- arima(sj.train[,4], order = c(2,0,2), seasonal = list(order = c(1,0,1)))
sj.m2.fcast <- forecast(sj.m2,h=5)
autoplot(sj.m2.fcast)

##IQUITOS

#SARIMA 
iq.m1 <- auto.arima(iq.train[,4])
summary(iq.m1)
## Series: iq.train[, 4] 
## ARIMA(2,1,0) 
## 
## Coefficients:
##           ar1      ar2
##       -0.8857  -0.2839
## s.e.   0.0728   0.0726
## 
## sigma^2 estimated as 12.98:  log likelihood=-463.88
## AIC=933.77   AICc=933.91   BIC=943.21
## 
## Training set error measures:
##                      ME     RMSE      MAE MPE MAPE      MASE       ACF1
## Training set 0.02160312 3.570811 2.104061 NaN  Inf 0.2157554 0.01704513
##Forecast and plot 
iq.m1.fcast <- forecast(iq.m1,h=5)

#Model2:Arima 
iq.m2 <- arima(iq.train[,4], order = c(2,0,2), seasonal = list(order = c(1,0,1)))
iq.m2.fcast <- forecast(iq.m2,h=5)
autoplot(iq.m2.fcast)

##Accuracy
accuracy(sj.m1.fcast)
##                      ME     RMSE      MAE       MPE    MAPE      MASE
## Training set 0.06277537 32.09904 16.10851 -42.57498 61.7512 0.3092194
##                    ACF1
## Training set 0.02667101
accuracy(sj.m2.fcast)
##                      ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 0.07202379 32.09194 16.10033 -42.46995 61.67487 0.3090624
##                    ACF1
## Training set 0.02673673
accuracy(iq.m1.fcast)
##                      ME     RMSE      MAE MPE MAPE      MASE       ACF1
## Training set 0.02160312 3.570811 2.104061 NaN  Inf 0.2157554 0.01704513
accuracy(iq.m2.fcast)
##                     ME    RMSE      MAE MPE MAPE      MASE         ACF1
## Training set 0.0623992 3.52628 2.150196 NaN  Inf 0.2204862 -0.008728715

For some unknown reason, I am unable to run any Neural Network, ETS, or Holt winter models in rmarkdown so i am going to have to use these two ```