_R Packages__
library(readr)
library(ggplot2)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(haven)
library(tseries)
library(urca)
library(fUnitRoots)
## Loading required package: timeDate
## Loading required package: timeSeries
##
## Attaching package: 'timeSeries'
## The following object is masked from 'package:zoo':
##
## time<-
## Loading required package: fBasics
##
## Attaching package: 'fUnitRoots'
## The following objects are masked from 'package:urca':
##
## punitroot, qunitroot, unitrootTable
library(reshape)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following objects are masked from 'package:reshape':
##
## colsplit, melt, recast
library(quantmod)
## Loading required package: xts
## Loading required package: TTR
##
## Attaching package: 'TTR'
## The following object is masked from 'package:fBasics':
##
## volatility
## Version 0.4-0 included new data defaults. See ?getSymbols.
library(plyr)
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:reshape':
##
## rename, round_any
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
library(fpp2)
## Loading required package: fma
##
## Attaching package: 'fma'
## The following object is masked from 'package:plyr':
##
## ozone
## Loading required package: expsmooth
library(hts)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:reshape':
##
## stamp
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following objects are masked from 'package:xts':
##
## first, last
## The following objects are masked from 'package:reshape2':
##
## dcast, melt
## The following object is masked from 'package:reshape':
##
## melt
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
##
## Attaching package: 'GGally'
## The following object is masked from 'package:fma':
##
## pigs
Importing datasets
##The features for the training dataset
train <- read_csv("/Users/dorothymensah/Desktop/Training_Data_Features.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## city = col_character(),
## week_start_date = col_date(format = "")
## )
## See spec(...) for full column specifications.
# The features for the testing dataset
test <- read_csv("/Users/dorothymensah/Desktop/Test_Data_Features.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## city = col_character(),
## week_start_date = col_date(format = "")
## )
## See spec(...) for full column specifications.
#The number of dengue cases for each row in the training dataset.
label <- read_csv("/Users/dorothymensah/Desktop/Training_Data_Labels.csv")
## Parsed with column specification:
## cols(
## city = col_character(),
## year = col_double(),
## weekofyear = col_double(),
## total_cases = col_double()
## )
##Competition submission format
submission <- read_csv("/Users/dorothymensah/Desktop//Submission_Format.csv")
## Parsed with column specification:
## cols(
## city = col_character(),
## year = col_double(),
## weekofyear = col_double(),
## total_cases = col_double()
## )
head(train)
## # A tibble: 6 x 24
## city year weekofyear week_start_date ndvi_ne ndvi_nw ndvi_se ndvi_sw
## <chr> <dbl> <dbl> <date> <dbl> <dbl> <dbl> <dbl>
## 1 sj 1990 18 1990-04-30 0.123 0.104 0.198 0.178
## 2 sj 1990 19 1990-05-07 0.170 0.142 0.162 0.155
## 3 sj 1990 20 1990-05-14 0.0322 0.173 0.157 0.171
## 4 sj 1990 21 1990-05-21 0.129 0.245 0.228 0.236
## 5 sj 1990 22 1990-05-28 0.196 0.262 0.251 0.247
## 6 sj 1990 23 1990-06-04 NA 0.175 0.254 0.182
## # … with 16 more variables: precipitation_amt_mm <dbl>,
## # reanalysis_air_temp_k <dbl>, reanalysis_avg_temp_k <dbl>,
## # reanalysis_dew_point_temp_k <dbl>, reanalysis_max_air_temp_k <dbl>,
## # reanalysis_min_air_temp_k <dbl>, reanalysis_precip_amt_kg_per_m2 <dbl>,
## # reanalysis_relative_humidity_percent <dbl>,
## # reanalysis_sat_precip_amt_mm <dbl>,
## # reanalysis_specific_humidity_g_per_kg <dbl>, reanalysis_tdtr_k <dbl>,
## # station_avg_temp_c <dbl>, station_diur_temp_rng_c <dbl>,
## # station_max_temp_c <dbl>, station_min_temp_c <dbl>, station_precip_mm <dbl>
head(label)
## # A tibble: 6 x 4
## city year weekofyear total_cases
## <chr> <dbl> <dbl> <dbl>
## 1 sj 1990 18 4
## 2 sj 1990 19 5
## 3 sj 1990 20 4
## 4 sj 1990 21 3
## 5 sj 1990 22 6
## 6 sj 1990 23 2
##Merging the label data set with the training dataset
train1 <- merge(label,train, by=c("city", "year","weekofyear"),all.x =T)
nrow(train1) == nrow(label) #This is true
## [1] TRUE
#Before splitting the train1 data set by city, i am going to do an overall EDA of the data set by variable
#Plotting total cases by time
plot(total_cases ~ year,data=train1, col="darkolivegreen4",main="Cases of Dengue Fever per Year", xlab="Year", ylab = "Total Cases of Dengue Fever")
plot(total_cases~weekofyear, data = train1, col="darkred", main="Cases of Dengue Fever per Week of the Year", xlab="Week of the Year", ylab="Total Cases of Dengue Fever")
plot(total_cases ~ week_start_date, data = train1, col="coral",main="Cases of Dengue Fever per Week Start Date",xlab="Week Start Date", ylab="Total Cases of Dengue Fever")
#Plotting Total cases per Variable
plot(total_cases~reanalysis_avg_temp_k, data = train1, col="blue1", main="Cases of Dengue Fevr per Average Temperature", xlab="Average Temperature", ylab = "Total Cases of Dengue Fever")
plot(total_cases~reanalysis_dew_point_temp_k, data = train1, col="cyan4", main="Cases of Dengue Fever per Average Dewpoint Temperature", xlab="Average Dewpoint Temperature", ylab="Total Cases of Dengue Fever")
plot(total_cases~reanalysis_specific_humidity_g_per_kg, data = train1, col="chocolate", main="Cases of Dengue Fever per Specific Humidity", xlab="Specific Humidity(g per kg)", ylab = "Total Cases of Dengue Fever")
plot(total_cases~reanalysis_max_air_temp_k, data=train1,col="cornsilk4", main="Cases of Dengue Fever per Maximum Air Temperature", xlab="Maximum Air Temperature",ylab = "Total Cases of Dengue Fever")
EDA VISUALIZATION PER CITY
##looking at cases per month in the two cities (Iquitos and San Juan)
##Transforming start_of_week variable form a character to date format,then split into months
train1$week_start_date <- as.Date(train1$week_start_date, "%m/%d/%Y")
train1$Month <- format(train1$week_start_date,"%m")
train1$Month <- as.numeric(train1$Month)
##Creating graphics
#by week
g1<- ggplot(data = train1, aes(x=weekofyear, y=total_cases, fill = city)) +
geom_bar(stat = "identity") +
facet_wrap(.~city) +
ggtitle("Total Cases of Dengue per Week in Iquitos and San Juan")
g1
##As we can see there are higher cases of Dengue Fever per week in San Juan than in Iquitos. However, there is a decrease around 20 weeks then it steadily increases
#By month
g2 <- ggplot(data = train1, aes(x=Month, y=total_cases, fill=city)) + geom_bar(stat = "identity") + facet_wrap(.~city)+ggtitle("Total Cases of Dengue per Month in Iquitos and San Juan")
g2
##As we can see there are higher cases of Dengue Fever per month in San Juan than in Iquitos
#By year
g3<- ggplot(data = train1, aes(x=year, y=total_cases, fill=city)) + geom_bar(stat = "identity") + facet_wrap(.~city)+ggtitle("Total Cases of Dengue per year in Iquitos and San Juan")
g3
##There seems to be no record of dengue fever in Iquitos till the the early 2000's. From these specific fraphics, i think it might be best to split the training set by city
Seperate the two cities
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:xts':
##
## first, last
## The following object is masked from 'package:reshape':
##
## rename
## The following objects are masked from 'package:timeSeries':
##
## filter, lag
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#San Juan
sj<- train1 %>% filter(city=="sj")
str(sj)
## 'data.frame': 936 obs. of 26 variables:
## $ city : chr "sj" "sj" "sj" "sj" ...
## $ year : num 1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
## $ weekofyear : num 18 19 20 21 22 23 24 25 26 27 ...
## $ total_cases : num 4 5 4 3 6 2 4 5 10 6 ...
## $ week_start_date : Date, format: "1990-04-30" "1990-05-07" ...
## $ ndvi_ne : num 0.1226 0.1699 0.0323 0.1286 0.1962 ...
## $ ndvi_nw : num 0.104 0.142 0.173 0.245 0.262 ...
## $ ndvi_se : num 0.198 0.162 0.157 0.228 0.251 ...
## $ ndvi_sw : num 0.178 0.155 0.171 0.236 0.247 ...
## $ precipitation_amt_mm : num 12.42 22.82 34.54 15.36 7.52 ...
## $ reanalysis_air_temp_k : num 298 298 299 299 300 ...
## $ reanalysis_avg_temp_k : num 298 298 299 299 300 ...
## $ reanalysis_dew_point_temp_k : num 292 294 295 295 296 ...
## $ reanalysis_max_air_temp_k : num 300 301 300 301 302 ...
## $ reanalysis_min_air_temp_k : num 296 296 297 297 298 ...
## $ reanalysis_precip_amt_kg_per_m2 : num 32 17.9 26.1 13.9 12.2 ...
## $ reanalysis_relative_humidity_percent : num 73.4 77.4 82.1 80.3 80.5 ...
## $ reanalysis_sat_precip_amt_mm : num 12.42 22.82 34.54 15.36 7.52 ...
## $ reanalysis_specific_humidity_g_per_kg: num 14 15.4 16.8 16.7 17.2 ...
## $ reanalysis_tdtr_k : num 2.63 2.37 2.3 2.43 3.01 ...
## $ station_avg_temp_c : num 25.4 26.7 26.7 27.5 28.9 ...
## $ station_diur_temp_rng_c : num 6.9 6.37 6.49 6.77 9.37 ...
## $ station_max_temp_c : num 29.4 31.7 32.2 33.3 35 34.4 32.2 33.9 33.9 33.9 ...
## $ station_min_temp_c : num 20 22.2 22.8 23.3 23.9 23.9 23.3 22.8 22.8 24.4 ...
## $ station_precip_mm : num 16 8.6 41.4 4 5.8 39.1 29.7 21.1 21.1 1.1 ...
## $ Month : num 4 5 5 5 5 6 6 6 6 7 ...
summary(sj)
## city year weekofyear total_cases
## Length:936 Min. :1990 Min. : 1.00 Min. : 0.00
## Class :character 1st Qu.:1994 1st Qu.:13.75 1st Qu.: 9.00
## Mode :character Median :1999 Median :26.50 Median : 19.00
## Mean :1999 Mean :26.50 Mean : 34.18
## 3rd Qu.:2003 3rd Qu.:39.25 3rd Qu.: 37.00
## Max. :2008 Max. :53.00 Max. :461.00
##
## week_start_date ndvi_ne ndvi_nw ndvi_se
## Min. :1990-04-30 Min. :-0.40625 Min. :-0.45610 Min. :-0.01553
## 1st Qu.:1994-10-27 1st Qu.: 0.00450 1st Qu.: 0.01642 1st Qu.: 0.13928
## Median :1999-04-26 Median : 0.05770 Median : 0.06808 Median : 0.17719
## Mean :1999-04-26 Mean : 0.05792 Mean : 0.06747 Mean : 0.17766
## 3rd Qu.:2003-10-23 3rd Qu.: 0.11110 3rd Qu.: 0.11520 3rd Qu.: 0.21256
## Max. :2008-04-22 Max. : 0.49340 Max. : 0.43710 Max. : 0.39313
## NA's :191 NA's :49 NA's :19
## ndvi_sw precipitation_amt_mm reanalysis_air_temp_k
## Min. :-0.06346 Min. : 0.00 Min. :295.9
## 1st Qu.: 0.12916 1st Qu.: 0.00 1st Qu.:298.2
## Median : 0.16597 Median : 20.80 Median :299.3
## Mean : 0.16596 Mean : 35.47 Mean :299.2
## 3rd Qu.: 0.20277 3rd Qu.: 52.18 3rd Qu.:300.1
## Max. : 0.38142 Max. :390.60 Max. :302.2
## NA's :19 NA's :9 NA's :6
## reanalysis_avg_temp_k reanalysis_dew_point_temp_k reanalysis_max_air_temp_k
## Min. :296.1 Min. :289.6 Min. :297.8
## 1st Qu.:298.3 1st Qu.:293.8 1st Qu.:300.4
## Median :299.4 Median :295.5 Median :301.5
## Mean :299.3 Mean :295.1 Mean :301.4
## 3rd Qu.:300.2 3rd Qu.:296.4 3rd Qu.:302.4
## Max. :302.2 Max. :297.8 Max. :304.3
## NA's :6 NA's :6 NA's :6
## reanalysis_min_air_temp_k reanalysis_precip_amt_kg_per_m2
## Min. :292.6 Min. : 0.00
## 1st Qu.:296.3 1st Qu.: 10.82
## Median :297.5 Median : 21.30
## Mean :297.3 Mean : 30.47
## 3rd Qu.:298.4 3rd Qu.: 37.00
## Max. :299.9 Max. :570.50
## NA's :6 NA's :6
## reanalysis_relative_humidity_percent reanalysis_sat_precip_amt_mm
## Min. :66.74 Min. : 0.00
## 1st Qu.:76.25 1st Qu.: 0.00
## Median :78.67 Median : 20.80
## Mean :78.57 Mean : 35.47
## 3rd Qu.:80.96 3rd Qu.: 52.18
## Max. :87.58 Max. :390.60
## NA's :6 NA's :9
## reanalysis_specific_humidity_g_per_kg reanalysis_tdtr_k station_avg_temp_c
## Min. :11.72 Min. :1.357 Min. :22.84
## 1st Qu.:15.24 1st Qu.:2.157 1st Qu.:25.84
## Median :16.85 Median :2.457 Median :27.23
## Mean :16.55 Mean :2.516 Mean :27.01
## 3rd Qu.:17.86 3rd Qu.:2.800 3rd Qu.:28.19
## Max. :19.44 Max. :4.429 Max. :30.07
## NA's :6 NA's :6 NA's :6
## station_diur_temp_rng_c station_max_temp_c station_min_temp_c
## Min. :4.529 Min. :26.70 Min. :17.8
## 1st Qu.:6.200 1st Qu.:30.60 1st Qu.:21.7
## Median :6.757 Median :31.70 Median :22.8
## Mean :6.757 Mean :31.61 Mean :22.6
## 3rd Qu.:7.286 3rd Qu.:32.80 3rd Qu.:23.9
## Max. :9.914 Max. :35.60 Max. :25.6
## NA's :6 NA's :6 NA's :6
## station_precip_mm Month
## Min. : 0.000 Min. : 1.000
## 1st Qu.: 6.825 1st Qu.: 3.750
## Median : 17.750 Median : 6.500
## Mean : 26.785 Mean : 6.419
## 3rd Qu.: 35.450 3rd Qu.: 9.000
## Max. :305.900 Max. :12.000
## NA's :6
#Correlation plots
ggcorr(sj[, 5:24], label = TRUE, hjust = 0.85, size = 2,color = "azure3",
label_size = 2) +
ggplot2::labs(title = "Correlation Plot (San Juan)")
## Warning in ggcorr(sj[, 5:24], label = TRUE, hjust = 0.85, size = 2, color =
## "azure3", : data in column(s) 'week_start_date' are not numeric and were ignored
plot(total_cases ~ station_max_temp_c, data=sj
,main = "Scatter Plot of Total Cases and Maximum Temperature in San Juan")
plot(total_cases ~ reanalysis_specific_humidity_g_per_kg, data=sj,
main = "Scatter Plot of Total Cases and Humidity in San Juan")
# Omit ndvi_ variables because of the large amount of missing data and 0 correllation
# sj.train[,grep("^ndvi", colnames(sj.train)) := NULL]
# build a function to impute data with the most recent values
na.locf.data.frame <-
function(object, ...) replace(object, TRUE, lapply(object, na.locf, ...))
#fill in NAs
sj.im <- na.locf.data.frame(sj)
summary(sj.im)
## city year weekofyear total_cases
## Length:936 Min. :1990 Min. : 1.00 Min. : 0.00
## Class :character 1st Qu.:1994 1st Qu.:13.75 1st Qu.: 9.00
## Mode :character Median :1999 Median :26.50 Median : 19.00
## Mean :1999 Mean :26.50 Mean : 34.18
## 3rd Qu.:2003 3rd Qu.:39.25 3rd Qu.: 37.00
## Max. :2008 Max. :53.00 Max. :461.00
## week_start_date ndvi_ne ndvi_nw ndvi_se
## Min. :1990-04-30 Min. :-0.406250 Min. :-0.45610 Min. :-0.01553
## 1st Qu.:1994-10-27 1st Qu.: 0.005146 1st Qu.: 0.01414 1st Qu.: 0.13936
## Median :1999-04-26 Median : 0.058775 Median : 0.06699 Median : 0.17670
## Mean :1999-04-26 Mean : 0.058174 Mean : 0.06528 Mean : 0.17755
## 3rd Qu.:2003-10-23 3rd Qu.: 0.111325 3rd Qu.: 0.11490 3rd Qu.: 0.21256
## Max. :2008-04-22 Max. : 0.493400 Max. : 0.43710 Max. : 0.39313
## ndvi_sw precipitation_amt_mm reanalysis_air_temp_k
## Min. :-0.06346 Min. : 0.00 Min. :295.9
## 1st Qu.: 0.12978 1st Qu.: 0.00 1st Qu.:298.2
## Median : 0.16776 Median : 20.45 Median :299.2
## Mean : 0.16657 Mean : 35.21 Mean :299.2
## 3rd Qu.: 0.20284 3rd Qu.: 51.66 3rd Qu.:300.1
## Max. : 0.38142 Max. :390.60 Max. :302.2
## reanalysis_avg_temp_k reanalysis_dew_point_temp_k reanalysis_max_air_temp_k
## Min. :296.1 Min. :289.6 Min. :297.8
## 1st Qu.:298.3 1st Qu.:293.8 1st Qu.:300.4
## Median :299.4 Median :295.4 Median :301.5
## Mean :299.3 Mean :295.1 Mean :301.4
## 3rd Qu.:300.2 3rd Qu.:296.4 3rd Qu.:302.4
## Max. :302.2 Max. :297.8 Max. :304.3
## reanalysis_min_air_temp_k reanalysis_precip_amt_kg_per_m2
## Min. :292.6 Min. : 0.00
## 1st Qu.:296.3 1st Qu.: 10.88
## Median :297.5 Median : 21.28
## Mean :297.3 Mean : 30.39
## 3rd Qu.:298.4 3rd Qu.: 36.92
## Max. :299.9 Max. :570.50
## reanalysis_relative_humidity_percent reanalysis_sat_precip_amt_mm
## Min. :66.74 Min. : 0.00
## 1st Qu.:76.24 1st Qu.: 0.00
## Median :78.64 Median : 20.45
## Mean :78.56 Mean : 35.21
## 3rd Qu.:80.95 3rd Qu.: 51.66
## Max. :87.58 Max. :390.60
## reanalysis_specific_humidity_g_per_kg reanalysis_tdtr_k station_avg_temp_c
## Min. :11.72 Min. :1.357 Min. :22.84
## 1st Qu.:15.22 1st Qu.:2.157 1st Qu.:25.81
## Median :16.83 Median :2.457 Median :27.21
## Mean :16.54 Mean :2.515 Mean :26.99
## 3rd Qu.:17.85 3rd Qu.:2.789 3rd Qu.:28.18
## Max. :19.44 Max. :4.429 Max. :30.07
## station_diur_temp_rng_c station_max_temp_c station_min_temp_c
## Min. :4.529 Min. :26.70 Min. :17.80
## 1st Qu.:6.200 1st Qu.:30.60 1st Qu.:21.70
## Median :6.757 Median :31.70 Median :22.80
## Mean :6.754 Mean :31.59 Mean :22.59
## 3rd Qu.:7.286 3rd Qu.:32.80 3rd Qu.:23.90
## Max. :9.914 Max. :35.60 Max. :25.60
## station_precip_mm Month
## Min. : 0.00 Min. : 1.000
## 1st Qu.: 6.90 1st Qu.: 3.750
## Median : 17.80 Median : 6.500
## Mean : 26.77 Mean : 6.419
## 3rd Qu.: 35.35 3rd Qu.: 9.000
## Max. :305.90 Max. :12.000
#Iquitos
iq<- train1 %>% filter(city=="iq")
str(iq)
## 'data.frame': 520 obs. of 26 variables:
## $ city : chr "iq" "iq" "iq" "iq" ...
## $ year : num 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 ...
## $ weekofyear : num 26 27 28 29 30 31 32 33 34 35 ...
## $ total_cases : num 0 0 0 0 0 0 0 0 0 0 ...
## $ week_start_date : Date, format: "2000-07-01" "2000-07-08" ...
## $ ndvi_ne : num 0.193 0.217 0.177 0.228 0.329 ...
## $ ndvi_nw : num 0.132 0.276 0.173 0.145 0.322 ...
## $ ndvi_se : num 0.341 0.289 0.204 0.254 0.254 ...
## $ ndvi_sw : num 0.247 0.242 0.128 0.2 0.361 ...
## $ precipitation_amt_mm : num 25.4 60.6 55.5 5.6 62.8 ...
## $ reanalysis_air_temp_k : num 297 297 296 295 296 ...
## $ reanalysis_avg_temp_k : num 298 298 297 296 298 ...
## $ reanalysis_dew_point_temp_k : num 295 295 296 293 294 ...
## $ reanalysis_max_air_temp_k : num 307 307 304 304 307 ...
## $ reanalysis_min_air_temp_k : num 293 291 293 289 292 ...
## $ reanalysis_precip_amt_kg_per_m2 : num 43.2 46 64.8 24 31.8 ...
## $ reanalysis_relative_humidity_percent : num 92.4 93.6 95.8 87.2 88.2 ...
## $ reanalysis_sat_precip_amt_mm : num 25.4 60.6 55.5 5.6 62.8 ...
## $ reanalysis_specific_humidity_g_per_kg: num 16.7 16.9 17.1 14.4 15.4 ...
## $ reanalysis_tdtr_k : num 8.93 10.31 7.39 9.11 9.5 ...
## $ station_avg_temp_c : num 26.4 26.9 26.8 25.8 26.6 ...
## $ station_diur_temp_rng_c : num 10.8 11.6 11.5 10.5 11.5 ...
## $ station_max_temp_c : num 32.5 34 33 31.5 33.3 32 34 33 34 34 ...
## $ station_min_temp_c : num 20.7 20.8 20.7 14.7 19.1 17 19.9 20.5 19 20 ...
## $ station_precip_mm : num 3 55.6 38.1 30 4 11.5 72.9 50.1 89.2 78 ...
## $ Month : num 7 7 7 7 7 8 8 8 8 9 ...
summary(iq)
## city year weekofyear total_cases
## Length:520 Min. :2000 Min. : 1.00 Min. : 0.000
## Class :character 1st Qu.:2003 1st Qu.:13.75 1st Qu.: 1.000
## Mode :character Median :2005 Median :26.50 Median : 5.000
## Mean :2005 Mean :26.50 Mean : 7.565
## 3rd Qu.:2007 3rd Qu.:39.25 3rd Qu.: 9.000
## Max. :2010 Max. :53.00 Max. :116.000
##
## week_start_date ndvi_ne ndvi_nw ndvi_se
## Min. :2000-07-01 Min. :0.06173 Min. :0.03586 Min. :0.02988
## 1st Qu.:2002-12-30 1st Qu.:0.20000 1st Qu.:0.17954 1st Qu.:0.19474
## Median :2005-06-28 Median :0.26364 Median :0.23297 Median :0.24980
## Mean :2005-06-28 Mean :0.26387 Mean :0.23878 Mean :0.25013
## 3rd Qu.:2007-12-26 3rd Qu.:0.31997 3rd Qu.:0.29393 3rd Qu.:0.30230
## Max. :2010-06-25 Max. :0.50836 Max. :0.45443 Max. :0.53831
## NA's :3 NA's :3 NA's :3
## ndvi_sw precipitation_amt_mm reanalysis_air_temp_k
## Min. :0.06418 Min. : 0.00 Min. :294.6
## 1st Qu.:0.20413 1st Qu.: 39.10 1st Qu.:297.1
## Median :0.26214 Median : 60.47 Median :297.8
## Mean :0.26678 Mean : 64.25 Mean :297.9
## 3rd Qu.:0.32515 3rd Qu.: 85.76 3rd Qu.:298.6
## Max. :0.54602 Max. :210.83 Max. :301.6
## NA's :3 NA's :4 NA's :4
## reanalysis_avg_temp_k reanalysis_dew_point_temp_k reanalysis_max_air_temp_k
## Min. :294.9 Min. :290.1 Min. :300.0
## 1st Qu.:298.2 1st Qu.:294.6 1st Qu.:305.2
## Median :299.1 Median :295.9 Median :307.1
## Mean :299.1 Mean :295.5 Mean :307.1
## 3rd Qu.:300.1 3rd Qu.:296.5 3rd Qu.:308.7
## Max. :302.9 Max. :298.4 Max. :314.0
## NA's :4 NA's :4 NA's :4
## reanalysis_min_air_temp_k reanalysis_precip_amt_kg_per_m2
## Min. :286.9 Min. : 0.00
## 1st Qu.:292.0 1st Qu.: 24.07
## Median :293.1 Median : 46.44
## Mean :292.9 Mean : 57.61
## 3rd Qu.:294.2 3rd Qu.: 71.07
## Max. :296.0 Max. :362.03
## NA's :4 NA's :4
## reanalysis_relative_humidity_percent reanalysis_sat_precip_amt_mm
## Min. :57.79 Min. : 0.00
## 1st Qu.:84.30 1st Qu.: 39.10
## Median :90.92 Median : 60.47
## Mean :88.64 Mean : 64.25
## 3rd Qu.:94.56 3rd Qu.: 85.76
## Max. :98.61 Max. :210.83
## NA's :4 NA's :4
## reanalysis_specific_humidity_g_per_kg reanalysis_tdtr_k station_avg_temp_c
## Min. :12.11 Min. : 3.714 Min. :21.40
## 1st Qu.:16.10 1st Qu.: 7.371 1st Qu.:27.00
## Median :17.43 Median : 8.964 Median :27.60
## Mean :17.10 Mean : 9.207 Mean :27.53
## 3rd Qu.:18.18 3rd Qu.:11.014 3rd Qu.:28.10
## Max. :20.46 Max. :16.029 Max. :30.80
## NA's :4 NA's :4 NA's :37
## station_diur_temp_rng_c station_max_temp_c station_min_temp_c
## Min. : 5.20 Min. :30.1 Min. :14.7
## 1st Qu.: 9.50 1st Qu.:33.2 1st Qu.:20.6
## Median :10.62 Median :34.0 Median :21.3
## Mean :10.57 Mean :34.0 Mean :21.2
## 3rd Qu.:11.65 3rd Qu.:34.9 3rd Qu.:22.0
## Max. :15.80 Max. :42.2 Max. :24.2
## NA's :37 NA's :14 NA's :8
## station_precip_mm Month
## Min. : 0.00 Min. : 1.000
## 1st Qu.: 17.20 1st Qu.: 3.750
## Median : 45.30 Median : 6.500
## Mean : 62.47 Mean : 6.417
## 3rd Qu.: 85.95 3rd Qu.: 9.000
## Max. :543.30 Max. :12.000
## NA's :16
##Correlation plot
ggcorr(iq[, 5:24], label = TRUE, hjust = 0.85, size = 2,color = "azure3",
label_size = 2) +
ggplot2::labs(title = "Correlation Plot (Iquitos)")
## Warning in ggcorr(iq[, 5:24], label = TRUE, hjust = 0.85, size = 2, color =
## "azure3", : data in column(s) 'week_start_date' are not numeric and were ignored
plot(total_cases ~ station_max_temp_c, data=iq
,main = "Scatter Plot of Total Cases and Maximum Temperature in Iquitos")
plot(total_cases ~ reanalysis_specific_humidity_g_per_kg, data=iq,
main = "Scatter Plot of Total Cases and Humidity in Iquitos")
#fill in NAs
iq.im <- na.locf.data.frame(iq)
summary(iq.im)
## city year weekofyear total_cases
## Length:520 Min. :2000 Min. : 1.00 Min. : 0.000
## Class :character 1st Qu.:2003 1st Qu.:13.75 1st Qu.: 1.000
## Mode :character Median :2005 Median :26.50 Median : 5.000
## Mean :2005 Mean :26.50 Mean : 7.565
## 3rd Qu.:2007 3rd Qu.:39.25 3rd Qu.: 9.000
## Max. :2010 Max. :53.00 Max. :116.000
## week_start_date ndvi_ne ndvi_nw ndvi_se
## Min. :2000-07-01 Min. :0.06173 Min. :0.03586 Min. :0.02988
## 1st Qu.:2002-12-30 1st Qu.:0.19951 1st Qu.:0.17947 1st Qu.:0.19461
## Median :2005-06-28 Median :0.26353 Median :0.23285 Median :0.24985
## Mean :2005-06-28 Mean :0.26360 Mean :0.23844 Mean :0.25006
## 3rd Qu.:2007-12-26 3rd Qu.:0.31962 3rd Qu.:0.29383 3rd Qu.:0.30255
## Max. :2010-06-25 Max. :0.50836 Max. :0.45443 Max. :0.53831
## ndvi_sw precipitation_amt_mm reanalysis_air_temp_k
## Min. :0.06418 Min. : 0.00 Min. :294.6
## 1st Qu.:0.20284 1st Qu.: 39.15 1st Qu.:297.1
## Median :0.26187 Median : 60.49 Median :297.8
## Mean :0.26644 Mean : 64.58 Mean :297.9
## 3rd Qu.:0.32522 3rd Qu.: 86.22 3rd Qu.:298.7
## Max. :0.54602 Max. :210.83 Max. :301.6
## reanalysis_avg_temp_k reanalysis_dew_point_temp_k reanalysis_max_air_temp_k
## Min. :294.9 Min. :290.1 Min. :300.0
## 1st Qu.:298.2 1st Qu.:294.6 1st Qu.:305.2
## Median :299.1 Median :295.9 Median :307.0
## Mean :299.1 Mean :295.5 Mean :307.1
## 3rd Qu.:300.1 3rd Qu.:296.6 3rd Qu.:308.7
## Max. :302.9 Max. :298.4 Max. :314.0
## reanalysis_min_air_temp_k reanalysis_precip_amt_kg_per_m2
## Min. :286.9 Min. : 0.00
## 1st Qu.:292.0 1st Qu.: 24.21
## Median :293.1 Median : 46.68
## Mean :292.9 Mean : 57.81
## 3rd Qu.:294.2 3rd Qu.: 71.90
## Max. :296.0 Max. :362.03
## reanalysis_relative_humidity_percent reanalysis_sat_precip_amt_mm
## Min. :57.79 Min. : 0.00
## 1st Qu.:84.39 1st Qu.: 39.15
## Median :90.92 Median : 60.49
## Mean :88.67 Mean : 64.58
## 3rd Qu.:94.59 3rd Qu.: 86.22
## Max. :98.61 Max. :210.83
## reanalysis_specific_humidity_g_per_kg reanalysis_tdtr_k station_avg_temp_c
## Min. :12.11 Min. : 3.714 Min. :21.40
## 1st Qu.:16.12 1st Qu.: 7.371 1st Qu.:26.96
## Median :17.44 Median : 8.957 Median :27.60
## Mean :17.11 Mean : 9.192 Mean :27.52
## 3rd Qu.:18.18 3rd Qu.:11.004 3rd Qu.:28.10
## Max. :20.46 Max. :16.029 Max. :30.80
## station_diur_temp_rng_c station_max_temp_c station_min_temp_c
## Min. : 5.20 Min. :30.10 Min. :14.70
## 1st Qu.: 9.40 1st Qu.:33.20 1st Qu.:20.60
## Median :10.52 Median :34.00 Median :21.40
## Mean :10.49 Mean :33.97 Mean :21.21
## 3rd Qu.:11.64 3rd Qu.:34.92 3rd Qu.:22.00
## Max. :15.80 Max. :42.20 Max. :24.20
## station_precip_mm Month
## Min. : 0.00 Min. : 1.000
## 1st Qu.: 17.00 1st Qu.: 3.750
## Median : 45.10 Median : 6.500
## Mean : 62.01 Mean : 6.417
## 3rd Qu.: 85.67 3rd Qu.: 9.000
## Max. :543.30 Max. :12.000
time series
sj.ts <-ts(sj, start = c(1990, 4, 30), end = c(2004, 10, 28), frequency = 52)
## Warning in data.matrix(data): NAs introduced by coercion
autoplot(sj.ts[,4], main = "Total Cases in San Juan ")
#IQ
iq.ts <-ts(iq, start = c(2005, 11, 11), end = c(2010, 6, 25), frequency = 52)
## Warning in data.matrix(data): NAs introduced by coercion
autoplot(iq.ts[,4], main = "Total Cases in Iquitos")
Train and Test sets
#Creating a test and train set from time series above before moving into modelling
#San Juan
sj.train <- window(sj.ts, end = c(1999,12))
sj.test <- window(sj.ts, start = c(1999,13))
##Plot of total cases in the train and test sets
autoplot(sj.train[,4]) + autolayer(sj.test[,4])
#Iquitos
iq.train <- window(iq.ts, end = c(2008,27))
iq.test <- window(iq.ts, start = c(2008,28))
##Plot of total cases in the train and test sets
autoplot(iq.train[,4]) + autolayer(iq.test[,4])
Data Modeling
##MOdel1 -SARIMA
#san Juan
sj.m1 <- auto.arima(sj.train[,4])
summary(sj.m1)
## Series: sj.train[, 4]
## ARIMA(2,0,2)(0,0,1)[52] with non-zero mean
##
## Coefficients:
## ar1 ar2 ma1 ma2 sma1 mean
## 1.8194 -0.8408 -1.3424 0.4940 -0.0673 47.3764
## s.e. 0.0676 0.0633 0.0692 0.0468 0.0434 9.7257
##
## sigma^2 estimated as 1043: log likelihood=-2332.45
## AIC=4678.91 AICc=4679.15 BIC=4708.08
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.06277537 32.09904 16.10851 -42.57498 61.7512 0.3092194
## ACF1
## Training set 0.02667101
##Forecast and plot
sj.m1.fcast <- forecast(sj.m1,h=5)
autoplot(sj.m1.fcast)
# Model 2:Arima
sj.m2 <- arima(sj.train[,4], order = c(2,0,2), seasonal = list(order = c(1,0,1)))
sj.m2.fcast <- forecast(sj.m2,h=5)
autoplot(sj.m2.fcast)
##IQUITOS
#SARIMA
iq.m1 <- auto.arima(iq.train[,4])
summary(iq.m1)
## Series: iq.train[, 4]
## ARIMA(2,1,0)
##
## Coefficients:
## ar1 ar2
## -0.8857 -0.2839
## s.e. 0.0728 0.0726
##
## sigma^2 estimated as 12.98: log likelihood=-463.88
## AIC=933.77 AICc=933.91 BIC=943.21
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.02160312 3.570811 2.104061 NaN Inf 0.2157554 0.01704513
##Forecast and plot
iq.m1.fcast <- forecast(iq.m1,h=5)
#Model2:Arima
iq.m2 <- arima(iq.train[,4], order = c(2,0,2), seasonal = list(order = c(1,0,1)))
iq.m2.fcast <- forecast(iq.m2,h=5)
autoplot(iq.m2.fcast)
##Accuracy
accuracy(sj.m1.fcast)
## ME RMSE MAE MPE MAPE MASE
## Training set 0.06277537 32.09904 16.10851 -42.57498 61.7512 0.3092194
## ACF1
## Training set 0.02667101
accuracy(sj.m2.fcast)
## ME RMSE MAE MPE MAPE MASE
## Training set 0.07202379 32.09194 16.10033 -42.46995 61.67487 0.3090624
## ACF1
## Training set 0.02673673
accuracy(iq.m1.fcast)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.02160312 3.570811 2.104061 NaN Inf 0.2157554 0.01704513
accuracy(iq.m2.fcast)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.0623992 3.52628 2.150196 NaN Inf 0.2204862 -0.008728715
For some unknown reason, I am unable to run any Neural Network, ETS, or Holt winter models in rmarkdown so i am going to have to use these two ```