library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v purrr 0.3.4 v forcats 0.5.1
## v stringr 1.4.0
## Warning: package 'purrr' was built under R version 4.0.5
## Warning: package 'stringr' was built under R version 4.0.5
## Warning: package 'forcats' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x lubridate::as.difftime() masks base::as.difftime()
## x data.table::between() masks dplyr::between()
## x readr::col_factor() masks scales::col_factor()
## x gridExtra::combine() masks dplyr::combine()
## x lubridate::date() masks base::date()
## x purrr::discard() masks scales::discard()
## x dplyr::filter() masks stats::filter()
## x data.table::first() masks dplyr::first()
## x data.table::hour() masks lubridate::hour()
## x tsibble::intersect() masks lubridate::intersect(), base::intersect()
## x tsibble::interval() masks lubridate::interval()
## x data.table::isoweek() masks lubridate::isoweek()
## x dplyr::lag() masks stats::lag()
## x data.table::last() masks dplyr::last()
## x purrr::map() masks maps::map()
## x data.table::mday() masks lubridate::mday()
## x data.table::minute() masks lubridate::minute()
## x data.table::month() masks lubridate::month()
## x data.table::quarter() masks lubridate::quarter()
## x data.table::second() masks lubridate::second()
## x MASS::select() masks dplyr::select()
## x tsibble::setdiff() masks lubridate::setdiff(), base::setdiff()
## x purrr::transpose() masks data.table::transpose()
## x tsibble::union() masks lubridate::union(), base::union()
## x data.table::wday() masks lubridate::wday()
## x data.table::week() masks lubridate::week()
## x data.table::yday() masks lubridate::yday()
## x data.table::year() masks lubridate::year()
train <- read_csv("C:/Users/MMENDEZ/Downloads/train.csv/train.csv")
## Rows: 969640 Columns: 9
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (4): County, Province_State, Country_Region, Target
## dbl (4): Id, Population, Weight, TargetValue
## date (1): Date
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
train$Date<-as.Date(train$Date)
test <- read_csv("C:/Users/MMENDEZ/Downloads/test.csv/test.csv")
## Rows: 311670 Columns: 8
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (4): County, Province_State, Country_Region, Target
## dbl (3): ForecastId, Population, Weight
## date (1): Date
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
test$Date<-as.Date(test$Date)
head(train)
## # A tibble: 6 x 9
## Id County Province_State Country_Region Population Weight Date Target
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <date> <chr>
## 1 1 <NA> <NA> Afghanistan 27657145 0.0584 2020-01-23 Confi~
## 2 2 <NA> <NA> Afghanistan 27657145 0.584 2020-01-23 Fatal~
## 3 3 <NA> <NA> Afghanistan 27657145 0.0584 2020-01-24 Confi~
## 4 4 <NA> <NA> Afghanistan 27657145 0.584 2020-01-24 Fatal~
## 5 5 <NA> <NA> Afghanistan 27657145 0.0584 2020-01-25 Confi~
## 6 6 <NA> <NA> Afghanistan 27657145 0.584 2020-01-25 Fatal~
## # ... with 1 more variable: TargetValue <dbl>
tail(train)
## # A tibble: 6 x 9
## Id County Province_State Country_Region Population Weight Date
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <date>
## 1 969635 <NA> <NA> Zimbabwe 14240168 0.0607 2020-06-08
## 2 969636 <NA> <NA> Zimbabwe 14240168 0.607 2020-06-08
## 3 969637 <NA> <NA> Zimbabwe 14240168 0.0607 2020-06-09
## 4 969638 <NA> <NA> Zimbabwe 14240168 0.607 2020-06-09
## 5 969639 <NA> <NA> Zimbabwe 14240168 0.0607 2020-06-10
## 6 969640 <NA> <NA> Zimbabwe 14240168 0.607 2020-06-10
## # ... with 2 more variables: Target <chr>, TargetValue <dbl>
train<-train[train$Target=="ConfirmedCases",]
Confirmed_over_time<-train %>%
group_by(Date) %>%
dplyr::summarize(Number=sum(TargetValue))
options(scipen = 9)
p <- ggplot(Confirmed_over_time, aes(x=Date, y=Number)) +
geom_line( color="steelblue") +
xlab("") +
scale_x_date(date_minor_breaks = "5 day") +scale_y_continuous(breaks=seq(0,max(Confirmed_over_time$Number)+100000,50000))+ggtitle('Worldwide Confirmed Cases Over Time')+theme(plot.title = element_text(size = 10, face = "bold"))
p_log<-ggplot(Confirmed_over_time, aes(x=Date, y=log(Number))) +
geom_line( color="steelblue") +
xlab("") +
scale_x_date(date_minor_breaks = "5 day")+ggtitle("Worldwide Confirmed Cases (Logarithmic Scale) Over Time")+theme(plot.title = element_text(size = 10, face = "bold"))
ggarrange(p,p_log,ncol=2,nrow = 1)
p_log
Here we see the optimal values of alpha and Lo
countryplot<-function(x){
data<-train%>%
filter(Country_Region==paste0(x,''))%>%
group_by(Date) %>% summarize(sum(TargetValue))
names(data)[2]<-{'Number'}
t <- ggplot(data, aes(x=Date, y=Number)) +
geom_line( color="steelblue") +
xlab("") +
scale_x_date(date_minor_breaks = "5 day") +scale_y_continuous((breaks=seq(0,max(data$Number),5000)))+ggtitle(paste0(x,' Confirmed Cases Over Time'))+theme(plot.title = element_text(size = 8, face = "bold"))
t_log<-ggplot(data, aes(x=Date, y=log(Number+1))) +
geom_line( color="steelblue") +
xlab("") +
scale_x_date(date_minor_breaks = "5 day")+ggtitle(paste0(x, ' Confirmed Cases (Logarithmic Scale) Over Time'))+theme(plot.title = element_text(size = 9, face = "bold"))
all<-ggarrange(t,t_log,ncol=2,nrow = 1)
return(all)
}
countryplot('US')
countryplot('China')
## Warning in log(Number + 1): Se han producido NaNs
## Warning in log(Number + 1): Se han producido NaNs
countryplot('Italy')
countryplot('United Kingdom')
## Warning in log(Number + 1): Se han producido NaNs
## Warning in log(Number + 1): Se han producido NaNs
Time serie decomposition
train_daily <-train %>% group_by(day=floor_date(Date, "day")) %>%
summarize( Cases=sum(TargetValue))
set.seed(1)
d_tsibble <- train_daily %>%
as_tsibble(index = day)
autoplot(d_tsibble)
## Plot variable not specified, automatically selected `.vars = Cases`
d_tsibble %>%
model(
classical_decomposition( Cases, type = "multiplicative")
) %>%
components() %>%
autoplot() +
labs(title = "Classical multiplicative decomposition of COVID virus")
## Warning: Removed 3 row(s) containing missing values (geom_path).
The function obtained the same result as the package function.
NAIVE model
fit <- d_tsibble %>%
model(SNAIVE(Cases))
report(fit)
## Series: Cases
## Model: SNAIVE
##
## sigma^2: 329879099.4262
ETS model
fit <- d_tsibble %>%
model(ETS(Cases))
report(fit)
## Series: Cases
## Model: ETS(A,Ad,A)
## Smoothing parameters:
## alpha = 0.2578272
## beta = 0.09843796
## gamma = 0.3131168
## phi = 0.9266271
##
## Initial states:
## l[0] b[0] s[0] s[-1] s[-2] s[-3] s[-4] s[-5]
## -5565.125 1256.221 -1600.775 -5199.893 -9049.631 -4797.199 3153.782 9948.941
## s[-6]
## 7544.776
##
## sigma^2: 85388234
##
## AIC AICc BIC
## 3262.065 3264.954 3300.306
HOLT-WINTERS
fit2 <- d_tsibble %>%
model(
"Holt-Winters Multiplicative" =ETS(Cases ~ error("M")+
trend("A")+
season("M"))
)
report(fit2)
## Series: Cases
## Model: ETS(M,A,M)
## Smoothing parameters:
## alpha = 0.4154206
## beta = 0.006038775
## gamma = 0.1939821
##
## Initial states:
## l[0] b[0] s[0] s[-1] s[-2] s[-3] s[-4] s[-5]
## -1110.819 692.1911 0.6068596 0.7319418 0.4018263 0.7283801 0.5661652 0.7766276
## s[-6]
## 3.188199
##
## sigma^2: 0.1717
##
## AIC AICc BIC
## 3373.635 3376.091 3408.934
ARIMA
fit <- d_tsibble %>%
model(ARIMA(Cases))
report(fit)
## Series: Cases
## Model: ARIMA(1,0,4)(0,1,1)[7]
##
## Coefficients:
## ar1 ma1 ma2 ma3 ma4 sma1
## 0.9732 -0.4704 0.0439 -0.0223 0.2271 -0.5582
## s.e. 0.0190 0.0942 0.1110 0.0795 0.0961 0.1000
##
## sigma^2 estimated as 80446334: log likelihood=-1398.21
## AIC=2810.43 AICc=2811.32 BIC=2830.66
ETS (BOXCOX)
lambda_1 <- d_tsibble %>%
features(Cases, features = guerrero) %>%
pull(lambda_guerrero)
covid_trans<-d_tsibble %>%
mutate(cases=box_cox(Cases,lambda_1))
fit <- covid_trans%>%
model(
"ETS (BoxCox)" = ETS(cases)
)
report(fit)
## Series: cases
## Model: ETS(A,Ad,A)
## Smoothing parameters:
## alpha = 0.4678152
## beta = 0.08104293
## gamma = 0.0001000164
## phi = 0.9466274
##
## Initial states:
## l[0] b[0] s[0] s[-1] s[-2] s[-3] s[-4] s[-5]
## -46.58421 41.71235 -41.96523 -62.10599 -123.1441 -63.23546 41.62505 134.9795
## s[-6]
## 113.8462
##
## sigma^2: 22696.38
##
## AIC AICc BIC
## 2109.479 2112.368 2147.720
ARIMA(BOXCOX)
lambda_1 <- d_tsibble %>%
features(Cases, features = guerrero) %>%
pull(lambda_guerrero)
covid_trans<-d_tsibble %>%
mutate(cases=box_cox(Cases,lambda_1))
fit <- covid_trans%>%
model(
"ARIMA (BoxCox)" = ARIMA(cases)
)
report(fit)
## Series: cases
## Model: ARIMA(0,1,1)(2,0,0)[7]
##
## Coefficients:
## ma1 sar1 sar2
## -0.3879 0.3663 0.2297
## s.e. 0.0813 0.0880 0.0857
##
## sigma^2 estimated as 24681: log likelihood=-899.98
## AIC=1807.97 AICc=1808.26 BIC=1819.7
COMPARED
lambda_1 <- d_tsibble %>%
features(Cases, features = guerrero) %>%
pull(lambda_guerrero)
covid_trans<-d_tsibble %>%
mutate(Cases=box_cox(Cases,lambda_1))
fit <- covid_trans%>%
model(
"ARIMA (BoxCox)" = ARIMA(Cases),
"ETS (BoxCox)" = ETS(Cases)
)
fit2 <- d_tsibble %>%
model("Naive"=SNAIVE(Cases),
"ETS" = ETS(Cases),
"ARIMA" = ARIMA(Cases)
)
fit3 <- d_tsibble %>%
model(
"Holt-Winters Multiplicative" =ETS(Cases ~ error("M")+
trend("A")+
season("M"))
)
rbind(accuracy(fit),accuracy(fit2),accuracy(fit3))
## # A tibble: 6 x 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ARIMA (BoxCox) Trai~ 13.8 155. 110. -2.02 18.3 0.461 0.436 -0.00216
## 2 ETS (BoxCox) Trai~ 8.81 144. 114. -3.03 24.3 0.474 0.406 0.108
## 3 Naive Trai~ 8538. 20007. 12876. -27.4 75.6 1 1 0.780
## 4 ETS Trai~ 648. 8836. 6849. -11.8 90.0 0.532 0.442 0.173
## 5 ARIMA Trai~ 622. 8543. 6111. -5.89 41.3 0.475 0.427 0.00657
## 6 Holt-Winters Mu~ Trai~ -904. 14944. 9266. -28.1 57.5 0.720 0.747 -0.0394
RESIDUAL PLOT
fit %>% dplyr::select("ETS (BoxCox)") %>% gg_tsresiduals() + ggtitle("Residual plot of ETS (Box Cox) model")