Midterm Paper

Load the data.

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.0.5

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v purrr   0.3.4     v forcats 0.5.1
## v stringr 1.4.0

## Warning: package 'purrr' was built under R version 4.0.5

## Warning: package 'stringr' was built under R version 4.0.5

## Warning: package 'forcats' was built under R version 4.0.5

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x lubridate::as.difftime() masks base::as.difftime()
## x data.table::between()    masks dplyr::between()
## x readr::col_factor()      masks scales::col_factor()
## x gridExtra::combine()     masks dplyr::combine()
## x lubridate::date()        masks base::date()
## x purrr::discard()         masks scales::discard()
## x dplyr::filter()          masks stats::filter()
## x data.table::first()      masks dplyr::first()
## x data.table::hour()       masks lubridate::hour()
## x tsibble::intersect()     masks lubridate::intersect(), base::intersect()
## x tsibble::interval()      masks lubridate::interval()
## x data.table::isoweek()    masks lubridate::isoweek()
## x dplyr::lag()             masks stats::lag()
## x data.table::last()       masks dplyr::last()
## x purrr::map()             masks maps::map()
## x data.table::mday()       masks lubridate::mday()
## x data.table::minute()     masks lubridate::minute()
## x data.table::month()      masks lubridate::month()
## x data.table::quarter()    masks lubridate::quarter()
## x data.table::second()     masks lubridate::second()
## x MASS::select()           masks dplyr::select()
## x tsibble::setdiff()       masks lubridate::setdiff(), base::setdiff()
## x purrr::transpose()       masks data.table::transpose()
## x tsibble::union()         masks lubridate::union(), base::union()
## x data.table::wday()       masks lubridate::wday()
## x data.table::week()       masks lubridate::week()
## x data.table::yday()       masks lubridate::yday()
## x data.table::year()       masks lubridate::year()

train <- read_csv("C:/Users/MMENDEZ/Downloads/train.csv/train.csv")

## Rows: 969640 Columns: 9

## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (4): County, Province_State, Country_Region, Target
## dbl  (4): Id, Population, Weight, TargetValue
## date (1): Date
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

train$Date<-as.Date(train$Date)
test <- read_csv("C:/Users/MMENDEZ/Downloads/test.csv/test.csv")

## Rows: 311670 Columns: 8
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (4): County, Province_State, Country_Region, Target
## dbl  (3): ForecastId, Population, Weight
## date (1): Date
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

test$Date<-as.Date(test$Date)

DATA DESCRIPTION

head(train)

## # A tibble: 6 x 9
##      Id County Province_State Country_Region Population Weight Date       Target
##   <dbl> <chr>  <chr>          <chr>               <dbl>  <dbl> <date>     <chr> 
## 1     1 <NA>   <NA>           Afghanistan      27657145 0.0584 2020-01-23 Confi~
## 2     2 <NA>   <NA>           Afghanistan      27657145 0.584  2020-01-23 Fatal~
## 3     3 <NA>   <NA>           Afghanistan      27657145 0.0584 2020-01-24 Confi~
## 4     4 <NA>   <NA>           Afghanistan      27657145 0.584  2020-01-24 Fatal~
## 5     5 <NA>   <NA>           Afghanistan      27657145 0.0584 2020-01-25 Confi~
## 6     6 <NA>   <NA>           Afghanistan      27657145 0.584  2020-01-25 Fatal~
## # ... with 1 more variable: TargetValue <dbl>

tail(train)

## # A tibble: 6 x 9
##       Id County Province_State Country_Region Population Weight Date      
##    <dbl> <chr>  <chr>          <chr>               <dbl>  <dbl> <date>    
## 1 969635 <NA>   <NA>           Zimbabwe         14240168 0.0607 2020-06-08
## 2 969636 <NA>   <NA>           Zimbabwe         14240168 0.607  2020-06-08
## 3 969637 <NA>   <NA>           Zimbabwe         14240168 0.0607 2020-06-09
## 4 969638 <NA>   <NA>           Zimbabwe         14240168 0.607  2020-06-09
## 5 969639 <NA>   <NA>           Zimbabwe         14240168 0.0607 2020-06-10
## 6 969640 <NA>   <NA>           Zimbabwe         14240168 0.607  2020-06-10
## # ... with 2 more variables: Target <chr>, TargetValue <dbl>

train<-train[train$Target=="ConfirmedCases",]
Confirmed_over_time<-train %>%
  group_by(Date) %>%
  dplyr::summarize(Number=sum(TargetValue))

options(scipen = 9)
p <- ggplot(Confirmed_over_time, aes(x=Date, y=Number)) +
  geom_line( color="steelblue") + 
  xlab("") +
  scale_x_date(date_minor_breaks = "5 day") +scale_y_continuous(breaks=seq(0,max(Confirmed_over_time$Number)+100000,50000))+ggtitle('Worldwide Confirmed Cases Over Time')+theme(plot.title = element_text(size = 10, face = "bold"))
p_log<-ggplot(Confirmed_over_time, aes(x=Date, y=log(Number))) +
  geom_line( color="steelblue") + 
  xlab("") +
  scale_x_date(date_minor_breaks = "5 day")+ggtitle("Worldwide Confirmed Cases (Logarithmic Scale) Over Time")+theme(plot.title = element_text(size = 10, face = "bold"))
ggarrange(p,p_log,ncol=2,nrow = 1)

p_log

Here we see the optimal values of alpha and Lo

countryplot<-function(x){
  data<-train%>%
    filter(Country_Region==paste0(x,''))%>%
    group_by(Date) %>% summarize(sum(TargetValue))
  names(data)[2]<-{'Number'}
  t <- ggplot(data, aes(x=Date, y=Number)) +
    geom_line( color="steelblue") + 
    xlab("") +
    scale_x_date(date_minor_breaks = "5 day") +scale_y_continuous((breaks=seq(0,max(data$Number),5000)))+ggtitle(paste0(x,' Confirmed Cases Over Time'))+theme(plot.title = element_text(size = 8, face = "bold"))
  t_log<-ggplot(data, aes(x=Date, y=log(Number+1))) +
    geom_line( color="steelblue") + 
    xlab("") +
    scale_x_date(date_minor_breaks = "5 day")+ggtitle(paste0(x, ' Confirmed Cases (Logarithmic Scale) Over Time'))+theme(plot.title = element_text(size = 9, face = "bold"))
  all<-ggarrange(t,t_log,ncol=2,nrow = 1)
  return(all)
}
countryplot('US')

countryplot('China')

## Warning in log(Number + 1): Se han producido NaNs

## Warning in log(Number + 1): Se han producido NaNs

countryplot('Italy')

countryplot('United Kingdom')

## Warning in log(Number + 1): Se han producido NaNs

## Warning in log(Number + 1): Se han producido NaNs

Models

Time serie decomposition

train_daily <-train %>% group_by(day=floor_date(Date, "day")) %>%
   summarize( Cases=sum(TargetValue))


set.seed(1)
d_tsibble <- train_daily %>%
  as_tsibble(index = day)

autoplot(d_tsibble)

## Plot variable not specified, automatically selected `.vars = Cases`

d_tsibble %>%
  model(
    classical_decomposition( Cases, type = "multiplicative")
  ) %>%
  components() %>%
  autoplot() +
  labs(title = "Classical multiplicative decomposition of COVID virus")

## Warning: Removed 3 row(s) containing missing values (geom_path).

The function obtained the same result as the package function.

NAIVE model

fit <- d_tsibble %>%
  model(SNAIVE(Cases))
report(fit)

## Series: Cases 
## Model: SNAIVE 
## 
## sigma^2: 329879099.4262

ETS model

fit <- d_tsibble %>%
  model(ETS(Cases))
report(fit)

## Series: Cases 
## Model: ETS(A,Ad,A) 
##   Smoothing parameters:
##     alpha = 0.2578272 
##     beta  = 0.09843796 
##     gamma = 0.3131168 
##     phi   = 0.9266271 
## 
##   Initial states:
##       l[0]     b[0]      s[0]     s[-1]     s[-2]     s[-3]    s[-4]    s[-5]
##  -5565.125 1256.221 -1600.775 -5199.893 -9049.631 -4797.199 3153.782 9948.941
##     s[-6]
##  7544.776
## 
##   sigma^2:  85388234
## 
##      AIC     AICc      BIC 
## 3262.065 3264.954 3300.306

HOLT-WINTERS

fit2 <-  d_tsibble %>%
    model(
      "Holt-Winters Multiplicative" =ETS(Cases ~ error("M")+
                                                    trend("A")+
                                                   season("M"))
  )

report(fit2)

## Series: Cases 
## Model: ETS(M,A,M) 
##   Smoothing parameters:
##     alpha = 0.4154206 
##     beta  = 0.006038775 
##     gamma = 0.1939821 
## 
##   Initial states:
##       l[0]     b[0]      s[0]     s[-1]     s[-2]     s[-3]     s[-4]     s[-5]
##  -1110.819 692.1911 0.6068596 0.7319418 0.4018263 0.7283801 0.5661652 0.7766276
##     s[-6]
##  3.188199
## 
##   sigma^2:  0.1717
## 
##      AIC     AICc      BIC 
## 3373.635 3376.091 3408.934

ARIMA

fit <- d_tsibble %>%
  model(ARIMA(Cases))
report(fit)

## Series: Cases 
## Model: ARIMA(1,0,4)(0,1,1)[7] 
## 
## Coefficients:
##          ar1      ma1     ma2      ma3     ma4     sma1
##       0.9732  -0.4704  0.0439  -0.0223  0.2271  -0.5582
## s.e.  0.0190   0.0942  0.1110   0.0795  0.0961   0.1000
## 
## sigma^2 estimated as 80446334:  log likelihood=-1398.21
## AIC=2810.43   AICc=2811.32   BIC=2830.66

ETS (BOXCOX)

lambda_1 <- d_tsibble %>%
                  features(Cases, features = guerrero) %>%
                  pull(lambda_guerrero)
covid_trans<-d_tsibble %>%
  mutate(cases=box_cox(Cases,lambda_1))

fit <- covid_trans%>%
  model(
    "ETS (BoxCox)" = ETS(cases)
  )

report(fit)

## Series: cases 
## Model: ETS(A,Ad,A) 
##   Smoothing parameters:
##     alpha = 0.4678152 
##     beta  = 0.08104293 
##     gamma = 0.0001000164 
##     phi   = 0.9466274 
## 
##   Initial states:
##       l[0]     b[0]      s[0]     s[-1]     s[-2]     s[-3]    s[-4]    s[-5]
##  -46.58421 41.71235 -41.96523 -62.10599 -123.1441 -63.23546 41.62505 134.9795
##     s[-6]
##  113.8462
## 
##   sigma^2:  22696.38
## 
##      AIC     AICc      BIC 
## 2109.479 2112.368 2147.720

ARIMA(BOXCOX)

lambda_1 <- d_tsibble %>%
                  features(Cases, features = guerrero) %>%
                  pull(lambda_guerrero)
covid_trans<-d_tsibble %>%
  mutate(cases=box_cox(Cases,lambda_1))

fit <- covid_trans%>%
  model(
    "ARIMA (BoxCox)" = ARIMA(cases)
  )

report(fit)

## Series: cases 
## Model: ARIMA(0,1,1)(2,0,0)[7] 
## 
## Coefficients:
##           ma1    sar1    sar2
##       -0.3879  0.3663  0.2297
## s.e.   0.0813  0.0880  0.0857
## 
## sigma^2 estimated as 24681:  log likelihood=-899.98
## AIC=1807.97   AICc=1808.26   BIC=1819.7

COMPARED

lambda_1 <- d_tsibble %>%
                  features(Cases, features = guerrero) %>%
                  pull(lambda_guerrero)
covid_trans<-d_tsibble %>%
  mutate(Cases=box_cox(Cases,lambda_1))

fit <- covid_trans%>%
  model(
    "ARIMA (BoxCox)" = ARIMA(Cases),
    "ETS (BoxCox)" = ETS(Cases)
  )

fit2 <- d_tsibble %>%
  model("Naive"=SNAIVE(Cases),
        "ETS" = ETS(Cases),
        "ARIMA" = ARIMA(Cases)
        
        )
fit3 <-  d_tsibble %>%
    model(
      "Holt-Winters Multiplicative" =ETS(Cases ~ error("M")+
                                                    trend("A")+
                                                   season("M"))
  )
  

rbind(accuracy(fit),accuracy(fit2),accuracy(fit3))

## # A tibble: 6 x 10
##   .model           .type      ME   RMSE    MAE    MPE  MAPE  MASE RMSSE     ACF1
##   <chr>            <chr>   <dbl>  <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl>    <dbl>
## 1 ARIMA (BoxCox)   Trai~   13.8    155.   110.  -2.02  18.3 0.461 0.436 -0.00216
## 2 ETS (BoxCox)     Trai~    8.81   144.   114.  -3.03  24.3 0.474 0.406  0.108  
## 3 Naive            Trai~ 8538.   20007. 12876. -27.4   75.6 1     1      0.780  
## 4 ETS              Trai~  648.    8836.  6849. -11.8   90.0 0.532 0.442  0.173  
## 5 ARIMA            Trai~  622.    8543.  6111.  -5.89  41.3 0.475 0.427  0.00657
## 6 Holt-Winters Mu~ Trai~ -904.   14944.  9266. -28.1   57.5 0.720 0.747 -0.0394

RESIDUAL PLOT

fit %>% dplyr::select("ETS (BoxCox)") %>% gg_tsresiduals() + ggtitle("Residual plot of ETS (Box Cox) model")

Midterm Paper

Miguel Angel Méndez Hernández

18/4/2022

Load the data.

DATA DESCRIPTION

Models