#train
train<-read_csv("train.csv")
## Rows: 969640 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): County, Province_State, Country_Region, Target
## dbl  (4): Id, Population, Weight, TargetValue
## date (1): Date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(train)
## spec_tbl_df [969,640 × 9] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Id            : num [1:969640] 1 2 3 4 5 6 7 8 9 10 ...
##  $ County        : chr [1:969640] NA NA NA NA ...
##  $ Province_State: chr [1:969640] NA NA NA NA ...
##  $ Country_Region: chr [1:969640] "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ Population    : num [1:969640] 27657145 27657145 27657145 27657145 27657145 ...
##  $ Weight        : num [1:969640] 0.0584 0.5836 0.0584 0.5836 0.0584 ...
##  $ Date          : Date[1:969640], format: "2020-01-23" "2020-01-23" ...
##  $ Target        : chr [1:969640] "ConfirmedCases" "Fatalities" "ConfirmedCases" "Fatalities" ...
##  $ TargetValue   : num [1:969640] 0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Id = col_double(),
##   ..   County = col_character(),
##   ..   Province_State = col_character(),
##   ..   Country_Region = col_character(),
##   ..   Population = col_double(),
##   ..   Weight = col_double(),
##   ..   Date = col_date(format = ""),
##   ..   Target = col_character(),
##   ..   TargetValue = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
###Describe Data
#Top 10 country with the highest Confirmed Cases and fatalities
con<-train %>%
  filter(Target=='ConfirmedCases') %>%
  group_by(Country_Region) %>%
  summarise(TargetValue=sum(TargetValue)) %>%
  arrange(desc(TargetValue))
con[1:10,]
## # A tibble: 10 × 2
##    Country_Region TargetValue
##    <chr>                <dbl>
##  1 US                 5979962
##  2 Brazil              772416
##  3 Russia              493023
##  4 United Kingdom      291588
##  5 India               276583
##  6 Spain               242280
##  7 Italy               235763
##  8 Peru                208823
##  9 Canada              197414
## 10 France              192068
fat<-train %>%
  filter(Target=='Fatalities') %>%
  group_by(Country_Region) %>%
  summarise(TargetValue=sum(TargetValue)) %>%
  arrange(desc(TargetValue))
fat[1:10,]
## # A tibble: 10 × 2
##    Country_Region TargetValue
##    <chr>                <dbl>
##  1 US                  337252
##  2 United Kingdom       41213
##  3 Brazil               39680
##  4 Italy                34114
##  5 France               29322
##  6 Spain                27136
##  7 Canada               16074
##  8 Mexico               15357
##  9 Belgium               9629
## 10 China                 9242
#United States case
US<-train %>%
  filter(Country_Region=='US')
colSums(is.na(US))
##             Id         County Province_State Country_Region     Population 
##              0          15400            280              0              0 
##         Weight           Date         Target    TargetValue 
##              0              0              0              0
#filter the na values
US <- US %>%
  filter(is.na(Province_State)) 
ggplot(data = US, 
       mapping=aes(x = Date, 
                     y = TargetValue,
                     fill=Target,
                     color = Target)) + 
  geom_line() +
  scale_color_manual(values=c("black", "red")) + 
  theme(legend.position = c(0.175, 0.835))+
  labs(title="US Confirmed Cases vs Fatalities")

#US confirmed cases, date from 01-23-2020 to 04-26-2020
US_con<- US %>%
  filter(Target=='ConfirmedCases')

########Reorganize data
US_con_train<- US_con %>%
  filter(Date<='2020-04-26')
head(US_con_train)
## # A tibble: 6 × 9
##       Id County Province_State Country_Region Population Weight Date      
##    <dbl> <chr>  <chr>          <chr>               <dbl>  <dbl> <date>    
## 1 962921 <NA>   <NA>           US              324141489 0.0510 2020-01-23
## 2 962923 <NA>   <NA>           US              324141489 0.0510 2020-01-24
## 3 962925 <NA>   <NA>           US              324141489 0.0510 2020-01-25
## 4 962927 <NA>   <NA>           US              324141489 0.0510 2020-01-26
## 5 962929 <NA>   <NA>           US              324141489 0.0510 2020-01-27
## 6 962931 <NA>   <NA>           US              324141489 0.0510 2020-01-28
## # … with 2 more variables: Target <chr>, TargetValue <dbl>
#test date from 04-27-2020 to 06-10-2020
US_con_test<-US_con %>%
  filter(Date>'2020-04-26')
US_con_test1<-US_con_test%>%
  group_by(Date) %>%
  select(-County, -Province_State) %>%
  pivot_wider(names_from = Target, values_from = TargetValue, values_fill = 0)
head(US_con_test1)
## # A tibble: 6 × 6
## # Groups:   Date [6]
##       Id Country_Region Population Weight Date       ConfirmedCases
##    <dbl> <chr>               <dbl>  <dbl> <date>              <dbl>
## 1 963111 US              324141489 0.0510 2020-04-27          22414
## 2 963113 US              324141489 0.0510 2020-04-28          24385
## 3 963115 US              324141489 0.0510 2020-04-29          27327
## 4 963117 US              324141489 0.0510 2020-04-30          29515
## 5 963119 US              324141489 0.0510 2020-05-01          34037
## 6 963121 US              324141489 0.0510 2020-05-02          29078
test_ts <- zoo(US_con_test1$ConfirmedCases,seq(from=as.Date("2020-04-27"), to=as.Date("2020-06-10"), by = "day"))
test_ts
## 2020-04-27 2020-04-28 2020-04-29 2020-04-30 2020-05-01 2020-05-02 2020-05-03 
##      22414      24385      27327      29515      34037      29078      25501 
## 2020-05-04 2020-05-05 2020-05-06 2020-05-07 2020-05-08 2020-05-09 2020-05-10 
##      22335      23976      24251      28420      26906      25620      19710 
## 2020-05-11 2020-05-12 2020-05-13 2020-05-14 2020-05-15 2020-05-16 2020-05-17 
##      18618      21693      20832      27368      25050      24994      18937 
## 2020-05-18 2020-05-19 2020-05-20 2020-05-21 2020-05-22 2020-05-23 2020-05-24 
##      21551      20260      23285      25293      23790      21675      20634 
## 2020-05-25 2020-05-26 2020-05-27 2020-05-28 2020-05-29 2020-05-30 2020-05-31 
##      19056      18611      18263      22577      24266      24146      20007 
## 2020-06-01 2020-06-02 2020-06-03 2020-06-04 2020-06-05 2020-06-06 2020-06-07 
##      20848      20800      19698      21138      29972      23133      18114 
## 2020-06-08 2020-06-09 2020-06-10 
##      17547      17981      21053
US_con_train1<-US_con_train%>%
  group_by(Date) %>%
  select(-County, -Province_State) %>%
  pivot_wider(names_from = Target, values_from = TargetValue, values_fill = 0)
head(US_con_train1)
## # A tibble: 6 × 6
## # Groups:   Date [6]
##       Id Country_Region Population Weight Date       ConfirmedCases
##    <dbl> <chr>               <dbl>  <dbl> <date>              <dbl>
## 1 962921 US              324141489 0.0510 2020-01-23              0
## 2 962923 US              324141489 0.0510 2020-01-24              0
## 3 962925 US              324141489 0.0510 2020-01-25              0
## 4 962927 US              324141489 0.0510 2020-01-26              0
## 5 962929 US              324141489 0.0510 2020-01-27              0
## 6 962931 US              324141489 0.0510 2020-01-28              0
adf.test(US_con_train1$ConfirmedCases)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  US_con_train1$ConfirmedCases
## Dickey-Fuller = -1.5933, Lag order = 4, p-value = 0.7444
## alternative hypothesis: stationary
#p-value = 0.7444 > 0.05, the data is not stationary
#convert the US train data to time series
train_ts <- zoo(US_con_train1$ConfirmedCases,seq(from=as.Date("2020-01-23"), to=as.Date("2020-04-26"), by = "day"))
train_ts
## 2020-01-23 2020-01-24 2020-01-25 2020-01-26 2020-01-27 2020-01-28 2020-01-29 
##          0          0          0          0          0          0          0 
## 2020-01-30 2020-01-31 2020-02-01 2020-02-02 2020-02-03 2020-02-04 2020-02-05 
##          0          0          0          0          0          0          0 
## 2020-02-06 2020-02-07 2020-02-08 2020-02-09 2020-02-10 2020-02-11 2020-02-12 
##          0          0          0          0          0          0          0 
## 2020-02-13 2020-02-14 2020-02-15 2020-02-16 2020-02-17 2020-02-18 2020-02-19 
##          0          0          0          0          0          0          0 
## 2020-02-20 2020-02-21 2020-02-22 2020-02-23 2020-02-24 2020-02-25 2020-02-26 
##          0          0          0          0          0          0          0 
## 2020-02-27 2020-02-28 2020-02-29 2020-03-01 2020-03-02 2020-03-03 2020-03-04 
##          0          0          0          0          0          0          0 
## 2020-03-05 2020-03-06 2020-03-07 2020-03-08 2020-03-09 2020-03-10 2020-03-11 
##          0          0          0          0          0        892        322 
## 2020-03-12 2020-03-13 2020-03-14 2020-03-15 2020-03-16 2020-03-17 2020-03-18 
##        382        516        546        773       1134       1788       1362 
## 2020-03-19 2020-03-20 2020-03-21 2020-03-22 2020-03-23 2020-03-24 2020-03-25 
##       5893       5417       6410       8330       9821      10073      12042 
## 2020-03-26 2020-03-27 2020-03-28 2020-03-29 2020-03-30 2020-03-31 2020-04-01 
##      18058      17821      19733      19444      20922      26339      25196 
## 2020-04-02 2020-04-03 2020-04-04 2020-04-05 2020-04-06 2020-04-07 2020-04-08 
##      30227      31985      33267      28219      29595      29556      32826 
## 2020-04-09 2020-04-10 2020-04-11 2020-04-12 2020-04-13 2020-04-14 2020-04-15 
##      32385      35098      29861      28917      25306      27051      28678 
## 2020-04-16 2020-04-17 2020-04-18 2020-04-19 2020-04-20 2020-04-21 2020-04-22 
##      31451      31905      32490      26889      25240      27539      28355 
## 2020-04-23 2020-04-24 2020-04-25 2020-04-26 
##      28950      36163      32821      27629
train_ts%>%
  autoplot()+
  labs(title="US daily confirmed cases time series")

########Model
acf(train_ts)

#The series is dampening down, all lags are outside the boundaries and is not stationary
pacf(train_ts)

#lag1, 2, 3, and 16 are outside the boundaries

unitroot_ndiffs(train_ts)
## ndiffs 
##      1
#1 difference is required

#Differencing to eliminate trend or seasonality
#first order difference, take 1 day back
diff1<-diff(train_ts,differences=1)
diff1
## 2020-01-24 2020-01-25 2020-01-26 2020-01-27 2020-01-28 2020-01-29 2020-01-30 
##          0          0          0          0          0          0          0 
## 2020-01-31 2020-02-01 2020-02-02 2020-02-03 2020-02-04 2020-02-05 2020-02-06 
##          0          0          0          0          0          0          0 
## 2020-02-07 2020-02-08 2020-02-09 2020-02-10 2020-02-11 2020-02-12 2020-02-13 
##          0          0          0          0          0          0          0 
## 2020-02-14 2020-02-15 2020-02-16 2020-02-17 2020-02-18 2020-02-19 2020-02-20 
##          0          0          0          0          0          0          0 
## 2020-02-21 2020-02-22 2020-02-23 2020-02-24 2020-02-25 2020-02-26 2020-02-27 
##          0          0          0          0          0          0          0 
## 2020-02-28 2020-02-29 2020-03-01 2020-03-02 2020-03-03 2020-03-04 2020-03-05 
##          0          0          0          0          0          0          0 
## 2020-03-06 2020-03-07 2020-03-08 2020-03-09 2020-03-10 2020-03-11 2020-03-12 
##          0          0          0          0        892       -570         60 
## 2020-03-13 2020-03-14 2020-03-15 2020-03-16 2020-03-17 2020-03-18 2020-03-19 
##        134         30        227        361        654       -426       4531 
## 2020-03-20 2020-03-21 2020-03-22 2020-03-23 2020-03-24 2020-03-25 2020-03-26 
##       -476        993       1920       1491        252       1969       6016 
## 2020-03-27 2020-03-28 2020-03-29 2020-03-30 2020-03-31 2020-04-01 2020-04-02 
##       -237       1912       -289       1478       5417      -1143       5031 
## 2020-04-03 2020-04-04 2020-04-05 2020-04-06 2020-04-07 2020-04-08 2020-04-09 
##       1758       1282      -5048       1376        -39       3270       -441 
## 2020-04-10 2020-04-11 2020-04-12 2020-04-13 2020-04-14 2020-04-15 2020-04-16 
##       2713      -5237       -944      -3611       1745       1627       2773 
## 2020-04-17 2020-04-18 2020-04-19 2020-04-20 2020-04-21 2020-04-22 2020-04-23 
##        454        585      -5601      -1649       2299        816        595 
## 2020-04-24 2020-04-25 2020-04-26 
##       7213      -3342      -5192
autoplot(diff1)+
  labs(title="Differencing US daily Confirmed Cases")

#much more stationary than before
acf(diff1)

#acf drops to zero related
pacf(diff1)

#lag 6 and 7 are strong

#ets(A,N,N)
ets<-ets(train_ts,)
ets
## ETS(A,N,N) 
## 
## Call:
##  ets(y = train_ts) 
## 
##   Smoothing parameters:
##     alpha = 0.9981 
## 
##   Initial states:
##     l = 0.0518 
## 
##   sigma:  1988.822
## 
##      AIC     AICc      BIC 
## 1879.704 1879.967 1887.365
#alpha=0.9981,  l = 0.0518 
#AIC=1879.704 AICc=1879.967 BIC=1887.365
myfc1<-ets%>%forecast(h=45)
myfc1
##       Point Forecast    Lo 80    Hi 80     Lo 95    Hi 95
## 18379       27638.72 25089.94 30187.49 23740.697 31536.74
## 18380       27638.72 24037.57 31239.87 22131.234 33146.20
## 18381       27638.72 23229.60 32047.83 20895.559 34381.87
## 18382       27638.72 22548.30 32729.13 19853.604 35423.83
## 18383       27638.72 21948.00 33329.44 18935.511 36341.92
## 18384       27638.72 21405.23 33872.20 18105.428 37172.00
## 18385       27638.72 20906.09 34371.35 17342.047 37935.39
## 18386       27638.72 20441.47 34835.96 16631.482 38645.95
## 18387       27638.72 20005.09 35272.35 15964.085 39313.35
## 18388       27638.72 19592.33 35685.10 15332.831 39944.60
## 18389       27638.72 19199.74 36077.69 14732.415 40545.02
## 18390       27638.72 18824.62 36452.82 14158.715 41118.72
## 18391       27638.72 18464.82 36812.61 13608.455 41668.98
## 18392       27638.72 18118.61 37158.82 13078.976 42198.46
## 18393       27638.72 17784.56 37492.87 12568.088 42709.35
## 18394       27638.72 17461.47 37815.96 12073.959 43203.47
## 18395       27638.72 17148.32 38129.11 11595.043 43682.39
## 18396       27638.72 16844.26 38433.18 11130.013 44147.42
## 18397       27638.72 16548.52 38728.91 10677.729 44599.70
## 18398       27638.72 16260.47 39016.96 10237.197 45040.24
## 18399       27638.72 15979.54 39297.89  9807.544 45469.89
## 18400       27638.72 15705.22 39572.22  9388.004 45889.43
## 18401       27638.72 15437.06 39840.37  8977.894 46299.54
## 18402       27638.72 15174.67 40102.76  8576.604 46700.83
## 18403       27638.72 14917.69 40359.74  8183.591 47093.84
## 18404       27638.72 14665.80 40611.63  7798.360 47479.07
## 18405       27638.72 14418.71 40858.72  7420.469 47856.96
## 18406       27638.72 14176.16 41101.27  7049.512 48227.92
## 18407       27638.72 13937.90 41339.54  6685.121 48592.31
## 18408       27638.72 13703.71 41573.72  6326.960 48950.47
## 18409       27638.72 13473.39 41804.04  5974.720 49302.71
## 18410       27638.72 13246.76 42030.67  5628.115 49649.32
## 18411       27638.72 13023.64 42253.79  5286.885 49990.55
## 18412       27638.72 12803.88 42473.56  4950.786 50326.65
## 18413       27638.72 12587.32 42690.11  4619.594 50657.84
## 18414       27638.72 12373.84 42903.59  4293.100 50984.33
## 18415       27638.72 12163.30 43114.13  3971.110 51306.32
## 18416       27638.72 11955.59 43321.84  3653.442 51623.99
## 18417       27638.72 11750.59 43526.84  3339.926 51937.51
## 18418       27638.72 11548.21 43729.23  3030.404 52247.03
## 18419       27638.72 11348.33 43929.10  2724.728 52552.70
## 18420       27638.72 11150.89 44126.55  2422.756 52854.68
## 18421       27638.72 10955.77 44321.66  2124.359 53153.07
## 18422       27638.72 10762.92 44514.51  1829.411 53448.02
## 18423       27638.72 10572.24 44705.19  1537.796 53739.64
autoplot(myfc1)

myfc1%>%accuracy(test_ts)
##                      ME     RMSE      MAE       MPE    MAPE      MASE
## Training set   291.4781 1967.776 1011.610      -Inf     Inf 0.9893086
## Test set     -4646.1608 5928.830 5216.313 -23.13163 24.9378 5.1013160
##                     ACF1
## Training set -0.02691076
## Test set              NA
#                     ME     RMSE      MAE       MPE    MAPE      MASE        ACF1
#Training set   291.4781 1967.776 1011.610      -Inf     Inf 0.9893086 -0.02691076
#Test set     -4646.1608 5928.830 5216.313 -23.13163 24.9378 5.1013160          NA

#ARIMA (0,0,0)
arimadiff1<-auto.arima(diff1)
arimadiff1
## Series: diff1 
## ARIMA(0,0,0) with non-zero mean 
## 
## Coefficients:
##           mean
##       293.9255
## s.e.  201.7730
## 
## sigma^2 estimated as 3868105:  log likelihood=-845.79
## AIC=1695.57   AICc=1695.7   BIC=1700.66
#s.e.=201.7730
#AIC=1695.57   AICc=1695.7   BIC=1700.66
myfc2<-arimadiff1%>%forecast(h=45)
autoplot(myfc2)

difftest<-diff(test_ts)
myfc2%>%accuracy(difftest)
##                         ME     RMSE      MAE      MPE     MAPE      MASE
## Training set  3.802160e-14 1956.260 1132.660     -Inf      Inf 0.7492735
## Test set     -3.248574e+02 3302.361 2598.691 133.9599 134.2727 1.7190777
##                     ACF1
## Training set -0.02901554
## Test set              NA
#                       ME     RMSE      MAE      MPE     MAPE      MASE        ACF1
#Training set  3.802160e-14 1956.260 1132.660     -Inf      Inf 0.7492735 -0.02901554
#Test set     -3.248574e+02 3302.361 2598.691 133.9599 134.2727 1.7190777          NA
################################################
#US fatalities, date from 01-23-2020 to 04-26-2020
US_fat<- US %>%
  filter(Target=='Fatalities')
#fatalities testing
US_fat_test<-US_fat %>%
  filter(Date>'2020-04-26')
US_fat_test1<-US_fat_test%>%
  group_by(Date) %>%
  select(-County, -Province_State) %>%
  pivot_wider(names_from = Target, values_from = TargetValue, values_fill = 0)
head(US_fat_test1)
## # A tibble: 6 × 6
## # Groups:   Date [6]
##       Id Country_Region Population Weight Date       Fatalities
##    <dbl> <chr>               <dbl>  <dbl> <date>          <dbl>
## 1 963112 US              324141489  0.510 2020-04-27       1378
## 2 963114 US              324141489  0.510 2020-04-28       2096
## 3 963116 US              324141489  0.510 2020-04-29       2612
## 4 963118 US              324141489  0.510 2020-04-30       2029
## 5 963120 US              324141489  0.510 2020-05-01       1947
## 6 963122 US              324141489  0.510 2020-05-02       1426
test_ts2 <- zoo(US_fat_test1$Fatalities,seq(from=as.Date("2020-04-27"), to=as.Date("2020-06-10"), by = "day"))
test_ts2
## 2020-04-27 2020-04-28 2020-04-29 2020-04-30 2020-05-01 2020-05-02 2020-05-03 
##       1378       2096       2612       2029       1947       1426       1313 
## 2020-05-04 2020-05-05 2020-05-06 2020-05-07 2020-05-08 2020-05-09 2020-05-10 
##       1240       2142       2367       2231       1518       1615        731 
## 2020-05-11 2020-05-12 2020-05-13 2020-05-14 2020-05-15 2020-05-16 2020-05-17 
##       1156       1694       1743       1779       1632       1224        808 
## 2020-05-18 2020-05-19 2020-05-20 2020-05-21 2020-05-22 2020-05-23 2020-05-24 
##        785       1574       1518       1263       1277       1108        633 
## 2020-05-25 2020-05-26 2020-05-27 2020-05-28 2020-05-29 2020-05-30 2020-05-31 
##        500        693       1505       1198       1193        967        605 
## 2020-06-01 2020-06-02 2020-06-03 2020-06-04 2020-06-05 2020-06-06 2020-06-07 
##        768       1031        981       1036       1162        709        443 
## 2020-06-08 2020-06-09 2020-06-10 
##        500        978        935
#fatalities training
US_fat_train<- US_fat %>%
  filter(Date<='2020-04-26')
head(US_fat_train)
## # A tibble: 6 × 9
##       Id County Province_State Country_Region Population Weight Date      
##    <dbl> <chr>  <chr>          <chr>               <dbl>  <dbl> <date>    
## 1 962922 <NA>   <NA>           US              324141489  0.510 2020-01-23
## 2 962924 <NA>   <NA>           US              324141489  0.510 2020-01-24
## 3 962926 <NA>   <NA>           US              324141489  0.510 2020-01-25
## 4 962928 <NA>   <NA>           US              324141489  0.510 2020-01-26
## 5 962930 <NA>   <NA>           US              324141489  0.510 2020-01-27
## 6 962932 <NA>   <NA>           US              324141489  0.510 2020-01-28
## # … with 2 more variables: Target <chr>, TargetValue <dbl>
US_fat_train1<-US_fat_train%>%
  group_by(Date) %>%
  select(-County, -Province_State) %>%
  pivot_wider(names_from = Target, values_from = TargetValue, values_fill = 0)
head(US_fat_train1)
## # A tibble: 6 × 6
## # Groups:   Date [6]
##       Id Country_Region Population Weight Date       Fatalities
##    <dbl> <chr>               <dbl>  <dbl> <date>          <dbl>
## 1 962922 US              324141489  0.510 2020-01-23          0
## 2 962924 US              324141489  0.510 2020-01-24          0
## 3 962926 US              324141489  0.510 2020-01-25          0
## 4 962928 US              324141489  0.510 2020-01-26          0
## 5 962930 US              324141489  0.510 2020-01-27          0
## 6 962932 US              324141489  0.510 2020-01-28          0
train_ts1 <- zoo(US_fat_train$TargetValue,seq(from=as.Date("2020-01-23"), to=as.Date("2020-04-26"), by = "day"))
train_ts1
## 2020-01-23 2020-01-24 2020-01-25 2020-01-26 2020-01-27 2020-01-28 2020-01-29 
##          0          0          0          0          0          0          0 
## 2020-01-30 2020-01-31 2020-02-01 2020-02-02 2020-02-03 2020-02-04 2020-02-05 
##          0          0          0          0          0          0          0 
## 2020-02-06 2020-02-07 2020-02-08 2020-02-09 2020-02-10 2020-02-11 2020-02-12 
##          0          0          0          0          0          0          0 
## 2020-02-13 2020-02-14 2020-02-15 2020-02-16 2020-02-17 2020-02-18 2020-02-19 
##          0          0          0          0          0          0          0 
## 2020-02-20 2020-02-21 2020-02-22 2020-02-23 2020-02-24 2020-02-25 2020-02-26 
##          0          0          0          0          0          0          0 
## 2020-02-27 2020-02-28 2020-02-29 2020-03-01 2020-03-02 2020-03-03 2020-03-04 
##          0          0          0          0          0          0          0 
## 2020-03-05 2020-03-06 2020-03-07 2020-03-08 2020-03-09 2020-03-10 2020-03-11 
##          0          0          0          0          0         28          8 
## 2020-03-12 2020-03-13 2020-03-14 2020-03-15 2020-03-16 2020-03-17 2020-03-18 
##          4          7          7          9         22         23         10 
## 2020-03-19 2020-03-20 2020-03-21 2020-03-22 2020-03-23 2020-03-24 2020-03-25 
##         82         44         63        119        125        154        236 
## 2020-03-26 2020-03-27 2020-03-28 2020-03-29 2020-03-30 2020-03-31 2020-04-01 
##        267        370        445        441        511        895        883 
## 2020-04-02 2020-04-03 2020-04-04 2020-04-05 2020-04-06 2020-04-07 2020-04-08 
##       1169       1161       1320       1212       1164       1938       1973 
## 2020-04-09 2020-04-10 2020-04-11 2020-04-12 2020-04-13 2020-04-14 2020-04-15 
##       1783       2108       1876       1560       1509       2303       2494 
## 2020-04-16 2020-04-17 2020-04-18 2020-04-19 2020-04-20 2020-04-21 2020-04-22 
##       4591       3857       1891       1997       1433       2350       2178 
## 2020-04-23 2020-04-24 2020-04-25 2020-04-26 
##       3329       1995       1806       1126
train_ts1%>%
  autoplot()+
  labs(title="US daily fatalities time series")

adf.test(train_ts1)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  train_ts1
## Dickey-Fuller = -1.722, Lag order = 4, p-value = 0.6912
## alternative hypothesis: stationary
#p-value is greater than 0.05, the data is not stationary

unitroot_ndiffs(train_ts1)
## ndiffs 
##      1
#1 difference is required

acf(train_ts1)

#acf is damping down slowly
pacf(train_ts1)

#lag1, 5, and 8 are outside the boundaries

#Differencing to eliminate trend or seasonality
#first order difference, take 1 day back
diff2<-diff(train_ts1,differences=1)
diff2
## 2020-01-24 2020-01-25 2020-01-26 2020-01-27 2020-01-28 2020-01-29 2020-01-30 
##          0          0          0          0          0          0          0 
## 2020-01-31 2020-02-01 2020-02-02 2020-02-03 2020-02-04 2020-02-05 2020-02-06 
##          0          0          0          0          0          0          0 
## 2020-02-07 2020-02-08 2020-02-09 2020-02-10 2020-02-11 2020-02-12 2020-02-13 
##          0          0          0          0          0          0          0 
## 2020-02-14 2020-02-15 2020-02-16 2020-02-17 2020-02-18 2020-02-19 2020-02-20 
##          0          0          0          0          0          0          0 
## 2020-02-21 2020-02-22 2020-02-23 2020-02-24 2020-02-25 2020-02-26 2020-02-27 
##          0          0          0          0          0          0          0 
## 2020-02-28 2020-02-29 2020-03-01 2020-03-02 2020-03-03 2020-03-04 2020-03-05 
##          0          0          0          0          0          0          0 
## 2020-03-06 2020-03-07 2020-03-08 2020-03-09 2020-03-10 2020-03-11 2020-03-12 
##          0          0          0          0         28        -20         -4 
## 2020-03-13 2020-03-14 2020-03-15 2020-03-16 2020-03-17 2020-03-18 2020-03-19 
##          3          0          2         13          1        -13         72 
## 2020-03-20 2020-03-21 2020-03-22 2020-03-23 2020-03-24 2020-03-25 2020-03-26 
##        -38         19         56          6         29         82         31 
## 2020-03-27 2020-03-28 2020-03-29 2020-03-30 2020-03-31 2020-04-01 2020-04-02 
##        103         75         -4         70        384        -12        286 
## 2020-04-03 2020-04-04 2020-04-05 2020-04-06 2020-04-07 2020-04-08 2020-04-09 
##         -8        159       -108        -48        774         35       -190 
## 2020-04-10 2020-04-11 2020-04-12 2020-04-13 2020-04-14 2020-04-15 2020-04-16 
##        325       -232       -316        -51        794        191       2097 
## 2020-04-17 2020-04-18 2020-04-19 2020-04-20 2020-04-21 2020-04-22 2020-04-23 
##       -734      -1966        106       -564        917       -172       1151 
## 2020-04-24 2020-04-25 2020-04-26 
##      -1334       -189       -680
autoplot(diff2)+
  labs(title="Differencing Daily Fatalities in the US")

#the graph is much more stationary than before

acf(diff2)

#acf drops to zero related
pacf(diff2)

#lag 3 and 7 are strong lags


#ETS(A,N,N)
ets1<-ets(train_ts1)
ets1
## ETS(A,N,N) 
## 
## Call:
##  ets(y = train_ts1) 
## 
##   Smoothing parameters:
##     alpha = 0.848 
## 
##   Initial states:
##     l = -0.113 
## 
##   sigma:  404.3635
## 
##      AIC     AICc      BIC 
## 1577.037 1577.300 1584.698
#alpha=0.848, l=-0.113
#AIC=1577.037 AICc=1577.300 BIC=1584.698
myfc3<-ets1%>% forecast(h=45)
autoplot(myfc3)

myfc3%>%accuracy(test_ts2)
##                    ME     RMSE      MAE       MPE     MAPE      MASE       ACF1
## Training set 15.36559 400.0844 153.2029       Inf      Inf 0.9937258 0.01210657
## Test set     52.06590 529.9220 422.0617 -15.72695 40.06742 2.7376348         NA
#                   ME     RMSE      MAE       MPE     MAPE      MASE       ACF1
#Training set 15.36559 400.0844 153.2029       Inf      Inf 0.9937258 0.01210657
#Test set     52.06590 529.9220 422.0617 -15.72695 40.06742 2.7376348         NA

#ARIMA (1,0,1)
arimadiff2<-auto.arima(diff2)
arimadiff2
## Series: diff2 
## ARIMA(1,0,1) with zero mean 
## 
## Coefficients:
##          ar1      ma1
##       0.5171  -0.7749
## s.e.  0.1518   0.0998
## 
## sigma^2 estimated as 155013:  log likelihood=-694.18
## AIC=1394.36   AICc=1394.63   BIC=1401.99
#AIC=1394.36   AICc=1394.63   BIC=1401.99
myfc4<-arimadiff2 %>% forecast(h=45)
autoplot(myfc4)

difftest2<-diff(test_ts2)
myfc4%>%accuracy(difftest2)
##                     ME     RMSE      MAE      MPE     MAPE      MASE       ACF1
## Training set  39.40202 389.5062 152.8411      Inf      Inf 0.5913724 0.02947579
## Test set     -23.71178 389.0037 296.9636 100.2996 100.2996 1.1490104         NA
#                   ME     RMSE      MAE      MPE     MAPE      MASE       ACF1
#Training set  39.40202 389.5062 152.8411      Inf      Inf 0.5913724 0.02947579
#Test set     -23.71178 389.0037 296.9636 100.2996 100.2996 1.1490104         NA