# Setup: load packages and the training data
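# The chunks below assume the following packages are attached; this is a likely set inferred
# from the functions used (read_csv, the dplyr/tidyr verbs, ggplot, zoo, ets, auto.arima,
# forecast, accuracy, adf.test, unitroot_ndiffs), not something stated explicitly in the source.
library(tidyverse)  # readr, dplyr, tidyr, ggplot2
library(zoo)        # zoo() daily time series objects
library(forecast)   # ets(), auto.arima(), forecast(), accuracy(), autoplot()
library(tseries)    # adf.test()
library(feasts)     # unitroot_ndiffs()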
train<-read_csv("train.csv")
## Rows: 969640 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): County, Province_State, Country_Region, Target
## dbl (4): Id, Population, Weight, TargetValue
## date (1): Date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(train)
## spec_tbl_df [969,640 × 9] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Id : num [1:969640] 1 2 3 4 5 6 7 8 9 10 ...
## $ County : chr [1:969640] NA NA NA NA ...
## $ Province_State: chr [1:969640] NA NA NA NA ...
## $ Country_Region: chr [1:969640] "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ Population : num [1:969640] 27657145 27657145 27657145 27657145 27657145 ...
## $ Weight : num [1:969640] 0.0584 0.5836 0.0584 0.5836 0.0584 ...
## $ Date : Date[1:969640], format: "2020-01-23" "2020-01-23" ...
## $ Target : chr [1:969640] "ConfirmedCases" "Fatalities" "ConfirmedCases" "Fatalities" ...
## $ TargetValue : num [1:969640] 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, "spec")=
## .. cols(
## .. Id = col_double(),
## .. County = col_character(),
## .. Province_State = col_character(),
## .. Country_Region = col_character(),
## .. Population = col_double(),
## .. Weight = col_double(),
## .. Date = col_date(format = ""),
## .. Target = col_character(),
## .. TargetValue = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
### Describe the data
# Top 10 countries with the highest confirmed cases and fatalities
con <- train %>%
  filter(Target == 'ConfirmedCases') %>%
  group_by(Country_Region) %>%
  summarise(TargetValue = sum(TargetValue)) %>%
  arrange(desc(TargetValue))
con[1:10,]
## # A tibble: 10 × 2
## Country_Region TargetValue
## <chr> <dbl>
## 1 US 5979962
## 2 Brazil 772416
## 3 Russia 493023
## 4 United Kingdom 291588
## 5 India 276583
## 6 Spain 242280
## 7 Italy 235763
## 8 Peru 208823
## 9 Canada 197414
## 10 France 192068
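# A quick visualization (a sketch, assuming ggplot2 is attached): plot the top 10 countries
# by total confirmed cases as a horizontal bar chart.
con[1:10, ] %>%
  ggplot(aes(x = reorder(Country_Region, TargetValue), y = TargetValue)) +
  geom_col() +
  coord_flip() +
  labs(title = "Top 10 countries by confirmed cases", x = NULL, y = "Total confirmed cases")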
fat <- train %>%
  filter(Target == 'Fatalities') %>%
  group_by(Country_Region) %>%
  summarise(TargetValue = sum(TargetValue)) %>%
  arrange(desc(TargetValue))
fat[1:10,]
## # A tibble: 10 × 2
## Country_Region TargetValue
## <chr> <dbl>
## 1 US 337252
## 2 United Kingdom 41213
## 3 Brazil 39680
## 4 Italy 34114
## 5 France 29322
## 6 Spain 27136
## 7 Canada 16074
## 8 Mexico 15357
## 9 Belgium 9629
## 10 China 9242
# United States data
US <- train %>%
  filter(Country_Region == 'US')
colSums(is.na(US))
## Id County Province_State Country_Region Population
## 0 15400 280 0 0
## Weight Date Target TargetValue
## 0 0 0 0
# Keep only the country-level rows, i.e. those where Province_State is NA
US <- US %>%
  filter(is.na(Province_State))
ggplot(data = US,
       mapping = aes(x = Date,
                     y = TargetValue,
                     fill = Target,
                     color = Target)) +
  geom_line() +
  scale_color_manual(values = c("black", "red")) +
  theme(legend.position = c(0.175, 0.835)) +
  labs(title = "US Confirmed Cases vs Fatalities")

# US confirmed cases
US_con <- US %>%
  filter(Target == 'ConfirmedCases')
######## Reorganize data: training window from 2020-01-23 to 2020-04-26
US_con_train <- US_con %>%
  filter(Date <= '2020-04-26')
head(US_con_train)
## # A tibble: 6 × 9
## Id County Province_State Country_Region Population Weight Date
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <date>
## 1 962921 <NA> <NA> US 324141489 0.0510 2020-01-23
## 2 962923 <NA> <NA> US 324141489 0.0510 2020-01-24
## 3 962925 <NA> <NA> US 324141489 0.0510 2020-01-25
## 4 962927 <NA> <NA> US 324141489 0.0510 2020-01-26
## 5 962929 <NA> <NA> US 324141489 0.0510 2020-01-27
## 6 962931 <NA> <NA> US 324141489 0.0510 2020-01-28
## # … with 2 more variables: Target <chr>, TargetValue <dbl>
# Test window: 2020-04-27 to 2020-06-10
US_con_test <- US_con %>%
  filter(Date > '2020-04-26')
US_con_test1 <- US_con_test %>%
  group_by(Date) %>%
  select(-County, -Province_State) %>%
  pivot_wider(names_from = Target, values_from = TargetValue, values_fill = 0)
head(US_con_test1)
## # A tibble: 6 × 6
## # Groups: Date [6]
## Id Country_Region Population Weight Date ConfirmedCases
## <dbl> <chr> <dbl> <dbl> <date> <dbl>
## 1 963111 US 324141489 0.0510 2020-04-27 22414
## 2 963113 US 324141489 0.0510 2020-04-28 24385
## 3 963115 US 324141489 0.0510 2020-04-29 27327
## 4 963117 US 324141489 0.0510 2020-04-30 29515
## 5 963119 US 324141489 0.0510 2020-05-01 34037
## 6 963121 US 324141489 0.0510 2020-05-02 29078
test_ts <- zoo(US_con_test1$ConfirmedCases,seq(from=as.Date("2020-04-27"), to=as.Date("2020-06-10"), by = "day"))
test_ts
## 2020-04-27 2020-04-28 2020-04-29 2020-04-30 2020-05-01 2020-05-02 2020-05-03
## 22414 24385 27327 29515 34037 29078 25501
## 2020-05-04 2020-05-05 2020-05-06 2020-05-07 2020-05-08 2020-05-09 2020-05-10
## 22335 23976 24251 28420 26906 25620 19710
## 2020-05-11 2020-05-12 2020-05-13 2020-05-14 2020-05-15 2020-05-16 2020-05-17
## 18618 21693 20832 27368 25050 24994 18937
## 2020-05-18 2020-05-19 2020-05-20 2020-05-21 2020-05-22 2020-05-23 2020-05-24
## 21551 20260 23285 25293 23790 21675 20634
## 2020-05-25 2020-05-26 2020-05-27 2020-05-28 2020-05-29 2020-05-30 2020-05-31
## 19056 18611 18263 22577 24266 24146 20007
## 2020-06-01 2020-06-02 2020-06-03 2020-06-04 2020-06-05 2020-06-06 2020-06-07
## 20848 20800 19698 21138 29972 23133 18114
## 2020-06-08 2020-06-09 2020-06-10
## 17547 17981 21053
US_con_train1 <- US_con_train %>%
  group_by(Date) %>%
  select(-County, -Province_State) %>%
  pivot_wider(names_from = Target, values_from = TargetValue, values_fill = 0)
head(US_con_train1)
## # A tibble: 6 × 6
## # Groups: Date [6]
## Id Country_Region Population Weight Date ConfirmedCases
## <dbl> <chr> <dbl> <dbl> <date> <dbl>
## 1 962921 US 324141489 0.0510 2020-01-23 0
## 2 962923 US 324141489 0.0510 2020-01-24 0
## 3 962925 US 324141489 0.0510 2020-01-25 0
## 4 962927 US 324141489 0.0510 2020-01-26 0
## 5 962929 US 324141489 0.0510 2020-01-27 0
## 6 962931 US 324141489 0.0510 2020-01-28 0
adf.test(US_con_train1$ConfirmedCases)
##
## Augmented Dickey-Fuller Test
##
## data: US_con_train1$ConfirmedCases
## Dickey-Fuller = -1.5933, Lag order = 4, p-value = 0.7444
## alternative hypothesis: stationary
# p-value = 0.7444 > 0.05, so we cannot reject the unit-root null hypothesis: the series is not stationary
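# A complementary check (a sketch, assuming the tseries package is attached): the KPSS test
# has the opposite null hypothesis (stationarity), so a small p-value here would also point
# to a non-stationary series.
kpss.test(US_con_train1$ConfirmedCases)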
# Convert the US training data to a daily time series
train_ts <- zoo(US_con_train1$ConfirmedCases, seq(from = as.Date("2020-01-23"), to = as.Date("2020-04-26"), by = "day"))
train_ts
## 2020-01-23 2020-01-24 2020-01-25 2020-01-26 2020-01-27 2020-01-28 2020-01-29
## 0 0 0 0 0 0 0
## 2020-01-30 2020-01-31 2020-02-01 2020-02-02 2020-02-03 2020-02-04 2020-02-05
## 0 0 0 0 0 0 0
## 2020-02-06 2020-02-07 2020-02-08 2020-02-09 2020-02-10 2020-02-11 2020-02-12
## 0 0 0 0 0 0 0
## 2020-02-13 2020-02-14 2020-02-15 2020-02-16 2020-02-17 2020-02-18 2020-02-19
## 0 0 0 0 0 0 0
## 2020-02-20 2020-02-21 2020-02-22 2020-02-23 2020-02-24 2020-02-25 2020-02-26
## 0 0 0 0 0 0 0
## 2020-02-27 2020-02-28 2020-02-29 2020-03-01 2020-03-02 2020-03-03 2020-03-04
## 0 0 0 0 0 0 0
## 2020-03-05 2020-03-06 2020-03-07 2020-03-08 2020-03-09 2020-03-10 2020-03-11
## 0 0 0 0 0 892 322
## 2020-03-12 2020-03-13 2020-03-14 2020-03-15 2020-03-16 2020-03-17 2020-03-18
## 382 516 546 773 1134 1788 1362
## 2020-03-19 2020-03-20 2020-03-21 2020-03-22 2020-03-23 2020-03-24 2020-03-25
## 5893 5417 6410 8330 9821 10073 12042
## 2020-03-26 2020-03-27 2020-03-28 2020-03-29 2020-03-30 2020-03-31 2020-04-01
## 18058 17821 19733 19444 20922 26339 25196
## 2020-04-02 2020-04-03 2020-04-04 2020-04-05 2020-04-06 2020-04-07 2020-04-08
## 30227 31985 33267 28219 29595 29556 32826
## 2020-04-09 2020-04-10 2020-04-11 2020-04-12 2020-04-13 2020-04-14 2020-04-15
## 32385 35098 29861 28917 25306 27051 28678
## 2020-04-16 2020-04-17 2020-04-18 2020-04-19 2020-04-20 2020-04-21 2020-04-22
## 31451 31905 32490 26889 25240 27539 28355
## 2020-04-23 2020-04-24 2020-04-25 2020-04-26
## 28950 36163 32821 27629
train_ts %>%
  autoplot() +
  labs(title = "US daily confirmed cases time series")

######## Model
acf(train_ts)

# The ACF decays slowly and most lags exceed the significance bounds, which indicates the series is not stationary
pacf(train_ts)

# Lags 1, 2, 3, and 16 are outside the significance bounds
unitroot_ndiffs(train_ts)
## ndiffs
## 1
# One difference is required
# Difference the series to remove the trend: take the first-order (day-over-day) difference
diff1 <- diff(train_ts, differences = 1)
diff1
## 2020-01-24 2020-01-25 2020-01-26 2020-01-27 2020-01-28 2020-01-29 2020-01-30
## 0 0 0 0 0 0 0
## 2020-01-31 2020-02-01 2020-02-02 2020-02-03 2020-02-04 2020-02-05 2020-02-06
## 0 0 0 0 0 0 0
## 2020-02-07 2020-02-08 2020-02-09 2020-02-10 2020-02-11 2020-02-12 2020-02-13
## 0 0 0 0 0 0 0
## 2020-02-14 2020-02-15 2020-02-16 2020-02-17 2020-02-18 2020-02-19 2020-02-20
## 0 0 0 0 0 0 0
## 2020-02-21 2020-02-22 2020-02-23 2020-02-24 2020-02-25 2020-02-26 2020-02-27
## 0 0 0 0 0 0 0
## 2020-02-28 2020-02-29 2020-03-01 2020-03-02 2020-03-03 2020-03-04 2020-03-05
## 0 0 0 0 0 0 0
## 2020-03-06 2020-03-07 2020-03-08 2020-03-09 2020-03-10 2020-03-11 2020-03-12
## 0 0 0 0 892 -570 60
## 2020-03-13 2020-03-14 2020-03-15 2020-03-16 2020-03-17 2020-03-18 2020-03-19
## 134 30 227 361 654 -426 4531
## 2020-03-20 2020-03-21 2020-03-22 2020-03-23 2020-03-24 2020-03-25 2020-03-26
## -476 993 1920 1491 252 1969 6016
## 2020-03-27 2020-03-28 2020-03-29 2020-03-30 2020-03-31 2020-04-01 2020-04-02
## -237 1912 -289 1478 5417 -1143 5031
## 2020-04-03 2020-04-04 2020-04-05 2020-04-06 2020-04-07 2020-04-08 2020-04-09
## 1758 1282 -5048 1376 -39 3270 -441
## 2020-04-10 2020-04-11 2020-04-12 2020-04-13 2020-04-14 2020-04-15 2020-04-16
## 2713 -5237 -944 -3611 1745 1627 2773
## 2020-04-17 2020-04-18 2020-04-19 2020-04-20 2020-04-21 2020-04-22 2020-04-23
## 454 585 -5601 -1649 2299 816 595
## 2020-04-24 2020-04-25 2020-04-26
## 7213 -3342 -5192
autoplot(diff1) +
  labs(title = "Differenced US daily confirmed cases")

# The differenced series looks much closer to stationary than before
acf(diff1)

# The ACF now drops to zero relatively quickly
pacf(diff1)

# Lags 6 and 7 show strong partial autocorrelation
# Exponential smoothing: ets() selects an ETS(A,N,N) model
ets <- ets(train_ts)
ets
## ETS(A,N,N)
##
## Call:
## ets(y = train_ts)
##
## Smoothing parameters:
## alpha = 0.9981
##
## Initial states:
## l = 0.0518
##
## sigma: 1988.822
##
## AIC AICc BIC
## 1879.704 1879.967 1887.365
# alpha = 0.9981, initial level l = 0.0518
# AIC = 1879.704, AICc = 1879.967, BIC = 1887.365
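# Residual diagnostics (a sketch, assuming the forecast package is attached): checkresiduals()
# plots the residuals with their ACF and runs a Ljung-Box test on the fitted ETS model.
checkresiduals(ets)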
myfc1 <- ets %>% forecast(h = 45)
myfc1
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 18379 27638.72 25089.94 30187.49 23740.697 31536.74
## 18380 27638.72 24037.57 31239.87 22131.234 33146.20
## 18381 27638.72 23229.60 32047.83 20895.559 34381.87
## 18382 27638.72 22548.30 32729.13 19853.604 35423.83
## 18383 27638.72 21948.00 33329.44 18935.511 36341.92
## 18384 27638.72 21405.23 33872.20 18105.428 37172.00
## 18385 27638.72 20906.09 34371.35 17342.047 37935.39
## 18386 27638.72 20441.47 34835.96 16631.482 38645.95
## 18387 27638.72 20005.09 35272.35 15964.085 39313.35
## 18388 27638.72 19592.33 35685.10 15332.831 39944.60
## 18389 27638.72 19199.74 36077.69 14732.415 40545.02
## 18390 27638.72 18824.62 36452.82 14158.715 41118.72
## 18391 27638.72 18464.82 36812.61 13608.455 41668.98
## 18392 27638.72 18118.61 37158.82 13078.976 42198.46
## 18393 27638.72 17784.56 37492.87 12568.088 42709.35
## 18394 27638.72 17461.47 37815.96 12073.959 43203.47
## 18395 27638.72 17148.32 38129.11 11595.043 43682.39
## 18396 27638.72 16844.26 38433.18 11130.013 44147.42
## 18397 27638.72 16548.52 38728.91 10677.729 44599.70
## 18398 27638.72 16260.47 39016.96 10237.197 45040.24
## 18399 27638.72 15979.54 39297.89 9807.544 45469.89
## 18400 27638.72 15705.22 39572.22 9388.004 45889.43
## 18401 27638.72 15437.06 39840.37 8977.894 46299.54
## 18402 27638.72 15174.67 40102.76 8576.604 46700.83
## 18403 27638.72 14917.69 40359.74 8183.591 47093.84
## 18404 27638.72 14665.80 40611.63 7798.360 47479.07
## 18405 27638.72 14418.71 40858.72 7420.469 47856.96
## 18406 27638.72 14176.16 41101.27 7049.512 48227.92
## 18407 27638.72 13937.90 41339.54 6685.121 48592.31
## 18408 27638.72 13703.71 41573.72 6326.960 48950.47
## 18409 27638.72 13473.39 41804.04 5974.720 49302.71
## 18410 27638.72 13246.76 42030.67 5628.115 49649.32
## 18411 27638.72 13023.64 42253.79 5286.885 49990.55
## 18412 27638.72 12803.88 42473.56 4950.786 50326.65
## 18413 27638.72 12587.32 42690.11 4619.594 50657.84
## 18414 27638.72 12373.84 42903.59 4293.100 50984.33
## 18415 27638.72 12163.30 43114.13 3971.110 51306.32
## 18416 27638.72 11955.59 43321.84 3653.442 51623.99
## 18417 27638.72 11750.59 43526.84 3339.926 51937.51
## 18418 27638.72 11548.21 43729.23 3030.404 52247.03
## 18419 27638.72 11348.33 43929.10 2724.728 52552.70
## 18420 27638.72 11150.89 44126.55 2422.756 52854.68
## 18421 27638.72 10955.77 44321.66 2124.359 53153.07
## 18422 27638.72 10762.92 44514.51 1829.411 53448.02
## 18423 27638.72 10572.24 44705.19 1537.796 53739.64
autoplot(myfc1)

myfc1 %>% accuracy(test_ts)
## ME RMSE MAE MPE MAPE MASE
## Training set 291.4781 1967.776 1011.610 -Inf Inf 0.9893086
## Test set -4646.1608 5928.830 5216.313 -23.13163 24.9378 5.1013160
## ACF1
## Training set -0.02691076
## Test set NA
# ARIMA on the differenced series: auto.arima() selects ARIMA(0,0,0) with a non-zero mean
arimadiff1 <- auto.arima(diff1)
arimadiff1
## Series: diff1
## ARIMA(0,0,0) with non-zero mean
##
## Coefficients:
## mean
## 293.9255
## s.e. 201.7730
##
## sigma^2 estimated as 3868105: log likelihood=-845.79
## AIC=1695.57 AICc=1695.7 BIC=1700.66
# s.e. = 201.7730
# AIC = 1695.57, AICc = 1695.7, BIC = 1700.66
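# An alternative worth noting (a sketch, not part of the original analysis): auto.arima() can
# be given the undifferenced series and will choose the order of differencing itself, which
# keeps the forecasts on the original scale.
arima_level <- auto.arima(train_ts)
arima_level %>% forecast(h = 45) %>% autoplot()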
myfc2 <- arimadiff1 %>% forecast(h = 45)
autoplot(myfc2)

difftest <- diff(test_ts)
myfc2 %>% accuracy(difftest)
## ME RMSE MAE MPE MAPE MASE
## Training set 3.802160e-14 1956.260 1132.660 -Inf Inf 0.7492735
## Test set -3.248574e+02 3302.361 2598.691 133.9599 134.2727 1.7190777
## ACF1
## Training set -0.02901554
## Test set NA
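# To compare this model with the test data on the original scale (a sketch, assuming the
# forecast package is attached): the forecast is of daily changes, so integrate it back by
# cumulative summation starting from the last observed training value.
fc_level <- as.numeric(tail(train_ts, 1)) + cumsum(as.numeric(myfc2$mean))
accuracy(fc_level, as.numeric(test_ts))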
################################################
# US fatalities
US_fat <- US %>%
  filter(Target == 'Fatalities')
# Fatalities test set: 2020-04-27 to 2020-06-10
US_fat_test <- US_fat %>%
  filter(Date > '2020-04-26')
US_fat_test1 <- US_fat_test %>%
  group_by(Date) %>%
  select(-County, -Province_State) %>%
  pivot_wider(names_from = Target, values_from = TargetValue, values_fill = 0)
head(US_fat_test1)
## # A tibble: 6 × 6
## # Groups: Date [6]
## Id Country_Region Population Weight Date Fatalities
## <dbl> <chr> <dbl> <dbl> <date> <dbl>
## 1 963112 US 324141489 0.510 2020-04-27 1378
## 2 963114 US 324141489 0.510 2020-04-28 2096
## 3 963116 US 324141489 0.510 2020-04-29 2612
## 4 963118 US 324141489 0.510 2020-04-30 2029
## 5 963120 US 324141489 0.510 2020-05-01 1947
## 6 963122 US 324141489 0.510 2020-05-02 1426
test_ts2 <- zoo(US_fat_test1$Fatalities,seq(from=as.Date("2020-04-27"), to=as.Date("2020-06-10"), by = "day"))
test_ts2
## 2020-04-27 2020-04-28 2020-04-29 2020-04-30 2020-05-01 2020-05-02 2020-05-03
## 1378 2096 2612 2029 1947 1426 1313
## 2020-05-04 2020-05-05 2020-05-06 2020-05-07 2020-05-08 2020-05-09 2020-05-10
## 1240 2142 2367 2231 1518 1615 731
## 2020-05-11 2020-05-12 2020-05-13 2020-05-14 2020-05-15 2020-05-16 2020-05-17
## 1156 1694 1743 1779 1632 1224 808
## 2020-05-18 2020-05-19 2020-05-20 2020-05-21 2020-05-22 2020-05-23 2020-05-24
## 785 1574 1518 1263 1277 1108 633
## 2020-05-25 2020-05-26 2020-05-27 2020-05-28 2020-05-29 2020-05-30 2020-05-31
## 500 693 1505 1198 1193 967 605
## 2020-06-01 2020-06-02 2020-06-03 2020-06-04 2020-06-05 2020-06-06 2020-06-07
## 768 1031 981 1036 1162 709 443
## 2020-06-08 2020-06-09 2020-06-10
## 500 978 935
# Fatalities training set: 2020-01-23 to 2020-04-26
US_fat_train <- US_fat %>%
  filter(Date <= '2020-04-26')
head(US_fat_train)
## # A tibble: 6 × 9
## Id County Province_State Country_Region Population Weight Date
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <date>
## 1 962922 <NA> <NA> US 324141489 0.510 2020-01-23
## 2 962924 <NA> <NA> US 324141489 0.510 2020-01-24
## 3 962926 <NA> <NA> US 324141489 0.510 2020-01-25
## 4 962928 <NA> <NA> US 324141489 0.510 2020-01-26
## 5 962930 <NA> <NA> US 324141489 0.510 2020-01-27
## 6 962932 <NA> <NA> US 324141489 0.510 2020-01-28
## # … with 2 more variables: Target <chr>, TargetValue <dbl>
US_fat_train1 <- US_fat_train %>%
  group_by(Date) %>%
  select(-County, -Province_State) %>%
  pivot_wider(names_from = Target, values_from = TargetValue, values_fill = 0)
head(US_fat_train1)
## # A tibble: 6 × 6
## # Groups: Date [6]
## Id Country_Region Population Weight Date Fatalities
## <dbl> <chr> <dbl> <dbl> <date> <dbl>
## 1 962922 US 324141489 0.510 2020-01-23 0
## 2 962924 US 324141489 0.510 2020-01-24 0
## 3 962926 US 324141489 0.510 2020-01-25 0
## 4 962928 US 324141489 0.510 2020-01-26 0
## 5 962930 US 324141489 0.510 2020-01-27 0
## 6 962932 US 324141489 0.510 2020-01-28 0
# Convert the fatalities training data to a daily time series
train_ts1 <- zoo(US_fat_train1$Fatalities, seq(from = as.Date("2020-01-23"), to = as.Date("2020-04-26"), by = "day"))
train_ts1
## 2020-01-23 2020-01-24 2020-01-25 2020-01-26 2020-01-27 2020-01-28 2020-01-29
## 0 0 0 0 0 0 0
## 2020-01-30 2020-01-31 2020-02-01 2020-02-02 2020-02-03 2020-02-04 2020-02-05
## 0 0 0 0 0 0 0
## 2020-02-06 2020-02-07 2020-02-08 2020-02-09 2020-02-10 2020-02-11 2020-02-12
## 0 0 0 0 0 0 0
## 2020-02-13 2020-02-14 2020-02-15 2020-02-16 2020-02-17 2020-02-18 2020-02-19
## 0 0 0 0 0 0 0
## 2020-02-20 2020-02-21 2020-02-22 2020-02-23 2020-02-24 2020-02-25 2020-02-26
## 0 0 0 0 0 0 0
## 2020-02-27 2020-02-28 2020-02-29 2020-03-01 2020-03-02 2020-03-03 2020-03-04
## 0 0 0 0 0 0 0
## 2020-03-05 2020-03-06 2020-03-07 2020-03-08 2020-03-09 2020-03-10 2020-03-11
## 0 0 0 0 0 28 8
## 2020-03-12 2020-03-13 2020-03-14 2020-03-15 2020-03-16 2020-03-17 2020-03-18
## 4 7 7 9 22 23 10
## 2020-03-19 2020-03-20 2020-03-21 2020-03-22 2020-03-23 2020-03-24 2020-03-25
## 82 44 63 119 125 154 236
## 2020-03-26 2020-03-27 2020-03-28 2020-03-29 2020-03-30 2020-03-31 2020-04-01
## 267 370 445 441 511 895 883
## 2020-04-02 2020-04-03 2020-04-04 2020-04-05 2020-04-06 2020-04-07 2020-04-08
## 1169 1161 1320 1212 1164 1938 1973
## 2020-04-09 2020-04-10 2020-04-11 2020-04-12 2020-04-13 2020-04-14 2020-04-15
## 1783 2108 1876 1560 1509 2303 2494
## 2020-04-16 2020-04-17 2020-04-18 2020-04-19 2020-04-20 2020-04-21 2020-04-22
## 4591 3857 1891 1997 1433 2350 2178
## 2020-04-23 2020-04-24 2020-04-25 2020-04-26
## 3329 1995 1806 1126
train_ts1 %>%
  autoplot() +
  labs(title = "US daily fatalities time series")

adf.test(train_ts1)
##
## Augmented Dickey-Fuller Test
##
## data: train_ts1
## Dickey-Fuller = -1.722, Lag order = 4, p-value = 0.6912
## alternative hypothesis: stationary
# p-value = 0.6912 > 0.05, so we cannot reject the unit-root null hypothesis: the series is not stationary
unitroot_ndiffs(train_ts1)
## ndiffs
## 1
# One difference is required
acf(train_ts1)

# The ACF decays slowly, again indicating non-stationarity
pacf(train_ts1)

# Lags 1, 5, and 8 are outside the significance bounds
# Difference the series to remove the trend: take the first-order (day-over-day) difference
diff2 <- diff(train_ts1, differences = 1)
diff2
## 2020-01-24 2020-01-25 2020-01-26 2020-01-27 2020-01-28 2020-01-29 2020-01-30
## 0 0 0 0 0 0 0
## 2020-01-31 2020-02-01 2020-02-02 2020-02-03 2020-02-04 2020-02-05 2020-02-06
## 0 0 0 0 0 0 0
## 2020-02-07 2020-02-08 2020-02-09 2020-02-10 2020-02-11 2020-02-12 2020-02-13
## 0 0 0 0 0 0 0
## 2020-02-14 2020-02-15 2020-02-16 2020-02-17 2020-02-18 2020-02-19 2020-02-20
## 0 0 0 0 0 0 0
## 2020-02-21 2020-02-22 2020-02-23 2020-02-24 2020-02-25 2020-02-26 2020-02-27
## 0 0 0 0 0 0 0
## 2020-02-28 2020-02-29 2020-03-01 2020-03-02 2020-03-03 2020-03-04 2020-03-05
## 0 0 0 0 0 0 0
## 2020-03-06 2020-03-07 2020-03-08 2020-03-09 2020-03-10 2020-03-11 2020-03-12
## 0 0 0 0 28 -20 -4
## 2020-03-13 2020-03-14 2020-03-15 2020-03-16 2020-03-17 2020-03-18 2020-03-19
## 3 0 2 13 1 -13 72
## 2020-03-20 2020-03-21 2020-03-22 2020-03-23 2020-03-24 2020-03-25 2020-03-26
## -38 19 56 6 29 82 31
## 2020-03-27 2020-03-28 2020-03-29 2020-03-30 2020-03-31 2020-04-01 2020-04-02
## 103 75 -4 70 384 -12 286
## 2020-04-03 2020-04-04 2020-04-05 2020-04-06 2020-04-07 2020-04-08 2020-04-09
## -8 159 -108 -48 774 35 -190
## 2020-04-10 2020-04-11 2020-04-12 2020-04-13 2020-04-14 2020-04-15 2020-04-16
## 325 -232 -316 -51 794 191 2097
## 2020-04-17 2020-04-18 2020-04-19 2020-04-20 2020-04-21 2020-04-22 2020-04-23
## -734 -1966 106 -564 917 -172 1151
## 2020-04-24 2020-04-25 2020-04-26
## -1334 -189 -680
autoplot(diff2) +
  labs(title = "Differenced daily fatalities in the US")

# The differenced series looks much closer to stationary than before
acf(diff2)

# The ACF now drops to zero relatively quickly
pacf(diff2)

# Lags 3 and 7 show strong partial autocorrelation
# Exponential smoothing: ets() again selects an ETS(A,N,N) model
ets1 <- ets(train_ts1)
ets1
## ETS(A,N,N)
##
## Call:
## ets(y = train_ts1)
##
## Smoothing parameters:
## alpha = 0.848
##
## Initial states:
## l = -0.113
##
## sigma: 404.3635
##
## AIC AICc BIC
## 1577.037 1577.300 1584.698
# alpha = 0.848, initial level l = -0.113
# AIC = 1577.037, AICc = 1577.300, BIC = 1584.698
myfc3 <- ets1 %>% forecast(h = 45)
autoplot(myfc3)

myfc3 %>% accuracy(test_ts2)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 15.36559 400.0844 153.2029 Inf Inf 0.9937258 0.01210657
## Test set 52.06590 529.9220 422.0617 -15.72695 40.06742 2.7376348 NA
# ARIMA on the differenced series: auto.arima() selects ARIMA(1,0,1) with zero mean
arimadiff2 <- auto.arima(diff2)
arimadiff2
## Series: diff2
## ARIMA(1,0,1) with zero mean
##
## Coefficients:
## ar1 ma1
## 0.5171 -0.7749
## s.e. 0.1518 0.0998
##
## sigma^2 estimated as 155013: log likelihood=-694.18
## AIC=1394.36 AICc=1394.63 BIC=1401.99
# AIC = 1394.36, AICc = 1394.63, BIC = 1401.99
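# Residual diagnostics (a sketch, assuming the forecast package is attached): check whether
# the ARIMA(1,0,1) residuals look like white noise via their ACF and a Ljung-Box test.
checkresiduals(arimadiff2)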
myfc4 <- arimadiff2 %>% forecast(h = 45)
autoplot(myfc4)

difftest2 <- diff(test_ts2)
myfc4 %>% accuracy(difftest2)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 39.40202 389.5062 152.8411 Inf Inf 0.5913724 0.02947579
## Test set -23.71178 389.0037 296.9636 100.2996 100.2996 1.1490104 NA
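# As with the confirmed-cases model, this forecast is on the differenced scale (a sketch,
# assuming the forecast package is attached): integrate it back from the last training value
# to compare against the fatalities test series on the original scale.
fc_fat_level <- as.numeric(tail(train_ts1, 1)) + cumsum(as.numeric(myfc4$mean))
accuracy(fc_fat_level, as.numeric(test_ts2))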