EDA

str(data)
## 'data.frame':    351 obs. of  2 variables:
##  $ Period: chr  "1/1/1992" "2/1/1992" "3/1/1992" "4/1/1992" ...
##  $ Value : chr  "1,509" "1,541" "1,597" "1,675" ...
head(data)
##     Period Value
## 1 1/1/1992 1,509
## 2 2/1/1992 1,541
## 3 3/1/1992 1,597
## 4 4/1/1992 1,675
## 5 5/1/1992 1,822
## 6 6/1/1992 1,775
summary(data)
##     Period             Value          
##  Length:351         Length:351        
##  Class :character   Class :character  
##  Mode  :character   Mode  :character

Variable Type Conversion

# convert Value to numeric
data$Value <- str_remove(data$Value, ",")
data$Value <- as.numeric(data$Value)
head(data)
##     Period Value
## 1 1/1/1992  1509
## 2 2/1/1992  1541
## 3 3/1/1992  1597
## 4 4/1/1992  1675
## 5 5/1/1992  1822
## 6 6/1/1992  1775
summary(data)
##     Period              Value     
##  Length:351         Min.   :1501  
##  Class :character   1st Qu.:2140  
##  Mode  :character   Median :2984  
##                     Mean   :3115  
##                     3rd Qu.:3796  
##                     Max.   :7366
# Convert Period to Date
data$Period <- as.Date(data$Period, "%m/%d/%Y")
head(data)
##       Period Value
## 1 1992-01-01  1509
## 2 1992-02-01  1541
## 3 1992-03-01  1597
## 4 1992-04-01  1675
## 5 1992-05-01  1822
## 6 1992-06-01  1775
data_alec$Period <- as.Date(data_alec$Period,format = "%m/%d/%Y")
data_alec$Value <- as.numeric(gsub(",","",data_alec$Value))

head(data)
##       Period Value
## 1 1992-01-01  1509
## 2 1992-02-01  1541
## 3 1992-03-01  1597
## 4 1992-04-01  1675
## 5 1992-05-01  1822
## 6 1992-06-01  1775
head(data_alec)
##       Period Value
## 1 1992-01-01  1509
## 2 1992-02-01  1541
## 3 1992-03-01  1597
## 4 1992-04-01  1675
## 5 1992-05-01  1822
## 6 1992-06-01  1775
identical(data, data_alec)
## [1] TRUE

Check for missings

# Check for missings
sum(is.na(data))
## [1] 0
sapply(data, function(x) {sum(is.na(x))})
## Period  Value 
##      0      0
sapply(data, function(x) {sum(x == "")})
## Period  Value 
##     NA      0

No missings in data or empty strings

Visualization

ts_plot <- ggplot(data, aes(Period, Value)) + 
  geom_line() + 
  xlab("Month") + ylab("Liquor Sales in Millions") + 
  ggtitle("Monthly Liquor Sales, January 1992 to March 2021") + 
  scale_x_date(labels = date_format(format= "%Y"), 
               breaks = date_breaks("2 year")) + 
  stat_smooth(colour = "orangered", se = F, linetype = 6) + 
  theme_bw()

ts_plot_alec <- ggplot(data_alec, aes(Period,Value)) + geom_line(na.rm=TRUE) + 
  xlab("Month") + ylab("Sales in Thousands") + 
  scale_x_date(labels = date_format(format= "%b-%Y"),breaks = date_breaks("1 year")) + 
  stat_smooth(colour = "green")

ts_plot

ts_plot_alec

data %>% 
  mutate(month = month(Period, label = T)) %>% 
  ggplot(aes(Period, Value, color = factor(month))) + 
  geom_line(size = 0.8) +
  xlab("Year") + ylab("Liquor Sales in Millions") + 
  labs(title = "Liquor Sales by Month, January 1992 to March 2021",
       color = "Month") + 
  scale_color_viridis_d(option = "D", end = 1) + 
  scale_x_date(labels = date_format(format= "%Y"), 
               breaks = date_breaks("2 year")) +
  theme_bw()

Definite seasonality, definite trend, nonconstant variance (and therefore likely non stationary)

Convert to Time Series

data_ts <- ts(data[,"Value"])
  # only include the values, but doesn't know the interval
head(data_ts)
## Time Series:
## Start = 1 
## End = 6 
## Frequency = 1 
## [1] 1509 1541 1597 1675 1822 1775

ACF & PACF

# Autocorrelation and Partial Autocorrelation Plots
Acf(data_ts)

Pacf(data_ts)

# Lag plot of Data
gglagplot(data_ts, set.lags = 1:24)

#Ljung-Box test on the first 24 Lag autocorrelations
Box.test(data_ts, lag = 24, fitdf = 0, type = "Lj")
## 
##  Box-Ljung test
## 
## data:  data_ts
## X-squared = 4781.4, df = 24, p-value < 2.2e-16

Everything significant in ACF, definite non-stationarity. PACF shows spikes at lag 1-5, 12, 13, 16. Perhaps a 1 year differencing is in order? Well, difference the data first, then look again.

Lag plot shows correlation at lag12 and lag24, definite yearly seasonality.

Ljung-Box v. signif. Data def NOT white noise.

Convert to Time Series

data_ts <- ts(data[,"Value"], frequency = 12, start = 1992)
head(data_ts)
##       Jan  Feb  Mar  Apr  May  Jun
## 1992 1509 1541 1597 1675 1822 1775
data_ts_alec <- ts(data_alec[,"Value"], frequency = 12, start = 1992)
head(data_ts_alec)
##       Jan  Feb  Mar  Apr  May  Jun
## 1992 1509 1541 1597 1675 1822 1775
identical(data_ts, data_ts_alec)
## [1] TRUE
Acf(data_ts)

Pacf(data_ts)

gglagplot(data_ts, set.lags = 1:24)

Again all significant in ACF. PACF shows spikes at 1, 12, & 13. Maybe a seasonal component with a lag-1 component? idk. Based on the lagplot, the winter months seem to be outliers more than the rest? Colors hard to distinguish.

Decomposing the Time Series

component.tsa <- decompose(data_ts, type = "additive")
plot(component.tsa)

component.tsa$trend
##           Jan      Feb      Mar      Apr      May      Jun      Jul      Aug
## 1992       NA       NA       NA       NA       NA       NA 1811.625 1815.500
## 1993 1825.750 1823.792 1817.625 1809.792 1802.542 1797.667 1792.083 1788.333
## 1994 1803.542 1808.208 1816.417 1823.292 1827.958 1836.167 1842.042 1842.167
## 1995 1832.708 1828.875 1829.917 1830.083 1830.750 1833.042 1839.083 1850.542
## 1996 1901.000 1915.542 1921.250 1921.583 1929.292 1931.875 1930.875 1930.708
## 1997 1947.250 1950.625 1955.958 1968.500 1980.167 1995.333 2012.583 2024.542
## 1998 2062.292 2068.542 2075.958 2086.458 2094.167 2106.167 2116.000 2118.500
## 1999 2157.583 2163.917 2168.792 2175.083 2182.375 2196.833 2208.583 2217.833
## 2000 2284.333 2301.250 2322.667 2337.875 2354.917 2371.000 2383.458 2394.500
## 2001 2437.000 2441.833 2444.083 2444.708 2450.792 2461.458 2468.625 2471.208
## 2002 2491.042 2497.792 2500.208 2499.083 2497.208 2493.417 2491.458 2489.125
## 2003 2479.458 2483.958 2493.500 2510.333 2522.792 2532.167 2546.250 2560.625
## 2004 2633.833 2642.917 2647.875 2658.000 2665.875 2676.292 2679.333 2680.292
## 2005 2718.625 2728.042 2742.000 2752.000 2762.167 2783.250 2805.792 2822.792
## 2006 2918.792 2936.542 2955.125 2968.667 2981.833 2995.833 3007.375 3016.917
## 2007 3102.458 3122.167 3135.833 3145.583 3159.500 3171.750 3181.750 3194.625
## 2008 3226.792 3242.417 3253.833 3268.750 3283.458 3289.583 3301.917 3310.792
## 2009 3331.583 3330.125 3331.750 3339.500 3341.333 3346.917 3352.292 3356.458
## 2010 3408.083 3414.625 3421.167 3428.417 3438.417 3447.750 3450.500 3453.750
## 2011 3477.125 3485.667 3497.083 3504.167 3510.000 3523.875 3537.125 3550.667
## 2012 3636.708 3643.750 3650.500 3654.542 3668.458 3687.417 3703.250 3709.833
## 2013 3737.625 3761.167 3777.375 3785.708 3796.250 3802.958 3811.833 3821.792
## 2014 3876.167 3886.792 3896.417 3913.500 3924.625 3937.208 3955.958 3968.000
## 2015 4029.958 4040.125 4049.292 4064.333 4075.375 4088.417 4096.417 4106.708
## 2016 4170.958 4184.708 4204.375 4221.917 4241.458 4263.583 4274.750 4277.000
## 2017 4332.167 4339.292 4348.917 4355.917 4365.583 4378.500 4394.083 4410.750
## 2018 4502.958 4519.958 4533.083 4544.833 4565.667 4584.167 4596.542 4607.583
## 2019 4656.750 4679.750 4694.667 4701.292 4711.792 4725.500 4744.375 4771.750
## 2020 5125.208 5204.750 5284.917 5377.333 5453.542 5527.667 5608.458 5673.000
## 2021       NA       NA       NA                                             
##           Sep      Oct      Nov      Dec
## 1992 1818.375 1823.333 1823.833 1823.458
## 1993 1788.833 1792.167 1795.417 1799.542
## 1994 1842.458 1840.500 1837.125 1836.417
## 1995 1861.500 1870.292 1880.667 1890.875
## 1996 1930.083 1931.542 1936.625 1942.750
## 1997 2032.375 2040.167 2049.583 2055.292
## 1998 2124.000 2133.500 2142.083 2148.583
## 1999 2233.333 2243.208 2250.708 2268.000
## 2000 2403.375 2413.208 2422.042 2431.458
## 2001 2475.208 2479.708 2486.708 2489.167
## 2002 2482.625 2480.125 2481.792 2479.875
## 2003 2573.042 2587.417 2601.125 2614.417
## 2004 2689.833 2698.958 2703.583 2710.417
## 2005 2840.542 2856.125 2875.250 2899.417
## 2006 3030.917 3044.583 3057.958 3081.042
## 2007 3202.417 3207.167 3219.167 3222.208
## 2008 3311.208 3318.208 3325.708 3328.833
## 2009 3370.000 3385.417 3392.917 3398.375
## 2010 3458.042 3461.708 3464.083 3468.958
## 2011 3572.292 3586.750 3601.792 3625.958
## 2012 3712.458 3716.958 3724.667 3728.417
## 2013 3823.542 3830.708 3845.667 3861.083
## 2014 3980.542 3992.625 4003.083 4014.083
## 2015 4123.000 4137.333 4146.417 4155.708
## 2016 4282.458 4294.042 4307.583 4323.792
## 2017 4433.583 4451.417 4464.417 4485.417
## 2018 4613.125 4622.458 4638.042 4645.875
## 2019 4822.000 4879.667 4949.208 5036.708
## 2020 5720.125       NA       NA       NA
## 2021
component.tsm <- decompose(data_ts, type = "multiplicative")
plot(component.tsm)

plot(component.tsm)

component.tsm$trend
##           Jan      Feb      Mar      Apr      May      Jun      Jul      Aug
## 1992       NA       NA       NA       NA       NA       NA 1811.625 1815.500
## 1993 1825.750 1823.792 1817.625 1809.792 1802.542 1797.667 1792.083 1788.333
## 1994 1803.542 1808.208 1816.417 1823.292 1827.958 1836.167 1842.042 1842.167
## 1995 1832.708 1828.875 1829.917 1830.083 1830.750 1833.042 1839.083 1850.542
## 1996 1901.000 1915.542 1921.250 1921.583 1929.292 1931.875 1930.875 1930.708
## 1997 1947.250 1950.625 1955.958 1968.500 1980.167 1995.333 2012.583 2024.542
## 1998 2062.292 2068.542 2075.958 2086.458 2094.167 2106.167 2116.000 2118.500
## 1999 2157.583 2163.917 2168.792 2175.083 2182.375 2196.833 2208.583 2217.833
## 2000 2284.333 2301.250 2322.667 2337.875 2354.917 2371.000 2383.458 2394.500
## 2001 2437.000 2441.833 2444.083 2444.708 2450.792 2461.458 2468.625 2471.208
## 2002 2491.042 2497.792 2500.208 2499.083 2497.208 2493.417 2491.458 2489.125
## 2003 2479.458 2483.958 2493.500 2510.333 2522.792 2532.167 2546.250 2560.625
## 2004 2633.833 2642.917 2647.875 2658.000 2665.875 2676.292 2679.333 2680.292
## 2005 2718.625 2728.042 2742.000 2752.000 2762.167 2783.250 2805.792 2822.792
## 2006 2918.792 2936.542 2955.125 2968.667 2981.833 2995.833 3007.375 3016.917
## 2007 3102.458 3122.167 3135.833 3145.583 3159.500 3171.750 3181.750 3194.625
## 2008 3226.792 3242.417 3253.833 3268.750 3283.458 3289.583 3301.917 3310.792
## 2009 3331.583 3330.125 3331.750 3339.500 3341.333 3346.917 3352.292 3356.458
## 2010 3408.083 3414.625 3421.167 3428.417 3438.417 3447.750 3450.500 3453.750
## 2011 3477.125 3485.667 3497.083 3504.167 3510.000 3523.875 3537.125 3550.667
## 2012 3636.708 3643.750 3650.500 3654.542 3668.458 3687.417 3703.250 3709.833
## 2013 3737.625 3761.167 3777.375 3785.708 3796.250 3802.958 3811.833 3821.792
## 2014 3876.167 3886.792 3896.417 3913.500 3924.625 3937.208 3955.958 3968.000
## 2015 4029.958 4040.125 4049.292 4064.333 4075.375 4088.417 4096.417 4106.708
## 2016 4170.958 4184.708 4204.375 4221.917 4241.458 4263.583 4274.750 4277.000
## 2017 4332.167 4339.292 4348.917 4355.917 4365.583 4378.500 4394.083 4410.750
## 2018 4502.958 4519.958 4533.083 4544.833 4565.667 4584.167 4596.542 4607.583
## 2019 4656.750 4679.750 4694.667 4701.292 4711.792 4725.500 4744.375 4771.750
## 2020 5125.208 5204.750 5284.917 5377.333 5453.542 5527.667 5608.458 5673.000
## 2021       NA       NA       NA                                             
##           Sep      Oct      Nov      Dec
## 1992 1818.375 1823.333 1823.833 1823.458
## 1993 1788.833 1792.167 1795.417 1799.542
## 1994 1842.458 1840.500 1837.125 1836.417
## 1995 1861.500 1870.292 1880.667 1890.875
## 1996 1930.083 1931.542 1936.625 1942.750
## 1997 2032.375 2040.167 2049.583 2055.292
## 1998 2124.000 2133.500 2142.083 2148.583
## 1999 2233.333 2243.208 2250.708 2268.000
## 2000 2403.375 2413.208 2422.042 2431.458
## 2001 2475.208 2479.708 2486.708 2489.167
## 2002 2482.625 2480.125 2481.792 2479.875
## 2003 2573.042 2587.417 2601.125 2614.417
## 2004 2689.833 2698.958 2703.583 2710.417
## 2005 2840.542 2856.125 2875.250 2899.417
## 2006 3030.917 3044.583 3057.958 3081.042
## 2007 3202.417 3207.167 3219.167 3222.208
## 2008 3311.208 3318.208 3325.708 3328.833
## 2009 3370.000 3385.417 3392.917 3398.375
## 2010 3458.042 3461.708 3464.083 3468.958
## 2011 3572.292 3586.750 3601.792 3625.958
## 2012 3712.458 3716.958 3724.667 3728.417
## 2013 3823.542 3830.708 3845.667 3861.083
## 2014 3980.542 3992.625 4003.083 4014.083
## 2015 4123.000 4137.333 4146.417 4155.708
## 2016 4282.458 4294.042 4307.583 4323.792
## 2017 4433.583 4451.417 4464.417 4485.417
## 2018 4613.125 4622.458 4638.042 4645.875
## 2019 4822.000 4879.667 4949.208 5036.708
## 2020 5720.125       NA       NA       NA
## 2021

Cleaned vs uncleaned

#Using the ts() command to create a time series object to pass to tsclean()
# data2 <- data
# data2$Value <- tsclean(ts(data2[,"Value"], frequency = 12))
data$Value <-tsclean(data_ts)
data_alec$Value <-tsclean(data_ts_alec)



#Plot the cleaned data
c_ts_plot <- ggplot(data, aes(Period,Value)) + geom_line(na.rm=TRUE) + 
  xlab("Month") + ylab("Sales in Thousands") + 
  scale_x_date(labels = date_format(format= "%b-%Y"),breaks = date_breaks("1 year")) + 
  stat_smooth(colour="green")

c_ts_plot_alec <- ggplot(data_alec, aes(Period,Value)) + geom_line(na.rm=TRUE) + 
  xlab("Month") + ylab("Sales in Thousands") + 
  scale_x_date(labels = date_format(format= "%b-%Y"),breaks = date_breaks("1 year")) + 
  stat_smooth(colour="green")

c_ts_plot

c_ts_plot_alec

#Compare both cleaned and uncleaned plots
grid.arrange(ts_plot,c_ts_plot,ncol=1, top = textGrob("Uncleaned vs Cleaned Series"))

grid.arrange(ts_plot_alec,c_ts_plot_alec,ncol=1, top = textGrob("Uncleaned vs Cleaned Series"))

ts_test <- ts(na.omit(data$Value), frequency = 12)
data_ts <- ts(data[,"Value"], frequency = 12, start = 1992)
# data2_ts <- ts(data2[,"Value"], frequency = 12, start = 1992)

Checking for stationarityy

adf.test(data_ts)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  data_ts
## Dickey-Fuller = -2.3704, Lag order = 7, p-value = 0.4205
## alternative hypothesis: stationary
# NOT stationary

Manually fitting the model

# 1 order difference
dfit <- arima(data_ts, order = c(0,1,0))

plot(residuals(dfit))

Acf(residuals(dfit))

Pacf(residuals(dfit))

adf.test(residuals(dfit))
## 
##  Augmented Dickey-Fuller Test
## 
## data:  residuals(dfit)
## Dickey-Fuller = -11.035, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
# 1 order seasonal difference
dfit <- arima(data_ts, order = c(0,0,0), 
              seasonal = list(order = c(0, 1, 0), period = 12))

plot(residuals(dfit))

Acf(residuals(dfit))

Pacf(residuals(dfit))

adf.test(residuals(dfit))
## 
##  Augmented Dickey-Fuller Test
## 
## data:  residuals(dfit)
## Dickey-Fuller = -2.7335, Lag order = 7, p-value = 0.2673
## alternative hypothesis: stationary
# tail off in ACF suggests AR, cut off after PACF lag 2 or 3



# 1 order diff, 1 order seasonal difference
dfit <- arima(data_ts, order = c(0,1,0), 
              seasonal = list(order = c(0, 1, 0), period = 12))
dfit
## 
## Call:
## arima(x = data_ts, order = c(0, 1, 0), seasonal = list(order = c(0, 1, 0), period = 12))
## 
## 
## sigma^2 estimated as 14065:  log likelihood = -2093.8,  aic = 4189.6
plot(residuals(dfit))

Acf(residuals(dfit))

Pacf(residuals(dfit))

adf.test(residuals(dfit))
## 
##  Augmented Dickey-Fuller Test
## 
## data:  residuals(dfit)
## Dickey-Fuller = -8.5543, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
Box.test(residuals(dfit), type = "Lj")
## 
##  Box-Ljung test
## 
## data:  residuals(dfit)
## X-squared = 96.874, df = 1, p-value < 2.2e-16
# "strong negative spike at lag-1 suggests MA(1)



# AR(3), 1 order diff, 1 order seasonal difference
dfit <- arima(data_ts, order = c(2,1,1), 
              seasonal = list(order = c(0, 1, 0), period = 12))
dfit
## 
## Call:
## arima(x = data_ts, order = c(2, 1, 1), seasonal = list(order = c(0, 1, 0), period = 12))
## 
## Coefficients:
##           ar1      ar2     ma1
##       -0.9466  -0.4915  0.2346
## s.e.   0.1281   0.0745  0.1486
## 
## sigma^2 estimated as 8552:  log likelihood = -2010.06,  aic = 4028.12
plot(residuals(dfit))

Acf(residuals(dfit))

Pacf(residuals(dfit))

adf.test(residuals(dfit))
## 
##  Augmented Dickey-Fuller Test
## 
## data:  residuals(dfit)
## Dickey-Fuller = -7.2469, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
Box.test(residuals(dfit), type = "Lj")
## 
##  Box-Ljung test
## 
## data:  residuals(dfit)
## X-squared = 0.027379, df = 1, p-value = 0.8686
# AR(2), 1st diff, 1 order seasonal difference
dfit <- arima(data_ts, order = c(2,1,0), 
              seasonal = list(order = c(0, 1, 0), period = 12))
dfit
## 
## Call:
## arima(x = data_ts, order = c(2, 1, 0), seasonal = list(order = c(0, 1, 0), period = 12))
## 
## Coefficients:
##           ar1      ar2
##       -0.7506  -0.3861
## s.e.   0.0513   0.0513
## 
## sigma^2 estimated as 8601:  log likelihood = -2011.02,  aic = 4028.04
summary(dfit)
## 
## Call:
## arima(x = data_ts, order = c(2, 1, 0), seasonal = list(order = c(0, 1, 0), period = 12))
## 
## Coefficients:
##           ar1      ar2
##       -0.7506  -0.3861
## s.e.   0.0513   0.0513
## 
## sigma^2 estimated as 8601:  log likelihood = -2011.02,  aic = 4028.04
## 
## Training set error measures:
##                    ME     RMSE      MAE          MPE     MAPE      MASE
## Training set 3.691924 91.01027 68.06409 -0.002241093 2.216752 0.1993787
##                    ACF1
## Training set 0.01805167
plot(residuals(dfit))

Acf(residuals(dfit))

Pacf(residuals(dfit))

adf.test(residuals(dfit))
## 
##  Augmented Dickey-Fuller Test
## 
## data:  residuals(dfit)
## Dickey-Fuller = -7.0702, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
Box.test(residuals(dfit), type = "Lj")
## 
##  Box-Ljung test
## 
## data:  residuals(dfit)
## X-squared = 0.11536, df = 1, p-value = 0.7341

Automated Model

dfit_auto <- auto.arima(data_ts, seasonal = TRUE)
dfit_auto
## Series: data_ts 
## ARIMA(2,1,2)(0,1,2)[12] 
## 
## Coefficients:
##           ar1      ar2     ma1      ma2     sma1     sma2
##       -1.1799  -0.4437  0.4659  -0.2985  -0.3421  -0.1049
## s.e.   0.0816   0.0739  0.0866   0.0807   0.0578   0.0513
## 
## sigma^2 estimated as 7649:  log likelihood=-1989.36
## AIC=3992.72   AICc=3993.06   BIC=4019.48
summary(dfit_auto)
## Series: data_ts 
## ARIMA(2,1,2)(0,1,2)[12] 
## 
## Coefficients:
##           ar1      ar2     ma1      ma2     sma1     sma2
##       -1.1799  -0.4437  0.4659  -0.2985  -0.3421  -0.1049
## s.e.   0.0816   0.0739  0.0866   0.0807   0.0578   0.0513
## 
## sigma^2 estimated as 7649:  log likelihood=-1989.36
## AIC=3992.72   AICc=3993.06   BIC=4019.48
## 
## Training set error measures:
##                    ME     RMSE      MAE         MPE     MAPE      MASE
## Training set 5.347325 85.05609 64.10699 0.001067671 2.091984 0.4306553
##                      ACF1
## Training set -0.009206448
plot(residuals(dfit_auto))

Acf(residuals(dfit_auto))

Pacf(residuals(dfit_auto))

adf.test(residuals(dfit_auto))
## 
##  Augmented Dickey-Fuller Test
## 
## data:  residuals(dfit_auto)
## Dickey-Fuller = -6.7121, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
Box.test(residuals(dfit_auto), type = "Lj")
## 
##  Box-Ljung test
## 
## data:  residuals(dfit_auto)
## X-squared = 0.030005, df = 1, p-value = 0.8625
coeftest(dfit_auto)
## 
## z test of coefficients:
## 
##       Estimate Std. Error  z value  Pr(>|z|)    
## ar1  -1.179931   0.081580 -14.4634 < 2.2e-16 ***
## ar2  -0.443725   0.073889  -6.0053 1.910e-09 ***
## ma1   0.465888   0.086619   5.3786 7.508e-08 ***
## ma2  -0.298507   0.080715  -3.6983 0.0002171 ***
## sma1 -0.342064   0.057752  -5.9230 3.161e-09 ***
## sma2 -0.104850   0.051304  -2.0437 0.0409832 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
dfit_auto2 <- auto.arima(data_ts_alec, seasonal = TRUE)
dfit_auto2
## Series: data_ts_alec 
## ARIMA(2,1,1)(0,1,2)[12] 
## 
## Coefficients:
##           ar1      ar2     ma1     sma1     sma2
##       -0.9569  -0.4796  0.2595  -0.3489  -0.0899
## s.e.   0.1186   0.0708  0.1320   0.0595   0.0548
## 
## sigma^2 estimated as 8912:  log likelihood=-2015.54
## AIC=4043.09   AICc=4043.34   BIC=4066.03
summary(dfit_auto2)
## Series: data_ts_alec 
## ARIMA(2,1,1)(0,1,2)[12] 
## 
## Coefficients:
##           ar1      ar2     ma1     sma1     sma2
##       -0.9569  -0.4796  0.2595  -0.3489  -0.0899
## s.e.   0.1186   0.0708  0.1320   0.0595   0.0548
## 
## sigma^2 estimated as 8912:  log likelihood=-2015.54
## AIC=4043.09   AICc=4043.34   BIC=4066.03
## 
## Training set error measures:
##                    ME     RMSE      MAE         MPE     MAPE      MASE
## Training set 4.047023 91.95075 67.72276 -0.03303985 2.176473 0.4528476
##                     ACF1
## Training set 0.002130525
plot(residuals(dfit_auto2))

Acf(residuals(dfit_auto2))

Pacf(residuals(dfit_auto2))

adf.test(residuals(dfit_auto2))
## 
##  Augmented Dickey-Fuller Test
## 
## data:  residuals(dfit_auto2)
## Dickey-Fuller = -7.0397, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
Box.test(residuals(dfit_auto2), type = "Lj")
## 
##  Box-Ljung test
## 
## data:  residuals(dfit_auto2)
## X-squared = 0.0016069, df = 1, p-value = 0.968
coeftest(dfit_auto2)
## 
## z test of coefficients:
## 
##       Estimate Std. Error z value  Pr(>|z|)    
## ar1  -0.956939   0.118594 -8.0690 7.086e-16 ***
## ar2  -0.479635   0.070771 -6.7773 1.224e-11 ***
## ma1   0.259535   0.131998  1.9662   0.04928 *  
## sma1 -0.348872   0.059539 -5.8595 4.642e-09 ***
## sma2 -0.089912   0.054751 -1.6422   0.10055    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
dfit <- arima(data_ts, order = c(1,1,3), 
              seasonal = list(order = c(0, 1, 0), period = 12))
dfit
## 
## Call:
## arima(x = data_ts, order = c(1, 1, 3), seasonal = list(order = c(0, 1, 0), period = 12))
## 
## Coefficients:
##           ar1     ma1      ma2     ma3
##       -0.7988  0.1201  -0.5227  0.2563
## s.e.   0.0550  0.0664   0.0493  0.0517
## 
## sigma^2 estimated as 8425:  log likelihood = -2007.71,  aic = 4025.41

FORECASTING

#we will forecast data for the last two full years, 2019+ (month = 328 to 351)
fit_predicted <- arima(ts(data_ts[-c(328:351)]), order =c(0,1,1), seasonal = list(order = c(0,1,2), period = 12))
fit_predicted2 <- arima(ts(data_ts[-c(328:351)]), order =c(2,1,1), seasonal = list(order = c(0,1,2), period = 12))

fit_predicted
## 
## Call:
## arima(x = ts(data_ts[-c(328:351)]), order = c(0, 1, 1), seasonal = list(order = c(0, 
##     1, 2), period = 12))
## 
## Coefficients:
##           ma1     sma1     sma2
##       -0.8005  -0.2735  -0.2012
## s.e.   0.0400   0.0552   0.0472
## 
## sigma^2 estimated as 6358:  log likelihood = -1822.25,  aic = 3652.49
fit_predicted2
## 
## Call:
## arima(x = ts(data_ts[-c(328:351)]), order = c(2, 1, 1), seasonal = list(order = c(0, 
##     1, 2), period = 12))
## 
## Coefficients:
##           ar1      ar2      ma1     sma1     sma2
##       -0.8279  -0.4655  -0.0045  -0.2638  -0.1256
## s.e.   0.2051   0.1240   0.2534   0.0577   0.0532
## 
## sigma^2 estimated as 6172:  log likelihood = -1817.01,  aic = 3646.03
# ARIMA(2,1,1)(0,1,2)[12] WRONG!!!
# ARIMA(0,1,1)(0,1,2)[12] 

#use the model to forecast values for last 24 months. 
#Specify forecast horizon h periods ahead of prediction to be made 
#and use the fitted model to generate those predictions

forecast_pred <- forecast(fit_predicted, h = 24)
plot(forecast_pred, main = "Forecasting 2019-2021")
lines(ts(data_ts))

plot(forecast_pred, main = "Plotting only 2018 and predicted 2019-2021"
     , xlim = c(313,351)
     , ylim = c(1500,6500)
     )
lines(ts(data_ts))

autoplot(forecast_pred) +
  xlab("Year") + ylab("Sales in Thousands") +
  labs(title = "Forecasting Previous 24 Months -- Apr 2019 to Mar 2021") + 
  scale_x_continuous(breaks = seq(1, 351, by = 24), 
                     labels = seq(1992, 2021, by = 2)) + 
  theme_bw()

actual2018 <- ts(data_ts)[313:351]

autoplot(forecast_pred) +
  geom_line(aes(x = 313:351, y = actual2018)) +
  xlab("Year") + ylab("Sales in Thousands") +
  labs(title = "Forecasting Previous 24 Months -- Apr 2019 to Mar 2021",
       subtitle = "Zoomed in: X-axis limited to 2018-2021") + 
  coord_cartesian(xlim = c(313,351)) + 
  scale_x_continuous(breaks = seq(313, 351, by = 6),
                     labels = c("Jan 2018", "Jul 2018",
                                "Jan 2019", "Jul 2019",
                                "Jan 2020", "Jul 2020",
                                "Jan 2021")) +
  theme_bw()

#8. Forecasting
#Next step is to forecast the sales for another 24 months ahead of time. 
f_values <-forecast(dfit_auto2, h=24)
plot(f_values, main = "Forecasting for Next Months -- Apr 2021 to Mar 2023")

autoplot(f_values) +
  xlab("Year") + ylab("Sales in Thousands") +
  labs(title = "Forecasting for Next Months -- Apr 2021 to Mar 2023") + 
  scale_x_continuous(breaks = seq(1, 32, by = 2),
                     labels = seq(1992, 2022, by = 2)) +
  theme_bw()

Alcohol sales much higher than predicted in 2020 (Covid)