str(data)
## 'data.frame': 351 obs. of 2 variables:
## $ Period: chr "1/1/1992" "2/1/1992" "3/1/1992" "4/1/1992" ...
## $ Value : chr "1,509" "1,541" "1,597" "1,675" ...
head(data)
## Period Value
## 1 1/1/1992 1,509
## 2 2/1/1992 1,541
## 3 3/1/1992 1,597
## 4 4/1/1992 1,675
## 5 5/1/1992 1,822
## 6 6/1/1992 1,775
summary(data)
## Period Value
## Length:351 Length:351
## Class :character Class :character
## Mode :character Mode :character
# convert Value to numeric
data$Value <- str_remove(data$Value, ",")
data$Value <- as.numeric(data$Value)
head(data)
## Period Value
## 1 1/1/1992 1509
## 2 2/1/1992 1541
## 3 3/1/1992 1597
## 4 4/1/1992 1675
## 5 5/1/1992 1822
## 6 6/1/1992 1775
summary(data)
## Period Value
## Length:351 Min. :1501
## Class :character 1st Qu.:2140
## Mode :character Median :2984
## Mean :3115
## 3rd Qu.:3796
## Max. :7366
# Convert Period to Date
data$Period <- as.Date(data$Period, "%m/%d/%Y")
head(data)
## Period Value
## 1 1992-01-01 1509
## 2 1992-02-01 1541
## 3 1992-03-01 1597
## 4 1992-04-01 1675
## 5 1992-05-01 1822
## 6 1992-06-01 1775
data_alec$Period <- as.Date(data_alec$Period,format = "%m/%d/%Y")
data_alec$Value <- as.numeric(gsub(",","",data_alec$Value))
head(data)
## Period Value
## 1 1992-01-01 1509
## 2 1992-02-01 1541
## 3 1992-03-01 1597
## 4 1992-04-01 1675
## 5 1992-05-01 1822
## 6 1992-06-01 1775
head(data_alec)
## Period Value
## 1 1992-01-01 1509
## 2 1992-02-01 1541
## 3 1992-03-01 1597
## 4 1992-04-01 1675
## 5 1992-05-01 1822
## 6 1992-06-01 1775
identical(data, data_alec)
## [1] TRUE
# Check for missings
sum(is.na(data))
## [1] 0
sapply(data, function(x) {sum(is.na(x))})
## Period Value
## 0 0
sapply(data, function(x) {sum(x == "")})
## Period Value
## NA 0
No missings in data or empty strings
ts_plot <- ggplot(data, aes(Period, Value)) +
geom_line() +
xlab("Month") + ylab("Liquor Sales in Millions") +
ggtitle("Monthly Liquor Sales, January 1992 to March 2021") +
scale_x_date(labels = date_format(format= "%Y"),
breaks = date_breaks("2 year")) +
stat_smooth(colour = "orangered", se = F, linetype = 6) +
theme_bw()
ts_plot_alec <- ggplot(data_alec, aes(Period,Value)) + geom_line(na.rm=TRUE) +
xlab("Month") + ylab("Sales in Thousands") +
scale_x_date(labels = date_format(format= "%b-%Y"),breaks = date_breaks("1 year")) +
stat_smooth(colour = "green")
ts_plot
ts_plot_alec
data %>%
mutate(month = month(Period, label = T)) %>%
ggplot(aes(Period, Value, color = factor(month))) +
geom_line(size = 0.8) +
xlab("Year") + ylab("Liquor Sales in Millions") +
labs(title = "Liquor Sales by Month, January 1992 to March 2021",
color = "Month") +
scale_color_viridis_d(option = "D", end = 1) +
scale_x_date(labels = date_format(format= "%Y"),
breaks = date_breaks("2 year")) +
theme_bw()
Definite seasonality, definite trend, nonconstant variance (and therefore likely non stationary)
data_ts <- ts(data[,"Value"])
# only include the values, but doesn't know the interval
head(data_ts)
## Time Series:
## Start = 1
## End = 6
## Frequency = 1
## [1] 1509 1541 1597 1675 1822 1775
# Autocorrelation and Partial Autocorrelation Plots
Acf(data_ts)
Pacf(data_ts)
# Lag plot of Data
gglagplot(data_ts, set.lags = 1:24)
#Ljung-Box test on the first 24 Lag autocorrelations
Box.test(data_ts, lag = 24, fitdf = 0, type = "Lj")
##
## Box-Ljung test
##
## data: data_ts
## X-squared = 4781.4, df = 24, p-value < 2.2e-16
Everything significant in ACF, definite non-stationarity. PACF shows spikes at lag 1-5, 12, 13, 16. Perhaps a 1 year differencing is in order? Well, difference the data first, then look again.
Lag plot shows correlation at lag12 and lag24, definite yearly seasonality.
Ljung-Box v. signif. Data def NOT white noise.
data_ts <- ts(data[,"Value"], frequency = 12, start = 1992)
head(data_ts)
## Jan Feb Mar Apr May Jun
## 1992 1509 1541 1597 1675 1822 1775
data_ts_alec <- ts(data_alec[,"Value"], frequency = 12, start = 1992)
head(data_ts_alec)
## Jan Feb Mar Apr May Jun
## 1992 1509 1541 1597 1675 1822 1775
identical(data_ts, data_ts_alec)
## [1] TRUE
Acf(data_ts)
Pacf(data_ts)
gglagplot(data_ts, set.lags = 1:24)
Again all significant in ACF. PACF shows spikes at 1, 12, & 13. Maybe a seasonal component with a lag-1 component? idk. Based on the lagplot, the winter months seem to be outliers more than the rest? Colors hard to distinguish.
component.tsa <- decompose(data_ts, type = "additive")
plot(component.tsa)
component.tsa$trend
## Jan Feb Mar Apr May Jun Jul Aug
## 1992 NA NA NA NA NA NA 1811.625 1815.500
## 1993 1825.750 1823.792 1817.625 1809.792 1802.542 1797.667 1792.083 1788.333
## 1994 1803.542 1808.208 1816.417 1823.292 1827.958 1836.167 1842.042 1842.167
## 1995 1832.708 1828.875 1829.917 1830.083 1830.750 1833.042 1839.083 1850.542
## 1996 1901.000 1915.542 1921.250 1921.583 1929.292 1931.875 1930.875 1930.708
## 1997 1947.250 1950.625 1955.958 1968.500 1980.167 1995.333 2012.583 2024.542
## 1998 2062.292 2068.542 2075.958 2086.458 2094.167 2106.167 2116.000 2118.500
## 1999 2157.583 2163.917 2168.792 2175.083 2182.375 2196.833 2208.583 2217.833
## 2000 2284.333 2301.250 2322.667 2337.875 2354.917 2371.000 2383.458 2394.500
## 2001 2437.000 2441.833 2444.083 2444.708 2450.792 2461.458 2468.625 2471.208
## 2002 2491.042 2497.792 2500.208 2499.083 2497.208 2493.417 2491.458 2489.125
## 2003 2479.458 2483.958 2493.500 2510.333 2522.792 2532.167 2546.250 2560.625
## 2004 2633.833 2642.917 2647.875 2658.000 2665.875 2676.292 2679.333 2680.292
## 2005 2718.625 2728.042 2742.000 2752.000 2762.167 2783.250 2805.792 2822.792
## 2006 2918.792 2936.542 2955.125 2968.667 2981.833 2995.833 3007.375 3016.917
## 2007 3102.458 3122.167 3135.833 3145.583 3159.500 3171.750 3181.750 3194.625
## 2008 3226.792 3242.417 3253.833 3268.750 3283.458 3289.583 3301.917 3310.792
## 2009 3331.583 3330.125 3331.750 3339.500 3341.333 3346.917 3352.292 3356.458
## 2010 3408.083 3414.625 3421.167 3428.417 3438.417 3447.750 3450.500 3453.750
## 2011 3477.125 3485.667 3497.083 3504.167 3510.000 3523.875 3537.125 3550.667
## 2012 3636.708 3643.750 3650.500 3654.542 3668.458 3687.417 3703.250 3709.833
## 2013 3737.625 3761.167 3777.375 3785.708 3796.250 3802.958 3811.833 3821.792
## 2014 3876.167 3886.792 3896.417 3913.500 3924.625 3937.208 3955.958 3968.000
## 2015 4029.958 4040.125 4049.292 4064.333 4075.375 4088.417 4096.417 4106.708
## 2016 4170.958 4184.708 4204.375 4221.917 4241.458 4263.583 4274.750 4277.000
## 2017 4332.167 4339.292 4348.917 4355.917 4365.583 4378.500 4394.083 4410.750
## 2018 4502.958 4519.958 4533.083 4544.833 4565.667 4584.167 4596.542 4607.583
## 2019 4656.750 4679.750 4694.667 4701.292 4711.792 4725.500 4744.375 4771.750
## 2020 5125.208 5204.750 5284.917 5377.333 5453.542 5527.667 5608.458 5673.000
## 2021 NA NA NA
## Sep Oct Nov Dec
## 1992 1818.375 1823.333 1823.833 1823.458
## 1993 1788.833 1792.167 1795.417 1799.542
## 1994 1842.458 1840.500 1837.125 1836.417
## 1995 1861.500 1870.292 1880.667 1890.875
## 1996 1930.083 1931.542 1936.625 1942.750
## 1997 2032.375 2040.167 2049.583 2055.292
## 1998 2124.000 2133.500 2142.083 2148.583
## 1999 2233.333 2243.208 2250.708 2268.000
## 2000 2403.375 2413.208 2422.042 2431.458
## 2001 2475.208 2479.708 2486.708 2489.167
## 2002 2482.625 2480.125 2481.792 2479.875
## 2003 2573.042 2587.417 2601.125 2614.417
## 2004 2689.833 2698.958 2703.583 2710.417
## 2005 2840.542 2856.125 2875.250 2899.417
## 2006 3030.917 3044.583 3057.958 3081.042
## 2007 3202.417 3207.167 3219.167 3222.208
## 2008 3311.208 3318.208 3325.708 3328.833
## 2009 3370.000 3385.417 3392.917 3398.375
## 2010 3458.042 3461.708 3464.083 3468.958
## 2011 3572.292 3586.750 3601.792 3625.958
## 2012 3712.458 3716.958 3724.667 3728.417
## 2013 3823.542 3830.708 3845.667 3861.083
## 2014 3980.542 3992.625 4003.083 4014.083
## 2015 4123.000 4137.333 4146.417 4155.708
## 2016 4282.458 4294.042 4307.583 4323.792
## 2017 4433.583 4451.417 4464.417 4485.417
## 2018 4613.125 4622.458 4638.042 4645.875
## 2019 4822.000 4879.667 4949.208 5036.708
## 2020 5720.125 NA NA NA
## 2021
component.tsm <- decompose(data_ts, type = "multiplicative")
plot(component.tsm)
plot(component.tsm)
component.tsm$trend
## Jan Feb Mar Apr May Jun Jul Aug
## 1992 NA NA NA NA NA NA 1811.625 1815.500
## 1993 1825.750 1823.792 1817.625 1809.792 1802.542 1797.667 1792.083 1788.333
## 1994 1803.542 1808.208 1816.417 1823.292 1827.958 1836.167 1842.042 1842.167
## 1995 1832.708 1828.875 1829.917 1830.083 1830.750 1833.042 1839.083 1850.542
## 1996 1901.000 1915.542 1921.250 1921.583 1929.292 1931.875 1930.875 1930.708
## 1997 1947.250 1950.625 1955.958 1968.500 1980.167 1995.333 2012.583 2024.542
## 1998 2062.292 2068.542 2075.958 2086.458 2094.167 2106.167 2116.000 2118.500
## 1999 2157.583 2163.917 2168.792 2175.083 2182.375 2196.833 2208.583 2217.833
## 2000 2284.333 2301.250 2322.667 2337.875 2354.917 2371.000 2383.458 2394.500
## 2001 2437.000 2441.833 2444.083 2444.708 2450.792 2461.458 2468.625 2471.208
## 2002 2491.042 2497.792 2500.208 2499.083 2497.208 2493.417 2491.458 2489.125
## 2003 2479.458 2483.958 2493.500 2510.333 2522.792 2532.167 2546.250 2560.625
## 2004 2633.833 2642.917 2647.875 2658.000 2665.875 2676.292 2679.333 2680.292
## 2005 2718.625 2728.042 2742.000 2752.000 2762.167 2783.250 2805.792 2822.792
## 2006 2918.792 2936.542 2955.125 2968.667 2981.833 2995.833 3007.375 3016.917
## 2007 3102.458 3122.167 3135.833 3145.583 3159.500 3171.750 3181.750 3194.625
## 2008 3226.792 3242.417 3253.833 3268.750 3283.458 3289.583 3301.917 3310.792
## 2009 3331.583 3330.125 3331.750 3339.500 3341.333 3346.917 3352.292 3356.458
## 2010 3408.083 3414.625 3421.167 3428.417 3438.417 3447.750 3450.500 3453.750
## 2011 3477.125 3485.667 3497.083 3504.167 3510.000 3523.875 3537.125 3550.667
## 2012 3636.708 3643.750 3650.500 3654.542 3668.458 3687.417 3703.250 3709.833
## 2013 3737.625 3761.167 3777.375 3785.708 3796.250 3802.958 3811.833 3821.792
## 2014 3876.167 3886.792 3896.417 3913.500 3924.625 3937.208 3955.958 3968.000
## 2015 4029.958 4040.125 4049.292 4064.333 4075.375 4088.417 4096.417 4106.708
## 2016 4170.958 4184.708 4204.375 4221.917 4241.458 4263.583 4274.750 4277.000
## 2017 4332.167 4339.292 4348.917 4355.917 4365.583 4378.500 4394.083 4410.750
## 2018 4502.958 4519.958 4533.083 4544.833 4565.667 4584.167 4596.542 4607.583
## 2019 4656.750 4679.750 4694.667 4701.292 4711.792 4725.500 4744.375 4771.750
## 2020 5125.208 5204.750 5284.917 5377.333 5453.542 5527.667 5608.458 5673.000
## 2021 NA NA NA
## Sep Oct Nov Dec
## 1992 1818.375 1823.333 1823.833 1823.458
## 1993 1788.833 1792.167 1795.417 1799.542
## 1994 1842.458 1840.500 1837.125 1836.417
## 1995 1861.500 1870.292 1880.667 1890.875
## 1996 1930.083 1931.542 1936.625 1942.750
## 1997 2032.375 2040.167 2049.583 2055.292
## 1998 2124.000 2133.500 2142.083 2148.583
## 1999 2233.333 2243.208 2250.708 2268.000
## 2000 2403.375 2413.208 2422.042 2431.458
## 2001 2475.208 2479.708 2486.708 2489.167
## 2002 2482.625 2480.125 2481.792 2479.875
## 2003 2573.042 2587.417 2601.125 2614.417
## 2004 2689.833 2698.958 2703.583 2710.417
## 2005 2840.542 2856.125 2875.250 2899.417
## 2006 3030.917 3044.583 3057.958 3081.042
## 2007 3202.417 3207.167 3219.167 3222.208
## 2008 3311.208 3318.208 3325.708 3328.833
## 2009 3370.000 3385.417 3392.917 3398.375
## 2010 3458.042 3461.708 3464.083 3468.958
## 2011 3572.292 3586.750 3601.792 3625.958
## 2012 3712.458 3716.958 3724.667 3728.417
## 2013 3823.542 3830.708 3845.667 3861.083
## 2014 3980.542 3992.625 4003.083 4014.083
## 2015 4123.000 4137.333 4146.417 4155.708
## 2016 4282.458 4294.042 4307.583 4323.792
## 2017 4433.583 4451.417 4464.417 4485.417
## 2018 4613.125 4622.458 4638.042 4645.875
## 2019 4822.000 4879.667 4949.208 5036.708
## 2020 5720.125 NA NA NA
## 2021
#Using the ts() command to create a time series object to pass to tsclean()
# data2 <- data
# data2$Value <- tsclean(ts(data2[,"Value"], frequency = 12))
data$Value <-tsclean(data_ts)
data_alec$Value <-tsclean(data_ts_alec)
#Plot the cleaned data
c_ts_plot <- ggplot(data, aes(Period,Value)) + geom_line(na.rm=TRUE) +
xlab("Month") + ylab("Sales in Thousands") +
scale_x_date(labels = date_format(format= "%b-%Y"),breaks = date_breaks("1 year")) +
stat_smooth(colour="green")
c_ts_plot_alec <- ggplot(data_alec, aes(Period,Value)) + geom_line(na.rm=TRUE) +
xlab("Month") + ylab("Sales in Thousands") +
scale_x_date(labels = date_format(format= "%b-%Y"),breaks = date_breaks("1 year")) +
stat_smooth(colour="green")
c_ts_plot
c_ts_plot_alec
#Compare both cleaned and uncleaned plots
grid.arrange(ts_plot,c_ts_plot,ncol=1, top = textGrob("Uncleaned vs Cleaned Series"))
grid.arrange(ts_plot_alec,c_ts_plot_alec,ncol=1, top = textGrob("Uncleaned vs Cleaned Series"))
ts_test <- ts(na.omit(data$Value), frequency = 12)
data_ts <- ts(data[,"Value"], frequency = 12, start = 1992)
# data2_ts <- ts(data2[,"Value"], frequency = 12, start = 1992)
adf.test(data_ts)
##
## Augmented Dickey-Fuller Test
##
## data: data_ts
## Dickey-Fuller = -2.3704, Lag order = 7, p-value = 0.4205
## alternative hypothesis: stationary
# NOT stationary
# 1 order difference
dfit <- arima(data_ts, order = c(0,1,0))
plot(residuals(dfit))
Acf(residuals(dfit))
Pacf(residuals(dfit))
adf.test(residuals(dfit))
##
## Augmented Dickey-Fuller Test
##
## data: residuals(dfit)
## Dickey-Fuller = -11.035, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
# 1 order seasonal difference
dfit <- arima(data_ts, order = c(0,0,0),
seasonal = list(order = c(0, 1, 0), period = 12))
plot(residuals(dfit))
Acf(residuals(dfit))
Pacf(residuals(dfit))
adf.test(residuals(dfit))
##
## Augmented Dickey-Fuller Test
##
## data: residuals(dfit)
## Dickey-Fuller = -2.7335, Lag order = 7, p-value = 0.2673
## alternative hypothesis: stationary
# tail off in ACF suggests AR, cut off after PACF lag 2 or 3
# 1 order diff, 1 order seasonal difference
dfit <- arima(data_ts, order = c(0,1,0),
seasonal = list(order = c(0, 1, 0), period = 12))
dfit
##
## Call:
## arima(x = data_ts, order = c(0, 1, 0), seasonal = list(order = c(0, 1, 0), period = 12))
##
##
## sigma^2 estimated as 14065: log likelihood = -2093.8, aic = 4189.6
plot(residuals(dfit))
Acf(residuals(dfit))
Pacf(residuals(dfit))
adf.test(residuals(dfit))
##
## Augmented Dickey-Fuller Test
##
## data: residuals(dfit)
## Dickey-Fuller = -8.5543, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
Box.test(residuals(dfit), type = "Lj")
##
## Box-Ljung test
##
## data: residuals(dfit)
## X-squared = 96.874, df = 1, p-value < 2.2e-16
# "strong negative spike at lag-1 suggests MA(1)
# AR(3), 1 order diff, 1 order seasonal difference
dfit <- arima(data_ts, order = c(2,1,1),
seasonal = list(order = c(0, 1, 0), period = 12))
dfit
##
## Call:
## arima(x = data_ts, order = c(2, 1, 1), seasonal = list(order = c(0, 1, 0), period = 12))
##
## Coefficients:
## ar1 ar2 ma1
## -0.9466 -0.4915 0.2346
## s.e. 0.1281 0.0745 0.1486
##
## sigma^2 estimated as 8552: log likelihood = -2010.06, aic = 4028.12
plot(residuals(dfit))
Acf(residuals(dfit))
Pacf(residuals(dfit))
adf.test(residuals(dfit))
##
## Augmented Dickey-Fuller Test
##
## data: residuals(dfit)
## Dickey-Fuller = -7.2469, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
Box.test(residuals(dfit), type = "Lj")
##
## Box-Ljung test
##
## data: residuals(dfit)
## X-squared = 0.027379, df = 1, p-value = 0.8686
# AR(2), 1st diff, 1 order seasonal difference
dfit <- arima(data_ts, order = c(2,1,0),
seasonal = list(order = c(0, 1, 0), period = 12))
dfit
##
## Call:
## arima(x = data_ts, order = c(2, 1, 0), seasonal = list(order = c(0, 1, 0), period = 12))
##
## Coefficients:
## ar1 ar2
## -0.7506 -0.3861
## s.e. 0.0513 0.0513
##
## sigma^2 estimated as 8601: log likelihood = -2011.02, aic = 4028.04
summary(dfit)
##
## Call:
## arima(x = data_ts, order = c(2, 1, 0), seasonal = list(order = c(0, 1, 0), period = 12))
##
## Coefficients:
## ar1 ar2
## -0.7506 -0.3861
## s.e. 0.0513 0.0513
##
## sigma^2 estimated as 8601: log likelihood = -2011.02, aic = 4028.04
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 3.691924 91.01027 68.06409 -0.002241093 2.216752 0.1993787
## ACF1
## Training set 0.01805167
plot(residuals(dfit))
Acf(residuals(dfit))
Pacf(residuals(dfit))
adf.test(residuals(dfit))
##
## Augmented Dickey-Fuller Test
##
## data: residuals(dfit)
## Dickey-Fuller = -7.0702, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
Box.test(residuals(dfit), type = "Lj")
##
## Box-Ljung test
##
## data: residuals(dfit)
## X-squared = 0.11536, df = 1, p-value = 0.7341
dfit_auto <- auto.arima(data_ts, seasonal = TRUE)
dfit_auto
## Series: data_ts
## ARIMA(2,1,2)(0,1,2)[12]
##
## Coefficients:
## ar1 ar2 ma1 ma2 sma1 sma2
## -1.1799 -0.4437 0.4659 -0.2985 -0.3421 -0.1049
## s.e. 0.0816 0.0739 0.0866 0.0807 0.0578 0.0513
##
## sigma^2 estimated as 7649: log likelihood=-1989.36
## AIC=3992.72 AICc=3993.06 BIC=4019.48
summary(dfit_auto)
## Series: data_ts
## ARIMA(2,1,2)(0,1,2)[12]
##
## Coefficients:
## ar1 ar2 ma1 ma2 sma1 sma2
## -1.1799 -0.4437 0.4659 -0.2985 -0.3421 -0.1049
## s.e. 0.0816 0.0739 0.0866 0.0807 0.0578 0.0513
##
## sigma^2 estimated as 7649: log likelihood=-1989.36
## AIC=3992.72 AICc=3993.06 BIC=4019.48
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 5.347325 85.05609 64.10699 0.001067671 2.091984 0.4306553
## ACF1
## Training set -0.009206448
plot(residuals(dfit_auto))
Acf(residuals(dfit_auto))
Pacf(residuals(dfit_auto))
adf.test(residuals(dfit_auto))
##
## Augmented Dickey-Fuller Test
##
## data: residuals(dfit_auto)
## Dickey-Fuller = -6.7121, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
Box.test(residuals(dfit_auto), type = "Lj")
##
## Box-Ljung test
##
## data: residuals(dfit_auto)
## X-squared = 0.030005, df = 1, p-value = 0.8625
coeftest(dfit_auto)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ar1 -1.179931 0.081580 -14.4634 < 2.2e-16 ***
## ar2 -0.443725 0.073889 -6.0053 1.910e-09 ***
## ma1 0.465888 0.086619 5.3786 7.508e-08 ***
## ma2 -0.298507 0.080715 -3.6983 0.0002171 ***
## sma1 -0.342064 0.057752 -5.9230 3.161e-09 ***
## sma2 -0.104850 0.051304 -2.0437 0.0409832 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
dfit_auto2 <- auto.arima(data_ts_alec, seasonal = TRUE)
dfit_auto2
## Series: data_ts_alec
## ARIMA(2,1,1)(0,1,2)[12]
##
## Coefficients:
## ar1 ar2 ma1 sma1 sma2
## -0.9569 -0.4796 0.2595 -0.3489 -0.0899
## s.e. 0.1186 0.0708 0.1320 0.0595 0.0548
##
## sigma^2 estimated as 8912: log likelihood=-2015.54
## AIC=4043.09 AICc=4043.34 BIC=4066.03
summary(dfit_auto2)
## Series: data_ts_alec
## ARIMA(2,1,1)(0,1,2)[12]
##
## Coefficients:
## ar1 ar2 ma1 sma1 sma2
## -0.9569 -0.4796 0.2595 -0.3489 -0.0899
## s.e. 0.1186 0.0708 0.1320 0.0595 0.0548
##
## sigma^2 estimated as 8912: log likelihood=-2015.54
## AIC=4043.09 AICc=4043.34 BIC=4066.03
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 4.047023 91.95075 67.72276 -0.03303985 2.176473 0.4528476
## ACF1
## Training set 0.002130525
plot(residuals(dfit_auto2))
Acf(residuals(dfit_auto2))
Pacf(residuals(dfit_auto2))
adf.test(residuals(dfit_auto2))
##
## Augmented Dickey-Fuller Test
##
## data: residuals(dfit_auto2)
## Dickey-Fuller = -7.0397, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
Box.test(residuals(dfit_auto2), type = "Lj")
##
## Box-Ljung test
##
## data: residuals(dfit_auto2)
## X-squared = 0.0016069, df = 1, p-value = 0.968
coeftest(dfit_auto2)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ar1 -0.956939 0.118594 -8.0690 7.086e-16 ***
## ar2 -0.479635 0.070771 -6.7773 1.224e-11 ***
## ma1 0.259535 0.131998 1.9662 0.04928 *
## sma1 -0.348872 0.059539 -5.8595 4.642e-09 ***
## sma2 -0.089912 0.054751 -1.6422 0.10055
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
dfit <- arima(data_ts, order = c(1,1,3),
seasonal = list(order = c(0, 1, 0), period = 12))
dfit
##
## Call:
## arima(x = data_ts, order = c(1, 1, 3), seasonal = list(order = c(0, 1, 0), period = 12))
##
## Coefficients:
## ar1 ma1 ma2 ma3
## -0.7988 0.1201 -0.5227 0.2563
## s.e. 0.0550 0.0664 0.0493 0.0517
##
## sigma^2 estimated as 8425: log likelihood = -2007.71, aic = 4025.41
#we will forecast data for the last two full years, 2019+ (month = 328 to 351)
fit_predicted <- arima(ts(data_ts[-c(328:351)]), order =c(0,1,1), seasonal = list(order = c(0,1,2), period = 12))
fit_predicted2 <- arima(ts(data_ts[-c(328:351)]), order =c(2,1,1), seasonal = list(order = c(0,1,2), period = 12))
fit_predicted
##
## Call:
## arima(x = ts(data_ts[-c(328:351)]), order = c(0, 1, 1), seasonal = list(order = c(0,
## 1, 2), period = 12))
##
## Coefficients:
## ma1 sma1 sma2
## -0.8005 -0.2735 -0.2012
## s.e. 0.0400 0.0552 0.0472
##
## sigma^2 estimated as 6358: log likelihood = -1822.25, aic = 3652.49
fit_predicted2
##
## Call:
## arima(x = ts(data_ts[-c(328:351)]), order = c(2, 1, 1), seasonal = list(order = c(0,
## 1, 2), period = 12))
##
## Coefficients:
## ar1 ar2 ma1 sma1 sma2
## -0.8279 -0.4655 -0.0045 -0.2638 -0.1256
## s.e. 0.2051 0.1240 0.2534 0.0577 0.0532
##
## sigma^2 estimated as 6172: log likelihood = -1817.01, aic = 3646.03
# ARIMA(2,1,1)(0,1,2)[12] WRONG!!!
# ARIMA(0,1,1)(0,1,2)[12]
#use the model to forecast values for last 24 months.
#Specify forecast horizon h periods ahead of prediction to be made
#and use the fitted model to generate those predictions
forecast_pred <- forecast(fit_predicted, h = 24)
plot(forecast_pred, main = "Forecasting 2019-2021")
lines(ts(data_ts))
plot(forecast_pred, main = "Plotting only 2018 and predicted 2019-2021"
, xlim = c(313,351)
, ylim = c(1500,6500)
)
lines(ts(data_ts))
autoplot(forecast_pred) +
xlab("Year") + ylab("Sales in Thousands") +
labs(title = "Forecasting Previous 24 Months -- Apr 2019 to Mar 2021") +
scale_x_continuous(breaks = seq(1, 351, by = 24),
labels = seq(1992, 2021, by = 2)) +
theme_bw()
actual2018 <- ts(data_ts)[313:351]
autoplot(forecast_pred) +
geom_line(aes(x = 313:351, y = actual2018)) +
xlab("Year") + ylab("Sales in Thousands") +
labs(title = "Forecasting Previous 24 Months -- Apr 2019 to Mar 2021",
subtitle = "Zoomed in: X-axis limited to 2018-2021") +
coord_cartesian(xlim = c(313,351)) +
scale_x_continuous(breaks = seq(313, 351, by = 6),
labels = c("Jan 2018", "Jul 2018",
"Jan 2019", "Jul 2019",
"Jan 2020", "Jul 2020",
"Jan 2021")) +
theme_bw()
#8. Forecasting
#Next step is to forecast the sales for another 24 months ahead of time.
f_values <-forecast(dfit_auto2, h=24)
plot(f_values, main = "Forecasting for Next Months -- Apr 2021 to Mar 2023")
autoplot(f_values) +
xlab("Year") + ylab("Sales in Thousands") +
labs(title = "Forecasting for Next Months -- Apr 2021 to Mar 2023") +
scale_x_continuous(breaks = seq(1, 32, by = 2),
labels = seq(1992, 2022, by = 2)) +
theme_bw()
Alcohol sales much higher than predicted in 2020 (Covid)