library(ggplot2)
library(forecast)
library(grid)
library(Amelia)
library(tseries)
library(scales)
library(gridExtra)
library(lmtest)
library(Rcpp)
library(dplyr)
# The dataset (usliquorsales.csv) contains monthly, non-seasonally-adjusted sales of beer,
# wine and liquor across retail liquor stores in the US from January 1992 through March 2021.
# The source for the data is the US Census Bureau's Retail Trade and Food Services report.
# Period: Month and Year
# Value: Sales in Million Dollars
liquor <- read.csv("C:/Users/willi/Desktop/Georgetown/RStudio Datasource/usliquorsales.csv")
#remove na and blank rows
liquor <- liquor[!apply(is.na(liquor) | liquor == "", 1, all),]
#parse Period (e.g. "Jan-1992") into a proper Date column (PeriodDate), using the first day of each month
liquor$PeriodDate <- as.Date(paste(liquor$Period,"-01",sep=""),"%b-%Y-%d")
# change value to numeric
liquor$Value <- gsub(',', '', liquor$Value)
liquor$Value <- as.numeric(liquor$Value)
glimpse(liquor)
## Rows: 351
## Columns: 3
## $ Period <chr> "Jan-1992", "Feb-1992", "Mar-1992", "Apr-1992", "May-1992",~
## $ Value <dbl> 1509, 1541, 1597, 1675, 1822, 1775, 1912, 1862, 1770, 1882,~
## $ PeriodDate <date> 1992-01-01, 1992-02-01, 1992-03-01, 1992-04-01, 1992-05-01~
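#Note: the "%b" format above relies on an English locale for month abbreviations.
#A locale-independent alternative, sketched here under the assumption that every Period value
#is "Mon-YYYY" with a three-letter English abbreviation (period_alt is a scratch name):
period_alt <- as.Date(sprintf("%s-%02d-01",
                              substr(liquor$Period, 5, 8),
                              match(substr(liquor$Period, 1, 3), month.abb)))
stopifnot(all(period_alt == liquor$PeriodDate)) #sanity check against the parse above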
# plot
ts_plot <- ggplot(liquor, aes(PeriodDate,Value)) + geom_line(na.rm=TRUE) +
xlab("Month") + ylab("Liquor Sales in Millions") +
scale_x_date(breaks = date_breaks("1 year"),labels = date_format(format= "%b-%Y")) +
stat_smooth(colour = "blue") +
theme(axis.text.x = element_text(angle = 45)) +
ggtitle("Liquor Sales over Time") +
theme(plot.title = element_text(hjust = 0.5))
ts_plot

#convert to time series object
liquor_ts <-ts(liquor[,c('Value')])
class(liquor_ts)
## [1] "ts"
# check for stationarity (the ADF test's null hypothesis is that the series is non-stationary)
adf.test(liquor_ts)
##
## Augmented Dickey-Fuller Test
##
## data: liquor_ts
## Dickey-Fuller = -2.4723, Lag order = 7, p-value = 0.3775
## alternative hypothesis: stationary
#With p = 0.3775 we fail to reject the null hypothesis, so the series is non-stationary.
#Plot the time series with a trend line
plot(liquor_ts, col = "blue", main = "Liquor Sales Time Series Data")
abline(reg=lm(liquor_ts~time(liquor_ts)), col="lightgray") #plotting the trend line

#Autocorrelation and Partial Autocorrelation Plots
Acf(liquor_ts)

Pacf(liquor_ts)

#Lag plot of Data
gglagplot(liquor_ts, set.lags=1:16)

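#A compact optional alternative: forecast's ggtsdisplay() shows the series, ACF and PACF in one figure
ggtsdisplay(liquor_ts)
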
Box.test(liquor_ts, lag=24, fitdf=0, type="Ljung-Box")
##
## Box-Ljung test
##
## data: liquor_ts
## X-squared = 4781.4, df = 24, p-value < 2.2e-16
#The very small p-value confirms strong autocorrelation in the series.

###DECOMPOSING THE TIME SERIES (additive)
#Converting the data into a monthly time series object (frequency = 12)
liquor_tsd <- ts(liquor[,c('Value')], frequency=12)
class(liquor_tsd)
## [1] "ts"
component.ts = decompose(liquor_tsd)
plot(component.ts)

#Or use the following
component.tsa = decompose(liquor_tsd, type="additive", filter=NULL)
plot(component.tsa)

##EXTRA CODE
#For multiplicative decomposition use the following code
component.tsm2 = decompose(liquor_tsd, type="multiplicative", filter=NULL)
plot(component.tsm2)

#OR
component.tsm2 = decompose(log(liquor_tsd))
plot(component.tsm2)

#We can also use STL (Seasonal and Trend decomposition using LOESS)
#LOESS is a method for estimating nonlinear relationships
#Advantages of STL: it handles any type of seasonality, the seasonal component may change over time,
#the smoothness of the trend can be controlled by the user, and it can be made robust to outliers
liquor_tsd %>%
stl(t.window=12, s.window="periodic", robust=TRUE) %>%
autoplot()

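#If we want to work with the seasonally adjusted series, seasadj() from forecast can extract it
#from an STL fit (liquor_stl is a new name introduced here for illustration):
liquor_stl <- stl(liquor_tsd, t.window=12, s.window="periodic", robust=TRUE)
autoplot(seasadj(liquor_stl)) + ggtitle("Seasonally Adjusted Liquor Sales (STL)")
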
#tsclean() identifies and replaces outliers using series smoothing and decomposition.
#tsclean() can also impute missing values in the series if there are any
#We are using the ts() command to create a time series object to pass to tsclean()
liquor$csales <-tsclean(liquor_ts)
#Plot the cleaned data
c_ts_plot <- ggplot(liquor, aes(PeriodDate,csales)) + geom_line(na.rm=TRUE) +
xlab("Month") + ylab("Liquor Sales in Miillions") +
scale_x_date(labels = date_format(format= "%b-%Y"),breaks = date_breaks("1 year")) +
stat_smooth(colour="green") +
theme(axis.text.x = element_text(angle = 45))
c_ts_plot

#Let's compare the uncleaned and cleaned plots
grid.arrange(ts_plot,c_ts_plot,ncol=1, top = textGrob("Uncleaned vs Cleaned Series"))

#Using the cleaned (smoothed) series, look at the decomposed time series again
my_ts <- ts(na.omit(liquor$csales), frequency = 12)
plot(my_ts)

component.ts2 = decompose(my_ts)
plot(component.ts2)

#3. Naive Forecasting Method (for the next 24 months after observed data)
naive_forecast <-naive(liquor_ts, 24)
summary(naive_forecast)
##
## Forecast method: Naive method
##
## Model Information:
## Call: naive(y = liquor_ts, h = 24)
##
## Residual sd: 601.3411
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 12.06 601.3411 345.8257 -1.189119 11.13808 1 -0.4077653
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 352 5730 4959.350 6500.650 4551.39318 6908.607
## 353 5730 4640.137 6819.863 4063.19825 7396.802
## 354 5730 4395.196 7064.804 3688.59310 7771.407
## 355 5730 4188.701 7271.299 3372.78636 8087.214
## 356 5730 4006.775 7453.225 3094.55503 8365.445
## 357 5730 3842.302 7617.698 2843.01468 8616.985
## 358 5730 3691.053 7768.947 2611.69946 8848.301
## 359 5730 3550.274 7909.726 2396.39650 9063.604
## 360 5730 3418.051 8041.949 2194.17954 9265.820
## 361 5730 3292.992 8167.008 2002.91798 9457.082
## 362 5730 3174.045 8285.955 1821.00340 9638.997
## 363 5730 3060.392 8399.608 1647.18621 9812.814
## 364 5730 2951.383 8508.617 1480.47267 9979.527
## 365 5730 2846.493 8613.507 1320.05708 10139.943
## 366 5730 2745.287 8714.713 1165.27541 10294.725
## 367 5730 2647.402 8812.598 1015.57271 10444.427
## 368 5730 2552.530 8907.470 870.47958 10589.520
## 369 5730 2460.411 8999.589 729.59474 10730.405
## 370 5730 2370.816 9089.184 592.57197 10867.428
## 371 5730 2283.550 9176.450 459.11006 11000.890
## 372 5730 2198.440 9261.560 328.94503 11131.055
## 373 5730 2115.333 9344.667 201.84399 11258.156
## 374 5730 2034.094 9425.906 77.60025 11382.400
## 375 5730 1954.604 9505.396 -43.97064 11503.971
autoplot(naive_forecast)

#Check for fitted values and residuals
checkresiduals(naive_forecast)

##
## Ljung-Box test
##
## data: Residuals from Naive method
## Q* = 99.964, df = 10, p-value < 2.2e-16
##
## Model df: 0. Total lags used: 10
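#Given the strong seasonality, a seasonal naive benchmark is arguably a fairer baseline than naive().
#A sketch on the monthly-frequency cleaned series (snaive repeats each month's value from a year earlier):
snaive_forecast <- snaive(my_ts, h=24)
autoplot(snaive_forecast)
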
#4. Smoothing the Series to uncover patterns in data
#4.1 Moving Averages
#MA of order 5 (odd orders are generally used so the average is centred on an observation)
liquor_ma <- ma(liquor_ts, 5)
autoplot(liquor_ts, series="Data") +
autolayer(liquor_ma, series="5-MA") +
xlab("Year") + ylab("Sales") +
ggtitle("Liquor Sales Moving Average - 5 months") +
scale_colour_manual(values=c("Data"="grey50","5-MA"="red"),
breaks=c("Data","5-MA"))

#MA of order 3
autoplot(liquor_ts, series="Data") +
autolayer(ma(liquor_ts,3), series="3-MA") +
xlab("Year") + ylab("Sales") +
ggtitle("Liquor Sales Moving Average - 3 months") +
scale_colour_manual(values=c("Data"="grey50","3-MA"="red"),
breaks=c("Data","3-MA"))

#MA of order 9
autoplot(liquor_ts, series="Data") +
autolayer(ma(liquor_ts,9), series="9-MA") +
xlab("Year") + ylab("Sales") +
ggtitle("Liquor Sales Moving Average - 9 months") +
scale_colour_manual(values=c("Data"="grey50","9-MA"="red"),
breaks=c("Data","9-MA"))

#Moving average of moving averages (used with even-order moving averages to make them symmetric)
#A 2x4 moving average
autoplot(liquor_ts, series = "Data") +
autolayer(ma(liquor_ts, order = 4, centre = TRUE), series = "2x4-MA") +
labs(x = "Year", y = "Sales") +
ggtitle("2x4 moving average of liquor sales")

#Removing seasonal effects (here, a 12-month seasonal variation)
autoplot(liquor_ts, series = "Data") +
autolayer(ma(liquor_ts, 12), series = "12-MA") +
labs(x = "Year", y = "Sales") +
ggtitle("12-month moving average of liquor sales") +
scale_colour_manual(values=c("Data"="grey50","12-MA"="red"),
breaks=c("Data","12-MA"))

#4.2 Exponential Smoothing Models
#Simple exponential smoothing is appropriate only for series without trend or seasonality
liquor_ses <-ses(liquor_ts, alpha=0.2, h=24)
autoplot(liquor_ses)

#We can remove the trend simply by differencing the data
liquor_dif <-diff(liquor_ts)
autoplot(liquor_dif)

#Once we've differenced, the trend is effectively removed from the data and we can reapply the SES model
liquor_ses2 <-ses(liquor_dif, alpha=0.2, h=24)
autoplot(liquor_ses2)
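
#Rather than differencing and then applying SES, Holt-Winters smoothing handles trend and
#seasonality directly; a sketch on the monthly cleaned series, assuming additive seasonality:
liquor_hw <- hw(my_ts, seasonal="additive", h=24)
autoplot(liquor_hw)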

#5. Making the series stationary (identifying the level of differencing required)
#We need to remove the trend with an appropriate order of differencing to make the series stationary.
#We do this by looking at the ACF, the Dickey-Fuller test, and the standard deviation.
#DICKEY-FULLER TEST
#(We test whether rho - 1 is significantly different from zero;
#if the null hypothesis is rejected, the series is stationary.)
#First, check whether the series is still non-stationary using the augmented DF test
adf.test(my_ts)
##
## Augmented Dickey-Fuller Test
##
## data: my_ts
## Dickey-Fuller = -3.4303, Lag order = 7, p-value = 0.04959
## alternative hypothesis: stationary
#The p-value here (about 0.0496) is borderline at the 5% level, so differencing is still warranted.
#To make the series stationary, we need to know the level of differencing required.
#Look at the ACF (autocorrelation plot) of the series to identify the order of differencing needed
Acf(my_ts)

Pacf(my_ts)
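
#forecast also offers unit-root-based estimates of the differencing orders required:
ndiffs(my_ts)   #suggested number of first differences
nsdiffs(my_ts)  #suggested number of seasonal differences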

#6. Forecasting with ARIMA Model
#using differencing: let's try an order-1 difference
#We will fit ARIMA(0,d,0)(0,D,0)[12] models
#and check the residual ACF to find which 'd' or 'D' order of differencing is appropriate in our case.
#Applying only one order of non-seasonal difference, i.e., ARIMA(0,1,0)(0,0,0)[12]
dfit1 <-arima(my_ts, order=c(0,1,0))
plot(residuals(dfit1))

Acf(residuals(dfit1))

Pacf(residuals(dfit1))

#Because the seasonal pattern is strong and stable,
#we will want to use an order of seasonal differencing in the model.
#Before that, let's try just one seasonal difference, i.e., ARIMA(0,0,0)(0,1,0)[12]
dfit2 <- arima(my_ts, order =c(0,0,0), seasonal = list(order = c(0,1,0), period = 12))
plot(residuals(dfit2))

Acf(residuals(dfit2))

Pacf(residuals(dfit2))

#Let's apply both seasonal and non-seasonal differencing, i.e., ARIMA(0,1,0)(0,1,0)[12]
dfit3 <- arima(my_ts, order =c(0,1,0), seasonal = list(order = c(0,1,0), period = 12))
plot(residuals(dfit3))

Acf(residuals(dfit3))

Pacf(residuals(dfit3))

#Since the lag-1 ACF is negative and most of the formerly positive correlations are now negative
#(signs that the series is overdifferenced), we should add an MA term to the model. To decide
#what order of MA we need, check the standard deviation of the models (sd = RMSE)
summary(dfit1)
##
## Call:
## arima(x = my_ts, order = c(0, 1, 0))
##
##
## sigma^2 estimated as 84825: log likelihood = -2482.59, aic = 4967.18
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 12.02994 290.8322 202.5769 -0.2134874 7.154548 0.9971722
## ACF1
## Training set -0.2321729
summary(dfit2)
##
## Call:
## arima(x = my_ts, order = c(0, 0, 0), seasonal = list(order = c(0, 1, 0), period = 12))
##
##
## sigma^2 estimated as 59182: log likelihood = -2343.55, aic = 4689.1
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 129.0376 239.0782 150.0148 3.658188 4.602501 0.7384382 0.5799081
summary(dfit3)
##
## Call:
## arima(x = my_ts, order = c(0, 1, 0), seasonal = list(order = c(0, 1, 0), period = 12))
##
##
## sigma^2 estimated as 34897: log likelihood = -2247.37, aic = 4496.74
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 1.064112 183.3162 103.3961 -0.06062248 3.60127 0.5089605
## ACF1
## Training set -0.4610254
#Differencing any further would overdifference the series, so we stop here.
#Of the models above, dfit3, i.e., ARIMA(0,1,0)(0,1,0)[12], has the lowest standard deviation (RMSE) and AIC,
#so it gives the correct order of differencing.
#Now, we need to identify AR/MA and SAR/SMA values and fit the model
dfit4 <- arima(my_ts, order =c(0,1,1), seasonal = list(order = c(0,1,0), period = 12))
plot(residuals(dfit4))

Acf(residuals(dfit4))

Pacf(residuals(dfit4))

#Add an order-one MA component to the seasonal part and see what we get
dfit5 <- arima(my_ts, order =c(0,1,0), seasonal = list(order = c(0,1,1), period = 12))
plot(residuals(dfit5))

Acf(residuals(dfit5))

Pacf(residuals(dfit5))

#Combine an MA component in the non-seasonal part with one in the seasonal part
dfit6 <- arima(my_ts, order =c(0,1,1), seasonal = list(order = c(0,1,1), period = 12))
plot(residuals(dfit6))

Acf(residuals(dfit6))

Pacf(residuals(dfit6))

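#As a quick cross-check, stats::AIC accepts several fitted models at once:
AIC(dfit4, dfit5, dfit6)
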
#Given the statistically significant MA coefficients and low AIC, the model seems a good fit
summary(dfit4)
##
## Call:
## arima(x = my_ts, order = c(0, 1, 1), seasonal = list(order = c(0, 1, 0), period = 12))
##
## Coefficients:
## ma1
## -0.7035
## s.e. 0.0422
##
## sigma^2 estimated as 23394: log likelihood = -2180.12, aic = 4364.24
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 6.181045 150.0912 86.47321 0.02136486 2.961413 0.4256589
## ACF1
## Training set -0.004405172
summary(dfit5)
##
## Call:
## arima(x = my_ts, order = c(0, 1, 0), seasonal = list(order = c(0, 1, 1), period = 12))
##
## Coefficients:
## sma1
## -0.6258
## s.e. 0.0439
##
## sigma^2 estimated as 28059: log likelihood = -2213.49, aic = 4430.99
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 2.701396 164.3783 106.5518 -0.02086988 3.725383 0.5244943
## ACF1
## Training set -0.4858186
summary(dfit6)
##
## Call:
## arima(x = my_ts, order = c(0, 1, 1), seasonal = list(order = c(0, 1, 1), period = 12))
##
## Coefficients:
## ma1 sma1
## -0.6819 -0.599
## s.e. 0.0401 0.045
##
## sigma^2 estimated as 18762: log likelihood = -2145.48, aic = 4296.96
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 8.838467 134.4146 85.18305 0.08820039 2.92143 0.4193082
## ACF1
## Training set -0.02979656
#The coeftest() function in the lmtest package gives us the p-values of the coefficients.
coeftest(dfit6)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ma1 -0.681877 0.040130 -16.992 < 2.2e-16 ***
## sma1 -0.599038 0.044969 -13.321 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Check Minimum AIC and Iterate
#We use the auto.arima() function to let R build the model with the lowest AIC;
#this function searches through combinations of order parameters and returns the best set.
#By default it considers a maximum combined order of 5.
dfit7 <- auto.arima(my_ts, seasonal = TRUE)
plot(residuals(dfit7))

Acf(residuals(dfit7))

Pacf(residuals(dfit7))

summary(dfit7)
## Series: my_ts
## ARIMA(0,1,1)(0,1,2)[12]
##
## Coefficients:
## ma1 sma1 sma2
## -0.6728 -0.5073 -0.1067
## s.e. 0.0413 0.0558 0.0478
##
## sigma^2 estimated as 18691: log likelihood=-2143.12
## AIC=4294.24 AICc=4294.36 BIC=4309.54
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 8.374599 133.5625 83.23823 0.08038519 2.859069 0.5361179
## ACF1
## Training set -0.02350809
coeftest(dfit7)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ma1 -0.672814 0.041318 -16.2837 < 2e-16 ***
## sma1 -0.507288 0.055836 -9.0853 < 2e-16 ***
## sma2 -0.106710 0.047786 -2.2331 0.02554 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
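#As with the naive model earlier, checkresiduals() runs a Ljung-Box test on the auto.arima residuals
checkresiduals(dfit7)
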
#7. Model Validation (holdout method)
#we will hold out the last two years of observed data (months 328 to 351) and forecast them
hold <- window(ts(my_ts), start = 328)
fit_predicted <- arima(ts(my_ts[-c(328:351)]), order = c(0,1,1), seasonal = list(order = c(0,1,1), period = 12))
#use the model to forecast values for the last 24 months:
#specify the forecast horizon h periods ahead and use the fitted model
#to generate those predictions
forecast_pred <- forecast(fit_predicted,h=24)
plot(forecast_pred, main="Liquor Sales Forecast", xlab="Months since Jan 1992", ylab="Sales in Millions")
lines(ts(my_ts))
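
#Score the forecasts against the held-out observations to quantify holdout accuracy
accuracy(forecast_pred, hold)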

#8. Forecasting
#The next step is to forecast sales for the 24 months following the observed data.
f_values <-forecast(dfit7, h=24)
plot(f_values, main="")

f_values
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 30 5567.796 5392.589 5743.003 5299.840 5835.752
## May 30 6295.510 6111.163 6479.857 6013.576 6577.444
## Jun 30 6199.472 6006.418 6392.526 5904.221 6494.723
## Jul 30 6440.115 6238.729 6641.500 6132.122 6748.107
## Aug 30 6253.527 6044.141 6462.913 5933.299 6573.755
## Sep 30 5984.442 5767.350 6201.533 5652.429 6316.454
## Oct 30 6142.549 5918.017 6367.081 5799.156 6485.941
## Nov 30 6301.949 6070.214 6533.683 5947.541 6656.356
## Dec 30 5912.535 5673.815 6151.255 5547.444 6277.625
## Jan 31 5522.513 5277.006 5768.019 5147.043 5897.982
## Feb 31 5486.830 5234.720 5738.940 5101.261 5872.399
## Mar 31 6171.218 5912.673 6429.764 5775.807 6566.629
## Apr 31 6014.515 5718.742 6310.288 5562.170 6466.861
## May 31 6683.410 6375.508 6991.313 6212.514 7154.306
## Jun 31 6600.650 6281.078 6920.222 6111.906 7089.393
## Jul 31 6825.910 6495.080 7156.740 6319.949 7331.871
## Aug 31 6664.767 6323.050 7006.485 6142.156 7187.379
## Sep 31 6380.052 6027.783 6732.320 5841.304 6918.800
## Oct 31 6535.239 6172.727 6897.752 5980.824 7089.654
## Nov 31 6724.307 6351.832 7096.782 6154.655 7293.958
## Dec 31 6327.570 5945.392 6709.748 5743.079 6912.060
## Jan 32 5930.127 5538.487 6321.767 5331.165 6529.089
## Feb 32 5914.751 5513.871 6315.631 5301.659 6527.844
## Mar 32 6592.784 6182.873 7002.694 5965.880 7219.688
### For comparison: fit a white-noise model, ARIMA(0,0,0) with a constant, to the raw (non-stationary) series
wn <- arima(liquor_ts, order=c(0,0,0))
summary(wn)
##
## Call:
## arima(x = liquor_ts, order = c(0, 0, 0))
##
## Coefficients:
## intercept
## 3114.897
## s.e. 59.830
##
## sigma^2 estimated as 1256451: log likelihood = -2962.73, aic = 5929.47
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -1.945083e-12 1120.915 926.836 -13.29419 33.52151 2.680067
## ACF1
## Training set 0.8458309
checkresiduals(wn)

##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,0) with non-zero mean
## Q* = 2243.5, df = 9, p-value < 2.2e-16
##
## Model df: 1. Total lags used: 10
f_values <-forecast(wn, h=24)
plot(f_values, main="")

#auto.arima on the raw series (liquor_ts has frequency 1, so despite seasonal = TRUE no seasonal terms can be fitted)
dfit10 <- auto.arima(liquor_ts, seasonal = TRUE)
f_values <-forecast(dfit10, h=24)
plot(f_values, main="")

summary(dfit10)
## Series: liquor_ts
## ARIMA(2,1,2) with drift
##
## Coefficients:
## ar1 ar2 ma1 ma2 drift
## 0.8455 -0.2204 -1.8259 0.8871 12.8542
## s.e. 0.0646 0.0643 0.0434 0.0391 3.7062
##
## sigma^2 estimated as 182490: log likelihood=-2615.67
## AIC=5243.34 AICc=5243.58 BIC=5266.49
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 2.669031 423.5212 265.6343 -1.754215 8.20172 0.7681162
## ACF1
## Training set -0.0004498032
Acf(residuals(dfit10), main="Autocorrelation")

Pacf(residuals(dfit10), main="Partial Autocorrelation")

plot(residuals(dfit10), main="Residuals",ylab='residuals')

coeftest(dfit10)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ar1 0.845534 0.064607 13.0874 < 2.2e-16 ***
## ar2 -0.220374 0.064342 -3.4250 0.0006147 ***
## ma1 -1.825898 0.043368 -42.1028 < 2.2e-16 ***
## ma2 0.887067 0.039145 22.6608 < 2.2e-16 ***
## drift 12.854153 3.706232 3.4683 0.0005239 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Hold out the last 24 months (328 to 351) and refit with the chosen orders plus seasonal terms
fit_predicted <- arima(ts(liquor_ts[-c(328:351)]), order = c(2,1,2), seasonal = list(order = c(2,1,2), period = 12))
forecast_pred <- forecast(fit_predicted,h=24)
plot(forecast_pred, main="Liquor Sales Forecast", xlab="Months since Jan 1992", ylab="Sales in Millions")
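#Overlay the observed series for comparison, as in the earlier validation plot
lines(ts(liquor_ts))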

checkresiduals(dfit10)

##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,1,2) with drift
## Q* = 20.032, df = 5, p-value = 0.001233
##
## Model df: 5. Total lags used: 10