library(ggplot2)
library(forecast)
library(grid)
library(Amelia)
library(tseries)
library(scales)
library(gridExtra)
library(lmtest)
library(Rcpp)
library(Amelia)
library(dplyr)
# The dataset (usliquorsales.csv) contains monthly data of non-seasonally adjusted sales of beer, 
# wine and liquor across retail liquor stores in the US from 1992 to 2021 (March). The source for 
# the data is the US Census Bureau’s Retail Trade and Food Services. 

# Period: Month and Year
# Value: Sales in Million Dollars

# 10 pages, 5 pages appendix

liquor <- read.csv("C:/Users/willi/Desktop/Georgetown/RStudio Datasource/usliquorsales.csv")

#remove na and blank rows
liquor <- liquor[!apply(is.na(liquor) | liquor == "", 1, all),]

#change date to date (as column PeriodDate)
liquor$PeriodDate <- as.Date(paste(liquor$Period,"-01",sep=""),"%b-%Y-%d")

# change value to numeric 
liquor$Value <- gsub(',', '', liquor$Value)
liquor$Value <- as.numeric(liquor$Value)

glimpse(liquor)
## Rows: 351
## Columns: 3
## $ Period     <chr> "Jan-1992", "Feb-1992", "Mar-1992", "Apr-1992", "May-1992",~
## $ Value      <dbl> 1509, 1541, 1597, 1675, 1822, 1775, 1912, 1862, 1770, 1882,~
## $ PeriodDate <date> 1992-01-01, 1992-02-01, 1992-03-01, 1992-04-01, 1992-05-01~
# plot
ts_plot <- ggplot(liquor, aes(PeriodDate,Value)) + geom_line(na.rm=TRUE) + 
  xlab("Month") + ylab("Liquor Sales in Millions") + 
  scale_x_date(breaks = date_breaks("1 year"),labels = date_format(format= "%b-%Y")) +
  stat_smooth(colour = "blue") +
  theme(axis.text.x = element_text(angle = 45)) +
  ggtitle("Liquor Sales over Time") +
  theme(plot.title = element_text(hjust = 0.5))

ts_plot

#convert to time series object
liquor_ts <-ts(liquor[,c('Value')])
class(liquor_ts)
## [1] "ts"
# check for stationary 
adf.test(liquor_ts)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  liquor_ts
## Dickey-Fuller = -2.4723, Lag order = 7, p-value = 0.3775
## alternative hypothesis: stationary
#Plot time series with trend line
plot(liquor_ts, col = "blue", main = "Liquor Sales Time Series Data")
abline(reg=lm(liquor_ts~time(liquor_ts)), col="lightgray") #plotting the trend line

#Autocorrelation and Partial Autocorrelation Plots
Acf(liquor_ts)

Pacf(liquor_ts)

#Lag plot of Data
gglagplot(liquor_ts, set.lags=1:16)

Box.test(liquor_ts, lag=24, fitdf=0, type="Lj")
## 
##  Box-Ljung test
## 
## data:  liquor_ts
## X-squared = 4781.4, df = 24, p-value < 2.2e-16
###DECOMPOSING THE TIME SERIES (additive)
#Converting data into a time series object by year
liquor_tsd <-ts(liquor[,c('Value')], frequency=12)
class(liquor_tsd)
## [1] "ts"
component.ts = decompose(liquor_tsd)
plot(component.ts)

#Or use the following
component.tsa = decompose(liquor_tsd, type="additive", filter=NULL)
plot(component.tsa)

##EXTRA CODE
#For multiplicative decomposition use the following code
component.tsm2 = decompose(liquor_tsd, type="multiplicative", filter=NULL)
plot(component.tsm2)

#OR

component.tsm2 = decompose(log(liquor_tsd))
plot(component.tsm2)

#We can also use STL (Seasonal and Trend Decomposition using LOESS)
#LOESS is a method for estimating nonlinear relationships
#Advantage of LOESS: handle any type of seasonality, seasonal component changes over time, 
#smoothness of trend can be controlled by user and can be robust to outliers

liquor_tsd %>%
  stl(t.window=12, s.window="periodic", robust=TRUE) %>%
  autoplot()

#tsclean() identifies and replaces outliers using series smoothing and decomposition.
#tsclean() can also impute missing values in the series if there are any
#We are using the ts() command to create a time series object to pass to tsclean()
liquor$csales <-tsclean(liquor_ts)

##### NEED TO FIX THIS 

#Plot the cleaned data
c_ts_plot <- ggplot(liquor, aes(PeriodDate,csales)) + geom_line(na.rm=TRUE) + 
  xlab("Month") + ylab("Liquor Sales in Miillions") + 
  scale_x_date(labels = date_format(format= "%b-%Y"),breaks = date_breaks("1 year")) + 
  stat_smooth(colour="green") +
  theme(axis.text.x = element_text(angle = 45))
c_ts_plot

#Lets compare both cleaned and uncleaned plots
grid.arrange(ts_plot,c_ts_plot,ncol=1, top = textGrob("Uncleaned vs Cleaned Series"))

#Smoothing the time series and looking at the decomposed time series again
my_ts <- ts(na.omit(liquor$csales), frequency = 12)
plot(my_ts)

component.ts2 = decompose(my_ts)
plot(component.ts2)

#3. Naive Forecasting Method (for the next 24 months after observed data)
naive_forecast <-naive(liquor_ts, 24)
summary(naive_forecast)
## 
## Forecast method: Naive method
## 
## Model Information:
## Call: naive(y = liquor_ts, h = 24) 
## 
## Residual sd: 601.3411 
## 
## Error measures:
##                 ME     RMSE      MAE       MPE     MAPE MASE       ACF1
## Training set 12.06 601.3411 345.8257 -1.189119 11.13808    1 -0.4077653
## 
## Forecasts:
##     Point Forecast    Lo 80    Hi 80      Lo 95     Hi 95
## 352           5730 4959.350 6500.650 4551.39318  6908.607
## 353           5730 4640.137 6819.863 4063.19825  7396.802
## 354           5730 4395.196 7064.804 3688.59310  7771.407
## 355           5730 4188.701 7271.299 3372.78636  8087.214
## 356           5730 4006.775 7453.225 3094.55503  8365.445
## 357           5730 3842.302 7617.698 2843.01468  8616.985
## 358           5730 3691.053 7768.947 2611.69946  8848.301
## 359           5730 3550.274 7909.726 2396.39650  9063.604
## 360           5730 3418.051 8041.949 2194.17954  9265.820
## 361           5730 3292.992 8167.008 2002.91798  9457.082
## 362           5730 3174.045 8285.955 1821.00340  9638.997
## 363           5730 3060.392 8399.608 1647.18621  9812.814
## 364           5730 2951.383 8508.617 1480.47267  9979.527
## 365           5730 2846.493 8613.507 1320.05708 10139.943
## 366           5730 2745.287 8714.713 1165.27541 10294.725
## 367           5730 2647.402 8812.598 1015.57271 10444.427
## 368           5730 2552.530 8907.470  870.47958 10589.520
## 369           5730 2460.411 8999.589  729.59474 10730.405
## 370           5730 2370.816 9089.184  592.57197 10867.428
## 371           5730 2283.550 9176.450  459.11006 11000.890
## 372           5730 2198.440 9261.560  328.94503 11131.055
## 373           5730 2115.333 9344.667  201.84399 11258.156
## 374           5730 2034.094 9425.906   77.60025 11382.400
## 375           5730 1954.604 9505.396  -43.97064 11503.971
autoplot(naive_forecast)

#Check for fitted values and residuals
checkresiduals(naive_forecast)

## 
##  Ljung-Box test
## 
## data:  Residuals from Naive method
## Q* = 99.964, df = 10, p-value < 2.2e-16
## 
## Model df: 0.   Total lags used: 10
#4. Smoothing the Series to uncover patterns in data
#4.1 Moving Averages
#MA of order 5 (generally of odd numbers)

liquor_ma<-ma(liquor_ts, 5)
autoplot(liquor_ts, series="Data") +
  autolayer(ma(liquor_ts,5), series="5-MA") +
  xlab("Year") + ylab("Sales") +
  ggtitle("Liquor Sales Moving Average - 5 months") +
  scale_colour_manual(values=c("Data"="grey50","5-MA"="red"),
                      breaks=c("Data","5-MA"))

#MA of order 3
autoplot(liquor_ts, series="Data") +
  autolayer(ma(liquor_ts,3), series="3-MA") +
  xlab("Year") + ylab("Sales") +
  ggtitle("Liquor Sales Moving Average - 3 months") +
  scale_colour_manual(values=c("Data"="grey50","3-MA"="red"),
                      breaks=c("Data","3-MA"))

#MA of order 9
autoplot(liquor_ts, series="Data") +
  autolayer(ma(liquor_ts,9), series="9-MA") +
  xlab("Year") + ylab("Sales") +
  ggtitle("Liquor Sales Moving Average - 9 months") +
  scale_colour_manual(values=c("Data"="grey50","9-MA"="red"),
                      breaks=c("Data","9-MA"))

#Moving Average of Moving Averages (only for even order moving average to make them symmetric)
#A 2x4 moving average

autoplot(liquor_ts, series = "Data") + 
  autolayer(ma(liquor_ts, order = 4, centre = TRUE), series = "2x4-MA") +
  labs(x = "Year", y = "Sales") + 
  ggtitle("2x4 moving average of liquor sales")

#Removing Seasonal effects (if it is there- say a 1 year seasonal variation)
autoplot(liquor_ts, series = "Data") + 
  autolayer(ma(liquor_ts, 12), series = "12-MA") +
  labs(x = "Year", y = "Sales") + 
  ggtitle("12-month moving average of liquor sales") +
  scale_colour_manual(values=c("Data"="grey50","12-MA"="red"),
                      breaks=c("Data","12-MA"))

#4.2 Exponential Smoothing Models
#Simple exponential smoothing used only for models that dont have any trend or Seasonality
liquor_ses <-ses(liquor_ts, alpha=0.2, h=24)
autoplot(liquor_ses)

#We can remove the trend simply by differencing the data
liquor_dif <-diff(liquor_ts)
autoplot(liquor_dif)

#Once we’ve differenced we’ve effectively removed the trend from our data and can reapply the SES model
liquor_ses2 <-ses(liquor_dif, alpha=0.2, h=24)
autoplot(liquor_ses2)

#5. Making the series stationary (identify level of differencing required) 
#we need to remove trend by using appropriate order of difference and make the series stationary. 
#We do this by looking at acf, Dickey-Fuller Test and standard deviation.
#DICKEY FULLER TEST 
#(We have to test if Rho - 1 is significantly different than zero or not. 
#If the null hypothesis gets rejected, we’ll get a stationary time series.)
#First, confirm that the series is non-stationary using augmented DF test
adf.test(my_ts)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  my_ts
## Dickey-Fuller = -3.4303, Lag order = 7, p-value = 0.04959
## alternative hypothesis: stationary
#To convert series to stationary, we need to know the level of differencing required
#Look at ACF (autocorrelation plot for the series to identify the order of differencing required)
Acf(my_ts)

Pacf(my_ts)

#6. Forecasting with ARIMA Model
#using differencing: lets try order 1 difference
#We will fit ARIMA(0,d,0)(0,D,0)[12] models 
#and verify acf residuals to find which ‘d’ or ‘D’ order of differencing is appropriate in our case.
#Applying only one order of difference i.e ARIMA(0,1,0)(0,0,0)
dfit1 <-arima(my_ts, order=c(0,1,0))
plot(residuals(dfit1))

Acf(residuals(dfit1))

Pacf(residuals(dfit1))

#Because the seasonal pattern is strong and stable, 
#we will want to use an order of seasonal differencing in the model. 
#Before that let’s try only with one seasonal difference i.e ARIMA(0,0,0)(0,1,0)

dfit2 <- arima(my_ts, order =c(0,0,0), seasonal = list(order = c(0,1,0), period = 12))
plot(residuals(dfit2))

Acf(residuals(dfit2))

Pacf(residuals(dfit2))

#lets try and apply both seasonal and non-seasonal differencing, ARIMA(0,1,0)(0,1,0)[12]
dfit3 <- arima(my_ts, order =c(0,1,0), seasonal = list(order = c(0,1,0), period = 12))
plot(residuals(dfit3))

Acf(residuals(dfit3))

Pacf(residuals(dfit3))

#Since first ACF is -ve and most of the positive correlations are now negative (series is overdifferenced)
#we should add an MA term to the model but to know what order of MA we need,
#check the standard deviation of the models (sd=RMSE) 
summary(dfit1)
## 
## Call:
## arima(x = my_ts, order = c(0, 1, 0))
## 
## 
## sigma^2 estimated as 84825:  log likelihood = -2482.59,  aic = 4967.18
## 
## Training set error measures:
##                    ME     RMSE      MAE        MPE     MAPE      MASE
## Training set 12.02994 290.8322 202.5769 -0.2134874 7.154548 0.9971722
##                    ACF1
## Training set -0.2321729
summary(dfit2)
## 
## Call:
## arima(x = my_ts, order = c(0, 0, 0), seasonal = list(order = c(0, 1, 0), period = 12))
## 
## 
## sigma^2 estimated as 59182:  log likelihood = -2343.55,  aic = 4689.1
## 
## Training set error measures:
##                    ME     RMSE      MAE      MPE     MAPE      MASE      ACF1
## Training set 129.0376 239.0782 150.0148 3.658188 4.602501 0.7384382 0.5799081
summary(dfit3)
## 
## Call:
## arima(x = my_ts, order = c(0, 1, 0), seasonal = list(order = c(0, 1, 0), period = 12))
## 
## 
## sigma^2 estimated as 34897:  log likelihood = -2247.37,  aic = 4496.74
## 
## Training set error measures:
##                    ME     RMSE      MAE         MPE    MAPE      MASE
## Training set 1.064112 183.3162 103.3961 -0.06062248 3.60127 0.5089605
##                    ACF1
## Training set -0.4610254
#We have over-differencing, so we will stop here, 
#Out of the above, dfit3 model, i.e., ARIMA(0,1,0)(0,1,0)12 has the lowest standard deviation(RMSE) and AIC. 
#Therefore, it is the correct order of differencing.
#Now, we need to identify AR/MA and SAR/SMA values and fit the model

dfit4 <- arima(my_ts, order =c(0,1,1), seasonal = list(order = c(0,1,0), period = 12))
plot(residuals(dfit4))

Acf(residuals(dfit4))

Pacf(residuals(dfit4))

#Add a one-order MA component to the seasonal part and see what we get
dfit5 <- arima(my_ts, order =c(0,1,0), seasonal = list(order = c(0,1,1), period = 12))
plot(residuals(dfit5))

Acf(residuals(dfit5))

Pacf(residuals(dfit5))

#combine a MA component to non-seasonal and one to seasonal
dfit6 <- arima(my_ts, order =c(0,1,1), seasonal = list(order = c(0,1,1), period = 12))
plot(residuals(dfit6))

Acf(residuals(dfit6))

Pacf(residuals(dfit6))

#Pending statistically significant MA coefficient and low AIC the model seems a good fit
summary(dfit4)
## 
## Call:
## arima(x = my_ts, order = c(0, 1, 1), seasonal = list(order = c(0, 1, 0), period = 12))
## 
## Coefficients:
##           ma1
##       -0.7035
## s.e.   0.0422
## 
## sigma^2 estimated as 23394:  log likelihood = -2180.12,  aic = 4364.24
## 
## Training set error measures:
##                    ME     RMSE      MAE        MPE     MAPE      MASE
## Training set 6.181045 150.0912 86.47321 0.02136486 2.961413 0.4256589
##                      ACF1
## Training set -0.004405172
summary(dfit5)
## 
## Call:
## arima(x = my_ts, order = c(0, 1, 0), seasonal = list(order = c(0, 1, 1), period = 12))
## 
## Coefficients:
##          sma1
##       -0.6258
## s.e.   0.0439
## 
## sigma^2 estimated as 28059:  log likelihood = -2213.49,  aic = 4430.99
## 
## Training set error measures:
##                    ME     RMSE      MAE         MPE     MAPE      MASE
## Training set 2.701396 164.3783 106.5518 -0.02086988 3.725383 0.5244943
##                    ACF1
## Training set -0.4858186
summary(dfit6)
## 
## Call:
## arima(x = my_ts, order = c(0, 1, 1), seasonal = list(order = c(0, 1, 1), period = 12))
## 
## Coefficients:
##           ma1    sma1
##       -0.6819  -0.599
## s.e.   0.0401   0.045
## 
## sigma^2 estimated as 18762:  log likelihood = -2145.48,  aic = 4296.96
## 
## Training set error measures:
##                    ME     RMSE      MAE        MPE    MAPE      MASE
## Training set 8.838467 134.4146 85.18305 0.08820039 2.92143 0.4193082
##                     ACF1
## Training set -0.02979656
#The coeftest() function in lmtest package can help us in getting the p-values of coefficients.
coeftest(dfit6)
## 
## z test of coefficients:
## 
##       Estimate Std. Error z value  Pr(>|z|)    
## ma1  -0.681877   0.040130 -16.992 < 2.2e-16 ***
## sma1 -0.599038   0.044969 -13.321 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Check Minimum AIC and Iterate
#We use the auto.arima() function to let R build our model with least AIC
#this function will search through combination of order parameters and provide best set
#by default it looks at maximum order of size 5 
dfit7 <- auto.arima(my_ts, seasonal = TRUE)
plot(residuals(dfit7))

Acf(residuals(dfit7))

Pacf(residuals(dfit7))

summary(dfit7)
## Series: my_ts 
## ARIMA(0,1,1)(0,1,2)[12] 
## 
## Coefficients:
##           ma1     sma1     sma2
##       -0.6728  -0.5073  -0.1067
## s.e.   0.0413   0.0558   0.0478
## 
## sigma^2 estimated as 18691:  log likelihood=-2143.12
## AIC=4294.24   AICc=4294.36   BIC=4309.54
## 
## Training set error measures:
##                    ME     RMSE      MAE        MPE     MAPE      MASE
## Training set 8.374599 133.5625 83.23823 0.08038519 2.859069 0.5361179
##                     ACF1
## Training set -0.02350809
coeftest(dfit7)
## 
## z test of coefficients:
## 
##       Estimate Std. Error  z value Pr(>|z|)    
## ma1  -0.672814   0.041318 -16.2837  < 2e-16 ***
## sma1 -0.507288   0.055836  -9.0853  < 2e-16 ***
## sma2 -0.106710   0.047786  -2.2331  0.02554 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#7. Model Validation (n-fold holdout method)
hold <- window(ts(my_ts), start =233)

#we will forecast data for the last two years (month = 233 to 256)
fit_predicted <- arima(ts(my_ts[-c(233:256)]), order =c(0,1,1), seasonal = list(order = c(0,1,1), period = 12))
#use the model to forecast values for last 24 months. 
#Specify forecast horizon h periods ahead of prediction to be made 
#and use the fitted model to generate those predictions

forecast_pred <- forecast(fit_predicted,h=24)
plot(forecast_pred, main="Liquor Sales Forecast", xlab="Months since Jan 1992", ylab="Sales in Millions")
lines(ts(my_ts))

#8. Forecasting
#Next step is to forecast the sales for another 24 months ahead of time. 
f_values <-forecast(dfit7, h=24)
plot(f_values, main="")

f_values
##        Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## Apr 30       5567.796 5392.589 5743.003 5299.840 5835.752
## May 30       6295.510 6111.163 6479.857 6013.576 6577.444
## Jun 30       6199.472 6006.418 6392.526 5904.221 6494.723
## Jul 30       6440.115 6238.729 6641.500 6132.122 6748.107
## Aug 30       6253.527 6044.141 6462.913 5933.299 6573.755
## Sep 30       5984.442 5767.350 6201.533 5652.429 6316.454
## Oct 30       6142.549 5918.017 6367.081 5799.156 6485.941
## Nov 30       6301.949 6070.214 6533.683 5947.541 6656.356
## Dec 30       5912.535 5673.815 6151.255 5547.444 6277.625
## Jan 31       5522.513 5277.006 5768.019 5147.043 5897.982
## Feb 31       5486.830 5234.720 5738.940 5101.261 5872.399
## Mar 31       6171.218 5912.673 6429.764 5775.807 6566.629
## Apr 31       6014.515 5718.742 6310.288 5562.170 6466.861
## May 31       6683.410 6375.508 6991.313 6212.514 7154.306
## Jun 31       6600.650 6281.078 6920.222 6111.906 7089.393
## Jul 31       6825.910 6495.080 7156.740 6319.949 7331.871
## Aug 31       6664.767 6323.050 7006.485 6142.156 7187.379
## Sep 31       6380.052 6027.783 6732.320 5841.304 6918.800
## Oct 31       6535.239 6172.727 6897.752 5980.824 7089.654
## Nov 31       6724.307 6351.832 7096.782 6154.655 7293.958
## Dec 31       6327.570 5945.392 6709.748 5743.079 6912.060
## Jan 32       5930.127 5538.487 6321.767 5331.165 6529.089
## Feb 32       5914.751 5513.871 6315.631 5301.659 6527.844
## Mar 32       6592.784 6182.873 7002.694 5965.880 7219.688
### White noise series (non-stationary series)

wn <- arima(liquor_ts, order=c(0,0,0))
summary(wn)
## 
## Call:
## arima(x = liquor_ts, order = c(0, 0, 0))
## 
## Coefficients:
##       intercept
##        3114.897
## s.e.     59.830
## 
## sigma^2 estimated as 1256451:  log likelihood = -2962.73,  aic = 5929.47
## 
## Training set error measures:
##                         ME     RMSE     MAE       MPE     MAPE     MASE
## Training set -1.945083e-12 1120.915 926.836 -13.29419 33.52151 2.680067
##                   ACF1
## Training set 0.8458309
checkresiduals(wn)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,0,0) with non-zero mean
## Q* = 2243.5, df = 9, p-value < 2.2e-16
## 
## Model df: 1.   Total lags used: 10
f_values <-forecast(wn, h=24)
plot(f_values, main="")

dfit10 <- auto.arima(liquor_ts, seasonal = TRUE)
f_values <-forecast(dfit10, h=24)
plot(f_values, main="")

summary(dfit10)
## Series: liquor_ts 
## ARIMA(2,1,2) with drift 
## 
## Coefficients:
##          ar1      ar2      ma1     ma2    drift
##       0.8455  -0.2204  -1.8259  0.8871  12.8542
## s.e.  0.0646   0.0643   0.0434  0.0391   3.7062
## 
## sigma^2 estimated as 182490:  log likelihood=-2615.67
## AIC=5243.34   AICc=5243.58   BIC=5266.49
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE    MAPE      MASE
## Training set 2.669031 423.5212 265.6343 -1.754215 8.20172 0.7681162
##                       ACF1
## Training set -0.0004498032
Acf(residuals(dfit10), main="Autocorrelation")

Pacf(residuals(dfit10), main="Partial Autocorrelation")

plot(residuals(dfit10), main="Residuals",ylab='residuals')

coeftest(dfit10)
## 
## z test of coefficients:
## 
##        Estimate Std. Error  z value  Pr(>|z|)    
## ar1    0.845534   0.064607  13.0874 < 2.2e-16 ***
## ar2   -0.220374   0.064342  -3.4250 0.0006147 ***
## ma1   -1.825898   0.043368 -42.1028 < 2.2e-16 ***
## ma2    0.887067   0.039145  22.6608 < 2.2e-16 ***
## drift 12.854153   3.706232   3.4683 0.0005239 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fit_predicted <- arima(ts(liquor_ts[-c(351:374)]), order =c(2,1,2), seasonal = list(order = c(2,1,2), period = 12))

forecast_pred <- forecast(fit_predicted,h=24)

plot(forecast_pred, main="Liquor Sales Forecast", xlab="Months since Jan 1992", ylab="Sales in Millions")

checkresiduals(dfit10)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,1,2) with drift
## Q* = 20.032, df = 5, p-value = 0.001233
## 
## Model df: 5.   Total lags used: 10