# Install pacman if needed
if (!require("pacman")) install.packages("pacman")
## Loading required package: pacman
# load packages
pacman::p_load(pacman,
  tidyverse, openxlsx, forecast, psych)
#Import dataset
diet <- read.xlsx("datasets/Addata.xlsx", skipEmptyRows = TRUE)

Explore data

#Check results
str(diet)
## 'data.frame':    36 obs. of  3 variables:
##  $ Month      : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Sales      : num  12 20.5 21 15.5 15.3 23.5 24.5 21.3 23.5 28 ...
##  $ Advertising: num  15 16 18 27 21 49 21 22 28 36 ...

Let’s get a sense of the data so we will do some visualizations of our sales data.

#Use describe function from psych package. It gives a good overview of the dataset variables.
describe(diet)

Average monthly sales is 24k with a standard deviation of 6 while advertising spend average around 29k with a very large deviation of 19k.

#Plot Sales by Advertising
ggplot(diet, aes(x=Advertising, y=Sales)) + geom_point() + geom_smooth(method = 'lm')
## `geom_smooth()` using formula 'y ~ x'

#Plot sales over 36 months
ggplot(diet, aes(x=Month, y = Sales)) + geom_line()

First, a scatterplot of sales and advertising. Sales are generally high when advertising spend is high which is perhaps not a big surprise, but this is not necessarily always the case. There seems to be wide variability between sales and advertising and the points all over the place.

If we look at our sales data over time, we can see a lot of peaks and dips. There are definitely some dips around months 12, 24 and now 36 which suggests that sales are down around the Christmas holidays, but picks back up rather quickly.

Dynamic Regression Modeling

#Convert dataframe to time series object using the ts() function
diet_ts <- ts(data = diet[,c(2,3)])


#Check results of transformation
diet_ts
## Time Series:
## Start = 1 
## End = 36 
## Frequency = 1 
##    Sales Advertising
##  1  12.0          15
##  2  20.5          16
##  3  21.0          18
##  4  15.5          27
##  5  15.3          21
##  6  23.5          49
##  7  24.5          21
##  8  21.3          22
##  9  23.5          28
## 10  28.0          36
## 11  24.0          40
## 12  15.5           3
## 13  17.3          21
## 14  25.3          29
## 15  25.0          62
## 16  36.5          65
## 17  36.5          46
## 18  29.6          44
## 19  30.5          33
## 20  28.0          62
## 21  26.0          22
## 22  21.5          12
## 23  19.7          24
## 24  19.0           3
## 25  16.0           5
## 26  20.7          14
## 27  26.5          36
## 28  30.6          40
## 29  32.3          49
## 30  29.5           7
## 31  28.3          52
## 32  31.3          65
## 33  32.2          17
## 34  26.4           5
## 35  23.4          17
## 36  16.4           1
# Time plot of both variables
autoplot(diet_ts, facets = TRUE)

We fit our model with auto.arima function from the forecast package.

# Fit ARIMA model
fit <- auto.arima(diet_ts[, "Sales"], xreg = diet_ts[, "Advertising"], stationary = TRUE)

# Check model fit
fit
## Series: diet_ts[, "Sales"] 
## Regression with ARIMA(1,0,0) errors 
## 
## Coefficients:
##          ar1  intercept    xreg
##       0.6412    20.8737  0.0926
## s.e.  0.1559     2.0842  0.0426
## 
## sigma^2 estimated as 18.48:  log likelihood=-102.28
## AIC=212.55   AICc=213.84   BIC=218.89

The auto.arima function has fit a linear regression model to the advertising variable and an an ARIMA model to the time series (month) variable. ARIMA (1,0,0)

#What is the increase in sales for each unit increase in advertising?
sales_increase <- coefficients(fit)[3]

Interpretation of sales increase: For every $1,000 (the unit of ad spend is in thousands USD) increase in advertising spend, the unit sales increase by 0.09.

#Check Residuals
checkresiduals(fit)

## 
##  Ljung-Box test
## 
## data:  Residuals from Regression with ARIMA(1,0,0) errors
## Q* = 4.5652, df = 4, p-value = 0.3349
## 
## Model df: 3.   Total lags used: 7

P-value is larger than 0.05 so the residuals are white noise. The residuals look nearly normally distributed.

Forecasting for the next quarter

#We need to provide forecast with the future value of our predictor which is Advertising. So here, we will spend 5 in month 1, 12 in month 2 and 40 in month 3.

#Create a vector to hold advertising spend
adv_spend <- c(5, 12, 40)

#Let's build the forecast
diet_fc <- forecast(fit, xreg = adv_spend)

# Plot forecast
autoplot(diet_fc) + xlab("Month") + ylab("Sales")

#Print forecast with confidence intervals
diet_fc
##    Point Forecast    Lo 80    Hi 80     Lo 95    Hi 95
## 37       18.40883 12.90030 23.91737  9.984257 26.83341
## 38       20.10783 13.56408 26.65158 10.100031 30.11563
## 39       23.37542 16.45077 30.30007 12.785082 33.96576
# Install ggpubr
if (!require("ggpubr")) install.packages("ggpubr")
## Loading required package: ggpubr
## 
## Attaching package: 'ggpubr'
## The following object is masked from 'package:forecast':
## 
##     gghistogram
library(ggpubr)

#Create vectors needed
sales_forecast <- round(diet_fc$mean, 2)
month <- paste("Month", c(37:39), sep = " ")

#Put forecasts into a dataframe
forecasts <- cbind(month, sales_forecast)


#Create nice looking table
 forecast_table <- forecasts %>% 
  ggtexttable(theme = ttheme("classic"))
 
#Visualize table
  forecast_table

Forecasts using a Train/Test split

#We will use the first 33 months to train
training <- window(diet_ts, end = 33)
test <- window(diet_ts, start = 34)


# Fit ARIMA model
fit_02 <- auto.arima(training[, "Sales"], xreg = training[, "Advertising"], stationary = TRUE)

# Check model fit
fit_02
## Series: training[, "Sales"] 
## Regression with ARIMA(1,0,0) errors 
## 
## Coefficients:
##          ar1  intercept    xreg
##       0.6940    21.9815  0.0762
## s.e.  0.1531     2.5000  0.0416
## 
## sigma^2 estimated as 18.96:  log likelihood=-94.13
## AIC=196.26   AICc=197.69   BIC=202.24
#Check residuals on our training set
checkresiduals(fit_02)

## 
##  Ljung-Box test
## 
## data:  Residuals from Regression with ARIMA(1,0,0) errors
## Q* = 5.2197, df = 4, p-value = 0.2655
## 
## Model df: 3.   Total lags used: 7
#Forecast sales on the test set
diet_fc_02 <- forecast(fit_02, xreg = test[,2])

# Plot test set forecast
autoplot(diet_fc_02) + xlab("Month") + ylab("Sales")

At first glance, our forecasts look pretty good although the confidence intervals are a bit wide.

#Print forecast
diet_fc_02
##    Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 34       28.55478 22.97453 34.13504 20.02052 37.08904
## 35       27.57436 20.78203 34.36670 17.18638 37.96234
## 36       25.03986 17.73520 32.34452 13.86835 36.21138

How do you know if your forecasts are any good?

Definitions:

Errors = actual minus predicted values Standard Error = standard deviation of errors SSE = Sum of Standard Error

#Evaluate accuracy
accuracy(diet_fc_02, test[,1])
##                      ME     RMSE      MAE        MPE     MAPE      MASE
## Training set  0.3283182 4.151658 3.573361  -2.323035 16.03978 0.9995417
## Test set     -4.9896683 5.677899 4.989668 -26.227763 26.22776 1.3957114
##                     ACF1 Theil's U
## Training set -0.02422498        NA
## Test set     -0.03018011  1.255173
#Combine test data and the forecasts into a tibble dataframe
explanatory_data <- as_tibble(cbind(test, diet_fc_02))

#Make sure to use the back ticks when referencing the forecasts from the forecast model object
explanatory_data <- explanatory_data %>% 
  mutate(errors = test.Sales - `diet_fc_02.Point Forecast`,
         squared_error = (test.Sales - `diet_fc_02.Point Forecast`)^2)

#View results - Select only the relevant columns
explanatory_data %>% 
  select(test.Sales, `diet_fc_02.Point Forecast`, errors, squared_error)
#Calculate standard error across all forecasts
(standard_error <- sd(explanatory_data$errors, na.rm = TRUE))
## [1] 3.318526
#Calculate sse across all forecasts
(sse <- sum(explanatory_data$squared_error, na.rm = TRUE))
## [1] 96.7156

All of our forecasts were above actual sales (hence the negative errors).

Our forecast error was the lowest for month 34, slightly higher for month 35 and the absolute worst forecast was for month 36. There does appear to be an interaction between advertising and sales.

#Computing the mean absolute percentage error (MAPE)
mean(abs((explanatory_data$test.Sales-explanatory_data$`diet_fc_02.Point Forecast`)/explanatory_data$test.Sales)) * 100
## [1] 26.22776

Forecasts are off by on the test set 26% which is rather high and was much lower on the training set. This gives us the same result as the accuracy function from above, but there may be times you may need to manually calculate MAPE.