Core libraries

library(tidyverse)
library(lubridate)
library(tsibble)
library(fable)
library(fabletools)
library(feasts)
library(imputeTS)
library(forecast)

setwd("C:/Users/Aaron/Downloads")
getwd()

## [1] "C:/Users/Aaron/Downloads"

sales_data <- read_csv("fact_Sales.csv")

# Preview the first few rows of the dataset
head(sales_data)

## # A tibble: 6 × 13
##   invoice_line_no date       sale_bottles sale_dollars sale_liters sale_gallons
##   <chr>           <date>            <dbl>        <dbl>       <dbl>        <dbl>
## 1 INV-54543000001 2022-12-30           12         111.       12            3.17
## 2 INV-54502500215 2022-12-30            2          15         0.4          0.1 
## 3 INV-54538300027 2022-12-30           12         315.        9            2.37
## 4 INV-54549600015 2022-12-30            6         104.       10.5          2.77
## 5 INV-54508600018 2022-12-30            3         126         2.25         0.59
## 6 INV-54536400012 2022-12-30           24         155.        1.2          0.31
## # ℹ 7 more variables: store_ID <dbl>, city_ID <dbl>, county_ID <dbl>,
## #   itemno <dbl>, item_group <dbl>, category_ID <dbl>, vendor_ID <dbl>

Introduction

In this project, we analyze the Iowa liquor sales dataset. The dataset is composed of one fact table and several dimension tables as described below:

Fact Table: fact.sales Contains transactional data with fields including:

invoice_line_no

date

sale_bottles

sale_dollars

sale_liters

sale_gallons

store_ID

city_ID

county_ID

itemno

item_group

category_ID

vendor_ID

Dimension Tables:

dim_category: Provides product category details (category_ID, category).

dim_City: Contains city information (city_ID, city, county_ID).

dim_County: Contains county information (county_ID, county).

dim_items: Describes product details (itemno, item_group, im_desc, pack, bottle_volume_ml, state_bottle_cost, state_bottle_retail, category_ID, vendor_ID).

dim_Stores: Provides store details (store_ID, store, name, brand_store_no, address, zipcode, store_location, city_ID, county_ID).

dim_Vendor: Contains vendor information (vendor_ID, vendor).

Why Choose This Dataset?

This dataset is valuable because it combines detailed transaction data with rich descriptive dimensions. Forecasting sales based on this data can lead to smarter decisions in areas like:

Inventory Management: Anticipate product demand to optimize stock levels.

Resource Allocation: Identify which stores or regions may require additional support.

Pricing Strategy: Understand cost and revenue patterns to adjust retail prices if needed.

Sales Performance Analysis: Evaluate trends by category, vendor, and geographic location for targeted strategies. ## Data Wrangling

# Convert the 'date' column into Date format and aggregate by day.
sales_daily <- sales_data %>%
  mutate(date = as.Date(date, format = "%Y-%m-%d")) %>%
  group_by(date) %>% 
  summarise(total_sales = sum(sale_dollars, na.rm = TRUE),
            total_bottles = sum(sale_bottles, na.rm = TRUE)) %>%
  ungroup() %>%
  as_tsibble(index = date)

# Check the tsibble structure
sales_daily

## # A tsibble: 1,390 x 3 [1D]
##    date       total_sales total_bottles
##    <date>           <dbl>         <dbl>
##  1 2018-01-02     971602.         79218
##  2 2018-01-03    1203749.         91417
##  3 2018-01-04    1124619.         83410
##  4 2018-01-05     551381.         44579
##  5 2018-01-08    1621213.        130829
##  6 2018-01-09    1333084.        112432
##  7 2018-01-10    1093672.         89007
##  8 2018-01-11    1094104.         81121
##  9 2018-01-12     490379.         41869
## 10 2018-01-15    1255476.         97877
## # ℹ 1,380 more rows

sales_daily <- sales_daily %>%
  mutate(
    year = year(date),
    month = month(date, label = TRUE),
    day = day(date)
  )

Exploratory visualization

monthly_sales <- sales_daily %>%
  index_by(month = ~ floor_date(.x, "month")) %>%
  summarise(
    total_sales = sum(total_sales, na.rm = TRUE),
    total_bottles = sum(total_bottles, na.rm = TRUE)
  )

# Visualize the monthly sales data
autoplot(monthly_sales, total_sales) +
  labs(title = "Monthly Iowa Liquor Sales Revenue", x = "Month", y = "Sales (Dollars)")

library(ggplot2)

# Daily sales time series plot
ggplot(sales_daily, aes(x = date, y = total_sales)) +
  geom_line(color = "blue") +
  labs(title = "Daily Sales Revenue", x = "Date", y = "Total Sales (Dollars)")

# Ensure that missing months are explicitly represented
monthly_sales_filled <- monthly_sales %>%
  fill_gaps() %>% 
  mutate(total_sales = na_interpolation(total_sales))

# Now create the seasonal plot
monthly_sales_filled %>% 
  gg_season(total_sales) +
  labs(title = "Seasonal Plot of Monthly Sales", y = "Total Sales (Dollars)")

# Boxplot by month to identify potential anomalies in monthly sales
monthly_sales %>%
  mutate(Month = month(month, label = TRUE)) %>%
  ggplot(aes(x = Month, y = total_sales)) +
  geom_boxplot() +
  labs(title = "Monthly Sales Boxplot", y = "Total Sales (Dollars)")

## Model Fitting

# Define a cutoff date for the training dataset
cutoff_date <- as.Date("2020-12-31")
train_data <- monthly_sales_filled %>% filter(month <= cutoff_date)
test_data <- monthly_sales_filled %>% filter(month > cutoff_date)

# Check the number of records in each set
cat("Training set records:", nrow(train_data), "\n")

## Training set records: 1096

cat("Test set records:", nrow(test_data), "\n")

## Test set records: 700

# Fit forecasting models on the training set of monthly aggregated data
sales_models <- train_data %>%
  model(
    TSLM = TSLM(total_sales ~ trend() + season()),
    ETS = ETS(total_sales),
    ARIMA = ARIMA(total_sales)
  )

# Display the model summaries and compare using AICc
report(sales_models)

## # A tibble: 3 × 20
##   .model r_squared adj_r_squared   sigma2 statistic    p_value    df log_lik
##   <chr>      <dbl>         <dbl>    <dbl>     <dbl>      <dbl> <int>   <dbl>
## 1 TSLM       0.509         0.506 7.73e+12      161.  2.32e-163     8 -17814.
## 2 ETS       NA            NA     2.99e- 6       NA  NA            NA -15727.
## 3 ARIMA     NA            NA     2.80e+ 9       NA  NA            NA -13463.
## # ℹ 12 more variables: AIC <dbl>, AICc <dbl>, BIC <dbl>, CV <dbl>,
## #   deviance <dbl>, df.residual <int>, rank <int>, MSE <dbl>, AMSE <dbl>,
## #   MAE <dbl>, ar_roots <list>, ma_roots <list>

Accuracy

# Determine forecast horizon based on the test set length
h <- nrow(test_data)

# Generate forecasts from all fitted models
forecasts <- sales_models %>%
  forecast(h = h)

# Compute forecast accuracy
accuracy_metrics <- forecasts %>%
  accuracy(test_data)

# Show selected accuracy measures
accuracy_metrics %>% 
  select(.model, RMSE, MAE, MAPE)

## # A tibble: 3 × 4
##   .model      RMSE       MAE  MAPE
##   <chr>      <dbl>     <dbl> <dbl>
## 1 ARIMA  14681983. 14153800. 38.9 
## 2 ETS    14576195. 14047431. 38.6 
## 3 TSLM    4000703.  2919535.  8.92

Forecast

# Assume ARIMA is the best-performing model
final_model <- sales_models %>% select(TSLM)

# Forecast for the next 12 months
future_forecast <- final_model %>% forecast(h = "12 months")

# Plot the future forecast together with the training data
autoplot(future_forecast, train_data) +
  labs(title = "12-Month Sales Forecast", x = "Month", y = "Forecasted Sales (Dollars)")

Forecasting project

Aaron Breeze, Daniel Rekow

2025-04-14