Core libraries

library(tidyverse)
library(lubridate)
library(tsibble)
library(fable)
library(fabletools)
library(feasts)
library(imputeTS)
library(forecast)

dim_Category <- read_csv("dim_Category.csv")
data <- read_csv("fact_Sales.csv")
# Preview the first few rows of the dataset
head(data)

## # A tibble: 6 × 13
##   invoice_line_no date       sale_bottles sale_dollars sale_liters sale_gallons
##   <chr>           <date>            <dbl>        <dbl>       <dbl>        <dbl>
## 1 INV-54543000001 2022-12-30           12         111.       12            3.17
## 2 INV-54502500215 2022-12-30            2          15         0.4          0.1 
## 3 INV-54538300027 2022-12-30           12         315.        9            2.37
## 4 INV-54549600015 2022-12-30            6         104.       10.5          2.77
## 5 INV-54508600018 2022-12-30            3         126         2.25         0.59
## 6 INV-54536400012 2022-12-30           24         155.        1.2          0.31
## # ℹ 7 more variables: store_ID <dbl>, city_ID <dbl>, county_ID <dbl>,
## #   itemno <dbl>, item_group <dbl>, category_ID <dbl>, vendor_ID <dbl>

Introduction

In this project, we analyze the Iowa liquor sales dataset. The dataset is composed of one fact table and several dimension tables as described below:

Fact Table: fact.sales Contains transactional data with fields including:

invoice_line_no

date

sale_bottles

sale_dollars

sale_liters

sale_gallons

store_ID

city_ID

county_ID

itemno

item_group

category_ID

vendor_ID

Dimension Tables:

dim_category: Provides product category details (category_ID, category).

dim_City: Contains city information (city_ID, city, county_ID).

dim_County: Contains county information (county_ID, county).

dim_items: Describes product details (itemno, item_group, im_desc, pack, bottle_volume_ml, state_bottle_cost, state_bottle_retail, category_ID, vendor_ID).

dim_Stores: Provides store details (store_ID, store, name, brand_store_no, address, zipcode, store_location, city_ID, county_ID).

dim_Vendor: Contains vendor information (vendor_ID, vendor).

Why Choose This Dataset?

This dataset is valuable because it combines detailed transaction data with rich descriptive dimensions. Forecasting sales based on this data can lead to smarter decisions in areas like:

Inventory Management: Anticipate product demand to optimize stock levels.

Resource Allocation: Identify which stores or regions may require additional support.

Pricing Strategy: Understand cost and revenue patterns to adjust retail prices if needed.

Sales Performance Analysis: Evaluate trends by category, vendor, and geographic location for targeted strategies. ## Data Wrangling

sales_data <- 
  data %>% 
  left_join(dim_Category, by = "category_ID") %>% 
  filter(str_detect(category, regex("WHISKIES", ignore_case = TRUE)))
  
  
# Convert the 'date' column into Date format and aggregate by day.
sales_daily <- sales_data %>%
  mutate(date = as.Date(date, format = "%Y-%m-%d")) %>%
  group_by(date) %>% 
  summarise(total_sales = sum(sale_dollars, na.rm = TRUE),
            total_bottles = sum(sale_bottles, na.rm = TRUE)) %>%
  ungroup() %>%
  as_tsibble(index = date)

# Check the tsibble structure
sales_daily

## # A tsibble: 1,386 x 3 [1D]
##    date       total_sales total_bottles
##    <date>           <dbl>         <dbl>
##  1 2018-01-02     289528.         19015
##  2 2018-01-03     388916.         23275
##  3 2018-01-04     343761.         20323
##  4 2018-01-05     158740.         11037
##  5 2018-01-08     454759.         30063
##  6 2018-01-09     441301.         27719
##  7 2018-01-10     337345.         21877
##  8 2018-01-11     331460.         19952
##  9 2018-01-12     170934.         11513
## 10 2018-01-15     374464.         23699
## # ℹ 1,376 more rows

# Join category info and aggregate total sales
top_categories <- data %>%
  left_join(dim_Category, by = "category_ID") %>%
  group_by(category) %>%
  summarise(
    total_sales = sum(sale_dollars, na.rm = TRUE),
    total_bottles = sum(sale_bottles, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(total_sales)) %>%
  slice_head(n = 10)

# Display table
library(knitr)
library(kableExtra)

top_categories %>%
  mutate(
    total_sales = scales::dollar(total_sales),
    total_bottles = scales::comma(total_bottles)
  ) %>%
  kable(
    caption = "Top 10 Liquor Categories by Sales (in Dollars)",
    col.names = c("Category", "Total Sales", "Total Bottles Sold"),
    align = "lrr"
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))

Top 10 Liquor Categories by Sales (in Dollars)
Category	Total Sales	Total Bottles Sold
AMERICAN VODKA	$282,797,049	29,280,234
CANADIAN WHISKIES	$222,949,130	14,889,765
STRAIGHT BOURBON WHISKIES	$146,272,363	7,308,374
SPICED RUM	$110,967,983	7,267,796
WHISKEY LIQUEURS	$109,853,338	16,211,635
100% AGAVE TEQUILA	$92,826,785	3,359,321
TENNESSEE WHISKIES	$76,831,241	3,400,383
IMPORTED VODKA	$75,919,147	4,268,029
IMPORTED BRANDIES	$68,978,106	3,480,138
BLENDED WHISKIES	$54,933,960	4,679,294

library(patchwork)

# 1. Daily Sales Time Series
p1 <- ggplot(sales_daily, aes(x = date, y = total_sales)) +
  geom_line(color = "steelblue") +
  labs(
    title = "Daily Whisky Sales Over Time",
    x = "Date",
    y = "Sales (Dollars)"
  )

# 2. Histogram of Daily Sales Amounts
p2 <- ggplot(sales_daily, aes(x = total_sales)) +
  geom_histogram(bins = 40, fill = "darkorange", color = "black") +
  labs(
    title = "Distribution of Daily Whisky Sales",
    x = "Daily Sales (Dollars)",
    y = "Frequency"
  ) +
  xlim(0, quantile(sales_daily$total_sales, 0.99, na.rm = TRUE))




p1 / p2

sales_daily <- sales_daily %>%
  mutate(
    year = year(date),
    month = month(date, label = TRUE),
    day = day(date)
  )

Exploratory visualization

monthly_sales <- sales_daily %>%
  index_by(month = ~ floor_date(.x, "month")) %>%
  summarise(
    total_sales = sum(total_sales, na.rm = TRUE),
    total_bottles = sum(total_bottles, na.rm = TRUE)
  )



library(patchwork)

# 1. Monthly Total Sales Time Series
p1 <- ggplot(monthly_sales, aes(x = month, y = total_sales)) +
  geom_line(color = "royalblue", size = 1) +
  labs(
    title = "Monthly Whisky Sales Over Time",
    x = "Month",
    y = "Sales (Dollars)"
  )

# 2. Monthly Total Bottles Sold Time Series
p2 <- ggplot(monthly_sales, aes(x = month, y = total_bottles)) +
  geom_line(color = "seagreen", size = 1) +
  labs(
    title = "Monthly Whisky Bottles Sold Over Time",
    x = "Month",
    y = "Total Bottles"
  )

# 3. Distribution of Monthly Sales
p3 <- ggplot(monthly_sales, aes(x = total_sales)) +
  geom_histogram(bins = 20, fill = "darkorange", color = "black") +
  labs(
    title = "Distribution of Monthly Whisky Sales",
    x = "Monthly Sales (Dollars)",
    y = "Frequency"
  ) +
  xlim(0, quantile(monthly_sales$total_sales, 0.99, na.rm = TRUE))

# 4. Monthly Boxplot to Visualize Seasonality or Outliers
monthly_sales <- monthly_sales %>%
  mutate(Month = month(month, label = TRUE))

p4 <- ggplot(monthly_sales, aes(x = Month, y = total_sales)) +
  geom_boxplot(fill = "mediumpurple3") +
  labs(
    title = "Monthly Whisky Sales Boxplot",
    x = "Month",
    y = "Sales (Dollars)"
  )

# Combine into 2x2 layout
p1 / p2 / p3 / p4

library(ggplot2)

# Ensure that missing months are explicitly represented
monthly_sales_filled <- monthly_sales %>%
  fill_gaps() %>% 
  mutate(total_sales = na_interpolation(total_sales))

# Now create the seasonal plot
monthly_sales_filled %>% 
  gg_season(total_sales) +
  labs(title = "Seasonal Plot of Monthly Sales", y = "Total Sales (Dollars)")

## Model Fitting

# Define a cutoff date for the training dataset
cutoff_date <- as.Date("2020-12-31")
train_data <- monthly_sales_filled %>% filter(month <= cutoff_date)
test_data <- monthly_sales_filled %>% filter(month > cutoff_date)

# Check the number of records in each set
cat("Training set records:", nrow(train_data), "\n")

## Training set records: 1096

cat("Test set records:", nrow(test_data), "\n")

## Test set records: 700

# Fit forecasting models on the training set of monthly aggregated data
sales_models <- train_data %>%
  model(
    TSLM = TSLM(total_sales ~ trend() + season()),
    ETS = ETS(total_sales),
    ARIMA = ARIMA(total_sales),
    NAIVE = NAIVE(total_sales ),
    SNAIVE = SNAIVE(total_sales ~ drift()),
    MEAN = MEAN(total_sales)
  )

# Display the model summaries
report(sales_models)

## # A tibble: 6 × 20
##   .model r_squared adj_r_squared   sigma2 statistic    p_value    df log_lik
##   <chr>      <dbl>         <dbl>    <dbl>     <dbl>      <dbl> <int>   <dbl>
## 1 TSLM       0.459         0.456 1.18e+12      132.  1.83e-140     8 -16782.
## 2 ETS       NA            NA     6.25e- 6       NA  NA            NA -14840.
## 3 ARIMA     NA            NA     5.90e+ 8       NA  NA            NA -12612.
## 4 NAIVE     NA            NA     5.79e+ 9       NA  NA            NA     NA 
## 5 SNAIVE    NA            NA     2.46e+11       NA  NA            NA     NA 
## 6 MEAN      NA            NA     2.16e+12       NA  NA            NA     NA 
## # ℹ 12 more variables: AIC <dbl>, AICc <dbl>, BIC <dbl>, CV <dbl>,
## #   deviance <dbl>, df.residual <int>, rank <int>, MSE <dbl>, AMSE <dbl>,
## #   MAE <dbl>, ar_roots <list>, ma_roots <list>

Accuracy

# Determine forecast horizon based on the test set length
h <- nrow(test_data)

# Generate forecasts from all fitted models
forecasts <- sales_models %>%
  forecast(h = h)

# Compute forecast accuracy
accuracy_metrics <- forecasts %>%
  accuracy(test_data)

# Show selected accuracy measures
accuracy_metrics %>% 
  select(.model, RMSE, MAE, MAPE)

## # A tibble: 6 × 4
##   .model     RMSE      MAE  MAPE
##   <chr>     <dbl>    <dbl> <dbl>
## 1 ARIMA  5424347. 5275487. 47.7 
## 2 ETS    5199689. 5049227. 45.6 
## 3 MEAN   1959899. 1639615. 14.2 
## 4 NAIVE  2108990. 1805704. 15.7 
## 5 SNAIVE 1199553.  931548.  8.32
## 6 TSLM   1638135. 1397140. 13.4

Forecast

# Forecast horizon
h <- "3 years"

# Create individual forecasts
fc_tslm <- forecast(sales_models %>% select(TSLM), h = h)
fc_ets <- forecast(sales_models %>% select(ETS), h = h)
fc_arima <- forecast(sales_models %>% select(ARIMA), h = h)
fc_naive <- forecast(sales_models %>% select(NAIVE), h = h)
fc_snaive <- forecast(sales_models %>% select(SNAIVE), h = h)
fc_mean <- forecast(sales_models %>% select(MEAN), h = h)

# Plot each forecast with training data


# Plot each forecast without confidence intervals
p1 <- autoplot(fc_tslm, monthly_sales_filled, level = NULL) + labs(title = "Forecast: TSLM (Trend + Season)")
p2 <- autoplot(fc_ets, monthly_sales_filled, level = NULL) + labs(title = "Forecast: ETS")
p3 <- autoplot(fc_arima, monthly_sales_filled, level = NULL) + labs(title = "Forecast: ARIMA")
p4 <- autoplot(fc_naive, monthly_sales_filled, level = NULL) + labs(title = "Forecast: Naïve")
p5 <- autoplot(fc_snaive, monthly_sales_filled, level = NULL) + labs(title = "Forecast: Seasonal Naïve")
p6 <- autoplot(fc_mean, monthly_sales_filled, level = NULL) + labs(title = "Forecast: Mean Model")


# Arrange in 3 rows, 2 columns using patchwork
p1 / p2 / p3 / p4 / p5 / p6

# Assume ARIMA is the best-performing model
final_model <- sales_models %>% select(SNAIVE)

# Forecast for the next 12 months
future_forecast <- final_model %>% forecast(h = "36 months")

# Plot the future forecast together with the training data
autoplot(future_forecast, monthly_sales_filled) +
  labs(title = "12-Month Sales Forecast", x = "Month", y = "Forecasted Sales (Dollars)")

## Conclusion/Key findings The Iowa liquor sales data reveal that whiskies exhibit a predictable, highly seasonal demand profile with a moderate upward trajectory. Despite testing multiple specifications, a Seasonal Naïve benchmark captured the dominant monthly seasonality and yielded the most accurate out‑of‑sample forecasts.

For operational planning, retailers should:

Stock aggressively for year‑end and early‑spring peaks, using SNAIVE point forecasts plus a safety factor derived from the holiday prediction intervals.

Re‑estimate models quarterly, monitoring for post‑pandemic behavioural shifts or pricing actions that could alter the level of the series.

Segment further by brand or bottle size, as the present analysis aggregates all whiskies and may mask divergent trends among premium versus value segments.

By coupling these insights with the existing dimensional tables (stores, vendors, geography), management can translate category‑level forecasts into actionable replenishment targets, targeted promotions, and informed vendor negotiations, ultimately improving margins and customer service across the state’s liquor retail network.

Forecasting Project

Aaron Breeze, Daniel Rekow

2025-04-16