library(tidyverse)
library(lubridate)
library(tsibble)
library(fable)
library(fabletools)
library(feasts)
library(imputeTS)
library(forecast)
dim_Category <- read_csv("dim_Category.csv")
data <- read_csv("fact_Sales.csv")
# Preview the first few rows of the dataset
head(data)
## # A tibble: 6 × 13
## invoice_line_no date sale_bottles sale_dollars sale_liters sale_gallons
## <chr> <date> <dbl> <dbl> <dbl> <dbl>
## 1 INV-54543000001 2022-12-30 12 111. 12 3.17
## 2 INV-54502500215 2022-12-30 2 15 0.4 0.1
## 3 INV-54538300027 2022-12-30 12 315. 9 2.37
## 4 INV-54549600015 2022-12-30 6 104. 10.5 2.77
## 5 INV-54508600018 2022-12-30 3 126 2.25 0.59
## 6 INV-54536400012 2022-12-30 24 155. 1.2 0.31
## # ℹ 7 more variables: store_ID <dbl>, city_ID <dbl>, county_ID <dbl>,
## # itemno <dbl>, item_group <dbl>, category_ID <dbl>, vendor_ID <dbl>
In this project, we analyze the Iowa liquor sales dataset. The dataset is composed of one fact table and several dimension tables as described below:
Fact Table: fact.sales Contains transactional data with fields including:
invoice_line_no
date
sale_bottles
sale_dollars
sale_liters
sale_gallons
store_ID
city_ID
county_ID
itemno
item_group
category_ID
vendor_ID
Dimension Tables:
dim_category: Provides product category details (category_ID, category).
dim_City: Contains city information (city_ID, city, county_ID).
dim_County: Contains county information (county_ID, county).
dim_items: Describes product details (itemno, item_group, im_desc, pack, bottle_volume_ml, state_bottle_cost, state_bottle_retail, category_ID, vendor_ID).
dim_Stores: Provides store details (store_ID, store, name, brand_store_no, address, zipcode, store_location, city_ID, county_ID).
dim_Vendor: Contains vendor information (vendor_ID, vendor).
This dataset is valuable because it combines detailed transaction data with rich descriptive dimensions. Forecasting sales based on this data can lead to smarter decisions in areas like:
Inventory Management: Anticipate product demand to optimize stock levels.
Resource Allocation: Identify which stores or regions may require additional support.
Pricing Strategy: Understand cost and revenue patterns to adjust retail prices if needed.
Sales Performance Analysis: Evaluate trends by category, vendor, and geographic location for targeted strategies. ## Data Wrangling
sales_data <-
data %>%
left_join(dim_Category, by = "category_ID") %>%
filter(str_detect(category, regex("WHISKIES", ignore_case = TRUE)))
# Convert the 'date' column into Date format and aggregate by day.
sales_daily <- sales_data %>%
mutate(date = as.Date(date, format = "%Y-%m-%d")) %>%
group_by(date) %>%
summarise(total_sales = sum(sale_dollars, na.rm = TRUE),
total_bottles = sum(sale_bottles, na.rm = TRUE)) %>%
ungroup() %>%
as_tsibble(index = date)
# Check the tsibble structure
sales_daily
## # A tsibble: 1,386 x 3 [1D]
## date total_sales total_bottles
## <date> <dbl> <dbl>
## 1 2018-01-02 289528. 19015
## 2 2018-01-03 388916. 23275
## 3 2018-01-04 343761. 20323
## 4 2018-01-05 158740. 11037
## 5 2018-01-08 454759. 30063
## 6 2018-01-09 441301. 27719
## 7 2018-01-10 337345. 21877
## 8 2018-01-11 331460. 19952
## 9 2018-01-12 170934. 11513
## 10 2018-01-15 374464. 23699
## # ℹ 1,376 more rows
# Join category info and aggregate total sales
top_categories <- data %>%
left_join(dim_Category, by = "category_ID") %>%
group_by(category) %>%
summarise(
total_sales = sum(sale_dollars, na.rm = TRUE),
total_bottles = sum(sale_bottles, na.rm = TRUE),
.groups = "drop"
) %>%
arrange(desc(total_sales)) %>%
slice_head(n = 10)
# Display table
library(knitr)
library(kableExtra)
top_categories %>%
mutate(
total_sales = scales::dollar(total_sales),
total_bottles = scales::comma(total_bottles)
) %>%
kable(
caption = "Top 10 Liquor Categories by Sales (in Dollars)",
col.names = c("Category", "Total Sales", "Total Bottles Sold"),
align = "lrr"
) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| Category | Total Sales | Total Bottles Sold |
|---|---|---|
| AMERICAN VODKA | $282,797,049 | 29,280,234 |
| CANADIAN WHISKIES | $222,949,130 | 14,889,765 |
| STRAIGHT BOURBON WHISKIES | $146,272,363 | 7,308,374 |
| SPICED RUM | $110,967,983 | 7,267,796 |
| WHISKEY LIQUEURS | $109,853,338 | 16,211,635 |
| 100% AGAVE TEQUILA | $92,826,785 | 3,359,321 |
| TENNESSEE WHISKIES | $76,831,241 | 3,400,383 |
| IMPORTED VODKA | $75,919,147 | 4,268,029 |
| IMPORTED BRANDIES | $68,978,106 | 3,480,138 |
| BLENDED WHISKIES | $54,933,960 | 4,679,294 |
library(patchwork)
# 1. Daily Sales Time Series
p1 <- ggplot(sales_daily, aes(x = date, y = total_sales)) +
geom_line(color = "steelblue") +
labs(
title = "Daily Whisky Sales Over Time",
x = "Date",
y = "Sales (Dollars)"
)
# 2. Histogram of Daily Sales Amounts
p2 <- ggplot(sales_daily, aes(x = total_sales)) +
geom_histogram(bins = 40, fill = "darkorange", color = "black") +
labs(
title = "Distribution of Daily Whisky Sales",
x = "Daily Sales (Dollars)",
y = "Frequency"
) +
xlim(0, quantile(sales_daily$total_sales, 0.99, na.rm = TRUE))
p1 / p2
sales_daily <- sales_daily %>%
mutate(
year = year(date),
month = month(date, label = TRUE),
day = day(date)
)
monthly_sales <- sales_daily %>%
index_by(month = ~ floor_date(.x, "month")) %>%
summarise(
total_sales = sum(total_sales, na.rm = TRUE),
total_bottles = sum(total_bottles, na.rm = TRUE)
)
library(patchwork)
# 1. Monthly Total Sales Time Series
p1 <- ggplot(monthly_sales, aes(x = month, y = total_sales)) +
geom_line(color = "royalblue", size = 1) +
labs(
title = "Monthly Whisky Sales Over Time",
x = "Month",
y = "Sales (Dollars)"
)
# 2. Monthly Total Bottles Sold Time Series
p2 <- ggplot(monthly_sales, aes(x = month, y = total_bottles)) +
geom_line(color = "seagreen", size = 1) +
labs(
title = "Monthly Whisky Bottles Sold Over Time",
x = "Month",
y = "Total Bottles"
)
# 3. Distribution of Monthly Sales
p3 <- ggplot(monthly_sales, aes(x = total_sales)) +
geom_histogram(bins = 20, fill = "darkorange", color = "black") +
labs(
title = "Distribution of Monthly Whisky Sales",
x = "Monthly Sales (Dollars)",
y = "Frequency"
) +
xlim(0, quantile(monthly_sales$total_sales, 0.99, na.rm = TRUE))
# 4. Monthly Boxplot to Visualize Seasonality or Outliers
monthly_sales <- monthly_sales %>%
mutate(Month = month(month, label = TRUE))
p4 <- ggplot(monthly_sales, aes(x = Month, y = total_sales)) +
geom_boxplot(fill = "mediumpurple3") +
labs(
title = "Monthly Whisky Sales Boxplot",
x = "Month",
y = "Sales (Dollars)"
)
# Combine into 2x2 layout
p1 / p2 / p3 / p4
library(ggplot2)
# Ensure that missing months are explicitly represented
monthly_sales_filled <- monthly_sales %>%
fill_gaps() %>%
mutate(total_sales = na_interpolation(total_sales))
# Now create the seasonal plot
monthly_sales_filled %>%
gg_season(total_sales) +
labs(title = "Seasonal Plot of Monthly Sales", y = "Total Sales (Dollars)")
## Model Fitting
# Define a cutoff date for the training dataset
cutoff_date <- as.Date("2020-12-31")
train_data <- monthly_sales_filled %>% filter(month <= cutoff_date)
test_data <- monthly_sales_filled %>% filter(month > cutoff_date)
# Check the number of records in each set
cat("Training set records:", nrow(train_data), "\n")
## Training set records: 1096
cat("Test set records:", nrow(test_data), "\n")
## Test set records: 700
# Fit forecasting models on the training set of monthly aggregated data
sales_models <- train_data %>%
model(
TSLM = TSLM(total_sales ~ trend() + season()),
ETS = ETS(total_sales),
ARIMA = ARIMA(total_sales),
NAIVE = NAIVE(total_sales ),
SNAIVE = SNAIVE(total_sales ~ drift()),
MEAN = MEAN(total_sales)
)
# Display the model summaries
report(sales_models)
## # A tibble: 6 × 20
## .model r_squared adj_r_squared sigma2 statistic p_value df log_lik
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl>
## 1 TSLM 0.459 0.456 1.18e+12 132. 1.83e-140 8 -16782.
## 2 ETS NA NA 6.25e- 6 NA NA NA -14840.
## 3 ARIMA NA NA 5.90e+ 8 NA NA NA -12612.
## 4 NAIVE NA NA 5.79e+ 9 NA NA NA NA
## 5 SNAIVE NA NA 2.46e+11 NA NA NA NA
## 6 MEAN NA NA 2.16e+12 NA NA NA NA
## # ℹ 12 more variables: AIC <dbl>, AICc <dbl>, BIC <dbl>, CV <dbl>,
## # deviance <dbl>, df.residual <int>, rank <int>, MSE <dbl>, AMSE <dbl>,
## # MAE <dbl>, ar_roots <list>, ma_roots <list>
# Determine forecast horizon based on the test set length
h <- nrow(test_data)
# Generate forecasts from all fitted models
forecasts <- sales_models %>%
forecast(h = h)
# Compute forecast accuracy
accuracy_metrics <- forecasts %>%
accuracy(test_data)
# Show selected accuracy measures
accuracy_metrics %>%
select(.model, RMSE, MAE, MAPE)
## # A tibble: 6 × 4
## .model RMSE MAE MAPE
## <chr> <dbl> <dbl> <dbl>
## 1 ARIMA 5424347. 5275487. 47.7
## 2 ETS 5199689. 5049227. 45.6
## 3 MEAN 1959899. 1639615. 14.2
## 4 NAIVE 2108990. 1805704. 15.7
## 5 SNAIVE 1199553. 931548. 8.32
## 6 TSLM 1638135. 1397140. 13.4
# Forecast horizon
h <- "3 years"
# Create individual forecasts
fc_tslm <- forecast(sales_models %>% select(TSLM), h = h)
fc_ets <- forecast(sales_models %>% select(ETS), h = h)
fc_arima <- forecast(sales_models %>% select(ARIMA), h = h)
fc_naive <- forecast(sales_models %>% select(NAIVE), h = h)
fc_snaive <- forecast(sales_models %>% select(SNAIVE), h = h)
fc_mean <- forecast(sales_models %>% select(MEAN), h = h)
# Plot each forecast with training data
# Plot each forecast without confidence intervals
p1 <- autoplot(fc_tslm, monthly_sales_filled, level = NULL) + labs(title = "Forecast: TSLM (Trend + Season)")
p2 <- autoplot(fc_ets, monthly_sales_filled, level = NULL) + labs(title = "Forecast: ETS")
p3 <- autoplot(fc_arima, monthly_sales_filled, level = NULL) + labs(title = "Forecast: ARIMA")
p4 <- autoplot(fc_naive, monthly_sales_filled, level = NULL) + labs(title = "Forecast: Naïve")
p5 <- autoplot(fc_snaive, monthly_sales_filled, level = NULL) + labs(title = "Forecast: Seasonal Naïve")
p6 <- autoplot(fc_mean, monthly_sales_filled, level = NULL) + labs(title = "Forecast: Mean Model")
# Arrange in 3 rows, 2 columns using patchwork
p1 / p2 / p3 / p4 / p5 / p6
# Assume ARIMA is the best-performing model
final_model <- sales_models %>% select(SNAIVE)
# Forecast for the next 12 months
future_forecast <- final_model %>% forecast(h = "36 months")
# Plot the future forecast together with the training data
autoplot(future_forecast, monthly_sales_filled) +
labs(title = "12-Month Sales Forecast", x = "Month", y = "Forecasted Sales (Dollars)")
## Conclusion/Key findings The Iowa liquor sales data reveal that
whiskies exhibit a predictable, highly seasonal demand profile with a
moderate upward trajectory. Despite testing multiple specifications, a
Seasonal Naïve benchmark captured the dominant monthly seasonality and
yielded the most accurate out‑of‑sample forecasts.
For operational planning, retailers should:
Stock aggressively for year‑end and early‑spring peaks, using SNAIVE point forecasts plus a safety factor derived from the holiday prediction intervals.
Re‑estimate models quarterly, monitoring for post‑pandemic behavioural shifts or pricing actions that could alter the level of the series.
Segment further by brand or bottle size, as the present analysis aggregates all whiskies and may mask divergent trends among premium versus value segments.
By coupling these insights with the existing dimensional tables (stores, vendors, geography), management can translate category‑level forecasts into actionable replenishment targets, targeted promotions, and informed vendor negotiations, ultimately improving margins and customer service across the state’s liquor retail network.