library(tidyverse)
library(lubridate)
library(tsibble)
library(fable)
library(fabletools)
library(feasts)
library(imputeTS)
library(forecast)
setwd("C:/Users/Aaron/Downloads")
getwd()
## [1] "C:/Users/Aaron/Downloads"
sales_data <- read_csv("fact_Sales.csv")
# Preview the first few rows of the dataset
head(sales_data)
## # A tibble: 6 × 13
## invoice_line_no date sale_bottles sale_dollars sale_liters sale_gallons
## <chr> <date> <dbl> <dbl> <dbl> <dbl>
## 1 INV-54543000001 2022-12-30 12 111. 12 3.17
## 2 INV-54502500215 2022-12-30 2 15 0.4 0.1
## 3 INV-54538300027 2022-12-30 12 315. 9 2.37
## 4 INV-54549600015 2022-12-30 6 104. 10.5 2.77
## 5 INV-54508600018 2022-12-30 3 126 2.25 0.59
## 6 INV-54536400012 2022-12-30 24 155. 1.2 0.31
## # ℹ 7 more variables: store_ID <dbl>, city_ID <dbl>, county_ID <dbl>,
## # itemno <dbl>, item_group <dbl>, category_ID <dbl>, vendor_ID <dbl>
In this project, we analyze the Iowa liquor sales dataset. The dataset is composed of one fact table and several dimension tables as described below:
Fact Table: fact.sales Contains transactional data with fields including:
invoice_line_no
date
sale_bottles
sale_dollars
sale_liters
sale_gallons
store_ID
city_ID
county_ID
itemno
item_group
category_ID
vendor_ID
Dimension Tables:
dim_category: Provides product category details (category_ID, category).
dim_City: Contains city information (city_ID, city, county_ID).
dim_County: Contains county information (county_ID, county).
dim_items: Describes product details (itemno, item_group, im_desc, pack, bottle_volume_ml, state_bottle_cost, state_bottle_retail, category_ID, vendor_ID).
dim_Stores: Provides store details (store_ID, store, name, brand_store_no, address, zipcode, store_location, city_ID, county_ID).
dim_Vendor: Contains vendor information (vendor_ID, vendor).
This dataset is valuable because it combines detailed transaction data with rich descriptive dimensions. Forecasting sales based on this data can lead to smarter decisions in areas like:
Inventory Management: Anticipate product demand to optimize stock levels.
Resource Allocation: Identify which stores or regions may require additional support.
Pricing Strategy: Understand cost and revenue patterns to adjust retail prices if needed.
Sales Performance Analysis: Evaluate trends by category, vendor, and geographic location for targeted strategies. ## Data Wrangling
# Convert the 'date' column into Date format and aggregate by day.
sales_daily <- sales_data %>%
mutate(date = as.Date(date, format = "%Y-%m-%d")) %>%
group_by(date) %>%
summarise(total_sales = sum(sale_dollars, na.rm = TRUE),
total_bottles = sum(sale_bottles, na.rm = TRUE)) %>%
ungroup() %>%
as_tsibble(index = date)
# Check the tsibble structure
sales_daily
## # A tsibble: 1,390 x 3 [1D]
## date total_sales total_bottles
## <date> <dbl> <dbl>
## 1 2018-01-02 971602. 79218
## 2 2018-01-03 1203749. 91417
## 3 2018-01-04 1124619. 83410
## 4 2018-01-05 551381. 44579
## 5 2018-01-08 1621213. 130829
## 6 2018-01-09 1333084. 112432
## 7 2018-01-10 1093672. 89007
## 8 2018-01-11 1094104. 81121
## 9 2018-01-12 490379. 41869
## 10 2018-01-15 1255476. 97877
## # ℹ 1,380 more rows
sales_daily <- sales_daily %>%
mutate(
year = year(date),
month = month(date, label = TRUE),
day = day(date)
)
monthly_sales <- sales_daily %>%
index_by(month = ~ floor_date(.x, "month")) %>%
summarise(
total_sales = sum(total_sales, na.rm = TRUE),
total_bottles = sum(total_bottles, na.rm = TRUE)
)
# Visualize the monthly sales data
autoplot(monthly_sales, total_sales) +
labs(title = "Monthly Iowa Liquor Sales Revenue", x = "Month", y = "Sales (Dollars)")
library(ggplot2)
# Daily sales time series plot
ggplot(sales_daily, aes(x = date, y = total_sales)) +
geom_line(color = "blue") +
labs(title = "Daily Sales Revenue", x = "Date", y = "Total Sales (Dollars)")
# Ensure that missing months are explicitly represented
monthly_sales_filled <- monthly_sales %>%
fill_gaps() %>%
mutate(total_sales = na_interpolation(total_sales))
# Now create the seasonal plot
monthly_sales_filled %>%
gg_season(total_sales) +
labs(title = "Seasonal Plot of Monthly Sales", y = "Total Sales (Dollars)")
# Boxplot by month to identify potential anomalies in monthly sales
monthly_sales %>%
mutate(Month = month(month, label = TRUE)) %>%
ggplot(aes(x = Month, y = total_sales)) +
geom_boxplot() +
labs(title = "Monthly Sales Boxplot", y = "Total Sales (Dollars)")
## Model Fitting
# Define a cutoff date for the training dataset
cutoff_date <- as.Date("2020-12-31")
train_data <- monthly_sales_filled %>% filter(month <= cutoff_date)
test_data <- monthly_sales_filled %>% filter(month > cutoff_date)
# Check the number of records in each set
cat("Training set records:", nrow(train_data), "\n")
## Training set records: 1096
cat("Test set records:", nrow(test_data), "\n")
## Test set records: 700
# Fit forecasting models on the training set of monthly aggregated data
sales_models <- train_data %>%
model(
TSLM = TSLM(total_sales ~ trend() + season()),
ETS = ETS(total_sales),
ARIMA = ARIMA(total_sales)
)
# Display the model summaries and compare using AICc
report(sales_models)
## # A tibble: 3 × 20
## .model r_squared adj_r_squared sigma2 statistic p_value df log_lik
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl>
## 1 TSLM 0.509 0.506 7.73e+12 161. 2.32e-163 8 -17814.
## 2 ETS NA NA 2.99e- 6 NA NA NA -15727.
## 3 ARIMA NA NA 2.80e+ 9 NA NA NA -13463.
## # ℹ 12 more variables: AIC <dbl>, AICc <dbl>, BIC <dbl>, CV <dbl>,
## # deviance <dbl>, df.residual <int>, rank <int>, MSE <dbl>, AMSE <dbl>,
## # MAE <dbl>, ar_roots <list>, ma_roots <list>
# Determine forecast horizon based on the test set length
h <- nrow(test_data)
# Generate forecasts from all fitted models
forecasts <- sales_models %>%
forecast(h = h)
# Compute forecast accuracy
accuracy_metrics <- forecasts %>%
accuracy(test_data)
# Show selected accuracy measures
accuracy_metrics %>%
select(.model, RMSE, MAE, MAPE)
## # A tibble: 3 × 4
## .model RMSE MAE MAPE
## <chr> <dbl> <dbl> <dbl>
## 1 ARIMA 14681983. 14153800. 38.9
## 2 ETS 14576195. 14047431. 38.6
## 3 TSLM 4000703. 2919535. 8.92
# Assume ARIMA is the best-performing model
final_model <- sales_models %>% select(TSLM)
# Forecast for the next 12 months
future_forecast <- final_model %>% forecast(h = "12 months")
# Plot the future forecast together with the training data
autoplot(future_forecast, train_data) +
labs(title = "12-Month Sales Forecast", x = "Month", y = "Forecasted Sales (Dollars)")