=====================================================

Iowa Liquor Sales EDA (Jan 2019–Nov 2023)

=====================================================

LIBRARIES

=====================================================

library(ggplot2) library(scales) library(data.table) library(lubridate)

=====================================================

DATA LOAD

=====================================================

DATA_DIR <- “/Users/Clint/Desktop/Harvard_DataMining_Business_Clint_Graumann/Cases/Spring/I Retail_Transactions_EDA/data”

files <- list.files( path = DATA_DIR, pattern = “\.csv$”, full.names = TRUE, recursive = TRUE )

Verify file count (no Dec 2023 in raw files)

stopifnot(length(files) == 59)

df <- rbindlist( lapply(files, function(f) { dt <- fread(f, showProgress = TRUE) setnames(dt, make.names(names(dt))) # make.names() used here per assignment guidance dt }), fill = TRUE )

setDT(df)

=====================================================

BASIC EDA (must run after df exists)

=====================================================

dim(df) names(df) str(df) summary(df) head(df)

print(dim(df)) print(class(df$Date))

=====================================================

FEATURE ENGINEERING (TIME)

=====================================================

df[, Year := year(Date)] df[, Month := month(Date)] df[, YearMonth := sprintf(“%04d-%02d”, Year, Month)]

=====================================================

CORE AGGREGATIONS (MONTHLY/YEARLY)

=====================================================

monthly_sales <- df[, .( total_sales = sum(Sale..Dollars., na.rm = TRUE), total_bottles = sum(Bottles.Sold, na.rm = TRUE) ), by = .(Year, Month, YearMonth)]

setorder(monthly_sales, Year, Month)

yearly_sales <- df[, .( total_sales = sum(Sale..Dollars., na.rm = TRUE), total_bottles = sum(Bottles.Sold, na.rm = TRUE) ), by = .(Year)]

setorder(yearly_sales, Year)

print(head(monthly_sales)) print(tail(monthly_sales)) print(yearly_sales)

print(monthly_sales[Year == 2023]) # should show months 1–11

=====================================================

SYNTHETIC DECEMBER 2023 (DOCUMENTED)

=====================================================

december_estimate <- monthly_sales[ Month == 12 & Year %in% 2019:2022, .( total_sales = mean(total_sales), total_bottles = mean(total_bottles) )]

print(december_estimate)

monthly_sales_with_est_dec2023 <- copy(monthly_sales)

monthly_sales_with_est_dec2023 <- rbind( monthly_sales_with_est_dec2023, data.table( Year = 2023, Month = 12, YearMonth = “2023-12”, total_sales = december_estimate$total_sales, total_bottles = december_estimate$total_bottles, is_synthetic = TRUE ), fill = TRUE )

setorder(monthly_sales_with_est_dec2023, Year, Month) print(monthly_sales_with_est_dec2023[Month == 12])

=====================================================

ADDITIONAL FEATURES (TIME + UNIT ECONOMICS)

=====================================================

df[, quarter := quarter(Date)] df[, week := isoweek(Date)] df[, dow := wday(Date, label = TRUE, abbr = TRUE)] df[, is_weekend := dow %chin% c(“Sat”, “Sun”)]

df[, liters_sold := (Bottle.Volume..ml. / 1000) * Bottles.Sold] df[, unit_price := fifelse(Bottles.Sold > 0, Sale..Dollars. / Bottles.Sold, NA_real_)]

=====================================================

STORE PERFORMANCE + CONCENTRATION

=====================================================

store_perf <- df[, .( sales = sum(Sale..Dollars.), liters = sum(liters_sold), bottles = sum(Bottles.Sold), avg_unit_price = mean(unit_price, na.rm=TRUE) ), by=.(Store.Number, Store.Name, City, County)][order(-sales)]

total_sales <- store_perf[, sum(sales)] store_conc <- data.table( top10_share = store_perf[1:10, sum(sales)] / total_sales, top50_share = store_perf[1:50, sum(sales)] / total_sales, top100_share = store_perf[1:100, sum(sales)] / total_sales ) print(store_perf[1:20]) print(store_conc)

=====================================================

TREND + SEASONALITY + WEEKEND SPLIT

=====================================================

trend_month <- df[, .( sales = sum(Sale..Dollars.), bottles = sum(Bottles.Sold), liters = sum(liters_sold) ), by=.(Year, Month, YearMonth)][order(Year, Month)]

seasonality <- trend_month[, .( avg_sales = mean(sales), avg_liters = mean(liters) ), by=.(Month)][order(Month)]

weekend_split <- df[, .( sales = sum(Sale..Dollars.), liters = sum(liters_sold) ), by=.(Year, is_weekend)]

print(head(trend_month)) print(seasonality) print(weekend_split)

=====================================================

CATEGORY PERFORMANCE + MIX

=====================================================

cat_perf <- df[, .( sales = sum(Sale..Dollars.), liters = sum(liters_sold), bottles = sum(Bottles.Sold) ), by=.(Category.Name)][order(-sales)] print(cat_perf[1:20])

cat_year <- df[, .( sales = sum(Sale..Dollars.), liters = sum(liters_sold) ), by=.(Year, Category.Name)] cat_year[, sales_share := sales / sum(sales), by=.(Year)] print(cat_year[order(Year, -sales)][1:50])

=====================================================

VOLUME BINS + PRICE BANDS

=====================================================

df[, vol_bin := fifelse(Bottle.Volume..ml. <= 375, “Small (<=375)”, fifelse(Bottle.Volume..ml. <= 750, “Standard (<=750)”, fifelse(Bottle.Volume..ml. <= 1000,“1L (<=1000)”,“Large (>1L)”)))]

vol_trend <- df[, .( sales = sum(Sale..Dollars.), liters = sum(liters_sold) ), by=.(Year, vol_bin)] vol_trend[, sales_share := sales / sum(sales), by=.(Year)] print(vol_trend[order(Year, vol_bin)])

df[, price_band := cut(unit_price, breaks=c(-Inf, 10, 20, 40, 80, Inf), labels=c(“<=10”,“10-20”,“20-40”,“40-80”,“>80”))]

price_trend <- df[, .( sales = sum(Sale..Dollars.), liters = sum(liters_sold) ), by=.(Year, price_band)] price_trend[, sales_share := sales / sum(sales), by=.(Year)] price_trend[, liters_share := liters / sum(liters), by=.(Year)] print(price_trend[order(Year, price_band)])

=====================================================

VENDOR PERFORMANCE + CONCENTRATION + MATRIX

=====================================================

vendor_perf <- df[, .( sales = sum(Sale..Dollars.), liters = sum(liters_sold), bottles = sum(Bottles.Sold), avg_unit_price = mean(unit_price, na.rm=TRUE) ), by=.(Vendor.Name)][order(-sales)]

print(vendor_perf[1:20]) print(vendor_conc)

top_vendors <- vendor_perf[1:10, Vendor.Name] vendor_cat <- df[Vendor.Name %chin% top_vendors, .(sales=sum(Sale..Dollars.), liters=sum(liters_sold)), by=.(Vendor.Name, Category.Name)][order(Vendor.Name, -sales)] print(vendor_cat[1:100])

=====================================================

COUNTY PERFORMANCE + CONCENTRATION + TREND

=====================================================

county_perf <- df[, .( sales = sum(Sale..Dollars.), liters = sum(liters_sold), bottles = sum(Bottles.Sold), avg_unit_price = mean(unit_price, na.rm=TRUE) ), by=.(County)][order(-sales)]

print(county_perf[1:20]) print(county_conc)

top_counties <- county_perf[1:10, County] county_trend <- df[County %chin% top_counties, .(sales=sum(Sale..Dollars.), liters=sum(liters_sold)), by=.(YearMonth, County)][order(County, YearMonth)] print(county_trend[1:50])

=====================================================

YOY TABLES (TREND_MONTH_TBL)

=====================================================

trend_month_tbl <- copy(trend_month) setorder(trend_month_tbl, Year, Month)

trend_month_tbl[, sales_yoy := (sales / shift(sales, 12) - 1)] trend_month_tbl[, liters_yoy := (liters / shift(liters, 12) - 1)]

trend_month_tbl_out <- trend_month_tbl[, .( YearMonth, sales = round(sales,0), liters = round(liters,0), bottles = round(bottles,0), sales_yoy = round(100 * sales_yoy,1), liters_yoy = round(100 * liters_yoy,1) )] print(head(trend_month_tbl_out)) print(tail(trend_month_tbl_out))

=====================================================

SAFETY: KEY FIELDS (KEEP, BUT REDUNDANT)

=====================================================

if (!(“Year” %in% names(df)) && (“Date” %in% names(df))) df[, Year := year(Date)] if (!(“Month” %in% names(df)) && (“Date” %in% names(df))) df[, Month := month(Date)] if (!(“YearMonth” %in% names(df)) && all(c(“Year”,“Month”) %in% names(df))) { df[, YearMonth := sprintf(“%04d-%02d”, Year, Month)] } if (!(“quarter” %in% names(df)) && (“Date” %in% names(df))) df[, quarter := quarter(Date)] if (!(“week” %in% names(df)) && (“Date” %in% names(df))) df[, week := isoweek(Date)] if (!(“dow” %in% names(df)) && (“Date” %in% names(df))) df[, dow := wday(Date, label = TRUE, abbr = TRUE)] if (!(“is_weekend” %in% names(df)) && (“dow” %in% names(df))) df[, is_weekend := dow %chin% c(“Sat”,“Sun”)]

if (!(“liters_sold” %in% names(df)) && all(c(“Bottle.Volume..ml.”,“Bottles.Sold”) %in% names(df))) { df[, liters_sold := (Bottle.Volume..ml. / 1000) * Bottles.Sold] } if (!(“unit_price” %in% names(df)) && all(c(“Sale..Dollars.”,“Bottles.Sold”) %in% names(df))) { df[, unit_price := fifelse(Bottles.Sold > 0, Sale..Dollars. / Bottles.Sold, NA_real_)] }

=====================================================

PLOT THEME

=====================================================

ACCENT <- “#0EA5E9”

theme_board <- function() { theme_minimal(base_size = 12) + theme( panel.grid.minor = element_blank(), plot.title = element_text(face = “bold”), legend.position = “top” ) }

=====================================================

ANALYSIS #1 — SALES TRENDS ACROSS TIME

=====================================================

TABLE 1.1 already printed above (trend_month_tbl_out)

PLOT 1.1 — Monthly sales over time + 3-month MA

trend_plot <- copy(trend_month) setorder(trend_plot, Year, Month) trend_plot[, ym_date := as.Date(paste0(Year, “-”, Month, “-01”))] trend_plot[, sales_ma3 := frollmean(sales, n = 3, align = “right”)]

p1 <- ggplot(trend_plot, aes(x = ym_date)) + geom_line(aes(y = sales), color = ACCENT, linewidth = 0.8, alpha = 0.9) + geom_line(aes(y = sales_ma3), color = “black”, linewidth = 0.7, alpha = 0.6) + scale_y_continuous(labels = label_dollar(scale = 1e-6, suffix = “M”)) + labs( title = “Iowa Liquor Sales — Monthly Trend (2019–2023)”, subtitle = “Blue = monthly sales. Black = 3-month moving average.”, x = NULL, y = “Sales ($)” ) + theme_board() print(p1)

TABLE 1.2 — Seasonality index printed earlier, print formatted version too

season_tbl <- trend_month[, .( avg_sales = mean(sales, na.rm = TRUE), avg_liters = mean(liters, na.rm = TRUE) ), by = .(Month)] season_tbl[, season_index := avg_sales / mean(avg_sales)] season_tbl_out <- season_tbl[, .( Month, avg_sales = round(avg_sales, 0), avg_liters = round(avg_liters, 0), season_index = round(season_index, 2) )][order(Month)] print(season_tbl_out)

PLOT 1.2 — Seasonality profile

p2 <- ggplot(season_tbl, aes(x = Month, y = season_index)) + geom_hline(yintercept = 1, linetype = “dashed”, linewidth = 0.6, alpha = 0.5) + geom_line(color = ACCENT, linewidth = 0.9) + geom_point(color = ACCENT, size = 2) + scale_x_continuous(breaks = 1:12) + scale_y_continuous(labels = label_number(accuracy = 0.01)) + labs( title = “Seasonality Profile”, subtitle = “Index > 1.00 = above-average month.”, x = “Month”, y = “Seasonality Index” ) + theme_board() print(p2)

=====================================================

ANALYSIS #2 — Dominant store locations + performance

=====================================================

TABLE 2.1

store_perf <- df[, .( sales = sum(Sale..Dollars., na.rm=TRUE), liters = sum(liters_sold, na.rm=TRUE), bottles = sum(Bottles.Sold, na.rm=TRUE), avg_unit_price = mean(unit_price, na.rm=TRUE), transactions = .N ), by=.(Store.Number, Store.Name, City, County)][order(-sales)]

store_perf_top20 <- store_perf[1:20, .( Store.Number, Store.Name, City, County, sales = round(sales,0), liters = round(liters,0), bottles = round(bottles,0), avg_unit_price = round(avg_unit_price,2), transactions )] print(store_perf_top20)

PLOT 2.1

p2_1 <- ggplot(store_perf[1:20], aes(x=reorder(Store.Name, sales), y=sales)) + geom_col(fill=ACCENT, width=0.75) + coord_flip() + scale_y_continuous(labels=label_dollar(scale=1e-6, suffix=“M”)) + labs( title=“Top 20 Stores by Sales (2019–2023)”, subtitle=“Concentration view: where revenue is actually generated.”, x=NULL, y=“Sales ($)” ) + theme_board() print(p2_1)

TABLE 2.2

total_sales_all <- store_perf[, sum(sales)] store_conc <- data.table( top10_share = store_perf[1:10, sum(sales)] / total_sales_all, top50_share = store_perf[1:50, sum(sales)] / total_sales_all, top100_share = store_perf[1:100, sum(sales)] / total_sales_all ) store_conc_out <- store_conc[, .( top10_share_pct = round(100top10_share, 1), top50_share_pct = round(100top50_share, 1), top100_share_pct = round(100*top100_share, 1) )] print(store_conc_out)

PLOT 2.2

store_cum <- store_perf[, .(sales)][1:min(.N,500)] store_cum[, rank := .I] store_cum[, cum_share := cumsum(sales)/sum(store_perf$sales)] p2_2 <- ggplot(store_cum, aes(x=rank, y=cum_share)) + geom_line(color=ACCENT, linewidth=0.9) + scale_y_continuous(labels=label_percent(accuracy=1)) + labs( title=“Store Concentration Curve”, subtitle=“How quickly revenue concentrates into the top accounts.”, x=“Store rank (highest sales → lower)”, y=“Cumulative share of sales” ) + theme_board() print(p2_2)

TABLE 2.3

city_perf <- df[, .( sales = sum(Sale..Dollars., na.rm=TRUE), liters = sum(liters_sold, na.rm=TRUE), bottles = sum(Bottles.Sold, na.rm=TRUE), avg_unit_price = mean(unit_price, na.rm=TRUE) ), by=.(City)][order(-sales)]

city_perf_top15 <- city_perf[1:15, .( City, sales = round(sales,0), liters = round(liters,0), avg_unit_price = round(avg_unit_price,2) )] print(city_perf_top15)

PLOT 2.3

p2_3 <- ggplot(city_perf[1:15], aes(x=reorder(City, sales), y=sales)) + geom_col(fill=ACCENT, width=0.75) + coord_flip() + scale_y_continuous(labels=label_dollar(scale=1e-6, suffix=“M”)) + labs( title=“Top 15 Cities by Sales (2019–2023)”, x=NULL, y=“Sales ($)” ) + theme_board() print(p2_3)

=====================================================

ANALYSIS #3 — Prominent liquor categories + composition

=====================================================

TABLE 3.1

cat_perf <- df[, .( sales = sum(Sale..Dollars., na.rm=TRUE), liters = sum(liters_sold, na.rm=TRUE), bottles = sum(Bottles.Sold, na.rm=TRUE), avg_unit_price = mean(unit_price, na.rm=TRUE) ), by=.(Category.Name)][order(-sales)] cat_perf_top20 <- cat_perf[1:20, .( Category.Name, sales = round(sales,0), liters = round(liters,0), avg_unit_price = round(avg_unit_price,2) )] print(cat_perf_top20)

PLOT 3.1

p3_1 <- ggplot(cat_perf[1:15], aes(x=reorder(Category.Name, sales), y=sales)) + geom_col(fill=ACCENT, width=0.75) + coord_flip() + scale_y_continuous(labels=label_dollar(scale=1e-6, suffix=“M”)) + labs( title=“Top 15 Categories by Sales (2019–2023)”, x=NULL, y=“Sales ($)” ) + theme_board() print(p3_1)

TABLE 3.2

top_cats <- cat_perf[1:10, Category.Name] cat_year <- df[Category.Name %chin% top_cats, .( sales = sum(Sale..Dollars., na.rm=TRUE), liters = sum(liters_sold, na.rm=TRUE) ), by=.(Year, Category.Name)] cat_year[, sales_share := sales / sum(sales), by=.(Year)] cat_year_out <- cat_year[, .( Year, Category.Name, sales = round(sales,0), sales_share_pct = round(100*sales_share,1) )][order(Year, -sales)] print(cat_year_out)

PLOT 3.2

p3_2 <- ggplot(cat_year, aes(x=Year, y=sales_share, color=Category.Name)) + geom_line(linewidth=0.9) + geom_point(size=2) + scale_y_continuous(labels=label_percent(accuracy=1)) + labs( title=“Category Mix Over Time (Top 10)”, subtitle=“Share of annual sales by category.”, x=NULL, y=“Share of sales”, color=NULL ) + theme_board() print(p3_2)

=====================================================

ANALYSIS #4 — Consumer preferences (bottle volume + price bands)

=====================================================

if (!(“vol_bin” %in% names(df)) && (“Bottle.Volume..ml.” %in% names(df))) { df[, vol_bin := fifelse(Bottle.Volume..ml. <= 375, “Small (<=375ml)”, fifelse(Bottle.Volume..ml. <= 750, “Standard (<=750ml)”, fifelse(Bottle.Volume..ml. <= 1000, “1L (<=1000ml)”, “Large (>1L)”)))] }

if (!(“price_band” %in% names(df)) && (“unit_price” %in% names(df))) { df[, price_band := cut(unit_price, breaks=c(-Inf,10,20,40,80,Inf), labels=c(“<=10”,“10-20”,“20-40”,“40-80”,“>80”))] }

TABLE 4.1

vol_year <- df[, .( sales = sum(Sale..Dollars., na.rm=TRUE), liters = sum(liters_sold, na.rm=TRUE) ), by=.(Year, vol_bin)] vol_year[, sales_share := sales / sum(sales), by=.(Year)] vol_year_out <- vol_year[, .( Year, vol_bin, sales = round(sales,0), sales_share_pct = round(100*sales_share,1), liters = round(liters,0) )][order(Year, vol_bin)] print(vol_year_out)

PLOT 4.1

p4_1 <- ggplot(vol_year, aes(x=factor(Year), y=sales_share, fill=vol_bin)) + geom_col(width=0.75) + scale_y_continuous(labels=label_percent(accuracy=1)) + labs( title=“Bottle Size Mix (by Sales Share)”, subtitle=“How consumer purchasing shifts across bottle formats.”, x=NULL, y=“Share of annual sales”, fill=NULL ) + theme_board() print(p4_1)

TABLE 4.2

price_year <- df[!is.na(price_band), .( sales = sum(Sale..Dollars., na.rm=TRUE), liters = sum(liters_sold, na.rm=TRUE) ), by=.(Year, price_band)] price_year[, sales_share := sales / sum(sales), by=.(Year)] price_year[, liters_share := liters / sum(liters), by=.(Year)] price_year_out <- price_year[, .( Year, price_band, sales = round(sales,0), sales_share_pct = round(100sales_share,1), liters_share_pct = round(100liters_share,1) )][order(Year, price_band)] print(price_year_out)

PLOT 4.2

p4_2 <- ggplot(price_year, aes(x=Year, y=sales_share, color=price_band)) + geom_line(linewidth=0.9) + geom_point(size=2) + scale_y_continuous(labels=label_percent(accuracy=1)) + labs( title=“Price Tier Mix Over Time”, subtitle=“Share of annual sales by unit-price band.”, x=NULL, y=“Share of annual sales”, color=NULL ) + theme_board() print(p4_2)

PLOT 4.3

price_sample <- df[!is.na(unit_price) & unit_price > 0, .(unit_price)] if (nrow(price_sample) > 2e6) price_sample <- price_sample[sample(.N, 2e6)] p4_3 <- ggplot(price_sample, aes(x=unit_price)) + geom_density(color=ACCENT, linewidth=0.9, fill=ACCENT, alpha=0.15) + scale_x_continuous(labels=label_dollar(), limits=c(0, quantile(price_sample$unit_price, 0.99, na.rm=TRUE))) + labs( title="Unit Price Distribution (Snapshot)", subtitle="Trimmed at the 99th percentile to avoid outlier-driven distortion.", x="Unit price ($ per bottle)“, y=”Density” ) + theme_board() print(p4_3)

=====================================================

ANALYSIS #5 — Vendor performance + contribution

=====================================================

vendor_perf <- df[, .( sales = sum(Sale..Dollars., na.rm=TRUE), liters = sum(liters_sold, na.rm=TRUE), bottles = sum(Bottles.Sold, na.rm=TRUE), avg_unit_price = mean(unit_price, na.rm=TRUE), transactions = .N ), by=.(Vendor.Name)][order(-sales)] vendor_perf_top20 <- vendor_perf[1:20, .( Vendor.Name, sales = round(sales,0), liters = round(liters,0), avg_unit_price = round(avg_unit_price,2), transactions )] print(vendor_perf_top20)

PLOT 5.1

p5_1 <- ggplot(vendor_perf[1:15], aes(x=reorder(Vendor.Name, sales), y=sales)) + geom_col(fill=ACCENT, width=0.75) + coord_flip() + scale_y_continuous(labels=label_dollar(scale=1e-6, suffix=“M”)) + labs( title=“Top 15 Vendors by Sales (2019–2023)”, x=NULL, y=“Sales ($)” ) + theme_board() print(p5_1)

TABLE 5.2

total_vendor_sales <- vendor_perf[, sum(sales)] vendor_conc <- data.table( top5_share = vendor_perf[1:5, sum(sales)] / total_vendor_sales, top10_share = vendor_perf[1:10, sum(sales)] / total_vendor_sales, top25_share = vendor_perf[1:25, sum(sales)] / total_vendor_sales ) vendor_conc_out <- vendor_conc[, .( top5_share_pct = round(100top5_share, 1), top10_share_pct = round(100top10_share,1), top25_share_pct = round(100*top25_share,1) )] print(vendor_conc_out)

PLOT 5.2

vend_cum <- vendor_perf[, .(sales)][1:min(.N,200)] vend_cum[, rank := .I] vend_cum[, cum_share := cumsum(sales)/sum(vendor_perf$sales)] p5_2 <- ggplot(vend_cum, aes(x=rank, y=cum_share)) + geom_line(color=ACCENT, linewidth=0.9) + scale_y_continuous(labels=label_percent(accuracy=1)) + labs( title=“Vendor Concentration Curve”, subtitle=“How quickly sales concentrate among the top vendors.”, x=“Vendor rank (highest → lower)”, y=“Cumulative share of sales” ) + theme_board() print(p5_2)

TABLE 5.3

top_vendors <- vendor_perf[1:10, Vendor.Name] top_categories <- cat_perf[1:10, Category.Name] vendor_cat <- df[Vendor.Name %chin% top_vendors & Category.Name %chin% top_categories, .(sales=sum(Sale..Dollars., na.rm=TRUE), liters=sum(liters_sold, na.rm=TRUE)), by=.(Vendor.Name, Category.Name)] vendor_cat[, sales := round(sales,0)] vendor_cat <- vendor_cat[order(Vendor.Name, -sales)] print(vendor_cat)

PLOT 5.3

p5_3 <- ggplot(vendor_cat, aes(x=Category.Name, y=Vendor.Name, fill=sales)) + geom_tile(color=“white”, linewidth=0.2) + scale_fill_gradient(low=“grey95”, high=ACCENT, labels=label_dollar(scale=1e-6, suffix=“M”)) + labs( title=“Vendor × Category Sales Heatmap (Top 10 × Top 10)”, x=NULL, y=NULL, fill=“Sales” ) + theme_board() + theme(axis.text.x = element_text(angle=45, hjust=1)) print(p5_3)

=====================================================

ANALYSIS #6 — Geography + external factors

=====================================================

county_perf <- df[, .( sales = sum(Sale..Dollars., na.rm=TRUE), liters = sum(liters_sold, na.rm=TRUE), bottles = sum(Bottles.Sold, na.rm=TRUE), avg_unit_price = mean(unit_price, na.rm=TRUE) ), by=.(County)][order(-sales)] county_perf_top20 <- county_perf[1:20, .( County, sales = round(sales,0), liters = round(liters,0), avg_unit_price = round(avg_unit_price,2) )] print(county_perf_top20)

PLOT 6.1

p6_1 <- ggplot(county_perf[1:15], aes(x=reorder(County, sales), y=sales)) + geom_col(fill=ACCENT, width=0.75) + coord_flip() + scale_y_continuous(labels=label_dollar(scale=1e-6, suffix=“M”)) + labs( title=“Top 15 Counties by Sales (2019–2023)”, x=NULL, y=“Sales ($)” ) + theme_board() print(p6_1)

TABLE 6.2

monthly_sales <- df[, .( sales = sum(Sale..Dollars., na.rm=TRUE), liters = sum(liters_sold, na.rm=TRUE) ), by=.(Year, Month, YearMonth)][order(Year, Month)] monthly_sales[, ym_date := as.Date(paste0(Year, “-”, Month, “-01”))] monthly_sales[, sales_yoy := (sales / shift(sales, 12) - 1)] monthly_sales[, liters_yoy := (liters / shift(liters, 12) - 1)] monthly_sales_out <- monthly_sales[, .( YearMonth, sales = round(sales,0), sales_yoy_pct = round(100sales_yoy,1), liters_yoy_pct = round(100liters_yoy,1) )] print(head(monthly_sales_out, 12)) print(tail(monthly_sales_out, 12))

PLOT 6.2

p6_2 <- ggplot(monthly_sales[!is.na(sales_yoy)], aes(x=ym_date, y=sales_yoy)) + geom_hline(yintercept=0, linetype=“dashed”, linewidth=0.6, alpha=0.5) + geom_line(color=ACCENT, linewidth=0.9) + scale_y_continuous(labels=label_percent(accuracy=1)) + labs( title=“Monthly Sales YoY Change”, subtitle=“Highlights structural disruptions (e.g., 2020) and normalization afterward.”, x=NULL, y=“YoY change” ) + theme_board() print(p6_2)

TABLE 6.3

top_counties <- county_perf[1:8, County] county_trend <- df[County %chin% top_counties, .( sales = sum(Sale..Dollars., na.rm=TRUE), liters = sum(liters_sold, na.rm=TRUE) ), by=.(YearMonth, County)] county_trend[, ym_date := as.Date(paste0(substr(YearMonth,1,4), “-”, substr(YearMonth,6,7), “-01”))] setorder(county_trend, County, ym_date) county_trend_out <- county_trend[, .( YearMonth, County, sales = round(sales,0), liters = round(liters,0) )][order(County, YearMonth)] print(head(county_trend_out, 20))

PLOT 6.3

p6_3 <- ggplot(county_trend, aes(x=ym_date, y=sales, color=County)) + geom_line(linewidth=0.9) + scale_y_continuous(labels=label_dollar(scale=1e-6, suffix=“M”)) + labs( title=“Sales Trend — Top Counties”, subtitle=“Shows whether growth is broad-based or concentrated geographically.”, x=NULL, y=“Sales ($)”, color=NULL ) + theme_board() print(p6_3)

TABLE 6.4

total_county_sales <- county_perf[, sum(sales)] county_conc <- data.table( top5_share = county_perf[1:5, sum(sales)] / total_county_sales, top10_share = county_perf[1:10, sum(sales)] / total_county_sales ) county_conc_out <- county_conc[, .( top5_share_pct = round(100top5_share, 1), top10_share_pct = round(100top10_share,1) )] print(county_conc_out)

PLOT 6.4

county_perf[, rank := .I] county_perf[, cumulative_share := cumsum(sales) / sum(sales)] ggplot(county_perf, aes(x = rank, y = cumulative_share)) + geom_line(linewidth = 1.2) + geom_hline(yintercept = 0.8, linetype = “dashed”) + labs( title = “Cumulative Share of Iowa Liquor Sales by County”, x = “County Rank”, y = “Cumulative Share of State Sales” ) + theme_minimal()

=====================================================

Iowa Liquor Sales EDA (Jan 2019–Nov 2023)

=====================================================

=====================================================

LIBRARIES

=====================================================

=====================================================

DATA LOAD

=====================================================

Verify file count (no Dec 2023 in raw files)

=====================================================

BASIC EDA (must run after df exists)

=====================================================

=====================================================

FEATURE ENGINEERING (TIME)

=====================================================

=====================================================

CORE AGGREGATIONS (MONTHLY/YEARLY)

=====================================================

=====================================================

SYNTHETIC DECEMBER 2023 (DOCUMENTED)

=====================================================

=====================================================

ADDITIONAL FEATURES (TIME + UNIT ECONOMICS)

=====================================================

=====================================================

STORE PERFORMANCE + CONCENTRATION

=====================================================

=====================================================

TREND + SEASONALITY + WEEKEND SPLIT

=====================================================

=====================================================

CATEGORY PERFORMANCE + MIX

=====================================================

=====================================================

VOLUME BINS + PRICE BANDS

=====================================================

=====================================================

VENDOR PERFORMANCE + CONCENTRATION + MATRIX

=====================================================

=====================================================

COUNTY PERFORMANCE + CONCENTRATION + TREND

=====================================================

=====================================================

YOY TABLES (TREND_MONTH_TBL)

=====================================================

=====================================================

SAFETY: KEY FIELDS (KEEP, BUT REDUNDANT)

=====================================================

=====================================================

PLOT THEME

=====================================================

=====================================================

ANALYSIS #1 — SALES TRENDS ACROSS TIME

=====================================================

TABLE 1.1 already printed above (trend_month_tbl_out)

PLOT 1.1 — Monthly sales over time + 3-month MA

TABLE 1.2 — Seasonality index printed earlier, print formatted version too

PLOT 1.2 — Seasonality profile

TABLE 1.3 — Weekend vs weekday share

PLOT 1.3 — Weekend vs weekday share

=====================================================

ANALYSIS #2 — Dominant store locations + performance

=====================================================

TABLE 2.1

PLOT 2.1

TABLE 2.2

PLOT 2.2

TABLE 2.3

PLOT 2.3

=====================================================

ANALYSIS #3 — Prominent liquor categories + composition

=====================================================

TABLE 3.1

PLOT 3.1

TABLE 3.2

PLOT 3.2

=====================================================

ANALYSIS #4 — Consumer preferences (bottle volume + price bands)

=====================================================