data/
(default) or
change the params$data_dir
above.Store_ID, State, Year, Sales, Customer_Count, Employees, Store_Area_sqft, Profit_Margin, Store_Type
.tinytex
with:# run once in your R session (not in knit)
install.packages("tinytex")
tinytex::install_tinytex()
data_dir <- params$data_dir
stopifnot(dir.exists(data_dir))
csv_paths <- list.files(data_dir, pattern = "(?i)\\.csv$", full.names = TRUE)
if (length(csv_paths) == 0) stop(glue("No CSV files found in '{data_dir}'. Place your 9 files there."))
raw <- csv_paths %>%
map_dfr(\(p) readr::read_csv(p, show_col_types = FALSE))
# Check required columns
required_cols <- c("Store_ID","State","Year","Sales","Customer_Count","Employees","Store_Area_sqft","Profit_Margin","Store_Type")
missing <- setdiff(required_cols, names(raw))
if (length(missing) > 0) stop(glue("Missing required columns: {paste(missing, collapse=', ')}"))
df <- raw %>%
mutate(
Year = as.integer(Year),
State = as.factor(State),
Store_Type = as.factor(Store_Type),
sales_per_customer = ifelse(Customer_Count > 0, Sales / Customer_Count, NA_real_),
sales_per_employee = ifelse(Employees > 0, Sales / Employees, NA_real_),
sales_per_sqft = ifelse(Store_Area_sqft > 0, Sales / Store_Area_sqft, NA_real_),
profit_dollars = Sales * Profit_Margin
)
# Light sanity check
summary(select(df, Sales, Customer_Count, Employees, Store_Area_sqft, Profit_Margin))
## Sales Customer_Count Employees Store_Area_sqft
## Min. : 48510 Min. :1196 Min. : 6.00 Min. :1137
## 1st Qu.: 66134 1st Qu.:1782 1st Qu.:14.00 1st Qu.:2185
## Median :137251 Median :2266 Median :35.00 Median :4933
## Mean :131292 Mean :2747 Mean :29.33 Mean :4821
## 3rd Qu.:183502 3rd Qu.:3780 3rd Qu.:41.00 3rd Qu.:8152
## Max. :242705 Max. :5089 Max. :48.00 Max. :9042
## Profit_Margin
## Min. :0.0610
## 1st Qu.:0.1420
## Median :0.1899
## Mean :0.1836
## 3rd Qu.:0.2321
## Max. :0.3020
# Useful helpers
w_mean <- function(x, w) {
if (length(x) != length(w)) return(NA_real_)
weighted.mean(x, w, na.rm = TRUE)
}
state_year <- df %>%
group_by(State, Year) %>%
summarise(
total_sales = sum(Sales, na.rm = TRUE),
avg_profit_margin_unweighted = mean(Profit_Margin, na.rm = TRUE),
avg_profit_margin_weighted = w_mean(Profit_Margin, Sales),
total_area = sum(Store_Area_sqft, na.rm = TRUE),
.groups = "drop"
) %>%
mutate(sales_per_sqft_state = ifelse(total_area > 0, total_sales / total_area, NA_real_))
# Growth via linear slope of total sales ~ year per state
sales_growth <- state_year %>%
group_by(State) %>%
do(tidy(lm(total_sales ~ Year, data = .))) %>%
ungroup() %>%
filter(term == "Year") %>%
arrange(desc(estimate)) %>%
rename(sales_growth_slope = estimate, slope_se = std.error) %>%
mutate(rank_by_growth = row_number())
sales_growth %>%
select(State, sales_growth_slope, slope_se, rank_by_growth) %>%
gt() %>%
fmt_number(columns = c(sales_growth_slope, slope_se), decimals = 0) %>%
tab_header(title = "Sales Growth Rank (slope of Total Sales ~ Year)")
Sales Growth Rank (slope of Total Sales ~ Year) | |||
State | sales_growth_slope | slope_se | rank_by_growth |
---|---|---|---|
OH | 164,891 | 8,413 | 1 |
PA | 123,691 | 12,256 | 2 |
KY | −52,369 | 580 | 3 |
Total annual sales by state and year.
ggplot(state_year, aes(Year, total_sales, color = State, group = State)) +
geom_line(linewidth = 1) +
geom_point(size = 2) +
scale_y_continuous(labels = dollar_format(accuracy = 1)) +
scale_x_continuous(breaks = sort(unique(state_year$Year))) +
labs(x = NULL, y = "Total Sales (USD)", title = "Total Annual Sales by State")
Total annual sales by state (sum of stores).
Average profit margin (sales-weighted) by state and year.
ggplot(state_year, aes(Year, avg_profit_margin_weighted, color = State, group = State)) +
geom_line(linewidth = 1) +
geom_point(size = 2) +
scale_y_continuous(labels = percent_format(accuracy = 0.1), limits = c(0, NA)) +
scale_x_continuous(breaks = sort(unique(state_year$Year))) +
labs(x = NULL, y = "Avg Profit Margin (weighted)", title = "Average Profit Margin by State")
Average profit margin by state (sales-weighted).
We define conversion as
Sales per Customer
at the store level.
conv_by_state <- df %>%
group_by(State) %>%
summarise(
median_spc = median(sales_per_customer, na.rm = TRUE),
mean_spc = mean(sales_per_customer, na.rm = TRUE),
stores = n(),
.groups = "drop"
)
# Identify high/low conversion states via quartiles of median_spc
cutoffs <- quantile(conv_by_state$median_spc, probs = c(0.25, 0.75), na.rm = TRUE)
conv_by_state <- conv_by_state %>%
mutate(
conversion_band = case_when(
median_spc >= cutoffs[[2]] ~ "High-conversion",
median_spc <= cutoffs[[1]] ~ "Low-conversion",
TRUE ~ "Mid"
)
)
conv_by_state %>%
arrange(desc(median_spc)) %>%
gt() %>%
fmt_currency(columns = c(median_spc, mean_spc)) %>%
tab_header(title = "Sales per Customer by State (Median & Mean)")
Sales per Customer by State (Median & Mean) | ||||
State | median_spc | mean_spc | stores | conversion_band |
---|---|---|---|---|
OH | $49.31 | $66.35 | 30 | High-conversion |
KY | $40.87 | $54.91 | 30 | Mid |
PA | $34.83 | $55.55 | 30 | Low-conversion |
Store-level relationship between customers and sales.
ggplot(df, aes(Customer_Count, Sales, color = State)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "lm", se = FALSE, linewidth = 1) +
scale_y_continuous(labels = dollar_format()) +
scale_x_continuous(labels = label_number(big.mark=",")) +
labs(x = "Customer Count", y = "Sales (USD)", title = "Customer Traffic vs. Sales (by State)") +
facet_wrap(~ State, scales = "free")
Sales vs. Customers per store, with OLS fit per state.
Do some store types convert better?
conv_by_type_state <- df %>%
group_by(State, Store_Type) %>%
summarise(
median_spc = median(sales_per_customer, na.rm = TRUE),
mean_spc = mean(sales_per_customer, na.rm = TRUE),
.groups = "drop"
)
ggplot(conv_by_type_state, aes(Store_Type, median_spc, fill = Store_Type)) +
geom_col() +
scale_y_continuous(labels = dollar_format()) +
labs(x = NULL, y = "Median Sales per Customer", title = "Conversion by Store Type and State") +
facet_wrap(~ State, nrow = 1) +
theme(legend.position = "none")
Metrics: Sales per Employee and Sales per Square Foot.
store_year <- df %>%
transmute(
Store_ID, State, Year, Store_Type, Sales, Profit_Margin,
sales_per_employee,
sales_per_sqft
)
# Top performers per year (top 10% by metric)
top_decile_flag <- function(x) x >= quantile(x, 0.9, na.rm = TRUE)
bottom_quartile_flag <- function(x) x <= quantile(x, 0.25, na.rm = TRUE)
store_year_flags <- store_year %>%
group_by(Year) %>%
mutate(
top_spe = top_decile_flag(sales_per_employee),
top_spsf = top_decile_flag(sales_per_sqft),
bottom_spsf = bottom_quartile_flag(sales_per_sqft),
bottom_margin = bottom_quartile_flag(Profit_Margin)
) %>%
ungroup()
# Consistent top performers: appear in top decile for SPSF at least 2 years
consistent_top_spsf <- store_year_flags %>%
group_by(Store_ID) %>%
summarise(top_years = sum(top_spsf, na.rm = TRUE), .groups = "drop") %>%
filter(top_years >= 2) %>%
inner_join(distinct(store_year, Store_ID, State, Store_Type), by = "Store_ID")
consistent_top_spsf %>%
arrange(desc(top_years)) %>%
gt() %>%
tab_header(title = "Consistent Top Performers (Sales per Sq Ft)") %>%
cols_label(top_years = "Years in Top Decile")
Consistent Top Performers (Sales per Sq Ft) | |||
Store_ID | Years in Top Decile | State | Store_Type |
---|---|---|---|
6 | 6 | KY | Urban |
6 | 6 | OH | Urban |
6 | 6 | PA | Urban |
1 | 3 | KY | Urban |
1 | 3 | OH | Urban |
1 | 3 | PA | Suburban |
# Consistent underperformers: bottom quartile for both SPSF and Profit Margin in same year; flag if happens >=2 years
underperf <- store_year_flags %>%
mutate(under_both = bottom_spsf & bottom_margin) %>%
group_by(Store_ID) %>%
summarise(under_years = sum(under_both, na.rm = TRUE), .groups = "drop") %>%
filter(under_years >= 2) %>%
inner_join(distinct(store_year, Store_ID, State, Store_Type), by = "Store_ID")
underperf %>%
arrange(desc(under_years)) %>%
gt() %>%
tab_header(title = "Consistent Underperformers (Low SPSF & Low Profit Margin)")
Consistent Underperformers (Low SPSF & Low Profit Margin) | |||
Store_ID | under_years | State | Store_Type |
---|---|---|---|
10 | 6 | KY | Suburban |
10 | 6 | OH | Suburban |
10 | 6 | PA | Rural |
Distribution of efficiency metrics by state and year.
p1 <- ggplot(store_year, aes(State, sales_per_employee, fill = State)) +
geom_boxplot(outlier.alpha = 0.3) +
scale_y_continuous(labels = dollar_format()) +
labs(x = NULL, y = "Sales per Employee", title = "Sales per Employee") +
theme(legend.position = "none") +
facet_wrap(~ Year)
p2 <- ggplot(store_year, aes(State, sales_per_sqft, fill = State)) +
geom_boxplot(outlier.alpha = 0.3) +
scale_y_continuous(labels = dollar_format()) +
labs(x = NULL, y = "Sales per Sq Ft", title = "Sales per Square Foot") +
theme(legend.position = "none") +
facet_wrap(~ Year)
p1
Sales per Employee and Sales per Sq Ft by state/year.
p2
Sales per Employee and Sales per Sq Ft by state/year.
type_perf <- df %>%
group_by(State, Store_Type) %>%
summarise(
median_sales = median(Sales, na.rm = TRUE),
avg_profit_margin_w = w_mean(Profit_Margin, Sales),
.groups = "drop"
)
type_perf %>%
arrange(State, desc(median_sales)) %>%
gt() %>%
fmt_currency(columns = median_sales) %>%
fmt_percent(columns = avg_profit_margin_w, decimals = 1) %>%
tab_header(title = "Performance by Store Type (per-state)")
Performance by Store Type (per-state) | |||
State | Store_Type | median_sales | avg_profit_margin_w |
---|---|---|---|
KY | Rural | $179,895.00 | 18.3% |
KY | Suburban | $125,452.00 | 18.5% |
KY | Urban | $60,345.00 | 18.8% |
OH | Rural | $209,904.00 | 16.7% |
OH | Suburban | $139,261.00 | 17.3% |
OH | Urban | $73,661.00 | 17.4% |
PA | Suburban | $201,446.00 | 25.6% |
PA | Rural | $125,893.00 | 13.6% |
PA | Urban | $65,008.00 | 19.8% |
ggplot(type_perf, aes(Store_Type, median_sales, fill = Store_Type)) +
geom_col() +
scale_y_continuous(labels = dollar_format()) +
labs(x = NULL, y = "Median Store Sales", title = "Median Sales by Store Type") +
facet_wrap(~ State, nrow = 1) +
theme(legend.position = "none")
ggplot(type_perf, aes(Store_Type, avg_profit_margin_w, fill = Store_Type)) +
geom_col() +
scale_y_continuous(labels = percent_format(accuracy = 0.1)) +
labs(x = NULL, y = "Avg Profit Margin (weighted)", title = "Profit Margin by Store Type") +
facet_wrap(~ State, nrow = 1) +
theme(legend.position = "none")
state_year_rank <- df %>%
group_by(State, Year) %>%
summarise(
total_sales = sum(Sales, na.rm = TRUE),
avg_profit_margin_w = w_mean(Profit_Margin, Sales),
total_area = sum(Store_Area_sqft, na.rm = TRUE),
sales_per_sqft_state = ifelse(total_area > 0, total_sales / total_area, NA_real_),
.groups = "drop"
) %>%
group_by(Year) %>%
mutate(
rank_total_sales = rank(-total_sales, ties.method = "min"),
rank_profit_margin = rank(-avg_profit_margin_w, ties.method = "min"),
rank_sales_per_sqft = rank(-sales_per_sqft_state, ties.method = "min")
) %>%
ungroup()
state_year_rank %>%
arrange(Year, rank_total_sales) %>%
select(Year, State, rank_total_sales, rank_profit_margin, rank_sales_per_sqft) %>%
gt(groupname_col = "Year") %>%
cols_label(
State = "State",
rank_total_sales = "Rank: Total Sales",
rank_profit_margin = "Rank: Avg Profit Margin",
rank_sales_per_sqft = "Rank: Sales per Sq Ft"
) %>%
tab_header(title = "State Rankings by Year")
State Rankings by Year | |||
State | Rank: Total Sales | Rank: Avg Profit Margin | Rank: Sales per Sq Ft |
---|---|---|---|
2021 | |||
KY | 1 | 2 | 1 |
OH | 2 | 3 | 2 |
PA | 3 | 1 | 3 |
2022 | |||
OH | 1 | 3 | 1 |
PA | 2 | 2 | 2 |
KY | 3 | 1 | 3 |
2023 | |||
OH | 1 | 3 | 1 |
PA | 2 | 2 | 2 |
KY | 3 | 1 | 3 |
# write_csv(state_year, "out_state_year.csv")
# write_csv(conv_by_state, "out_conversion_state.csv")
# write_csv(store_year_flags, "out_store_year_flags.csv")
# write_csv(state_year_rank, "out_state_year_rank.csv")
sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sequoia 15.0.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] broom_1.0.9 gt_1.0.0 glue_1.8.0 scales_1.4.0
## [5] lubridate_1.9.4 forcats_1.0.0 stringr_1.5.2 dplyr_1.1.4
## [9] purrr_1.1.0 readr_2.1.5 tidyr_1.3.1 tibble_3.3.0
## [13] ggplot2_3.5.2 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] sass_0.4.10 generics_0.1.4 xml2_1.4.0 lattice_0.22-7
## [5] stringi_1.8.7 hms_1.1.3 digest_0.6.37 magrittr_2.0.3
## [9] evaluate_1.0.5 grid_4.5.1 timechange_0.3.0 RColorBrewer_1.1-3
## [13] fastmap_1.2.0 Matrix_1.7-3 jsonlite_2.0.0 backports_1.5.0
## [17] mgcv_1.9-3 jquerylib_0.1.4 cli_3.6.5 rlang_1.1.6
## [21] crayon_1.5.3 bit64_4.6.0-1 splines_4.5.1 withr_3.0.2
## [25] cachem_1.1.0 yaml_2.3.10 tools_4.5.1 parallel_4.5.1
## [29] tzdb_0.5.0 vctrs_0.6.5 R6_2.6.1 lifecycle_1.0.4
## [33] bit_4.6.0 vroom_1.6.5 pkgconfig_2.0.3 pillar_1.11.0
## [37] bslib_0.9.0 gtable_0.3.6 xfun_0.53 tidyselect_1.2.1
## [41] rstudioapi_0.17.1 knitr_1.50 farver_2.1.2 nlme_3.1-168
## [45] htmltools_0.5.8.1 rmarkdown_2.29 labeling_0.4.3 compiler_4.5.1