How to run this report

# run once in your R session (not in knit)
install.packages("tinytex")
tinytex::install_tinytex()

Data loading & preparation

data_dir <- params$data_dir
stopifnot(dir.exists(data_dir))

csv_paths <- list.files(data_dir, pattern = "(?i)\\.csv$", full.names = TRUE)
if (length(csv_paths) == 0) stop(glue("No CSV files found in '{data_dir}'. Place your 9 files there."))

raw <- csv_paths %>% 
  map_dfr(\(p) readr::read_csv(p, show_col_types = FALSE))

# Check required columns
required_cols <- c("Store_ID","State","Year","Sales","Customer_Count","Employees","Store_Area_sqft","Profit_Margin","Store_Type")
missing <- setdiff(required_cols, names(raw))
if (length(missing) > 0) stop(glue("Missing required columns: {paste(missing, collapse=', ')}"))

df <- raw %>%
  mutate(
    Year = as.integer(Year),
    State = as.factor(State),
    Store_Type = as.factor(Store_Type),
    sales_per_customer = ifelse(Customer_Count > 0, Sales / Customer_Count, NA_real_),
    sales_per_employee = ifelse(Employees > 0, Sales / Employees, NA_real_),
    sales_per_sqft = ifelse(Store_Area_sqft > 0, Sales / Store_Area_sqft, NA_real_),
    profit_dollars = Sales * Profit_Margin
  )

# Light sanity check
summary(select(df, Sales, Customer_Count, Employees, Store_Area_sqft, Profit_Margin))
##      Sales        Customer_Count   Employees     Store_Area_sqft
##  Min.   : 48510   Min.   :1196   Min.   : 6.00   Min.   :1137   
##  1st Qu.: 66134   1st Qu.:1782   1st Qu.:14.00   1st Qu.:2185   
##  Median :137251   Median :2266   Median :35.00   Median :4933   
##  Mean   :131292   Mean   :2747   Mean   :29.33   Mean   :4821   
##  3rd Qu.:183502   3rd Qu.:3780   3rd Qu.:41.00   3rd Qu.:8152   
##  Max.   :242705   Max.   :5089   Max.   :48.00   Max.   :9042   
##  Profit_Margin   
##  Min.   :0.0610  
##  1st Qu.:0.1420  
##  Median :0.1899  
##  Mean   :0.1836  
##  3rd Qu.:0.2321  
##  Max.   :0.3020
# Useful helpers
w_mean <- function(x, w) {
  if (length(x) != length(w)) return(NA_real_)
  weighted.mean(x, w, na.rm = TRUE)
}

2) Customer Traffic and Revenue Conversion

We define conversion as Sales per Customer at the store level.

conv_by_state <- df %>%
  group_by(State) %>%
  summarise(
    median_spc = median(sales_per_customer, na.rm = TRUE),
    mean_spc = mean(sales_per_customer, na.rm = TRUE),
    stores = n(),
    .groups = "drop"
  )

# Identify high/low conversion states via quartiles of median_spc
cutoffs <- quantile(conv_by_state$median_spc, probs = c(0.25, 0.75), na.rm = TRUE)
conv_by_state <- conv_by_state %>%
  mutate(
    conversion_band = case_when(
      median_spc >= cutoffs[[2]] ~ "High-conversion",
      median_spc <= cutoffs[[1]] ~ "Low-conversion",
      TRUE ~ "Mid"
    )
  )

conv_by_state %>%
  arrange(desc(median_spc)) %>%
  gt() %>%
  fmt_currency(columns = c(median_spc, mean_spc)) %>%
  tab_header(title = "Sales per Customer by State (Median & Mean)")
Sales per Customer by State (Median & Mean)
State median_spc mean_spc stores conversion_band
OH $49.31 $66.35 30 High-conversion
KY $40.87 $54.91 30 Mid
PA $34.83 $55.55 30 Low-conversion

Store-level relationship between customers and sales.

ggplot(df, aes(Customer_Count, Sales, color = State)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE, linewidth = 1) +
  scale_y_continuous(labels = dollar_format()) +
  scale_x_continuous(labels = label_number(big.mark=",")) +
  labs(x = "Customer Count", y = "Sales (USD)", title = "Customer Traffic vs. Sales (by State)") +
  facet_wrap(~ State, scales = "free")
Sales vs. Customers per store, with OLS fit per state.

Sales vs. Customers per store, with OLS fit per state.

Do some store types convert better?

conv_by_type_state <- df %>%
  group_by(State, Store_Type) %>%
  summarise(
    median_spc = median(sales_per_customer, na.rm = TRUE),
    mean_spc = mean(sales_per_customer, na.rm = TRUE),
    .groups = "drop"
  )

ggplot(conv_by_type_state, aes(Store_Type, median_spc, fill = Store_Type)) +
  geom_col() +
  scale_y_continuous(labels = dollar_format()) +
  labs(x = NULL, y = "Median Sales per Customer", title = "Conversion by Store Type and State") +
  facet_wrap(~ State, nrow = 1) +
  theme(legend.position = "none")

3) Store Efficiency Metrics

Metrics: Sales per Employee and Sales per Square Foot.

store_year <- df %>%
  transmute(
    Store_ID, State, Year, Store_Type, Sales, Profit_Margin,
    sales_per_employee,
    sales_per_sqft
  )

# Top performers per year (top 10% by metric)
top_decile_flag <- function(x) x >= quantile(x, 0.9, na.rm = TRUE)
bottom_quartile_flag <- function(x) x <= quantile(x, 0.25, na.rm = TRUE)

store_year_flags <- store_year %>%
  group_by(Year) %>%
  mutate(
    top_spe = top_decile_flag(sales_per_employee),
    top_spsf = top_decile_flag(sales_per_sqft),
    bottom_spsf = bottom_quartile_flag(sales_per_sqft),
    bottom_margin = bottom_quartile_flag(Profit_Margin)
  ) %>%
  ungroup()

# Consistent top performers: appear in top decile for SPSF at least 2 years
consistent_top_spsf <- store_year_flags %>%
  group_by(Store_ID) %>%
  summarise(top_years = sum(top_spsf, na.rm = TRUE), .groups = "drop") %>%
  filter(top_years >= 2) %>%
  inner_join(distinct(store_year, Store_ID, State, Store_Type), by = "Store_ID")

consistent_top_spsf %>%
  arrange(desc(top_years)) %>%
  gt() %>%
  tab_header(title = "Consistent Top Performers (Sales per Sq Ft)") %>%
  cols_label(top_years = "Years in Top Decile")
Consistent Top Performers (Sales per Sq Ft)
Store_ID Years in Top Decile State Store_Type
6 6 KY Urban
6 6 OH Urban
6 6 PA Urban
1 3 KY Urban
1 3 OH Urban
1 3 PA Suburban
# Consistent underperformers: bottom quartile for both SPSF and Profit Margin in same year; flag if happens >=2 years
underperf <- store_year_flags %>%
  mutate(under_both = bottom_spsf & bottom_margin) %>%
  group_by(Store_ID) %>%
  summarise(under_years = sum(under_both, na.rm = TRUE), .groups = "drop") %>%
  filter(under_years >= 2) %>%
  inner_join(distinct(store_year, Store_ID, State, Store_Type), by = "Store_ID")

underperf %>%
  arrange(desc(under_years)) %>%
  gt() %>%
  tab_header(title = "Consistent Underperformers (Low SPSF & Low Profit Margin)")
Consistent Underperformers (Low SPSF & Low Profit Margin)
Store_ID under_years State Store_Type
10 6 KY Suburban
10 6 OH Suburban
10 6 PA Rural

Distribution of efficiency metrics by state and year.

p1 <- ggplot(store_year, aes(State, sales_per_employee, fill = State)) +
  geom_boxplot(outlier.alpha = 0.3) +
  scale_y_continuous(labels = dollar_format()) +
  labs(x = NULL, y = "Sales per Employee", title = "Sales per Employee") +
  theme(legend.position = "none") +
  facet_wrap(~ Year)

p2 <- ggplot(store_year, aes(State, sales_per_sqft, fill = State)) +
  geom_boxplot(outlier.alpha = 0.3) +
  scale_y_continuous(labels = dollar_format()) +
  labs(x = NULL, y = "Sales per Sq Ft", title = "Sales per Square Foot") +
  theme(legend.position = "none") +
  facet_wrap(~ Year)

p1
Sales per Employee and Sales per Sq Ft by state/year.

Sales per Employee and Sales per Sq Ft by state/year.

p2
Sales per Employee and Sales per Sq Ft by state/year.

Sales per Employee and Sales per Sq Ft by state/year.

4) Performance by Store Type

type_perf <- df %>%
  group_by(State, Store_Type) %>%
  summarise(
    median_sales = median(Sales, na.rm = TRUE),
    avg_profit_margin_w = w_mean(Profit_Margin, Sales),
    .groups = "drop"
  )

type_perf %>%
  arrange(State, desc(median_sales)) %>%
  gt() %>%
  fmt_currency(columns = median_sales) %>%
  fmt_percent(columns = avg_profit_margin_w, decimals = 1) %>%
  tab_header(title = "Performance by Store Type (per-state)")
Performance by Store Type (per-state)
State Store_Type median_sales avg_profit_margin_w
KY Rural $179,895.00 18.3%
KY Suburban $125,452.00 18.5%
KY Urban $60,345.00 18.8%
OH Rural $209,904.00 16.7%
OH Suburban $139,261.00 17.3%
OH Urban $73,661.00 17.4%
PA Suburban $201,446.00 25.6%
PA Rural $125,893.00 13.6%
PA Urban $65,008.00 19.8%
ggplot(type_perf, aes(Store_Type, median_sales, fill = Store_Type)) +
  geom_col() +
  scale_y_continuous(labels = dollar_format()) +
  labs(x = NULL, y = "Median Store Sales", title = "Median Sales by Store Type") +
  facet_wrap(~ State, nrow = 1) +
  theme(legend.position = "none")

ggplot(type_perf, aes(Store_Type, avg_profit_margin_w, fill = Store_Type)) +
  geom_col() +
  scale_y_continuous(labels = percent_format(accuracy = 0.1)) +
  labs(x = NULL, y = "Avg Profit Margin (weighted)", title = "Profit Margin by Store Type") +
  facet_wrap(~ State, nrow = 1) +
  theme(legend.position = "none")

5) State Rankings Over Time

state_year_rank <- df %>%
  group_by(State, Year) %>%
  summarise(
    total_sales = sum(Sales, na.rm = TRUE),
    avg_profit_margin_w = w_mean(Profit_Margin, Sales),
    total_area = sum(Store_Area_sqft, na.rm = TRUE),
    sales_per_sqft_state = ifelse(total_area > 0, total_sales / total_area, NA_real_),
    .groups = "drop"
  ) %>%
  group_by(Year) %>%
  mutate(
    rank_total_sales = rank(-total_sales, ties.method = "min"),
    rank_profit_margin = rank(-avg_profit_margin_w, ties.method = "min"),
    rank_sales_per_sqft = rank(-sales_per_sqft_state, ties.method = "min")
  ) %>%
  ungroup()

state_year_rank %>%
  arrange(Year, rank_total_sales) %>%
  select(Year, State, rank_total_sales, rank_profit_margin, rank_sales_per_sqft) %>%
  gt(groupname_col = "Year") %>%
  cols_label(
    State = "State",
    rank_total_sales = "Rank: Total Sales",
    rank_profit_margin = "Rank: Avg Profit Margin",
    rank_sales_per_sqft = "Rank: Sales per Sq Ft"
  ) %>%
  tab_header(title = "State Rankings by Year")
State Rankings by Year
State Rank: Total Sales Rank: Avg Profit Margin Rank: Sales per Sq Ft
2021
KY 1 2 1
OH 2 3 2
PA 3 1 3
2022
OH 1 3 1
PA 2 2 2
KY 3 1 3
2023
OH 1 3 1
PA 2 2 2
KY 3 1 3

Appendix: Derived datasets saved (optional)

# write_csv(state_year, "out_state_year.csv")
# write_csv(conv_by_state, "out_conversion_state.csv")
# write_csv(store_year_flags, "out_store_year_flags.csv")
# write_csv(state_year_rank, "out_state_year_rank.csv")

Session Info

sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sequoia 15.0.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] broom_1.0.9     gt_1.0.0        glue_1.8.0      scales_1.4.0   
##  [5] lubridate_1.9.4 forcats_1.0.0   stringr_1.5.2   dplyr_1.1.4    
##  [9] purrr_1.1.0     readr_2.1.5     tidyr_1.3.1     tibble_3.3.0   
## [13] ggplot2_3.5.2   tidyverse_2.0.0
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.10        generics_0.1.4     xml2_1.4.0         lattice_0.22-7    
##  [5] stringi_1.8.7      hms_1.1.3          digest_0.6.37      magrittr_2.0.3    
##  [9] evaluate_1.0.5     grid_4.5.1         timechange_0.3.0   RColorBrewer_1.1-3
## [13] fastmap_1.2.0      Matrix_1.7-3       jsonlite_2.0.0     backports_1.5.0   
## [17] mgcv_1.9-3         jquerylib_0.1.4    cli_3.6.5          rlang_1.1.6       
## [21] crayon_1.5.3       bit64_4.6.0-1      splines_4.5.1      withr_3.0.2       
## [25] cachem_1.1.0       yaml_2.3.10        tools_4.5.1        parallel_4.5.1    
## [29] tzdb_0.5.0         vctrs_0.6.5        R6_2.6.1           lifecycle_1.0.4   
## [33] bit_4.6.0          vroom_1.6.5        pkgconfig_2.0.3    pillar_1.11.0     
## [37] bslib_0.9.0        gtable_0.3.6       xfun_0.53          tidyselect_1.2.1  
## [41] rstudioapi_0.17.1  knitr_1.50         farver_2.1.2       nlme_3.1-168      
## [45] htmltools_0.5.8.1  rmarkdown_2.29     labeling_0.4.3     compiler_4.5.1