How to run this report
Data loading & preparation
1) Sales and Profit Trends (2021–2023)
2) Customer Traffic and Revenue Conversion
3) Store Efficiency Metrics
4) Performance by Store Type
5) State Rankings Over Time
Appendix: Derived datasets saved (optional)
Session Info

How to run this report

Put all your CSVs in a folder called data/ (default) or change the params$data_dir above.
CSVs must include these columns (case-sensitive):
Store_ID, State, Year, Sales, Customer_Count, Employees, Store_Area_sqft, Profit_Margin, Store_Type.
Knit to PDF. If you don’t have a LaTeX engine installed, install tinytex with:

# run once in your R session (not in knit)
install.packages("tinytex")
tinytex::install_tinytex()

Data loading & preparation

data_dir <- params$data_dir
stopifnot(dir.exists(data_dir))

csv_paths <- list.files(data_dir, pattern = "(?i)\\.csv$", full.names = TRUE)
if (length(csv_paths) == 0) stop(glue("No CSV files found in '{data_dir}'. Place your 9 files there."))

raw <- csv_paths %>% 
  map_dfr(\(p) readr::read_csv(p, show_col_types = FALSE))

# Check required columns
required_cols <- c("Store_ID","State","Year","Sales","Customer_Count","Employees","Store_Area_sqft","Profit_Margin","Store_Type")
missing <- setdiff(required_cols, names(raw))
if (length(missing) > 0) stop(glue("Missing required columns: {paste(missing, collapse=', ')}"))

df <- raw %>%
  mutate(
    Year = as.integer(Year),
    State = as.factor(State),
    Store_Type = as.factor(Store_Type),
    sales_per_customer = ifelse(Customer_Count > 0, Sales / Customer_Count, NA_real_),
    sales_per_employee = ifelse(Employees > 0, Sales / Employees, NA_real_),
    sales_per_sqft = ifelse(Store_Area_sqft > 0, Sales / Store_Area_sqft, NA_real_),
    profit_dollars = Sales * Profit_Margin
  )

# Light sanity check
summary(select(df, Sales, Customer_Count, Employees, Store_Area_sqft, Profit_Margin))

##      Sales        Customer_Count   Employees     Store_Area_sqft
##  Min.   : 48510   Min.   :1196   Min.   : 6.00   Min.   :1137   
##  1st Qu.: 66134   1st Qu.:1782   1st Qu.:14.00   1st Qu.:2185   
##  Median :137251   Median :2266   Median :35.00   Median :4933   
##  Mean   :131292   Mean   :2747   Mean   :29.33   Mean   :4821   
##  3rd Qu.:183502   3rd Qu.:3780   3rd Qu.:41.00   3rd Qu.:8152   
##  Max.   :242705   Max.   :5089   Max.   :48.00   Max.   :9042   
##  Profit_Margin   
##  Min.   :0.0610  
##  1st Qu.:0.1420  
##  Median :0.1899  
##  Mean   :0.1836  
##  3rd Qu.:0.2321  
##  Max.   :0.3020

# Useful helpers
w_mean <- function(x, w) {
  if (length(x) != length(w)) return(NA_real_)
  weighted.mean(x, w, na.rm = TRUE)
}

1) Sales and Profit Trends (2021–2023)

state_year <- df %>%
  group_by(State, Year) %>%
  summarise(
    total_sales = sum(Sales, na.rm = TRUE),
    avg_profit_margin_unweighted = mean(Profit_Margin, na.rm = TRUE),
    avg_profit_margin_weighted = w_mean(Profit_Margin, Sales),
    total_area = sum(Store_Area_sqft, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(sales_per_sqft_state = ifelse(total_area > 0, total_sales / total_area, NA_real_))

# Growth via linear slope of total sales ~ year per state
sales_growth <- state_year %>%
  group_by(State) %>%
  do(tidy(lm(total_sales ~ Year, data = .))) %>%
  ungroup() %>%
  filter(term == "Year") %>%
  arrange(desc(estimate)) %>%
  rename(sales_growth_slope = estimate, slope_se = std.error) %>%
  mutate(rank_by_growth = row_number())

sales_growth %>%
  select(State, sales_growth_slope, slope_se, rank_by_growth) %>%
  gt() %>%
  fmt_number(columns = c(sales_growth_slope, slope_se), decimals = 0) %>%
  tab_header(title = "Sales Growth Rank (slope of Total Sales ~ Year)")

State	sales_growth_slope	slope_se	rank_by_growth
Sales Growth Rank (slope of Total Sales ~ Year)
OH	164,891	8,413	1
PA	123,691	12,256	2
KY	−52,369	580	3

Total annual sales by state and year.

ggplot(state_year, aes(Year, total_sales, color = State, group = State)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  scale_y_continuous(labels = dollar_format(accuracy = 1)) +
  scale_x_continuous(breaks = sort(unique(state_year$Year))) +
  labs(x = NULL, y = "Total Sales (USD)", title = "Total Annual Sales by State")

Total annual sales by state (sum of stores).

Average profit margin (sales-weighted) by state and year.

ggplot(state_year, aes(Year, avg_profit_margin_weighted, color = State, group = State)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  scale_y_continuous(labels = percent_format(accuracy = 0.1), limits = c(0, NA)) +
  scale_x_continuous(breaks = sort(unique(state_year$Year))) +
  labs(x = NULL, y = "Avg Profit Margin (weighted)", title = "Average Profit Margin by State")

Average profit margin by state (sales-weighted).

2) Customer Traffic and Revenue Conversion

We define conversion as Sales per Customer at the store level.

conv_by_state <- df %>%
  group_by(State) %>%
  summarise(
    median_spc = median(sales_per_customer, na.rm = TRUE),
    mean_spc = mean(sales_per_customer, na.rm = TRUE),
    stores = n(),
    .groups = "drop"
  )

# Identify high/low conversion states via quartiles of median_spc
cutoffs <- quantile(conv_by_state$median_spc, probs = c(0.25, 0.75), na.rm = TRUE)
conv_by_state <- conv_by_state %>%
  mutate(
    conversion_band = case_when(
      median_spc >= cutoffs[[2]] ~ "High-conversion",
      median_spc <= cutoffs[[1]] ~ "Low-conversion",
      TRUE ~ "Mid"
    )
  )

conv_by_state %>%
  arrange(desc(median_spc)) %>%
  gt() %>%
  fmt_currency(columns = c(median_spc, mean_spc)) %>%
  tab_header(title = "Sales per Customer by State (Median & Mean)")

State	median_spc	mean_spc	stores	conversion_band
Sales per Customer by State (Median & Mean)
OH	$49.31	$66.35	30	High-conversion
KY	$40.87	$54.91	30	Mid
PA	$34.83	$55.55	30	Low-conversion

Store-level relationship between customers and sales.

ggplot(df, aes(Customer_Count, Sales, color = State)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE, linewidth = 1) +
  scale_y_continuous(labels = dollar_format()) +
  scale_x_continuous(labels = label_number(big.mark=",")) +
  labs(x = "Customer Count", y = "Sales (USD)", title = "Customer Traffic vs. Sales (by State)") +
  facet_wrap(~ State, scales = "free")

Sales vs. Customers per store, with OLS fit per state.

Do some store types convert better?

conv_by_type_state <- df %>%
  group_by(State, Store_Type) %>%
  summarise(
    median_spc = median(sales_per_customer, na.rm = TRUE),
    mean_spc = mean(sales_per_customer, na.rm = TRUE),
    .groups = "drop"
  )

ggplot(conv_by_type_state, aes(Store_Type, median_spc, fill = Store_Type)) +
  geom_col() +
  scale_y_continuous(labels = dollar_format()) +
  labs(x = NULL, y = "Median Sales per Customer", title = "Conversion by Store Type and State") +
  facet_wrap(~ State, nrow = 1) +
  theme(legend.position = "none")

3) Store Efficiency Metrics

Metrics: Sales per Employee and Sales per Square Foot.

store_year <- df %>%
  transmute(
    Store_ID, State, Year, Store_Type, Sales, Profit_Margin,
    sales_per_employee,
    sales_per_sqft
  )

# Top performers per year (top 10% by metric)
top_decile_flag <- function(x) x >= quantile(x, 0.9, na.rm = TRUE)
bottom_quartile_flag <- function(x) x <= quantile(x, 0.25, na.rm = TRUE)

store_year_flags <- store_year %>%
  group_by(Year) %>%
  mutate(
    top_spe = top_decile_flag(sales_per_employee),
    top_spsf = top_decile_flag(sales_per_sqft),
    bottom_spsf = bottom_quartile_flag(sales_per_sqft),
    bottom_margin = bottom_quartile_flag(Profit_Margin)
  ) %>%
  ungroup()

# Consistent top performers: appear in top decile for SPSF at least 2 years
consistent_top_spsf <- store_year_flags %>%
  group_by(Store_ID) %>%
  summarise(top_years = sum(top_spsf, na.rm = TRUE), .groups = "drop") %>%
  filter(top_years >= 2) %>%
  inner_join(distinct(store_year, Store_ID, State, Store_Type), by = "Store_ID")

consistent_top_spsf %>%
  arrange(desc(top_years)) %>%
  gt() %>%
  tab_header(title = "Consistent Top Performers (Sales per Sq Ft)") %>%
  cols_label(top_years = "Years in Top Decile")

Store_ID	Years in Top Decile	State	Store_Type
Consistent Top Performers (Sales per Sq Ft)
6	6	KY	Urban
6	6	OH	Urban
6	6	PA	Urban
1	3	KY	Urban
1	3	OH	Urban
1	3	PA	Suburban

# Consistent underperformers: bottom quartile for both SPSF and Profit Margin in same year; flag if happens >=2 years
underperf <- store_year_flags %>%
  mutate(under_both = bottom_spsf & bottom_margin) %>%
  group_by(Store_ID) %>%
  summarise(under_years = sum(under_both, na.rm = TRUE), .groups = "drop") %>%
  filter(under_years >= 2) %>%
  inner_join(distinct(store_year, Store_ID, State, Store_Type), by = "Store_ID")

underperf %>%
  arrange(desc(under_years)) %>%
  gt() %>%
  tab_header(title = "Consistent Underperformers (Low SPSF & Low Profit Margin)")

Store_ID	under_years	State	Store_Type
Consistent Underperformers (Low SPSF & Low Profit Margin)
10	6	KY	Suburban
10	6	OH	Suburban
10	6	PA	Rural

Distribution of efficiency metrics by state and year.

p1 <- ggplot(store_year, aes(State, sales_per_employee, fill = State)) +
  geom_boxplot(outlier.alpha = 0.3) +
  scale_y_continuous(labels = dollar_format()) +
  labs(x = NULL, y = "Sales per Employee", title = "Sales per Employee") +
  theme(legend.position = "none") +
  facet_wrap(~ Year)

p2 <- ggplot(store_year, aes(State, sales_per_sqft, fill = State)) +
  geom_boxplot(outlier.alpha = 0.3) +
  scale_y_continuous(labels = dollar_format()) +
  labs(x = NULL, y = "Sales per Sq Ft", title = "Sales per Square Foot") +
  theme(legend.position = "none") +
  facet_wrap(~ Year)

p1

Sales per Employee and Sales per Sq Ft by state/year.

p2

Sales per Employee and Sales per Sq Ft by state/year.

4) Performance by Store Type

type_perf <- df %>%
  group_by(State, Store_Type) %>%
  summarise(
    median_sales = median(Sales, na.rm = TRUE),
    avg_profit_margin_w = w_mean(Profit_Margin, Sales),
    .groups = "drop"
  )

type_perf %>%
  arrange(State, desc(median_sales)) %>%
  gt() %>%
  fmt_currency(columns = median_sales) %>%
  fmt_percent(columns = avg_profit_margin_w, decimals = 1) %>%
  tab_header(title = "Performance by Store Type (per-state)")

State	Store_Type	median_sales	avg_profit_margin_w
Performance by Store Type (per-state)
KY	Rural	$179,895.00	18.3%
KY	Suburban	$125,452.00	18.5%
KY	Urban	$60,345.00	18.8%
OH	Rural	$209,904.00	16.7%
OH	Suburban	$139,261.00	17.3%
OH	Urban	$73,661.00	17.4%
PA	Suburban	$201,446.00	25.6%
PA	Rural	$125,893.00	13.6%
PA	Urban	$65,008.00	19.8%

ggplot(type_perf, aes(Store_Type, median_sales, fill = Store_Type)) +
  geom_col() +
  scale_y_continuous(labels = dollar_format()) +
  labs(x = NULL, y = "Median Store Sales", title = "Median Sales by Store Type") +
  facet_wrap(~ State, nrow = 1) +
  theme(legend.position = "none")

ggplot(type_perf, aes(Store_Type, avg_profit_margin_w, fill = Store_Type)) +
  geom_col() +
  scale_y_continuous(labels = percent_format(accuracy = 0.1)) +
  labs(x = NULL, y = "Avg Profit Margin (weighted)", title = "Profit Margin by Store Type") +
  facet_wrap(~ State, nrow = 1) +
  theme(legend.position = "none")

5) State Rankings Over Time

state_year_rank <- df %>%
  group_by(State, Year) %>%
  summarise(
    total_sales = sum(Sales, na.rm = TRUE),
    avg_profit_margin_w = w_mean(Profit_Margin, Sales),
    total_area = sum(Store_Area_sqft, na.rm = TRUE),
    sales_per_sqft_state = ifelse(total_area > 0, total_sales / total_area, NA_real_),
    .groups = "drop"
  ) %>%
  group_by(Year) %>%
  mutate(
    rank_total_sales = rank(-total_sales, ties.method = "min"),
    rank_profit_margin = rank(-avg_profit_margin_w, ties.method = "min"),
    rank_sales_per_sqft = rank(-sales_per_sqft_state, ties.method = "min")
  ) %>%
  ungroup()

state_year_rank %>%
  arrange(Year, rank_total_sales) %>%
  select(Year, State, rank_total_sales, rank_profit_margin, rank_sales_per_sqft) %>%
  gt(groupname_col = "Year") %>%
  cols_label(
    State = "State",
    rank_total_sales = "Rank: Total Sales",
    rank_profit_margin = "Rank: Avg Profit Margin",
    rank_sales_per_sqft = "Rank: Sales per Sq Ft"
  ) %>%
  tab_header(title = "State Rankings by Year")

State	Rank: Total Sales	Rank: Avg Profit Margin	Rank: Sales per Sq Ft
State Rankings by Year
2021
KY	1	2	1
OH	2	3	2
PA	3	1	3
2022
OH	1	3	1
PA	2	2	2
KY	3	1	3
2023
OH	1	3	1
PA	2	2	2
KY	3	1	3

Appendix: Derived datasets saved (optional)

# write_csv(state_year, "out_state_year.csv")
# write_csv(conv_by_state, "out_conversion_state.csv")
# write_csv(store_year_flags, "out_store_year_flags.csv")
# write_csv(state_year_rank, "out_state_year_rank.csv")

Session Info

sessionInfo()

## R version 4.5.1 (2025-06-13)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sequoia 15.0.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] broom_1.0.9     gt_1.0.0        glue_1.8.0      scales_1.4.0   
##  [5] lubridate_1.9.4 forcats_1.0.0   stringr_1.5.2   dplyr_1.1.4    
##  [9] purrr_1.1.0     readr_2.1.5     tidyr_1.3.1     tibble_3.3.0   
## [13] ggplot2_3.5.2   tidyverse_2.0.0
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.10        generics_0.1.4     xml2_1.4.0         lattice_0.22-7    
##  [5] stringi_1.8.7      hms_1.1.3          digest_0.6.37      magrittr_2.0.3    
##  [9] evaluate_1.0.5     grid_4.5.1         timechange_0.3.0   RColorBrewer_1.1-3
## [13] fastmap_1.2.0      Matrix_1.7-3       jsonlite_2.0.0     backports_1.5.0   
## [17] mgcv_1.9-3         jquerylib_0.1.4    cli_3.6.5          rlang_1.1.6       
## [21] crayon_1.5.3       bit64_4.6.0-1      splines_4.5.1      withr_3.0.2       
## [25] cachem_1.1.0       yaml_2.3.10        tools_4.5.1        parallel_4.5.1    
## [29] tzdb_0.5.0         vctrs_0.6.5        R6_2.6.1           lifecycle_1.0.4   
## [33] bit_4.6.0          vroom_1.6.5        pkgconfig_2.0.3    pillar_1.11.0     
## [37] bslib_0.9.0        gtable_0.3.6       xfun_0.53          tidyselect_1.2.1  
## [41] rstudioapi_0.17.1  knitr_1.50         farver_2.1.2       nlme_3.1-168      
## [45] htmltools_0.5.8.1  rmarkdown_2.29     labeling_0.4.3     compiler_4.5.1

Case Study III: R — Putting it all together

Group: Fred Xiong, Ziqi Niu

September 16, 2025