1 Executive Summary

This report analyzes production and profitability through the lenses of break-even thresholds, profit modeling, customer value segmentation, and statistical outlier detection. It blends daily and monthly operational performance with customer-centric insight.

Key Highlights: - #r round(mean(df$units_produced > 100) * 100, 1)`% of production days exceed break-even - Contribution margin provides clearer visibility than flat accounting cost models - RFM segmentation identifies top-tier customers and at-risk segments

2 Data Overview & Methodology

library(tidyverse)
library(lubridate)
library(scales)
library(knitr)
# Load and clean data
df <- read.csv("better_sample_data.csv")
df$date <- as.Date(df$date)

# Cost assumptions
fixed_cost <- 1000
variable_cost <- 15
mean_price <- mean(df$item_price)
daily_be <- round(fixed_cost / (mean_price - variable_cost))

Note - Period: Jan 2023 – Jan 2024 -Granularity: Daily transactions -Key fields: customer_id, date, units_produced, item_price, revenue, delivery_mode - Cost structure: Fixed: $1,000/month (distributed) Variable: $15/unit -Break-even: 100 units/day (approx.)

3 Break- even Performance

3.1 Monthly Production vs Break- Even Threshold

df_month <- df %>%
  mutate(month = floor_date(date, "month")) %>%
  group_by(month) %>%
  summarise(units_produced = sum(units_produced))

ggplot(df_month, aes(x = month, y = units_produced)) +
  geom_col(fill = "steelblue") +
  geom_hline(yintercept = 3000, linetype = "dashed", color = "red") +
  labs(title = "Monthly Production vs Estimated Break-Even (3000 units)",
       x = "Month", y = "Units Produced") +
  theme_minimal()

3.2 Daily production vs 100 BE threshold

ggplot(df, aes(x = date, y = units_produced)) +
  geom_col(aes(fill = units_produced >= 100), show.legend = FALSE) +
  scale_fill_manual(values = c("TRUE" = "forestgreen", "FALSE" = "firebrick")) +
  geom_hline(yintercept = 100, color = "black", linetype = "dashed") +
  labs(title = "Daily Units Produced vs Break-Even Threshold (100 Units)",
       x = "Date", y = "Units Produced") +
  theme_minimal()

4 Profit Analysis

4.1 Profit calculation models

total_units <- sum(df$units_produced)
fixed_cost <- 1000
fixed_cost_per_unit <- fixed_cost / total_units

df <- df %>%
  mutate(
    fixed_cost_scaled = fixed_cost_per_unit * units_produced,
    variable_cost_component = 15 * units_produced,
    total_cost_scaled = fixed_cost_scaled + variable_cost_component,
    revenue = item_price * units_produced,
    profit_accounting = revenue - total_costs,
    profit_contribution = revenue - total_cost_scaled
  )

4.2 Profit by Customer

profit_long <- df %>%
  group_by(customer_id) %>%
  summarise(
    accounting = sum(profit_accounting),
    contribution = sum(profit_contribution)
  ) %>%
  pivot_longer(cols = c(accounting, contribution), names_to = "type", values_to = "profit")

ggplot(profit_long, aes(x = reorder(customer_id, profit), y = profit, fill = type)) +
  geom_col(position = "dodge") +
  coord_flip() +
  scale_fill_manual(values = c("accounting" = "firebrick", "contribution" = "seagreen")) +
  labs(title = "Profit by Customer: Accounting vs Contribution",
       x = "Customer", y = "Profit", fill = "Profit Type") +
  theme_minimal()

4.3 Average Margin by Delivery Mode

df %>%
  group_by(delivery_mode) %>%
  summarise(
    accounting_margin = mean(profit_accounting / revenue, na.rm = TRUE) * 100,
    contribution_margin = mean(profit_contribution / revenue, na.rm = TRUE) * 100
  ) %>%
  pivot_longer(cols = c(accounting_margin, contribution_margin),
               names_to = "type", values_to = "margin") %>%
  ggplot(aes(x = reorder(delivery_mode, margin), y = margin, fill = type)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = c("accounting_margin" = "tomato", "contribution_margin" = "forestgreen")) +
  labs(title = "Average Profit Margin by Delivery Mode",
       x = "Delivery Mode", y = "Margin (%)", fill = "Model") +
  theme_minimal()

5 Profit Outlier Analysis

5.1 Profit Distribution (Boxplots)

df_long_profit <- df %>%
  pivot_longer(cols = c(profit_accounting, profit_contribution),
               names_to = "profit_type", values_to = "profit_value")

ggplot(df_long_profit, aes(x = profit_type, y = profit_value, fill = profit_type)) +
  geom_boxplot(outlier.colour = "red", outlier.shape = 4, alpha = 0.6) +
  scale_fill_manual(values = c("profit_accounting" = "salmon", "profit_contribution" = "seagreen")) +
  labs(title = "πŸ“¦ Profit Distribution & Outlier Detection",
       x = "Profit Model", y = "Profit per Transaction") +
  theme_minimal()

5.2 Flag Outliers with 1.5*IQR

outlier_bounds <- df_long_profit %>%
  group_by(profit_type) %>%
  summarise(
    Q1 = quantile(profit_value, 0.25, na.rm = TRUE),
    Q3 = quantile(profit_value, 0.75, na.rm = TRUE),
    IQR = Q3 - Q1,
    lower = Q1 - 1.5 * IQR,
    upper = Q3 + 1.5 * IQR
  )

df_outliers <- df_long_profit %>%
  left_join(outlier_bounds, by = "profit_type") %>%
  filter(profit_value < lower | profit_value > upper)

head(df_outliers)
## # A tibble: 0 Γ— 17
## # … with 17 variables: customer_id <int>, date <date>, units_produced <int>,
## #   delivery_mode <chr>, item_price <int>, total_costs <dbl>,
## #   fixed_cost_scaled <dbl>, variable_cost_component <dbl>,
## #   total_cost_scaled <dbl>, revenue <int>, profit_type <chr>,
## #   profit_value <dbl>, Q1 <dbl>, Q3 <dbl>, IQR <dbl>, lower <dbl>, upper <dbl>

Insight: Extreme losses are primarily under the accounting model and often tied to small transactions with high fixed cost load. The contribution model minimizes variance and outliers.

6 Customer Segmentation (RFM)

6.1 RFM scoring pipeline

latest_date <- max(df$date)

df_rfm <- df %>%
  group_by(customer_id) %>%
  summarise(
    recency = as.numeric(latest_date - max(date)),
    frequency = n_distinct(date),
    monetary = sum(revenue)
  ) %>%
  mutate(
    r_score = ntile(-recency, 5),
    f_score = ntile(frequency, 5),
    m_score = ntile(monetary, 5),
    rfm_score = r_score + f_score + m_score,
    segment = case_when(
      rfm_score >= 13 ~ "Champions",
      rfm_score >= 10 ~ "Loyal",
      rfm_score >= 7  ~ "Potential",
      TRUE            ~ "At-Risk"
    )
  )

df_rfm %>%
  count(segment) %>%
  ggplot(aes(x = reorder(segment, -n), y = n, fill = segment)) +
  geom_col(width = 0.6, show.legend = FALSE) +
  labs(title = "Customer Distribution by RFM Segment",
       x = "Segment", y = "Number of Customers") +
  scale_fill_manual(values = c(
    "Champions" = "#2E8B57",
    "Loyal"     = "#1E90FF",
    "Potential" = "#FFD700",
    "At-Risk"   = "#DC143C"
  )) +
  theme_minimal(base_size = 13)

6.2 Score Distribution

ggplot(df_rfm, aes(x = rfm_score)) +
  geom_histogram(binwidth = 1, fill = "steelblue", color = "white") +
  labs(title = "πŸ“Š RFM Score Distribution", x = "RFM Score (3–15)", y = "Number of Customers") +
  theme_minimal()

6.3 Segment Profitabilty

rfm_segment_perf <- df %>%
  left_join(df_rfm, by = "customer_id") %>%
  group_by(segment) %>%
  summarise(
    total_revenue = sum(revenue, na.rm = TRUE),
    accounting_profit = sum(profit_accounting, na.rm = TRUE),
    contribution_profit = sum(profit_contribution, na.rm = TRUE),
    avg_units = mean(units_produced),
    .groups = "drop"
  )

kable(rfm_segment_perf, caption = "πŸ’° Profitability by RFM Segment")
πŸ’° Profitability by RFM Segment
segment total_revenue accounting_profit contribution_profit avg_units
At-Risk 40535 6630.373 16939.79 156.7000
Champions 136340 20373.733 53237.29 167.2424
Loyal 135425 21750.123 52141.60 184.3667
Potential 121615 22629.481 50031.33 176.0741

Insight: Champions and Loyal customers consistently deliver stronger contribution margins and volume. At-Risk segments show weaker economics and may warrant reactivation efforts.


7 Strategic Insights & Recommendations

This report has surfaced key operational and customer-level patterns. Here are the most actionable takeaways:


8 Appendix

8.1 Sample Data Preview

head(df)
##   customer_id       date units_produced delivery_mode item_price total_costs
## 1        1013 2023-09-21            175        Pickup         30    3593.048
## 2        1012 2023-06-03            255       Courier         30    4834.910
## 3        1004 2023-09-26             96       Courier         30    2442.833
## 4        1026 2023-01-16            261        Pickup         25    4931.465
## 5        1001 2023-12-27            217       Courier         25    4317.508
## 6        1023 2023-08-19            201   Third-Party         30    4040.686
##   fixed_cost_scaled variable_cost_component total_cost_scaled revenue
## 1         10.074262                    2625          2635.074    5250
## 2         14.679638                    3825          3839.680    7650
## 3          5.526452                    1440          1445.526    2880
## 4         15.025042                    3915          3930.025    6525
## 5         12.492085                    3255          3267.492    5425
## 6         11.571009                    3015          3026.571    6030
##   profit_accounting profit_contribution
## 1         1656.9521            2614.926
## 2         2815.0898            3810.320
## 3          437.1666            1434.474
## 4         1593.5347            2594.975
## 5         1107.4920            2157.508
## 6         1989.3138            3003.429

8.2 Dataset Summary Stats

summary(df)
##   customer_id        date            units_produced  delivery_mode     
##  Min.   :1001   Min.   :2023-01-05   Min.   : 51.0   Length:100        
##  1st Qu.:1011   1st Qu.:2023-04-25   1st Qu.:112.8   Class :character  
##  Median :1017   Median :2023-07-17   Median :176.0   Mode  :character  
##  Mean   :1016   Mean   :2023-07-11   Mean   :173.7                     
##  3rd Qu.:1023   3rd Qu.:2023-10-11   3rd Qu.:237.0                     
##  Max.   :1030   Max.   :2023-12-27   Max.   :300.0                     
##    item_price     total_costs   fixed_cost_scaled variable_cost_component
##  Min.   :20.00   Min.   :1569   Min.   : 2.936    Min.   : 765           
##  1st Qu.:20.00   1st Qu.:2743   1st Qu.: 6.491    1st Qu.:1691           
##  Median :25.00   Median :3618   Median :10.132    Median :2640           
##  Mean   :24.95   Mean   :3625   Mean   :10.000    Mean   :2606           
##  3rd Qu.:30.00   3rd Qu.:4581   3rd Qu.:13.643    3rd Qu.:3555           
##  Max.   :30.00   Max.   :5569   Max.   :17.270    Max.   :4500           
##  total_cost_scaled    revenue     profit_accounting profit_contribution
##  Min.   : 767.9    Min.   :1020   Min.   :-795.13   Min.   : 252.1     
##  1st Qu.:1697.7    1st Qu.:2925   1st Qu.: -92.54   1st Qu.: 956.4     
##  Median :2650.1    Median :4380   Median : 412.14   Median :1397.1     
##  Mean   :2615.7    Mean   :4339   Mean   : 713.84   Mean   :1723.5     
##  3rd Qu.:3568.6    3rd Qu.:5421   3rd Qu.:1377.61   3rd Qu.:2426.0     
##  Max.   :4517.3    Max.   :8820   Max.   :3425.28   Max.   :4393.1

8.3 Outlier Summary table(Sample)

df_outliers %>%
  arrange(desc(abs(profit_value))) %>%
  head(10)
## # A tibble: 0 Γ— 17
## # … with 17 variables: customer_id <int>, date <date>, units_produced <int>,
## #   delivery_mode <chr>, item_price <int>, total_costs <dbl>,
## #   fixed_cost_scaled <dbl>, variable_cost_component <dbl>,
## #   total_cost_scaled <dbl>, revenue <int>, profit_type <chr>,
## #   profit_value <dbl>, Q1 <dbl>, Q3 <dbl>, IQR <dbl>, lower <dbl>, upper <dbl>

8.4 RFM Score Distribution table

table(df_rfm$segment)
## 
##   At-Risk Champions     Loyal Potential 
##         7         6         7         9

8.5 Code Snippet: Break-even formula

# Daily BE threshold
fixed_cost <- 1000
variable_cost <- 15
mean_price <- mean(df$item_price)
daily_be <- round(fixed_cost / (mean_price - variable_cost))