Executive Summary

This report blends cost behavior modeling, break-even performance tracking, and RFM-based customer segmentation to guide strategic decisions. Insights include fixed/variable cost estimates, production performance vs financial thresholds, and customer-level value distribution.


Load and Prepare Data

df <- read.csv("better_sample_data.csv") %>% clean_names()
df$date <- as.Date(df$date)
tibble(df)
## # A tibble: 100 × 6
##    customer_id date       units_produced delivery_mode item_price total_costs
##          <int> <date>              <int> <chr>              <int>       <dbl>
##  1        1013 2023-09-21            175 Pickup                30       3593.
##  2        1012 2023-06-03            255 Courier               30       4835.
##  3        1004 2023-09-26             96 Courier               30       2443.
##  4        1026 2023-01-16            261 Pickup                25       4931.
##  5        1001 2023-12-27            217 Courier               25       4318.
##  6        1023 2023-08-19            201 Third-Party           30       4041.
##  7        1030 2023-02-18            180 Courier               20       3922.
##  8        1027 2023-06-24            236 Third-Party           20       4532.
##  9        1010 2023-07-15            267 Courier               30       4954.
## 10        1013 2023-05-22            233 Pickup                20       4565.
## # … with 90 more rows

Regression – Estimating Cost Structure

model <- lm(total_costs ~ units_produced, data = df)
summary(model)
## 
## Call:
## lm(formula = total_costs ~ units_produced, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -213.13  -76.91  -14.55   71.53  222.43 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1015.6223    27.4439   37.01   <2e-16 ***
## units_produced   15.0233     0.1467  102.40   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 101.8 on 98 degrees of freedom
## Multiple R-squared:  0.9907, Adjusted R-squared:  0.9906 
## F-statistic: 1.049e+04 on 1 and 98 DF,  p-value: < 2.2e-16
fixed_cost <- coef(model)[1]
variable_cost <- coef(model)[2]

cat("Estimated Fixed Cost: $", round(fixed_cost, 2), "\n")
## Estimated Fixed Cost: $ 1015.62
cat("Variable Cost per Unit: $", round(variable_cost, 2), "\n")
## Variable Cost per Unit: $ 15.02
ggplot(df, aes(x = units_produced, y = total_costs)) +
  geom_point(color = "steelblue") +
  geom_smooth(method = "lm", se = FALSE, color = "darkred") +
  labs(title = "Regression: Total Cost vs Units Produced", x = "Units", y = "Total Cost")


Revenue and Profit Calculation

df <- df %>%
  mutate(
    revenue = units_produced * item_price,
    profit = revenue - total_costs
  )
tibble(df)
## # A tibble: 100 × 8
##    customer_id date       units_produced delive…¹ item_…² total…³ revenue profit
##          <int> <date>              <int> <chr>      <int>   <dbl>   <int>  <dbl>
##  1        1013 2023-09-21            175 Pickup        30   3593.    5250 1657. 
##  2        1012 2023-06-03            255 Courier       30   4835.    7650 2815. 
##  3        1004 2023-09-26             96 Courier       30   2443.    2880  437. 
##  4        1026 2023-01-16            261 Pickup        25   4931.    6525 1594. 
##  5        1001 2023-12-27            217 Courier       25   4318.    5425 1107. 
##  6        1023 2023-08-19            201 Third-P…      30   4041.    6030 1989. 
##  7        1030 2023-02-18            180 Courier       20   3922.    3600 -322. 
##  8        1027 2023-06-24            236 Third-P…      20   4532.    4720  188. 
##  9        1010 2023-07-15            267 Courier       30   4954.    8010 3056. 
## 10        1013 2023-05-22            233 Pickup        20   4565.    4660   94.5
## # … with 90 more rows, and abbreviated variable names ¹​delivery_mode,
## #   ²​item_price, ³​total_costs
# Total fixed cost from regression
total_fixed_cost <- fixed_cost

# Calculate total units to scale fixed cost across
total_units <- sum(df$units_produced)

# Scaled fixed cost per unit
fixed_cost_per_unit <- total_fixed_cost / total_units

# Add accounting vs contribution profit
df <- df %>%
  mutate(
    variable_cost_component = variable_cost * units_produced,
    
    # Traditional accounting view (original)
    profit_accounting = revenue - (total_costs),
    
    # Contribution view: allocate fixed cost per unit
    fixed_cost_scaled = fixed_cost_per_unit * units_produced,
    total_cost_scaled = variable_cost_component + fixed_cost_scaled,
    profit_contribution = revenue - total_cost_scaled
  )
summary(df %>% select(profit_accounting, profit_contribution))
##  profit_accounting profit_contribution
##  Min.   :-795.13   Min.   : 250.8     
##  1st Qu.: -92.54   1st Qu.: 951.7     
##  Median : 412.14   Median :1394.9     
##  Mean   : 713.84   Mean   :1719.3     
##  3rd Qu.:1377.61   3rd Qu.:2420.1     
##  Max.   :3425.28   Max.   :4386.0
df_long <- df %>%
  select(date, customer_id, profit_accounting, profit_contribution) %>%
  pivot_longer(cols = starts_with("profit_"), names_to = "profit_type", values_to = "profit")

ggplot(df_long, aes(x = date, y = profit, color = profit_type)) +
  geom_line(alpha = 0.6) +
  labs(title = "📈 Profit View: Accounting vs Contribution Margin",
       x = "Date", y = "Profit", color = "Profit Type") +
  scale_color_manual(values = c("profit_accounting" = "firebrick", "profit_contribution" = "forestgreen")) +
  theme_minimal()


Break-Even Deep Dive

Daily Break-Even Tracking

daily_be <- round(fixed_cost / (mean(df$item_price) - variable_cost))

df_daily <- df %>%
  group_by(date) %>%
  summarise(units = sum(units_produced)) %>%
  mutate(status = ifelse(units >= daily_be, "Above BE", "Below BE"))

ggplot(df_daily, aes(x = date, y = units, fill = status)) +
  geom_col() +
  geom_hline(yintercept = daily_be, linetype = "dashed", color = "red") +
  scale_fill_manual(values = c("Above BE" = "forestgreen", "Below BE" = "tomato")) +
  labs(title = "📆 Daily Production vs Daily Break-Even", y = "Units", x = "Date")


Monthly Targeted Break-Even

df_monthly <- df %>%
  mutate(month = floor_date(date, "month")) %>%
  group_by(month) %>%
  summarise(
    units = sum(units_produced),
    work_days = n_distinct(date)
  ) %>%
  mutate(
    monthly_be_target = daily_be * work_days,
    performance_ratio = round(units / monthly_be_target * 100, 1),
    status = ifelse(units >= monthly_be_target, "Above BE", "Below BE")
  )
ggplot(df_monthly, aes(x = month)) +
  geom_col(aes(y = units, fill = status), width = 20) +
  geom_line(aes(y = monthly_be_target), color = "red", size = 1.2) +
  geom_text(aes(y = units + 150, label = units), size = 4) +
  labs(title = "📅 Monthly Production vs Dynamic Break-Even",
       y = "Units Produced", x = "Month") +
scale_fill_manual(
  name = "Performance",
  values = c("Above BE" = "forestgreen", "Below BE" = "firebrick"))+
  scale_x_date(date_breaks = "1 month", date_labels = "%b") +
  theme_minimal(base_size = 14)


Customer-Level Break-Even Status

unit_margin <- mean(df$item_price) - variable_cost

df_customer <- df %>%
  group_by(customer_id) %>%
  summarise(units = sum(units_produced)) %>%
  mutate(
    profit = units * unit_margin,
    status = ifelse(profit >= fixed_cost, "Break-Even Reached", "Below BE"),
    surplus = profit - fixed_cost
  )
tibble(df_customer)
## # A tibble: 29 × 5
##    customer_id units profit status             surplus
##          <int> <int>  <dbl> <chr>                <dbl>
##  1        1001   326  3236. Break-Even Reached   2220.
##  2        1002   122  1211. Break-Even Reached    195.
##  3        1003   519  5152. Break-Even Reached   4136.
##  4        1004   649  6442. Break-Even Reached   5427.
##  5        1005   906  8994. Break-Even Reached   7978.
##  6        1006   163  1618. Break-Even Reached    602.
##  7        1008   801  7951. Break-Even Reached   6936.
##  8        1009   469  4656. Break-Even Reached   3640.
##  9        1010   267  2650. Break-Even Reached   1635.
## 10        1011  1150 11416. Break-Even Reached  10400.
## # … with 19 more rows
ggplot(df_customer, aes(x = reorder(as.character(customer_id), surplus), y = surplus, fill = status)) +
  geom_col() +
  coord_flip() +
  scale_fill_manual(values = c("Break-Even Reached" = "forestgreen", "Below BE" = "firebrick")) +
  labs(title = "💼 Customer-Level Surplus Over Break-Even", x = "Customer", y = "Surplus")


RFM Segmentation

rfm <- df %>%
  group_by(customer_id) %>%
  summarise(
    recency = as.numeric(Sys.Date() - max(date)),
    frequency = n(),
    monetary = sum(revenue)
  ) %>%
  mutate(
    recency_score = ntile(-recency, 5),
    frequency_score = ntile(frequency, 5),
    monetary_score = ntile(monetary, 5),
    rfm_score = recency_score + frequency_score + monetary_score
  )
tibble(rfm)
## # A tibble: 29 × 8
##    customer_id recency frequency monetary recency_score freque…¹ monet…² rfm_s…³
##          <int>   <dbl>     <int>    <int>         <int>    <int>   <int>   <int>
##  1        1001     549         2     8695             5        1       2       8
##  2        1002     594         1     2440             4        1       1       6
##  3        1003     557         4    14160             5        3       3      11
##  4        1004     570         4    16910             4        3       3      10
##  5        1005     668         4    19580             2        4       4      10
##  6        1006     865         2     4360             1        1       1       3
##  7        1008     550         4    19890             5        4       4      13
##  8        1009     630         2    14070             2        2       3       7
##  9        1010     714         1     8010             1        1       1       3
## 10        1011     682         5    27650             2        4       5      11
## # … with 19 more rows, and abbreviated variable names ¹​frequency_score,
## #   ²​monetary_score, ³​rfm_score

Top-Tier RFM Customers vs Break-Even

top_customers <- rfm %>% filter(rfm_score >= 13)

top_contrib <- df %>% filter(customer_id %in% top_customers$customer_id)
units_top <- sum(top_contrib$units_produced)
break_even_target <- 260
pct_of_be <- round(100 * units_top / break_even_target, 1)

cat("Top-tier customers contributed", units_top, "units -", pct_of_be, "% of break-even target.\n")
## Top-tier customers contributed 5519 units - 2122.7 % of break-even target.

📈 Visual Performance Summary

ggplot(rfm, aes(x = rfm_score)) +
  geom_bar(fill = "steelblue") +
  labs(title = "RFM Score Distribution", x = "RFM Score", y = "Customer Count")

top_contrib %>%
  group_by(customer_id) %>%
  summarise(total_units = sum(units_produced)) %>%
  top_n(10) %>%
  ggplot(aes(x = reorder(as.character(customer_id), total_units), y = total_units)) +
  geom_col(fill = "goldenrod") +
  coord_flip() +
  labs(title = "Top 10 Customers by Units", x = "Customer ID", y = "Units Produced")

df_long <- df %>%
  group_by(customer_id) %>%
  summarise(
    revenue = sum(revenue),
    profit_accounting = sum(profit_accounting),
    profit_contribution = sum(profit_contribution)
  ) %>%
  pivot_longer(cols = starts_with("profit_"), names_to = "ProfitType", values_to = "Profit") %>%
  filter(revenue > 0)

ggplot(df_long, aes(x = reorder(customer_id, Profit), y = Profit, fill = ProfitType)) +
  geom_col(position = "dodge") +
  coord_flip() +
  scale_fill_manual(values = c("profit_accounting" = "tomato", "profit_contribution" = "forestgreen"),
                    labels = c("Accounting Profit", "Contribution Margin")) +
  labs(title = "💼 Customer-Level Profit Comparison",
       x = "Customer ID", y = "Total Profit", fill = "Profit View") +
  theme_minimal(base_size = 13)

df_delivery_summary <- df %>%
  group_by(delivery_mode) %>%
  summarise(
    avg_margin_accounting = mean(profit_accounting / revenue, na.rm = TRUE) * 100,
    avg_margin_contribution = mean(profit_contribution / revenue, na.rm = TRUE) * 100
  ) %>%
  pivot_longer(cols = starts_with("avg_margin"), names_to = "MarginType", values_to = "Margin")

ggplot(df_delivery_summary, aes(x = reorder(delivery_mode, Margin), y = Margin, fill = MarginType)) +
  geom_col(position = "dodge", width = 0.6) +
  scale_fill_manual(values = c("avg_margin_accounting" = "tomato", "avg_margin_contribution" = "seagreen"),
                    labels = c("Accounting", "Contribution")) +
  labs(title = "🚚 Average Profit Margin by Delivery Mode",
       x = "Delivery Mode", y = "Average Margin (%)", fill = "Margin Type") +
  theme_minimal(base_size = 13)


Conclusions

-Which customers are profit-positive under both models — and which ones flip from loss to gain when fixed costs are scaled

-Where pricing, cost control, or customer strategy might need attention - The operation has vastly exceeded break-even, both in aggregate and through top-tier customers. - RFM segmentation reveals concentrated customer value. - Daily and monthly break-even tracking can guide real-time ops and strategic decisions. - Recommend doubling down on high-tier segments and maintaining production efficiency.


Profit Outlier Detection & Distribution

# Reshape for easier boxplot comparison
df_long_profit <- df %>%
  pivot_longer(cols = c(profit_accounting, profit_contribution),
               names_to = "profit_type", values_to = "profit_value")

# Box Plot to identify outliers
ggplot(df_long_profit, aes(x = profit_type, y = profit_value, fill = profit_type)) +
  geom_boxplot(outlier.colour = "red", outlier.shape = 4, alpha = 0.6) +
  scale_fill_manual(values = c("profit_accounting" = "salmon", "profit_contribution" = "seagreen")) +
  labs(title = "📦 Profit Distribution by Type with Outliers Highlighted",
       x = "Profit Type", y = "Profit per Order") +
  theme_minimal(base_size = 13)

# Calculate bounds for each profit type
outlier_bounds <- df_long_profit %>%
  group_by(profit_type) %>%
  summarise(
    Q1 = quantile(profit_value, 0.25, na.rm = TRUE),
    Q3 = quantile(profit_value, 0.75, na.rm = TRUE),
    IQR = Q3 - Q1,
    lower = Q1 - 1.5 * IQR,
    upper = Q3 + 1.5 * IQR,
    .groups = "drop"
  )

# Join and flag
df_outliers <- df_long_profit %>%
  left_join(outlier_bounds, by = "profit_type") %>%
  filter(profit_value < lower | profit_value > upper)

# Preview outliers
head(df_outliers)
## # A tibble: 0 × 18
## # … with 18 variables: customer_id <int>, date <date>, units_produced <int>,
## #   delivery_mode <chr>, item_price <int>, total_costs <dbl>, revenue <int>,
## #   profit <dbl>, variable_cost_component <dbl>, fixed_cost_scaled <dbl>,
## #   total_cost_scaled <dbl>, profit_type <chr>, profit_value <dbl>, Q1 <dbl>,
## #   Q3 <dbl>, IQR <dbl>, lower <dbl>, upper <dbl>

interpretation

-This analysis reveals that many of the negative profit values are driven by outlier transactions, especially under the traditional accounting model. The contribution margin approach narrows the spread, reducing volatility from small-volume orders and disproportionate fixed cost assignment. These insights suggest that refining cost allocation and filtering edge cases can improve performance tracking clarity.