Sample Data: Cost Modeling & RFM Analysis

df <- read.csv("better_sample_data.csv") %>% clean_names()
df$date <- as.Date(df$date)
glimpse(df)

## Rows: 100
## Columns: 6
## $ customer_id    <int> 1013, 1012, 1004, 1026, 1001, 1023, 1030, 1027, 1010, 1…
## $ date           <date> 2023-09-21, 2023-06-03, 2023-09-26, 2023-01-16, 2023-1…
## $ units_produced <int> 175, 255, 96, 261, 217, 201, 180, 236, 267, 233, 193, 8…
## $ delivery_mode  <chr> "Pickup", "Courier", "Courier", "Pickup", "Courier", "T…
## $ item_price     <int> 30, 30, 30, 25, 25, 30, 20, 20, 30, 20, 30, 25, 20, 30,…
## $ total_costs    <dbl> 3593.048, 4834.910, 2442.833, 4931.465, 4317.508, 4040.…

model <- lm(total_costs ~ units_produced, data = df)
summary(model)

## 
## Call:
## lm(formula = total_costs ~ units_produced, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -213.13  -76.91  -14.55   71.53  222.43 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1015.6223    27.4439   37.01   <2e-16 ***
## units_produced   15.0233     0.1467  102.40   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 101.8 on 98 degrees of freedom
## Multiple R-squared:  0.9907, Adjusted R-squared:  0.9906 
## F-statistic: 1.049e+04 on 1 and 98 DF,  p-value: < 2.2e-16

fixed_cost <- coef(model)[1]
variable_cost <- coef(model)[2]

cat("Estimated Fixed Cost: $", round(fixed_cost, 2), "\n")

## Estimated Fixed Cost: $ 1015.62

cat("Variable Cost per Unit: $", round(variable_cost, 2), "\n")

## Variable Cost per Unit: $ 15.02

ggplot(df, aes(x = units_produced, y = total_costs)) +
  geom_point(color = "steelblue") +
  geom_smooth(method = "lm", se = FALSE, color = "darkred") +
  labs(title = "Regression: Total Costs vs Units Produced",
       x = "Units Produced", y = "Total Costs")

df <- df %>%
  mutate(
    revenue = units_produced * item_price,
    profit = revenue - total_costs
  )
tibble(df)

## # A tibble: 100 × 8
##    customer_id date       units_produced delive…¹ item_…² total…³ revenue profit
##          <int> <date>              <int> <chr>      <int>   <dbl>   <int>  <dbl>
##  1        1013 2023-09-21            175 Pickup        30   3593.    5250 1657. 
##  2        1012 2023-06-03            255 Courier       30   4835.    7650 2815. 
##  3        1004 2023-09-26             96 Courier       30   2443.    2880  437. 
##  4        1026 2023-01-16            261 Pickup        25   4931.    6525 1594. 
##  5        1001 2023-12-27            217 Courier       25   4318.    5425 1107. 
##  6        1023 2023-08-19            201 Third-P…      30   4041.    6030 1989. 
##  7        1030 2023-02-18            180 Courier       20   3922.    3600 -322. 
##  8        1027 2023-06-24            236 Third-P…      20   4532.    4720  188. 
##  9        1010 2023-07-15            267 Courier       30   4954.    8010 3056. 
## 10        1013 2023-05-22            233 Pickup        20   4565.    4660   94.5
## # … with 90 more rows, and abbreviated variable names ¹delivery_mode,
## #   ²item_price, ³total_costs

rfm <- df %>%
  group_by(customer_id) %>%
  summarise(
    recency = as.numeric(Sys.Date() - max(date)),
    frequency = n(),
    monetary = sum(revenue)
  ) %>%
  ungroup() %>%
  mutate(
    recency_score = ntile(-recency, 5),
    frequency_score = ntile(frequency, 5),
    monetary_score = ntile(monetary, 5),
    rfm_score = recency_score + frequency_score + monetary_score
  )
tibble(rfm)

## # A tibble: 29 × 8
##    customer_id recency frequency monetary recency_score freque…¹ monet…² rfm_s…³
##          <int>   <dbl>     <int>    <int>         <int>    <int>   <int>   <int>
##  1        1001     545         2     8695             5        1       2       8
##  2        1002     590         1     2440             4        1       1       6
##  3        1003     553         4    14160             5        3       3      11
##  4        1004     566         4    16910             4        3       3      10
##  5        1005     664         4    19580             2        4       4      10
##  6        1006     861         2     4360             1        1       1       3
##  7        1008     546         4    19890             5        4       4      13
##  8        1009     626         2    14070             2        2       3       7
##  9        1010     710         1     8010             1        1       1       3
## 10        1011     678         5    27650             2        4       5      11
## # … with 19 more rows, and abbreviated variable names ¹frequency_score,
## #   ²monetary_score, ³rfm_score

top_customers <- rfm %>% filter(rfm_score >= 13)
n_top <- nrow(top_customers)

cat("Top-tier customers:", n_top)

## Top-tier customers: 6

break_even_units <- 260

top_contribution <- df %>%
  filter(customer_id %in% top_customers$customer_id) %>%
  summarise(top_units = sum(units_produced)) %>%
  pull(top_units)

percent_contrib <- round(100 * top_contribution / break_even_units, 1)

cat("Top-tier customers contributed", top_contribution, "units —", percent_contrib, "% of break-even.")

## Top-tier customers contributed 5519 units — 2122.7 % of break-even.

ggplot(rfm, aes(x = rfm_score)) +
  geom_bar(fill = "skyblue") +
  labs(title = "RFM Score Distribution", x = "RFM Score", y = "Customer Count")

df %>%
  filter(customer_id %in% top_customers$customer_id) %>%
  group_by(customer_id) %>%
  summarise(units = sum(units_produced)) %>%
  arrange(desc(units)) %>%
  top_n(10) %>%
  ggplot(aes(x = reorder(as.character(customer_id), -units), y = units)) +
  geom_col(fill = "forestgreen") +
  labs(title = "Top 10 Customers by Units Produced", x = "Customer", y = "Units") +
  coord_flip()

###This report combined operational regression modeling with customer RFM segmentation to quantify contribution toward break-even performance. The insights can drive marketing, pricing, and production decisions.###

Sample Data: Cost Modeling & RFM Analysis

Fred Bukenya

2025-06-24