df <- read.csv("better_sample_data.csv") %>% clean_names()
df$date <- as.Date(df$date)
glimpse(df)
## Rows: 100
## Columns: 6
## $ customer_id <int> 1013, 1012, 1004, 1026, 1001, 1023, 1030, 1027, 1010, 1…
## $ date <date> 2023-09-21, 2023-06-03, 2023-09-26, 2023-01-16, 2023-1…
## $ units_produced <int> 175, 255, 96, 261, 217, 201, 180, 236, 267, 233, 193, 8…
## $ delivery_mode <chr> "Pickup", "Courier", "Courier", "Pickup", "Courier", "T…
## $ item_price <int> 30, 30, 30, 25, 25, 30, 20, 20, 30, 20, 30, 25, 20, 30,…
## $ total_costs <dbl> 3593.048, 4834.910, 2442.833, 4931.465, 4317.508, 4040.…
model <- lm(total_costs ~ units_produced, data = df)
summary(model)
##
## Call:
## lm(formula = total_costs ~ units_produced, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -213.13 -76.91 -14.55 71.53 222.43
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1015.6223 27.4439 37.01 <2e-16 ***
## units_produced 15.0233 0.1467 102.40 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 101.8 on 98 degrees of freedom
## Multiple R-squared: 0.9907, Adjusted R-squared: 0.9906
## F-statistic: 1.049e+04 on 1 and 98 DF, p-value: < 2.2e-16
fixed_cost <- coef(model)[1]
variable_cost <- coef(model)[2]
cat("Estimated Fixed Cost: $", round(fixed_cost, 2), "\n")
## Estimated Fixed Cost: $ 1015.62
cat("Variable Cost per Unit: $", round(variable_cost, 2), "\n")
## Variable Cost per Unit: $ 15.02
ggplot(df, aes(x = units_produced, y = total_costs)) +
geom_point(color = "steelblue") +
geom_smooth(method = "lm", se = FALSE, color = "darkred") +
labs(title = "Regression: Total Costs vs Units Produced",
x = "Units Produced", y = "Total Costs")

df <- df %>%
mutate(
revenue = units_produced * item_price,
profit = revenue - total_costs
)
tibble(df)
## # A tibble: 100 × 8
## customer_id date units_produced delive…¹ item_…² total…³ revenue profit
## <int> <date> <int> <chr> <int> <dbl> <int> <dbl>
## 1 1013 2023-09-21 175 Pickup 30 3593. 5250 1657.
## 2 1012 2023-06-03 255 Courier 30 4835. 7650 2815.
## 3 1004 2023-09-26 96 Courier 30 2443. 2880 437.
## 4 1026 2023-01-16 261 Pickup 25 4931. 6525 1594.
## 5 1001 2023-12-27 217 Courier 25 4318. 5425 1107.
## 6 1023 2023-08-19 201 Third-P… 30 4041. 6030 1989.
## 7 1030 2023-02-18 180 Courier 20 3922. 3600 -322.
## 8 1027 2023-06-24 236 Third-P… 20 4532. 4720 188.
## 9 1010 2023-07-15 267 Courier 30 4954. 8010 3056.
## 10 1013 2023-05-22 233 Pickup 20 4565. 4660 94.5
## # … with 90 more rows, and abbreviated variable names ¹delivery_mode,
## # ²item_price, ³total_costs
rfm <- df %>%
group_by(customer_id) %>%
summarise(
recency = as.numeric(Sys.Date() - max(date)),
frequency = n(),
monetary = sum(revenue)
) %>%
ungroup() %>%
mutate(
recency_score = ntile(-recency, 5),
frequency_score = ntile(frequency, 5),
monetary_score = ntile(monetary, 5),
rfm_score = recency_score + frequency_score + monetary_score
)
tibble(rfm)
## # A tibble: 29 × 8
## customer_id recency frequency monetary recency_score freque…¹ monet…² rfm_s…³
## <int> <dbl> <int> <int> <int> <int> <int> <int>
## 1 1001 545 2 8695 5 1 2 8
## 2 1002 590 1 2440 4 1 1 6
## 3 1003 553 4 14160 5 3 3 11
## 4 1004 566 4 16910 4 3 3 10
## 5 1005 664 4 19580 2 4 4 10
## 6 1006 861 2 4360 1 1 1 3
## 7 1008 546 4 19890 5 4 4 13
## 8 1009 626 2 14070 2 2 3 7
## 9 1010 710 1 8010 1 1 1 3
## 10 1011 678 5 27650 2 4 5 11
## # … with 19 more rows, and abbreviated variable names ¹frequency_score,
## # ²monetary_score, ³rfm_score
top_customers <- rfm %>% filter(rfm_score >= 13)
n_top <- nrow(top_customers)
cat("Top-tier customers:", n_top)
## Top-tier customers: 6
break_even_units <- 260
top_contribution <- df %>%
filter(customer_id %in% top_customers$customer_id) %>%
summarise(top_units = sum(units_produced)) %>%
pull(top_units)
percent_contrib <- round(100 * top_contribution / break_even_units, 1)
cat("Top-tier customers contributed", top_contribution, "units —", percent_contrib, "% of break-even.")
## Top-tier customers contributed 5519 units — 2122.7 % of break-even.
ggplot(rfm, aes(x = rfm_score)) +
geom_bar(fill = "skyblue") +
labs(title = "RFM Score Distribution", x = "RFM Score", y = "Customer Count")

df %>%
filter(customer_id %in% top_customers$customer_id) %>%
group_by(customer_id) %>%
summarise(units = sum(units_produced)) %>%
arrange(desc(units)) %>%
top_n(10) %>%
ggplot(aes(x = reorder(as.character(customer_id), -units), y = units)) +
geom_col(fill = "forestgreen") +
labs(title = "Top 10 Customers by Units Produced", x = "Customer", y = "Units") +
coord_flip()

###This report combined operational regression modeling with customer RFM segmentation to quantify contribution toward break-even performance. The insights can drive marketing, pricing, and production decisions.###