Dataset: Sample Sales Data
Source:
C:/Users/Muhammad.shaamel/Downloads/advdataanalytics/data/sales_data_sample.csv
Size: ~2,823 rows × 25 columns
Time Period: 2003–2005
This dataset contains retail sales transaction records including order details, customer information, product lines, and geographic data. It is useful for analyzing sales patterns, customer behavior, seasonal trends, and market performance by region and product category.
Variable Types
ORDERNUMBER, QUANTITYORDERED, PRICEEACH, SALES, ORDERLINENUMBER, QTR_ID, MONTH_ID, YEAR_ID, MSRPSTATUS, PRODUCTLINE, CUSTOMERNAME, COUNTRY, TERRITORY, DEALSIZE, ...ORDERDATElibrary(tidyverse)
library(knitr)
library(lubridate)
# Read CSV (absolute path as provided)
sales_data <- read.csv("C:/Users/Muhammad.shaamel/Downloads/advdataanalytics/data/sales_data_sample.csv")
# ORDERDATE in this dataset can appear with or without time; normalize safely
# Strip time part if present and parse as mdy
sales_data$ORDERDATE <- lubridate::mdy(gsub(" .*", "", as.character(sales_data$ORDERDATE)))
# Derived time variables (recompute even if present)
sales_data$QTR_ID <- lubridate::quarter(sales_data$ORDERDATE)
sales_data$MONTH_ID <- lubridate::month(sales_data$ORDERDATE)
sales_data$YEAR_ID <- lubridate::year(sales_data$ORDERDATE)
cat("Dataset loaded:", nrow(sales_data), "rows,", ncol(sales_data), "columns")
## Dataset loaded: 2823 rows, 25 columns
missing_counts <- colSums(is.na(sales_data))
missing_data <- data.frame(
Variable = names(missing_counts),
Missing_Count = as.integer(missing_counts),
Missing_Percent = round((missing_counts / nrow(sales_data)) * 100, 1)
) %>%
dplyr::filter(Missing_Count > 0) %>%
dplyr::arrange(desc(Missing_Percent))
if (nrow(missing_data) == 0) {
cat("No missing values detected.")
} else {
kable(missing_data, caption = "Missing Values Summary")
}
| Variable | Missing_Count | Missing_Percent | |
|---|---|---|---|
| TERRITORY | TERRITORY | 1074 | 38 |
key_vars <- sales_data[, c("QUANTITYORDERED", "PRICEEACH", "SALES", "MSRP")]
summary_stats <- do.call(cbind, lapply(key_vars, function(x) {
c(Mean = round(mean(x, na.rm = TRUE), 2),
Median = round(median(x, na.rm = TRUE), 2),
SD = round(sd(x, na.rm = TRUE), 2),
Min = round(min(x, na.rm = TRUE), 2),
Max = round(max(x, na.rm = TRUE), 2))
}))
kable(summary_stats, caption = "Summary Statistics for Key Variables")
| QUANTITYORDERED | PRICEEACH | SALES | MSRP | |
|---|---|---|---|---|
| Mean | 35.09 | 83.66 | 3553.89 | 100.72 |
| Median | 35.00 | 95.70 | 3184.80 | 99.00 |
| SD | 9.74 | 20.17 | 1841.87 | 40.19 |
| Min | 6.00 | 26.88 | 482.13 | 33.00 |
| Max | 97.00 | 100.00 | 14082.80 | 214.00 |
# Business metrics
shipped_rate <- round(sum(sales_data$STATUS == "Shipped", na.rm = TRUE) / nrow(sales_data) * 100, 1)
cat("Total Revenue: $", format(sum(sales_data$SALES, na.rm = TRUE), big.mark = ","), "\n")
## Total Revenue: $ 10,032,629
cat("Average Order Value: $", round(mean(sales_data$SALES, na.rm = TRUE), 2), "\n")
## Average Order Value: $ 3553.89
cat("Total Orders:", nrow(sales_data), "\n")
## Total Orders: 2823
cat("Shipping Success Rate:", shipped_rate, "%\n")
## Shipping Success Rate: 92.7 %
# Sales distribution
hist(sales_data$SALES, breaks = 30, main = "Sales Amount Distribution",
xlab = "Sales ($)", col = "lightblue")
# Product line performance
product_perf <- sales_data %>%
group_by(PRODUCTLINE) %>%
summarise(
Orders = n(),
Total_Revenue = sum(SALES, na.rm = TRUE),
Avg_Revenue = round(mean(SALES, na.rm = TRUE), 2),
.groups = "drop"
) %>%
arrange(desc(Total_Revenue))
kable(head(product_perf), caption = "Top Product Lines by Revenue")
| PRODUCTLINE | Orders | Total_Revenue | Avg_Revenue |
|---|---|---|---|
| Classic Cars | 967 | 3919615.7 | 4053.38 |
| Vintage Cars | 607 | 1903150.8 | 3135.34 |
| Motorcycles | 331 | 1166388.3 | 3523.83 |
| Trucks and Buses | 301 | 1127789.8 | 3746.81 |
| Planes | 306 | 975003.6 | 3186.29 |
| Ships | 234 | 714437.1 | 3053.15 |
# Order status distribution
status_dist <- table(sales_data$STATUS)
kable(data.frame(Status = names(status_dist), Count = as.numeric(status_dist)),
caption = "Order Status Distribution")
| Status | Count |
|---|---|
| Cancelled | 60 |
| Disputed | 14 |
| In Process | 41 |
| On Hold | 44 |
| Resolved | 47 |
| Shipped | 2617 |
# Monthly sales trend
monthly_sales <- sales_data %>%
mutate(YearMonth = format(ORDERDATE, "%Y-%m")) %>%
group_by(YearMonth) %>%
summarise(Monthly_Revenue = sum(SALES, na.rm = TRUE), .groups = 'drop') %>%
arrange(YearMonth)
kable(head(monthly_sales, 12), caption = "Sample Monthly Sales",
format.args = list(big.mark = ","))
| YearMonth | Monthly_Revenue |
|---|---|
| 2003-01 | 129,753.6 |
| 2003-02 | 140,836.2 |
| 2003-03 | 174,504.9 |
| 2003-04 | 201,609.5 |
| 2003-05 | 192,673.1 |
| 2003-06 | 168,082.6 |
| 2003-07 | 187,731.9 |
| 2003-08 | 197,809.3 |
| 2003-09 | 263,973.4 |
| 2003-10 | 568,291.0 |
| 2003-11 | 1,029,837.7 |
| 2003-12 | 261,876.5 |
# Yearly comparison
yearly_sales <- sales_data %>%
group_by(YEAR_ID) %>%
summarise(
Total_Sales = sum(SALES, na.rm = TRUE),
Order_Count = n(),
Avg_Order_Value = round(mean(SALES, na.rm = TRUE), 2),
.groups = 'drop'
) %>%
arrange(YEAR_ID)
kable(yearly_sales, caption = "Year-over-Year Performance",
format.args = list(big.mark = ","))
| YEAR_ID | Total_Sales | Order_Count | Avg_Order_Value |
|---|---|---|---|
| 2,003 | 3,516,980 | 1,000 | 3,516.98 |
| 2,004 | 4,724,163 | 1,345 | 3,512.39 |
| 2,005 | 1,791,487 | 478 | 3,747.88 |
# Correlation and leader (computed here so inline R has them ready)
corr_q_sales <- suppressWarnings(
cor(sales_data$QUANTITYORDERED, sales_data$SALES, use = "complete.obs")
)
shipped_rate <- round(sum(sales_data$STATUS == "Shipped", na.rm = TRUE) / nrow(sales_data) * 100, 1)
product_leader <- if (nrow(product_perf) > 0) product_perf$PRODUCTLINE[1] else NA
Data Quality: Core business fields are generally complete. See Missing Values Summary (if any) for optional fields with gaps.
Sales Performance: - Total revenue of
$10,032,629 across 2823 order
lines
- Average order value $3553.89
- 92.7% of orders successfully shipped
Business Patterns: - Classic Cars
currently leads total revenue among product lines
- Sales distribution is right-skewed (typical for order values)
- Seasonal effects visible in quarterly/monthly aggregations
Correlation: Quantity ordered vs sales amount shows a positive relationship (r = 0.55), as expected.
Proposed Model: Monthly Sales Revenue
Forecasting
Business Problem: Forecast monthly sales 3–6 months
ahead to improve inventory planning, cash-flow management, and
staffing.
Approach: Use historical trends, seasonal indicators, product-mix signals, and regional factors to predict revenue.
Expected Benefits: - Reduce inventory costs via
better demand prediction
- Improve cash-flow planning accuracy
- Enable proactive marketing during predicted slow periods
- Support realistic target setting
Target Variable: Monthly total sales revenue
Key Features: Lagged revenue, month/quarter
seasonality, product line mix, geography
Kyanyoga (2016). Sample Sales Data. Kaggle. https://www.kaggle.com/datasets/kyanyoga/sample-sales-data
sqldf# Ensure sqldf is available
if (!requireNamespace("sqldf", quietly = TRUE)) install.packages("sqldf")
library(sqldf)
Why this matters?:
Identifying revenue-leading product lines in a specific year and
fulfillment state (“Shipped”) helps merchandise and finance teams focus
inventory, promotions, and supplier negotiations where it pays off.
SQL query
SELECT
PRODUCTLINE,
COUNT(*) AS order_lines,
SUM(SALES) AS total_revenue,
AVG(SALES) AS avg_line_revenue
FROM sales_data
WHERE YEAR_ID = 2004
AND STATUS = 'Shipped' -- filter (fulfillment)
GROUP BY PRODUCTLINE -- grouping
HAVING COUNT(*) >= 10 -- guardrail for tiny groups
ORDER BY total_revenue DESC -- sorting (by revenue)
LIMIT 10;
Results
sql1 <- "
SELECT
PRODUCTLINE,
COUNT(*) AS order_lines,
SUM(SALES) AS total_revenue,
AVG(SALES) AS avg_line_revenue
FROM sales_data
WHERE YEAR_ID = 2004
AND STATUS = 'Shipped'
GROUP BY PRODUCTLINE
HAVING COUNT(*) >= 10
ORDER BY total_revenue DESC
LIMIT 10;
"
res1 <- sqldf(sql1)
knitr::kable(res1, caption = "Top Product Lines by Revenue (Shipped, 2004)",
format.args = list(big.mark = ",", scientific = FALSE))
| PRODUCTLINE | order_lines | total_revenue | avg_line_revenue |
|---|---|---|---|
| Classic Cars | 425 | 1,702,871.5 | 4,006.757 |
| Vintage Cars | 275 | 883,004.4 | 3,210.925 |
| Motorcycles | 164 | 560,545.2 | 3,417.959 |
| Trucks and Buses | 138 | 509,109.6 | 3,689.200 |
| Planes | 150 | 468,552.2 | 3,123.682 |
| Ships | 99 | 292,522.8 | 2,954.775 |
| Trains | 36 | 111,441.4 | 3,095.595 |
My takeaways
Why this matters?:
Finding high-value customers (by spend per line) in a key market guides
account-based marketing, loyalty perks, and tailored service levels.
SQL query
SELECT
CUSTOMERNAME,
COUNT(*) AS order_lines,
SUM(SALES) AS total_revenue,
AVG(SALES) AS avg_line_revenue
FROM sales_data
WHERE COUNTRY = 'USA' -- filter (market)
AND YEAR_ID = 2005 -- filter (time)
AND STATUS = 'Shipped' -- filter (fulfillment)
GROUP BY CUSTOMERNAME -- grouping (by customer)
HAVING COUNT(*) >= 3 -- ensure meaningful activity
ORDER BY avg_line_revenue DESC, -- sorting (primary by avg ticket)
total_revenue DESC -- tie-breaker by total
LIMIT 10;
Results
sql2 <- "
SELECT
CUSTOMERNAME,
COUNT(*) AS order_lines,
SUM(SALES) AS total_revenue,
AVG(SALES) AS avg_line_revenue
FROM sales_data
WHERE COUNTRY = 'USA'
AND YEAR_ID = 2005
AND STATUS = 'Shipped'
GROUP BY CUSTOMERNAME
HAVING COUNT(*) >= 3
ORDER BY avg_line_revenue DESC, total_revenue DESC
LIMIT 10;
"
res2 <- sqldf(sql2)
knitr::kable(res2, caption = "Top USA Customers by Avg Order-Line Revenue (Shipped, 2005)",
format.args = list(big.mark = ",", scientific = FALSE))
| CUSTOMERNAME | order_lines | total_revenue | avg_line_revenue |
|---|---|---|---|
| Gift Depot Inc. | 6 | 31,648.47 | 5,274.745 |
| FunGiftIdeas.com | 8 | 37,557.66 | 4,694.708 |
| The Sharp Gifts Warehouse | 9 | 37,526.84 | 4,169.649 |
| Corporate Gift Ideas Co. | 13 | 54,203.62 | 4,169.509 |
| Collectables For Less Inc. | 8 | 31,474.78 | 3,934.347 |
| Mini Gifts Distributors Ltd. | 56 | 205,993.93 | 3,678.463 |
| Mini Creations Ltd. | 3 | 11,021.30 | 3,673.767 |
| Technics Stores Inc. | 4 | 13,529.57 | 3,382.392 |
Interpretation
ORDERNUMBER first.DEALSIZE = 'Large',
PRODUCTLINE IN (...), QTR_ID IN (1,2)), and
change sorting priorities (ORDER BY total_revenue DESC) to
answer adjacent business questions.Below are two opinions I formed after looking at the data. For each, I state a clear hypothesis, use a basic statistical test, show the results, and give a short, non-technical interpretation.
Question (business): Do orders placed in
Q4 (holiday period) have higher average sales
per line than orders in Q1–Q3?
Why this matters: If Q4 really runs higher, we can plan
inventory, staffing, and promos earlier.
Hypotheses
- H0: Average SALES in Q4 = Average SALES in
Q1–Q3
- H1: Average SALES in Q4 > Average SALES
in Q1–Q3
Test chosen: Two-sample t-test (Welch). It compares the means of two groups.
library(dplyr)
# Keep needed columns and drop NAs
df <- sales_data %>%
select(SALES, QTR_ID) %>%
filter(!is.na(SALES), !is.na(QTR_ID))
# Define two groups: Q4 vs non-Q4
df <- df %>%
mutate(group = ifelse(QTR_ID == 4, "Q4", "Q1toQ3"))
# Quick look at group means
op1_means <- df %>%
group_by(group) %>%
summarise(n = n(),
mean_sales = mean(SALES),
sd_sales = sd(SALES),
.groups = "drop")
knitr::kable(op1_means, digits = 2, caption = "Average line revenue: Q4 vs Q1–Q3")
| group | n | mean_sales | sd_sales |
|---|---|---|---|
| Q1toQ3 | 1729 | 3561.51 | 1879.12 |
| Q4 | 1094 | 3541.85 | 1782.18 |
# One-sided Welch t-test (Q4 > Q1-3)
t_res <- t.test(SALES ~ group, data = df, alternative = "greater")
# Save key numbers for a plain summary
mean_q4 <- round(op1_means$mean_sales[op1_means$group=="Q4"], 2)
mean_nonq4 <- round(op1_means$mean_sales[op1_means$group=="Q1toQ3"], 2)
pval1 <- signif(t_res$p.value, 3)
Result:
- Average per line — Q4: $3541.85 vs Q1–Q3:
$3561.51
- t-test p-value: 0.39
Interpretation:
If the p-value is below 0.05, I treat this as evidence that Q4
runs higher on average. If it’s above 0.05, I don’t have enough
evidence from this sample to claim a difference. Either way, the table
shows the actual averages, which is useful for planning.
Question (business): Is the Shipped
success rate higher for Large deals than for Small
deals?
Why this matters: If Large deals ship more reliably,
ops can keep prioritizing them; if not, we may need to fix small-deal
handling.
Hypotheses
- H0: Shipped rate (Large) = Shipped rate (Small)
- H1: Shipped rate (Large) > Shipped rate (Small)
Test chosen: Two-proportion test
(prop.test). It compares two percentages.
# Build shipped flag and keep only Small/Large
ship_df <- sales_data %>%
filter(!is.na(DEALSIZE), DEALSIZE %in% c("Small","Large"),
!is.na(STATUS)) %>%
mutate(SHIPPED = ifelse(STATUS == "Shipped", 1, 0))
# Counts by deal size
summ <- ship_df %>%
group_by(DEALSIZE) %>%
summarise(
shipped = sum(SHIPPED),
total = n(),
rate = shipped/total,
.groups = "drop"
) %>%
arrange(DEALSIZE)
knitr::kable(summ, digits = 3, caption = "Shipped counts and rates by deal size (Small vs Large)")
| DEALSIZE | shipped | total | rate |
|---|---|---|---|
| Large | 143 | 157 | 0.911 |
| Small | 1196 | 1282 | 0.933 |
# Two-proportion test (one-sided: Large > Small)
x <- c(summ$shipped[summ$DEALSIZE=="Small"], summ$shipped[summ$DEALSIZE=="Large"])
n <- c(summ$total[summ$DEALSIZE=="Small"], summ$total[summ$DEALSIZE=="Large"])
prop_res <- prop.test(x = x, n = n, alternative = "less")
# 'less' here tests Small < Large, which matches H1 (Large > Small)
pval2 <- signif(prop_res$p.value, 3)
rate_small <- round(summ$rate[summ$DEALSIZE=="Small"], 3)
rate_large <- round(summ$rate[summ$DEALSIZE=="Large"], 3)
Result:
- Shipped rate — Small: 0.933 ; Large:
0.911
- 2-proportion test p-value: 0.805
Interpretation:
If the p-value is below 0.05, I read this as evidence that Large
deals ship more successfully than Small deals in this dataset.
If it’s above 0.05, the sample does not give me enough evidence to claim
a real difference, even if the raw rates look different.
These two quick checks are enough for a basic business read: Q4 may run “richer” per line, and Large deals may ship more reliably. I would use these signals to shape stock planning and ops priorities, and I’d re-check them each quarter as more data comes in.
Goal (in my own words): Predict the dollar value per
order line (SALES). I’ll keep it simple and only use a few
sensible predictors.
What I did before modeling (pre-processing) and why:
- Kept complete rows for the variables I’ll use →
otherwise the model can’t fit. - Logged the target
(log(SALES)) because sales values are
right-skewed; log usually makes the pattern more linear. -
Standardised (z-scored) numeric predictors so the
coefficients are on a comparable scale and stepwise doesn’t “prefer”
variables just due to units. - Turned time and category fields
into factors (QTR_ID, YEAR_ID,
PRODUCTLINE, DEALSIZE) so the model can
compare groups.
library(dplyr)
lm_df <- sales_data %>%
select(SALES, QUANTITYORDERED, PRICEEACH, MSRP, QTR_ID, YEAR_ID, PRODUCTLINE, DEALSIZE) %>%
filter(!is.na(SALES), SALES > 0,
!is.na(QUANTITYORDERED), !is.na(PRICEEACH), !is.na(MSRP),
!is.na(QTR_ID), !is.na(YEAR_ID),
!is.na(PRODUCTLINE), !is.na(DEALSIZE)) %>%
mutate(
log_SALES = log(SALES),
PRODUCTLINE = factor(PRODUCTLINE),
DEALSIZE = factor(DEALSIZE),
QTR_ID = factor(QTR_ID),
YEAR_ID = factor(YEAR_ID)
)
# Standardise key numeric predictors
num_vars <- c("QUANTITYORDERED","PRICEEACH","MSRP")
lm_df[paste0(num_vars, "_z")] <- scale(lm_df[num_vars])
# Quick sanity check
knitr::kable(head(lm_df[, c("SALES","log_SALES","QUANTITYORDERED_z","PRICEEACH_z","MSRP_z","QTR_ID","YEAR_ID","PRODUCTLINE","DEALSIZE")]),
caption = "Sample of data after simple pre-processing")
| SALES | log_SALES | QUANTITYORDERED_z | PRICEEACH_z | MSRP_z | QTR_ID | YEAR_ID | PRODUCTLINE | DEALSIZE |
|---|---|---|---|---|---|---|---|---|
| 2871.00 | 7.962416 | -0.5227982 | 0.5968718 | -0.1422206 | 1 | 2003 | Motorcycles | Small |
| 2765.90 | 7.925121 | -0.1121814 | -0.1144301 | -0.1422206 | 2 | 2003 | Motorcycles | Small |
| 3884.34 | 8.264708 | 0.6063980 | 0.5492864 | -0.1422206 | 3 | 2003 | Motorcycles | Medium |
| 3746.70 | 8.228631 | 1.0170147 | -0.0197551 | -0.1422206 | 3 | 2003 | Motorcycles | Medium |
| 5205.27 | 8.557427 | 1.4276315 | 0.8100145 | -0.1422206 | 4 | 2003 | Motorcycles | Medium |
| 3479.76 | 8.154719 | 0.0931270 | 0.6444571 | -0.1422206 | 4 | 2003 | Motorcycles | Medium |
Fit the model: I start with a reasonable full model and let stepwise (both directions) trim it using AIC.
# Full model
lm_full <- lm(log_SALES ~ QUANTITYORDERED_z + PRICEEACH_z + MSRP_z +
QTR_ID + YEAR_ID + PRODUCTLINE + DEALSIZE,
data = lm_df)
# Stepwise (both directions)
lm_step <- step(lm_full, direction = "both", trace = 0)
summary(lm_step)
##
## Call:
## lm(formula = log_SALES ~ QUANTITYORDERED_z + PRICEEACH_z + MSRP_z +
## QTR_ID + YEAR_ID + PRODUCTLINE + DEALSIZE, data = lm_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.96148 -0.08311 -0.00936 0.07971 0.61706
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.412055 0.014683 572.894 < 2e-16 ***
## QUANTITYORDERED_z 0.233724 0.003146 74.293 < 2e-16 ***
## PRICEEACH_z 0.282935 0.004039 70.047 < 2e-16 ***
## MSRP_z 0.077464 0.003730 20.770 < 2e-16 ***
## QTR_ID2 -0.042663 0.007505 -5.685 1.44e-08 ***
## QTR_ID3 -0.015892 0.008387 -1.895 0.058213 .
## QTR_ID4 -0.013213 0.007259 -1.820 0.068849 .
## YEAR_ID2004 0.007055 0.005428 1.300 0.193800
## YEAR_ID2005 -0.029704 0.008355 -3.555 0.000384 ***
## PRODUCTLINEMotorcycles 0.003297 0.008379 0.394 0.693963
## PRODUCTLINEPlanes -0.010674 0.008788 -1.215 0.224635
## PRODUCTLINEShips -0.023508 0.009785 -2.402 0.016354 *
## PRODUCTLINETrains 0.033082 0.015604 2.120 0.034084 *
## PRODUCTLINETrucks and Buses -0.006100 0.008677 -0.703 0.482099
## PRODUCTLINEVintage Cars -0.018471 0.007046 -2.622 0.008799 **
## DEALSIZEMedium -0.288072 0.012082 -23.843 < 2e-16 ***
## DEALSIZESmall -0.442847 0.015380 -28.794 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.129 on 2806 degrees of freedom
## Multiple R-squared: 0.9392, Adjusted R-squared: 0.9388
## F-statistic: 2707 on 16 and 2806 DF, p-value: < 2.2e-16
Assumption checks: - Residuals should look roughly centered around 0 with no pattern. - QQ plot should be roughly straight (normal-ish residuals).
par(mfrow = c(1,2))
plot(lm_step$fitted.values, residuals(lm_step),
xlab="Fitted log(SALES)", ylab="Residuals",
main="Residuals vs Fitted"); abline(h=0,lty=2)
qqnorm(residuals(lm_step)); qqline(residuals(lm_step), col=2)
par(mfrow = c(1,1))
Simple cross-validation: 5-fold CV RMSE/R² using `caret.
if (!requireNamespace("caret", quietly = TRUE)) install.packages("caret")
library(caret)
set.seed(123)
cv_ctrl <- trainControl(method = "cv", number = 5)
lm_cv <- train(formula(lm_step), data = lm_df, method = "lm", trControl = cv_ctrl)
lm_cv$results # shows RMSE and R-squared across folds
## intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 TRUE 0.1294282 0.9383337 0.0989613 0.007369609 0.007559473 0.004359889
Interpreting coefficients (manager view): Because
the target is log(SALES) and predictors are z-scored, a
+1 SD change in a numeric predictor changes expected
sales by ~exp(beta)−1 percent.
coefs <- coef(lm_step)
pct_change <- function(term) {
if (term %in% names(coefs)) round(100*(exp(coefs[term]) - 1), 1) else NA
}
chg_qty <- pct_change("QUANTITYORDERED_z")
chg_price <- pct_change("PRICEEACH_z")
chg_msrp <- pct_change("MSRP_z")
r2_train <- summary(lm_step)$r.squared
My Takeaway: I’m not claiming a perfect predictor here; I just want a directionally useful model. The signs tell me the way each driver moves expected sales (holding others steady). The CV numbers help me not over-trust the training fit.
Business framing: I’ll predict whether an order line
ships successfully.
- Target: SHIPPED_FLAG = 1 if
STATUS == "Shipped", else 0
- Inputs: A few practical fields available at order
time (quantity, price, MSRP, quarter, year, product line, deal size)
clf_df <- sales_data %>%
select(STATUS, QUANTITYORDERED, PRICEEACH, MSRP, QTR_ID, YEAR_ID, PRODUCTLINE, DEALSIZE) %>%
filter(!is.na(STATUS),
!is.na(QUANTITYORDERED), !is.na(PRICEEACH), !is.na(MSRP),
!is.na(QTR_ID), !is.na(YEAR_ID),
!is.na(PRODUCTLINE), !is.na(DEALSIZE)) %>%
mutate(
SHIPPED_FLAG = ifelse(STATUS == "Shipped", 1L, 0L),
PRODUCTLINE = factor(PRODUCTLINE),
DEALSIZE = factor(DEALSIZE),
QTR_ID = factor(QTR_ID),
YEAR_ID = factor(YEAR_ID)
)
# Train/test split (70/30)
set.seed(123)
idx <- sample(seq_len(nrow(clf_df)), size = 0.7*nrow(clf_df))
train_df <- clf_df[idx, ]
test_df <- clf_df[-idx, ]
Fit a plain logistic regression:
logit_fit <- glm(SHIPPED_FLAG ~ QUANTITYORDERED + PRICEEACH + MSRP +
QTR_ID + YEAR_ID + PRODUCTLINE + DEALSIZE,
data = train_df, family = binomial())
summary(logit_fit)
##
## Call:
## glm(formula = SHIPPED_FLAG ~ QUANTITYORDERED + PRICEEACH + MSRP +
## QTR_ID + YEAR_ID + PRODUCTLINE + DEALSIZE, family = binomial(),
## data = train_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.599899 1.416071 3.955 7.67e-05 ***
## QUANTITYORDERED 0.001981 0.012412 0.160 0.8732
## PRICEEACH 0.000274 0.008657 0.032 0.9747
## MSRP 0.001929 0.004125 0.468 0.6400
## QTR_ID2 -2.854973 0.315424 -9.051 < 2e-16 ***
## QTR_ID3 14.502998 544.985781 0.027 0.9788
## QTR_ID4 -1.887710 0.398386 -4.738 2.15e-06 ***
## YEAR_ID2004 -0.465532 0.284382 -1.637 0.1016
## YEAR_ID2005 -2.855769 0.334134 -8.547 < 2e-16 ***
## PRODUCTLINEMotorcycles 1.164998 0.495388 2.352 0.0187 *
## PRODUCTLINEPlanes -0.329384 0.360143 -0.915 0.3604
## PRODUCTLINEShips -1.552835 0.339134 -4.579 4.68e-06 ***
## PRODUCTLINETrains 1.043427 1.132780 0.921 0.3570
## PRODUCTLINETrucks and Buses -0.144872 0.363076 -0.399 0.6899
## PRODUCTLINEVintage Cars -0.412190 0.306075 -1.347 0.1781
## DEALSIZEMedium -0.407100 0.520008 -0.783 0.4337
## DEALSIZESmall -0.104351 0.684772 -0.152 0.8789
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 995.54 on 1975 degrees of freedom
## Residual deviance: 687.26 on 1959 degrees of freedom
## AIC: 721.26
##
## Number of Fisher Scoring iterations: 18
Evaluate fit: confusion matrix (0.5 cutoff), ROC AUC, and a simple pseudo-R².
# Predictions (probability of 'Shipped')
test_df$pred_prob <- predict(logit_fit, newdata = test_df, type = "response")
test_df$pred_flag <- ifelse(test_df$pred_prob >= 0.5, 1L, 0L)
# Confusion matrix
cm <- table(Actual = test_df$SHIPPED_FLAG, Predicted = test_df$pred_flag)
acc <- sum(diag(cm))/sum(cm)
sens <- ifelse(sum(cm[2,])>0, cm["1","1"] / sum(cm[2,]), NA) # recall for Shipped
spec <- ifelse(sum(cm[1,])>0, cm["0","0"] / sum(cm[1,]), NA) # true negative rate
# ROC AUC
if (!requireNamespace("pROC", quietly = TRUE)) install.packages("pROC")
library(pROC)
roc_obj <- roc(response = test_df$SHIPPED_FLAG, predictor = test_df$pred_prob)
auc_val <- as.numeric(auc(roc_obj))
# Pseudo-R^2 (McFadden)
if (!requireNamespace("pscl", quietly = TRUE)) install.packages("pscl")
library(pscl)
pseudo <- pR2(logit_fit) # McFadden et al.
## fitting null model for pseudo-r2
list(
Confusion_Matrix = cm,
Accuracy = round(acc, 3),
Sensitivity_ShP = round(sens, 3),
Specificity_Not = round(spec, 3),
AUC = round(auc_val, 3),
McFadden_R2 = round(pseudo["McFadden"], 3)
)
## $Confusion_Matrix
## Predicted
## Actual 0 1
## 0 28 41
## 1 16 762
##
## $Accuracy
## [1] 0.933
##
## $Sensitivity_ShP
## [1] 0.979
##
## $Specificity_Not
## [1] 0.406
##
## $AUC
## [1] 0.929
##
## $McFadden_R2
## McFadden
## 0.31
How I read this: - Accuracy/Sensitivity/Specificity tell me how well the model separates shipped vs not-shipped on unseen data. If sensitivity is decent, we catch most of the shipped lines; if specificity is decent, we’re not over-predicting shipped when it isn’t. - AUC (closer to 1 is better) shows ranking quality independent of a fixed cutoff; anything much above 0.7 is useful in practice for simple triage. - Pseudo-R² (McFadden) gives a rough sense of overall explanatory power for logistic models (values are usually lower than linear R²).
Manager meaning: - If AUC and accuracy are
reasonable, I’d use the probability to prioritise lines
at risk (low probability of shipping).
- Coefficients’ signs indicate drivers: a positive
coefficient means higher chance to ship, negative
lowers it (while holding others constant). We can turn strong drivers
into checklist items for ops (e.g., certain product
lines or deal sizes may need extra QA or stock buffers).
I kept the modeling choices simple on purpose: a basic stepwise linear regression for
SALESand a plain logistic regression forShipped.
Variable chosen (time-dependent): Monthly total
SALES (sum of order-line sales per calendar
month).
Why: Revenue is naturally time-based and important for
planning inventory, cash flow, and marketing.
Remark on packages: I used forecast for
ETS/ARIMA and accuracy. Install if needed:
install.packages("forecast")
library(dplyr)
library(lubridate)
library(ggplot2)
if (!requireNamespace("forecast", quietly = TRUE)) install.packages("forecast")
library(forecast)
# 1) Aggregate to monthly revenue
ts_df <- sales_data %>%
mutate(YearMonth = floor_date(ORDERDATE, unit = "month")) %>%
group_by(YearMonth) %>%
summarise(Monthly_Sales = sum(SALES, na.rm = TRUE), .groups = "drop") %>%
arrange(YearMonth)
# 2) Make sure every calendar month is present (fill missing with 0)
full_months <- tibble(YearMonth = seq(min(ts_df$YearMonth), max(ts_df$YearMonth), by = "month"))
ts_df_full <- full_months %>%
left_join(ts_df, by = "YearMonth") %>%
mutate(Monthly_Sales = ifelse(is.na(Monthly_Sales), 0, Monthly_Sales))
# 3) Convert to ts object (monthly frequency)
ts_start <- c(year(min(ts_df_full$YearMonth)), month(min(ts_df_full$YearMonth)))
y <- ts(ts_df_full$Monthly_Sales, frequency = 12, start = ts_start)
length(y); head(ts_df_full)
## [1] 29
## # A tibble: 6 × 2
## YearMonth Monthly_Sales
## <date> <dbl>
## 1 2003-01-01 129754.
## 2 2003-02-01 140836.
## 3 2003-03-01 174505.
## 4 2003-04-01 201610.
## 5 2003-05-01 192673.
## 6 2003-06-01 168083.
h <- 6 # forecast horizon = last 6 months for testing
n <- length(y)
train_y <- head(y, n - h)
test_y <- tail(y, h)
# Simple plot of the full series
ggplot(ts_df_full, aes(x = YearMonth, y = Monthly_Sales)) +
geom_line() +
geom_vline(xintercept = ts_df_full$YearMonth[n - h], linetype = 2) +
labs(title = "Monthly Sales (with Train/Test Split)",
x = "Month", y = "Revenue ($)") +
theme_minimal()
ets_fit <- ets(train_y) # ETS automatically picks components
ets_fc <- forecast(ets_fit, h = h) # Forecast next h months
ets_fit
## ETS(M,N,N)
##
## Call:
## ets(y = train_y)
##
## Smoothing parameters:
## alpha = 0.9999
##
## Initial states:
## l = 222523.8162
##
## sigma: 0.4864
##
## AIC AICc BIC
## 617.5286 618.7918 620.9351
arima_fit <- auto.arima(train_y, seasonal = TRUE, stepwise = TRUE, approximation = FALSE)
arima_fc <- forecast(arima_fit, h = h)
arima_fit
## Series: train_y
## ARIMA(0,0,1) with non-zero mean
##
## Coefficients:
## ma1 mean
## 0.5375 352818.89
## s.e. 0.1834 70480.62
##
## sigma^2 = 5.447e+10: log likelihood = -316.05
## AIC=638.1 AICc=639.36 BIC=641.51
# Helper to compute metrics; MAPE skips months where actual == 0
compute_metrics <- function(actual, pred){
err <- actual - pred
MAD <- mean(abs(err), na.rm = TRUE)
MSE <- mean(err^2, na.rm = TRUE)
RMSE <- sqrt(MSE)
mape_vec <- abs(err / actual)
MAPE <- mean(mape_vec[is.finite(mape_vec) & !is.nan(mape_vec) & actual != 0], na.rm = TRUE) * 100
data.frame(MAD = MAD, MSE = MSE, RMSE = RMSE, MAPE = MAPE)
}
ets_metrics <- compute_metrics(test_y, ets_fc$mean)
arima_metrics <- compute_metrics(test_y, arima_fc$mean)
metrics_tbl <- dplyr::bind_rows(
cbind(Model = "ETS", round(ets_metrics, 2)),
cbind(Model = "ARIMA", round(arima_metrics, 2))
)
knitr::kable(metrics_tbl, caption = "Forecast accuracy on the 6-month test set (lower is better)")
| Model | MAD | MSE | RMSE | MAPE |
|---|---|---|---|---|
| ETS | 728279.49 | 533730822655 | 730568.8 | 210.32 |
| ARIMA | 90059.82 | 18742773780 | 136904.2 | 25.08 |
# Build a plotting frame for the test period
plot_test <- ts_df_full %>%
slice((nrow(ts_df_full) - h + 1):nrow(ts_df_full)) %>%
mutate(ETS_Forecast = as.numeric(ets_fc$mean),
ARIMA_Forecast = as.numeric(arima_fc$mean))
ggplot(plot_test, aes(x = YearMonth)) +
geom_line(aes(y = Monthly_Sales), size = 1) +
geom_line(aes(y = ETS_Forecast), linetype = 2) +
geom_line(aes(y = ARIMA_Forecast), linetype = 3) +
labs(title = "Actual vs Forecast (Test Months Only)",
x = "Month", y = "Revenue ($)",
subtitle = "Solid = Actual; Dotted = ETS; Dot-dash = ARIMA") +
theme_minimal()
Small note on MAPE: If a month’s actual sales are zero, MAPE is undefined. I skipped those in the MAPE calculation so the % error isn’t blown up by division by zero.
Remark (packages installed): To reproduce all tables, charts and models in this report I installed the following packages on my machine:
install.packages(c(
"tidyverse","lubridate","knitr","ggplot2",
"sqldf","caret","pROC","pscl","forecast"
))
Tables - Missing Values Summary
(missing_data) - Summary Statistics for key variables
(summary_stats) - Top Product Lines by Revenue
(product_perf) - Order Status Distribution
(status_dist) - Year-over-Year Performance
(yearly_sales) - SQL Results – Product lines 2004 shipped
(res1) - SQL Results – Top USA customers 2005 shipped
(res2) - Opinion 1 means (Q4 vs Q1–Q3)
(op1_means) - Deal-size shipped counts & rates
(summ) - Stepwise CV results (lm_cv$results) -
Forecast Accuracy (ETS vs ARIMA) (metrics_tbl)
Figures - Histogram: Sales Amount Distribution - Linear Regression diagnostics (Residuals vs Fitted, QQ plot) - Time-series: Monthly Sales with Train/Test split - Forecasts vs Actuals (test window): ETS & ARIMA - (Optional) ROC curve for logistic regression
Note: If you knit from a clean session, run the earlier chunks (Q1a–Q1f) so the objects exist. The code below only re-prints what’s already been computed.
library(knitr)
safe_kable <- function(obj, caption, ...) {
if (exists(obj) && !is.null(get(obj))) {
kable(get(obj), caption = caption, ...)
} else {
cat(paste0("*Skipped (object not found): ", obj, "\n\n"))
}
}
safe_kable("missing_data", "Missing Values Summary")
| Variable | Missing_Count | Missing_Percent | |
|---|---|---|---|
| TERRITORY | TERRITORY | 1074 | 38 |
if (exists("summary_stats")) kable(summary_stats, caption="Summary Statistics for Key Variables")
| QUANTITYORDERED | PRICEEACH | SALES | MSRP | |
|---|---|---|---|---|
| Mean | 35.09 | 83.66 | 3553.89 | 100.72 |
| Median | 35.00 | 95.70 | 3184.80 | 99.00 |
| SD | 9.74 | 20.17 | 1841.87 | 40.19 |
| Min | 6.00 | 26.88 | 482.13 | 33.00 |
| Max | 97.00 | 100.00 | 14082.80 | 214.00 |
if (exists("product_perf")) kable(head(product_perf), caption="Top Product Lines by Revenue (head)")
| PRODUCTLINE | Orders | Total_Revenue | Avg_Revenue |
|---|---|---|---|
| Classic Cars | 967 | 3919615.7 | 4053.38 |
| Vintage Cars | 607 | 1903150.8 | 3135.34 |
| Motorcycles | 331 | 1166388.3 | 3523.83 |
| Trucks and Buses | 301 | 1127789.8 | 3746.81 |
| Planes | 306 | 975003.6 | 3186.29 |
| Ships | 234 | 714437.1 | 3053.15 |
if (exists("status_dist")) kable(data.frame(Status=names(status_dist), Count=as.numeric(status_dist)),
caption="Order Status Distribution")
| Status | Count |
|---|---|
| Cancelled | 60 |
| Disputed | 14 |
| In Process | 41 |
| On Hold | 44 |
| Resolved | 47 |
| Shipped | 2617 |
safe_kable("yearly_sales", "Year-over-Year Performance", format.args = list(big.mark=","))
| YEAR_ID | Total_Sales | Order_Count | Avg_Order_Value |
|---|---|---|---|
| 2,003 | 3,516,980 | 1,000 | 3,516.98 |
| 2,004 | 4,724,163 | 1,345 | 3,512.39 |
| 2,005 | 1,791,487 | 478 | 3,747.88 |
safe_kable("res1", "SQL Result 1 – Product lines by revenue (Shipped, 2004)",
format.args = list(big.mark=",", scientific=FALSE))
| PRODUCTLINE | order_lines | total_revenue | avg_line_revenue |
|---|---|---|---|
| Classic Cars | 425 | 1,702,871.5 | 4,006.757 |
| Vintage Cars | 275 | 883,004.4 | 3,210.925 |
| Motorcycles | 164 | 560,545.2 | 3,417.959 |
| Trucks and Buses | 138 | 509,109.6 | 3,689.200 |
| Planes | 150 | 468,552.2 | 3,123.682 |
| Ships | 99 | 292,522.8 | 2,954.775 |
| Trains | 36 | 111,441.4 | 3,095.595 |
safe_kable("res2", "SQL Result 2 – Top USA customers by avg line revenue (Shipped, 2005)",
format.args = list(big.mark=",", scientific=FALSE))
| CUSTOMERNAME | order_lines | total_revenue | avg_line_revenue |
|---|---|---|---|
| Gift Depot Inc. | 6 | 31,648.47 | 5,274.745 |
| FunGiftIdeas.com | 8 | 37,557.66 | 4,694.708 |
| The Sharp Gifts Warehouse | 9 | 37,526.84 | 4,169.649 |
| Corporate Gift Ideas Co. | 13 | 54,203.62 | 4,169.509 |
| Collectables For Less Inc. | 8 | 31,474.78 | 3,934.347 |
| Mini Gifts Distributors Ltd. | 56 | 205,993.93 | 3,678.463 |
| Mini Creations Ltd. | 3 | 11,021.30 | 3,673.767 |
| Technics Stores Inc. | 4 | 13,529.57 | 3,382.392 |
safe_kable("op1_means", "Opinion 1 – Q4 vs Q1–Q3 mean SALES (line level)", digits=2)
| group | n | mean_sales | sd_sales |
|---|---|---|---|
| Q1toQ3 | 1729 | 3561.51 | 1879.12 |
| Q4 | 1094 | 3541.85 | 1782.18 |
safe_kable("summ", "Opinion 2 – Shipped counts & rates by deal size (Small vs Large)", digits=3)
| DEALSIZE | shipped | total | rate |
|---|---|---|---|
| Large | 143 | 157 | 0.911 |
| Small | 1196 | 1282 | 0.933 |
if (exists("lm_cv")) {
kable(lm_cv$results, caption="Linear Regression 5-fold CV Results (RMSE, R²)")
} else {
cat("*Skipped (object not found): lm_cv$results\n\n")
}
| intercept | RMSE | Rsquared | MAE | RMSESD | RsquaredSD | MAESD |
|---|---|---|---|---|---|---|
| TRUE | 0.1294282 | 0.9383337 | 0.0989613 | 0.0073696 | 0.0075595 | 0.0043599 |
safe_kable("metrics_tbl", "Forecast accuracy on 6-month test set (ETS vs ARIMA) – lower = better")
| Model | MAD | MSE | RMSE | MAPE |
|---|---|---|---|---|
| ETS | 728279.49 | 533730822655 | 730568.8 | 210.32 |
| ARIMA | 90059.82 | 18742773780 | 136904.2 | 25.08 |
# Histogram of SALES (from Q1a)
if (exists("sales_data")) {
hist(sales_data$SALES, breaks = 30, main = "Sales Amount Distribution (reprint)",
xlab = "Sales ($)", col = "lightblue")
} else {
plot.new(); title("Skipped: sales_data not found")
}
# Linear Regression diagnostics (from Q1d)
if (exists("lm_step")) {
par(mfrow = c(1,2))
plot(lm_step$fitted.values, residuals(lm_step),
xlab="Fitted log(SALES)", ylab="Residuals",
main="Residuals vs Fitted (reprint)"); abline(h=0, lty=2)
qqnorm(residuals(lm_step)); qqline(residuals(lm_step), col=2)
par(mfrow = c(1,1))
} else {
plot.new(); title("Skipped: lm_step not found")
}
# Time-series: Monthly Sales with Train/Test split (from Q1f)
if (exists("ts_df_full")) {
n <- nrow(ts_df_full); h <- if (exists("h")) h else 6
library(ggplot2)
ggplot(ts_df_full, aes(x = YearMonth, y = Monthly_Sales)) +
geom_line() +
{if (n > h) geom_vline(xintercept = ts_df_full$YearMonth[n - h], linetype = 2) else NULL} +
labs(title = "Monthly Sales (Train/Test split – reprint)",
x = "Month", y = "Revenue ($)") +
theme_minimal()
} else {
plot.new(); title("Skipped: ts_df_full not found")
}
# Forecasts vs Actuals (test window) – ETS & ARIMA (from Q1f)
if (exists("ets_fc") && exists("arima_fc") && exists("ts_df_full")) {
n <- nrow(ts_df_full); h <- length(ets_fc$mean)
plot_test <- ts_df_full[(n - h + 1):n, ]
plot_test$ETS_Forecast <- as.numeric(ets_fc$mean)
plot_test$ARIMA_Forecast <- as.numeric(arima_fc$mean)
ggplot(plot_test, aes(x = YearMonth)) +
geom_line(aes(y = Monthly_Sales), size = 1) +
geom_line(aes(y = ETS_Forecast), linetype = 2) +
geom_line(aes(y = ARIMA_Forecast), linetype = 3) +
labs(title = "Actual vs Forecast (Test Months – reprint)",
x = "Month", y = "Revenue ($)",
subtitle = "Solid = Actual; Dotted = ETS; Dot-dash = ARIMA") +
theme_minimal()
} else {
plot.new(); title("Skipped: forecasts not found (ets_fc/arima_fc)")
}
# Optional: ROC curve (from Q1e)
if (exists("roc_obj")) {
plot(roc_obj, main = "ROC Curve (reprint)")
} else if (exists("test_df") && "pred_prob" %in% names(test_df) && "SHIPPED_FLAG" %in% names(test_df)) {
if (!requireNamespace("pROC", quietly = TRUE)) install.packages("pROC")
library(pROC)
roc_tmp <- roc(response = test_df$SHIPPED_FLAG, predictor = test_df$pred_prob)
plot(roc_tmp, main = "ROC Curve (reprint)")
} else {
plot.new(); title("Skipped: ROC objects not found")
}
sessionInfo()
## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 22621)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=English_Singapore.utf8 LC_CTYPE=English_Singapore.utf8
## [3] LC_MONETARY=English_Singapore.utf8 LC_NUMERIC=C
## [5] LC_TIME=English_Singapore.utf8
##
## time zone: Asia/Singapore
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] forecast_8.24.0 pscl_1.5.9 pROC_1.19.0.1 caret_7.0-1
## [5] lattice_0.22-7 sqldf_0.4-11 RSQLite_2.4.3 gsubfn_0.7
## [9] proto_1.0.0 knitr_1.50 lubridate_1.9.4 forcats_1.0.0
## [13] stringr_1.5.2 dplyr_1.1.4 purrr_1.1.0 readr_2.1.5
## [17] tidyr_1.3.1 tibble_3.3.0 ggplot2_4.0.0 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.2.1 timeDate_4041.110 farver_2.1.2
## [4] blob_1.2.4 S7_0.2.0 fastmap_1.2.0
## [7] digest_0.6.37 rpart_4.1.24 timechange_0.3.0
## [10] lifecycle_1.0.4 survival_3.8-3 magrittr_2.0.3
## [13] compiler_4.5.1 rlang_1.1.6 sass_0.4.10
## [16] tools_4.5.1 yaml_2.3.10 data.table_1.17.8
## [19] labeling_0.4.3 curl_7.0.0 bit_4.6.0
## [22] TTR_0.24.4 plyr_1.8.9 RColorBrewer_1.1-3
## [25] withr_3.0.2 stats4_4.5.1 nnet_7.3-20
## [28] grid_4.5.1 xts_0.14.1 colorspace_2.1-1
## [31] future_1.67.0 globals_0.18.0 scales_1.4.0
## [34] iterators_1.0.14 MASS_7.3-65 cli_3.6.5
## [37] rmarkdown_2.29 chron_2.3-62 generics_0.1.4
## [40] rstudioapi_0.17.1 future.apply_1.20.0 reshape2_1.4.4
## [43] tzdb_0.5.0 DBI_1.2.3 cachem_1.1.0
## [46] splines_4.5.1 parallel_4.5.1 urca_1.3-4
## [49] vctrs_0.6.5 hardhat_1.4.2 Matrix_1.7-3
## [52] jsonlite_2.0.0 tseries_0.10-58 hms_1.1.3
## [55] bit64_4.6.0-1 listenv_0.9.1 foreach_1.5.2
## [58] gower_1.0.2 jquerylib_0.1.4 recipes_1.3.1
## [61] quantmod_0.4.28 glue_1.8.0 parallelly_1.45.1
## [64] codetools_0.2-20 stringi_1.8.7 gtable_0.3.6
## [67] quadprog_1.5-8 lmtest_0.9-40 pillar_1.11.0
## [70] htmltools_0.5.8.1 ipred_0.9-15 lava_1.8.1
## [73] R6_2.6.1 tcltk_4.5.1 evaluate_1.0.5
## [76] memoise_2.0.1 fracdiff_1.5-3 bslib_0.9.0
## [79] class_7.3-23 Rcpp_1.1.0 nlme_3.1-168
## [82] prodlim_2025.04.28 xfun_0.53 zoo_1.8-14
## [85] ModelMetrics_1.2.2.2 pkgconfig_2.0.3
Data given
library(dplyr)
library(knitr)
# Inputs
fixed_dc <- 1200000
vc_dc <- 120
c_no_dc <- 180
scenarios <- tibble::tibble(
Scenario = c("Low","Medium","High"),
Demand = c(8000, 12000, 18000),
Prob = c(0.3, 0.5, 0.2)
)
# Costs per scenario under each decision
results <- scenarios %>%
mutate(
Cost_OpenDC = fixed_dc + vc_dc * Demand,
Cost_NoDC = c_no_dc * Demand,
Delta_Saving = Cost_NoDC - Cost_OpenDC # +ve means opening DC saves money
)
kable(results, caption = "Per-scenario annual cost (₹) under each decision",
format.args = list(big.mark=","), digits = 0)
| Scenario | Demand | Prob | Cost_OpenDC | Cost_NoDC | Delta_Saving |
|---|---|---|---|---|---|
| Low | 8,000 | 0 | 2,160,000 | 1,440,000 | -720,000 |
| Medium | 12,000 | 0 | 2,640,000 | 2,160,000 | -480,000 |
| High | 18,000 | 0 | 3,360,000 | 3,240,000 | -120,000 |
# Expected (EMV = expected cost) for each decision
emv_open <- sum(results$Prob * results$Cost_OpenDC)
emv_no <- sum(results$Prob * results$Cost_NoDC)
emv_tbl <- tibble::tibble(
Decision = c("Open DC","Do Not Open DC"),
EMV_Expected_Cost = c(emv_open, emv_no)
)
kable(emv_tbl, caption = "EMV (expected annual cost) by decision (₹)",
format.args = list(big.mark=","), digits = 0)
| Decision | EMV_Expected_Cost |
|---|---|
| Open DC | 2,640,000 |
| Do Not Open DC | 2,160,000 |
# Also show expected saving if we opened the DC (vs no DC)
exp_saving_if_open <- emv_no - emv_open # +ve => opening DC is better
cat("Expected saving if we OPEN the DC (vs stay as-is): ₹",
format(round(exp_saving_if_open,0), big.mark=","), "\n")
## Expected saving if we OPEN the DC (vs stay as-is): ₹ -480,000
Quick sense-check (break-even): Opening the DC beats status quo when
Fixed + 120×Demand < 180×Demand ⇒ 1,200,000 < 60×Demand ⇒ Demand > 20,000.
Our “High” case is 18,000, still below 20,000, so opening the DC doesn’t pay off in any of the given scenarios — which is consistent with the EMV table.
Manager takeaway: Given today’s demand outlook, the safer, cheaper choice is not to open the regional DC. If demand projections rise above ~20k units/year, revisit the decision.
We compare annual cost to Open DC vs cost to Not Open:
Break-even when they’re equal: \[ 1{,}200{,}000 + 120D = 180D \Rightarrow 1{,}200{,}000 = 60D \Rightarrow D^\* = \frac{1{,}200{,}000}{60} = \mathbf{20{,}000\ units} \]
So opening only makes sense if annual demand exceeds 20,000 units.
Assumption: Profit difference comes only from cost
savings (selling price/revenue same either way).
So, opening the DC “wins” when Cost_Open <
Cost_No.
Remark: Triangular is bounded between min and max, so simulated demands will always be ≤ 18,000, which is still below the break-even 20,000.
set.seed(123)
# Given cost parameters
fixed_dc <- 1200000
vc_dc <- 120
c_no_dc <- 180
# Triangular parameters
a <- 8000 # min
m <- 12000 # mode
b <- 18000 # max
# Custom triangular RNG (inverse CDF), so no extra package needed
rtri_inv <- function(n, a, m, b){
u <- runif(n)
Fc <- (m - a) / (b - a)
x <- ifelse(
u < Fc,
a + sqrt(u * (b - a) * (m - a)),
b - sqrt((1 - u) * (b - a) * (b - m))
)
return(x)
}
N <- 10000
demand_sim <- rtri_inv(N, a, m, b)
# Compute annual costs under each decision
cost_open <- fixed_dc + vc_dc * demand_sim
cost_no <- c_no_dc * demand_sim
# "Savings" if we open (positive = opening is better)
savings <- cost_no - cost_open
# Probability opening DC is better (i.e., savings > 0)
prob_open_better <- mean(savings > 0)
# Also report expected savings (useful sense-check)
exp_saving <- mean(savings)
# Print results
cat("Break-even demand (analytic): 20,000 units\n")
## Break-even demand (analytic): 20,000 units
cat("Simulation: P(opening is better) =", round(prob_open_better, 4), "\n")
## Simulation: P(opening is better) = 0
cat("Simulation: Expected saving if open (₹) =", round(exp_saving, 0), "\n")
## Simulation: Expected saving if open (₹) = -441274
library(ggplot2)
ggplot(data.frame(Demand = demand_sim), aes(x = Demand)) +
geom_histogram(bins = 40, fill = "skyblue", color = "white") +
geom_vline(xintercept = 20000, linetype = 2) +
labs(title = "Triangular Demand Simulation (min=8k, mode=12k, max=18k)",
subtitle = "Dashed line = 20,000-unit break-even",
x = "Annual Demand (units)", y = "Frequency") +
theme_minimal()
Under the stated uncertainty (8k–18k demand, most likely 12k), the chance that opening the DC beats the status quo is essentially 0%. Unless demand forecasts credibly move beyond 20,000 units/year, do not open the DC. If new information suggests higher demand, rerun this exact simulation with the updated range/mode.
What we’re deciding:
- Open 0, 1, or 2 mini-DCs (each mini-DC has fixed cost
and capacity).
- How many units to ship via each open mini-DC vs the existing
network.
Given - Mini-DC fixed cost =
₹700,000 (per mini-DC)
- Mini-DC variable cost = ₹130 per unit
- Mini-DC capacity = 9,000 units
- Existing network (no DC) variable cost = ₹180 per
unit
- Demand \(D\) (I’ll solve for
Low=8k, Medium=12k,
High=18k like in 2a, and you can plug any \(D\) you want)
Decision variables - \(y_1, y_2 \in \{0,1\}\): open mini-DC 1 or
2?
- \(x_1, x_2 \ge 0\): units shipped via
mini-DC 1 and 2
- \(x_0 \ge 0\): units shipped via
existing network
Objective (minimize total annual cost) \[ \min\ C = 700{,}000(y_1+y_2) + 130(x_1+x_2) + 180x_0 \]
Constraints \[ \begin{aligned} &x_0 + x_1 + x_2 = D &\text{(meet demand)}\\ &x_1 \le 9{,}000\cdot y_1,\quad x_2 \le 9{,}000\cdot y_2 &\text{(capacity only if opened)}\\ &x_0, x_1, x_2 \ge 0,\quad y_1, y_2 \in \{0,1\} \end{aligned} \]
# If needed: install.packages("lpSolve")
library(lpSolve)
library(dplyr)
library(knitr)
solve_minidc <- function(D){
# Vars order: [x0, x1, x2, y1, y2]
obj <- c(180, 130, 130, 700000, 700000)
# Constraints:
# 1) Demand: x0 + x1 + x2 = D
# 2) x1 - 9000*y1 <= 0
# 3) x2 - 9000*y2 <= 0
const.mat <- rbind(
c(1, 1, 1, 0, 0),
c(0, 1, 0, -9000, 0),
c(0, 0, 1, 0, -9000)
)
const.dir <- c("=", "<=", "<=")
const.rhs <- c(D, 0, 0)
# Solve MIP
out <- lp(direction = "min",
objective.in = obj,
const.mat = const.mat,
const.dir = const.dir,
const.rhs = const.rhs,
all.bin = FALSE,
binary.vec = c(4,5)) # y1,y2 binary
sol <- out$solution
tibble(
Demand = D,
TotalCost = out$objval,
x0 = sol[1], x1 = sol[2], x2 = sol[3],
y1 = sol[4], y2 = sol[5],
MinisOpened = sol[4] + sol[5]
)
}
scenarios <- c(8000, 12000, 18000)
res_tbl <- bind_rows(lapply(scenarios, solve_minidc))
# Add "No mini-DC" baseline for comparison
baseline <- tibble(
Demand = scenarios,
BaselineCost_NoMini = 180 * scenarios
)
final_tbl <- res_tbl %>%
left_join(baseline, by = "Demand") %>%
mutate(Choice = ifelse(TotalCost < BaselineCost_NoMini,
paste0("Open ", MinisOpened, " mini-DC(s)"),
"Open 0 (use existing network)"))
kable(final_tbl, caption = "Mini-DC MIP solution vs baseline (per demand scenario)",
format.args = list(big.mark=","), digits = 0)
| Demand | TotalCost | x0 | x1 | x2 | y1 | y2 | MinisOpened | BaselineCost_NoMini | Choice |
|---|---|---|---|---|---|---|---|---|---|
| 8,000 | 1,440,000 | 8,000 | 0 | 0 | 0 | 0 | 0 | 1,440,000 | Open 0 mini-DC(s) |
| 12,000 | 2,160,000 | 12,000 | 0 | 0 | 0 | 0 | 0 | 2,160,000 | Open 0 mini-DC(s) |
| 18,000 | 3,240,000 | 18,000 | 0 | 0 | 0 | 0 | 0 | 3,240,000 | Open 0 mini-DC(s) |
(What the table will show — worked numbers)
For these costs and capacities, the solver selects 0
mini-DCs in all three scenarios:
With today’s numbers, the cost-minimizing choice is to open 0
mini-DCs and ship everything through the existing
network.
If the fixed cost per mini drops below ~₹450k, or the
baseline per-unit cost rises, or mini-DC
capacity increases materially, this conclusion could
change—then re-run the same model with the updated inputs.
Remark (packages): I used lpSolve
to solve this small mixed-integer model. If knitting on a new machine,
install.packages("lpSolve").