This analysis addresses three research questions:
Q1: Are there statistically significant differences in profit margins among different customer segments (Consumer, Corporate, Home Office)?
Q2: Will product prices have a positive impact on profit margins?
Q3: Are there significant differences in profit margins among three ship modes (Standard Class, Second Class, First Class)?
suppressPackageStartupMessages(library(readxl))
set.seed(42)
# Load the sales dataset - UPDATE THIS PATH TO YOUR FILE
data <- as.data.frame(read_excel("sales_data.xlsx"))
cat(sprintf("Dataset Dimensions: %d rows x %d columns
Column Names:
%s
First 6 rows:
", nrow(data), ncol(data), paste(names(data), collapse = ", ")))
## Dataset Dimensions: 9800 rows x 22 columns
##
## Column Names:
## Row ID, Order ID, Order Date, Ship Date, Ship Mode, Customer ID, Customer Name, Segment, Country, City, State, Postal Code, Region, Product ID, Category, Sub-Category, Product Name, Cost, Price, Profit, Quantity, Sales
##
## First 6 rows:
print(head(data[, 1:8]))
## Row ID Order ID Order Date Ship Date Ship Mode Customer ID
## 1 1 CA-2017-152156 2017-11-08 2017-11-11 Second Class CG-12520
## 2 2 CA-2017-152156 2017-11-08 2017-11-11 Second Class CG-12520
## 3 3 CA-2017-138688 2017-06-12 2017-06-16 Second Class DV-13045
## 4 4 US-2016-108966 2016-10-11 2016-10-18 Standard Class SO-20335
## 5 5 US-2016-108966 2016-10-11 2016-10-18 Standard Class SO-20335
## 6 6 CA-2015-115812 2015-06-09 2015-06-14 Standard Class BH-11710
## Customer Name Segment
## 1 Claire Gute Consumer
## 2 Claire Gute Consumer
## 3 Darrin Van Huff Corporate
## 4 Sean O'Donnell Consumer
## 5 Sean O'Donnell Consumer
## 6 Brosina Hoffman Consumer
\[\text{Profit Margin} = \frac{\text{Profit}}{\text{Sales}}\]
data$pm <- ifelse(data$Sales == 0, NA, data$Profit / data$Sales)
data <- data[!is.na(data$pm), ]
cat(sprintf("Profit margin created.
NA values removed.
Final sample size: %d
", nrow(data)))
## Profit margin created.
## NA values removed.
## Final sample size: 9800
consumer_pm <- data$pm[data$Segment == "Consumer"]
corporate_pm <- data$pm[data$Segment == "Corporate"]
homeoffice_pm <- data$pm[data$Segment == "Home Office"]
cat(sprintf("=== DESCRIPTIVE STATISTICS BY CUSTOMER SEGMENT ===
CONSUMER:
N = %d
Mean = %.6f
SD = %.4f
Variance = %.4f
Median = %.6f
Range = [%.4f, %.4f]
CORPORATE:
N = %d
Mean = %.6f
SD = %.4f
Variance = %.4f
Median = %.6f
Range = [%.4f, %.4f]
HOME OFFICE:
N = %d
Mean = %.6f
SD = %.4f
Variance = %.4f
Median = %.6f
Range = [%.4f, %.4f]
KEY OBSERVATION: All segments show NEGATIVE mean profit margins.
",
length(consumer_pm), mean(consumer_pm), sd(consumer_pm), var(consumer_pm),
median(consumer_pm), min(consumer_pm), max(consumer_pm),
length(corporate_pm), mean(corporate_pm), sd(corporate_pm), var(corporate_pm),
median(corporate_pm), min(corporate_pm), max(corporate_pm),
length(homeoffice_pm), mean(homeoffice_pm), sd(homeoffice_pm), var(homeoffice_pm),
median(homeoffice_pm), min(homeoffice_pm), max(homeoffice_pm)))
## === DESCRIPTIVE STATISTICS BY CUSTOMER SEGMENT ===
##
## CONSUMER:
## N = 5101
## Mean = -0.306701
## SD = 2.6685
## Variance = 7.1211
## Median = -0.000557
## Range = [-101.7471, 0.9558]
##
## CORPORATE:
## N = 2953
## Mean = -0.390451
## SD = 3.4586
## Variance = 11.9621
## Median = -0.000229
## Range = [-104.9584, 0.9947]
##
## HOME OFFICE:
## N = 1746
## Mean = -0.450146
## SD = 4.9703
## Variance = 24.7040
## Median = 0.000388
## Range = [-180.5577, 0.9774]
##
## KEY OBSERVATION: All segments show NEGATIVE mean profit margins.
boxplot(pm ~ Segment, data = data,
main = "Profit Margin Distribution by Customer Segment",
ylab = "Profit Margin", xlab = "Customer Segment",
col = c("lightblue", "lightgreen", "lightyellow"))
abline(h = 0, col = "red", lty = 2, lwd = 2)
legend("topright", legend = "Break-even (0)", col = "red", lty = 2, lwd = 2)
par(mfrow = c(3, 1))
hist(consumer_pm, breaks = 50, main = "Consumer Segment",
xlab = "Profit Margin", col = "lightblue", xlim = c(-10, 10))
abline(v = 0, col = "red", lty = 2, lwd = 2)
abline(v = mean(consumer_pm), col = "blue", lwd = 2)
hist(corporate_pm, breaks = 50, main = "Corporate Segment",
xlab = "Profit Margin", col = "lightgreen", xlim = c(-10, 10))
abline(v = 0, col = "red", lty = 2, lwd = 2)
abline(v = mean(corporate_pm), col = "blue", lwd = 2)
hist(homeoffice_pm, breaks = 50, main = "Home Office Segment",
xlab = "Profit Margin", col = "lightyellow", xlim = c(-10, 10))
abline(v = 0, col = "red", lty = 2, lwd = 2)
abline(v = mean(homeoffice_pm), col = "blue", lwd = 2)
par(mfrow = c(1, 1))
set.seed(42)
sw_con <- shapiro.test(sample(consumer_pm, min(5000, length(consumer_pm))))
sw_cor <- shapiro.test(sample(corporate_pm, min(5000, length(corporate_pm))))
sw_ho <- shapiro.test(sample(homeoffice_pm, min(5000, length(homeoffice_pm))))
cat(sprintf("=== NORMALITY TESTS (Shapiro-Wilk) ===
Consumer: W = %.4f, p = %s
Corporate: W = %.4f, p = %s
Home Office: W = %.4f, p = %s
All p < 0.05: Data deviates from normality.
With large samples, t-tests are robust. Mann-Whitney U provides robustness check.
", sw_con$statistic, format(sw_con$p.value, digits = 3),
sw_cor$statistic, format(sw_cor$p.value, digits = 3),
sw_ho$statistic, format(sw_ho$p.value, digits = 3)))
## === NORMALITY TESTS (Shapiro-Wilk) ===
##
## Consumer: W = 0.1164, p = 6.05e-93
## Corporate: W = 0.0964, p = 5.1e-80
## Home Office: W = 0.0686, p = 2.78e-68
##
## All p < 0.05: Data deviates from normality.
## With large samples, t-tests are robust. Mann-Whitney U provides robustness check.
For independent samples with unequal variances (Welch’s t-test):
\[t = \frac{\bar{X}_1 - \bar{X}_2}{\sqrt{\frac{s_1^2}{n_1} + \frac{s_2^2}{n_2}}}\]
my_ttest <- function(x1, x2, name1, name2) {
n1 <- length(x1); n2 <- length(x2)
m1 <- mean(x1); m2 <- mean(x2)
v1 <- var(x1); v2 <- var(x2)
s1 <- sd(x1); s2 <- sd(x2)
se <- sqrt(v1/n1 + v2/n2)
t_stat <- (m1 - m2) / se
df <- ((v1/n1 + v2/n2)^2) / ((v1/n1)^2/(n1-1) + (v2/n2)^2/(n2-1))
p_one <- pt(t_stat, df, lower.tail = FALSE)
p_two <- 2 * pt(abs(t_stat), df, lower.tail = FALSE)
pooled_sd <- sqrt(((n1-1)*s1^2 + (n2-1)*s2^2) / (n1+n2-2))
d <- (m1 - m2) / pooled_sd
effect_interp <- ifelse(abs(d) < 0.2, "Negligible (< 0.2)",
ifelse(abs(d) < 0.5, "Small (0.2 - 0.5)",
ifelse(abs(d) < 0.8, "Medium (0.5 - 0.8)", "Large (> 0.8)")))
decision <- ifelse(p_one < 0.05,
sprintf(" p = %.4f < 0.05
Decision: REJECT H0
Conclusion: %s has significantly higher profit margin than %s", p_one, name1, name2),
sprintf(" p = %.4f >= 0.05
Decision: FAIL TO REJECT H0
Conclusion: No significant evidence that %s > %s", p_one, name1, name2))
cat(sprintf("============================================================
T-TEST: %s vs %s
============================================================
HYPOTHESES:
H0: mu_%s <= mu_%s
H1: mu_%s > mu_%s
SAMPLE STATISTICS:
%s:
N = %d
Mean = %.6f
SD = %.4f
Variance = %.4f
%s:
N = %d
Mean = %.6f
SD = %.4f
Variance = %.4f
STEP-BY-STEP CALCULATION:
Step 1: Calculate mean difference
Mean1 - Mean2 = %.6f - (%.6f)
= %.6f
Step 2: Calculate standard error
SE = sqrt(Var1/n1 + Var2/n2)
= sqrt(%.4f/%d + %.4f/%d)
= sqrt(%.8f + %.8f)
= sqrt(%.8f)
= %.6f
Step 3: Calculate t-statistic
t = (Mean1 - Mean2) / SE
= %.6f / %.6f
= %.4f
Step 4: Calculate Welch degrees of freedom
df = (Var1/n1 + Var2/n2)^2 / [(Var1/n1)^2/(n1-1) + (Var2/n2)^2/(n2-1)]
= %.2f
Step 5: Calculate p-value (one-tailed)
p = P(T > %.4f) with df = %.2f
= %.4f
RESULTS SUMMARY:
t-statistic = %.4f
df = %.2f
p-value (one-tailed) = %.4f
p-value (two-tailed) = %.4f
EFFECT SIZE (Cohen's d):
d = (Mean1 - Mean2) / pooled_SD
= %.6f / %.4f
= %.4f
Interpretation: %s
DECISION (alpha = 0.05):
%s
",
name1, name2, name1, name2, name1, name2,
name1, n1, m1, s1, v1,
name2, n2, m2, s2, v2,
m1, m2, m1 - m2,
v1, n1, v2, n2, v1/n1, v2/n2, v1/n1 + v2/n2, se,
m1 - m2, se, t_stat,
df,
t_stat, df, p_one,
t_stat, df, p_one, p_two,
m1 - m2, pooled_sd, d, effect_interp,
decision))
invisible(list(t = t_stat, df = df, p_one = p_one, p_two = p_two, d = d))
}
res1 <- my_ttest(consumer_pm, corporate_pm, "Consumer", "Corporate")
## ============================================================
## T-TEST: Consumer vs Corporate
## ============================================================
##
## HYPOTHESES:
## H0: mu_Consumer <= mu_Corporate
## H1: mu_Consumer > mu_Corporate
##
## SAMPLE STATISTICS:
## Consumer:
## N = 5101
## Mean = -0.306701
## SD = 2.6685
## Variance = 7.1211
##
## Corporate:
## N = 2953
## Mean = -0.390451
## SD = 3.4586
## Variance = 11.9621
##
## STEP-BY-STEP CALCULATION:
##
## Step 1: Calculate mean difference
## Mean1 - Mean2 = -0.306701 - (-0.390451)
## = 0.083750
##
## Step 2: Calculate standard error
## SE = sqrt(Var1/n1 + Var2/n2)
## = sqrt(7.1211/5101 + 11.9621/2953)
## = sqrt(0.00139601 + 0.00405083)
## = sqrt(0.00544684)
## = 0.073803
##
## Step 3: Calculate t-statistic
## t = (Mean1 - Mean2) / SE
## = 0.083750 / 0.073803
## = 1.1348
##
## Step 4: Calculate Welch degrees of freedom
## df = (Var1/n1 + Var2/n2)^2 / [(Var1/n1)^2/(n1-1) + (Var2/n2)^2/(n2-1)]
## = 4993.95
##
## Step 5: Calculate p-value (one-tailed)
## p = P(T > 1.1348) with df = 4993.95
## = 0.1283
##
## RESULTS SUMMARY:
## t-statistic = 1.1348
## df = 4993.95
## p-value (one-tailed) = 0.1283
## p-value (two-tailed) = 0.2565
##
## EFFECT SIZE (Cohen's d):
## d = (Mean1 - Mean2) / pooled_SD
## = 0.083750 / 2.9826
## = 0.0281
## Interpretation: Negligible (< 0.2)
##
## DECISION (alpha = 0.05):
## p = 0.1283 >= 0.05
## Decision: FAIL TO REJECT H0
## Conclusion: No significant evidence that Consumer > Corporate
res2 <- my_ttest(consumer_pm, homeoffice_pm, "Consumer", "HomeOffice")
## ============================================================
## T-TEST: Consumer vs HomeOffice
## ============================================================
##
## HYPOTHESES:
## H0: mu_Consumer <= mu_HomeOffice
## H1: mu_Consumer > mu_HomeOffice
##
## SAMPLE STATISTICS:
## Consumer:
## N = 5101
## Mean = -0.306701
## SD = 2.6685
## Variance = 7.1211
##
## HomeOffice:
## N = 1746
## Mean = -0.450146
## SD = 4.9703
## Variance = 24.7040
##
## STEP-BY-STEP CALCULATION:
##
## Step 1: Calculate mean difference
## Mean1 - Mean2 = -0.306701 - (-0.450146)
## = 0.143445
##
## Step 2: Calculate standard error
## SE = sqrt(Var1/n1 + Var2/n2)
## = sqrt(7.1211/5101 + 24.7040/1746)
## = sqrt(0.00139601 + 0.01414891)
## = sqrt(0.01554492)
## = 0.124679
##
## Step 3: Calculate t-statistic
## t = (Mean1 - Mean2) / SE
## = 0.143445 / 0.124679
## = 1.1505
##
## Step 4: Calculate Welch degrees of freedom
## df = (Var1/n1 + Var2/n2)^2 / [(Var1/n1)^2/(n1-1) + (Var2/n2)^2/(n2-1)]
## = 2099.34
##
## Step 5: Calculate p-value (one-tailed)
## p = P(T > 1.1505) with df = 2099.34
## = 0.1250
##
## RESULTS SUMMARY:
## t-statistic = 1.1505
## df = 2099.34
## p-value (one-tailed) = 0.1250
## p-value (two-tailed) = 0.2501
##
## EFFECT SIZE (Cohen's d):
## d = (Mean1 - Mean2) / pooled_SD
## = 0.143445 / 3.4064
## = 0.0421
## Interpretation: Negligible (< 0.2)
##
## DECISION (alpha = 0.05):
## p = 0.1250 >= 0.05
## Decision: FAIL TO REJECT H0
## Conclusion: No significant evidence that Consumer > HomeOffice
res3 <- my_ttest(corporate_pm, homeoffice_pm, "Corporate", "HomeOffice")
## ============================================================
## T-TEST: Corporate vs HomeOffice
## ============================================================
##
## HYPOTHESES:
## H0: mu_Corporate <= mu_HomeOffice
## H1: mu_Corporate > mu_HomeOffice
##
## SAMPLE STATISTICS:
## Corporate:
## N = 2953
## Mean = -0.390451
## SD = 3.4586
## Variance = 11.9621
##
## HomeOffice:
## N = 1746
## Mean = -0.450146
## SD = 4.9703
## Variance = 24.7040
##
## STEP-BY-STEP CALCULATION:
##
## Step 1: Calculate mean difference
## Mean1 - Mean2 = -0.390451 - (-0.450146)
## = 0.059696
##
## Step 2: Calculate standard error
## SE = sqrt(Var1/n1 + Var2/n2)
## = sqrt(11.9621/2953 + 24.7040/1746)
## = sqrt(0.00405083 + 0.01414891)
## = sqrt(0.01819973)
## = 0.134906
##
## Step 3: Calculate t-statistic
## t = (Mean1 - Mean2) / SE
## = 0.059696 / 0.134906
## = 0.4425
##
## Step 4: Calculate Welch degrees of freedom
## df = (Var1/n1 + Var2/n2)^2 / [(Var1/n1)^2/(n1-1) + (Var2/n2)^2/(n2-1)]
## = 2753.79
##
## Step 5: Calculate p-value (one-tailed)
## p = P(T > 0.4425) with df = 2753.79
## = 0.3291
##
## RESULTS SUMMARY:
## t-statistic = 0.4425
## df = 2753.79
## p-value (one-tailed) = 0.3291
## p-value (two-tailed) = 0.6582
##
## EFFECT SIZE (Cohen's d):
## d = (Mean1 - Mean2) / pooled_SD
## = 0.059696 / 4.0861
## = 0.0146
## Interpretation: Negligible (< 0.2)
##
## DECISION (alpha = 0.05):
## p = 0.3291 >= 0.05
## Decision: FAIL TO REJECT H0
## Conclusion: No significant evidence that Corporate > HomeOffice
t1 <- t.test(consumer_pm, corporate_pm, alternative = "greater")
t2 <- t.test(consumer_pm, homeoffice_pm, alternative = "greater")
t3 <- t.test(corporate_pm, homeoffice_pm, alternative = "greater")
cat(sprintf("=== VERIFICATION WITH R's t.test() FUNCTION ===
Consumer vs Corporate:
t = %.4f, df = %.2f, p-value = %.4f
Consumer vs Home Office:
t = %.4f, df = %.2f, p-value = %.4f
Corporate vs Home Office:
t = %.4f, df = %.2f, p-value = %.4f
",
t1$statistic, t1$parameter, t1$p.value,
t2$statistic, t2$parameter, t2$p.value,
t3$statistic, t3$parameter, t3$p.value))
## === VERIFICATION WITH R's t.test() FUNCTION ===
##
## Consumer vs Corporate:
## t = 1.1348, df = 4993.95, p-value = 0.1283
##
## Consumer vs Home Office:
## t = 1.1505, df = 2099.34, p-value = 0.1250
##
## Corporate vs Home Office:
## t = 0.4425, df = 2753.79, p-value = 0.3291
mw1 <- wilcox.test(consumer_pm, corporate_pm, alternative = "greater")
mw2 <- wilcox.test(consumer_pm, homeoffice_pm, alternative = "greater")
mw3 <- wilcox.test(corporate_pm, homeoffice_pm, alternative = "greater")
cat(sprintf("=== MANN-WHITNEY U TESTS (Non-Parametric Robustness Check) ===
Consumer vs Corporate:
W statistic = %.0f
p-value = %.4f
Decision: %s
Consumer vs Home Office:
W statistic = %.0f
p-value = %.4f
Decision: %s
Corporate vs Home Office:
W statistic = %.0f
p-value = %.4f
Decision: %s
CONCLUSION: Mann-Whitney results confirm t-test findings.
No significant differences between any segment pairs.
",
mw1$statistic, mw1$p.value, ifelse(mw1$p.value < 0.05, "Significant", "Not Significant"),
mw2$statistic, mw2$p.value, ifelse(mw2$p.value < 0.05, "Significant", "Not Significant"),
mw3$statistic, mw3$p.value, ifelse(mw3$p.value < 0.05, "Significant", "Not Significant")))
## === MANN-WHITNEY U TESTS (Non-Parametric Robustness Check) ===
##
## Consumer vs Corporate:
## W statistic = 7649760
## p-value = 0.1200
## Decision: Not Significant
##
## Consumer vs Home Office:
## W statistic = 4446281
## p-value = 0.5385
## Decision: Not Significant
##
## Corporate vs Home Office:
## W statistic = 2533396
## p-value = 0.8394
## Decision: Not Significant
##
## CONCLUSION: Mann-Whitney results confirm t-test findings.
## No significant differences between any segment pairs.
cat(sprintf("====================================================================
QUESTION 1 SUMMARY: Customer Segment Profit Margin Differences
====================================================================
Comparison t-stat p-value Cohen's d Decision
--------------------------------------------------------------------
Consumer vs Corporate %7.4f %7.4f %7.4f Fail to Reject H0
Consumer vs Home Office %7.4f %7.4f %7.4f Fail to Reject H0
Corporate vs Home Office %7.4f %7.4f %7.4f Fail to Reject H0
CONCLUSION:
There are NO statistically significant differences in profit margins
among the three customer segments. All p-values > 0.05 and all
effect sizes are negligible (|d| < 0.2).
", res1$t, res1$p_one, res1$d, res2$t, res2$p_one, res2$d, res3$t, res3$p_one, res3$d))
## ====================================================================
## QUESTION 1 SUMMARY: Customer Segment Profit Margin Differences
## ====================================================================
##
## Comparison t-stat p-value Cohen's d Decision
## --------------------------------------------------------------------
## Consumer vs Corporate 1.1348 0.1283 0.0281 Fail to Reject H0
## Consumer vs Home Office 1.1505 0.1250 0.0421 Fail to Reject H0
## Corporate vs Home Office 0.4425 0.3291 0.0146 Fail to Reject H0
##
## CONCLUSION:
## There are NO statistically significant differences in profit margins
## among the three customer segments. All p-values > 0.05 and all
## effect sizes are negligible (|d| < 0.2).
price_cols <- grep("price", names(data), ignore.case = TRUE, value = TRUE)
if ("Unit Price" %in% names(data)) {
data$price <- data[["Unit Price"]]
price_source <- "Using 'Unit Price' column"
} else {
data$price <- data$Sales / data$Quantity
price_source <- "Calculated: Unit Price = Sales / Quantity"
}
cat(sprintf("Price columns found: %s
%s
Price Statistics:
N: %d
Mean: %.2f
SD: %.2f
Min: %.2f
Max: %.2f
", paste(price_cols, collapse = ", "), price_source,
sum(!is.na(data$price)), mean(data$price, na.rm = TRUE), sd(data$price, na.rm = TRUE),
min(data$price, na.rm = TRUE), max(data$price, na.rm = TRUE)))
## Price columns found: Price
## Calculated: Unit Price = Sales / Quantity
##
## Price Statistics:
## N: 9800
## Mean: 501.57
## SD: 287.18
## Min: 1.06
## Max: 999.92
plot(data$price, data$pm,
main = "Price vs Profit Margin",
xlab = "Unit Price", ylab = "Profit Margin",
pch = 16, cex = 0.5, col = rgb(0, 0, 1, 0.2))
abline(h = 0, col = "blue", lty = 2, lwd = 2)
abline(lm(pm ~ price, data = data), col = "red", lwd = 2)
legend("topright", legend = c("Regression line", "Break-even"),
col = c("red", "blue"), lty = c(1, 2), lwd = 2)
cor_result <- cor.test(data$price, data$pm, use = "complete.obs")
cat(sprintf("=== CORRELATION ANALYSIS ===
Pearson Correlation Coefficient (r):
r = %.4f
Hypothesis Test:
t-statistic = %.4f
df = %d
p-value = %s
95%% Confidence Interval:
[%.4f, %.4f]
Interpretation: Weak positive correlation.
", cor_result$estimate, cor_result$statistic, cor_result$parameter,
format(cor_result$p.value, scientific = TRUE),
cor_result$conf.int[1], cor_result$conf.int[2]))
## === CORRELATION ANALYSIS ===
##
## Pearson Correlation Coefficient (r):
## r = 0.1823
##
## Hypothesis Test:
## t-statistic = 18.3545
## df = 9798
## p-value = 5.245323e-74
##
## 95% Confidence Interval:
## [0.1631, 0.2014]
##
## Interpretation: Weak positive correlation.
\[\text{Profit Margin} = \beta_0 + \beta_1 \times \text{Price} + \epsilon\]
model <- lm(pm ~ price, data = data)
s <- summary(model)
f_pval <- pf(s$fstatistic[1], s$fstatistic[2], s$fstatistic[3], lower.tail = FALSE)
cat(sprintf("=== LINEAR REGRESSION RESULTS ===
MODEL EQUATION:
Profit Margin = %.6f + %.6f * Price
COEFFICIENTS:
Intercept (beta0) = %.6f
Interpretation: When Price = 0, expected profit margin = %.4f
Slope (beta1) = %.6f
Interpretation: For each $1 increase in price,
profit margin increases by %.6f
PRACTICAL EXAMPLE:
A $100 price increase corresponds to:
Profit margin change = 100 * %.6f = %.4f
MODEL FIT (R-squared):
R-squared = %.4f
Adjusted R-squared = %.4f
Interpretation: Price explains only %.2f%% of variance in profit margin
STATISTICAL SIGNIFICANCE:
F-statistic = %.2f
Model p-value = %s
COEFFICIENT TESTS:
Slope:
t-value = %.4f
p-value = %s
Significant at alpha = 0.05? %s
",
coef(model)[1], coef(model)[2],
coef(model)[1], coef(model)[1],
coef(model)[2], coef(model)[2],
coef(model)[2], coef(model)[2] * 100,
s$r.squared, s$adj.r.squared, s$r.squared * 100,
s$fstatistic[1], format(f_pval, scientific = TRUE),
s$coefficients[2, 3], format(s$coefficients[2, 4], scientific = TRUE),
ifelse(s$coefficients[2, 4] < 0.05, "YES", "NO")))
## === LINEAR REGRESSION RESULTS ===
##
## MODEL EQUATION:
## Profit Margin = -1.447266 + 0.002173 * Price
##
## COEFFICIENTS:
## Intercept (beta0) = -1.447266
## Interpretation: When Price = 0, expected profit margin = -1.4473
##
## Slope (beta1) = 0.002173
## Interpretation: For each $1 increase in price,
## profit margin increases by 0.002173
##
## PRACTICAL EXAMPLE:
## A $100 price increase corresponds to:
## Profit margin change = 100 * 0.002173 = 0.2173
##
## MODEL FIT (R-squared):
## R-squared = 0.0332
## Adjusted R-squared = 0.0331
## Interpretation: Price explains only 3.32% of variance in profit margin
##
## STATISTICAL SIGNIFICANCE:
## F-statistic = 336.89
## Model p-value = 5.245323e-74
##
## COEFFICIENT TESTS:
## Slope:
## t-value = 18.3545
## p-value = 5.245323e-74
## Significant at alpha = 0.05? YES
par(mfrow = c(2, 2))
plot(model)
par(mfrow = c(1, 1))
winsorize <- function(x, probs = c(0.01, 0.99)) {
q <- quantile(x, probs, na.rm = TRUE)
x[x < q[1]] <- q[1]
x[x > q[2]] <- q[2]
x
}
data$pm_wins <- winsorize(data$pm)
model_w <- lm(pm_wins ~ price, data = data)
sw <- summary(model_w)
cat(sprintf("=== WINSORIZATION ===
Original profit margin:
Range: [%.4f, %.4f]
SD: %.4f
Winsorized profit margin (1%% - 99%%):
Range: [%.4f, %.4f]
SD: %.4f
=== WINSORIZED REGRESSION ===
Coefficients:
Intercept: %.6f
Slope: %.6f
Model Fit:
R-squared: %.4f
p-value: %s
COMPARISON:
Original R-squared: %.4f
Winsorized R-squared: %.4f
Conclusion: Results are robust to outlier treatment.
",
min(data$pm), max(data$pm), sd(data$pm),
min(data$pm_wins), max(data$pm_wins), sd(data$pm_wins),
coef(model_w)[1], coef(model_w)[2],
sw$r.squared, format(sw$coefficients[2, 4], scientific = TRUE),
s$r.squared, sw$r.squared))
## === WINSORIZATION ===
##
## Original profit margin:
## Range: [-180.5577, 0.9947]
## SD: 3.4223
##
## Winsorized profit margin (1% - 99%):
## Range: [-7.0194, 0.4921]
## SD: 0.9193
##
## === WINSORIZED REGRESSION ===
##
## Coefficients:
## Intercept: -0.875804
## Slope: 0.001315
##
## Model Fit:
## R-squared: 0.1688
## p-value: 0e+00
##
## COMPARISON:
## Original R-squared: 0.0332
## Winsorized R-squared: 0.1688
## Conclusion: Results are robust to outlier treatment.
cat(sprintf("====================================================================
QUESTION 2 SUMMARY: Price Impact on Profit Margin
====================================================================
HYPOTHESIS:
H0: beta1 <= 0 (price has no positive effect on profit margin)
H1: beta1 > 0 (price has positive effect on profit margin)
RESULTS:
Slope coefficient = %.6f
p-value = %s
R-squared = %.4f (%.2f%%)
DECISION:
p < 0.05: REJECT H0
CONCLUSION:
Price has a STATISTICALLY SIGNIFICANT positive effect on profit margin.
HOWEVER, the effect is ECONOMICALLY WEAK:
- R-squared = 0.033 means price explains only 3.3%% of variance
- The slope is very small (0.002)
- A $100 price increase yields only 0.2 percentage point gain
Therefore, while the relationship is real, price is NOT a major
driver of profit margin variation.
", coef(model)[2], format(s$coefficients[2, 4], scientific = TRUE),
s$r.squared, s$r.squared * 100))
## ====================================================================
## QUESTION 2 SUMMARY: Price Impact on Profit Margin
## ====================================================================
##
## HYPOTHESIS:
## H0: beta1 <= 0 (price has no positive effect on profit margin)
## H1: beta1 > 0 (price has positive effect on profit margin)
##
## RESULTS:
## Slope coefficient = 0.002173
## p-value = 5.245323e-74
## R-squared = 0.0332 (3.32%)
##
## DECISION:
## p < 0.05: REJECT H0
##
## CONCLUSION:
## Price has a STATISTICALLY SIGNIFICANT positive effect on profit margin.
## HOWEVER, the effect is ECONOMICALLY WEAK:
## - R-squared = 0.033 means price explains only 3.3% of variance
## - The slope is very small (0.002)
## - A $100 price increase yields only 0.2 percentage point gain
## Therefore, while the relationship is real, price is NOT a major
## driver of profit margin variation.
ship_main <- c("Standard Class", "Second Class", "First Class")
data_ship <- data[data[["Ship Mode"]] %in% ship_main, ]
cat(sprintf("=== SHIP MODE DATA ===
Original sample size: %d
After excluding Same Day: %d
Distribution:
", nrow(data), nrow(data_ship)))
## === SHIP MODE DATA ===
##
## Original sample size: 9800
## After excluding Same Day: 9262
##
## Distribution:
print(table(data_ship[["Ship Mode"]]))
##
## First Class Second Class Standard Class
## 1501 1902 5859
first_pm <- data_ship$pm[data_ship[["Ship Mode"]] == "First Class"]
second_pm <- data_ship$pm[data_ship[["Ship Mode"]] == "Second Class"]
standard_pm <- data_ship$pm[data_ship[["Ship Mode"]] == "Standard Class"]
cat(sprintf("
=== DESCRIPTIVE STATISTICS BY SHIP MODE ===
FIRST CLASS:
N = %d
Mean = %.6f
SD = %.4f
Variance = %.4f
SECOND CLASS:
N = %d
Mean = %.6f
SD = %.4f
Variance = %.4f
STANDARD CLASS:
N = %d
Mean = %.6f
SD = %.4f
Variance = %.4f
",
length(first_pm), mean(first_pm), sd(first_pm), var(first_pm),
length(second_pm), mean(second_pm), sd(second_pm), var(second_pm),
length(standard_pm), mean(standard_pm), sd(standard_pm), var(standard_pm)))
##
## === DESCRIPTIVE STATISTICS BY SHIP MODE ===
##
## FIRST CLASS:
## N = 1501
## Mean = -0.301510
## SD = 2.8769
## Variance = 8.2768
##
## SECOND CLASS:
## N = 1902
## Mean = -0.299727
## SD = 2.0579
## Variance = 4.2351
##
## STANDARD CLASS:
## N = 5859
## Mean = -0.402446
## SD = 3.9952
## Variance = 15.9616
boxplot(pm ~ `Ship Mode`, data = data_ship,
main = "Profit Margin Distribution by Ship Mode",
ylab = "Profit Margin", xlab = "Ship Mode",
col = c("lightcoral", "lightblue", "lightgreen"))
abline(h = 0, col = "red", lty = 2, lwd = 2)
ship_res1 <- my_ttest(first_pm, standard_pm, "FirstClass", "StandardClass")
## ============================================================
## T-TEST: FirstClass vs StandardClass
## ============================================================
##
## HYPOTHESES:
## H0: mu_FirstClass <= mu_StandardClass
## H1: mu_FirstClass > mu_StandardClass
##
## SAMPLE STATISTICS:
## FirstClass:
## N = 1501
## Mean = -0.301510
## SD = 2.8769
## Variance = 8.2768
##
## StandardClass:
## N = 5859
## Mean = -0.402446
## SD = 3.9952
## Variance = 15.9616
##
## STEP-BY-STEP CALCULATION:
##
## Step 1: Calculate mean difference
## Mean1 - Mean2 = -0.301510 - (-0.402446)
## = 0.100936
##
## Step 2: Calculate standard error
## SE = sqrt(Var1/n1 + Var2/n2)
## = sqrt(8.2768/1501 + 15.9616/5859)
## = sqrt(0.00551416 + 0.00272428)
## = sqrt(0.00823844)
## = 0.090766
##
## Step 3: Calculate t-statistic
## t = (Mean1 - Mean2) / SE
## = 0.100936 / 0.090766
## = 1.1120
##
## Step 4: Calculate Welch degrees of freedom
## df = (Var1/n1 + Var2/n2)^2 / [(Var1/n1)^2/(n1-1) + (Var2/n2)^2/(n2-1)]
## = 3151.33
##
## Step 5: Calculate p-value (one-tailed)
## p = P(T > 1.1120) with df = 3151.33
## = 0.1331
##
## RESULTS SUMMARY:
## t-statistic = 1.1120
## df = 3151.33
## p-value (one-tailed) = 0.1331
## p-value (two-tailed) = 0.2662
##
## EFFECT SIZE (Cohen's d):
## d = (Mean1 - Mean2) / pooled_SD
## = 0.100936 / 3.7941
## = 0.0266
## Interpretation: Negligible (< 0.2)
##
## DECISION (alpha = 0.05):
## p = 0.1331 >= 0.05
## Decision: FAIL TO REJECT H0
## Conclusion: No significant evidence that FirstClass > StandardClass
ship_res2 <- my_ttest(first_pm, second_pm, "FirstClass", "SecondClass")
## ============================================================
## T-TEST: FirstClass vs SecondClass
## ============================================================
##
## HYPOTHESES:
## H0: mu_FirstClass <= mu_SecondClass
## H1: mu_FirstClass > mu_SecondClass
##
## SAMPLE STATISTICS:
## FirstClass:
## N = 1501
## Mean = -0.301510
## SD = 2.8769
## Variance = 8.2768
##
## SecondClass:
## N = 1902
## Mean = -0.299727
## SD = 2.0579
## Variance = 4.2351
##
## STEP-BY-STEP CALCULATION:
##
## Step 1: Calculate mean difference
## Mean1 - Mean2 = -0.301510 - (-0.299727)
## = -0.001783
##
## Step 2: Calculate standard error
## SE = sqrt(Var1/n1 + Var2/n2)
## = sqrt(8.2768/1501 + 4.2351/1902)
## = sqrt(0.00551416 + 0.00222667)
## = sqrt(0.00774083)
## = 0.087982
##
## Step 3: Calculate t-statistic
## t = (Mean1 - Mean2) / SE
## = -0.001783 / 0.087982
## = -0.0203
##
## Step 4: Calculate Welch degrees of freedom
## df = (Var1/n1 + Var2/n2)^2 / [(Var1/n1)^2/(n1-1) + (Var2/n2)^2/(n2-1)]
## = 2619.04
##
## Step 5: Calculate p-value (one-tailed)
## p = P(T > -0.0203) with df = 2619.04
## = 0.5081
##
## RESULTS SUMMARY:
## t-statistic = -0.0203
## df = 2619.04
## p-value (one-tailed) = 0.5081
## p-value (two-tailed) = 0.9838
##
## EFFECT SIZE (Cohen's d):
## d = (Mean1 - Mean2) / pooled_SD
## = -0.001783 / 2.4531
## = -0.0007
## Interpretation: Negligible (< 0.2)
##
## DECISION (alpha = 0.05):
## p = 0.5081 >= 0.05
## Decision: FAIL TO REJECT H0
## Conclusion: No significant evidence that FirstClass > SecondClass
ship_res3 <- my_ttest(second_pm, standard_pm, "SecondClass", "StandardClass")
## ============================================================
## T-TEST: SecondClass vs StandardClass
## ============================================================
##
## HYPOTHESES:
## H0: mu_SecondClass <= mu_StandardClass
## H1: mu_SecondClass > mu_StandardClass
##
## SAMPLE STATISTICS:
## SecondClass:
## N = 1902
## Mean = -0.299727
## SD = 2.0579
## Variance = 4.2351
##
## StandardClass:
## N = 5859
## Mean = -0.402446
## SD = 3.9952
## Variance = 15.9616
##
## STEP-BY-STEP CALCULATION:
##
## Step 1: Calculate mean difference
## Mean1 - Mean2 = -0.299727 - (-0.402446)
## = 0.102719
##
## Step 2: Calculate standard error
## SE = sqrt(Var1/n1 + Var2/n2)
## = sqrt(4.2351/1902 + 15.9616/5859)
## = sqrt(0.00222667 + 0.00272428)
## = sqrt(0.00495096)
## = 0.070363
##
## Step 3: Calculate t-statistic
## t = (Mean1 - Mean2) / SE
## = 0.102719 / 0.070363
## = 1.4598
##
## Step 4: Calculate Welch degrees of freedom
## df = (Var1/n1 + Var2/n2)^2 / [(Var1/n1)^2/(n1-1) + (Var2/n2)^2/(n2-1)]
## = 6325.54
##
## Step 5: Calculate p-value (one-tailed)
## p = P(T > 1.4598) with df = 6325.54
## = 0.0722
##
## RESULTS SUMMARY:
## t-statistic = 1.4598
## df = 6325.54
## p-value (one-tailed) = 0.0722
## p-value (two-tailed) = 0.1444
##
## EFFECT SIZE (Cohen's d):
## d = (Mean1 - Mean2) / pooled_SD
## = 0.102719 / 3.6178
## = 0.0284
## Interpretation: Negligible (< 0.2)
##
## DECISION (alpha = 0.05):
## p = 0.0722 >= 0.05
## Decision: FAIL TO REJECT H0
## Conclusion: No significant evidence that SecondClass > StandardClass
mw_s1 <- wilcox.test(first_pm, standard_pm, alternative = "greater")
mw_s2 <- wilcox.test(first_pm, second_pm, alternative = "greater")
mw_s3 <- wilcox.test(second_pm, standard_pm, alternative = "greater")
cat(sprintf("=== MANN-WHITNEY U TESTS FOR SHIP MODES ===
First Class vs Standard Class:
W = %.0f, p = %.4f
First Class vs Second Class:
W = %.0f, p = %.4f
Second Class vs Standard Class:
W = %.0f, p = %.4f
Mann-Whitney results confirm t-test findings.
", mw_s1$statistic, mw_s1$p.value,
mw_s2$statistic, mw_s2$p.value,
mw_s3$statistic, mw_s3$p.value))
## === MANN-WHITNEY U TESTS FOR SHIP MODES ===
##
## First Class vs Standard Class:
## W = 4460653, p = 0.1937
##
## First Class vs Second Class:
## W = 1435281, p = 0.3916
##
## Second Class vs Standard Class:
## W = 5616146, p = 0.3012
##
## Mann-Whitney results confirm t-test findings.
cat(sprintf("====================================================================
QUESTION 3 SUMMARY: Ship Mode Profit Margin Differences
====================================================================
Comparison t-stat p-value Cohen's d Decision
--------------------------------------------------------------------
First vs Standard %7.4f %7.4f %7.4f Fail to Reject H0
First vs Second %7.4f %7.4f %7.4f Fail to Reject H0
Second vs Standard %7.4f %7.4f %7.4f Fail to Reject H0
CONCLUSION:
There are NO statistically significant differences in profit margins
among the three ship modes. All p-values > 0.05.
", ship_res1$t, ship_res1$p_one, ship_res1$d,
ship_res2$t, ship_res2$p_one, ship_res2$d,
ship_res3$t, ship_res3$p_one, ship_res3$d))
## ====================================================================
## QUESTION 3 SUMMARY: Ship Mode Profit Margin Differences
## ====================================================================
##
## Comparison t-stat p-value Cohen's d Decision
## --------------------------------------------------------------------
## First vs Standard 1.1120 0.1331 0.0266 Fail to Reject H0
## First vs Second -0.0203 0.5081 -0.0007 Fail to Reject H0
## Second vs Standard 1.4598 0.0722 0.0284 Fail to Reject H0
##
## CONCLUSION:
## There are NO statistically significant differences in profit margins
## among the three ship modes. All p-values > 0.05.
cat("====================================================================
OVERALL ANALYSIS CONCLUSIONS
====================================================================
QUESTION 1: Customer Segment Differences
--------------------------------------------
Result: No significant differences (all p > 0.05)
Effect sizes: Negligible (|d| < 0.05)
Business implication: Customer segment alone does not
determine profit margin performance.
QUESTION 2: Price Impact on Profit Margin
--------------------------------------------
Result: Significant positive relationship (p < 0.001)
Effect size: Weak (R-squared = 0.033, only 3.3% explained)
Business implication: While price has a real positive effect,
it is not a major driver of profit margins. Other factors
(costs, discounts, product mix) likely play larger roles.
QUESTION 3: Ship Mode Differences
--------------------------------------------
Result: No significant differences (all p > 0.05)
Effect sizes: Negligible (|d| < 0.05)
Business implication: Ship mode selection does not
significantly impact profit margins.
====================================================================
KEY TAKEAWAY:
The factors examined (customer segment, price, and ship mode)
have minimal practical impact on profit margins. Further analysis
should explore:
- Product category and sub-category
- Discount rates applied
- Geographic region
- Order quantity and bundling effects
====================================================================
")
## ====================================================================
## OVERALL ANALYSIS CONCLUSIONS
## ====================================================================
##
## QUESTION 1: Customer Segment Differences
## --------------------------------------------
## Result: No significant differences (all p > 0.05)
## Effect sizes: Negligible (|d| < 0.05)
## Business implication: Customer segment alone does not
## determine profit margin performance.
##
## QUESTION 2: Price Impact on Profit Margin
## --------------------------------------------
## Result: Significant positive relationship (p < 0.001)
## Effect size: Weak (R-squared = 0.033, only 3.3% explained)
## Business implication: While price has a real positive effect,
## it is not a major driver of profit margins. Other factors
## (costs, discounts, product mix) likely play larger roles.
##
## QUESTION 3: Ship Mode Differences
## --------------------------------------------
## Result: No significant differences (all p > 0.05)
## Effect sizes: Negligible (|d| < 0.05)
## Business implication: Ship mode selection does not
## significantly impact profit margins.
##
## ====================================================================
## KEY TAKEAWAY:
## The factors examined (customer segment, price, and ship mode)
## have minimal practical impact on profit margins. Further analysis
## should explore:
## - Product category and sub-category
## - Discount rates applied
## - Geographic region
## - Order quantity and bundling effects
## ====================================================================
sessionInfo()
## R version 4.4.3 (2025-02-28 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
##
## Matrix products: default
##
##
## locale:
## [1] C
## system code page: 65001
##
## time zone: Asia/Shanghai
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices datasets utils methods base
##
## other attached packages:
## [1] readxl_1.4.5
##
## loaded via a namespace (and not attached):
## [1] digest_0.6.39 R6_2.6.1 fastmap_1.2.0 cellranger_1.1.0
## [5] xfun_0.54 magrittr_2.0.4 glue_1.8.0 cachem_1.1.0
## [9] tibble_3.3.0 knitr_1.50 pkgconfig_2.0.3 htmltools_0.5.9
## [13] rmarkdown_2.30 lifecycle_1.0.4 cli_3.6.5 vctrs_0.6.5
## [17] sass_0.4.10 renv_1.0.7 jquerylib_0.1.4 compiler_4.4.3
## [21] rstudioapi_0.17.1 tools_4.4.3 pillar_1.11.1 evaluate_1.0.5
## [25] bslib_0.9.0 yaml_2.3.11 jsonlite_2.0.0 rlang_1.1.6