# Core
library(tidyverse)
library(gridExtra)
library(ggpubr)
library(corrplot)
library(knitr)
# Preprocessing & Encoding
library(caret)
library(VIM)
# LDA
library(MASS)
library(heplots)
library(mvnormtest)
# MLR
library(nnet)
library(car)
library(DescTools)
# Resolve namespace conflicts
select <- dplyr::select
filter <- dplyr::filtertrain <- read.csv("Train.csv", stringsAsFactors = FALSE)
test <- read.csv("Test.csv", stringsAsFactors = FALSE)
# Both files are fully labelled; merge into one dataset, then split 80/20 downstream
df_raw <- bind_rows(train, test)
cat("Train rows :", nrow(train), "\n")## Train rows : 8068
## Test rows : 2627
## Combined rows : 10695
## Columns : 11
## 'data.frame': 10695 obs. of 11 variables:
## $ ID : int 462809 462643 466315 461735 462669 461319 460156 464347 465015 465176 ...
## $ Gender : chr "Male" "Female" "Female" "Male" ...
## $ Ever_Married : chr "No" "Yes" "Yes" "Yes" ...
## $ Age : int 22 38 67 67 40 56 32 33 61 55 ...
## $ Graduated : chr "No" "Yes" "Yes" "Yes" ...
## $ Profession : chr "Healthcare" "Engineer" "Engineer" "Lawyer" ...
## $ Work_Experience: num 1 NA 1 0 NA 0 1 1 0 1 ...
## $ Spending_Score : chr "Low" "Average" "Low" "High" ...
## $ Family_Size : num 4 3 1 2 6 2 3 3 3 4 ...
## $ Var_1 : chr "Cat_4" "Cat_4" "Cat_6" "Cat_6" ...
## $ Segmentation : chr "D" "A" "B" "B" ...
## ID Gender Ever_Married Age Graduated Profession Work_Experience
## 1 462809 Male No 22 No Healthcare 1
## 2 462643 Female Yes 38 Yes Engineer NA
## 3 466315 Female Yes 67 Yes Engineer 1
## 4 461735 Male Yes 67 Yes Lawyer 0
## 5 462669 Female Yes 40 Yes Entertainment NA
## 6 461319 Male Yes 56 No Artist 0
## Spending_Score Family_Size Var_1 Segmentation
## 1 Low 4 Cat_4 D
## 2 Average 3 Cat_4 A
## 3 Low 1 Cat_6 B
## 4 High 2 Cat_6 B
## 5 High 6 Cat_6 A
## 6 Average 2 Cat_6 C
## ID Gender Ever_Married Age
## Min. :458982 Length:10695 Length:10695 Min. :18.00
## 1st Qu.:461221 Class :character Class :character 1st Qu.:30.00
## Median :463451 Mode :character Mode :character Median :41.00
## Mean :463468 Mean :43.51
## 3rd Qu.:465734 3rd Qu.:53.00
## Max. :467974 Max. :89.00
##
## Graduated Profession Work_Experience Spending_Score
## Length:10695 Length:10695 Min. : 0.00 Length:10695
## Class :character Class :character 1st Qu.: 0.00 Class :character
## Mode :character Mode :character Median : 1.00 Mode :character
## Mean : 2.62
## 3rd Qu.: 4.00
## Max. :14.00
## NA's :1098
## Family_Size Var_1 Segmentation
## Min. :1.000 Length:10695 Length:10695
## 1st Qu.:2.000 Class :character Class :character
## Median :3.000 Mode :character Mode :character
## Mean :2.844
## 3rd Qu.:4.000
## Max. :9.000
## NA's :448
is_missing <- is.na(df_raw) | sapply(df_raw, function(x)
if (is.character(x)) x == "" else rep(FALSE, length(x)))
missing_summary <- data.frame(
Column = names(df_raw),
Missing = colSums(is_missing),
Percent = round(colMeans(is_missing) * 100, 2)
) %>% filter(Missing > 0) %>% arrange(desc(Percent))
knitr::kable(missing_summary,
caption = "Missing Values Summary",
row.names = FALSE)| Column | Missing | Percent |
|---|---|---|
| Work_Experience | 1098 | 10.27 |
| Family_Size | 448 | 4.19 |
| Ever_Married | 190 | 1.78 |
| Profession | 162 | 1.51 |
| Var_1 | 108 | 1.01 |
| Graduated | 102 | 0.95 |
df_viz <- dplyr::select(df_raw, -ID)
is_missing_viz <- is.na(df_viz) | sapply(df_viz, function(x)
if (is.character(x)) x == "" else rep(FALSE, length(x)))
missing_plot <- data.frame(
Variable = names(df_viz),
Missing = colMeans(is_missing_viz) * 100
) %>%
arrange(desc(Missing)) %>%
mutate(Variable = factor(Variable, levels = Variable))
ggplot(missing_plot, aes(x = Variable, y = Missing,
fill = Missing > 0)) +
geom_col(width = 0.7, show.legend = FALSE) +
geom_text(aes(label = paste0(round(Missing, 1), "%")),
hjust = -0.15, size = 3.2) +
scale_fill_manual(values = c("FALSE" = "#18BC9C",
"TRUE" = "#E74C3C")) +
coord_flip() +
ylim(0, max(missing_plot$Missing) + 5) +
theme_minimal() +
labs(title = "Missing Value Proportion per Variable",
x = NULL,
y = "Missing (%)")## Age Work_Experience Family_Size
## Min. :18.00 Min. : 0.00 Min. :1.000
## 1st Qu.:30.00 1st Qu.: 0.00 1st Qu.:2.000
## Median :41.00 Median : 1.00 Median :3.000
## Mean :43.51 Mean : 2.62 Mean :2.844
## 3rd Qu.:53.00 3rd Qu.: 4.00 3rd Qu.:4.000
## Max. :89.00 Max. :14.00 Max. :9.000
## NA's :1098 NA's :448
df_raw %>%
filter(!is.na(Segmentation)) %>%
group_by(Segmentation) %>%
summarise(
n = n(),
Age_mean = round(mean(Age, na.rm = TRUE), 2),
Age_sd = round(sd(Age, na.rm = TRUE), 2),
Work_Experience_mean = round(mean(Work_Experience, na.rm = TRUE), 2),
Work_Experience_sd = round(sd(Work_Experience, na.rm = TRUE), 2),
Family_Size_mean = round(mean(Family_Size, na.rm = TRUE), 2),
Family_Size_sd = round(sd(Family_Size, na.rm = TRUE), 2)
) %>%
knitr::kable(caption = "Descriptive Statistics of Numeric Features by Segmentation")| Segmentation | n | Age_mean | Age_sd | Work_Experience_mean | Work_Experience_sd | Family_Size_mean | Family_Size_sd |
|---|---|---|---|---|---|---|---|
| A | 2818 | 44.40 | 16.48 | 2.82 | 3.57 | 2.57 | 1.52 |
| B | 2408 | 47.70 | 15.02 | 2.40 | 3.24 | 2.69 | 1.45 |
| C | 2442 | 49.26 | 14.88 | 2.25 | 3.06 | 2.91 | 1.39 |
| D | 3027 | 34.72 | 16.28 | 2.91 | 3.55 | 3.17 | 1.67 |
df_raw %>%
filter(!is.na(Segmentation)) %>%
count(Segmentation) %>%
mutate(Percent = round(n / sum(n) * 100, 1)) %>%
ggplot(aes(x = Segmentation, y = n, fill = Segmentation)) +
geom_col(width = 0.6, show.legend = FALSE) +
geom_text(aes(label = paste0(n, "\n(", Percent, "%)")),
vjust = -0.3, size = 3.5) +
scale_fill_manual(values = c("#3266ad","#1D9E75","#BA7517","#D4537E")) +
theme_minimal() +
labs(title = "Distribution of Customer Segmentation",
x = "Segment", y = "Count")df_raw %>%
dplyr::select(Age, Work_Experience, Family_Size, Segmentation) %>%
filter(!is.na(Segmentation)) %>%
pivot_longer(cols = c(Age, Work_Experience, Family_Size),
names_to = "Feature", values_to = "Value") %>%
ggplot(aes(x = Value, fill = Segmentation)) +
geom_histogram(bins = 25, color = "white", alpha = 0.8,
position = "identity") +
facet_wrap(~Feature, scales = "free") +
scale_fill_manual(values = c("#3266ad","#1D9E75","#BA7517","#D4537E")) +
theme_minimal() +
labs(title = "Histogram of Numeric Features by Segmentation",
x = "Value", y = "Count")df_raw %>%
dplyr::select(Age, Work_Experience, Family_Size, Segmentation) %>%
filter(!is.na(Segmentation)) %>%
pivot_longer(cols = c(Age, Work_Experience, Family_Size),
names_to = "Feature", values_to = "Value") %>%
ggplot(aes(x = Segmentation, y = Value, fill = Segmentation)) +
geom_boxplot(outlier.color = "red", outlier.size = 1.5,
alpha = 0.7, show.legend = FALSE) +
facet_wrap(~Feature, scales = "free_y") +
scale_fill_manual(values = c("#3266ad","#1D9E75","#BA7517","#D4537E")) +
theme_minimal() +
labs(title = "Boxplots of Numeric Features by Segmentation",
x = "Segment", y = "Value")cat_vars <- c("Gender", "Ever_Married", "Graduated",
"Profession", "Spending_Score", "Var_1")
plots_cat <- map(cat_vars, function(var) {
df_raw %>%
filter(!is.na(Segmentation), .data[[var]] != "") %>%
count(!!sym(var), Segmentation) %>%
ggplot(aes(x = .data[[var]], y = n, fill = Segmentation)) +
geom_col(position = "fill", width = 0.7) +
scale_fill_manual(values = c("#3266ad","#1D9E75","#BA7517","#D4537E")) +
scale_y_continuous(labels = scales::percent) +
theme_minimal(base_size = 9) +
theme(axis.text.x = element_text(angle = 35, hjust = 1),
legend.position = "none") +
labs(title = var, x = NULL, y = "Proportion")
})
ggarrange(plotlist = plots_cat, ncol = 3, nrow = 2,
common.legend = TRUE, legend = "bottom")cor_num <- df_raw %>%
dplyr::select(Age, Work_Experience, Family_Size) %>%
cor(use = "complete.obs")
corrplot(cor_num,
method = "color",
type = "upper",
addCoef.col = "black",
tl.col = "black",
tl.srt = 45,
number.cex = 0.9,
title = "Correlation Matrix of Numeric Features",
mar = c(0, 0, 1, 0))df_raw %>%
dplyr::select(Age, Work_Experience, Family_Size) %>%
pivot_longer(everything(), names_to = "Feature", values_to = "Value") %>%
ggplot(aes(x = Feature, y = Value, fill = Feature)) +
geom_boxplot(outlier.color = "red", outlier.alpha = 0.5,
show.legend = FALSE) +
coord_flip() +
theme_minimal() +
labs(title = "Outlier Detection — Numeric Features (Raw Data)",
subtitle = "Red dots indicate potential outliers",
x = NULL, y = "Value")get_mode <- function(x) {
ux <- unique(x[!is.na(x) & x != ""])
ux[which.max(tabulate(match(x, ux)))]
}
df_clean <- df_raw %>%
mutate(
Work_Experience = ifelse(is.na(Work_Experience),
median(Work_Experience, na.rm = TRUE),
Work_Experience),
Family_Size = ifelse(is.na(Family_Size),
median(Family_Size, na.rm = TRUE),
Family_Size)
) %>%
mutate(
Ever_Married = ifelse(Ever_Married == "" | is.na(Ever_Married),
get_mode(Ever_Married), Ever_Married),
Profession = ifelse(Profession == "" | is.na(Profession),
get_mode(Profession), Profession),
Var_1 = ifelse(Var_1 == "" | is.na(Var_1),
get_mode(Var_1), Var_1),
Graduated = ifelse(Graduated == "" | is.na(Graduated),
get_mode(Graduated), Graduated)
)
cat("Missing values after imputation:\n")## Missing values after imputation:
is_missing_clean <- is.na(df_clean) | sapply(df_clean, function(x)
if (is.character(x)) x == "" else rep(FALSE, length(x)))
colSums(is_missing_clean)## ID Gender Ever_Married Age Graduated
## 0 0 0 0 0
## Profession Work_Experience Spending_Score Family_Size Var_1
## 0 0 0 0 0
## Segmentation
## 0
df_encoded <- df_clean %>%
mutate(
Segmentation = factor(Segmentation,
levels = c("A","B","C","D")),
Gender = factor(Gender),
Ever_Married = factor(Ever_Married),
Graduated = factor(Graduated),
Profession = factor(Profession),
Spending_Score = factor(Spending_Score,
levels = c("Low","Average","High")),
Var_1 = factor(Var_1)
) %>%
dplyr::select(-ID) # Drop ID column
cat("Dataset dimensions after encoding:",
nrow(df_encoded), "rows x", ncol(df_encoded), "cols\n")## Dataset dimensions after encoding: 10695 rows x 10 cols
## Rows: 10,695
## Columns: 10
## $ Gender <fct> Male, Female, Female, Male, Female, Male, Male, Female…
## $ Ever_Married <fct> No, Yes, Yes, Yes, Yes, Yes, No, No, Yes, Yes, No, No,…
## $ Age <int> 22, 38, 67, 67, 40, 56, 32, 33, 61, 55, 26, 19, 19, 70…
## $ Graduated <fct> No, Yes, Yes, Yes, Yes, No, Yes, Yes, Yes, Yes, Yes, N…
## $ Profession <fct> Healthcare, Engineer, Engineer, Lawyer, Entertainment,…
## $ Work_Experience <dbl> 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 4, 0, 1, 0, 1, 9, 1, …
## $ Spending_Score <fct> Low, Average, Low, High, High, Average, Low, Low, Low,…
## $ Family_Size <dbl> 4, 3, 1, 2, 6, 2, 3, 3, 3, 4, 3, 4, 3, 1, 1, 2, 5, 6, …
## $ Var_1 <fct> Cat_4, Cat_4, Cat_6, Cat_6, Cat_6, Cat_6, Cat_6, Cat_6…
## $ Segmentation <fct> D, A, B, B, A, C, C, D, D, C, A, D, D, A, B, C, D, B, …
# LDA uses numeric predictors only; drop_na() removes residual missing rows.
df_lda <- df_encoded %>%
filter(!is.na(Segmentation)) %>%
dplyr::select(Age, Work_Experience, Family_Size, Segmentation) %>%
drop_na()
cat("LDA dataset :", nrow(df_lda), "rows x",
ncol(df_lda), "cols\n")## LDA dataset : 10695 rows x 4 cols
## Features used: Age, Work_Experience, Family_Size
# MLR uses all 9 features; drop_na() removes residual missing rows.
df_mlr <- df_encoded %>%
filter(!is.na(Segmentation)) %>%
drop_na()
cat("\nMLR dataset :", nrow(df_mlr), "rows x",
ncol(df_mlr), "cols\n")##
## MLR dataset : 10695 rows x 10 cols
## Features used: Gender, Ever_Married, Age, Graduated, Profession, Work_Experience, Spending_Score, Family_Size, Var_1
set.seed(123)
# MLR split — 80/20, stratified
df_mlr$Segmentation <- relevel(df_mlr$Segmentation, ref = "A")
idx_mlr <- createDataPartition(df_mlr$Segmentation,
p = 0.8, list = FALSE)
train_mlr <- df_mlr[ idx_mlr, ]
test_mlr <- df_mlr[-idx_mlr, ]
cat("MLR — Train:", nrow(train_mlr),
"| Test:", nrow(test_mlr), "\n")## MLR — Train: 8558 | Test: 2137
# LDA split — 80/20, stratified, independent from MLR split
set.seed(123)
idx_lda <- createDataPartition(df_lda$Segmentation,
p = 0.8, list = FALSE)
train_lda <- df_lda[ idx_lda, ]
test_lda <- df_lda[-idx_lda, ]
cat("LDA — Train:", nrow(train_lda), "| Test:", nrow(test_lda), "\n")## LDA — Train: 8558 | Test: 2137
# Apply log1p transformation to reduce skewness in Work_Experience & Family_Size
train_lda_t <- train_lda %>%
mutate(
Work_Experience = log1p(Work_Experience),
Family_Size = log1p(Family_Size)
)
test_lda_t <- test_lda %>%
mutate(
Work_Experience = log1p(Work_Experience),
Family_Size = log1p(Family_Size)
)
# Subsample 75 obs per group from training set only for Multivariate Shapiro-Wilk
# (test is sensitive to large n; valid only for n <= 5000)
df_lda_sub <- train_lda_t %>%
group_by(Segmentation) %>%
slice_sample(n = 75) %>%
ungroup()
cat("Log-transformed LDA train :", nrow(train_lda_t), "rows\n")## Log-transformed LDA train : 8558 rows
## Log-transformed LDA test : 2137 rows
## Subsampled (train only) : 300 observations
## Per group : 75 observations x 4 groups
## Log transform applied to : Work_Experience, Family_Size
segments <- levels(df_lda_sub$Segmentation)
norm_results <- map_df(segments, function(seg) {
mat <- df_lda_sub %>%
filter(Segmentation == seg) %>%
dplyr::select(Age, Work_Experience, Family_Size) %>%
as.matrix() %>% t()
test <- mvnormtest::mshapiro.test(mat)
data.frame(
Group = seg,
W = round(test$statistic, 4),
P_Value = round(test$p.value, 4),
Result = ifelse(test$p.value > 0.05, "YES", "NO"),
Status = ifelse(test$p.value > 0.05,
"Normal",
"Non-Normal")
)
})
knitr::kable(norm_results,
caption = "Multivariate Shapiro-Wilk Normality Test per Segmentation Group (n = 75 per group, log-transformed)",
row.names = FALSE)| Group | W | P_Value | Result | Status |
|---|---|---|---|---|
| A | 0.9843 | 0.4830 | YES | Normal |
| B | 0.9846 | 0.4994 | YES | Normal |
| C | 0.9822 | 0.3721 | YES | Normal |
| D | 0.7524 | 0.0000 | NO | Non-Normal |
boxm_lda <- boxM(
Y = df_lda_sub %>%
dplyr::select(Age, Work_Experience, Family_Size),
group = df_lda_sub$Segmentation
)
print(boxm_lda)##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: df_lda_sub %>% dplyr::select(Age, Work_Experience, Family_Size) by df_lda_sub$Segmentation
## Chi-Sq (approx.) = 18.9654, df = 18, p-value = 0.394
cat("\nConclusion:",
ifelse(boxm_lda$p.value > 0.05,
"PASSED — Covariance matrices are homogeneous (p > 0.05)",
"FAILED — Covariance matrices differ across groups (p < 0.05)"))##
## Conclusion: PASSED — Covariance matrices are homogeneous (p > 0.05)
cor_lda <- cor(df_lda_sub %>%
dplyr::select(Age, Work_Experience, Family_Size))
knitr::kable(round(cor_lda, 4),
caption = "Correlation Matrix of LDA Predictors (Log-Transformed)")| Age | Work_Experience | Family_Size | |
|---|---|---|---|
| Age | 1.0000 | -0.2330 | -0.3364 |
| Work_Experience | -0.2330 | 1.0000 | 0.0083 |
| Family_Size | -0.3364 | 0.0083 | 1.0000 |
high_cor <- any(abs(cor_lda[upper.tri(cor_lda)]) > 0.8)
cat("\nHighest correlation:",
round(max(abs(cor_lda[upper.tri(cor_lda)])), 4), "\n")##
## Highest correlation: 0.3364
cat("Conclusion:",
ifelse(!high_cor,
"PASSED, No severe multicollinearity among predictors",
"FAILED, High multicollinearity detected"))## Conclusion: PASSED, No severe multicollinearity among predictors
Each observation in the subsampled dataset represents a unique customer record drawn randomly from the full dataset. No repeated measures or hierarchical structure exists. The independence assumption is therefore satisfied by design.
norm_pass <- all(norm_results$Result == "YES")
boxm_pass <- boxm_lda$p.value > 0.05
cor_pass <- !high_cor
norm_detail <- paste(
norm_results$Group,
ifelse(norm_results$Result == "YES", "Normal", "Non-normal"),
paste0("(p=", norm_results$P_Value, ")"),
collapse = " | "
)
lda_assumption_summary <- data.frame(
No = 1:4,
Assumption = c(
"Multivariate Normality",
"Homogeneity of Covariance",
"Absence of Multicollinearity",
"Independence of Observations"
),
Test_Used = c(
"Multivariate Shapiro-Wilk (per group)",
"Box's M Test",
"Correlation Matrix (threshold > 0.8)",
"Methodological Design"
),
Status = c(
ifelse(norm_pass, "PASSED", "FAILED"),
ifelse(boxm_pass, "PASSED", "FAILED"),
ifelse(cor_pass, "PASSED", "FAILED"),
"PASSED"
),
Conclusion = c(
norm_detail,
paste0("Box's M p = ", round(boxm_lda$p.value, 4),
ifelse(boxm_pass,
" — Covariance matrices are homogeneous (p > 0.05)",
" — Covariance matrices differ across groups (p < 0.05); consider QDA as robust alternative")),
paste0("Max correlation = ",
round(max(abs(cor_lda[upper.tri(cor_lda)])), 4),
" — Well below threshold of 0.8"),
"Each record represents a unique independent customer"
)
)
knitr::kable(lda_assumption_summary,
caption = "Summary Table - LDA Assumption Checking",
row.names = FALSE)| No | Assumption | Test_Used | Status | Conclusion |
|---|---|---|---|---|
| 1 | Multivariate Normality | Multivariate Shapiro-Wilk (per group) | FAILED | A Normal (p=0.483) | B Normal (p=0.4994) | C Normal (p=0.3721) | D Non-normal (p=0) |
| 2 | Homogeneity of Covariance | Box’s M Test | PASSED | Box’s M p = 0.394 — Covariance matrices are homogeneous (p > 0.05) |
| 3 | Absence of Multicollinearity | Correlation Matrix (threshold > 0.8) | PASSED | Max correlation = 0.3364 — Well below threshold of 0.8 |
| 4 | Independence of Observations | Methodological Design | PASSED | Each record represents a unique independent customer |
train_lda_t$Segmentation <- factor(train_lda_t$Segmentation,
levels = c("A","B","C","D"))
train_lda_t <- train_lda_t %>%
dplyr::select(Segmentation, Age, Work_Experience, Family_Size) %>%
na.omit()
str(train_lda_t)## 'data.frame': 8558 obs. of 4 variables:
## $ Segmentation : Factor w/ 4 levels "A","B","C","D": 4 1 2 2 1 3 3 4 4 3 ...
## $ Age : int 22 38 67 67 40 56 32 33 61 55 ...
## $ Work_Experience: num 0.693 0.693 0.693 0 0.693 ...
## $ Family_Size : num 1.609 1.386 0.693 1.099 1.946 ...
lda_model <- lda(Segmentation ~ Age + Work_Experience + Family_Size,
data = train_lda_t)
print(lda_model)## Call:
## lda(Segmentation ~ Age + Work_Experience + Family_Size, data = train_lda_t)
##
## Prior probabilities of groups:
## A B C D
## 0.2634961 0.2251694 0.2283244 0.2830100
##
## Group means:
## Age Work_Experience Family_Size
## A 44.30554 0.9105635 1.204255
## B 47.61754 0.8438422 1.242504
## C 48.99488 0.8270146 1.312888
## D 34.83526 0.9295996 1.350492
##
## Coefficients of linear discriminants:
## LD1 LD2 LD3
## Age -0.06465726 -0.01464310 -0.01124301
## Work_Experience -0.03804155 0.08169357 -1.26821959
## Family_Size -0.11010392 -2.72944123 -0.48525693
##
## Proportion of trace:
## LD1 LD2 LD3
## 0.8745 0.1247 0.0009
# lda$scaling gives raw (unstandardised) discriminant function weights.
lda_coef <- as.data.frame(lda_model$scaling)
lda_coef$Predictor <- rownames(lda_coef)
rownames(lda_coef) <- NULL
ld_cols <- grep("^LD", names(lda_coef), value = TRUE)
lda_coef <- lda_coef[, c("Predictor", ld_cols)]
lda_coef[, ld_cols] <- round(lda_coef[, ld_cols], 4)
knitr::kable(lda_coef,
caption = "LDA Discriminant Function Coefficients (Standardised)",
row.names = FALSE)| Predictor | LD1 | LD2 | LD3 |
|---|---|---|---|
| Age | -0.0647 | -0.0146 | -0.0112 |
| Work_Experience | -0.0380 | 0.0817 | -1.2682 |
| Family_Size | -0.1101 | -2.7294 | -0.4853 |
prop_trace <- data.frame(
LD = paste0("LD", seq_along(lda_model$svd)),
Eigenvalue = round(lda_model$svd^2, 4),
Proportion = round(lda_model$svd^2 / sum(lda_model$svd^2) * 100, 2),
Cumulative = round(cumsum(lda_model$svd^2 / sum(lda_model$svd^2)) * 100, 2)
)
knitr::kable(prop_trace,
caption = "Proportion of Trace — Variance Explained by Each Discriminant Function",
row.names = FALSE)| LD | Eigenvalue | Proportion | Cumulative |
|---|---|---|---|
| LD1 | 372.8420 | 87.45 | 87.45 |
| LD2 | 53.1454 | 12.47 | 99.91 |
| LD3 | 0.3626 | 0.09 | 100.00 |
lda_scores <- predict(lda_model, train_lda_t)
lda_plot_df <- data.frame(
LD1 = lda_scores$x[, 1],
LD2 = lda_scores$x[, 2],
Segmentation = train_lda_t$Segmentation
)
ggplot(lda_plot_df, aes(x = LD1, y = LD2, color = Segmentation)) +
geom_point(alpha = 0.5, size = 1.8) +
stat_ellipse(aes(fill = Segmentation), geom = "polygon",
alpha = 0.08, level = 0.95) +
scale_color_manual(values = c("#3266ad","#1D9E75","#BA7517","#D4537E")) +
scale_fill_manual(values = c("#3266ad","#1D9E75","#BA7517","#D4537E")) +
theme_minimal() +
labs(title = "LDA Bi-Plot: LD1 vs LD2 (Training Set)",
subtitle = "95% confidence ellipses per segment",
x = "First Discriminant Function (LD1)",
y = "Second Discriminant Function (LD2)")lda_pred <- predict(lda_model, newdata = test_lda_t)
# Re-level to match training factor levels for confusionMatrix().
lda_pred_class <- factor(lda_pred$class,
levels = levels(train_lda_t$Segmentation))
lda_cm <- confusionMatrix(
data = lda_pred_class,
reference = factor(test_lda_t$Segmentation,
levels = levels(train_lda_t$Segmentation))
)
print(lda_cm)## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D
## A 172 157 130 86
## B 14 9 8 9
## C 138 171 227 64
## D 239 144 123 446
##
## Overall Statistics
##
## Accuracy : 0.3996
## 95% CI : (0.3788, 0.4208)
## No Information Rate : 0.2831
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.1869
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D
## Sensitivity 0.30551 0.018711 0.4652 0.7372
## Specificity 0.76302 0.981280 0.7738 0.6697
## Pos Pred Value 0.31560 0.225000 0.3783 0.4685
## Neg Pred Value 0.75440 0.774917 0.8302 0.8658
## Prevalence 0.26345 0.225082 0.2284 0.2831
## Detection Rate 0.08049 0.004212 0.1062 0.2087
## Detection Prevalence 0.25503 0.018718 0.2808 0.4455
## Balanced Accuracy 0.53427 0.499996 0.6195 0.7035
as.data.frame(lda_cm$table) %>%
ggplot(aes(x = Reference, y = Prediction, fill = Freq)) +
geom_tile(color = "white") +
geom_text(aes(label = Freq), size = 5, fontface = "bold") +
scale_fill_gradient(low = "#EBF5FB", high = "#2980B9") +
theme_minimal() +
labs(title = "Confusion Matrix — LDA (Test Set)",
x = "Actual Class", y = "Predicted Class",
fill = "Count")lda_byclass <- as.data.frame(lda_cm$byClass)
lda_byclass$Class <- rownames(lda_byclass)
rownames(lda_byclass) <- NULL
lda_metrics_tbl <- lda_byclass %>%
dplyr::select(Class, Sensitivity, Specificity, `Pos Pred Value`,
`Neg Pred Value`, F1) %>%
mutate(across(where(is.numeric), ~ round(.x, 4)))
knitr::kable(lda_metrics_tbl,
caption = "Per-Class Performance Metrics — LDA",
row.names = FALSE)| Class | Sensitivity | Specificity | Pos Pred Value | Neg Pred Value | F1 |
|---|---|---|---|---|---|
| Class: A | 0.3055 | 0.7630 | 0.3156 | 0.7544 | 0.3105 |
| Class: B | 0.0187 | 0.9813 | 0.2250 | 0.7749 | 0.0345 |
| Class: C | 0.4652 | 0.7738 | 0.3783 | 0.8302 | 0.4173 |
| Class: D | 0.7372 | 0.6697 | 0.4685 | 0.8658 | 0.5729 |
lda_acc <- round(lda_cm$overall["Accuracy"] * 100, 2)
lda_kappa <- round(lda_cm$overall["Kappa"], 4)
# Warn if any class has undefined F1 (zero predicted positives)
lda_na_f1 <- sum(is.na(lda_byclass$F1))
if (lda_na_f1 > 0) {
cat("Note:", lda_na_f1, "class(es) had undefined F1 (excluded from macro average)\n")
}
lda_f1_macro <- round(mean(lda_byclass$F1, na.rm = TRUE), 4)
cat("LDA Overall Performance\n")## LDA Overall Performance
## Accuracy : 39.96 %
## Kappa : 0.1869
## Macro-F1 : 0.3338
lda_post_df <- as.data.frame(lda_pred$posterior)
lda_post_df$Predicted <- lda_pred_class
lda_post_df$Actual <- factor(test_lda_t$Segmentation,
levels = levels(train_lda_t$Segmentation))
lda_post_df$MaxProb <- apply(lda_pred$posterior, 1, max)
lda_post_df$Correct <- lda_post_df$Predicted == lda_post_df$Actual
ggplot(lda_post_df, aes(x = MaxProb, fill = Correct)) +
geom_histogram(bins = 30, color = "white", alpha = 0.85) +
scale_fill_manual(values = c("TRUE" = "#1D9E75", "FALSE" = "#E74C3C"),
labels = c("TRUE" = "Correct", "FALSE" = "Incorrect")) +
theme_minimal() +
labs(title = "Distribution of Maximum Posterior Probability — LDA",
subtitle = "Higher probability = more confident prediction",
x = "Max Posterior Probability", y = "Count",
fill = "Prediction")## Target variable : Segmentation
## Number of levels : 4
## Levels : A B C D
seg_dist <- table(df_mlr$Segmentation)
seg_pct <- round(prop.table(seg_dist) * 100, 2)
knitr::kable(data.frame(
Segment = names(seg_dist),
Count = as.integer(seg_dist),
Percentage = as.numeric(seg_pct)
), caption = "Distribution of Target Variable",
row.names = FALSE)| Segment | Count | Percentage |
|---|---|---|
| A | 2818 | 26.35 |
| B | 2408 | 22.52 |
| C | 2442 | 22.83 |
| D | 3027 | 28.30 |
##
## Conclusion: PASSED - Target has 4 nominal categories
# GVIF^(1/(2*Df)) used for multi-level factors; threshold: sqrt(5) ≈ 2.24.
lm_proxy <- lm(
as.numeric(Segmentation) ~ Age + Work_Experience + Family_Size +
Gender + Ever_Married + Graduated +
Profession + Spending_Score + Var_1,
data = train_mlr
)
vif_raw <- vif(lm_proxy)
# vif() returns a matrix for factors; use adjusted GVIF column.
if (is.matrix(vif_raw)) {
vif_res <- data.frame(
Predictor = rownames(vif_raw),
GVIF = round(vif_raw[, "GVIF"], 4),
Df = vif_raw[, "Df"],
GVIF_adj = round(vif_raw[, "GVIF^(1/(2*Df))"], 4),
Status = ifelse(vif_raw[, "GVIF^(1/(2*Df))"] < sqrt(5),
"Acceptable", "High Multicollinearity"),
row.names = NULL
)
knitr::kable(vif_res,
caption = "VIF - Multicollinearity Check (All 9 Predictors; GVIF adjusted for multi-level factors)",
col.names = c("Predictor", "GVIF", "Df", "GVIF^(1/(2*Df))", "Status"),
row.names = FALSE)
} else {
vif_res <- data.frame(
Predictor = names(vif_raw),
VIF = round(vif_raw, 4),
Status = ifelse(vif_raw < 5, "Acceptable", "High Multicollinearity"),
row.names = NULL
)
knitr::kable(vif_res,
caption = "VIF - Multicollinearity Check (All 9 Predictors)",
row.names = FALSE)
}| Predictor | GVIF | Df | GVIF^(1/(2*Df)) | Status |
|---|---|---|---|---|
| Age | 2.6017 | 1 | 1.6130 | Acceptable |
| Work_Experience | 1.0860 | 1 | 1.0421 | Acceptable |
| Family_Size | 1.2900 | 1 | 1.1358 | Acceptable |
| Gender | 1.1728 | 1 | 1.0830 | Acceptable |
| Ever_Married | 2.4205 | 1 | 1.5558 | Acceptable |
| Graduated | 1.2818 | 1 | 1.1322 | Acceptable |
| Profession | 4.0663 | 8 | 1.0916 | Acceptable |
| Spending_Score | 2.6335 | 2 | 1.2739 | Acceptable |
| Var_1 | 1.1718 | 6 | 1.0133 | Acceptable |
# model.matrix() counts dummy-expanded params correctly; -1 excludes intercept.
n_predictors <- ncol(model.matrix(
~ Age + Work_Experience + Family_Size +
Gender + Ever_Married + Graduated +
Profession + Spending_Score + Var_1,
data = train_mlr
)) - 1 # true dummy-expanded predictor count (excl. intercept)
n_categories <- nlevels(train_mlr$Segmentation)
n_train <- nrow(train_mlr)
min_required <- n_predictors * n_categories * 10
sample_check <- data.frame(
Metric = c("Total Training Samples",
"Number of Predictors",
"Number of Categories",
"Minimum Required (10 per predictor per category)",
"Adequacy Status"),
Value = c(n_train,
n_predictors,
n_categories,
min_required,
ifelse(n_train >= min_required,
"PASSED", "FAILED"))
)
knitr::kable(sample_check,
caption = "Sample Adequacy Check for MLR",
row.names = FALSE)| Metric | Value |
|---|---|
| Total Training Samples | 8558 |
| Number of Predictors | 22 |
| Number of Categories | 4 |
| Minimum Required (10 per predictor per category) | 880 |
| Adequacy Status | PASSED |
Each row represents a unique customer. Data was collected independently with no repeated measurements or clustering structure. The independence assumption is satisfied by the nature of the dataset.
separation_check <- df_mlr %>%
group_by(Segmentation) %>%
summarise(
Age_range = paste0(min(Age), "--", max(Age)),
WorkExp_range = paste0(min(Work_Experience), "--",
max(Work_Experience)),
FamSize_range = paste0(min(Family_Size), "--",
max(Family_Size)),
.groups = "drop"
)
knitr::kable(separation_check,
caption = "Feature Range per Segment - Perfect Separation Check",
row.names = FALSE)| Segmentation | Age_range | WorkExp_range | FamSize_range |
|---|---|---|---|
| A | 18–89 | 0–14 | 1–9 |
| B | 18–89 | 0–14 | 1–9 |
| C | 18–89 | 0–14 | 1–9 |
| D | 18–89 | 0–14 | 1–9 |
cat("\nConclusion: PASSED - Overlapping ranges across",
"segments confirm no perfect separation exists")##
## Conclusion: PASSED - Overlapping ranges across segments confirm no perfect separation exists
if (is.matrix(vif_raw)) {
vif_pass <- all(vif_raw[, "GVIF^(1/(2*Df))"] < sqrt(5))
vif_conclusion <- ifelse(vif_pass,
"All adjusted GVIF values below threshold (sqrt(5) ≈ 2.24) for all 9 predictors",
"High multicollinearity detected in one or more predictors - consider removal")
} else {
vif_pass <- all(vif_raw < 5)
vif_conclusion <- ifelse(vif_pass,
"All VIF values below threshold of 5 for all 9 predictors",
"High VIF detected - consider removing predictors")
}
n_pass <- n_train >= min_required
mlr_assumption_summary <- data.frame(
No = 1:5,
Assumption = c(
"Nominal DV with 3+ Categories",
"Absence of Multicollinearity",
"Sample Adequacy",
"Independence of Observations",
"No Perfect Separation"
),
Test_Used = c(
"Level count of target variable",
"VIF / GVIF (threshold < 5, or GVIF^(1/(2*Df)) < sqrt(5))",
"Min. 10 obs per predictor per category",
"Methodological Design",
"Feature range overlap per segment"
),
Status = c(
"PASSED",
ifelse(vif_pass, "PASSED", "FAILED"),
ifelse(n_pass, "PASSED", "FAILED"),
"PASSED",
"PASSED"
),
Conclusion = c(
paste0("4 nominal categories: A, B, C, D"),
vif_conclusion,
paste0("n = ", n_train, " vs minimum required = ", min_required,
" (based on ", n_predictors, " estimated parameters excl. intercept)"),
"Each record represents a unique independent customer",
"All segments share overlapping feature ranges"
)
)
knitr::kable(mlr_assumption_summary,
caption = "Summary Table - MLR Assumption Checking",
row.names = FALSE)| No | Assumption | Test_Used | Status | Conclusion |
|---|---|---|---|---|
| 1 | Nominal DV with 3+ Categories | Level count of target variable | PASSED | 4 nominal categories: A, B, C, D |
| 2 | Absence of Multicollinearity | VIF / GVIF (threshold < 5, or GVIF^(1/(2*Df)) < sqrt(5)) | PASSED | All adjusted GVIF values below threshold (sqrt(5) ≈ 2.24) for all 9 predictors |
| 3 | Sample Adequacy | Min. 10 obs per predictor per category | PASSED | n = 8558 vs minimum required = 880 (based on 22 estimated parameters excl. intercept) |
| 4 | Independence of Observations | Methodological Design | PASSED | Each record represents a unique independent customer |
| 5 | No Perfect Separation | Feature range overlap per segment | PASSED | All segments share overlapping feature ranges |
# Reference level "A" was already set via relevel() before the train/test split
mlr_model <- multinom(
Segmentation ~ Age + Work_Experience + Family_Size +
Gender + Ever_Married + Graduated +
Profession + Spending_Score + Var_1,
data = train_mlr,
trace = FALSE
)
coef_tbl <- as.data.frame(t(coef(mlr_model)))
coef_tbl <- round(coef_tbl, 4)
coef_tbl$Predictor <- rownames(coef_tbl)
rownames(coef_tbl) <- NULL
class_cols <- setdiff(names(coef_tbl), "Predictor")
coef_tbl <- coef_tbl[, c("Predictor", class_cols)]
knitr::kable(coef_tbl,
caption = "MLR Coefficients by Class (Reference: Segment A)",
row.names = FALSE)| Predictor | B | C | D |
|---|---|---|---|
| (Intercept) | -1.3052 | -1.9956 | 0.4901 |
| Age | 0.0114 | 0.0207 | -0.0209 |
| Work_Experience | -0.0239 | -0.0204 | 0.0099 |
| Family_Size | 0.0685 | 0.1993 | 0.0568 |
| GenderMale | -0.0761 | -0.1627 | 0.1690 |
| Ever_MarriedYes | 0.1629 | -0.1707 | -0.2076 |
| GraduatedYes | 0.3421 | 0.6387 | -0.3649 |
| ProfessionDoctor | -0.3137 | -0.5403 | 0.4456 |
| ProfessionEngineer | -0.3017 | -1.0852 | 0.3153 |
| ProfessionEntertainment | -0.5176 | -1.0993 | 0.1477 |
| ProfessionExecutive | -0.0402 | -0.2205 | 0.8483 |
| ProfessionHealthcare | 0.0453 | 0.2588 | 1.7795 |
| ProfessionHomemaker | -0.0468 | -0.6868 | 0.7724 |
| ProfessionLawyer | -0.6182 | -0.9365 | 1.2276 |
| ProfessionMarketing | -0.3381 | -0.3060 | 1.3538 |
| Spending_ScoreAverage | 0.5469 | 1.3320 | -0.2046 |
| Spending_ScoreHigh | 0.3751 | 0.7508 | -0.3696 |
| Var_1Cat_2 | 0.4507 | 0.0944 | -0.4074 |
| Var_1Cat_3 | 0.1492 | -0.2186 | -0.3373 |
| Var_1Cat_4 | 0.1069 | -0.7471 | -0.2359 |
| Var_1Cat_5 | 0.1628 | -0.0350 | -0.1981 |
| Var_1Cat_6 | 0.2348 | 0.1083 | -0.1977 |
| Var_1Cat_7 | 0.0411 | -0.2476 | -0.2082 |
coef_mlr <- coef(mlr_model)
or_mlr <- exp(coef_mlr)
knitr::kable(round(or_mlr, 4),
caption = "Odds Ratio - MLR (Reference: Segment A)",
row.names = TRUE)| (Intercept) | Age | Work_Experience | Family_Size | GenderMale | Ever_MarriedYes | GraduatedYes | ProfessionDoctor | ProfessionEngineer | ProfessionEntertainment | ProfessionExecutive | ProfessionHealthcare | ProfessionHomemaker | ProfessionLawyer | ProfessionMarketing | Spending_ScoreAverage | Spending_ScoreHigh | Var_1Cat_2 | Var_1Cat_3 | Var_1Cat_4 | Var_1Cat_5 | Var_1Cat_6 | Var_1Cat_7 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| B | 0.2711 | 1.0115 | 0.9764 | 1.0709 | 0.9267 | 1.1770 | 1.4079 | 0.7307 | 0.7395 | 0.5960 | 0.9606 | 1.0463 | 0.9543 | 0.5389 | 0.7131 | 1.7279 | 1.4551 | 1.5695 | 1.1610 | 1.1129 | 1.1768 | 1.2647 | 1.0420 |
| C | 0.1359 | 1.0209 | 0.9798 | 1.2206 | 0.8499 | 0.8431 | 1.8940 | 0.5826 | 0.3378 | 0.3331 | 0.8021 | 1.2954 | 0.5032 | 0.3920 | 0.7364 | 3.7884 | 2.1188 | 1.0990 | 0.8036 | 0.4738 | 0.9656 | 1.1144 | 0.7806 |
| D | 1.6325 | 0.9793 | 1.0099 | 1.0585 | 1.1842 | 0.8125 | 0.6943 | 1.5615 | 1.3707 | 1.1592 | 2.3358 | 5.9266 | 2.1650 | 3.4129 | 3.8720 | 0.8149 | 0.6910 | 0.6654 | 0.7137 | 0.7899 | 0.8203 | 0.8206 | 0.8121 |
mlr_coef <- summary(mlr_model)$coefficients
mlr_se <- summary(mlr_model)$standard.errors
mlr_z <- mlr_coef / mlr_se
mlr_pval <- 2 * (1 - pnorm(abs(mlr_z)))
knitr::kable(round(mlr_pval, 4),
caption = "P-Value Z-Test per Predictor and Class",
row.names = TRUE)| (Intercept) | Age | Work_Experience | Family_Size | GenderMale | Ever_MarriedYes | GraduatedYes | ProfessionDoctor | ProfessionEngineer | ProfessionEntertainment | ProfessionExecutive | ProfessionHealthcare | ProfessionHomemaker | ProfessionLawyer | ProfessionMarketing | Spending_ScoreAverage | Spending_ScoreHigh | Var_1Cat_2 | Var_1Cat_3 | Var_1Cat_4 | Var_1Cat_5 | Var_1Cat_6 | Var_1Cat_7 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| B | 0.0000 | 1e-04 | 0.0199 | 0.0057 | 0.2724 | 0.0862 | 0 | 0.0091 | 0.0094 | 0.0000 | 0.7839 | 0.7470 | 0.8051 | 0 | 0.1015 | 0.0000 | 0.0007 | 0.1440 | 0.6073 | 0.7091 | 0.6987 | 0.3965 | 0.9042 |
| C | 0.0000 | 0e+00 | 0.0574 | 0.0000 | 0.0233 | 0.1114 | 0 | 0.0000 | 0.0000 | 0.0000 | 0.1406 | 0.0634 | 0.0021 | 0 | 0.1463 | 0.0000 | 0.0000 | 0.7585 | 0.4505 | 0.0098 | 0.9339 | 0.6908 | 0.4698 |
| D | 0.0939 | 0e+00 | 0.3061 | 0.0146 | 0.0147 | 0.0257 | 0 | 0.0002 | 0.0119 | 0.1851 | 0.0000 | 0.0000 | 0.0000 | 0 | 0.0000 | 0.0662 | 0.0033 | 0.1460 | 0.1906 | 0.3507 | 0.6018 | 0.4174 | 0.4976 |
mlr_pseudoR2 <- PseudoR2(mlr_model, which = "McFadden")
pseudoR2_tbl <- data.frame(
Metric = "McFadden Pseudo R-Squared",
Value = round(mlr_pseudoR2, 4),
# Thresholds: <0.10 Poor, 0.10–0.20 Acceptable, 0.20–0.40 Good, >0.40 Excellent.
Interpretation = case_when(
mlr_pseudoR2 < 0.10 ~ "Poor",
mlr_pseudoR2 < 0.20 ~ "Acceptable",
mlr_pseudoR2 < 0.40 ~ "Good / Excellent",
TRUE ~ "Excellent / Very High Fit"
)
)
knitr::kable(pseudoR2_tbl,
caption = "McFadden Pseudo R-Squared",
row.names = FALSE)| Metric | Value | Interpretation |
|---|---|---|
| McFadden Pseudo R-Squared | 0.1333 | Acceptable |
mlr_null <- multinom(Segmentation ~ 1,
data = train_mlr,
trace = FALSE)
lrt_result <- anova(mlr_null, mlr_model, test = "Chisq")
# nnet anova column names may have leading spaces; grep handles this robustly.
df_col <- grep("Df", names(lrt_result), value = TRUE)[1]
chi_col <- grep("LR", names(lrt_result), value = TRUE)[1]
p_col <- grep("Pr", names(lrt_result), value = TRUE)[1]
stopifnot(
"Could not find Df column in LRT output" = !is.na(df_col),
"Could not find LR stat column in LRT output" = !is.na(chi_col),
"Could not find p-value column in LRT output" = !is.na(p_col)
)
lrt_tbl <- data.frame(
Model = c("Null (intercept only)", "Full (all predictors)"),
Resid.df = lrt_result[["Resid. df"]],
Resid.Dev = round(lrt_result[["Resid. Dev"]], 2),
df = c(NA, lrt_result[2, df_col]),
Chi.sq = c(NA, round(lrt_result[2, chi_col], 2)),
p.value = c(NA, round(lrt_result[2, p_col], 4))
)
knitr::kable(lrt_tbl,
caption = "Likelihood Ratio Test - Null vs Full MLR Model",
row.names = FALSE,
na = "")| Model | Resid.df | Resid.Dev | df | Chi.sq | p.value |
|---|---|---|---|---|---|
| Null (intercept only) | 25671 | 23647.52 | NA | NA | NA |
| Full (all predictors) | 25605 | 20496.38 | 66 | 3151.14 | 0 |
cat("\nConclusion:",
ifelse(lrt_result[2, p_col] < 0.05,
"SIGNIFICANT - Full model outperforms the null model (p < 0.05)",
"NOT SIGNIFICANT - No meaningful improvement over the null model"))##
## Conclusion: SIGNIFICANT - Full model outperforms the null model (p < 0.05)
# Re-level predicted factor to match training levels for confusionMatrix().
mlr_pred_class <- factor(
predict(mlr_model, newdata = test_mlr, type = "class"),
levels = levels(train_mlr$Segmentation)
)
mlr_pred_prob <- predict(mlr_model, newdata = test_mlr, type = "probs")
mlr_cm <- confusionMatrix(
data = mlr_pred_class,
reference = test_mlr$Segmentation
)
cm_tbl <- as.data.frame(mlr_cm$table)
cm_wide <- tidyr::pivot_wider(cm_tbl,
names_from = Reference,
values_from = Freq)
knitr::kable(cm_wide,
caption = "Confusion Matrix - MLR (Test Set)",
row.names = FALSE)| Prediction | A | B | C | D |
|---|---|---|---|---|
| A | 282 | 181 | 119 | 150 |
| B | 55 | 68 | 55 | 24 |
| C | 108 | 168 | 260 | 53 |
| D | 118 | 64 | 54 | 378 |
cm_df <- as.data.frame(mlr_cm$table)
ggplot(cm_df, aes(x = Reference, y = Prediction, fill = Freq)) +
geom_tile(color = "white", linewidth = 0.8) +
geom_text(aes(label = Freq,
color = Freq > max(Freq) * 0.4),
size = 5, fontface = "bold") +
scale_color_manual(values = c("TRUE" = "white", "FALSE" = "#3A3A3A"),
guide = "none") +
scale_fill_gradient(low = "#D6EAF8", high = "#2E86C1") +
theme_minimal(base_size = 12) +
theme(panel.grid = element_blank()) +
labs(title = "Confusion Matrix - MLR (Test Set)",
x = "Actual Class",
y = "Predicted Class",
fill = "Count")mlr_byclass <- as.data.frame(mlr_cm$byClass)
mlr_byclass$Class <- rownames(mlr_byclass)
rownames(mlr_byclass) <- NULL
mlr_metrics_tbl <- mlr_byclass %>%
dplyr::select(Class, Sensitivity, Specificity, `Pos Pred Value`,
`Neg Pred Value`, F1) %>%
mutate(across(where(is.numeric), ~ round(.x, 4)))
knitr::kable(mlr_metrics_tbl,
caption = "Per-Class Performance Metrics - MLR",
row.names = FALSE)| Class | Sensitivity | Specificity | Pos Pred Value | Neg Pred Value | F1 |
|---|---|---|---|---|---|
| Class: A | 0.5009 | 0.7141 | 0.3852 | 0.8000 | 0.4355 |
| Class: B | 0.1414 | 0.9191 | 0.3366 | 0.7866 | 0.1991 |
| Class: C | 0.5328 | 0.8005 | 0.4414 | 0.8527 | 0.4828 |
| Class: D | 0.6248 | 0.8460 | 0.6156 | 0.8510 | 0.6202 |
mlr_acc <- round(mlr_cm$overall["Accuracy"] * 100, 2)
mlr_kappa <- round(mlr_cm$overall["Kappa"], 4)
# Warn if any class has undefined F1 (zero predicted positives)
mlr_na_f1 <- sum(is.na(mlr_byclass$F1))
if (mlr_na_f1 > 0) {
cat("Note:", mlr_na_f1, "class(es) had undefined F1 (excluded from macro average)\n")
}
mlr_f1_macro <- round(mean(mlr_byclass$F1, na.rm = TRUE), 4)
overall_tbl <- data.frame(
Metric = c("Accuracy (%)", "Kappa", "Macro-F1"),
Value = c(mlr_acc, mlr_kappa, mlr_f1_macro),
Note = c(
ifelse(mlr_acc >= 70, "Good", ifelse(mlr_acc >= 50, "Moderate", "Weak")),
ifelse(mlr_kappa >= 0.8, "Almost Perfect",
ifelse(mlr_kappa >= 0.6, "Substantial",
ifelse(mlr_kappa >= 0.4, "Moderate",
ifelse(mlr_kappa >= 0.2, "Fair", "Poor/Slight")))),
ifelse(mlr_f1_macro >= 0.6, "Good", ifelse(mlr_f1_macro >= 0.4, "Moderate", "Weak"))
)
)
knitr::kable(overall_tbl,
caption = "MLR Overall Performance Metrics",
row.names = FALSE)| Metric | Value | Note |
|---|---|---|
| Accuracy (%) | 46.2300 | Weak |
| Kappa | 0.2775 | Fair |
| Macro-F1 | 0.4344 | Moderate |
mlr_post_df <- as.data.frame(mlr_pred_prob)
mlr_post_df$Predicted <- mlr_pred_class
mlr_post_df$Actual <- test_mlr$Segmentation
mlr_post_df$MaxProb <- apply(mlr_pred_prob, 1, max)
mlr_post_df$Correct <- mlr_post_df$Predicted == mlr_post_df$Actual
ggplot(mlr_post_df, aes(x = MaxProb, fill = Correct)) +
geom_histogram(bins = 30, color = "white", alpha = 0.85) +
scale_fill_manual(values = c("TRUE" = "#7EC8C8",
"FALSE" = "#F4A7B9"),
labels = c("TRUE" = "Correct",
"FALSE" = "Incorrect")) +
theme_minimal() +
labs(title = "Distribution of Maximum Posterior Probability - MLR",
subtitle = "Higher probability = more confident prediction",
x = "Max Posterior Probability",
y = "Count",
fill = "Prediction")comparison_df <- data.frame(
Model = c("LDA", "MLR"),
Accuracy = c(lda_acc, mlr_acc),
Kappa = c(lda_kappa, mlr_kappa),
MacroF1 = c(lda_f1_macro, mlr_f1_macro)
)
knitr::kable(comparison_df,
caption = "Performance Comparison: LDA vs MLR (Test Set)",
row.names = FALSE)| Model | Accuracy | Kappa | MacroF1 |
|---|---|---|---|
| LDA | 39.96 | 0.1869 | 0.3338 |
| MLR | 46.23 | 0.2775 | 0.4344 |
comparison_df %>%
pivot_longer(cols = c(Accuracy, Kappa, MacroF1),
names_to = "Metric",
values_to = "Value") %>%
mutate(Metric = factor(Metric, levels = c("Accuracy", "Kappa", "MacroF1"))) %>%
ggplot(aes(x = Metric, y = Value, fill = Model)) +
geom_col(position = "dodge", width = 0.6) +
geom_text(aes(label = round(Value, 3)),
position = position_dodge(width = 0.6),
vjust = -0.4, size = 3.5) +
scale_fill_manual(values = c("LDA" = "#A8C5E0", "MLR" = "#B5D5C5")) +
theme_minimal() +
labs(title = "Model Comparison: LDA vs MLR (Test Set)",
x = "Metric",
y = "Value",
fill = "Model")best_model <- ifelse(mlr_acc >= lda_acc, "MLR", "LDA")
summary_final <- data.frame(
Metric = c("Accuracy (%)", "Kappa", "Macro-F1",
"Predictors Used", "No. of Features",
"Assumptions Checked", "Best Model"),
LDA = c(paste0(lda_acc, "%"),
as.character(lda_kappa),
as.character(lda_f1_macro),
"Age, Work_Experience, Family_Size",
"3 (numeric only)",
"MVN, Homogeneity, No Multicollinearity, Independence",
ifelse(best_model == "LDA", "✓", "—")),
MLR = c(paste0(mlr_acc, "%"),
as.character(mlr_kappa),
as.character(mlr_f1_macro),
"All 9 Features (numeric + categorical)",
"9",
"Nominal DV, No Multicollinearity, Sample Adequacy, Independence, No Perfect Separation",
ifelse(best_model == "MLR", "✓", "—"))
)
knitr::kable(summary_final,
caption = "Summary Comparison Table — LDA vs MLR",
row.names = FALSE)| Metric | LDA | MLR |
|---|---|---|
| Accuracy (%) | 39.96% | 46.23% |
| Kappa | 0.1869 | 0.2775 |
| Macro-F1 | 0.3338 | 0.4344 |
| Predictors Used | Age, Work_Experience, Family_Size | All 9 Features (numeric + categorical) |
| No. of Features | 3 (numeric only) | 9 |
| Assumptions Checked | MVN, Homogeneity, No Multicollinearity, Independence | Nominal DV, No Multicollinearity, Sample Adequacy, Independence, No Perfect Separation |
| Best Model | — | ✓ |
##
## Best model based on Accuracy: MLR
## Accuracy gain of MLR over LDA: 6.27 %