SETUP

# Core
library(tidyverse)
library(gridExtra)
library(ggpubr)
library(corrplot)
library(knitr)

# Preprocessing & Encoding
library(caret)
library(VIM)

# LDA
library(MASS)
library(heplots)
library(mvnormtest)

# MLR
library(nnet)
library(car)
library(DescTools)

# Resolve namespace conflicts
select <- dplyr::select
filter <- dplyr::filter

LOAD DATA & OVERVIEW

Load & Merge Data

train <- read.csv("Train.csv", stringsAsFactors = FALSE)
test  <- read.csv("Test.csv",  stringsAsFactors = FALSE)

# Both files are fully labelled; merge into one dataset, then split 80/20 downstream
df_raw <- bind_rows(train, test)

cat("Train rows    :", nrow(train),  "\n")
## Train rows    : 8068
cat("Test rows     :", nrow(test),   "\n")
## Test rows     : 2627
cat("Combined rows :", nrow(df_raw), "\n")
## Combined rows : 10695
cat("Columns       :", ncol(df_raw), "\n")
## Columns       : 11

Structure & Dim

str(df_raw)
## 'data.frame':    10695 obs. of  11 variables:
##  $ ID             : int  462809 462643 466315 461735 462669 461319 460156 464347 465015 465176 ...
##  $ Gender         : chr  "Male" "Female" "Female" "Male" ...
##  $ Ever_Married   : chr  "No" "Yes" "Yes" "Yes" ...
##  $ Age            : int  22 38 67 67 40 56 32 33 61 55 ...
##  $ Graduated      : chr  "No" "Yes" "Yes" "Yes" ...
##  $ Profession     : chr  "Healthcare" "Engineer" "Engineer" "Lawyer" ...
##  $ Work_Experience: num  1 NA 1 0 NA 0 1 1 0 1 ...
##  $ Spending_Score : chr  "Low" "Average" "Low" "High" ...
##  $ Family_Size    : num  4 3 1 2 6 2 3 3 3 4 ...
##  $ Var_1          : chr  "Cat_4" "Cat_4" "Cat_6" "Cat_6" ...
##  $ Segmentation   : chr  "D" "A" "B" "B" ...
head(df_raw)
##       ID Gender Ever_Married Age Graduated    Profession Work_Experience
## 1 462809   Male           No  22        No    Healthcare               1
## 2 462643 Female          Yes  38       Yes      Engineer              NA
## 3 466315 Female          Yes  67       Yes      Engineer               1
## 4 461735   Male          Yes  67       Yes        Lawyer               0
## 5 462669 Female          Yes  40       Yes Entertainment              NA
## 6 461319   Male          Yes  56        No        Artist               0
##   Spending_Score Family_Size Var_1 Segmentation
## 1            Low           4 Cat_4            D
## 2        Average           3 Cat_4            A
## 3            Low           1 Cat_6            B
## 4           High           2 Cat_6            B
## 5           High           6 Cat_6            A
## 6        Average           2 Cat_6            C
summary(df_raw)
##        ID            Gender          Ever_Married            Age       
##  Min.   :458982   Length:10695       Length:10695       Min.   :18.00  
##  1st Qu.:461221   Class :character   Class :character   1st Qu.:30.00  
##  Median :463451   Mode  :character   Mode  :character   Median :41.00  
##  Mean   :463468                                         Mean   :43.51  
##  3rd Qu.:465734                                         3rd Qu.:53.00  
##  Max.   :467974                                         Max.   :89.00  
##                                                                        
##   Graduated          Profession        Work_Experience Spending_Score    
##  Length:10695       Length:10695       Min.   : 0.00   Length:10695      
##  Class :character   Class :character   1st Qu.: 0.00   Class :character  
##  Mode  :character   Mode  :character   Median : 1.00   Mode  :character  
##                                        Mean   : 2.62                     
##                                        3rd Qu.: 4.00                     
##                                        Max.   :14.00                     
##                                        NA's   :1098                      
##   Family_Size       Var_1           Segmentation      
##  Min.   :1.000   Length:10695       Length:10695      
##  1st Qu.:2.000   Class :character   Class :character  
##  Median :3.000   Mode  :character   Mode  :character  
##  Mean   :2.844                                        
##  3rd Qu.:4.000                                        
##  Max.   :9.000                                        
##  NA's   :448

Missing Values Check

is_missing <- is.na(df_raw) | sapply(df_raw, function(x)
  if (is.character(x)) x == "" else rep(FALSE, length(x)))

missing_summary <- data.frame(
  Column  = names(df_raw),
  Missing = colSums(is_missing),
  Percent = round(colMeans(is_missing) * 100, 2)
) %>% filter(Missing > 0) %>% arrange(desc(Percent))

knitr::kable(missing_summary,
             caption = "Missing Values Summary",
             row.names = FALSE)
Missing Values Summary
Column Missing Percent
Work_Experience 1098 10.27
Family_Size 448 4.19
Ever_Married 190 1.78
Profession 162 1.51
Var_1 108 1.01
Graduated 102 0.95
df_viz <- dplyr::select(df_raw, -ID)

is_missing_viz <- is.na(df_viz) | sapply(df_viz, function(x)
  if (is.character(x)) x == "" else rep(FALSE, length(x)))

missing_plot <- data.frame(
  Variable = names(df_viz),
  Missing  = colMeans(is_missing_viz) * 100
) %>%
  arrange(desc(Missing)) %>%
  mutate(Variable = factor(Variable, levels = Variable))

ggplot(missing_plot, aes(x = Variable, y = Missing,
                          fill = Missing > 0)) +
  geom_col(width = 0.7, show.legend = FALSE) +
  geom_text(aes(label = paste0(round(Missing, 1), "%")),
            hjust = -0.15, size = 3.2) +
  scale_fill_manual(values = c("FALSE" = "#18BC9C",
                                "TRUE"  = "#E74C3C")) +
  coord_flip() +
  ylim(0, max(missing_plot$Missing) + 5) +
  theme_minimal() +
  labs(title = "Missing Value Proportion per Variable",
       x     = NULL,
       y     = "Missing (%)")

EDA

Descriptive Statistics

df_raw %>%
  dplyr::select(Age, Work_Experience, Family_Size) %>%
  summary()
##       Age        Work_Experience  Family_Size   
##  Min.   :18.00   Min.   : 0.00   Min.   :1.000  
##  1st Qu.:30.00   1st Qu.: 0.00   1st Qu.:2.000  
##  Median :41.00   Median : 1.00   Median :3.000  
##  Mean   :43.51   Mean   : 2.62   Mean   :2.844  
##  3rd Qu.:53.00   3rd Qu.: 4.00   3rd Qu.:4.000  
##  Max.   :89.00   Max.   :14.00   Max.   :9.000  
##                  NA's   :1098    NA's   :448
df_raw %>%
  filter(!is.na(Segmentation)) %>%
  group_by(Segmentation) %>%
  summarise(
    n                      = n(),
    Age_mean               = round(mean(Age, na.rm = TRUE), 2),
    Age_sd                 = round(sd(Age, na.rm = TRUE), 2),
    Work_Experience_mean   = round(mean(Work_Experience, na.rm = TRUE), 2),
    Work_Experience_sd     = round(sd(Work_Experience, na.rm = TRUE), 2),
    Family_Size_mean       = round(mean(Family_Size, na.rm = TRUE), 2),
    Family_Size_sd         = round(sd(Family_Size, na.rm = TRUE), 2)
  ) %>%
  knitr::kable(caption = "Descriptive Statistics of Numeric Features by Segmentation")
Descriptive Statistics of Numeric Features by Segmentation
Segmentation n Age_mean Age_sd Work_Experience_mean Work_Experience_sd Family_Size_mean Family_Size_sd
A 2818 44.40 16.48 2.82 3.57 2.57 1.52
B 2408 47.70 15.02 2.40 3.24 2.69 1.45
C 2442 49.26 14.88 2.25 3.06 2.91 1.39
D 3027 34.72 16.28 2.91 3.55 3.17 1.67

Target Distribution

df_raw %>%
  filter(!is.na(Segmentation)) %>%
  count(Segmentation) %>%
  mutate(Percent = round(n / sum(n) * 100, 1)) %>%
  ggplot(aes(x = Segmentation, y = n, fill = Segmentation)) +
  geom_col(width = 0.6, show.legend = FALSE) +
  geom_text(aes(label = paste0(n, "\n(", Percent, "%)")),
            vjust = -0.3, size = 3.5) +
  scale_fill_manual(values = c("#3266ad","#1D9E75","#BA7517","#D4537E")) +
  theme_minimal() +
  labs(title = "Distribution of Customer Segmentation",
       x = "Segment", y = "Count")

Numeric Feature Distribution

df_raw %>%
  dplyr::select(Age, Work_Experience, Family_Size, Segmentation) %>%
  filter(!is.na(Segmentation)) %>%
  pivot_longer(cols = c(Age, Work_Experience, Family_Size),
               names_to = "Feature", values_to = "Value") %>%
  ggplot(aes(x = Value, fill = Segmentation)) +
  geom_histogram(bins = 25, color = "white", alpha = 0.8,
                 position = "identity") +
  facet_wrap(~Feature, scales = "free") +
  scale_fill_manual(values = c("#3266ad","#1D9E75","#BA7517","#D4537E")) +
  theme_minimal() +
  labs(title = "Histogram of Numeric Features by Segmentation",
       x = "Value", y = "Count")

df_raw %>%
  dplyr::select(Age, Work_Experience, Family_Size, Segmentation) %>%
  filter(!is.na(Segmentation)) %>%
  pivot_longer(cols = c(Age, Work_Experience, Family_Size),
               names_to = "Feature", values_to = "Value") %>%
  ggplot(aes(x = Segmentation, y = Value, fill = Segmentation)) +
  geom_boxplot(outlier.color = "red", outlier.size = 1.5,
               alpha = 0.7, show.legend = FALSE) +
  facet_wrap(~Feature, scales = "free_y") +
  scale_fill_manual(values = c("#3266ad","#1D9E75","#BA7517","#D4537E")) +
  theme_minimal() +
  labs(title = "Boxplots of Numeric Features by Segmentation",
       x = "Segment", y = "Value")

Categorical Feature Distribution

cat_vars <- c("Gender", "Ever_Married", "Graduated",
              "Profession", "Spending_Score", "Var_1")

plots_cat <- map(cat_vars, function(var) {
  df_raw %>%
    filter(!is.na(Segmentation), .data[[var]] != "") %>%
    count(!!sym(var), Segmentation) %>%
    ggplot(aes(x = .data[[var]], y = n, fill = Segmentation)) +
    geom_col(position = "fill", width = 0.7) +
    scale_fill_manual(values = c("#3266ad","#1D9E75","#BA7517","#D4537E")) +
    scale_y_continuous(labels = scales::percent) +
    theme_minimal(base_size = 9) +
    theme(axis.text.x = element_text(angle = 35, hjust = 1),
          legend.position = "none") +
    labs(title = var, x = NULL, y = "Proportion")
})

ggarrange(plotlist = plots_cat, ncol = 3, nrow = 2,
          common.legend = TRUE, legend = "bottom")

Correlation Matrix

cor_num <- df_raw %>%
  dplyr::select(Age, Work_Experience, Family_Size) %>%
  cor(use = "complete.obs")

corrplot(cor_num,
         method     = "color",
         type       = "upper",
         addCoef.col = "black",
         tl.col     = "black",
         tl.srt     = 45,
         number.cex = 0.9,
         title      = "Correlation Matrix of Numeric Features",
         mar        = c(0, 0, 1, 0))

Outlier Detection

df_raw %>%
  dplyr::select(Age, Work_Experience, Family_Size) %>%
  pivot_longer(everything(), names_to = "Feature", values_to = "Value") %>%
  ggplot(aes(x = Feature, y = Value, fill = Feature)) +
  geom_boxplot(outlier.color = "red", outlier.alpha = 0.5,
               show.legend = FALSE) +
  coord_flip() +
  theme_minimal() +
  labs(title = "Outlier Detection — Numeric Features (Raw Data)",
       subtitle = "Red dots indicate potential outliers",
       x = NULL, y = "Value")

PREPROCESSING

Handling Missing Values

get_mode <- function(x) {
  ux <- unique(x[!is.na(x) & x != ""])
  ux[which.max(tabulate(match(x, ux)))]
}

df_clean <- df_raw %>%
  mutate(
    Work_Experience = ifelse(is.na(Work_Experience),
                             median(Work_Experience, na.rm = TRUE),
                             Work_Experience),
    Family_Size     = ifelse(is.na(Family_Size),
                             median(Family_Size, na.rm = TRUE),
                             Family_Size)
  ) %>%
  mutate(
    Ever_Married = ifelse(Ever_Married == "" | is.na(Ever_Married),
                          get_mode(Ever_Married), Ever_Married),
    Profession   = ifelse(Profession == "" | is.na(Profession),
                          get_mode(Profession), Profession),
    Var_1        = ifelse(Var_1 == "" | is.na(Var_1),
                          get_mode(Var_1), Var_1),
    Graduated    = ifelse(Graduated == "" | is.na(Graduated),
                          get_mode(Graduated), Graduated)
  )

cat("Missing values after imputation:\n")
## Missing values after imputation:
is_missing_clean <- is.na(df_clean) | sapply(df_clean, function(x)
  if (is.character(x)) x == "" else rep(FALSE, length(x)))
colSums(is_missing_clean)
##              ID          Gender    Ever_Married             Age       Graduated 
##               0               0               0               0               0 
##      Profession Work_Experience  Spending_Score     Family_Size           Var_1 
##               0               0               0               0               0 
##    Segmentation 
##               0

Encoding Categorical Variables

df_encoded <- df_clean %>%
  mutate(
    Segmentation   = factor(Segmentation,
                            levels = c("A","B","C","D")),
    Gender         = factor(Gender),
    Ever_Married   = factor(Ever_Married),
    Graduated      = factor(Graduated),
    Profession     = factor(Profession),
    Spending_Score = factor(Spending_Score,
                            levels = c("Low","Average","High")),
    Var_1          = factor(Var_1)
  ) %>%
  dplyr::select(-ID)  # Drop ID column

cat("Dataset dimensions after encoding:", 
    nrow(df_encoded), "rows x", ncol(df_encoded), "cols\n")
## Dataset dimensions after encoding: 10695 rows x 10 cols
glimpse(df_encoded)
## Rows: 10,695
## Columns: 10
## $ Gender          <fct> Male, Female, Female, Male, Female, Male, Male, Female…
## $ Ever_Married    <fct> No, Yes, Yes, Yes, Yes, Yes, No, No, Yes, Yes, No, No,…
## $ Age             <int> 22, 38, 67, 67, 40, 56, 32, 33, 61, 55, 26, 19, 19, 70…
## $ Graduated       <fct> No, Yes, Yes, Yes, Yes, No, Yes, Yes, Yes, Yes, Yes, N…
## $ Profession      <fct> Healthcare, Engineer, Engineer, Lawyer, Entertainment,…
## $ Work_Experience <dbl> 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 4, 0, 1, 0, 1, 9, 1, …
## $ Spending_Score  <fct> Low, Average, Low, High, High, Average, Low, Low, Low,…
## $ Family_Size     <dbl> 4, 3, 1, 2, 6, 2, 3, 3, 3, 4, 3, 4, 3, 1, 1, 2, 5, 6, …
## $ Var_1           <fct> Cat_4, Cat_4, Cat_6, Cat_6, Cat_6, Cat_6, Cat_6, Cat_6…
## $ Segmentation    <fct> D, A, B, B, A, C, C, D, D, C, A, D, D, A, B, C, D, B, …

Feature Selection per Method

# LDA uses numeric predictors only; drop_na() removes residual missing rows.
df_lda <- df_encoded %>%
  filter(!is.na(Segmentation)) %>%
  dplyr::select(Age, Work_Experience, Family_Size, Segmentation) %>%
  drop_na()

cat("LDA dataset  :", nrow(df_lda), "rows x", 
    ncol(df_lda), "cols\n")
## LDA dataset  : 10695 rows x 4 cols
cat("Features used:", 
    paste(names(df_lda)[-ncol(df_lda)], collapse = ", "), "\n")
## Features used: Age, Work_Experience, Family_Size
# MLR uses all 9 features; drop_na() removes residual missing rows.
df_mlr <- df_encoded %>%
  filter(!is.na(Segmentation)) %>%
  drop_na()

cat("\nMLR dataset  :", nrow(df_mlr), "rows x", 
    ncol(df_mlr), "cols\n")
## 
## MLR dataset  : 10695 rows x 10 cols
cat("Features used:", 
    paste(setdiff(names(df_mlr), "Segmentation"), collapse = ", "), "\n")
## Features used: Gender, Ever_Married, Age, Graduated, Profession, Work_Experience, Spending_Score, Family_Size, Var_1

Train-Test Split

set.seed(123)

# MLR split — 80/20, stratified
df_mlr$Segmentation <- relevel(df_mlr$Segmentation, ref = "A")

idx_mlr   <- createDataPartition(df_mlr$Segmentation,
                                  p = 0.8, list = FALSE)
train_mlr <- df_mlr[ idx_mlr, ]
test_mlr  <- df_mlr[-idx_mlr, ]

cat("MLR  — Train:", nrow(train_mlr),
    "| Test:", nrow(test_mlr), "\n")
## MLR  — Train: 8558 | Test: 2137

Post-Preprocessing Verification

cat("=== MLR Train — Segmentation Distribution ===\n")
## === MLR Train — Segmentation Distribution ===
print(prop.table(table(train_mlr$Segmentation)) * 100)
## 
##        A        B        C        D 
## 26.34961 22.51694 22.83244 28.30100

DISCRIMINANT ANALYSIS

Assumption Checking

Preprocessing for LDA

# LDA split — 80/20, stratified, independent from MLR split
set.seed(123)
idx_lda   <- createDataPartition(df_lda$Segmentation,
                                  p = 0.8, list = FALSE)
train_lda <- df_lda[ idx_lda, ]
test_lda  <- df_lda[-idx_lda, ]

cat("LDA  — Train:", nrow(train_lda), "| Test:", nrow(test_lda), "\n")
## LDA  — Train: 8558 | Test: 2137
# Apply log1p transformation to reduce skewness in Work_Experience & Family_Size
train_lda_t <- train_lda %>%
  mutate(
    Work_Experience = log1p(Work_Experience),
    Family_Size     = log1p(Family_Size)
  )

test_lda_t <- test_lda %>%
  mutate(
    Work_Experience = log1p(Work_Experience),
    Family_Size     = log1p(Family_Size)
  )

# Subsample 75 obs per group from training set only for Multivariate Shapiro-Wilk
# (test is sensitive to large n; valid only for n <= 5000)
df_lda_sub <- train_lda_t %>%
  group_by(Segmentation) %>%
  slice_sample(n = 75) %>%
  ungroup()

cat("Log-transformed LDA train :", nrow(train_lda_t), "rows\n")
## Log-transformed LDA train : 8558 rows
cat("Log-transformed LDA test  :", nrow(test_lda_t),  "rows\n")
## Log-transformed LDA test  : 2137 rows
cat("Subsampled (train only)   :", nrow(df_lda_sub),  "observations\n")
## Subsampled (train only)   : 300 observations
cat("Per group                 : 75 observations x",
    nlevels(df_lda_sub$Segmentation), "groups\n")
## Per group                 : 75 observations x 4 groups
cat("Log transform applied to  : Work_Experience, Family_Size\n")
## Log transform applied to  : Work_Experience, Family_Size

Multivariate Normality

segments <- levels(df_lda_sub$Segmentation)

norm_results <- map_df(segments, function(seg) {
  mat <- df_lda_sub %>%
    filter(Segmentation == seg) %>%
    dplyr::select(Age, Work_Experience, Family_Size) %>%
    as.matrix() %>% t()
  
  test <- mvnormtest::mshapiro.test(mat) 
  
  data.frame(
    Group   = seg,
    W       = round(test$statistic, 4),
    P_Value = round(test$p.value, 4),
    Result  = ifelse(test$p.value > 0.05, "YES", "NO"),
    Status  = ifelse(test$p.value > 0.05,
                     "Normal",
                     "Non-Normal")
  )
})

knitr::kable(norm_results,
             caption = "Multivariate Shapiro-Wilk Normality Test per Segmentation Group (n = 75 per group, log-transformed)",
             row.names = FALSE)
Multivariate Shapiro-Wilk Normality Test per Segmentation Group (n = 75 per group, log-transformed)
Group W P_Value Result Status
A 0.9843 0.4830 YES Normal
B 0.9846 0.4994 YES Normal
C 0.9822 0.3721 YES Normal
D 0.7524 0.0000 NO Non-Normal

Homogeneity

boxm_lda <- boxM(
  Y     = df_lda_sub %>%
            dplyr::select(Age, Work_Experience, Family_Size),
  group = df_lda_sub$Segmentation
)
print(boxm_lda)
## 
##  Box's M-test for Homogeneity of Covariance Matrices 
## 
## data:  df_lda_sub %>% dplyr::select(Age, Work_Experience, Family_Size) by df_lda_sub$Segmentation 
## Chi-Sq (approx.) = 18.9654, df = 18, p-value = 0.394
cat("\nConclusion:",
    ifelse(boxm_lda$p.value > 0.05,
           "PASSED — Covariance matrices are homogeneous (p > 0.05)",
           "FAILED — Covariance matrices differ across groups (p < 0.05)"))
## 
## Conclusion: PASSED — Covariance matrices are homogeneous (p > 0.05)

Absence of Multicollinearity

cor_lda <- cor(df_lda_sub %>%
                 dplyr::select(Age, Work_Experience, Family_Size))

knitr::kable(round(cor_lda, 4),
             caption = "Correlation Matrix of LDA Predictors (Log-Transformed)")
Correlation Matrix of LDA Predictors (Log-Transformed)
Age Work_Experience Family_Size
Age 1.0000 -0.2330 -0.3364
Work_Experience -0.2330 1.0000 0.0083
Family_Size -0.3364 0.0083 1.0000
high_cor <- any(abs(cor_lda[upper.tri(cor_lda)]) > 0.8)

cat("\nHighest correlation:",
    round(max(abs(cor_lda[upper.tri(cor_lda)])), 4), "\n")
## 
## Highest correlation: 0.3364
cat("Conclusion:",
    ifelse(!high_cor,
           "PASSED, No severe multicollinearity among predictors",
           "FAILED, High multicollinearity detected"))
## Conclusion: PASSED, No severe multicollinearity among predictors

Independence

Each observation in the subsampled dataset represents a unique customer record drawn randomly from the full dataset. No repeated measures or hierarchical structure exists. The independence assumption is therefore satisfied by design.

Summary

norm_pass <- all(norm_results$Result == "YES")
boxm_pass <- boxm_lda$p.value > 0.05
cor_pass  <- !high_cor

norm_detail <- paste(
  norm_results$Group,
  ifelse(norm_results$Result == "YES", "Normal", "Non-normal"),
  paste0("(p=", norm_results$P_Value, ")"),
  collapse = " | "
)

lda_assumption_summary <- data.frame(
  No         = 1:4,
  Assumption = c(
    "Multivariate Normality",
    "Homogeneity of Covariance",
    "Absence of Multicollinearity",
    "Independence of Observations"
  ),
  Test_Used  = c(
    "Multivariate Shapiro-Wilk (per group)",
    "Box's M Test",
    "Correlation Matrix (threshold > 0.8)",
    "Methodological Design"
  ),
  Status     = c(
    ifelse(norm_pass, "PASSED", "FAILED"),
    ifelse(boxm_pass, "PASSED", "FAILED"),
    ifelse(cor_pass,  "PASSED", "FAILED"),
    "PASSED"
  ),
  Conclusion = c(
    norm_detail,
    paste0("Box's M p = ", round(boxm_lda$p.value, 4),
           ifelse(boxm_pass,
                  " — Covariance matrices are homogeneous (p > 0.05)",
                  " — Covariance matrices differ across groups (p < 0.05); consider QDA as robust alternative")),
    paste0("Max correlation = ",
           round(max(abs(cor_lda[upper.tri(cor_lda)])), 4),
           " — Well below threshold of 0.8"),
    "Each record represents a unique independent customer"
  )
)

knitr::kable(lda_assumption_summary,
             caption = "Summary Table - LDA Assumption Checking",
             row.names = FALSE)
Summary Table - LDA Assumption Checking
No Assumption Test_Used Status Conclusion
1 Multivariate Normality Multivariate Shapiro-Wilk (per group) FAILED A Normal (p=0.483) | B Normal (p=0.4994) | C Normal (p=0.3721) | D Non-normal (p=0)
2 Homogeneity of Covariance Box’s M Test PASSED Box’s M p = 0.394 — Covariance matrices are homogeneous (p > 0.05)
3 Absence of Multicollinearity Correlation Matrix (threshold > 0.8) PASSED Max correlation = 0.3364 — Well below threshold of 0.8
4 Independence of Observations Methodological Design PASSED Each record represents a unique independent customer

Model Fitting

Fit LDA Model

train_lda_t$Segmentation <- factor(train_lda_t$Segmentation,
                                    levels = c("A","B","C","D"))

train_lda_t <- train_lda_t %>%
  dplyr::select(Segmentation, Age, Work_Experience, Family_Size) %>%
  na.omit()

str(train_lda_t)
## 'data.frame':    8558 obs. of  4 variables:
##  $ Segmentation   : Factor w/ 4 levels "A","B","C","D": 4 1 2 2 1 3 3 4 4 3 ...
##  $ Age            : int  22 38 67 67 40 56 32 33 61 55 ...
##  $ Work_Experience: num  0.693 0.693 0.693 0 0.693 ...
##  $ Family_Size    : num  1.609 1.386 0.693 1.099 1.946 ...
lda_model <- lda(Segmentation ~ Age + Work_Experience + Family_Size,
                 data = train_lda_t)

print(lda_model)
## Call:
## lda(Segmentation ~ Age + Work_Experience + Family_Size, data = train_lda_t)
## 
## Prior probabilities of groups:
##         A         B         C         D 
## 0.2634961 0.2251694 0.2283244 0.2830100 
## 
## Group means:
##        Age Work_Experience Family_Size
## A 44.30554       0.9105635    1.204255
## B 47.61754       0.8438422    1.242504
## C 48.99488       0.8270146    1.312888
## D 34.83526       0.9295996    1.350492
## 
## Coefficients of linear discriminants:
##                         LD1         LD2         LD3
## Age             -0.06465726 -0.01464310 -0.01124301
## Work_Experience -0.03804155  0.08169357 -1.26821959
## Family_Size     -0.11010392 -2.72944123 -0.48525693
## 
## Proportion of trace:
##    LD1    LD2    LD3 
## 0.8745 0.1247 0.0009

Discriminant Function Coefficients

# lda$scaling gives raw (unstandardised) discriminant function weights.
lda_coef <- as.data.frame(lda_model$scaling)
lda_coef$Predictor <- rownames(lda_coef)
rownames(lda_coef) <- NULL

ld_cols  <- grep("^LD", names(lda_coef), value = TRUE)
lda_coef <- lda_coef[, c("Predictor", ld_cols)]
lda_coef[, ld_cols] <- round(lda_coef[, ld_cols], 4)

knitr::kable(lda_coef,
             caption = "LDA Discriminant Function Coefficients (Standardised)",
             row.names = FALSE)
LDA Discriminant Function Coefficients (Standardised)
Predictor LD1 LD2 LD3
Age -0.0647 -0.0146 -0.0112
Work_Experience -0.0380 0.0817 -1.2682
Family_Size -0.1101 -2.7294 -0.4853

Proportion of Trace (Variance Explained)

prop_trace <- data.frame(
  LD        = paste0("LD", seq_along(lda_model$svd)),
  Eigenvalue = round(lda_model$svd^2, 4),
  Proportion = round(lda_model$svd^2 / sum(lda_model$svd^2) * 100, 2),
  Cumulative = round(cumsum(lda_model$svd^2 / sum(lda_model$svd^2)) * 100, 2)
)

knitr::kable(prop_trace,
             caption = "Proportion of Trace — Variance Explained by Each Discriminant Function",
             row.names = FALSE)
Proportion of Trace — Variance Explained by Each Discriminant Function
LD Eigenvalue Proportion Cumulative
LD1 372.8420 87.45 87.45
LD2 53.1454 12.47 99.91
LD3 0.3626 0.09 100.00

LDA Bi-Plot (LD1 vs LD2)

lda_scores <- predict(lda_model, train_lda_t)

lda_plot_df <- data.frame(
  LD1          = lda_scores$x[, 1],
  LD2          = lda_scores$x[, 2],
  Segmentation = train_lda_t$Segmentation
)

ggplot(lda_plot_df, aes(x = LD1, y = LD2, color = Segmentation)) +
  geom_point(alpha = 0.5, size = 1.8) +
  stat_ellipse(aes(fill = Segmentation), geom = "polygon",
               alpha = 0.08, level = 0.95) +
  scale_color_manual(values = c("#3266ad","#1D9E75","#BA7517","#D4537E")) +
  scale_fill_manual(values  = c("#3266ad","#1D9E75","#BA7517","#D4537E")) +
  theme_minimal() +
  labs(title    = "LDA Bi-Plot: LD1 vs LD2 (Training Set)",
       subtitle  = "95% confidence ellipses per segment",
       x = "First Discriminant Function (LD1)",
       y = "Second Discriminant Function (LD2)")

Prediction & Confusion Matrix — LDA

lda_pred <- predict(lda_model, newdata = test_lda_t)

# Re-level to match training factor levels for confusionMatrix().
lda_pred_class <- factor(lda_pred$class,
                          levels = levels(train_lda_t$Segmentation))

lda_cm <- confusionMatrix(
  data      = lda_pred_class,
  reference = factor(test_lda_t$Segmentation,
                     levels = levels(train_lda_t$Segmentation))
)

print(lda_cm)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   A   B   C   D
##          A 172 157 130  86
##          B  14   9   8   9
##          C 138 171 227  64
##          D 239 144 123 446
## 
## Overall Statistics
##                                           
##                Accuracy : 0.3996          
##                  95% CI : (0.3788, 0.4208)
##     No Information Rate : 0.2831          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.1869          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D
## Sensitivity           0.30551 0.018711   0.4652   0.7372
## Specificity           0.76302 0.981280   0.7738   0.6697
## Pos Pred Value        0.31560 0.225000   0.3783   0.4685
## Neg Pred Value        0.75440 0.774917   0.8302   0.8658
## Prevalence            0.26345 0.225082   0.2284   0.2831
## Detection Rate        0.08049 0.004212   0.1062   0.2087
## Detection Prevalence  0.25503 0.018718   0.2808   0.4455
## Balanced Accuracy     0.53427 0.499996   0.6195   0.7035
as.data.frame(lda_cm$table) %>%
  ggplot(aes(x = Reference, y = Prediction, fill = Freq)) +
  geom_tile(color = "white") +
  geom_text(aes(label = Freq), size = 5, fontface = "bold") +
  scale_fill_gradient(low = "#EBF5FB", high = "#2980B9") +
  theme_minimal() +
  labs(title    = "Confusion Matrix — LDA (Test Set)",
       x = "Actual Class", y = "Predicted Class",
       fill = "Count")

Per-Class Metrics — LDA

lda_byclass <- as.data.frame(lda_cm$byClass)
lda_byclass$Class <- rownames(lda_byclass)
rownames(lda_byclass) <- NULL

lda_metrics_tbl <- lda_byclass %>%
  dplyr::select(Class, Sensitivity, Specificity, `Pos Pred Value`,
                `Neg Pred Value`, F1) %>%
  mutate(across(where(is.numeric), ~ round(.x, 4)))

knitr::kable(lda_metrics_tbl,
             caption = "Per-Class Performance Metrics — LDA",
             row.names = FALSE)
Per-Class Performance Metrics — LDA
Class Sensitivity Specificity Pos Pred Value Neg Pred Value F1
Class: A 0.3055 0.7630 0.3156 0.7544 0.3105
Class: B 0.0187 0.9813 0.2250 0.7749 0.0345
Class: C 0.4652 0.7738 0.3783 0.8302 0.4173
Class: D 0.7372 0.6697 0.4685 0.8658 0.5729

LDA Overall Performance

lda_acc    <- round(lda_cm$overall["Accuracy"] * 100, 2)
lda_kappa  <- round(lda_cm$overall["Kappa"],          4)

# Warn if any class has undefined F1 (zero predicted positives)
lda_na_f1 <- sum(is.na(lda_byclass$F1))
if (lda_na_f1 > 0) {
  cat("Note:", lda_na_f1, "class(es) had undefined F1 (excluded from macro average)\n")
}
lda_f1_macro <- round(mean(lda_byclass$F1, na.rm = TRUE), 4)

cat("LDA Overall Performance\n")
## LDA Overall Performance
cat("Accuracy     :", lda_acc, "%\n")
## Accuracy     : 39.96 %
cat("Kappa        :", lda_kappa, "\n")
## Kappa        : 0.1869
cat("Macro-F1     :", lda_f1_macro, "\n")
## Macro-F1     : 0.3338

Posterior Probability Plot — LDA

lda_post_df <- as.data.frame(lda_pred$posterior)
lda_post_df$Predicted  <- lda_pred_class
lda_post_df$Actual     <- factor(test_lda_t$Segmentation,
                                  levels = levels(train_lda_t$Segmentation))
lda_post_df$MaxProb    <- apply(lda_pred$posterior, 1, max)
lda_post_df$Correct    <- lda_post_df$Predicted == lda_post_df$Actual

ggplot(lda_post_df, aes(x = MaxProb, fill = Correct)) +
  geom_histogram(bins = 30, color = "white", alpha = 0.85) +
  scale_fill_manual(values = c("TRUE" = "#1D9E75", "FALSE" = "#E74C3C"),
                    labels = c("TRUE" = "Correct", "FALSE" = "Incorrect")) +
  theme_minimal() +
  labs(title   = "Distribution of Maximum Posterior Probability — LDA",
       subtitle = "Higher probability = more confident prediction",
       x = "Max Posterior Probability", y = "Count",
       fill = "Prediction")

MULTINOMIAL LOGISTIC REGRESSION (MLR)

Assumption Checking

Nominal Dependent Variable

cat("Target variable  : Segmentation\n")
## Target variable  : Segmentation
cat("Number of levels :", nlevels(df_mlr$Segmentation), "\n")
## Number of levels : 4
cat("Levels           :", levels(df_mlr$Segmentation), "\n")
## Levels           : A B C D
seg_dist <- table(df_mlr$Segmentation)
seg_pct  <- round(prop.table(seg_dist) * 100, 2)

knitr::kable(data.frame(
  Segment    = names(seg_dist),
  Count      = as.integer(seg_dist),
  Percentage = as.numeric(seg_pct)
), caption = "Distribution of Target Variable",
   row.names = FALSE)
Distribution of Target Variable
Segment Count Percentage
A 2818 26.35
B 2408 22.52
C 2442 22.83
D 3027 28.30
cat("\nConclusion: PASSED - Target has",
    nlevels(df_mlr$Segmentation),
    "nominal categories")
## 
## Conclusion: PASSED - Target has 4 nominal categories

VIF

# GVIF^(1/(2*Df)) used for multi-level factors; threshold: sqrt(5) ≈ 2.24.
lm_proxy <- lm(
  as.numeric(Segmentation) ~ Age + Work_Experience + Family_Size +
                              Gender + Ever_Married + Graduated +
                              Profession + Spending_Score + Var_1,
  data = train_mlr
)

vif_raw <- vif(lm_proxy)

# vif() returns a matrix for factors; use adjusted GVIF column.
if (is.matrix(vif_raw)) {
  vif_res <- data.frame(
    Predictor  = rownames(vif_raw),
    GVIF       = round(vif_raw[, "GVIF"], 4),
    Df         = vif_raw[, "Df"],
    GVIF_adj   = round(vif_raw[, "GVIF^(1/(2*Df))"], 4),
    Status     = ifelse(vif_raw[, "GVIF^(1/(2*Df))"] < sqrt(5),
                        "Acceptable", "High Multicollinearity"),
    row.names  = NULL
  )
  knitr::kable(vif_res,
               caption = "VIF - Multicollinearity Check (All 9 Predictors; GVIF adjusted for multi-level factors)",
               col.names = c("Predictor", "GVIF", "Df", "GVIF^(1/(2*Df))", "Status"),
               row.names = FALSE)
} else {
  vif_res <- data.frame(
    Predictor = names(vif_raw),
    VIF       = round(vif_raw, 4),
    Status    = ifelse(vif_raw < 5, "Acceptable", "High Multicollinearity"),
    row.names = NULL
  )
  knitr::kable(vif_res,
               caption = "VIF - Multicollinearity Check (All 9 Predictors)",
               row.names = FALSE)
}
VIF - Multicollinearity Check (All 9 Predictors; GVIF adjusted for multi-level factors)
Predictor GVIF Df GVIF^(1/(2*Df)) Status
Age 2.6017 1 1.6130 Acceptable
Work_Experience 1.0860 1 1.0421 Acceptable
Family_Size 1.2900 1 1.1358 Acceptable
Gender 1.1728 1 1.0830 Acceptable
Ever_Married 2.4205 1 1.5558 Acceptable
Graduated 1.2818 1 1.1322 Acceptable
Profession 4.0663 8 1.0916 Acceptable
Spending_Score 2.6335 2 1.2739 Acceptable
Var_1 1.1718 6 1.0133 Acceptable

Sample Adequacy

# model.matrix() counts dummy-expanded params correctly; -1 excludes intercept.
n_predictors <- ncol(model.matrix(
  ~ Age + Work_Experience + Family_Size +
    Gender + Ever_Married + Graduated +
    Profession + Spending_Score + Var_1,
  data = train_mlr
)) - 1   # true dummy-expanded predictor count (excl. intercept)
n_categories <- nlevels(train_mlr$Segmentation)
n_train      <- nrow(train_mlr)
min_required <- n_predictors * n_categories * 10

sample_check <- data.frame(
  Metric       = c("Total Training Samples",
                   "Number of Predictors",
                   "Number of Categories",
                   "Minimum Required (10 per predictor per category)",
                   "Adequacy Status"),
  Value        = c(n_train,
                   n_predictors,
                   n_categories,
                   min_required,
                   ifelse(n_train >= min_required,
                          "PASSED", "FAILED"))
)

knitr::kable(sample_check,
             caption = "Sample Adequacy Check for MLR",
             row.names = FALSE)
Sample Adequacy Check for MLR
Metric Value
Total Training Samples 8558
Number of Predictors 22
Number of Categories 4
Minimum Required (10 per predictor per category) 880
Adequacy Status PASSED

Independence

Each row represents a unique customer. Data was collected independently with no repeated measurements or clustering structure. The independence assumption is satisfied by the nature of the dataset.

No Perfect Separation

separation_check <- df_mlr %>%
  group_by(Segmentation) %>%
  summarise(
    Age_range     = paste0(min(Age), "--", max(Age)),
    WorkExp_range = paste0(min(Work_Experience), "--",
                           max(Work_Experience)),
    FamSize_range = paste0(min(Family_Size), "--",
                           max(Family_Size)),
    .groups = "drop"
  )

knitr::kable(separation_check,
             caption = "Feature Range per Segment - Perfect Separation Check",
             row.names = FALSE)
Feature Range per Segment - Perfect Separation Check
Segmentation Age_range WorkExp_range FamSize_range
A 18–89 0–14 1–9
B 18–89 0–14 1–9
C 18–89 0–14 1–9
D 18–89 0–14 1–9
cat("\nConclusion: PASSED - Overlapping ranges across",
    "segments confirm no perfect separation exists")
## 
## Conclusion: PASSED - Overlapping ranges across segments confirm no perfect separation exists

Summary

if (is.matrix(vif_raw)) {
  vif_pass <- all(vif_raw[, "GVIF^(1/(2*Df))"] < sqrt(5))
  vif_conclusion <- ifelse(vif_pass,
    "All adjusted GVIF values below threshold (sqrt(5) ≈ 2.24) for all 9 predictors",
    "High multicollinearity detected in one or more predictors - consider removal")
} else {
  vif_pass <- all(vif_raw < 5)
  vif_conclusion <- ifelse(vif_pass,
    "All VIF values below threshold of 5 for all 9 predictors",
    "High VIF detected - consider removing predictors")
}
n_pass   <- n_train >= min_required

mlr_assumption_summary <- data.frame(
  No         = 1:5,
  Assumption = c(
    "Nominal DV with 3+ Categories",
    "Absence of Multicollinearity",
    "Sample Adequacy",
    "Independence of Observations",
    "No Perfect Separation"
  ),
  Test_Used  = c(
    "Level count of target variable",
    "VIF / GVIF (threshold < 5, or GVIF^(1/(2*Df)) < sqrt(5))",
    "Min. 10 obs per predictor per category",
    "Methodological Design",
    "Feature range overlap per segment"
  ),
  Status     = c(
    "PASSED",
    ifelse(vif_pass,  "PASSED", "FAILED"),
    ifelse(n_pass,    "PASSED", "FAILED"),
    "PASSED",
    "PASSED"
  ),
  Conclusion = c(
    paste0("4 nominal categories: A, B, C, D"),
    vif_conclusion,
    paste0("n = ", n_train, " vs minimum required = ", min_required,
           " (based on ", n_predictors, " estimated parameters excl. intercept)"),
    "Each record represents a unique independent customer",
    "All segments share overlapping feature ranges"
  )
)

knitr::kable(mlr_assumption_summary,
             caption = "Summary Table - MLR Assumption Checking",
             row.names = FALSE)
Summary Table - MLR Assumption Checking
No Assumption Test_Used Status Conclusion
1 Nominal DV with 3+ Categories Level count of target variable PASSED 4 nominal categories: A, B, C, D
2 Absence of Multicollinearity VIF / GVIF (threshold < 5, or GVIF^(1/(2*Df)) < sqrt(5)) PASSED All adjusted GVIF values below threshold (sqrt(5) ≈ 2.24) for all 9 predictors
3 Sample Adequacy Min. 10 obs per predictor per category PASSED n = 8558 vs minimum required = 880 (based on 22 estimated parameters excl. intercept)
4 Independence of Observations Methodological Design PASSED Each record represents a unique independent customer
5 No Perfect Separation Feature range overlap per segment PASSED All segments share overlapping feature ranges

Model Fitting

Fit MLR Model

# Reference level "A" was already set via relevel() before the train/test split
mlr_model <- multinom(
  Segmentation ~ Age + Work_Experience + Family_Size +
                 Gender + Ever_Married + Graduated +
                 Profession + Spending_Score + Var_1,
  data  = train_mlr,
  trace = FALSE
)

coef_tbl <- as.data.frame(t(coef(mlr_model)))
coef_tbl <- round(coef_tbl, 4)
coef_tbl$Predictor <- rownames(coef_tbl)
rownames(coef_tbl) <- NULL
class_cols <- setdiff(names(coef_tbl), "Predictor")
coef_tbl <- coef_tbl[, c("Predictor", class_cols)]

knitr::kable(coef_tbl,
             caption = "MLR Coefficients by Class (Reference: Segment A)",
             row.names = FALSE)
MLR Coefficients by Class (Reference: Segment A)
Predictor B C D
(Intercept) -1.3052 -1.9956 0.4901
Age 0.0114 0.0207 -0.0209
Work_Experience -0.0239 -0.0204 0.0099
Family_Size 0.0685 0.1993 0.0568
GenderMale -0.0761 -0.1627 0.1690
Ever_MarriedYes 0.1629 -0.1707 -0.2076
GraduatedYes 0.3421 0.6387 -0.3649
ProfessionDoctor -0.3137 -0.5403 0.4456
ProfessionEngineer -0.3017 -1.0852 0.3153
ProfessionEntertainment -0.5176 -1.0993 0.1477
ProfessionExecutive -0.0402 -0.2205 0.8483
ProfessionHealthcare 0.0453 0.2588 1.7795
ProfessionHomemaker -0.0468 -0.6868 0.7724
ProfessionLawyer -0.6182 -0.9365 1.2276
ProfessionMarketing -0.3381 -0.3060 1.3538
Spending_ScoreAverage 0.5469 1.3320 -0.2046
Spending_ScoreHigh 0.3751 0.7508 -0.3696
Var_1Cat_2 0.4507 0.0944 -0.4074
Var_1Cat_3 0.1492 -0.2186 -0.3373
Var_1Cat_4 0.1069 -0.7471 -0.2359
Var_1Cat_5 0.1628 -0.0350 -0.1981
Var_1Cat_6 0.2348 0.1083 -0.1977
Var_1Cat_7 0.0411 -0.2476 -0.2082

Coefficients & Odds Ratio

coef_mlr <- coef(mlr_model)
or_mlr   <- exp(coef_mlr)

knitr::kable(round(or_mlr, 4),
             caption = "Odds Ratio - MLR (Reference: Segment A)",
             row.names = TRUE)
Odds Ratio - MLR (Reference: Segment A)
(Intercept) Age Work_Experience Family_Size GenderMale Ever_MarriedYes GraduatedYes ProfessionDoctor ProfessionEngineer ProfessionEntertainment ProfessionExecutive ProfessionHealthcare ProfessionHomemaker ProfessionLawyer ProfessionMarketing Spending_ScoreAverage Spending_ScoreHigh Var_1Cat_2 Var_1Cat_3 Var_1Cat_4 Var_1Cat_5 Var_1Cat_6 Var_1Cat_7
B 0.2711 1.0115 0.9764 1.0709 0.9267 1.1770 1.4079 0.7307 0.7395 0.5960 0.9606 1.0463 0.9543 0.5389 0.7131 1.7279 1.4551 1.5695 1.1610 1.1129 1.1768 1.2647 1.0420
C 0.1359 1.0209 0.9798 1.2206 0.8499 0.8431 1.8940 0.5826 0.3378 0.3331 0.8021 1.2954 0.5032 0.3920 0.7364 3.7884 2.1188 1.0990 0.8036 0.4738 0.9656 1.1144 0.7806
D 1.6325 0.9793 1.0099 1.0585 1.1842 0.8125 0.6943 1.5615 1.3707 1.1592 2.3358 5.9266 2.1650 3.4129 3.8720 0.8149 0.6910 0.6654 0.7137 0.7899 0.8203 0.8206 0.8121

Predictor Significance Test (z-test)

mlr_coef  <- summary(mlr_model)$coefficients
mlr_se    <- summary(mlr_model)$standard.errors
mlr_z     <- mlr_coef / mlr_se
mlr_pval  <- 2 * (1 - pnorm(abs(mlr_z)))

knitr::kable(round(mlr_pval, 4),
             caption = "P-Value Z-Test per Predictor and Class",
             row.names = TRUE)
P-Value Z-Test per Predictor and Class
(Intercept) Age Work_Experience Family_Size GenderMale Ever_MarriedYes GraduatedYes ProfessionDoctor ProfessionEngineer ProfessionEntertainment ProfessionExecutive ProfessionHealthcare ProfessionHomemaker ProfessionLawyer ProfessionMarketing Spending_ScoreAverage Spending_ScoreHigh Var_1Cat_2 Var_1Cat_3 Var_1Cat_4 Var_1Cat_5 Var_1Cat_6 Var_1Cat_7
B 0.0000 1e-04 0.0199 0.0057 0.2724 0.0862 0 0.0091 0.0094 0.0000 0.7839 0.7470 0.8051 0 0.1015 0.0000 0.0007 0.1440 0.6073 0.7091 0.6987 0.3965 0.9042
C 0.0000 0e+00 0.0574 0.0000 0.0233 0.1114 0 0.0000 0.0000 0.0000 0.1406 0.0634 0.0021 0 0.1463 0.0000 0.0000 0.7585 0.4505 0.0098 0.9339 0.6908 0.4698
D 0.0939 0e+00 0.3061 0.0146 0.0147 0.0257 0 0.0002 0.0119 0.1851 0.0000 0.0000 0.0000 0 0.0000 0.0662 0.0033 0.1460 0.1906 0.3507 0.6018 0.4174 0.4976

Pseudo R-Squared (McFadden)

mlr_pseudoR2 <- PseudoR2(mlr_model, which = "McFadden")

pseudoR2_tbl <- data.frame(
  Metric        = "McFadden Pseudo R-Squared",
  Value         = round(mlr_pseudoR2, 4),
  # Thresholds: <0.10 Poor, 0.10–0.20 Acceptable, 0.20–0.40 Good, >0.40 Excellent.
  Interpretation = case_when(
    mlr_pseudoR2 < 0.10 ~ "Poor",
    mlr_pseudoR2 < 0.20 ~ "Acceptable",
    mlr_pseudoR2 < 0.40 ~ "Good / Excellent",
    TRUE                ~ "Excellent / Very High Fit"
  )
)

knitr::kable(pseudoR2_tbl,
             caption = "McFadden Pseudo R-Squared",
             row.names = FALSE)
McFadden Pseudo R-Squared
Metric Value Interpretation
McFadden Pseudo R-Squared 0.1333 Acceptable

Overall Model Test (Likelihood Ratio Test)

mlr_null <- multinom(Segmentation ~ 1,
                     data  = train_mlr,
                     trace = FALSE)

lrt_result <- anova(mlr_null, mlr_model, test = "Chisq")

# nnet anova column names may have leading spaces; grep handles this robustly.
df_col  <- grep("Df",  names(lrt_result), value = TRUE)[1]
chi_col <- grep("LR",  names(lrt_result), value = TRUE)[1]
p_col   <- grep("Pr",  names(lrt_result), value = TRUE)[1]

stopifnot(
  "Could not find Df column in LRT output"    = !is.na(df_col),
  "Could not find LR stat column in LRT output" = !is.na(chi_col),
  "Could not find p-value column in LRT output" = !is.na(p_col)
)

lrt_tbl <- data.frame(
  Model      = c("Null (intercept only)", "Full (all predictors)"),
  Resid.df   = lrt_result[["Resid. df"]],
  Resid.Dev  = round(lrt_result[["Resid. Dev"]], 2),
  df         = c(NA, lrt_result[2, df_col]),
  Chi.sq     = c(NA, round(lrt_result[2, chi_col], 2)),
  p.value    = c(NA, round(lrt_result[2, p_col],   4))
)

knitr::kable(lrt_tbl,
             caption = "Likelihood Ratio Test - Null vs Full MLR Model",
             row.names = FALSE,
             na = "")
Likelihood Ratio Test - Null vs Full MLR Model
Model Resid.df Resid.Dev df Chi.sq p.value
Null (intercept only) 25671 23647.52 NA NA NA
Full (all predictors) 25605 20496.38 66 3151.14 0
cat("\nConclusion:",
    ifelse(lrt_result[2, p_col] < 0.05,
           "SIGNIFICANT - Full model outperforms the null model (p < 0.05)",
           "NOT SIGNIFICANT - No meaningful improvement over the null model"))
## 
## Conclusion: SIGNIFICANT - Full model outperforms the null model (p < 0.05)

Prediction & Evaluation

Prediction on Test Set

# Re-level predicted factor to match training levels for confusionMatrix().
mlr_pred_class <- factor(
  predict(mlr_model, newdata = test_mlr, type = "class"),
  levels = levels(train_mlr$Segmentation)
)
mlr_pred_prob  <- predict(mlr_model, newdata = test_mlr, type = "probs")

mlr_cm <- confusionMatrix(
  data      = mlr_pred_class,
  reference = test_mlr$Segmentation
)

cm_tbl <- as.data.frame(mlr_cm$table)
cm_wide <- tidyr::pivot_wider(cm_tbl,
                               names_from  = Reference,
                               values_from = Freq)

knitr::kable(cm_wide,
             caption = "Confusion Matrix - MLR (Test Set)",
             row.names = FALSE)
Confusion Matrix - MLR (Test Set)
Prediction A B C D
A 282 181 119 150
B 55 68 55 24
C 108 168 260 53
D 118 64 54 378

Confusion Matrix Visualization - MLR

cm_df <- as.data.frame(mlr_cm$table)

ggplot(cm_df, aes(x = Reference, y = Prediction, fill = Freq)) +
  geom_tile(color = "white", linewidth = 0.8) +
  geom_text(aes(label = Freq,
                color  = Freq > max(Freq) * 0.4),
            size = 5, fontface = "bold") +
  scale_color_manual(values = c("TRUE" = "white", "FALSE" = "#3A3A3A"),
                     guide = "none") +
  scale_fill_gradient(low = "#D6EAF8", high = "#2E86C1") +
  theme_minimal(base_size = 12) +
  theme(panel.grid = element_blank()) +
  labs(title = "Confusion Matrix - MLR (Test Set)",
       x     = "Actual Class",
       y     = "Predicted Class",
       fill  = "Count")

Per-Class Metrics - MLR

mlr_byclass <- as.data.frame(mlr_cm$byClass)
mlr_byclass$Class <- rownames(mlr_byclass)
rownames(mlr_byclass) <- NULL

mlr_metrics_tbl <- mlr_byclass %>%
  dplyr::select(Class, Sensitivity, Specificity, `Pos Pred Value`,
                `Neg Pred Value`, F1) %>%
  mutate(across(where(is.numeric), ~ round(.x, 4)))

knitr::kable(mlr_metrics_tbl,
             caption = "Per-Class Performance Metrics - MLR",
             row.names = FALSE)
Per-Class Performance Metrics - MLR
Class Sensitivity Specificity Pos Pred Value Neg Pred Value F1
Class: A 0.5009 0.7141 0.3852 0.8000 0.4355
Class: B 0.1414 0.9191 0.3366 0.7866 0.1991
Class: C 0.5328 0.8005 0.4414 0.8527 0.4828
Class: D 0.6248 0.8460 0.6156 0.8510 0.6202

MLR Overall Performance

mlr_acc      <- round(mlr_cm$overall["Accuracy"] * 100, 2)
mlr_kappa    <- round(mlr_cm$overall["Kappa"],          4)

# Warn if any class has undefined F1 (zero predicted positives)
mlr_na_f1 <- sum(is.na(mlr_byclass$F1))
if (mlr_na_f1 > 0) {
  cat("Note:", mlr_na_f1, "class(es) had undefined F1 (excluded from macro average)\n")
}
mlr_f1_macro <- round(mean(mlr_byclass$F1, na.rm = TRUE), 4)

overall_tbl <- data.frame(
  Metric = c("Accuracy (%)", "Kappa", "Macro-F1"),
  Value  = c(mlr_acc, mlr_kappa, mlr_f1_macro),
  Note   = c(
    ifelse(mlr_acc >= 70, "Good", ifelse(mlr_acc >= 50, "Moderate", "Weak")),
    ifelse(mlr_kappa >= 0.8, "Almost Perfect",
           ifelse(mlr_kappa >= 0.6, "Substantial",
                  ifelse(mlr_kappa >= 0.4, "Moderate",
                         ifelse(mlr_kappa >= 0.2, "Fair", "Poor/Slight")))),
    ifelse(mlr_f1_macro >= 0.6, "Good", ifelse(mlr_f1_macro >= 0.4, "Moderate", "Weak"))
  )
)

knitr::kable(overall_tbl,
             caption = "MLR Overall Performance Metrics",
             row.names = FALSE)
MLR Overall Performance Metrics
Metric Value Note
Accuracy (%) 46.2300 Weak
Kappa 0.2775 Fair
Macro-F1 0.4344 Moderate

Posterior Probability Plot - MLR

mlr_post_df <- as.data.frame(mlr_pred_prob)
mlr_post_df$Predicted <- mlr_pred_class
mlr_post_df$Actual    <- test_mlr$Segmentation
mlr_post_df$MaxProb   <- apply(mlr_pred_prob, 1, max)
mlr_post_df$Correct   <- mlr_post_df$Predicted == mlr_post_df$Actual

ggplot(mlr_post_df, aes(x = MaxProb, fill = Correct)) +
  geom_histogram(bins = 30, color = "white", alpha = 0.85) +
  scale_fill_manual(values = c("TRUE"  = "#7EC8C8",
                                "FALSE" = "#F4A7B9"),
                    labels = c("TRUE"  = "Correct",
                                "FALSE" = "Incorrect")) +
  theme_minimal() +
  labs(title    = "Distribution of Maximum Posterior Probability - MLR",
       subtitle = "Higher probability = more confident prediction",
       x    = "Max Posterior Probability",
       y    = "Count",
       fill = "Prediction")


MODEL COMPARISON

Performance Metrics: LDA vs MLR

comparison_df <- data.frame(
  Model    = c("LDA", "MLR"),
  Accuracy = c(lda_acc, mlr_acc),
  Kappa    = c(lda_kappa, mlr_kappa),
  MacroF1  = c(lda_f1_macro, mlr_f1_macro)
)

knitr::kable(comparison_df,
             caption = "Performance Comparison: LDA vs MLR (Test Set)",
             row.names = FALSE)
Performance Comparison: LDA vs MLR (Test Set)
Model Accuracy Kappa MacroF1
LDA 39.96 0.1869 0.3338
MLR 46.23 0.2775 0.4344
comparison_df %>%
  pivot_longer(cols      = c(Accuracy, Kappa, MacroF1),
               names_to  = "Metric",
               values_to = "Value") %>%
  mutate(Metric = factor(Metric, levels = c("Accuracy", "Kappa", "MacroF1"))) %>%
  ggplot(aes(x = Metric, y = Value, fill = Model)) +
  geom_col(position = "dodge", width = 0.6) +
  geom_text(aes(label = round(Value, 3)),
            position = position_dodge(width = 0.6),
            vjust = -0.4, size = 3.5) +
  scale_fill_manual(values = c("LDA" = "#A8C5E0", "MLR" = "#B5D5C5")) +
  theme_minimal() +
  labs(title = "Model Comparison: LDA vs MLR (Test Set)",
       x     = "Metric",
       y     = "Value",
       fill  = "Model")

Summary Table

best_model <- ifelse(mlr_acc >= lda_acc, "MLR", "LDA")

summary_final <- data.frame(
  Metric           = c("Accuracy (%)", "Kappa", "Macro-F1",
                       "Predictors Used", "No. of Features",
                       "Assumptions Checked", "Best Model"),
  LDA              = c(paste0(lda_acc, "%"),
                       as.character(lda_kappa),
                       as.character(lda_f1_macro),
                       "Age, Work_Experience, Family_Size",
                       "3 (numeric only)",
                       "MVN, Homogeneity, No Multicollinearity, Independence",
                       ifelse(best_model == "LDA", "✓", "—")),
  MLR              = c(paste0(mlr_acc, "%"),
                       as.character(mlr_kappa),
                       as.character(mlr_f1_macro),
                       "All 9 Features (numeric + categorical)",
                       "9",
                       "Nominal DV, No Multicollinearity, Sample Adequacy, Independence, No Perfect Separation",
                       ifelse(best_model == "MLR", "✓", "—"))
)

knitr::kable(summary_final,
             caption = "Summary Comparison Table — LDA vs MLR",
             row.names = FALSE)
Summary Comparison Table — LDA vs MLR
Metric LDA MLR
Accuracy (%) 39.96% 46.23%
Kappa 0.1869 0.2775
Macro-F1 0.3338 0.4344
Predictors Used Age, Work_Experience, Family_Size All 9 Features (numeric + categorical)
No. of Features 3 (numeric only) 9
Assumptions Checked MVN, Homogeneity, No Multicollinearity, Independence Nominal DV, No Multicollinearity, Sample Adequacy, Independence, No Perfect Separation
Best Model
cat("\nBest model based on Accuracy:", best_model, "\n")
## 
## Best model based on Accuracy: MLR
cat("Accuracy gain of MLR over LDA:", round(mlr_acc - lda_acc, 2), "%\n")
## Accuracy gain of MLR over LDA: 6.27 %