SETUP

# Core
library(tidyverse)
library(gridExtra)
library(ggpubr)
library(corrplot)
library(knitr)

# Preprocessing & Encoding
library(caret)
library(VIM)

# LDA
library(MASS)
library(heplots)
library(mvnormtest)

# MLR
library(nnet)
library(car)
library(DescTools)

# Resolve namespace conflicts
select <- dplyr::select
filter <- dplyr::filter

LOAD DATA & OVERVIEW

Load & Merge Data

train <- read.csv("Train.csv", stringsAsFactors = FALSE)
test  <- read.csv("Test.csv",  stringsAsFactors = FALSE)

# Both files are fully labelled; merge into one dataset, then split 80/20 downstream
df_raw <- bind_rows(train, test)

cat("Train rows    :", nrow(train),  "\n")

## Train rows    : 8068

cat("Test rows     :", nrow(test),   "\n")

## Test rows     : 2627

cat("Combined rows :", nrow(df_raw), "\n")

## Combined rows : 10695

cat("Columns       :", ncol(df_raw), "\n")

## Columns       : 11

Structure & Dim

str(df_raw)

## 'data.frame':    10695 obs. of  11 variables:
##  $ ID             : int  462809 462643 466315 461735 462669 461319 460156 464347 465015 465176 ...
##  $ Gender         : chr  "Male" "Female" "Female" "Male" ...
##  $ Ever_Married   : chr  "No" "Yes" "Yes" "Yes" ...
##  $ Age            : int  22 38 67 67 40 56 32 33 61 55 ...
##  $ Graduated      : chr  "No" "Yes" "Yes" "Yes" ...
##  $ Profession     : chr  "Healthcare" "Engineer" "Engineer" "Lawyer" ...
##  $ Work_Experience: num  1 NA 1 0 NA 0 1 1 0 1 ...
##  $ Spending_Score : chr  "Low" "Average" "Low" "High" ...
##  $ Family_Size    : num  4 3 1 2 6 2 3 3 3 4 ...
##  $ Var_1          : chr  "Cat_4" "Cat_4" "Cat_6" "Cat_6" ...
##  $ Segmentation   : chr  "D" "A" "B" "B" ...

head(df_raw)

##       ID Gender Ever_Married Age Graduated    Profession Work_Experience
## 1 462809   Male           No  22        No    Healthcare               1
## 2 462643 Female          Yes  38       Yes      Engineer              NA
## 3 466315 Female          Yes  67       Yes      Engineer               1
## 4 461735   Male          Yes  67       Yes        Lawyer               0
## 5 462669 Female          Yes  40       Yes Entertainment              NA
## 6 461319   Male          Yes  56        No        Artist               0
##   Spending_Score Family_Size Var_1 Segmentation
## 1            Low           4 Cat_4            D
## 2        Average           3 Cat_4            A
## 3            Low           1 Cat_6            B
## 4           High           2 Cat_6            B
## 5           High           6 Cat_6            A
## 6        Average           2 Cat_6            C

summary(df_raw)

##        ID            Gender          Ever_Married            Age       
##  Min.   :458982   Length:10695       Length:10695       Min.   :18.00  
##  1st Qu.:461221   Class :character   Class :character   1st Qu.:30.00  
##  Median :463451   Mode  :character   Mode  :character   Median :41.00  
##  Mean   :463468                                         Mean   :43.51  
##  3rd Qu.:465734                                         3rd Qu.:53.00  
##  Max.   :467974                                         Max.   :89.00  
##                                                                        
##   Graduated          Profession        Work_Experience Spending_Score    
##  Length:10695       Length:10695       Min.   : 0.00   Length:10695      
##  Class :character   Class :character   1st Qu.: 0.00   Class :character  
##  Mode  :character   Mode  :character   Median : 1.00   Mode  :character  
##                                        Mean   : 2.62                     
##                                        3rd Qu.: 4.00                     
##                                        Max.   :14.00                     
##                                        NA's   :1098                      
##   Family_Size       Var_1           Segmentation      
##  Min.   :1.000   Length:10695       Length:10695      
##  1st Qu.:2.000   Class :character   Class :character  
##  Median :3.000   Mode  :character   Mode  :character  
##  Mean   :2.844                                        
##  3rd Qu.:4.000                                        
##  Max.   :9.000                                        
##  NA's   :448

Missing Values Check

is_missing <- is.na(df_raw) | sapply(df_raw, function(x)
  if (is.character(x)) x == "" else rep(FALSE, length(x)))

missing_summary <- data.frame(
  Column  = names(df_raw),
  Missing = colSums(is_missing),
  Percent = round(colMeans(is_missing) * 100, 2)
) %>% filter(Missing > 0) %>% arrange(desc(Percent))

knitr::kable(missing_summary,
             caption = "Missing Values Summary",
             row.names = FALSE)

Missing Values Summary
Column	Missing	Percent
Work_Experience	1098	10.27
Family_Size	448	4.19
Ever_Married	190	1.78
Profession	162	1.51
Var_1	108	1.01
Graduated	102	0.95

df_viz <- dplyr::select(df_raw, -ID)

is_missing_viz <- is.na(df_viz) | sapply(df_viz, function(x)
  if (is.character(x)) x == "" else rep(FALSE, length(x)))

missing_plot <- data.frame(
  Variable = names(df_viz),
  Missing  = colMeans(is_missing_viz) * 100
) %>%
  arrange(desc(Missing)) %>%
  mutate(Variable = factor(Variable, levels = Variable))

ggplot(missing_plot, aes(x = Variable, y = Missing,
                          fill = Missing > 0)) +
  geom_col(width = 0.7, show.legend = FALSE) +
  geom_text(aes(label = paste0(round(Missing, 1), "%")),
            hjust = -0.15, size = 3.2) +
  scale_fill_manual(values = c("FALSE" = "#18BC9C",
                                "TRUE"  = "#E74C3C")) +
  coord_flip() +
  ylim(0, max(missing_plot$Missing) + 5) +
  theme_minimal() +
  labs(title = "Missing Value Proportion per Variable",
       x     = NULL,
       y     = "Missing (%)")

EDA

Descriptive Statistics

df_raw %>%
  dplyr::select(Age, Work_Experience, Family_Size) %>%
  summary()

##       Age        Work_Experience  Family_Size   
##  Min.   :18.00   Min.   : 0.00   Min.   :1.000  
##  1st Qu.:30.00   1st Qu.: 0.00   1st Qu.:2.000  
##  Median :41.00   Median : 1.00   Median :3.000  
##  Mean   :43.51   Mean   : 2.62   Mean   :2.844  
##  3rd Qu.:53.00   3rd Qu.: 4.00   3rd Qu.:4.000  
##  Max.   :89.00   Max.   :14.00   Max.   :9.000  
##                  NA's   :1098    NA's   :448

df_raw %>%
  filter(!is.na(Segmentation)) %>%
  group_by(Segmentation) %>%
  summarise(
    n                      = n(),
    Age_mean               = round(mean(Age, na.rm = TRUE), 2),
    Age_sd                 = round(sd(Age, na.rm = TRUE), 2),
    Work_Experience_mean   = round(mean(Work_Experience, na.rm = TRUE), 2),
    Work_Experience_sd     = round(sd(Work_Experience, na.rm = TRUE), 2),
    Family_Size_mean       = round(mean(Family_Size, na.rm = TRUE), 2),
    Family_Size_sd         = round(sd(Family_Size, na.rm = TRUE), 2)
  ) %>%
  knitr::kable(caption = "Descriptive Statistics of Numeric Features by Segmentation")

Descriptive Statistics of Numeric Features by Segmentation
Segmentation	n	Age_mean	Age_sd	Work_Experience_mean	Work_Experience_sd	Family_Size_mean	Family_Size_sd
A	2818	44.40	16.48	2.82	3.57	2.57	1.52
B	2408	47.70	15.02	2.40	3.24	2.69	1.45
C	2442	49.26	14.88	2.25	3.06	2.91	1.39
D	3027	34.72	16.28	2.91	3.55	3.17	1.67

Target Distribution

df_raw %>%
  filter(!is.na(Segmentation)) %>%
  count(Segmentation) %>%
  mutate(Percent = round(n / sum(n) * 100, 1)) %>%
  ggplot(aes(x = Segmentation, y = n, fill = Segmentation)) +
  geom_col(width = 0.6, show.legend = FALSE) +
  geom_text(aes(label = paste0(n, "\n(", Percent, "%)")),
            vjust = -0.3, size = 3.5) +
  scale_fill_manual(values = c("#3266ad","#1D9E75","#BA7517","#D4537E")) +
  theme_minimal() +
  labs(title = "Distribution of Customer Segmentation",
       x = "Segment", y = "Count")

Numeric Feature Distribution

df_raw %>%
  dplyr::select(Age, Work_Experience, Family_Size, Segmentation) %>%
  filter(!is.na(Segmentation)) %>%
  pivot_longer(cols = c(Age, Work_Experience, Family_Size),
               names_to = "Feature", values_to = "Value") %>%
  ggplot(aes(x = Value, fill = Segmentation)) +
  geom_histogram(bins = 25, color = "white", alpha = 0.8,
                 position = "identity") +
  facet_wrap(~Feature, scales = "free") +
  scale_fill_manual(values = c("#3266ad","#1D9E75","#BA7517","#D4537E")) +
  theme_minimal() +
  labs(title = "Histogram of Numeric Features by Segmentation",
       x = "Value", y = "Count")

df_raw %>%
  dplyr::select(Age, Work_Experience, Family_Size, Segmentation) %>%
  filter(!is.na(Segmentation)) %>%
  pivot_longer(cols = c(Age, Work_Experience, Family_Size),
               names_to = "Feature", values_to = "Value") %>%
  ggplot(aes(x = Segmentation, y = Value, fill = Segmentation)) +
  geom_boxplot(outlier.color = "red", outlier.size = 1.5,
               alpha = 0.7, show.legend = FALSE) +
  facet_wrap(~Feature, scales = "free_y") +
  scale_fill_manual(values = c("#3266ad","#1D9E75","#BA7517","#D4537E")) +
  theme_minimal() +
  labs(title = "Boxplots of Numeric Features by Segmentation",
       x = "Segment", y = "Value")

Categorical Feature Distribution

cat_vars <- c("Gender", "Ever_Married", "Graduated",
              "Profession", "Spending_Score", "Var_1")

plots_cat <- map(cat_vars, function(var) {
  df_raw %>%
    filter(!is.na(Segmentation), .data[[var]] != "") %>%
    count(!!sym(var), Segmentation) %>%
    ggplot(aes(x = .data[[var]], y = n, fill = Segmentation)) +
    geom_col(position = "fill", width = 0.7) +
    scale_fill_manual(values = c("#3266ad","#1D9E75","#BA7517","#D4537E")) +
    scale_y_continuous(labels = scales::percent) +
    theme_minimal(base_size = 9) +
    theme(axis.text.x = element_text(angle = 35, hjust = 1),
          legend.position = "none") +
    labs(title = var, x = NULL, y = "Proportion")
})

ggarrange(plotlist = plots_cat, ncol = 3, nrow = 2,
          common.legend = TRUE, legend = "bottom")

Correlation Matrix

cor_num <- df_raw %>%
  dplyr::select(Age, Work_Experience, Family_Size) %>%
  cor(use = "complete.obs")

corrplot(cor_num,
         method     = "color",
         type       = "upper",
         addCoef.col = "black",
         tl.col     = "black",
         tl.srt     = 45,
         number.cex = 0.9,
         title      = "Correlation Matrix of Numeric Features",
         mar        = c(0, 0, 1, 0))

Outlier Detection

df_raw %>%
  dplyr::select(Age, Work_Experience, Family_Size) %>%
  pivot_longer(everything(), names_to = "Feature", values_to = "Value") %>%
  ggplot(aes(x = Feature, y = Value, fill = Feature)) +
  geom_boxplot(outlier.color = "red", outlier.alpha = 0.5,
               show.legend = FALSE) +
  coord_flip() +
  theme_minimal() +
  labs(title = "Outlier Detection — Numeric Features (Raw Data)",
       subtitle = "Red dots indicate potential outliers",
       x = NULL, y = "Value")

PREPROCESSING

Handling Missing Values

get_mode <- function(x) {
  ux <- unique(x[!is.na(x) & x != ""])
  ux[which.max(tabulate(match(x, ux)))]
}

df_clean <- df_raw %>%
  mutate(
    Work_Experience = ifelse(is.na(Work_Experience),
                             median(Work_Experience, na.rm = TRUE),
                             Work_Experience),
    Family_Size     = ifelse(is.na(Family_Size),
                             median(Family_Size, na.rm = TRUE),
                             Family_Size)
  ) %>%
  mutate(
    Ever_Married = ifelse(Ever_Married == "" | is.na(Ever_Married),
                          get_mode(Ever_Married), Ever_Married),
    Profession   = ifelse(Profession == "" | is.na(Profession),
                          get_mode(Profession), Profession),
    Var_1        = ifelse(Var_1 == "" | is.na(Var_1),
                          get_mode(Var_1), Var_1),
    Graduated    = ifelse(Graduated == "" | is.na(Graduated),
                          get_mode(Graduated), Graduated)
  )

cat("Missing values after imputation:\n")

## Missing values after imputation:

is_missing_clean <- is.na(df_clean) | sapply(df_clean, function(x)
  if (is.character(x)) x == "" else rep(FALSE, length(x)))
colSums(is_missing_clean)

##              ID          Gender    Ever_Married             Age       Graduated 
##               0               0               0               0               0 
##      Profession Work_Experience  Spending_Score     Family_Size           Var_1 
##               0               0               0               0               0 
##    Segmentation 
##               0

Encoding Categorical Variables

df_encoded <- df_clean %>%
  mutate(
    Segmentation   = factor(Segmentation,
                            levels = c("A","B","C","D")),
    Gender         = factor(Gender),
    Ever_Married   = factor(Ever_Married),
    Graduated      = factor(Graduated),
    Profession     = factor(Profession),
    Spending_Score = factor(Spending_Score,
                            levels = c("Low","Average","High")),
    Var_1          = factor(Var_1)
  ) %>%
  dplyr::select(-ID)  # Drop ID column

cat("Dataset dimensions after encoding:", 
    nrow(df_encoded), "rows x", ncol(df_encoded), "cols\n")

## Dataset dimensions after encoding: 10695 rows x 10 cols

glimpse(df_encoded)

## Rows: 10,695
## Columns: 10
## $ Gender          <fct> Male, Female, Female, Male, Female, Male, Male, Female…
## $ Ever_Married    <fct> No, Yes, Yes, Yes, Yes, Yes, No, No, Yes, Yes, No, No,…
## $ Age             <int> 22, 38, 67, 67, 40, 56, 32, 33, 61, 55, 26, 19, 19, 70…
## $ Graduated       <fct> No, Yes, Yes, Yes, Yes, No, Yes, Yes, Yes, Yes, Yes, N…
## $ Profession      <fct> Healthcare, Engineer, Engineer, Lawyer, Entertainment,…
## $ Work_Experience <dbl> 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 4, 0, 1, 0, 1, 9, 1, …
## $ Spending_Score  <fct> Low, Average, Low, High, High, Average, Low, Low, Low,…
## $ Family_Size     <dbl> 4, 3, 1, 2, 6, 2, 3, 3, 3, 4, 3, 4, 3, 1, 1, 2, 5, 6, …
## $ Var_1           <fct> Cat_4, Cat_4, Cat_6, Cat_6, Cat_6, Cat_6, Cat_6, Cat_6…
## $ Segmentation    <fct> D, A, B, B, A, C, C, D, D, C, A, D, D, A, B, C, D, B, …

Feature Selection per Method

# LDA uses numeric predictors only; drop_na() removes residual missing rows.
df_lda <- df_encoded %>%
  filter(!is.na(Segmentation)) %>%
  dplyr::select(Age, Work_Experience, Family_Size, Segmentation) %>%
  drop_na()

cat("LDA dataset  :", nrow(df_lda), "rows x", 
    ncol(df_lda), "cols\n")

## LDA dataset  : 10695 rows x 4 cols

cat("Features used:", 
    paste(names(df_lda)[-ncol(df_lda)], collapse = ", "), "\n")

## Features used: Age, Work_Experience, Family_Size

# MLR uses all 9 features; drop_na() removes residual missing rows.
df_mlr <- df_encoded %>%
  filter(!is.na(Segmentation)) %>%
  drop_na()

cat("\nMLR dataset  :", nrow(df_mlr), "rows x", 
    ncol(df_mlr), "cols\n")

## 
## MLR dataset  : 10695 rows x 10 cols

cat("Features used:", 
    paste(setdiff(names(df_mlr), "Segmentation"), collapse = ", "), "\n")

## Features used: Gender, Ever_Married, Age, Graduated, Profession, Work_Experience, Spending_Score, Family_Size, Var_1

Train-Test Split

set.seed(123)

# MLR split — 80/20, stratified
df_mlr$Segmentation <- relevel(df_mlr$Segmentation, ref = "A")

idx_mlr   <- createDataPartition(df_mlr$Segmentation,
                                  p = 0.8, list = FALSE)
train_mlr <- df_mlr[ idx_mlr, ]
test_mlr  <- df_mlr[-idx_mlr, ]

cat("MLR  — Train:", nrow(train_mlr),
    "| Test:", nrow(test_mlr), "\n")

## MLR  — Train: 8558 | Test: 2137

Post-Preprocessing Verification

cat("=== MLR Train — Segmentation Distribution ===\n")

## === MLR Train — Segmentation Distribution ===

print(prop.table(table(train_mlr$Segmentation)) * 100)

## 
##        A        B        C        D 
## 26.34961 22.51694 22.83244 28.30100

DISCRIMINANT ANALYSIS

Assumption Checking

Preprocessing for LDA

# LDA split — 80/20, stratified, independent from MLR split
set.seed(123)
idx_lda   <- createDataPartition(df_lda$Segmentation,
                                  p = 0.8, list = FALSE)
train_lda <- df_lda[ idx_lda, ]
test_lda  <- df_lda[-idx_lda, ]

cat("LDA  — Train:", nrow(train_lda), "| Test:", nrow(test_lda), "\n")

## LDA  — Train: 8558 | Test: 2137

# Apply log1p transformation to reduce skewness in Work_Experience & Family_Size
train_lda_t <- train_lda %>%
  mutate(
    Work_Experience = log1p(Work_Experience),
    Family_Size     = log1p(Family_Size)
  )

test_lda_t <- test_lda %>%
  mutate(
    Work_Experience = log1p(Work_Experience),
    Family_Size     = log1p(Family_Size)
  )

# Subsample 75 obs per group from training set only for Multivariate Shapiro-Wilk
# (test is sensitive to large n; valid only for n <= 5000)
df_lda_sub <- train_lda_t %>%
  group_by(Segmentation) %>%
  slice_sample(n = 75) %>%
  ungroup()

cat("Log-transformed LDA train :", nrow(train_lda_t), "rows\n")

## Log-transformed LDA train : 8558 rows

cat("Log-transformed LDA test  :", nrow(test_lda_t),  "rows\n")

## Log-transformed LDA test  : 2137 rows

cat("Subsampled (train only)   :", nrow(df_lda_sub),  "observations\n")

## Subsampled (train only)   : 300 observations

cat("Per group                 : 75 observations x",
    nlevels(df_lda_sub$Segmentation), "groups\n")

## Per group                 : 75 observations x 4 groups

cat("Log transform applied to  : Work_Experience, Family_Size\n")

## Log transform applied to  : Work_Experience, Family_Size

Multivariate Normality

segments <- levels(df_lda_sub$Segmentation)

norm_results <- map_df(segments, function(seg) {
  mat <- df_lda_sub %>%
    filter(Segmentation == seg) %>%
    dplyr::select(Age, Work_Experience, Family_Size) %>%
    as.matrix() %>% t()
  
  test <- mvnormtest::mshapiro.test(mat) 
  
  data.frame(
    Group   = seg,
    W       = round(test$statistic, 4),
    P_Value = round(test$p.value, 4),
    Result  = ifelse(test$p.value > 0.05, "YES", "NO"),
    Status  = ifelse(test$p.value > 0.05,
                     "Normal",
                     "Non-Normal")
  )
})

knitr::kable(norm_results,
             caption = "Multivariate Shapiro-Wilk Normality Test per Segmentation Group (n = 75 per group, log-transformed)",
             row.names = FALSE)

Multivariate Shapiro-Wilk Normality Test per Segmentation Group (n = 75 per group, log-transformed)
Group	W	P_Value	Result	Status
A	0.9843	0.4830	YES	Normal
B	0.9846	0.4994	YES	Normal
C	0.9822	0.3721	YES	Normal
D	0.7524	0.0000	NO	Non-Normal

Homogeneity

boxm_lda <- boxM(
  Y     = df_lda_sub %>%
            dplyr::select(Age, Work_Experience, Family_Size),
  group = df_lda_sub$Segmentation
)
print(boxm_lda)

## 
##  Box's M-test for Homogeneity of Covariance Matrices 
## 
## data:  df_lda_sub %>% dplyr::select(Age, Work_Experience, Family_Size) by df_lda_sub$Segmentation 
## Chi-Sq (approx.) = 18.9654, df = 18, p-value = 0.394

cat("\nConclusion:",
    ifelse(boxm_lda$p.value > 0.05,
           "PASSED — Covariance matrices are homogeneous (p > 0.05)",
           "FAILED — Covariance matrices differ across groups (p < 0.05)"))

## 
## Conclusion: PASSED — Covariance matrices are homogeneous (p > 0.05)

Absence of Multicollinearity

cor_lda <- cor(df_lda_sub %>%
                 dplyr::select(Age, Work_Experience, Family_Size))

knitr::kable(round(cor_lda, 4),
             caption = "Correlation Matrix of LDA Predictors (Log-Transformed)")

Correlation Matrix of LDA Predictors (Log-Transformed)
	Age	Work_Experience	Family_Size
Age	1.0000	-0.2330	-0.3364
Work_Experience	-0.2330	1.0000	0.0083
Family_Size	-0.3364	0.0083	1.0000

high_cor <- any(abs(cor_lda[upper.tri(cor_lda)]) > 0.8)

cat("\nHighest correlation:",
    round(max(abs(cor_lda[upper.tri(cor_lda)])), 4), "\n")

## 
## Highest correlation: 0.3364

cat("Conclusion:",
    ifelse(!high_cor,
           "PASSED, No severe multicollinearity among predictors",
           "FAILED, High multicollinearity detected"))

## Conclusion: PASSED, No severe multicollinearity among predictors

Independence

Each observation in the subsampled dataset represents a unique customer record drawn randomly from the full dataset. No repeated measures or hierarchical structure exists. The independence assumption is therefore satisfied by design.

Summary

norm_pass <- all(norm_results$Result == "YES")
boxm_pass <- boxm_lda$p.value > 0.05
cor_pass  <- !high_cor

norm_detail <- paste(
  norm_results$Group,
  ifelse(norm_results$Result == "YES", "Normal", "Non-normal"),
  paste0("(p=", norm_results$P_Value, ")"),
  collapse = " | "
)

lda_assumption_summary <- data.frame(
  No         = 1:4,
  Assumption = c(
    "Multivariate Normality",
    "Homogeneity of Covariance",
    "Absence of Multicollinearity",
    "Independence of Observations"
  ),
  Test_Used  = c(
    "Multivariate Shapiro-Wilk (per group)",
    "Box's M Test",
    "Correlation Matrix (threshold > 0.8)",
    "Methodological Design"
  ),
  Status     = c(
    ifelse(norm_pass, "PASSED", "FAILED"),
    ifelse(boxm_pass, "PASSED", "FAILED"),
    ifelse(cor_pass,  "PASSED", "FAILED"),
    "PASSED"
  ),
  Conclusion = c(
    norm_detail,
    paste0("Box's M p = ", round(boxm_lda$p.value, 4),
           ifelse(boxm_pass,
                  " — Covariance matrices are homogeneous (p > 0.05)",
                  " — Covariance matrices differ across groups (p < 0.05); consider QDA as robust alternative")),
    paste0("Max correlation = ",
           round(max(abs(cor_lda[upper.tri(cor_lda)])), 4),
           " — Well below threshold of 0.8"),
    "Each record represents a unique independent customer"
  )
)

knitr::kable(lda_assumption_summary,
             caption = "Summary Table - LDA Assumption Checking",
             row.names = FALSE)

Summary Table - LDA Assumption Checking
No	Assumption	Test_Used	Status	Conclusion
1	Multivariate Normality	Multivariate Shapiro-Wilk (per group)	FAILED	A Normal (p=0.483) \| B Normal (p=0.4994) \| C Normal (p=0.3721) \| D Non-normal (p=0)
2	Homogeneity of Covariance	Box’s M Test	PASSED	Box’s M p = 0.394 — Covariance matrices are homogeneous (p > 0.05)
3	Absence of Multicollinearity	Correlation Matrix (threshold > 0.8)	PASSED	Max correlation = 0.3364 — Well below threshold of 0.8
4	Independence of Observations	Methodological Design	PASSED	Each record represents a unique independent customer

Model Fitting

Fit LDA Model

train_lda_t$Segmentation <- factor(train_lda_t$Segmentation,
                                    levels = c("A","B","C","D"))

train_lda_t <- train_lda_t %>%
  dplyr::select(Segmentation, Age, Work_Experience, Family_Size) %>%
  na.omit()

str(train_lda_t)

## 'data.frame':    8558 obs. of  4 variables:
##  $ Segmentation   : Factor w/ 4 levels "A","B","C","D": 4 1 2 2 1 3 3 4 4 3 ...
##  $ Age            : int  22 38 67 67 40 56 32 33 61 55 ...
##  $ Work_Experience: num  0.693 0.693 0.693 0 0.693 ...
##  $ Family_Size    : num  1.609 1.386 0.693 1.099 1.946 ...

lda_model <- lda(Segmentation ~ Age + Work_Experience + Family_Size,
                 data = train_lda_t)

print(lda_model)

## Call:
## lda(Segmentation ~ Age + Work_Experience + Family_Size, data = train_lda_t)
## 
## Prior probabilities of groups:
##         A         B         C         D 
## 0.2634961 0.2251694 0.2283244 0.2830100 
## 
## Group means:
##        Age Work_Experience Family_Size
## A 44.30554       0.9105635    1.204255
## B 47.61754       0.8438422    1.242504
## C 48.99488       0.8270146    1.312888
## D 34.83526       0.9295996    1.350492
## 
## Coefficients of linear discriminants:
##                         LD1         LD2         LD3
## Age             -0.06465726 -0.01464310 -0.01124301
## Work_Experience -0.03804155  0.08169357 -1.26821959
## Family_Size     -0.11010392 -2.72944123 -0.48525693
## 
## Proportion of trace:
##    LD1    LD2    LD3 
## 0.8745 0.1247 0.0009

Discriminant Function Coefficients

# lda$scaling gives raw (unstandardised) discriminant function weights.
lda_coef <- as.data.frame(lda_model$scaling)
lda_coef$Predictor <- rownames(lda_coef)
rownames(lda_coef) <- NULL

ld_cols  <- grep("^LD", names(lda_coef), value = TRUE)
lda_coef <- lda_coef[, c("Predictor", ld_cols)]
lda_coef[, ld_cols] <- round(lda_coef[, ld_cols], 4)

knitr::kable(lda_coef,
             caption = "LDA Discriminant Function Coefficients (Standardised)",
             row.names = FALSE)

LDA Discriminant Function Coefficients (Standardised)
Predictor	LD1	LD2	LD3
Age	-0.0647	-0.0146	-0.0112
Work_Experience	-0.0380	0.0817	-1.2682
Family_Size	-0.1101	-2.7294	-0.4853

Proportion of Trace (Variance Explained)

prop_trace <- data.frame(
  LD        = paste0("LD", seq_along(lda_model$svd)),
  Eigenvalue = round(lda_model$svd^2, 4),
  Proportion = round(lda_model$svd^2 / sum(lda_model$svd^2) * 100, 2),
  Cumulative = round(cumsum(lda_model$svd^2 / sum(lda_model$svd^2)) * 100, 2)
)

knitr::kable(prop_trace,
             caption = "Proportion of Trace — Variance Explained by Each Discriminant Function",
             row.names = FALSE)

Proportion of Trace — Variance Explained by Each Discriminant Function
LD	Eigenvalue	Proportion	Cumulative
LD1	372.8420	87.45	87.45
LD2	53.1454	12.47	99.91
LD3	0.3626	0.09	100.00

LDA Bi-Plot (LD1 vs LD2)

lda_scores <- predict(lda_model, train_lda_t)

lda_plot_df <- data.frame(
  LD1          = lda_scores$x[, 1],
  LD2          = lda_scores$x[, 2],
  Segmentation = train_lda_t$Segmentation
)

ggplot(lda_plot_df, aes(x = LD1, y = LD2, color = Segmentation)) +
  geom_point(alpha = 0.5, size = 1.8) +
  stat_ellipse(aes(fill = Segmentation), geom = "polygon",
               alpha = 0.08, level = 0.95) +
  scale_color_manual(values = c("#3266ad","#1D9E75","#BA7517","#D4537E")) +
  scale_fill_manual(values  = c("#3266ad","#1D9E75","#BA7517","#D4537E")) +
  theme_minimal() +
  labs(title    = "LDA Bi-Plot: LD1 vs LD2 (Training Set)",
       subtitle  = "95% confidence ellipses per segment",
       x = "First Discriminant Function (LD1)",
       y = "Second Discriminant Function (LD2)")

Prediction & Confusion Matrix — LDA

lda_pred <- predict(lda_model, newdata = test_lda_t)

# Re-level to match training factor levels for confusionMatrix().
lda_pred_class <- factor(lda_pred$class,
                          levels = levels(train_lda_t$Segmentation))

lda_cm <- confusionMatrix(
  data      = lda_pred_class,
  reference = factor(test_lda_t$Segmentation,
                     levels = levels(train_lda_t$Segmentation))
)

print(lda_cm)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   A   B   C   D
##          A 172 157 130  86
##          B  14   9   8   9
##          C 138 171 227  64
##          D 239 144 123 446
## 
## Overall Statistics
##                                           
##                Accuracy : 0.3996          
##                  95% CI : (0.3788, 0.4208)
##     No Information Rate : 0.2831          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.1869          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D
## Sensitivity           0.30551 0.018711   0.4652   0.7372
## Specificity           0.76302 0.981280   0.7738   0.6697
## Pos Pred Value        0.31560 0.225000   0.3783   0.4685
## Neg Pred Value        0.75440 0.774917   0.8302   0.8658
## Prevalence            0.26345 0.225082   0.2284   0.2831
## Detection Rate        0.08049 0.004212   0.1062   0.2087
## Detection Prevalence  0.25503 0.018718   0.2808   0.4455
## Balanced Accuracy     0.53427 0.499996   0.6195   0.7035

as.data.frame(lda_cm$table) %>%
  ggplot(aes(x = Reference, y = Prediction, fill = Freq)) +
  geom_tile(color = "white") +
  geom_text(aes(label = Freq), size = 5, fontface = "bold") +
  scale_fill_gradient(low = "#EBF5FB", high = "#2980B9") +
  theme_minimal() +
  labs(title    = "Confusion Matrix — LDA (Test Set)",
       x = "Actual Class", y = "Predicted Class",
       fill = "Count")

Per-Class Metrics — LDA

lda_byclass <- as.data.frame(lda_cm$byClass)
lda_byclass$Class <- rownames(lda_byclass)
rownames(lda_byclass) <- NULL

lda_metrics_tbl <- lda_byclass %>%
  dplyr::select(Class, Sensitivity, Specificity, `Pos Pred Value`,
                `Neg Pred Value`, F1) %>%
  mutate(across(where(is.numeric), ~ round(.x, 4)))

knitr::kable(lda_metrics_tbl,
             caption = "Per-Class Performance Metrics — LDA",
             row.names = FALSE)

Per-Class Performance Metrics — LDA
Class	Sensitivity	Specificity	Pos Pred Value	Neg Pred Value	F1
Class: A	0.3055	0.7630	0.3156	0.7544	0.3105
Class: B	0.0187	0.9813	0.2250	0.7749	0.0345
Class: C	0.4652	0.7738	0.3783	0.8302	0.4173
Class: D	0.7372	0.6697	0.4685	0.8658	0.5729

LDA Overall Performance

lda_acc    <- round(lda_cm$overall["Accuracy"] * 100, 2)
lda_kappa  <- round(lda_cm$overall["Kappa"],          4)

# Warn if any class has undefined F1 (zero predicted positives)
lda_na_f1 <- sum(is.na(lda_byclass$F1))
if (lda_na_f1 > 0) {
  cat("Note:", lda_na_f1, "class(es) had undefined F1 (excluded from macro average)\n")
}
lda_f1_macro <- round(mean(lda_byclass$F1, na.rm = TRUE), 4)

cat("LDA Overall Performance\n")

## LDA Overall Performance

cat("Accuracy     :", lda_acc, "%\n")

## Accuracy     : 39.96 %

cat("Kappa        :", lda_kappa, "\n")

## Kappa        : 0.1869

cat("Macro-F1     :", lda_f1_macro, "\n")

## Macro-F1     : 0.3338

Posterior Probability Plot — LDA

lda_post_df <- as.data.frame(lda_pred$posterior)
lda_post_df$Predicted  <- lda_pred_class
lda_post_df$Actual     <- factor(test_lda_t$Segmentation,
                                  levels = levels(train_lda_t$Segmentation))
lda_post_df$MaxProb    <- apply(lda_pred$posterior, 1, max)
lda_post_df$Correct    <- lda_post_df$Predicted == lda_post_df$Actual

ggplot(lda_post_df, aes(x = MaxProb, fill = Correct)) +
  geom_histogram(bins = 30, color = "white", alpha = 0.85) +
  scale_fill_manual(values = c("TRUE" = "#1D9E75", "FALSE" = "#E74C3C"),
                    labels = c("TRUE" = "Correct", "FALSE" = "Incorrect")) +
  theme_minimal() +
  labs(title   = "Distribution of Maximum Posterior Probability — LDA",
       subtitle = "Higher probability = more confident prediction",
       x = "Max Posterior Probability", y = "Count",
       fill = "Prediction")

MULTINOMIAL LOGISTIC REGRESSION (MLR)

Assumption Checking

Nominal Dependent Variable

cat("Target variable  : Segmentation\n")

## Target variable  : Segmentation

cat("Number of levels :", nlevels(df_mlr$Segmentation), "\n")

## Number of levels : 4

cat("Levels           :", levels(df_mlr$Segmentation), "\n")

## Levels           : A B C D

seg_dist <- table(df_mlr$Segmentation)
seg_pct  <- round(prop.table(seg_dist) * 100, 2)

knitr::kable(data.frame(
  Segment    = names(seg_dist),
  Count      = as.integer(seg_dist),
  Percentage = as.numeric(seg_pct)
), caption = "Distribution of Target Variable",
   row.names = FALSE)

Distribution of Target Variable
Segment	Count	Percentage
A	2818	26.35
B	2408	22.52
C	2442	22.83
D	3027	28.30

cat("\nConclusion: PASSED - Target has",
    nlevels(df_mlr$Segmentation),
    "nominal categories")

## 
## Conclusion: PASSED - Target has 4 nominal categories

VIF

# GVIF^(1/(2*Df)) used for multi-level factors; threshold: sqrt(5) ≈ 2.24.
lm_proxy <- lm(
  as.numeric(Segmentation) ~ Age + Work_Experience + Family_Size +
                              Gender + Ever_Married + Graduated +
                              Profession + Spending_Score + Var_1,
  data = train_mlr
)

vif_raw <- vif(lm_proxy)

# vif() returns a matrix for factors; use adjusted GVIF column.
if (is.matrix(vif_raw)) {
  vif_res <- data.frame(
    Predictor  = rownames(vif_raw),
    GVIF       = round(vif_raw[, "GVIF"], 4),
    Df         = vif_raw[, "Df"],
    GVIF_adj   = round(vif_raw[, "GVIF^(1/(2*Df))"], 4),
    Status     = ifelse(vif_raw[, "GVIF^(1/(2*Df))"] < sqrt(5),
                        "Acceptable", "High Multicollinearity"),
    row.names  = NULL
  )
  knitr::kable(vif_res,
               caption = "VIF - Multicollinearity Check (All 9 Predictors; GVIF adjusted for multi-level factors)",
               col.names = c("Predictor", "GVIF", "Df", "GVIF^(1/(2*Df))", "Status"),
               row.names = FALSE)
} else {
  vif_res <- data.frame(
    Predictor = names(vif_raw),
    VIF       = round(vif_raw, 4),
    Status    = ifelse(vif_raw < 5, "Acceptable", "High Multicollinearity"),
    row.names = NULL
  )
  knitr::kable(vif_res,
               caption = "VIF - Multicollinearity Check (All 9 Predictors)",
               row.names = FALSE)
}

VIF - Multicollinearity Check (All 9 Predictors; GVIF adjusted for multi-level factors)
Predictor	GVIF	Df	GVIF^(1/(2*Df))	Status
Age	2.6017	1	1.6130	Acceptable
Work_Experience	1.0860	1	1.0421	Acceptable
Family_Size	1.2900	1	1.1358	Acceptable
Gender	1.1728	1	1.0830	Acceptable
Ever_Married	2.4205	1	1.5558	Acceptable
Graduated	1.2818	1	1.1322	Acceptable
Profession	4.0663	8	1.0916	Acceptable
Spending_Score	2.6335	2	1.2739	Acceptable
Var_1	1.1718	6	1.0133	Acceptable

Sample Adequacy

# model.matrix() counts dummy-expanded params correctly; -1 excludes intercept.
n_predictors <- ncol(model.matrix(
  ~ Age + Work_Experience + Family_Size +
    Gender + Ever_Married + Graduated +
    Profession + Spending_Score + Var_1,
  data = train_mlr
)) - 1   # true dummy-expanded predictor count (excl. intercept)
n_categories <- nlevels(train_mlr$Segmentation)
n_train      <- nrow(train_mlr)
min_required <- n_predictors * n_categories * 10

sample_check <- data.frame(
  Metric       = c("Total Training Samples",
                   "Number of Predictors",
                   "Number of Categories",
                   "Minimum Required (10 per predictor per category)",
                   "Adequacy Status"),
  Value        = c(n_train,
                   n_predictors,
                   n_categories,
                   min_required,
                   ifelse(n_train >= min_required,
                          "PASSED", "FAILED"))
)

knitr::kable(sample_check,
             caption = "Sample Adequacy Check for MLR",
             row.names = FALSE)

Sample Adequacy Check for MLR
Metric	Value
Total Training Samples	8558
Number of Predictors	22
Number of Categories	4
Minimum Required (10 per predictor per category)	880
Adequacy Status	PASSED

Independence

Each row represents a unique customer. Data was collected independently with no repeated measurements or clustering structure. The independence assumption is satisfied by the nature of the dataset.

No Perfect Separation

separation_check <- df_mlr %>%
  group_by(Segmentation) %>%
  summarise(
    Age_range     = paste0(min(Age), "--", max(Age)),
    WorkExp_range = paste0(min(Work_Experience), "--",
                           max(Work_Experience)),
    FamSize_range = paste0(min(Family_Size), "--",
                           max(Family_Size)),
    .groups = "drop"
  )

knitr::kable(separation_check,
             caption = "Feature Range per Segment - Perfect Separation Check",
             row.names = FALSE)

Feature Range per Segment - Perfect Separation Check
Segmentation	Age_range	WorkExp_range	FamSize_range
A	18–89	0–14	1–9
B	18–89	0–14	1–9
C	18–89	0–14	1–9
D	18–89	0–14	1–9

cat("\nConclusion: PASSED - Overlapping ranges across",
    "segments confirm no perfect separation exists")

## 
## Conclusion: PASSED - Overlapping ranges across segments confirm no perfect separation exists

Summary

if (is.matrix(vif_raw)) {
  vif_pass <- all(vif_raw[, "GVIF^(1/(2*Df))"] < sqrt(5))
  vif_conclusion <- ifelse(vif_pass,
    "All adjusted GVIF values below threshold (sqrt(5) ≈ 2.24) for all 9 predictors",
    "High multicollinearity detected in one or more predictors - consider removal")
} else {
  vif_pass <- all(vif_raw < 5)
  vif_conclusion <- ifelse(vif_pass,
    "All VIF values below threshold of 5 for all 9 predictors",
    "High VIF detected - consider removing predictors")
}
n_pass   <- n_train >= min_required

mlr_assumption_summary <- data.frame(
  No         = 1:5,
  Assumption = c(
    "Nominal DV with 3+ Categories",
    "Absence of Multicollinearity",
    "Sample Adequacy",
    "Independence of Observations",
    "No Perfect Separation"
  ),
  Test_Used  = c(
    "Level count of target variable",
    "VIF / GVIF (threshold < 5, or GVIF^(1/(2*Df)) < sqrt(5))",
    "Min. 10 obs per predictor per category",
    "Methodological Design",
    "Feature range overlap per segment"
  ),
  Status     = c(
    "PASSED",
    ifelse(vif_pass,  "PASSED", "FAILED"),
    ifelse(n_pass,    "PASSED", "FAILED"),
    "PASSED",
    "PASSED"
  ),
  Conclusion = c(
    paste0("4 nominal categories: A, B, C, D"),
    vif_conclusion,
    paste0("n = ", n_train, " vs minimum required = ", min_required,
           " (based on ", n_predictors, " estimated parameters excl. intercept)"),
    "Each record represents a unique independent customer",
    "All segments share overlapping feature ranges"
  )
)

knitr::kable(mlr_assumption_summary,
             caption = "Summary Table - MLR Assumption Checking",
             row.names = FALSE)

Summary Table - MLR Assumption Checking
No	Assumption	Test_Used	Status	Conclusion
1	Nominal DV with 3+ Categories	Level count of target variable	PASSED	4 nominal categories: A, B, C, D
2	Absence of Multicollinearity	VIF / GVIF (threshold < 5, or GVIF^(1/(2*Df)) < sqrt(5))	PASSED	All adjusted GVIF values below threshold (sqrt(5) ≈ 2.24) for all 9 predictors
3	Sample Adequacy	Min. 10 obs per predictor per category	PASSED	n = 8558 vs minimum required = 880 (based on 22 estimated parameters excl. intercept)
4	Independence of Observations	Methodological Design	PASSED	Each record represents a unique independent customer
5	No Perfect Separation	Feature range overlap per segment	PASSED	All segments share overlapping feature ranges

Model Fitting

Fit MLR Model

# Reference level "A" was already set via relevel() before the train/test split
mlr_model <- multinom(
  Segmentation ~ Age + Work_Experience + Family_Size +
                 Gender + Ever_Married + Graduated +
                 Profession + Spending_Score + Var_1,
  data  = train_mlr,
  trace = FALSE
)

coef_tbl <- as.data.frame(t(coef(mlr_model)))
coef_tbl <- round(coef_tbl, 4)
coef_tbl$Predictor <- rownames(coef_tbl)
rownames(coef_tbl) <- NULL
class_cols <- setdiff(names(coef_tbl), "Predictor")
coef_tbl <- coef_tbl[, c("Predictor", class_cols)]

knitr::kable(coef_tbl,
             caption = "MLR Coefficients by Class (Reference: Segment A)",
             row.names = FALSE)

MLR Coefficients by Class (Reference: Segment A)
Predictor	B	C	D
(Intercept)	-1.3052	-1.9956	0.4901
Age	0.0114	0.0207	-0.0209
Work_Experience	-0.0239	-0.0204	0.0099
Family_Size	0.0685	0.1993	0.0568
GenderMale	-0.0761	-0.1627	0.1690
Ever_MarriedYes	0.1629	-0.1707	-0.2076
GraduatedYes	0.3421	0.6387	-0.3649
ProfessionDoctor	-0.3137	-0.5403	0.4456
ProfessionEngineer	-0.3017	-1.0852	0.3153
ProfessionEntertainment	-0.5176	-1.0993	0.1477
ProfessionExecutive	-0.0402	-0.2205	0.8483
ProfessionHealthcare	0.0453	0.2588	1.7795
ProfessionHomemaker	-0.0468	-0.6868	0.7724
ProfessionLawyer	-0.6182	-0.9365	1.2276
ProfessionMarketing	-0.3381	-0.3060	1.3538
Spending_ScoreAverage	0.5469	1.3320	-0.2046
Spending_ScoreHigh	0.3751	0.7508	-0.3696
Var_1Cat_2	0.4507	0.0944	-0.4074
Var_1Cat_3	0.1492	-0.2186	-0.3373
Var_1Cat_4	0.1069	-0.7471	-0.2359
Var_1Cat_5	0.1628	-0.0350	-0.1981
Var_1Cat_6	0.2348	0.1083	-0.1977
Var_1Cat_7	0.0411	-0.2476	-0.2082

Coefficients & Odds Ratio

coef_mlr <- coef(mlr_model)
or_mlr   <- exp(coef_mlr)

knitr::kable(round(or_mlr, 4),
             caption = "Odds Ratio - MLR (Reference: Segment A)",
             row.names = TRUE)

Odds Ratio - MLR (Reference: Segment A)
	(Intercept)	Age	Work_Experience	Family_Size	GenderMale	Ever_MarriedYes	GraduatedYes	ProfessionDoctor	ProfessionEngineer	ProfessionEntertainment	ProfessionExecutive	ProfessionHealthcare	ProfessionHomemaker	ProfessionLawyer	ProfessionMarketing	Spending_ScoreAverage	Spending_ScoreHigh	Var_1Cat_2	Var_1Cat_3	Var_1Cat_4	Var_1Cat_5	Var_1Cat_6	Var_1Cat_7
B	0.2711	1.0115	0.9764	1.0709	0.9267	1.1770	1.4079	0.7307	0.7395	0.5960	0.9606	1.0463	0.9543	0.5389	0.7131	1.7279	1.4551	1.5695	1.1610	1.1129	1.1768	1.2647	1.0420
C	0.1359	1.0209	0.9798	1.2206	0.8499	0.8431	1.8940	0.5826	0.3378	0.3331	0.8021	1.2954	0.5032	0.3920	0.7364	3.7884	2.1188	1.0990	0.8036	0.4738	0.9656	1.1144	0.7806
D	1.6325	0.9793	1.0099	1.0585	1.1842	0.8125	0.6943	1.5615	1.3707	1.1592	2.3358	5.9266	2.1650	3.4129	3.8720	0.8149	0.6910	0.6654	0.7137	0.7899	0.8203	0.8206	0.8121

Predictor Significance Test (z-test)

mlr_coef  <- summary(mlr_model)$coefficients
mlr_se    <- summary(mlr_model)$standard.errors
mlr_z     <- mlr_coef / mlr_se
mlr_pval  <- 2 * (1 - pnorm(abs(mlr_z)))

knitr::kable(round(mlr_pval, 4),
             caption = "P-Value Z-Test per Predictor and Class",
             row.names = TRUE)

P-Value Z-Test per Predictor and Class
	(Intercept)	Age	Work_Experience	Family_Size	GenderMale	Ever_MarriedYes	ProfessionDoctor	ProfessionEngineer	ProfessionEntertainment	ProfessionExecutive	ProfessionHealthcare	ProfessionHomemaker	ProfessionMarketing	Spending_ScoreAverage	Spending_ScoreHigh	Var_1Cat_2	Var_1Cat_3	Var_1Cat_4	Var_1Cat_5	Var_1Cat_6	Var_1Cat_7
B	0.0000	1e-04	0.0199	0.0057	0.2724	0.0862	0.0091	0.0094	0.0000	0.7839	0.7470	0.8051	0.1015	0.0000	0.0007	0.1440	0.6073	0.7091	0.6987	0.3965	0.9042
C	0.0000	0e+00	0.0574	0.0000	0.0233	0.1114	0.0000	0.0000	0.0000	0.1406	0.0634	0.0021	0.1463	0.0000	0.0000	0.7585	0.4505	0.0098	0.9339	0.6908	0.4698
D	0.0939	0e+00	0.3061	0.0146	0.0147	0.0257	0.0002	0.0119	0.1851	0.0000	0.0000	0.0000	0.0000	0.0662	0.0033	0.1460	0.1906	0.3507	0.6018	0.4174	0.4976

Pseudo R-Squared (McFadden)

mlr_pseudoR2 <- PseudoR2(mlr_model, which = "McFadden")

pseudoR2_tbl <- data.frame(
  Metric        = "McFadden Pseudo R-Squared",
  Value         = round(mlr_pseudoR2, 4),
  # Thresholds: <0.10 Poor, 0.10–0.20 Acceptable, 0.20–0.40 Good, >0.40 Excellent.
  Interpretation = case_when(
    mlr_pseudoR2 < 0.10 ~ "Poor",
    mlr_pseudoR2 < 0.20 ~ "Acceptable",
    mlr_pseudoR2 < 0.40 ~ "Good / Excellent",
    TRUE                ~ "Excellent / Very High Fit"
  )
)

knitr::kable(pseudoR2_tbl,
             caption = "McFadden Pseudo R-Squared",
             row.names = FALSE)

McFadden Pseudo R-Squared
Metric	Value	Interpretation
McFadden Pseudo R-Squared	0.1333	Acceptable

Overall Model Test (Likelihood Ratio Test)

mlr_null <- multinom(Segmentation ~ 1,
                     data  = train_mlr,
                     trace = FALSE)

lrt_result <- anova(mlr_null, mlr_model, test = "Chisq")

# nnet anova column names may have leading spaces; grep handles this robustly.
df_col  <- grep("Df",  names(lrt_result), value = TRUE)[1]
chi_col <- grep("LR",  names(lrt_result), value = TRUE)[1]
p_col   <- grep("Pr",  names(lrt_result), value = TRUE)[1]

stopifnot(
  "Could not find Df column in LRT output"    = !is.na(df_col),
  "Could not find LR stat column in LRT output" = !is.na(chi_col),
  "Could not find p-value column in LRT output" = !is.na(p_col)
)

lrt_tbl <- data.frame(
  Model      = c("Null (intercept only)", "Full (all predictors)"),
  Resid.df   = lrt_result[["Resid. df"]],
  Resid.Dev  = round(lrt_result[["Resid. Dev"]], 2),
  df         = c(NA, lrt_result[2, df_col]),
  Chi.sq     = c(NA, round(lrt_result[2, chi_col], 2)),
  p.value    = c(NA, round(lrt_result[2, p_col],   4))
)

knitr::kable(lrt_tbl,
             caption = "Likelihood Ratio Test - Null vs Full MLR Model",
             row.names = FALSE,
             na = "")

Likelihood Ratio Test - Null vs Full MLR Model
Model	Resid.df	Resid.Dev	df	Chi.sq	p.value
Null (intercept only)	25671	23647.52	NA	NA	NA
Full (all predictors)	25605	20496.38	66	3151.14	0

cat("\nConclusion:",
    ifelse(lrt_result[2, p_col] < 0.05,
           "SIGNIFICANT - Full model outperforms the null model (p < 0.05)",
           "NOT SIGNIFICANT - No meaningful improvement over the null model"))

## 
## Conclusion: SIGNIFICANT - Full model outperforms the null model (p < 0.05)

Prediction & Evaluation

Prediction on Test Set

# Re-level predicted factor to match training levels for confusionMatrix().
mlr_pred_class <- factor(
  predict(mlr_model, newdata = test_mlr, type = "class"),
  levels = levels(train_mlr$Segmentation)
)
mlr_pred_prob  <- predict(mlr_model, newdata = test_mlr, type = "probs")

mlr_cm <- confusionMatrix(
  data      = mlr_pred_class,
  reference = test_mlr$Segmentation
)

cm_tbl <- as.data.frame(mlr_cm$table)
cm_wide <- tidyr::pivot_wider(cm_tbl,
                               names_from  = Reference,
                               values_from = Freq)

knitr::kable(cm_wide,
             caption = "Confusion Matrix - MLR (Test Set)",
             row.names = FALSE)

Confusion Matrix - MLR (Test Set)
Prediction	A	B	C	D
A	282	181	119	150
B	55	68	55	24
C	108	168	260	53
D	118	64	54	378

Confusion Matrix Visualization - MLR

cm_df <- as.data.frame(mlr_cm$table)

ggplot(cm_df, aes(x = Reference, y = Prediction, fill = Freq)) +
  geom_tile(color = "white", linewidth = 0.8) +
  geom_text(aes(label = Freq,
                color  = Freq > max(Freq) * 0.4),
            size = 5, fontface = "bold") +
  scale_color_manual(values = c("TRUE" = "white", "FALSE" = "#3A3A3A"),
                     guide = "none") +
  scale_fill_gradient(low = "#D6EAF8", high = "#2E86C1") +
  theme_minimal(base_size = 12) +
  theme(panel.grid = element_blank()) +
  labs(title = "Confusion Matrix - MLR (Test Set)",
       x     = "Actual Class",
       y     = "Predicted Class",
       fill  = "Count")

Per-Class Metrics - MLR

mlr_byclass <- as.data.frame(mlr_cm$byClass)
mlr_byclass$Class <- rownames(mlr_byclass)
rownames(mlr_byclass) <- NULL

mlr_metrics_tbl <- mlr_byclass %>%
  dplyr::select(Class, Sensitivity, Specificity, `Pos Pred Value`,
                `Neg Pred Value`, F1) %>%
  mutate(across(where(is.numeric), ~ round(.x, 4)))

knitr::kable(mlr_metrics_tbl,
             caption = "Per-Class Performance Metrics - MLR",
             row.names = FALSE)

Per-Class Performance Metrics - MLR
Class	Sensitivity	Specificity	Pos Pred Value	Neg Pred Value	F1
Class: A	0.5009	0.7141	0.3852	0.8000	0.4355
Class: B	0.1414	0.9191	0.3366	0.7866	0.1991
Class: C	0.5328	0.8005	0.4414	0.8527	0.4828
Class: D	0.6248	0.8460	0.6156	0.8510	0.6202

MLR Overall Performance

mlr_acc      <- round(mlr_cm$overall["Accuracy"] * 100, 2)
mlr_kappa    <- round(mlr_cm$overall["Kappa"],          4)

# Warn if any class has undefined F1 (zero predicted positives)
mlr_na_f1 <- sum(is.na(mlr_byclass$F1))
if (mlr_na_f1 > 0) {
  cat("Note:", mlr_na_f1, "class(es) had undefined F1 (excluded from macro average)\n")
}
mlr_f1_macro <- round(mean(mlr_byclass$F1, na.rm = TRUE), 4)

overall_tbl <- data.frame(
  Metric = c("Accuracy (%)", "Kappa", "Macro-F1"),
  Value  = c(mlr_acc, mlr_kappa, mlr_f1_macro),
  Note   = c(
    ifelse(mlr_acc >= 70, "Good", ifelse(mlr_acc >= 50, "Moderate", "Weak")),
    ifelse(mlr_kappa >= 0.8, "Almost Perfect",
           ifelse(mlr_kappa >= 0.6, "Substantial",
                  ifelse(mlr_kappa >= 0.4, "Moderate",
                         ifelse(mlr_kappa >= 0.2, "Fair", "Poor/Slight")))),
    ifelse(mlr_f1_macro >= 0.6, "Good", ifelse(mlr_f1_macro >= 0.4, "Moderate", "Weak"))
  )
)

knitr::kable(overall_tbl,
             caption = "MLR Overall Performance Metrics",
             row.names = FALSE)

MLR Overall Performance Metrics
Metric	Value	Note
Accuracy (%)	46.2300	Weak
Kappa	0.2775	Fair
Macro-F1	0.4344	Moderate

Posterior Probability Plot - MLR

mlr_post_df <- as.data.frame(mlr_pred_prob)
mlr_post_df$Predicted <- mlr_pred_class
mlr_post_df$Actual    <- test_mlr$Segmentation
mlr_post_df$MaxProb   <- apply(mlr_pred_prob, 1, max)
mlr_post_df$Correct   <- mlr_post_df$Predicted == mlr_post_df$Actual

ggplot(mlr_post_df, aes(x = MaxProb, fill = Correct)) +
  geom_histogram(bins = 30, color = "white", alpha = 0.85) +
  scale_fill_manual(values = c("TRUE"  = "#7EC8C8",
                                "FALSE" = "#F4A7B9"),
                    labels = c("TRUE"  = "Correct",
                                "FALSE" = "Incorrect")) +
  theme_minimal() +
  labs(title    = "Distribution of Maximum Posterior Probability - MLR",
       subtitle = "Higher probability = more confident prediction",
       x    = "Max Posterior Probability",
       y    = "Count",
       fill = "Prediction")

MODEL COMPARISON

Performance Metrics: LDA vs MLR

comparison_df <- data.frame(
  Model    = c("LDA", "MLR"),
  Accuracy = c(lda_acc, mlr_acc),
  Kappa    = c(lda_kappa, mlr_kappa),
  MacroF1  = c(lda_f1_macro, mlr_f1_macro)
)

knitr::kable(comparison_df,
             caption = "Performance Comparison: LDA vs MLR (Test Set)",
             row.names = FALSE)

Performance Comparison: LDA vs MLR (Test Set)
Model	Accuracy	Kappa	MacroF1
LDA	39.96	0.1869	0.3338
MLR	46.23	0.2775	0.4344

comparison_df %>%
  pivot_longer(cols      = c(Accuracy, Kappa, MacroF1),
               names_to  = "Metric",
               values_to = "Value") %>%
  mutate(Metric = factor(Metric, levels = c("Accuracy", "Kappa", "MacroF1"))) %>%
  ggplot(aes(x = Metric, y = Value, fill = Model)) +
  geom_col(position = "dodge", width = 0.6) +
  geom_text(aes(label = round(Value, 3)),
            position = position_dodge(width = 0.6),
            vjust = -0.4, size = 3.5) +
  scale_fill_manual(values = c("LDA" = "#A8C5E0", "MLR" = "#B5D5C5")) +
  theme_minimal() +
  labs(title = "Model Comparison: LDA vs MLR (Test Set)",
       x     = "Metric",
       y     = "Value",
       fill  = "Model")

Summary Table

best_model <- ifelse(mlr_acc >= lda_acc, "MLR", "LDA")

summary_final <- data.frame(
  Metric           = c("Accuracy (%)", "Kappa", "Macro-F1",
                       "Predictors Used", "No. of Features",
                       "Assumptions Checked", "Best Model"),
  LDA              = c(paste0(lda_acc, "%"),
                       as.character(lda_kappa),
                       as.character(lda_f1_macro),
                       "Age, Work_Experience, Family_Size",
                       "3 (numeric only)",
                       "MVN, Homogeneity, No Multicollinearity, Independence",
                       ifelse(best_model == "LDA", "✓", "—")),
  MLR              = c(paste0(mlr_acc, "%"),
                       as.character(mlr_kappa),
                       as.character(mlr_f1_macro),
                       "All 9 Features (numeric + categorical)",
                       "9",
                       "Nominal DV, No Multicollinearity, Sample Adequacy, Independence, No Perfect Separation",
                       ifelse(best_model == "MLR", "✓", "—"))
)

knitr::kable(summary_final,
             caption = "Summary Comparison Table — LDA vs MLR",
             row.names = FALSE)

Summary Comparison Table — LDA vs MLR
Metric	LDA	MLR
Accuracy (%)	39.96%	46.23%
Kappa	0.1869	0.2775
Macro-F1	0.3338	0.4344
Predictors Used	Age, Work_Experience, Family_Size	All 9 Features (numeric + categorical)
No. of Features	3 (numeric only)	9
Assumptions Checked	MVN, Homogeneity, No Multicollinearity, Independence	Nominal DV, No Multicollinearity, Sample Adequacy, Independence, No Perfect Separation
Best Model	—	✓

cat("\nBest model based on Accuracy:", best_model, "\n")

## 
## Best model based on Accuracy: MLR

cat("Accuracy gain of MLR over LDA:", round(mlr_acc - lda_acc, 2), "%\n")

## Accuracy gain of MLR over LDA: 6.27 %

Multivariate Analysis: Linear Discriminant Analysis & Multinomial Logistic Regression

Mylovia Mahesa Ayu (24031554001) Syafira Najema Putri Anisa (24031554013) Rizki Piji Fathoni (24031554029)

May 2026

SETUP

LOAD DATA & OVERVIEW

Load & Merge Data

Structure & Dim

Missing Values Check

EDA

Descriptive Statistics

Target Distribution

Numeric Feature Distribution

Categorical Feature Distribution

Correlation Matrix

Outlier Detection

PREPROCESSING

Handling Missing Values

Encoding Categorical Variables

Feature Selection per Method

Train-Test Split

Post-Preprocessing Verification

DISCRIMINANT ANALYSIS

Assumption Checking

Preprocessing for LDA

Multivariate Normality

Homogeneity

Absence of Multicollinearity

Independence

Summary

Model Fitting

Fit LDA Model

Discriminant Function Coefficients

Proportion of Trace (Variance Explained)

LDA Bi-Plot (LD1 vs LD2)

Prediction & Confusion Matrix — LDA

Per-Class Metrics — LDA

LDA Overall Performance

Posterior Probability Plot — LDA

MULTINOMIAL LOGISTIC REGRESSION (MLR)

Assumption Checking

Nominal Dependent Variable

VIF

Sample Adequacy

Independence

No Perfect Separation

Summary

Model Fitting

Fit MLR Model

Coefficients & Odds Ratio

Predictor Significance Test (z-test)

Pseudo R-Squared (McFadden)

Overall Model Test (Likelihood Ratio Test)

Prediction & Evaluation

Prediction on Test Set

Confusion Matrix Visualization - MLR

Per-Class Metrics - MLR

MLR Overall Performance

Posterior Probability Plot - MLR

MODEL COMPARISON

Performance Metrics: LDA vs MLR

Summary Table

Mylovia Mahesa Ayu (24031554001)
Syafira Najema Putri Anisa (24031554013)
Rizki Piji Fathoni (24031554029)