options(repos = c(CRAN = "https://cran.rstudio.com/"))

1 Instalasi & Load Library

packages <- c("tidyverse", "caret", "MASS", "nnet", "corrplot",
               "MVN", "car", "nortest", "ggplot2", "knitr", "kableExtra",
               "pROC", "reshape2", "MLmetrics")

installed <- packages %in% rownames(installed.packages())
if (any(!installed)) install.packages(packages[!installed])

library(MASS)         
library(nnet)         
library(tidyverse)    
library(caret)        
library(corrplot)     
library(MVN)          
library(car)          
library(nortest)      
library(ggplot2)      
library(knitr)        
library(kableExtra)   
library(pROC)         
library(reshape2)     

select <- dplyr::select
filter <- dplyr::filter

2 Tahap 1 — Load & Eksplorasi Data (EDA)

2.1 Load Dataset

wine_path <- if (file.exists("Wine.csv")) {
  "Wine.csv"
} else {
  file.path(dirname(knitr::current_input(dir = TRUE)), "Wine.csv")
}

if (!file.exists(wine_path)) {
  stop("File Wine.csv tidak ditemukan. Pastikan Wine.csv berada di folder yang sama dengan file .Rmd ini.")
}

df <- read.csv(wine_path)
df$Customer_Segment <- as.factor(df$Customer_Segment)

cat("Dimensi data:", dim(df), "\n")
## Dimensi data: 178 14
cat("Jumlah kelas:", nlevels(df$Customer_Segment), "\n")
## Jumlah kelas: 3
cat("Label kelas:", levels(df$Customer_Segment), "\n")
## Label kelas: 1 2 3

2.2 Struktur Data

str(df)
## 'data.frame':    178 obs. of  14 variables:
##  $ Alcohol             : num  14.2 13.2 13.2 14.4 13.2 ...
##  $ Malic_Acid          : num  1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
##  $ Ash                 : num  2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
##  $ Ash_Alcanity        : num  15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
##  $ Magnesium           : int  127 100 101 113 118 112 96 121 97 98 ...
##  $ Total_Phenols       : num  2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
##  $ Flavanoids          : num  3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
##  $ Nonflavanoid_Phenols: num  0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
##  $ Proanthocyanins     : num  2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
##  $ Color_Intensity     : num  5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
##  $ Hue                 : num  1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
##  $ OD280               : num  3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
##  $ Proline             : int  1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...
##  $ Customer_Segment    : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...

2.3 Cek 6 Baris Pertama

head(df) %>%
  kable(caption = "6 Baris Pertama Dataset Wine") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)
6 Baris Pertama Dataset Wine
Alcohol Malic_Acid Ash Ash_Alcanity Magnesium Total_Phenols Flavanoids Nonflavanoid_Phenols Proanthocyanins Color_Intensity Hue OD280 Proline Customer_Segment
14.23 1.71 2.43 15.6 127 2.80 3.06 0.28 2.29 5.64 1.04 3.92 1065 1
13.20 1.78 2.14 11.2 100 2.65 2.76 0.26 1.28 4.38 1.05 3.40 1050 1
13.16 2.36 2.67 18.6 101 2.80 3.24 0.30 2.81 5.68 1.03 3.17 1185 1
14.37 1.95 2.50 16.8 113 3.85 3.49 0.24 2.18 7.80 0.86 3.45 1480 1
13.24 2.59 2.87 21.0 118 2.80 2.69 0.39 1.82 4.32 1.04 2.93 735 1
14.20 1.76 2.45 15.2 112 3.27 3.39 0.34 1.97 6.75 1.05 2.85 1450 1

2.4 Distribusi Kelas (Customer_Segment)

tabel_kelas <- df %>%
  count(Customer_Segment) %>%
  mutate(Proporsi = round(n / sum(n) * 100, 2))

tabel_kelas %>%
  kable(caption = "Distribusi Kelas Customer_Segment",
        col.names = c("Kelas", "Frekuensi", "Proporsi (%)")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Distribusi Kelas Customer_Segment
Kelas Frekuensi Proporsi (%)
1 59 33.15
2 71 39.89
3 48 26.97
ggplot(df, aes(x = Customer_Segment, fill = Customer_Segment)) +
  geom_bar(width = 0.6, show.legend = FALSE) +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5, size = 4) +
  scale_fill_manual(values = c("#4E79A7", "#F28E2B", "#59A14F")) +
  labs(title = "Distribusi Kelas Customer_Segment", x = "Kelas", y = "Frekuensi") +
  theme_minimal(base_size = 13)

2.5 Statistik Deskriptif

fitur_numerik <- df %>% select(-Customer_Segment)

summary(fitur_numerik) %>%
  kable(caption = "Statistik Deskriptif Fitur Numerik") %>%
  kable_styling(bootstrap_options = c("striped", "condensed"),
                full_width = TRUE, font_size = 11)
Statistik Deskriptif Fitur Numerik
Alcohol Malic_Acid Ash Ash_Alcanity Magnesium Total_Phenols Flavanoids Nonflavanoid_Phenols Proanthocyanins Color_Intensity Hue OD280 Proline
Min. :11.03 Min. :0.740 Min. :1.360 Min. :10.60 Min. : 70.00 Min. :0.980 Min. :0.340 Min. :0.1300 Min. :0.410 Min. : 1.280 Min. :0.4800 Min. :1.270 Min. : 278.0
1st Qu.:12.36 1st Qu.:1.603 1st Qu.:2.210 1st Qu.:17.20 1st Qu.: 88.00 1st Qu.:1.742 1st Qu.:1.205 1st Qu.:0.2700 1st Qu.:1.250 1st Qu.: 3.220 1st Qu.:0.7825 1st Qu.:1.938 1st Qu.: 500.5
Median :13.05 Median :1.865 Median :2.360 Median :19.50 Median : 98.00 Median :2.355 Median :2.135 Median :0.3400 Median :1.555 Median : 4.690 Median :0.9650 Median :2.780 Median : 673.5
Mean :13.00 Mean :2.336 Mean :2.367 Mean :19.49 Mean : 99.74 Mean :2.295 Mean :2.029 Mean :0.3619 Mean :1.591 Mean : 5.058 Mean :0.9574 Mean :2.612 Mean : 746.9
3rd Qu.:13.68 3rd Qu.:3.083 3rd Qu.:2.558 3rd Qu.:21.50 3rd Qu.:107.00 3rd Qu.:2.800 3rd Qu.:2.875 3rd Qu.:0.4375 3rd Qu.:1.950 3rd Qu.: 6.200 3rd Qu.:1.1200 3rd Qu.:3.170 3rd Qu.: 985.0
Max. :14.83 Max. :5.800 Max. :3.230 Max. :30.00 Max. :162.00 Max. :3.880 Max. :5.080 Max. :0.6600 Max. :3.580 Max. :13.000 Max. :1.7100 Max. :4.000 Max. :1680.0

2.6 Boxplot per Fitur

df_long <- df %>%
  pivot_longer(cols = -Customer_Segment, names_to = "Fitur", values_to = "Nilai")

ggplot(df_long, aes(x = Customer_Segment, y = Nilai, fill = Customer_Segment)) +
  geom_boxplot(outlier.colour = "red", outlier.size = 1.5, alpha = 0.8) +
  scale_fill_manual(values = c("#4E79A7", "#F28E2B", "#59A14F")) +
  facet_wrap(~ Fitur, scales = "free_y", ncol = 4) +
  labs(title = "Boxplot Setiap Fitur per Kelas", x = "Kelas", y = "Nilai") +
  theme_minimal(base_size = 11) +
  theme(legend.position = "none")

3 Tahap 2 — Cek Asumsi

3.1 Asumsi 1 — Linearitas Log-Odds (untuk MLR)

df_check <- df %>%
  mutate(kelas_bin = ifelse(Customer_Segment == 1, 1, 0))

ggplot(df_check, aes(x = Alcohol, y = kelas_bin)) +
  geom_point(alpha = 0.4, colour = "#4E79A7") +
  geom_smooth(method = "glm", method.args = list(family = "binomial"),
              se = TRUE, colour = "#E15759") +
  labs(title = "Cek Linearitas Log-Odds: Alcohol vs Kelas 1",
       x = "Alcohol", y = "Probabilitas Kelas 1") +
  theme_minimal(base_size = 13)

3.2 Asumsi 2 — Independensi Observasi

lm_proxy <- lm(as.numeric(Customer_Segment) ~ ., data = df)
dw_result <- car::durbinWatsonTest(lm_proxy)
print(dw_result)
##  lag Autocorrelation D-W Statistic p-value
##    1       0.2000099      1.594091       0
##  Alternative hypothesis: rho != 0
cat("\nInterpretasi:\n")
## 
## Interpretasi:
cat("Nilai DW mendekati 2 → tidak ada autokorelasi (asumsi independensi terpenuhi)\n")
## Nilai DW mendekati 2 → tidak ada autokorelasi (asumsi independensi terpenuhi)

3.3 Asumsi 3 — No Multikolinearitas

vif_values <- car::vif(lm_proxy)
vif_df <- data.frame(
  Fitur = names(vif_values),
  VIF   = round(vif_values, 3)
) %>% arrange(desc(VIF))

vif_df %>%
  mutate(Status = ifelse(VIF > 10, "⚠️ Tinggi", "✅ OK")) %>%
  kable(caption = "Variance Inflation Factor (VIF)") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(which(vif_df$VIF > 10), background = "#FFF3CD")
Variance Inflation Factor (VIF)
Fitur VIF Status
Flavanoids Flavanoids 7.029 ✅ OK |
Total_Phenols Total_Phenols 4.335 ✅ OK |
OD280 OD280 3.785 ✅ OK |
Color_Intensity Color_Intensity 3.026 ✅ OK |
Proline Proline 2.824 ✅ OK |
Hue Hue 2.551 ✅ OK |
Alcohol Alcohol 2.460 ✅ OK |
Ash_Alcanity Ash_Alcanity 2.239 ✅ OK |
Ash Ash 2.185 ✅ OK |
Proanthocyanins Proanthocyanins 1.976 ✅ OK |
Nonflavanoid_Phenols Nonflavanoid_Phenols 1.796 ✅ OK |
Malic_Acid Malic_Acid 1.657 ✅ OK |
Magnesium Magnesium 1.418 ✅ OK |
cat("\nKetentuan: VIF > 10 indikasi multikolinearitas serius\n")
## 
## Ketentuan: VIF > 10 indikasi multikolinearitas serius

3.3.1 Matriks korelasi asumsi 3

cor_matrix <- cor(fitur_numerik)

corrplot(cor_matrix, method = "color", type = "upper",
         tl.cex = 0.8, number.cex = 0.65, addCoef.col = "black",
         title = "Matriks Korelasi Antar Fitur", mar = c(0, 0, 1, 0))

3.4 Asumsi 4 — No Outlier (IQR Method)

deteksi_outlier <- function(x) {
  Q1  <- quantile(x, 0.25)
  Q3  <- quantile(x, 0.75)
  IQR <- Q3 - Q1
  sum(x < (Q1 - 1.5 * IQR) | x > (Q3 + 1.5 * IQR))
}

outlier_summary <- data.frame(
  Fitur          = names(fitur_numerik),
  Jumlah_Outlier = sapply(fitur_numerik, deteksi_outlier)
) %>% arrange(desc(Jumlah_Outlier))

outlier_summary %>%
  mutate(Status = ifelse(Jumlah_Outlier > 0, "⚠️ Ada outlier", "✅ Bersih")) %>%
  kable(caption = "Deteksi Outlier per Fitur (Metode IQR)") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(which(outlier_summary$Jumlah_Outlier > 0), background = "#FFF3CD")
Deteksi Outlier per Fitur (Metode IQR)
Fitur Jumlah_Outlier Status
Ash_Alcanity Ash_Alcanity 4 ⚠️ Ada outlier
Magnesium Magnesium 4 ⚠️ Ada outlier
Color_Intensity Color_Intensity 4 ⚠️ Ada outlier
Malic_Acid Malic_Acid 3 ⚠️ Ada outlier
Ash Ash 3 ⚠️ Ada outlier
Proanthocyanins Proanthocyanins 2 ⚠️ Ada outlier
Hue Hue 1 ⚠️ Ada outlier
Alcohol Alcohol 0 ✅ Bersih |
Total_Phenols Total_Phenols 0 ✅ Bersih |
Flavanoids Flavanoids 0 ✅ Bersih |
Nonflavanoid_Phenols Nonflavanoid_Phenols 0 ✅ Bersih |
OD280 OD280 0 ✅ Bersih |
Proline Proline 0 ✅ Bersih |

3.5 Asumsi 5 — Normalitas (Shapiro-Wilk per Fitur)

shapiro_results <- sapply(fitur_numerik, function(x) shapiro.test(x)$p.value)

normalitas_df <- data.frame(
  Fitur   = names(shapiro_results),
  p_value = round(shapiro_results, 4)
) %>%
  mutate(Status = ifelse(p_value >= 0.05, "✅ Normal", "⚠️ Tidak Normal")) %>%
  arrange(p_value)

normalitas_df %>%
  kable(caption = "Uji Normalitas Shapiro-Wilk per Fitur (α = 0.05)") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(which(normalitas_df$p_value < 0.05), background = "#FFF3CD")
Uji Normalitas Shapiro-Wilk per Fitur (α = 0.05)
Fitur p_value Status
Malic_Acid Malic_Acid 0.0000 ⚠️ Tidak Normal
Magnesium Magnesium 0.0000 ⚠️ Tidak Normal
Flavanoids Flavanoids 0.0000 ⚠️ Tidak Normal
Color_Intensity Color_Intensity 0.0000 ⚠️ Tidak Normal
OD280 OD280 0.0000 ⚠️ Tidak Normal
Proline Proline 0.0000 ⚠️ Tidak Normal
Nonflavanoid_Phenols Nonflavanoid_Phenols 0.0001 ⚠️ Tidak Normal
Total_Phenols Total_Phenols 0.0044 ⚠️ Tidak Normal
Proanthocyanins Proanthocyanins 0.0145 ⚠️ Tidak Normal
Hue Hue 0.0174 ⚠️ Tidak Normal
Alcohol Alcohol 0.0200 ⚠️ Tidak Normal
Ash Ash 0.0387 ⚠️ Tidak Normal
Ash_Alcanity Ash_Alcanity 0.2639 ✅ Normal |
mvn_result <- tryCatch(
  mvn(data = fitur_numerik, mvnTest = "mardia"),
  error = function(e) tryCatch(
    mvn(data = fitur_numerik, test = "mardia"),
    error = function(e2) mvn(data = fitur_numerik)
  )
)

print(mvn_result$multivariateNormality)
## NULL

4 Tahap 3 — Preprocessing

4.1 Pisah X dan y

X <- df %>% select(-Customer_Segment)
y <- df$Customer_Segment

cat("Dimensi X (fitur)  :", dim(X), "\n")
## Dimensi X (fitur)  : 178 13
cat("Panjang y (target) :", length(y), "\n")
## Panjang y (target) : 178
cat("Distribusi y       :\n")
## Distribusi y       :
print(table(y))
## y
##  1  2  3 
## 59 71 48

4.2 Set Seed

set.seed(42)
cat("Seed ditetapkan: 42\n")
## Seed ditetapkan: 42

4.3 Train-Test Split (75:25 Stratified)

train_index <- createDataPartition(y, p = 0.75, list = FALSE)

X_train <- X[train_index, ]
X_test  <- X[-train_index, ]
y_train <- y[train_index]
y_test  <- y[-train_index]

cat("=== Ukuran Data ===\n")
## === Ukuran Data ===
cat("Train:", nrow(X_train), "observasi | Test:", nrow(X_test), "observasi\n\n")
## Train: 135 observasi | Test: 43 observasi
cat("=== Distribusi Kelas — Train ===\n")
## === Distribusi Kelas — Train ===
print(table(y_train))
## y_train
##  1  2  3 
## 45 54 36
cat("\n=== Distribusi Kelas — Test ===\n")
## 
## === Distribusi Kelas — Test ===
print(table(y_test))
## y_test
##  1  2  3 
## 14 17 12
prop_train <- round(prop.table(table(y_train)) * 100, 1)
prop_test  <- round(prop.table(table(y_test))  * 100, 1)

cat("\nProporsi Train (%):\n")
## 
## Proporsi Train (%):
print(prop_train)
## y_train
##    1    2    3 
## 33.3 40.0 26.7
cat("\nProporsi Test  (%):\n")
## 
## Proporsi Test  (%):
print(prop_test)
## y_test
##    1    2    3 
## 32.6 39.5 27.9

4.4 Standarisasi (Z-score, fit pada Train saja)

preproc <- preProcess(X_train, method = c("center", "scale"))

X_train_scaled <- predict(preproc, X_train)
X_test_scaled  <- predict(preproc, X_test)

mean_check <- round(colMeans(X_train_scaled), 3)
sd_check   <- round(apply(X_train_scaled, 2, sd), 3)

cat("=== Verifikasi Standarisasi (Train) ===\n")
## === Verifikasi Standarisasi (Train) ===
verif_df <- data.frame(Fitur = names(mean_check), Mean = mean_check, SD = sd_check)

verif_df %>%
  kable(caption = "Mean dan SD Setelah Standarisasi (Train)") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE, font_size = 11)
Mean dan SD Setelah Standarisasi (Train)
Fitur Mean SD
Alcohol Alcohol 0 1
Malic_Acid Malic_Acid 0 1
Ash Ash 0 1
Ash_Alcanity Ash_Alcanity 0 1
Magnesium Magnesium 0 1
Total_Phenols Total_Phenols 0 1
Flavanoids Flavanoids 0 1
Nonflavanoid_Phenols Nonflavanoid_Phenols 0 1
Proanthocyanins Proanthocyanins 0 1
Color_Intensity Color_Intensity 0 1
Hue Hue 0 1
OD280 OD280 0 1
Proline Proline 0 1

4.5 Gabungkan X dan y untuk Modeling

train_data <- cbind(X_train_scaled, Customer_Segment = y_train)
test_data  <- cbind(X_test_scaled,  Customer_Segment = y_test)

cat("Data train siap:", nrow(train_data), "baris,", ncol(train_data), "kolom\n")
## Data train siap: 135 baris, 14 kolom
cat("Data test  siap:", nrow(test_data),  "baris,", ncol(test_data),  "kolom\n")
## Data test  siap: 43 baris, 14 kolom
cat("\nPreprocessing selesai. Lanjut ke Tahap 4: Fitting Model MLR & LDA.\n")
## 
## Preprocessing selesai. Lanjut ke Tahap 4: Fitting Model MLR & LDA.

5 Tahap 4A — Linear Discriminant Analysis (LDA)

5.1 Fit Model LDA

lda_model <- lda(Customer_Segment ~ ., data = train_data)
cat("=== Model LDA berhasil di-fit ===\n\n")
## === Model LDA berhasil di-fit ===
print(lda_model)
## Call:
## lda(Customer_Segment ~ ., data = train_data)
## 
## Prior probabilities of groups:
##         1         2         3 
## 0.3333333 0.4000000 0.2666667 
## 
## Group means:
##      Alcohol Malic_Acid        Ash Ash_Alcanity   Magnesium Total_Phenols
## 1  0.8614216 -0.2652798  0.2906065   -0.7350033  0.51816969    0.89199963
## 2 -0.8807290 -0.4160982 -0.3652704    0.2438351 -0.41017334   -0.08453745
## 3  0.2443165  0.9557471  0.1846475    0.5530014 -0.03245211   -0.98819336
##    Flavanoids Nonflavanoid_Phenols Proanthocyanins Color_Intensity        Hue
## 1  0.93641652          -0.65222026      0.56809014       0.1834782  0.4056861
## 2  0.06768397          -0.02287984      0.01030215      -0.8350753  0.4692576
## 3 -1.27204661           0.84959508     -0.72556590       1.0232653 -1.2109941
##        OD280    Proline
## 1  0.8361276  1.1462003
## 2  0.1747559 -0.7311564
## 3 -1.3072932 -0.3360158
## 
## Coefficients of linear discriminants:
##                              LD1         LD2
## Alcohol              -0.37406517  0.67904416
## Malic_Acid            0.19365178  0.36305408
## Ash                  -0.15212531  0.53979962
## Ash_Alcanity          0.44514061 -0.40193781
## Magnesium            -0.05641899  0.02241295
## Total_Phenols         0.46830400  0.06211205
## Flavanoids           -1.72429984 -0.57240210
## Nonflavanoid_Phenols -0.13439089 -0.10721773
## Proanthocyanins       0.10104462 -0.18060995
## Color_Intensity       0.83819227  0.61129435
## Hue                  -0.15989148 -0.44033959
## OD280                -0.84166138  0.22604124
## Proline              -0.88766530  0.93254363
## 
## Proportion of trace:
##    LD1    LD2 
## 0.6895 0.3105

5.2 Komponen Diskriminan — Explained Variance Ratio (LD1 & LD2)

# Proporsi varian yang dijelaskan tiap komponen diskriminan
prop_var <- lda_model$svd^2 / sum(lda_model$svd^2)
prop_var_df <- data.frame(
  Komponen       = paste0("LD", seq_along(prop_var)),
  Eigenvalue     = round(lda_model$svd^2, 4),
  Proporsi       = round(prop_var * 100, 2),
  Kumulatif_Pct  = round(cumsum(prop_var) * 100, 2)
)

prop_var_df %>%
  kable(caption = "Explained Variance Ratio — Komponen Diskriminan LDA",
        col.names = c("Komponen", "Eigenvalue", "Proporsi (%)", "Kumulatif (%)")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Explained Variance Ratio — Komponen Diskriminan LDA
Komponen Eigenvalue Proporsi (%) Kumulatif (%)
LD1 602.9522 68.95 68.95
LD2 271.5138 31.05 100.00
cat(sprintf("\nLD1 menjelaskan %.1f%% dan LD2 menjelaskan %.1f%% variansi antarkelas.\n",
            prop_var_df$Proporsi[1], prop_var_df$Proporsi[2]))
## 
## LD1 menjelaskan 69.0% dan LD2 menjelaskan 31.1% variansi antarkelas.

5.3 Proyeksi LDA — Scatter Plot LD1 vs LD2

# Proyeksi data train ke ruang diskriminan
lda_proj_train <- predict(lda_model, X_train_scaled)
lda_proj_test  <- predict(lda_model, X_test_scaled)

df_proj <- data.frame(
  LD1   = lda_proj_train$x[, 1],
  LD2   = lda_proj_train$x[, 2],
  Kelas = y_train
)

ggplot(df_proj, aes(x = LD1, y = LD2, colour = Kelas, shape = Kelas)) +
  geom_point(size = 3, alpha = 0.85) +
  stat_ellipse(level = 0.90, linewidth = 0.8, linetype = "dashed") +
  scale_colour_manual(values = c("#4E79A7", "#F28E2B", "#59A14F"),
                      name = "Customer\nSegment") +
  scale_shape_manual(values = c(16, 17, 15), name = "Customer\nSegment") +
  labs(title = "Proyeksi LDA: LD1 vs LD2 (Data Train)",
       subtitle = sprintf("LD1 = %.1f%%  |  LD2 = %.1f%%  variansi antarkelas",
                          prop_var_df$Proporsi[1], prop_var_df$Proporsi[2]),
       x = "LD1", y = "LD2") +
  theme_minimal(base_size = 13) +
  theme(legend.position = "right")

5.4 Koefisien Diskriminan — Kontribusi Fitur

# Koefisien (loading) tiap fitur pada LD1 dan LD2
coef_df <- as.data.frame(lda_model$scaling) %>%
  rownames_to_column("Fitur") %>%
  mutate(across(where(is.numeric), ~ round(., 4))) %>%
  mutate(Abs_LD1 = abs(LD1)) %>%
  arrange(desc(Abs_LD1)) %>%
  select(-Abs_LD1)

coef_df %>%
  kable(caption = "Koefisien Diskriminan LDA (Kontribusi Fitur per Komponen)") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(1:3, background = "#E8F4FD")   # highlight 3 fitur teratas
Koefisien Diskriminan LDA (Kontribusi Fitur per Komponen)
Fitur LD1 LD2
Flavanoids -1.7243 -0.5724
Proline -0.8877 0.9325
OD280 -0.8417 0.2260
Color_Intensity 0.8382 0.6113
Total_Phenols 0.4683 0.0621
Ash_Alcanity 0.4451 -0.4019
Alcohol -0.3741 0.6790
Malic_Acid 0.1937 0.3631
Hue -0.1599 -0.4403
Ash -0.1521 0.5398
Nonflavanoid_Phenols -0.1344 -0.1072
Proanthocyanins 0.1010 -0.1806
Magnesium -0.0564 0.0224
cat("\nFitur dengan kontribusi terbesar pada LD1:\n")
## 
## Fitur dengan kontribusi terbesar pada LD1:
cat(paste(head(coef_df$Fitur, 3), collapse = ", "), "\n")
## Flavanoids, Proline, OD280

6 Tahap 4B — Multinomial Logistic Regression (MLR)

6.1 Fit Model MLR

# Referensi kategori: kelas 1 (baseline)
train_data$Customer_Segment <- relevel(train_data$Customer_Segment, ref = "1")

mlr_model <- multinom(Customer_Segment ~ ., data = train_data, trace = FALSE)

cat("=== Model MLR berhasil di-fit ===\n\n")
## === Model MLR berhasil di-fit ===
summary(mlr_model)
## Call:
## multinom(formula = Customer_Segment ~ ., data = train_data, trace = FALSE)
## 
## Coefficients:
##   (Intercept)    Alcohol Malic_Acid        Ash Ash_Alcanity  Magnesium
## 2   -1.156250 -11.881344 -3.2514642 -10.016165     9.321990 -0.5202924
## 3   -5.222332  -2.068507  0.9017964   4.014963     4.038532  0.9348684
##   Total_Phenols Flavanoids Nonflavanoid_Phenols Proanthocyanins Color_Intensity
## 2     -2.097594  -1.927518             5.725801       0.9563483       -12.23829
## 3     -4.884002 -12.655078             1.969334      -2.6212234        12.57532
##          Hue     OD280     Proline
## 2   2.968839 -4.645503 -16.6173993
## 3 -10.354985 -9.819660   0.3185086
## 
## Std. Errors:
##   (Intercept)   Alcohol Malic_Acid       Ash Ash_Alcanity Magnesium
## 2    18353.62  4649.292   8035.895  8527.676     15182.19  3027.783
## 3    32757.73 40833.784  12634.577 52217.100     30516.45 78036.250
##   Total_Phenols Flavanoids Nonflavanoid_Phenols Proanthocyanins Color_Intensity
## 2      7274.864   7709.677             6011.432        2625.791        20202.27
## 3     52549.791  61596.183            39871.074       59249.474        44300.84
##        Hue     OD280  Proline
## 2 11248.28  9895.901 11640.20
## 3 33777.78 58599.008 18731.91
## 
## Residual Deviance: 0.0001872249 
## AIC: 56.00019

6.2 Koefisien β (Log Odds)

coef_mlr <- summary(mlr_model)$coefficients
coef_mlr_df <- as.data.frame(t(coef_mlr)) %>%
  rownames_to_column("Fitur") %>%
  rename_with(~ paste0("LogOdds_Kelas", .), -Fitur)

coef_mlr_df %>%
  mutate(across(where(is.numeric), ~ round(., 4))) %>%
  kable(caption = "Koefisien β (Log Odds) — MLR per Kategori vs Baseline (Kelas 1)") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = TRUE, font_size = 11)
Koefisien β (Log Odds) — MLR per Kategori vs Baseline (Kelas 1)
Fitur LogOdds_Kelas2 LogOdds_Kelas3
(Intercept) -1.1563 -5.2223
Alcohol -11.8813 -2.0685
Malic_Acid -3.2515 0.9018
Ash -10.0162 4.0150
Ash_Alcanity 9.3220 4.0385
Magnesium -0.5203 0.9349
Total_Phenols -2.0976 -4.8840
Flavanoids -1.9275 -12.6551
Nonflavanoid_Phenols 5.7258 1.9693
Proanthocyanins 0.9563 -2.6212
Color_Intensity -12.2383 12.5753
Hue 2.9688 -10.3550
OD280 -4.6455 -9.8197
Proline -16.6174 0.3185

6.3 Relative Risk Ratio (RRR)

# RRR = exp(β)
rrr_mlr <- exp(coef_mlr)
rrr_df  <- as.data.frame(t(rrr_mlr)) %>%
  rownames_to_column("Fitur") %>%
  rename_with(~ paste0("RRR_Kelas", .), -Fitur)

rrr_df %>%
  mutate(across(where(is.numeric), ~ round(., 4))) %>%
  kable(caption = "Relative Risk Ratio (RRR = exp(β)) — MLR") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Relative Risk Ratio (RRR = exp(β)) — MLR
Fitur RRR_Kelas2 RRR_Kelas3
(Intercept) 0.3147 0.0054
Alcohol 0.0000 0.1264
Malic_Acid 0.0387 2.4640
Ash 0.0000 55.4213
Ash_Alcanity 11181.2158 56.7430
Magnesium 0.5943 2.5469
Total_Phenols 0.1228 0.0076
Flavanoids 0.1455 0.0000
Nonflavanoid_Phenols 306.6788 7.1659
Proanthocyanins 2.6022 0.0727
Color_Intensity 0.0000 289329.4550
Hue 19.4693 0.0000
OD280 0.0096 0.0001
Proline 0.0000 1.3751
# Bar chart RRR
rrr_long <- rrr_df %>%
  filter(Fitur != "(Intercept)") %>%
  pivot_longer(-Fitur, names_to = "Kelas", values_to = "RRR")

ggplot(rrr_long, aes(x = reorder(Fitur, RRR), y = RRR, fill = Kelas)) +
  geom_bar(stat = "identity", position = "dodge", alpha = 0.85) +
  geom_hline(yintercept = 1, linetype = "dashed", colour = "grey40") +
  scale_fill_manual(values = c("#F28E2B", "#59A14F")) +
  coord_flip() +
  labs(title = "Relative Risk Ratio (RRR) per Fitur — MLR",
       subtitle = "Garis putus-putus = RRR 1 (tidak ada efek vs baseline)",
       x = "Fitur", y = "RRR") +
  theme_minimal(base_size = 12)

6.4 Uji Signifikansi — Uji Wald (Serentak & Parsial)

# Uji Wald parsial: z = koef / SE, p-value = 2 * (1 - Φ(|z|))
se_mlr  <- summary(mlr_model)$standard.errors
z_stat  <- coef_mlr / se_mlr
p_wald  <- 2 * (1 - pnorm(abs(z_stat)))

wald_df <- as.data.frame(t(p_wald)) %>%
  rownames_to_column("Fitur") %>%
  rename_with(~ paste0("p_Kelas", .), -Fitur) %>%
  mutate(across(where(is.numeric), ~ round(., 4)))

wald_df %>%
  kable(caption = "P-value Uji Wald Parsial per Fitur (α = 0.05)") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(which(apply(wald_df[, -1], 1, function(r) any(r < 0.05))),
           background = "#D5F5E3")
P-value Uji Wald Parsial per Fitur (α = 0.05)
Fitur p_Kelas2 p_Kelas3
(Intercept) 0.9999 0.9999
Alcohol 0.9980 1.0000
Malic_Acid 0.9997 0.9999
Ash 0.9991 0.9999
Ash_Alcanity 0.9995 0.9999
Magnesium 0.9999 1.0000
Total_Phenols 0.9998 0.9999
Flavanoids 0.9998 0.9998
Nonflavanoid_Phenols 0.9992 1.0000
Proanthocyanins 0.9997 1.0000
Color_Intensity 0.9995 0.9998
Hue 0.9998 0.9998
OD280 0.9996 0.9999
Proline 0.9989 1.0000
cat("\nFitur signifikan (p < 0.05 minimal satu kelas):\n")
## 
## Fitur signifikan (p < 0.05 minimal satu kelas):
sig_fitur <- wald_df$Fitur[apply(wald_df[, -1], 1,
                                 function(r) any(r < 0.05, na.rm = TRUE))]
cat(paste(sig_fitur, collapse = ", "), "\n")

7 Tahap 5 — Evaluasi & Pengujian Model

7.1 Prediksi pada Data Test

# LDA
pred_lda  <- predict(lda_model, X_test_scaled)$class

# MLR
pred_mlr  <- predict(mlr_model, X_test_scaled)

cat("Distribusi prediksi LDA:", table(pred_lda), "\n")
## Distribusi prediksi LDA: 14 17 12
cat("Distribusi prediksi MLR:", table(pred_mlr), "\n")
## Distribusi prediksi MLR: 14 17 12

7.2 Confusion Matrix — LDA

cm_lda <- confusionMatrix(pred_lda, y_test)
print(cm_lda)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2  3
##          1 14  0  0
##          2  0 17  0
##          3  0  0 12
## 
## Overall Statistics
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9178, 1)
##     No Information Rate : 0.3953     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3
## Sensitivity            1.0000   1.0000   1.0000
## Specificity            1.0000   1.0000   1.0000
## Pos Pred Value         1.0000   1.0000   1.0000
## Neg Pred Value         1.0000   1.0000   1.0000
## Prevalence             0.3256   0.3953   0.2791
## Detection Rate         0.3256   0.3953   0.2791
## Detection Prevalence   0.3256   0.3953   0.2791
## Balanced Accuracy      1.0000   1.0000   1.0000
# Heatmap Confusion Matrix LDA
cm_lda_df <- as.data.frame(cm_lda$table)
colnames(cm_lda_df) <- c("Prediksi", "Aktual", "Frekuensi")

ggplot(cm_lda_df, aes(x = Aktual, y = Prediksi, fill = Frekuensi)) +
  geom_tile(colour = "white") +
  geom_text(aes(label = Frekuensi), size = 6, fontface = "bold") +
  scale_fill_gradient(low = "#EBF5FB", high = "#1A5276") +
  labs(title = "Confusion Matrix — LDA (Test Set)",
       x = "Aktual", y = "Prediksi") +
  theme_minimal(base_size = 13)

7.3 Confusion Matrix — MLR

pred_mlr_fac <- factor(pred_mlr, levels = levels(y_test))
cm_mlr <- confusionMatrix(pred_mlr_fac, y_test)
print(cm_mlr)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2  3
##          1 14  0  0
##          2  0 17  0
##          3  0  0 12
## 
## Overall Statistics
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9178, 1)
##     No Information Rate : 0.3953     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3
## Sensitivity            1.0000   1.0000   1.0000
## Specificity            1.0000   1.0000   1.0000
## Pos Pred Value         1.0000   1.0000   1.0000
## Neg Pred Value         1.0000   1.0000   1.0000
## Prevalence             0.3256   0.3953   0.2791
## Detection Rate         0.3256   0.3953   0.2791
## Detection Prevalence   0.3256   0.3953   0.2791
## Balanced Accuracy      1.0000   1.0000   1.0000
# Heatmap Confusion Matrix MLR
cm_mlr_df <- as.data.frame(cm_mlr$table)
colnames(cm_mlr_df) <- c("Prediksi", "Aktual", "Frekuensi")

ggplot(cm_mlr_df, aes(x = Aktual, y = Prediksi, fill = Frekuensi)) +
  geom_tile(colour = "white") +
  geom_text(aes(label = Frekuensi), size = 6, fontface = "bold") +
  scale_fill_gradient(low = "#FDFEFE", high = "#7D6608") +
  labs(title = "Confusion Matrix — MLR (Test Set)",
       x = "Aktual", y = "Prediksi") +
  theme_minimal(base_size = 13)

7.4 Akurasi & F1 (Macro/Weighted Average)

# Fungsi hitung F1 macro dan weighted
get_metrics <- function(cm_obj, model_name) {
  stats    <- cm_obj$byClass
  support  <- rowSums(cm_obj$table)     # jumlah observasi per kelas (aktual)
  f1_per   <- 2 * stats[, "Sensitivity"] * stats[, "Pos Pred Value"] /
              (stats[, "Sensitivity"] + stats[, "Pos Pred Value"])
  f1_macro    <- mean(f1_per, na.rm = TRUE)
  f1_weighted <- weighted.mean(f1_per, w = support, na.rm = TRUE)
  acc <- cm_obj$overall["Accuracy"]
  data.frame(Model = model_name,
             Akurasi        = round(acc, 4),
             F1_Macro       = round(f1_macro, 4),
             F1_Weighted    = round(f1_weighted, 4))
}

metrics_lda <- get_metrics(cm_lda, "LDA")
metrics_mlr <- get_metrics(cm_mlr, "MLR")

metrics_all <- bind_rows(metrics_lda, metrics_mlr)

metrics_all %>%
  kable(caption = "Ringkasan Akurasi & F1 — LDA vs MLR (Test Set)",
        col.names = c("Model", "Akurasi", "F1 Macro", "F1 Weighted")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(which.max(metrics_all$Akurasi), background = "#D5F5E3")
Ringkasan Akurasi & F1 — LDA vs MLR (Test Set)
Model Akurasi F1 Macro F1 Weighted
Accuracy…1 LDA 1 1 1
Accuracy…2 MLR 1 1 1
install.packages("MLmetrics")
## package 'MLmetrics' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Asus\AppData\Local\Temp\Rtmpgpf847\downloaded_packages

7.5 CV 9-Fold — Cross Validation Score

set.seed(42)
ctrl <- trainControl(method = "cv", number = 9,
                     classProbs = TRUE,
                     summaryFunction = multiClassSummary,
                     savePredictions = "final")

# Caret mensyaratkan level faktor berupa valid R variable name
# Ubah "1","2","3" → "Seg1","Seg2","Seg3"
y_train_cv <- factor(paste0("Seg", y_train),
                     levels = paste0("Seg", levels(y_train)))

# Data gabung untuk caret train
train_cv <- cbind(X_train_scaled, Customer_Segment = y_train_cv)

# CV LDA
cv_lda <- train(Customer_Segment ~ .,
                data    = train_cv,
                method  = "lda",
                trControl = ctrl,
                metric  = "Accuracy")

# CV MLR
cv_mlr <- train(Customer_Segment ~ .,
                data    = train_cv,
                method  = "multinom",
                trControl = ctrl,
                metric  = "Accuracy",
                trace   = FALSE)

cv_summary <- data.frame(
  Model       = c("LDA", "MLR"),
  CV_Accuracy_Mean = c(
    round(mean(cv_lda$resample$Accuracy), 4),
    round(mean(cv_mlr$resample$Accuracy), 4)
  ),
  CV_Accuracy_SD   = c(
    round(sd(cv_lda$resample$Accuracy), 4),
    round(sd(cv_mlr$resample$Accuracy), 4)
  )
)

cv_summary %>%
  kable(caption = "Cross Validation 9-Fold — Akurasi LDA vs MLR",
        col.names = c("Model", "Mean Accuracy", "SD Accuracy")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Cross Validation 9-Fold — Akurasi LDA vs MLR
Model Mean Accuracy SD Accuracy
LDA 0.9778 0.0333
MLR 0.9704 0.0351

7.6 ROC-AUC (OvR per Kelas)

# Probabilitas dari LDA dan MLR pada test set
prob_lda <- predict(lda_model, X_test_scaled)$posterior
prob_mlr <- predict(mlr_model, X_test_scaled, type = "probs")

kelas_list <- levels(y_test)
auc_results <- data.frame(
  Kelas   = character(),
  AUC_LDA = numeric(),
  AUC_MLR = numeric()
)

par(mfrow = c(1, 3))
for (k in kelas_list) {
  y_bin  <- ifelse(y_test == k, 1, 0)
  roc_lda_k <- roc(y_bin, prob_lda[, k], quiet = TRUE)
  roc_mlr_k <- roc(y_bin, prob_mlr[, k], quiet = TRUE)

  auc_results <- rbind(auc_results,
    data.frame(Kelas   = k,
               AUC_LDA = round(auc(roc_lda_k), 4),
               AUC_MLR = round(auc(roc_mlr_k), 4)))

  plot(roc_lda_k, col = "#4E79A7", main = paste("ROC — Kelas", k),
       lwd = 2, legacy.axes = TRUE)
  plot(roc_mlr_k, col = "#F28E2B", add = TRUE, lwd = 2, lty = 2)
  legend("bottomright",
         legend = c(sprintf("LDA (AUC=%.3f)", auc(roc_lda_k)),
                    sprintf("MLR (AUC=%.3f)", auc(roc_mlr_k))),
         col = c("#4E79A7", "#F28E2B"), lty = c(1, 2), lwd = 2, cex = 0.85)
}

par(mfrow = c(1, 1))

auc_results %>%
  kable(caption = "AUC per Kelas (OvR) — LDA vs MLR") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
AUC per Kelas (OvR) — LDA vs MLR
Kelas AUC_LDA AUC_MLR
1 1 1
2 1 1
3 1 1

8 Tahap 6 — Visualisasi Hasil

8.1 LDA Scatter Plot (LD1 vs LD2) — Train & Test

df_train_proj <- data.frame(
  LD1   = lda_proj_train$x[, 1],
  LD2   = lda_proj_train$x[, 2],
  Kelas = y_train,
  Set   = "Train"
)
df_test_proj <- data.frame(
  LD1   = lda_proj_test$x[, 1],
  LD2   = lda_proj_test$x[, 2],
  Kelas = y_test,
  Set   = "Test"
)
df_all_proj <- bind_rows(df_train_proj, df_test_proj)

ggplot(df_all_proj, aes(x = LD1, y = LD2, colour = Kelas, shape = Set)) +
  geom_point(size = 3, alpha = 0.8) +
  stat_ellipse(data = df_train_proj, aes(colour = Kelas),
               level = 0.90, linewidth = 0.7, linetype = "dashed") +
  scale_colour_manual(values = c("#4E79A7", "#F28E2B", "#59A14F"),
                      name = "Customer\nSegment") +
  scale_shape_manual(values = c(16, 4), name = "Dataset") +
  labs(title = "LDA Scatter Plot — LD1 vs LD2",
       subtitle = "Titik bulat = Train | Tanda silang = Test | Elips = batas 90% CI",
       x = paste0("LD1 (", prop_var_df$Proporsi[1], "%)"),
       y = paste0("LD2 (", prop_var_df$Proporsi[2], "%)")) +
  theme_minimal(base_size = 13)

8.2 RRR Bar Chart per Fitur (MLR)

rrr_long_plot <- rrr_df %>%
  filter(Fitur != "(Intercept)") %>%
  pivot_longer(-Fitur, names_to = "Kelas", values_to = "RRR") %>%
  mutate(Kelas = gsub("RRR_Kelas", "Kelas ", Kelas))

ggplot(rrr_long_plot, aes(x = reorder(Fitur, -RRR), y = RRR, fill = Kelas)) +
  geom_bar(stat = "identity", position = "dodge", alpha = 0.85, width = 0.7) +
  geom_hline(yintercept = 1, linetype = "dashed", colour = "grey30", linewidth = 0.7) +
  scale_fill_manual(values = c("#F28E2B", "#59A14F")) +
  labs(title = "Relative Risk Ratio (RRR) per Fitur — Multinomial Logit",
       subtitle = "Baseline = Kelas 1 | RRR > 1: risiko naik | RRR < 1: risiko turun",
       x = "Fitur", y = "RRR") +
  theme_minimal(base_size = 12) +
  theme(axis.text.x = element_text(angle = 35, hjust = 1))

8.3 Heatmap Koefisien MLR (β per Fitur × Kelas)

coef_long <- as.data.frame(t(coef_mlr)) %>%
  rownames_to_column("Fitur") %>%
  filter(Fitur != "(Intercept)") %>%
  pivot_longer(-Fitur, names_to = "Kelas", values_to = "Beta") %>%
  mutate(Kelas = paste("Kelas", Kelas))

ggplot(coef_long, aes(x = Kelas, y = Fitur, fill = Beta)) +
  geom_tile(colour = "white") +
  geom_text(aes(label = round(Beta, 2)), size = 3.5) +
  scale_fill_gradient2(low = "#C0392B", mid = "white", high = "#1A5276",
                       midpoint = 0, name = "β") +
  labs(title = "Heatmap Koefisien MLR (β) per Fitur & Kelas",
       subtitle = "Merah = efek negatif | Biru = efek positif vs baseline (Kelas 1)",
       x = "Kelas (vs Baseline Kelas 1)", y = "Fitur") +
  theme_minimal(base_size = 12) +
  theme(axis.text.x = element_text(size = 11))

8.4 Confusion Matrix — Heatmap Keduanya

# Normalisasi per baris (proporsi prediksi benar per kelas aktual)
cm_lda_norm <- as.data.frame(cm_lda$table)
colnames(cm_lda_norm) <- c("Prediksi", "Aktual", "Frekuensi")
cm_lda_norm$Model <- "LDA"

cm_mlr_norm <- as.data.frame(cm_mlr$table)
colnames(cm_mlr_norm) <- c("Prediksi", "Aktual", "Frekuensi")
cm_mlr_norm$Model <- "MLR"

cm_both <- bind_rows(cm_lda_norm, cm_mlr_norm)

ggplot(cm_both, aes(x = Aktual, y = Prediksi, fill = Frekuensi)) +
  geom_tile(colour = "white") +
  geom_text(aes(label = Frekuensi), size = 6, fontface = "bold") +
  scale_fill_gradient(low = "#FDFEFE", high = "#1A5276") +
  facet_wrap(~ Model) +
  labs(title = "Confusion Matrix — LDA vs MLR (Test Set)",
       x = "Aktual", y = "Prediksi") +
  theme_minimal(base_size = 13)

9 Tahap 7 — Interpretasi Model

9.1 Interpretasi LDA

9.1.1 Koefisien Diskriminan

cat("=================================================================\n")
## =================================================================
cat("          INTERPRETASI MODEL LDA\n")
##           INTERPRETASI MODEL LDA
cat("=================================================================\n\n")
## =================================================================
cat("1. KOMPONEN DISKRIMINAN\n")
## 1. KOMPONEN DISKRIMINAN
cat("   LD1 menjelaskan", prop_var_df$Proporsi[1], "% variansi antarkelas.\n")
##    LD1 menjelaskan 68.95 % variansi antarkelas.
cat("   LD2 menjelaskan", prop_var_df$Proporsi[2], "% variansi antarkelas.\n")
##    LD2 menjelaskan 31.05 % variansi antarkelas.
cat("   → Total:", prop_var_df$Kumulatif_Pct[2], "% variansi dapat dijelaskan.\n\n")
##    → Total: 100 % variansi dapat dijelaskan.
cat("2. FITUR TERPENTING (loading |LD1| tertinggi):\n")
## 2. FITUR TERPENTING (loading |LD1| tertinggi):
top3 <- head(coef_df$Fitur, 3)
for (i in seq_along(top3)) {
  ld1_val <- coef_df$LD1[coef_df$Fitur == top3[i]]
  cat(sprintf("   %d. %s (LD1 = %.4f)\n", i, top3[i], ld1_val))
}
##    1. Flavanoids (LD1 = -1.7243)
##    2. Proline (LD1 = -0.8877)
##    3. OD280 (LD1 = -0.8417)

9.1.2 Marginal Effects — Perubahan Probabilitas per Fitur

baseline_post <- colMeans(predict(lda_model, X_test_scaled)$posterior)

delta_df <- lapply(names(X_test_scaled), function(feat) {
  X_shifted          <- X_test_scaled
  X_shifted[[feat]]  <- X_shifted[[feat]] + 1   # naikkan 1 SD
  post_shifted       <- colMeans(predict(lda_model, X_shifted)$posterior)
  delta              <- post_shifted - baseline_post
  data.frame(Fitur = feat,
             Kelas = names(delta),
             Delta_Prob = round(delta, 4))
}) %>% bind_rows()

ggplot(delta_df, aes(x = reorder(Fitur, abs(Delta_Prob)), y = Delta_Prob, fill = Kelas)) +
  geom_bar(stat = "identity", position = "dodge", alpha = 0.85) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  scale_fill_manual(values = c("#4E79A7", "#F28E2B", "#59A14F")) +
  coord_flip() +
  labs(title = "Marginal Effects LDA — Perubahan Probabilitas per Kelas",
       subtitle = "Efek kenaikan 1 SD setiap fitur terhadap probabilitas posterior",
       x = "Fitur", y = "ΔProb") +
  theme_minimal(base_size = 12)

9.2 Interpretasi MLR

cat("sig_fitur:", sig_fitur, "\n")
## sig_fitur:
cat("ncol sig_coef:", ncol(coef_mlr[, sig_fitur, drop=FALSE]), "\n")
## ncol sig_coef: 0

9.2.1 Log Odds & Uji Parsial Wald

cat("          INTERPRETASI MODEL MLR\n")
##           INTERPRETASI MODEL MLR
cat("1. LOG ODDS vs BASELINE (Kelas 1):\n")
## 1. LOG ODDS vs BASELINE (Kelas 1):
cat("   Koefisien positif → probabilitas kelas tersebut relatif lebih\n")
##    Koefisien positif → probabilitas kelas tersebut relatif lebih
cat("   tinggi dibanding kelas 1 saat fitur meningkat.\n\n")
##    tinggi dibanding kelas 1 saat fitur meningkat.
cat("2. RELATIVE RISK RATIO (RRR = exp(β)):\n")
## 2. RELATIVE RISK RATIO (RRR = exp(β)):
cat("   RRR > 1 → fitur meningkatkan risiko relatif masuk kelas tsb.\n")
##    RRR > 1 → fitur meningkatkan risiko relatif masuk kelas tsb.
cat("   RRR < 1 → fitur menurunkan risiko relatif masuk kelas tsb.\n\n")
##    RRR < 1 → fitur menurunkan risiko relatif masuk kelas tsb.
cat("3. UJI WALD PARSIAL:\n")
## 3. UJI WALD PARSIAL:
cat("   Fitur signifikan (p < 0.05) minimal satu kelas:\n")
##    Fitur signifikan (p < 0.05) minimal satu kelas:
cat("  ", paste(sig_fitur, collapse = ", "), "\n\n")
# Tabel ringkas log-odds & RRR fitur signifikan
cat("Ringkasan koefisien fitur paling signifikan:\n")
## Ringkasan koefisien fitur paling signifikan:
sig_coef <- coef_mlr[, sig_fitur, drop = FALSE]
sig_rrr  <- exp(sig_coef)

sig_coef <- coef_mlr[, sig_fitur, drop = FALSE]
sig_rrr  <- exp(sig_coef)

n_show <- min(4, ncol(sig_coef))

if (n_show > 0) {
  cbind(
    LogOdds = round(sig_coef[, 1:n_show, drop = FALSE], 3),
    RRR     = round(sig_rrr[,  1:n_show, drop = FALSE], 3)
  ) %>%
    kable(caption = "Log Odds & RRR Fitur Signifikan Teratas") %>%
    kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
} else {
  cat("Tidak ada fitur signifikan yang ditemukan.\n")
}
## Tidak ada fitur signifikan yang ditemukan.

9.2.2 Uji Parsial Wald — P-value per Fitur

# Visualisasi p-value Wald per fitur & kelas
wald_long <- wald_df %>%
  filter(Fitur != "(Intercept)") %>%
  pivot_longer(-Fitur, names_to = "Kelas", values_to = "p_value") %>%
  mutate(Signifikan = ifelse(p_value < 0.05, "Signifikan", "Tidak"))

ggplot(wald_long, aes(x = reorder(Fitur, p_value), y = -log10(p_value + 1e-6),
                       fill = Signifikan)) +
  geom_bar(stat = "identity", position = "dodge", alpha = 0.85) +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed",
             colour = "red", linewidth = 0.7) +
  scale_fill_manual(values = c("#59A14F", "#E15759")) +
  facet_wrap(~ Kelas) +
  coord_flip() +
  labs(title = "Uji Wald Parsial MLR — -log10(p-value) per Fitur & Kelas",
       subtitle = "Garis merah = threshold p = 0.05",
       x = "Fitur", y = "-log10(p-value)") +
  theme_minimal(base_size = 11)

10 Tahap 8 — Perbandingan & Kesimpulan

10.1 Tabel Perbandingan Lengkap

# AUC macro
auc_macro_lda <- round(mean(auc_results$AUC_LDA), 4)
auc_macro_mlr <- round(mean(auc_results$AUC_MLR), 4)

tabel_final <- data.frame(
  Kriteria = c(
    "Akurasi Test",
    "F1 Macro",
    "F1 Weighted",
    "CV 9-Fold Mean Accuracy",
    "CV 9-Fold SD Accuracy",
    "AUC Macro (OvR)",
    "Asumsi Utama",
    "Interpretabilitas"
  ),
  LDA = c(
    paste0(round(metrics_lda$Akurasi * 100, 1), "%"),
    round(metrics_lda$F1_Macro, 4),
    round(metrics_lda$F1_Weighted, 4),
    paste0(round(cv_summary$CV_Accuracy_Mean[1] * 100, 1), "%"),
    round(cv_summary$CV_Accuracy_SD[1], 4),
    auc_macro_lda,
    "Normalitas multivariat, Σ homogen",
    "Koefisien diskriminan & plot LD"
  ),
  MLR = c(
    paste0(round(metrics_mlr$Akurasi * 100, 1), "%"),
    round(metrics_mlr$F1_Macro, 4),
    round(metrics_mlr$F1_Weighted, 4),
    paste0(round(cv_summary$CV_Accuracy_Mean[2] * 100, 1), "%"),
    round(cv_summary$CV_Accuracy_SD[2], 4),
    auc_macro_mlr,
    "Linearitas log-odds, independensi obs",
    "Log odds, RRR, uji Wald"
  )
)

tabel_final %>%
  kable(caption = "Tabel Perbandingan Lengkap: LDA vs MLR") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "bordered"),
                full_width = FALSE) %>%
  column_spec(1, bold = TRUE) %>%
  row_spec(0, background = "#2E86AB", color = "white")
Tabel Perbandingan Lengkap: LDA vs MLR
Kriteria LDA MLR
Akurasi Test 100% 100%
F1 Macro 1 1
F1 Weighted 1 1
CV 9-Fold Mean Accuracy 97.8% 97%
CV 9-Fold SD Accuracy 0.0333 0.0351
AUC Macro (OvR) 1 1
Asumsi Utama Normalitas multivariat, Σ homogen Linearitas log-odds, independensi obs
Interpretabilitas Koefisien diskriminan & plot LD Log odds, RRR, uji Wald

10.2 Kelebihan & Kekurangan

kelebihan_df <- data.frame(
  Aspek = c("Kelebihan utama",
            "Keunggulan prediksi",
            "Kekurangan utama",
            "Masalah asumsi",
            "Interpretasi output"),
  LDA   = c(
    "Efisien dengan data kecil, dimensi reduksi alami (LD plot)",
    "Sangat baik jika asumsi normalitas & homogenitas Σ terpenuhi",
    "Sensitif terhadap pelanggaran asumsi normalitas & outlier",
    "Asumsi normalitas multivariat sering dilanggar di data nyata",
    "Koefisien diskriminan & proyeksi LD mudah divisualisasikan"
  ),
  MLR   = c(
    "Tidak mensyaratkan normalitas distribusi fitur",
    "Lebih robust pada data yang tidak normal atau ada outlier",
    "Butuh banyak data, rentan multikolinearitas",
    "Asumsi linearitas log-odds, perlu dicek lebih ketat",
    "Log odds & RRR mudah diinterpretasi secara statistik"
  )
)

kelebihan_df %>%
  kable(caption = "Kelebihan & Kekurangan: LDA vs MLR") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
  column_spec(1, bold = TRUE, width = "20%") %>%
  column_spec(2, width = "40%") %>%
  column_spec(3, width = "40%")
Kelebihan & Kekurangan: LDA vs MLR
Aspek LDA MLR
Kelebihan utama Efisien dengan data kecil, dimensi reduksi alami (LD plot) Tidak mensyaratkan normalitas distribusi fitur
Keunggulan prediksi Sangat baik jika asumsi normalitas & homogenitas Σ terpenuhi Lebih robust pada data yang tidak normal atau ada outlier
Kekurangan utama Sensitif terhadap pelanggaran asumsi normalitas & outlier Butuh banyak data, rentan multikolinearitas
Masalah asumsi Asumsi normalitas multivariat sering dilanggar di data nyata Asumsi linearitas log-odds, perlu dicek lebih ketat
Interpretasi output Koefisien diskriminan & proyeksi LD mudah divisualisasikan Log odds & RRR mudah diinterpretasi secara statistik