options(repos = c(CRAN = "https://cran.rstudio.com/"))

1 Instalasi & Load Library

packages <- c("tidyverse", "caret", "MASS", "nnet", "corrplot",
               "MVN", "car", "nortest", "ggplot2", "knitr", "kableExtra",
               "pROC", "reshape2", "MLmetrics")

installed <- packages %in% rownames(installed.packages())
if (any(!installed)) install.packages(packages[!installed])

library(MASS)         
library(nnet)         
library(tidyverse)    
library(caret)        
library(corrplot)     
library(MVN)          
library(car)          
library(nortest)      
library(ggplot2)      
library(knitr)        
library(kableExtra)   
library(pROC)         
library(reshape2)     

select <- dplyr::select
filter <- dplyr::filter

2 Tahap 1 — Load & Eksplorasi Data (EDA)

2.1 Load Dataset

wine_path <- if (file.exists("Wine.csv")) {
  "Wine.csv"
} else {
  file.path(dirname(knitr::current_input(dir = TRUE)), "Wine.csv")
}

if (!file.exists(wine_path)) {
  stop("File Wine.csv tidak ditemukan. Pastikan Wine.csv berada di folder yang sama dengan file .Rmd ini.")
}

df <- read.csv(wine_path)
df$Customer_Segment <- as.factor(df$Customer_Segment)

cat("Dimensi data:", dim(df), "\n")

## Dimensi data: 178 14

cat("Jumlah kelas:", nlevels(df$Customer_Segment), "\n")

## Jumlah kelas: 3

cat("Label kelas:", levels(df$Customer_Segment), "\n")

## Label kelas: 1 2 3

2.2 Struktur Data

str(df)

## 'data.frame':    178 obs. of  14 variables:
##  $ Alcohol             : num  14.2 13.2 13.2 14.4 13.2 ...
##  $ Malic_Acid          : num  1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
##  $ Ash                 : num  2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
##  $ Ash_Alcanity        : num  15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
##  $ Magnesium           : int  127 100 101 113 118 112 96 121 97 98 ...
##  $ Total_Phenols       : num  2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
##  $ Flavanoids          : num  3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
##  $ Nonflavanoid_Phenols: num  0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
##  $ Proanthocyanins     : num  2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
##  $ Color_Intensity     : num  5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
##  $ Hue                 : num  1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
##  $ OD280               : num  3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
##  $ Proline             : int  1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...
##  $ Customer_Segment    : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...

2.3 Cek 6 Baris Pertama

head(df) %>%
  kable(caption = "6 Baris Pertama Dataset Wine") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)

6 Baris Pertama Dataset Wine
Alcohol	Malic_Acid	Ash	Ash_Alcanity	Magnesium	Total_Phenols	Flavanoids	Nonflavanoid_Phenols	Proanthocyanins	Color_Intensity	Hue	OD280	Proline	Customer_Segment
14.23	1.71	2.43	15.6	127	2.80	3.06	0.28	2.29	5.64	1.04	3.92	1065	1
13.20	1.78	2.14	11.2	100	2.65	2.76	0.26	1.28	4.38	1.05	3.40	1050	1
13.16	2.36	2.67	18.6	101	2.80	3.24	0.30	2.81	5.68	1.03	3.17	1185	1
14.37	1.95	2.50	16.8	113	3.85	3.49	0.24	2.18	7.80	0.86	3.45	1480	1
13.24	2.59	2.87	21.0	118	2.80	2.69	0.39	1.82	4.32	1.04	2.93	735	1
14.20	1.76	2.45	15.2	112	3.27	3.39	0.34	1.97	6.75	1.05	2.85	1450	1

2.4 Distribusi Kelas (Customer_Segment)

tabel_kelas <- df %>%
  count(Customer_Segment) %>%
  mutate(Proporsi = round(n / sum(n) * 100, 2))

tabel_kelas %>%
  kable(caption = "Distribusi Kelas Customer_Segment",
        col.names = c("Kelas", "Frekuensi", "Proporsi (%)")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Distribusi Kelas Customer_Segment
Kelas	Frekuensi	Proporsi (%)
1	59	33.15
2	71	39.89
3	48	26.97

ggplot(df, aes(x = Customer_Segment, fill = Customer_Segment)) +
  geom_bar(width = 0.6, show.legend = FALSE) +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5, size = 4) +
  scale_fill_manual(values = c("#4E79A7", "#F28E2B", "#59A14F")) +
  labs(title = "Distribusi Kelas Customer_Segment", x = "Kelas", y = "Frekuensi") +
  theme_minimal(base_size = 13)

2.5 Statistik Deskriptif

fitur_numerik <- df %>% select(-Customer_Segment)

summary(fitur_numerik) %>%
  kable(caption = "Statistik Deskriptif Fitur Numerik") %>%
  kable_styling(bootstrap_options = c("striped", "condensed"),
                full_width = TRUE, font_size = 11)

Statistik Deskriptif Fitur Numerik
Alcohol	Malic_Acid	Ash	Ash_Alcanity	Magnesium	Total_Phenols	Flavanoids	Nonflavanoid_Phenols	Proanthocyanins	Color_Intensity	Hue	OD280	Proline
Min. :11.03	Min. :0.740	Min. :1.360	Min. :10.60	Min. : 70.00	Min. :0.980	Min. :0.340	Min. :0.1300	Min. :0.410	Min. : 1.280	Min. :0.4800	Min. :1.270	Min. : 278.0
1st Qu.:12.36	1st Qu.:1.603	1st Qu.:2.210	1st Qu.:17.20	1st Qu.: 88.00	1st Qu.:1.742	1st Qu.:1.205	1st Qu.:0.2700	1st Qu.:1.250	1st Qu.: 3.220	1st Qu.:0.7825	1st Qu.:1.938	1st Qu.: 500.5
Median :13.05	Median :1.865	Median :2.360	Median :19.50	Median : 98.00	Median :2.355	Median :2.135	Median :0.3400	Median :1.555	Median : 4.690	Median :0.9650	Median :2.780	Median : 673.5
Mean :13.00	Mean :2.336	Mean :2.367	Mean :19.49	Mean : 99.74	Mean :2.295	Mean :2.029	Mean :0.3619	Mean :1.591	Mean : 5.058	Mean :0.9574	Mean :2.612	Mean : 746.9
3rd Qu.:13.68	3rd Qu.:3.083	3rd Qu.:2.558	3rd Qu.:21.50	3rd Qu.:107.00	3rd Qu.:2.800	3rd Qu.:2.875	3rd Qu.:0.4375	3rd Qu.:1.950	3rd Qu.: 6.200	3rd Qu.:1.1200	3rd Qu.:3.170	3rd Qu.: 985.0
Max. :14.83	Max. :5.800	Max. :3.230	Max. :30.00	Max. :162.00	Max. :3.880	Max. :5.080	Max. :0.6600	Max. :3.580	Max. :13.000	Max. :1.7100	Max. :4.000	Max. :1680.0

2.6 Boxplot per Fitur

df_long <- df %>%
  pivot_longer(cols = -Customer_Segment, names_to = "Fitur", values_to = "Nilai")

ggplot(df_long, aes(x = Customer_Segment, y = Nilai, fill = Customer_Segment)) +
  geom_boxplot(outlier.colour = "red", outlier.size = 1.5, alpha = 0.8) +
  scale_fill_manual(values = c("#4E79A7", "#F28E2B", "#59A14F")) +
  facet_wrap(~ Fitur, scales = "free_y", ncol = 4) +
  labs(title = "Boxplot Setiap Fitur per Kelas", x = "Kelas", y = "Nilai") +
  theme_minimal(base_size = 11) +
  theme(legend.position = "none")

3 Tahap 2 — Cek Asumsi

3.1 Asumsi 1 — Linearitas Log-Odds (untuk MLR)

df_check <- df %>%
  mutate(kelas_bin = ifelse(Customer_Segment == 1, 1, 0))

ggplot(df_check, aes(x = Alcohol, y = kelas_bin)) +
  geom_point(alpha = 0.4, colour = "#4E79A7") +
  geom_smooth(method = "glm", method.args = list(family = "binomial"),
              se = TRUE, colour = "#E15759") +
  labs(title = "Cek Linearitas Log-Odds: Alcohol vs Kelas 1",
       x = "Alcohol", y = "Probabilitas Kelas 1") +
  theme_minimal(base_size = 13)

3.2 Asumsi 2 — Independensi Observasi

lm_proxy <- lm(as.numeric(Customer_Segment) ~ ., data = df)
dw_result <- car::durbinWatsonTest(lm_proxy)
print(dw_result)

##  lag Autocorrelation D-W Statistic p-value
##    1       0.2000099      1.594091       0
##  Alternative hypothesis: rho != 0

cat("\nInterpretasi:\n")

## 
## Interpretasi:

cat("Nilai DW mendekati 2 → tidak ada autokorelasi (asumsi independensi terpenuhi)\n")

## Nilai DW mendekati 2 → tidak ada autokorelasi (asumsi independensi terpenuhi)

3.3 Asumsi 3 — No Multikolinearitas

vif_values <- car::vif(lm_proxy)
vif_df <- data.frame(
  Fitur = names(vif_values),
  VIF   = round(vif_values, 3)
) %>% arrange(desc(VIF))

vif_df %>%
  mutate(Status = ifelse(VIF > 10, "⚠️ Tinggi", "✅ OK")) %>%
  kable(caption = "Variance Inflation Factor (VIF)") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(which(vif_df$VIF > 10), background = "#FFF3CD")

Variance Inflation Factor (VIF)
	Fitur	VIF	Status
Flavanoids	Flavanoids	7.029	✅ OK \|
Total_Phenols	Total_Phenols	4.335	✅ OK \|
OD280	OD280	3.785	✅ OK \|
Color_Intensity	Color_Intensity	3.026	✅ OK \|
Proline	Proline	2.824	✅ OK \|
Hue	Hue	2.551	✅ OK \|
Alcohol	Alcohol	2.460	✅ OK \|
Ash_Alcanity	Ash_Alcanity	2.239	✅ OK \|
Ash	Ash	2.185	✅ OK \|
Proanthocyanins	Proanthocyanins	1.976	✅ OK \|
Nonflavanoid_Phenols	Nonflavanoid_Phenols	1.796	✅ OK \|
Malic_Acid	Malic_Acid	1.657	✅ OK \|
Magnesium	Magnesium	1.418	✅ OK \|

cat("\nKetentuan: VIF > 10 indikasi multikolinearitas serius\n")

## 
## Ketentuan: VIF > 10 indikasi multikolinearitas serius

3.3.1 Matriks korelasi asumsi 3

cor_matrix <- cor(fitur_numerik)

corrplot(cor_matrix, method = "color", type = "upper",
         tl.cex = 0.8, number.cex = 0.65, addCoef.col = "black",
         title = "Matriks Korelasi Antar Fitur", mar = c(0, 0, 1, 0))

3.4 Asumsi 4 — No Outlier (IQR Method)

deteksi_outlier <- function(x) {
  Q1  <- quantile(x, 0.25)
  Q3  <- quantile(x, 0.75)
  IQR <- Q3 - Q1
  sum(x < (Q1 - 1.5 * IQR) | x > (Q3 + 1.5 * IQR))
}

outlier_summary <- data.frame(
  Fitur          = names(fitur_numerik),
  Jumlah_Outlier = sapply(fitur_numerik, deteksi_outlier)
) %>% arrange(desc(Jumlah_Outlier))

outlier_summary %>%
  mutate(Status = ifelse(Jumlah_Outlier > 0, "⚠️ Ada outlier", "✅ Bersih")) %>%
  kable(caption = "Deteksi Outlier per Fitur (Metode IQR)") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(which(outlier_summary$Jumlah_Outlier > 0), background = "#FFF3CD")

Deteksi Outlier per Fitur (Metode IQR)
	Fitur	Jumlah_Outlier	Status
Ash_Alcanity	Ash_Alcanity	4	⚠️ Ada outlier
Magnesium	Magnesium	4	⚠️ Ada outlier
Color_Intensity	Color_Intensity	4	⚠️ Ada outlier
Malic_Acid	Malic_Acid	3	⚠️ Ada outlier
Ash	Ash	3	⚠️ Ada outlier
Proanthocyanins	Proanthocyanins	2	⚠️ Ada outlier
Hue	Hue	1	⚠️ Ada outlier
Alcohol	Alcohol	0	✅ Bersih \|
Total_Phenols	Total_Phenols	0	✅ Bersih \|
Flavanoids	Flavanoids	0	✅ Bersih \|
Nonflavanoid_Phenols	Nonflavanoid_Phenols	0	✅ Bersih \|
OD280	OD280	0	✅ Bersih \|
Proline	Proline	0	✅ Bersih \|

3.5 Asumsi 5 — Normalitas (Shapiro-Wilk per Fitur)

shapiro_results <- sapply(fitur_numerik, function(x) shapiro.test(x)$p.value)

normalitas_df <- data.frame(
  Fitur   = names(shapiro_results),
  p_value = round(shapiro_results, 4)
) %>%
  mutate(Status = ifelse(p_value >= 0.05, "✅ Normal", "⚠️ Tidak Normal")) %>%
  arrange(p_value)

normalitas_df %>%
  kable(caption = "Uji Normalitas Shapiro-Wilk per Fitur (α = 0.05)") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(which(normalitas_df$p_value < 0.05), background = "#FFF3CD")

Uji Normalitas Shapiro-Wilk per Fitur (α = 0.05)
	Fitur	p_value	Status
Malic_Acid	Malic_Acid	0.0000	⚠️ Tidak Normal
Magnesium	Magnesium	0.0000	⚠️ Tidak Normal
Flavanoids	Flavanoids	0.0000	⚠️ Tidak Normal
Color_Intensity	Color_Intensity	0.0000	⚠️ Tidak Normal
OD280	OD280	0.0000	⚠️ Tidak Normal
Proline	Proline	0.0000	⚠️ Tidak Normal
Nonflavanoid_Phenols	Nonflavanoid_Phenols	0.0001	⚠️ Tidak Normal
Total_Phenols	Total_Phenols	0.0044	⚠️ Tidak Normal
Proanthocyanins	Proanthocyanins	0.0145	⚠️ Tidak Normal
Hue	Hue	0.0174	⚠️ Tidak Normal
Alcohol	Alcohol	0.0200	⚠️ Tidak Normal
Ash	Ash	0.0387	⚠️ Tidak Normal
Ash_Alcanity	Ash_Alcanity	0.2639	✅ Normal \|

mvn_result <- tryCatch(
  mvn(data = fitur_numerik, mvnTest = "mardia"),
  error = function(e) tryCatch(
    mvn(data = fitur_numerik, test = "mardia"),
    error = function(e2) mvn(data = fitur_numerik)
  )
)

print(mvn_result$multivariateNormality)

## NULL

4 Tahap 3 — Preprocessing

4.1 Pisah X dan y

X <- df %>% select(-Customer_Segment)
y <- df$Customer_Segment

cat("Dimensi X (fitur)  :", dim(X), "\n")

## Dimensi X (fitur)  : 178 13

cat("Panjang y (target) :", length(y), "\n")

## Panjang y (target) : 178

cat("Distribusi y       :\n")

## Distribusi y       :

print(table(y))

## y
##  1  2  3 
## 59 71 48

4.2 Set Seed

set.seed(42)
cat("Seed ditetapkan: 42\n")

## Seed ditetapkan: 42

4.3 Train-Test Split (75:25 Stratified)

train_index <- createDataPartition(y, p = 0.75, list = FALSE)

X_train <- X[train_index, ]
X_test  <- X[-train_index, ]
y_train <- y[train_index]
y_test  <- y[-train_index]

cat("=== Ukuran Data ===\n")

## === Ukuran Data ===

cat("Train:", nrow(X_train), "observasi | Test:", nrow(X_test), "observasi\n\n")

## Train: 135 observasi | Test: 43 observasi

cat("=== Distribusi Kelas — Train ===\n")

## === Distribusi Kelas — Train ===

print(table(y_train))

## y_train
##  1  2  3 
## 45 54 36

cat("\n=== Distribusi Kelas — Test ===\n")

## 
## === Distribusi Kelas — Test ===

print(table(y_test))

## y_test
##  1  2  3 
## 14 17 12

prop_train <- round(prop.table(table(y_train)) * 100, 1)
prop_test  <- round(prop.table(table(y_test))  * 100, 1)

cat("\nProporsi Train (%):\n")

## 
## Proporsi Train (%):

print(prop_train)

## y_train
##    1    2    3 
## 33.3 40.0 26.7

cat("\nProporsi Test  (%):\n")

## 
## Proporsi Test  (%):

print(prop_test)

## y_test
##    1    2    3 
## 32.6 39.5 27.9

4.4 Standarisasi (Z-score, fit pada Train saja)

preproc <- preProcess(X_train, method = c("center", "scale"))

X_train_scaled <- predict(preproc, X_train)
X_test_scaled  <- predict(preproc, X_test)

mean_check <- round(colMeans(X_train_scaled), 3)
sd_check   <- round(apply(X_train_scaled, 2, sd), 3)

cat("=== Verifikasi Standarisasi (Train) ===\n")

## === Verifikasi Standarisasi (Train) ===

verif_df <- data.frame(Fitur = names(mean_check), Mean = mean_check, SD = sd_check)

verif_df %>%
  kable(caption = "Mean dan SD Setelah Standarisasi (Train)") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE, font_size = 11)

Mean dan SD Setelah Standarisasi (Train)
	Fitur	SD
Alcohol	Alcohol	1
Malic_Acid	Malic_Acid	1
Ash	Ash	1
Ash_Alcanity	Ash_Alcanity	1
Magnesium	Magnesium	1
Total_Phenols	Total_Phenols	1
Flavanoids	Flavanoids	1
Nonflavanoid_Phenols	Nonflavanoid_Phenols	1
Proanthocyanins	Proanthocyanins	1
Color_Intensity	Color_Intensity	1
Hue	Hue	1
OD280	OD280	1
Proline	Proline	1

4.5 Gabungkan X dan y untuk Modeling

train_data <- cbind(X_train_scaled, Customer_Segment = y_train)
test_data  <- cbind(X_test_scaled,  Customer_Segment = y_test)

cat("Data train siap:", nrow(train_data), "baris,", ncol(train_data), "kolom\n")

## Data train siap: 135 baris, 14 kolom

cat("Data test  siap:", nrow(test_data),  "baris,", ncol(test_data),  "kolom\n")

## Data test  siap: 43 baris, 14 kolom

cat("\nPreprocessing selesai. Lanjut ke Tahap 4: Fitting Model MLR & LDA.\n")

## 
## Preprocessing selesai. Lanjut ke Tahap 4: Fitting Model MLR & LDA.

5 Tahap 4A — Linear Discriminant Analysis (LDA)

5.1 Fit Model LDA

lda_model <- lda(Customer_Segment ~ ., data = train_data)
cat("=== Model LDA berhasil di-fit ===\n\n")

## === Model LDA berhasil di-fit ===

print(lda_model)

## Call:
## lda(Customer_Segment ~ ., data = train_data)
## 
## Prior probabilities of groups:
##         1         2         3 
## 0.3333333 0.4000000 0.2666667 
## 
## Group means:
##      Alcohol Malic_Acid        Ash Ash_Alcanity   Magnesium Total_Phenols
## 1  0.8614216 -0.2652798  0.2906065   -0.7350033  0.51816969    0.89199963
## 2 -0.8807290 -0.4160982 -0.3652704    0.2438351 -0.41017334   -0.08453745
## 3  0.2443165  0.9557471  0.1846475    0.5530014 -0.03245211   -0.98819336
##    Flavanoids Nonflavanoid_Phenols Proanthocyanins Color_Intensity        Hue
## 1  0.93641652          -0.65222026      0.56809014       0.1834782  0.4056861
## 2  0.06768397          -0.02287984      0.01030215      -0.8350753  0.4692576
## 3 -1.27204661           0.84959508     -0.72556590       1.0232653 -1.2109941
##        OD280    Proline
## 1  0.8361276  1.1462003
## 2  0.1747559 -0.7311564
## 3 -1.3072932 -0.3360158
## 
## Coefficients of linear discriminants:
##                              LD1         LD2
## Alcohol              -0.37406517  0.67904416
## Malic_Acid            0.19365178  0.36305408
## Ash                  -0.15212531  0.53979962
## Ash_Alcanity          0.44514061 -0.40193781
## Magnesium            -0.05641899  0.02241295
## Total_Phenols         0.46830400  0.06211205
## Flavanoids           -1.72429984 -0.57240210
## Nonflavanoid_Phenols -0.13439089 -0.10721773
## Proanthocyanins       0.10104462 -0.18060995
## Color_Intensity       0.83819227  0.61129435
## Hue                  -0.15989148 -0.44033959
## OD280                -0.84166138  0.22604124
## Proline              -0.88766530  0.93254363
## 
## Proportion of trace:
##    LD1    LD2 
## 0.6895 0.3105

5.2 Komponen Diskriminan — Explained Variance Ratio (LD1 & LD2)

# Proporsi varian yang dijelaskan tiap komponen diskriminan
prop_var <- lda_model$svd^2 / sum(lda_model$svd^2)
prop_var_df <- data.frame(
  Komponen       = paste0("LD", seq_along(prop_var)),
  Eigenvalue     = round(lda_model$svd^2, 4),
  Proporsi       = round(prop_var * 100, 2),
  Kumulatif_Pct  = round(cumsum(prop_var) * 100, 2)
)

prop_var_df %>%
  kable(caption = "Explained Variance Ratio — Komponen Diskriminan LDA",
        col.names = c("Komponen", "Eigenvalue", "Proporsi (%)", "Kumulatif (%)")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Explained Variance Ratio — Komponen Diskriminan LDA
Komponen	Eigenvalue	Proporsi (%)	Kumulatif (%)
LD1	602.9522	68.95	68.95
LD2	271.5138	31.05	100.00

cat(sprintf("\nLD1 menjelaskan %.1f%% dan LD2 menjelaskan %.1f%% variansi antarkelas.\n",
            prop_var_df$Proporsi[1], prop_var_df$Proporsi[2]))

## 
## LD1 menjelaskan 69.0% dan LD2 menjelaskan 31.1% variansi antarkelas.

5.3 Proyeksi LDA — Scatter Plot LD1 vs LD2

# Proyeksi data train ke ruang diskriminan
lda_proj_train <- predict(lda_model, X_train_scaled)
lda_proj_test  <- predict(lda_model, X_test_scaled)

df_proj <- data.frame(
  LD1   = lda_proj_train$x[, 1],
  LD2   = lda_proj_train$x[, 2],
  Kelas = y_train
)

ggplot(df_proj, aes(x = LD1, y = LD2, colour = Kelas, shape = Kelas)) +
  geom_point(size = 3, alpha = 0.85) +
  stat_ellipse(level = 0.90, linewidth = 0.8, linetype = "dashed") +
  scale_colour_manual(values = c("#4E79A7", "#F28E2B", "#59A14F"),
                      name = "Customer\nSegment") +
  scale_shape_manual(values = c(16, 17, 15), name = "Customer\nSegment") +
  labs(title = "Proyeksi LDA: LD1 vs LD2 (Data Train)",
       subtitle = sprintf("LD1 = %.1f%%  |  LD2 = %.1f%%  variansi antarkelas",
                          prop_var_df$Proporsi[1], prop_var_df$Proporsi[2]),
       x = "LD1", y = "LD2") +
  theme_minimal(base_size = 13) +
  theme(legend.position = "right")

5.4 Koefisien Diskriminan — Kontribusi Fitur

# Koefisien (loading) tiap fitur pada LD1 dan LD2
coef_df <- as.data.frame(lda_model$scaling) %>%
  rownames_to_column("Fitur") %>%
  mutate(across(where(is.numeric), ~ round(., 4))) %>%
  mutate(Abs_LD1 = abs(LD1)) %>%
  arrange(desc(Abs_LD1)) %>%
  select(-Abs_LD1)

coef_df %>%
  kable(caption = "Koefisien Diskriminan LDA (Kontribusi Fitur per Komponen)") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(1:3, background = "#E8F4FD")   # highlight 3 fitur teratas

Koefisien Diskriminan LDA (Kontribusi Fitur per Komponen)
Fitur	LD1	LD2
Flavanoids	-1.7243	-0.5724
Proline	-0.8877	0.9325
OD280	-0.8417	0.2260
Color_Intensity	0.8382	0.6113
Total_Phenols	0.4683	0.0621
Ash_Alcanity	0.4451	-0.4019
Alcohol	-0.3741	0.6790
Malic_Acid	0.1937	0.3631
Hue	-0.1599	-0.4403
Ash	-0.1521	0.5398
Nonflavanoid_Phenols	-0.1344	-0.1072
Proanthocyanins	0.1010	-0.1806
Magnesium	-0.0564	0.0224

cat("\nFitur dengan kontribusi terbesar pada LD1:\n")

## 
## Fitur dengan kontribusi terbesar pada LD1:

cat(paste(head(coef_df$Fitur, 3), collapse = ", "), "\n")

## Flavanoids, Proline, OD280

6 Tahap 4B — Multinomial Logistic Regression (MLR)

6.1 Fit Model MLR

# Referensi kategori: kelas 1 (baseline)
train_data$Customer_Segment <- relevel(train_data$Customer_Segment, ref = "1")

mlr_model <- multinom(Customer_Segment ~ ., data = train_data, trace = FALSE)

cat("=== Model MLR berhasil di-fit ===\n\n")

## === Model MLR berhasil di-fit ===

summary(mlr_model)

## Call:
## multinom(formula = Customer_Segment ~ ., data = train_data, trace = FALSE)
## 
## Coefficients:
##   (Intercept)    Alcohol Malic_Acid        Ash Ash_Alcanity  Magnesium
## 2   -1.156250 -11.881344 -3.2514642 -10.016165     9.321990 -0.5202924
## 3   -5.222332  -2.068507  0.9017964   4.014963     4.038532  0.9348684
##   Total_Phenols Flavanoids Nonflavanoid_Phenols Proanthocyanins Color_Intensity
## 2     -2.097594  -1.927518             5.725801       0.9563483       -12.23829
## 3     -4.884002 -12.655078             1.969334      -2.6212234        12.57532
##          Hue     OD280     Proline
## 2   2.968839 -4.645503 -16.6173993
## 3 -10.354985 -9.819660   0.3185086
## 
## Std. Errors:
##   (Intercept)   Alcohol Malic_Acid       Ash Ash_Alcanity Magnesium
## 2    18353.62  4649.292   8035.895  8527.676     15182.19  3027.783
## 3    32757.73 40833.784  12634.577 52217.100     30516.45 78036.250
##   Total_Phenols Flavanoids Nonflavanoid_Phenols Proanthocyanins Color_Intensity
## 2      7274.864   7709.677             6011.432        2625.791        20202.27
## 3     52549.791  61596.183            39871.074       59249.474        44300.84
##        Hue     OD280  Proline
## 2 11248.28  9895.901 11640.20
## 3 33777.78 58599.008 18731.91
## 
## Residual Deviance: 0.0001872249 
## AIC: 56.00019

6.2 Koefisien β (Log Odds)

coef_mlr <- summary(mlr_model)$coefficients
coef_mlr_df <- as.data.frame(t(coef_mlr)) %>%
  rownames_to_column("Fitur") %>%
  rename_with(~ paste0("LogOdds_Kelas", .), -Fitur)

coef_mlr_df %>%
  mutate(across(where(is.numeric), ~ round(., 4))) %>%
  kable(caption = "Koefisien β (Log Odds) — MLR per Kategori vs Baseline (Kelas 1)") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = TRUE, font_size = 11)

Koefisien β (Log Odds) — MLR per Kategori vs Baseline (Kelas 1)
Fitur	LogOdds_Kelas2	LogOdds_Kelas3
(Intercept)	-1.1563	-5.2223
Alcohol	-11.8813	-2.0685
Malic_Acid	-3.2515	0.9018
Ash	-10.0162	4.0150
Ash_Alcanity	9.3220	4.0385
Magnesium	-0.5203	0.9349
Total_Phenols	-2.0976	-4.8840
Flavanoids	-1.9275	-12.6551
Nonflavanoid_Phenols	5.7258	1.9693
Proanthocyanins	0.9563	-2.6212
Color_Intensity	-12.2383	12.5753
Hue	2.9688	-10.3550
OD280	-4.6455	-9.8197
Proline	-16.6174	0.3185

6.3 Relative Risk Ratio (RRR)

# RRR = exp(β)
rrr_mlr <- exp(coef_mlr)
rrr_df  <- as.data.frame(t(rrr_mlr)) %>%
  rownames_to_column("Fitur") %>%
  rename_with(~ paste0("RRR_Kelas", .), -Fitur)

rrr_df %>%
  mutate(across(where(is.numeric), ~ round(., 4))) %>%
  kable(caption = "Relative Risk Ratio (RRR = exp(β)) — MLR") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Relative Risk Ratio (RRR = exp(β)) — MLR
Fitur	RRR_Kelas2	RRR_Kelas3
(Intercept)	0.3147	0.0054
Alcohol	0.0000	0.1264
Malic_Acid	0.0387	2.4640
Ash	0.0000	55.4213
Ash_Alcanity	11181.2158	56.7430
Magnesium	0.5943	2.5469
Total_Phenols	0.1228	0.0076
Flavanoids	0.1455	0.0000
Nonflavanoid_Phenols	306.6788	7.1659
Proanthocyanins	2.6022	0.0727
Color_Intensity	0.0000	289329.4550
Hue	19.4693	0.0000
OD280	0.0096	0.0001
Proline	0.0000	1.3751

# Bar chart RRR
rrr_long <- rrr_df %>%
  filter(Fitur != "(Intercept)") %>%
  pivot_longer(-Fitur, names_to = "Kelas", values_to = "RRR")

ggplot(rrr_long, aes(x = reorder(Fitur, RRR), y = RRR, fill = Kelas)) +
  geom_bar(stat = "identity", position = "dodge", alpha = 0.85) +
  geom_hline(yintercept = 1, linetype = "dashed", colour = "grey40") +
  scale_fill_manual(values = c("#F28E2B", "#59A14F")) +
  coord_flip() +
  labs(title = "Relative Risk Ratio (RRR) per Fitur — MLR",
       subtitle = "Garis putus-putus = RRR 1 (tidak ada efek vs baseline)",
       x = "Fitur", y = "RRR") +
  theme_minimal(base_size = 12)

6.4 Uji Signifikansi — Uji Wald (Serentak & Parsial)

# Uji Wald parsial: z = koef / SE, p-value = 2 * (1 - Φ(|z|))
se_mlr  <- summary(mlr_model)$standard.errors
z_stat  <- coef_mlr / se_mlr
p_wald  <- 2 * (1 - pnorm(abs(z_stat)))

wald_df <- as.data.frame(t(p_wald)) %>%
  rownames_to_column("Fitur") %>%
  rename_with(~ paste0("p_Kelas", .), -Fitur) %>%
  mutate(across(where(is.numeric), ~ round(., 4)))

wald_df %>%
  kable(caption = "P-value Uji Wald Parsial per Fitur (α = 0.05)") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(which(apply(wald_df[, -1], 1, function(r) any(r < 0.05))),
           background = "#D5F5E3")

P-value Uji Wald Parsial per Fitur (α = 0.05)
Fitur	p_Kelas2	p_Kelas3
(Intercept)	0.9999	0.9999
Alcohol	0.9980	1.0000
Malic_Acid	0.9997	0.9999
Ash	0.9991	0.9999
Ash_Alcanity	0.9995	0.9999
Magnesium	0.9999	1.0000
Total_Phenols	0.9998	0.9999
Flavanoids	0.9998	0.9998
Nonflavanoid_Phenols	0.9992	1.0000
Proanthocyanins	0.9997	1.0000
Color_Intensity	0.9995	0.9998
Hue	0.9998	0.9998
OD280	0.9996	0.9999
Proline	0.9989	1.0000

cat("\nFitur signifikan (p < 0.05 minimal satu kelas):\n")

## 
## Fitur signifikan (p < 0.05 minimal satu kelas):

sig_fitur <- wald_df$Fitur[apply(wald_df[, -1], 1,
                                 function(r) any(r < 0.05, na.rm = TRUE))]
cat(paste(sig_fitur, collapse = ", "), "\n")

7 Tahap 5 — Evaluasi & Pengujian Model

7.1 Prediksi pada Data Test

# LDA
pred_lda  <- predict(lda_model, X_test_scaled)$class

# MLR
pred_mlr  <- predict(mlr_model, X_test_scaled)

cat("Distribusi prediksi LDA:", table(pred_lda), "\n")

## Distribusi prediksi LDA: 14 17 12

cat("Distribusi prediksi MLR:", table(pred_mlr), "\n")

## Distribusi prediksi MLR: 14 17 12

7.2 Confusion Matrix — LDA

cm_lda <- confusionMatrix(pred_lda, y_test)
print(cm_lda)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2  3
##          1 14  0  0
##          2  0 17  0
##          3  0  0 12
## 
## Overall Statistics
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9178, 1)
##     No Information Rate : 0.3953     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3
## Sensitivity            1.0000   1.0000   1.0000
## Specificity            1.0000   1.0000   1.0000
## Pos Pred Value         1.0000   1.0000   1.0000
## Neg Pred Value         1.0000   1.0000   1.0000
## Prevalence             0.3256   0.3953   0.2791
## Detection Rate         0.3256   0.3953   0.2791
## Detection Prevalence   0.3256   0.3953   0.2791
## Balanced Accuracy      1.0000   1.0000   1.0000

# Heatmap Confusion Matrix LDA
cm_lda_df <- as.data.frame(cm_lda$table)
colnames(cm_lda_df) <- c("Prediksi", "Aktual", "Frekuensi")

ggplot(cm_lda_df, aes(x = Aktual, y = Prediksi, fill = Frekuensi)) +
  geom_tile(colour = "white") +
  geom_text(aes(label = Frekuensi), size = 6, fontface = "bold") +
  scale_fill_gradient(low = "#EBF5FB", high = "#1A5276") +
  labs(title = "Confusion Matrix — LDA (Test Set)",
       x = "Aktual", y = "Prediksi") +
  theme_minimal(base_size = 13)

7.3 Confusion Matrix — MLR

pred_mlr_fac <- factor(pred_mlr, levels = levels(y_test))
cm_mlr <- confusionMatrix(pred_mlr_fac, y_test)
print(cm_mlr)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2  3
##          1 14  0  0
##          2  0 17  0
##          3  0  0 12
## 
## Overall Statistics
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9178, 1)
##     No Information Rate : 0.3953     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3
## Sensitivity            1.0000   1.0000   1.0000
## Specificity            1.0000   1.0000   1.0000
## Pos Pred Value         1.0000   1.0000   1.0000
## Neg Pred Value         1.0000   1.0000   1.0000
## Prevalence             0.3256   0.3953   0.2791
## Detection Rate         0.3256   0.3953   0.2791
## Detection Prevalence   0.3256   0.3953   0.2791
## Balanced Accuracy      1.0000   1.0000   1.0000

# Heatmap Confusion Matrix MLR
cm_mlr_df <- as.data.frame(cm_mlr$table)
colnames(cm_mlr_df) <- c("Prediksi", "Aktual", "Frekuensi")

ggplot(cm_mlr_df, aes(x = Aktual, y = Prediksi, fill = Frekuensi)) +
  geom_tile(colour = "white") +
  geom_text(aes(label = Frekuensi), size = 6, fontface = "bold") +
  scale_fill_gradient(low = "#FDFEFE", high = "#7D6608") +
  labs(title = "Confusion Matrix — MLR (Test Set)",
       x = "Aktual", y = "Prediksi") +
  theme_minimal(base_size = 13)

7.4 Akurasi & F1 (Macro/Weighted Average)

# Fungsi hitung F1 macro dan weighted
get_metrics <- function(cm_obj, model_name) {
  stats    <- cm_obj$byClass
  support  <- rowSums(cm_obj$table)     # jumlah observasi per kelas (aktual)
  f1_per   <- 2 * stats[, "Sensitivity"] * stats[, "Pos Pred Value"] /
              (stats[, "Sensitivity"] + stats[, "Pos Pred Value"])
  f1_macro    <- mean(f1_per, na.rm = TRUE)
  f1_weighted <- weighted.mean(f1_per, w = support, na.rm = TRUE)
  acc <- cm_obj$overall["Accuracy"]
  data.frame(Model = model_name,
             Akurasi        = round(acc, 4),
             F1_Macro       = round(f1_macro, 4),
             F1_Weighted    = round(f1_weighted, 4))
}

metrics_lda <- get_metrics(cm_lda, "LDA")
metrics_mlr <- get_metrics(cm_mlr, "MLR")

metrics_all <- bind_rows(metrics_lda, metrics_mlr)

metrics_all %>%
  kable(caption = "Ringkasan Akurasi & F1 — LDA vs MLR (Test Set)",
        col.names = c("Model", "Akurasi", "F1 Macro", "F1 Weighted")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(which.max(metrics_all$Akurasi), background = "#D5F5E3")

Ringkasan Akurasi & F1 — LDA vs MLR (Test Set)
	Model	Akurasi	F1 Macro	F1 Weighted
Accuracy…1	LDA	1	1	1
Accuracy…2	MLR	1	1	1

install.packages("MLmetrics")

## package 'MLmetrics' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Asus\AppData\Local\Temp\Rtmpgpf847\downloaded_packages

7.5 CV 9-Fold — Cross Validation Score

set.seed(42)
ctrl <- trainControl(method = "cv", number = 9,
                     classProbs = TRUE,
                     summaryFunction = multiClassSummary,
                     savePredictions = "final")

# Caret mensyaratkan level faktor berupa valid R variable name
# Ubah "1","2","3" → "Seg1","Seg2","Seg3"
y_train_cv <- factor(paste0("Seg", y_train),
                     levels = paste0("Seg", levels(y_train)))

# Data gabung untuk caret train
train_cv <- cbind(X_train_scaled, Customer_Segment = y_train_cv)

# CV LDA
cv_lda <- train(Customer_Segment ~ .,
                data    = train_cv,
                method  = "lda",
                trControl = ctrl,
                metric  = "Accuracy")

# CV MLR
cv_mlr <- train(Customer_Segment ~ .,
                data    = train_cv,
                method  = "multinom",
                trControl = ctrl,
                metric  = "Accuracy",
                trace   = FALSE)

cv_summary <- data.frame(
  Model       = c("LDA", "MLR"),
  CV_Accuracy_Mean = c(
    round(mean(cv_lda$resample$Accuracy), 4),
    round(mean(cv_mlr$resample$Accuracy), 4)
  ),
  CV_Accuracy_SD   = c(
    round(sd(cv_lda$resample$Accuracy), 4),
    round(sd(cv_mlr$resample$Accuracy), 4)
  )
)

cv_summary %>%
  kable(caption = "Cross Validation 9-Fold — Akurasi LDA vs MLR",
        col.names = c("Model", "Mean Accuracy", "SD Accuracy")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Cross Validation 9-Fold — Akurasi LDA vs MLR
Model	Mean Accuracy	SD Accuracy
LDA	0.9778	0.0333
MLR	0.9704	0.0351

7.6 ROC-AUC (OvR per Kelas)

# Probabilitas dari LDA dan MLR pada test set
prob_lda <- predict(lda_model, X_test_scaled)$posterior
prob_mlr <- predict(mlr_model, X_test_scaled, type = "probs")

kelas_list <- levels(y_test)
auc_results <- data.frame(
  Kelas   = character(),
  AUC_LDA = numeric(),
  AUC_MLR = numeric()
)

par(mfrow = c(1, 3))
for (k in kelas_list) {
  y_bin  <- ifelse(y_test == k, 1, 0)
  roc_lda_k <- roc(y_bin, prob_lda[, k], quiet = TRUE)
  roc_mlr_k <- roc(y_bin, prob_mlr[, k], quiet = TRUE)

  auc_results <- rbind(auc_results,
    data.frame(Kelas   = k,
               AUC_LDA = round(auc(roc_lda_k), 4),
               AUC_MLR = round(auc(roc_mlr_k), 4)))

  plot(roc_lda_k, col = "#4E79A7", main = paste("ROC — Kelas", k),
       lwd = 2, legacy.axes = TRUE)
  plot(roc_mlr_k, col = "#F28E2B", add = TRUE, lwd = 2, lty = 2)
  legend("bottomright",
         legend = c(sprintf("LDA (AUC=%.3f)", auc(roc_lda_k)),
                    sprintf("MLR (AUC=%.3f)", auc(roc_mlr_k))),
         col = c("#4E79A7", "#F28E2B"), lty = c(1, 2), lwd = 2, cex = 0.85)
}

par(mfrow = c(1, 1))

auc_results %>%
  kable(caption = "AUC per Kelas (OvR) — LDA vs MLR") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

AUC per Kelas (OvR) — LDA vs MLR
Kelas	AUC_LDA	AUC_MLR
1	1	1
2	1	1
3	1	1

8 Tahap 6 — Visualisasi Hasil

8.1 LDA Scatter Plot (LD1 vs LD2) — Train & Test

df_train_proj <- data.frame(
  LD1   = lda_proj_train$x[, 1],
  LD2   = lda_proj_train$x[, 2],
  Kelas = y_train,
  Set   = "Train"
)
df_test_proj <- data.frame(
  LD1   = lda_proj_test$x[, 1],
  LD2   = lda_proj_test$x[, 2],
  Kelas = y_test,
  Set   = "Test"
)
df_all_proj <- bind_rows(df_train_proj, df_test_proj)

ggplot(df_all_proj, aes(x = LD1, y = LD2, colour = Kelas, shape = Set)) +
  geom_point(size = 3, alpha = 0.8) +
  stat_ellipse(data = df_train_proj, aes(colour = Kelas),
               level = 0.90, linewidth = 0.7, linetype = "dashed") +
  scale_colour_manual(values = c("#4E79A7", "#F28E2B", "#59A14F"),
                      name = "Customer\nSegment") +
  scale_shape_manual(values = c(16, 4), name = "Dataset") +
  labs(title = "LDA Scatter Plot — LD1 vs LD2",
       subtitle = "Titik bulat = Train | Tanda silang = Test | Elips = batas 90% CI",
       x = paste0("LD1 (", prop_var_df$Proporsi[1], "%)"),
       y = paste0("LD2 (", prop_var_df$Proporsi[2], "%)")) +
  theme_minimal(base_size = 13)

8.2 RRR Bar Chart per Fitur (MLR)

rrr_long_plot <- rrr_df %>%
  filter(Fitur != "(Intercept)") %>%
  pivot_longer(-Fitur, names_to = "Kelas", values_to = "RRR") %>%
  mutate(Kelas = gsub("RRR_Kelas", "Kelas ", Kelas))

ggplot(rrr_long_plot, aes(x = reorder(Fitur, -RRR), y = RRR, fill = Kelas)) +
  geom_bar(stat = "identity", position = "dodge", alpha = 0.85, width = 0.7) +
  geom_hline(yintercept = 1, linetype = "dashed", colour = "grey30", linewidth = 0.7) +
  scale_fill_manual(values = c("#F28E2B", "#59A14F")) +
  labs(title = "Relative Risk Ratio (RRR) per Fitur — Multinomial Logit",
       subtitle = "Baseline = Kelas 1 | RRR > 1: risiko naik | RRR < 1: risiko turun",
       x = "Fitur", y = "RRR") +
  theme_minimal(base_size = 12) +
  theme(axis.text.x = element_text(angle = 35, hjust = 1))

8.3 Heatmap Koefisien MLR (β per Fitur × Kelas)

coef_long <- as.data.frame(t(coef_mlr)) %>%
  rownames_to_column("Fitur") %>%
  filter(Fitur != "(Intercept)") %>%
  pivot_longer(-Fitur, names_to = "Kelas", values_to = "Beta") %>%
  mutate(Kelas = paste("Kelas", Kelas))

ggplot(coef_long, aes(x = Kelas, y = Fitur, fill = Beta)) +
  geom_tile(colour = "white") +
  geom_text(aes(label = round(Beta, 2)), size = 3.5) +
  scale_fill_gradient2(low = "#C0392B", mid = "white", high = "#1A5276",
                       midpoint = 0, name = "β") +
  labs(title = "Heatmap Koefisien MLR (β) per Fitur & Kelas",
       subtitle = "Merah = efek negatif | Biru = efek positif vs baseline (Kelas 1)",
       x = "Kelas (vs Baseline Kelas 1)", y = "Fitur") +
  theme_minimal(base_size = 12) +
  theme(axis.text.x = element_text(size = 11))

8.4 Confusion Matrix — Heatmap Keduanya

# Normalisasi per baris (proporsi prediksi benar per kelas aktual)
cm_lda_norm <- as.data.frame(cm_lda$table)
colnames(cm_lda_norm) <- c("Prediksi", "Aktual", "Frekuensi")
cm_lda_norm$Model <- "LDA"

cm_mlr_norm <- as.data.frame(cm_mlr$table)
colnames(cm_mlr_norm) <- c("Prediksi", "Aktual", "Frekuensi")
cm_mlr_norm$Model <- "MLR"

cm_both <- bind_rows(cm_lda_norm, cm_mlr_norm)

ggplot(cm_both, aes(x = Aktual, y = Prediksi, fill = Frekuensi)) +
  geom_tile(colour = "white") +
  geom_text(aes(label = Frekuensi), size = 6, fontface = "bold") +
  scale_fill_gradient(low = "#FDFEFE", high = "#1A5276") +
  facet_wrap(~ Model) +
  labs(title = "Confusion Matrix — LDA vs MLR (Test Set)",
       x = "Aktual", y = "Prediksi") +
  theme_minimal(base_size = 13)

9 Tahap 7 — Interpretasi Model

9.1 Interpretasi LDA

9.1.1 Koefisien Diskriminan

cat("=================================================================\n")

## =================================================================

cat("          INTERPRETASI MODEL LDA\n")

##           INTERPRETASI MODEL LDA

cat("=================================================================\n\n")

## =================================================================

cat("1. KOMPONEN DISKRIMINAN\n")

## 1. KOMPONEN DISKRIMINAN

cat("   LD1 menjelaskan", prop_var_df$Proporsi[1], "% variansi antarkelas.\n")

##    LD1 menjelaskan 68.95 % variansi antarkelas.

cat("   LD2 menjelaskan", prop_var_df$Proporsi[2], "% variansi antarkelas.\n")

##    LD2 menjelaskan 31.05 % variansi antarkelas.

cat("   → Total:", prop_var_df$Kumulatif_Pct[2], "% variansi dapat dijelaskan.\n\n")

##    → Total: 100 % variansi dapat dijelaskan.

cat("2. FITUR TERPENTING (loading |LD1| tertinggi):\n")

## 2. FITUR TERPENTING (loading |LD1| tertinggi):

top3 <- head(coef_df$Fitur, 3)
for (i in seq_along(top3)) {
  ld1_val <- coef_df$LD1[coef_df$Fitur == top3[i]]
  cat(sprintf("   %d. %s (LD1 = %.4f)\n", i, top3[i], ld1_val))
}

##    1. Flavanoids (LD1 = -1.7243)
##    2. Proline (LD1 = -0.8877)
##    3. OD280 (LD1 = -0.8417)

9.1.2 Marginal Effects — Perubahan Probabilitas per Fitur

baseline_post <- colMeans(predict(lda_model, X_test_scaled)$posterior)

delta_df <- lapply(names(X_test_scaled), function(feat) {
  X_shifted          <- X_test_scaled
  X_shifted[[feat]]  <- X_shifted[[feat]] + 1   # naikkan 1 SD
  post_shifted       <- colMeans(predict(lda_model, X_shifted)$posterior)
  delta              <- post_shifted - baseline_post
  data.frame(Fitur = feat,
             Kelas = names(delta),
             Delta_Prob = round(delta, 4))
}) %>% bind_rows()

ggplot(delta_df, aes(x = reorder(Fitur, abs(Delta_Prob)), y = Delta_Prob, fill = Kelas)) +
  geom_bar(stat = "identity", position = "dodge", alpha = 0.85) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  scale_fill_manual(values = c("#4E79A7", "#F28E2B", "#59A14F")) +
  coord_flip() +
  labs(title = "Marginal Effects LDA — Perubahan Probabilitas per Kelas",
       subtitle = "Efek kenaikan 1 SD setiap fitur terhadap probabilitas posterior",
       x = "Fitur", y = "ΔProb") +
  theme_minimal(base_size = 12)

9.2 Interpretasi MLR

cat("sig_fitur:", sig_fitur, "\n")

## sig_fitur:

cat("ncol sig_coef:", ncol(coef_mlr[, sig_fitur, drop=FALSE]), "\n")

## ncol sig_coef: 0

9.2.1 Log Odds & Uji Parsial Wald

cat("          INTERPRETASI MODEL MLR\n")

##           INTERPRETASI MODEL MLR

cat("1. LOG ODDS vs BASELINE (Kelas 1):\n")

## 1. LOG ODDS vs BASELINE (Kelas 1):

cat("   Koefisien positif → probabilitas kelas tersebut relatif lebih\n")

##    Koefisien positif → probabilitas kelas tersebut relatif lebih

cat("   tinggi dibanding kelas 1 saat fitur meningkat.\n\n")

##    tinggi dibanding kelas 1 saat fitur meningkat.

cat("2. RELATIVE RISK RATIO (RRR = exp(β)):\n")

## 2. RELATIVE RISK RATIO (RRR = exp(β)):

cat("   RRR > 1 → fitur meningkatkan risiko relatif masuk kelas tsb.\n")

##    RRR > 1 → fitur meningkatkan risiko relatif masuk kelas tsb.

cat("   RRR < 1 → fitur menurunkan risiko relatif masuk kelas tsb.\n\n")

##    RRR < 1 → fitur menurunkan risiko relatif masuk kelas tsb.

cat("3. UJI WALD PARSIAL:\n")

## 3. UJI WALD PARSIAL:

cat("   Fitur signifikan (p < 0.05) minimal satu kelas:\n")

##    Fitur signifikan (p < 0.05) minimal satu kelas:

cat("  ", paste(sig_fitur, collapse = ", "), "\n\n")

# Tabel ringkas log-odds & RRR fitur signifikan
cat("Ringkasan koefisien fitur paling signifikan:\n")

## Ringkasan koefisien fitur paling signifikan:

sig_coef <- coef_mlr[, sig_fitur, drop = FALSE]
sig_rrr  <- exp(sig_coef)

sig_coef <- coef_mlr[, sig_fitur, drop = FALSE]
sig_rrr  <- exp(sig_coef)

n_show <- min(4, ncol(sig_coef))

if (n_show > 0) {
  cbind(
    LogOdds = round(sig_coef[, 1:n_show, drop = FALSE], 3),
    RRR     = round(sig_rrr[,  1:n_show, drop = FALSE], 3)
  ) %>%
    kable(caption = "Log Odds & RRR Fitur Signifikan Teratas") %>%
    kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
} else {
  cat("Tidak ada fitur signifikan yang ditemukan.\n")
}

## Tidak ada fitur signifikan yang ditemukan.

9.2.2 Uji Parsial Wald — P-value per Fitur

# Visualisasi p-value Wald per fitur & kelas
wald_long <- wald_df %>%
  filter(Fitur != "(Intercept)") %>%
  pivot_longer(-Fitur, names_to = "Kelas", values_to = "p_value") %>%
  mutate(Signifikan = ifelse(p_value < 0.05, "Signifikan", "Tidak"))

ggplot(wald_long, aes(x = reorder(Fitur, p_value), y = -log10(p_value + 1e-6),
                       fill = Signifikan)) +
  geom_bar(stat = "identity", position = "dodge", alpha = 0.85) +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed",
             colour = "red", linewidth = 0.7) +
  scale_fill_manual(values = c("#59A14F", "#E15759")) +
  facet_wrap(~ Kelas) +
  coord_flip() +
  labs(title = "Uji Wald Parsial MLR — -log10(p-value) per Fitur & Kelas",
       subtitle = "Garis merah = threshold p = 0.05",
       x = "Fitur", y = "-log10(p-value)") +
  theme_minimal(base_size = 11)

10 Tahap 8 — Perbandingan & Kesimpulan

10.1 Tabel Perbandingan Lengkap

# AUC macro
auc_macro_lda <- round(mean(auc_results$AUC_LDA), 4)
auc_macro_mlr <- round(mean(auc_results$AUC_MLR), 4)

tabel_final <- data.frame(
  Kriteria = c(
    "Akurasi Test",
    "F1 Macro",
    "F1 Weighted",
    "CV 9-Fold Mean Accuracy",
    "CV 9-Fold SD Accuracy",
    "AUC Macro (OvR)",
    "Asumsi Utama",
    "Interpretabilitas"
  ),
  LDA = c(
    paste0(round(metrics_lda$Akurasi * 100, 1), "%"),
    round(metrics_lda$F1_Macro, 4),
    round(metrics_lda$F1_Weighted, 4),
    paste0(round(cv_summary$CV_Accuracy_Mean[1] * 100, 1), "%"),
    round(cv_summary$CV_Accuracy_SD[1], 4),
    auc_macro_lda,
    "Normalitas multivariat, Σ homogen",
    "Koefisien diskriminan & plot LD"
  ),
  MLR = c(
    paste0(round(metrics_mlr$Akurasi * 100, 1), "%"),
    round(metrics_mlr$F1_Macro, 4),
    round(metrics_mlr$F1_Weighted, 4),
    paste0(round(cv_summary$CV_Accuracy_Mean[2] * 100, 1), "%"),
    round(cv_summary$CV_Accuracy_SD[2], 4),
    auc_macro_mlr,
    "Linearitas log-odds, independensi obs",
    "Log odds, RRR, uji Wald"
  )
)

tabel_final %>%
  kable(caption = "Tabel Perbandingan Lengkap: LDA vs MLR") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "bordered"),
                full_width = FALSE) %>%
  column_spec(1, bold = TRUE) %>%
  row_spec(0, background = "#2E86AB", color = "white")

Tabel Perbandingan Lengkap: LDA vs MLR
Kriteria	LDA	MLR
Akurasi Test	100%	100%
F1 Macro	1	1
F1 Weighted	1	1
CV 9-Fold Mean Accuracy	97.8%	97%
CV 9-Fold SD Accuracy	0.0333	0.0351
AUC Macro (OvR)	1	1
Asumsi Utama	Normalitas multivariat, Σ homogen	Linearitas log-odds, independensi obs
Interpretabilitas	Koefisien diskriminan & plot LD	Log odds, RRR, uji Wald

10.2 Kelebihan & Kekurangan

kelebihan_df <- data.frame(
  Aspek = c("Kelebihan utama",
            "Keunggulan prediksi",
            "Kekurangan utama",
            "Masalah asumsi",
            "Interpretasi output"),
  LDA   = c(
    "Efisien dengan data kecil, dimensi reduksi alami (LD plot)",
    "Sangat baik jika asumsi normalitas & homogenitas Σ terpenuhi",
    "Sensitif terhadap pelanggaran asumsi normalitas & outlier",
    "Asumsi normalitas multivariat sering dilanggar di data nyata",
    "Koefisien diskriminan & proyeksi LD mudah divisualisasikan"
  ),
  MLR   = c(
    "Tidak mensyaratkan normalitas distribusi fitur",
    "Lebih robust pada data yang tidak normal atau ada outlier",
    "Butuh banyak data, rentan multikolinearitas",
    "Asumsi linearitas log-odds, perlu dicek lebih ketat",
    "Log odds & RRR mudah diinterpretasi secara statistik"
  )
)

kelebihan_df %>%
  kable(caption = "Kelebihan & Kekurangan: LDA vs MLR") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
  column_spec(1, bold = TRUE, width = "20%") %>%
  column_spec(2, width = "40%") %>%
  column_spec(3, width = "40%")

Kelebihan & Kekurangan: LDA vs MLR
Aspek	LDA	MLR
Kelebihan utama	Efisien dengan data kecil, dimensi reduksi alami (LD plot)	Tidak mensyaratkan normalitas distribusi fitur
Keunggulan prediksi	Sangat baik jika asumsi normalitas & homogenitas Σ terpenuhi	Lebih robust pada data yang tidak normal atau ada outlier
Kekurangan utama	Sensitif terhadap pelanggaran asumsi normalitas & outlier	Butuh banyak data, rentan multikolinearitas
Masalah asumsi	Asumsi normalitas multivariat sering dilanggar di data nyata	Asumsi linearitas log-odds, perlu dicek lebih ketat
Interpretasi output	Koefisien diskriminan & proyeksi LD mudah divisualisasikan	Log odds & RRR mudah diinterpretasi secara statistik

Multinomial Logistic Regression vs Analisis Diskriminan (LDA)

Dataset Wine

Darista Wardhani (146), Fitri Saidatur Hanifah (035), Yuli Inka Valentina(118)

2026-05-03

1 Instalasi & Load Library

2 Tahap 1 — Load & Eksplorasi Data (EDA)

2.1 Load Dataset

2.2 Struktur Data

2.3 Cek 6 Baris Pertama

2.4 Distribusi Kelas (Customer_Segment)

2.5 Statistik Deskriptif

2.6 Boxplot per Fitur

3 Tahap 2 — Cek Asumsi

3.1 Asumsi 1 — Linearitas Log-Odds (untuk MLR)

3.2 Asumsi 2 — Independensi Observasi

3.3 Asumsi 3 — No Multikolinearitas

3.3.1 Matriks korelasi asumsi 3

3.4 Asumsi 4 — No Outlier (IQR Method)

3.5 Asumsi 5 — Normalitas (Shapiro-Wilk per Fitur)

4 Tahap 3 — Preprocessing

4.1 Pisah X dan y

4.2 Set Seed

4.3 Train-Test Split (75:25 Stratified)

4.4 Standarisasi (Z-score, fit pada Train saja)

4.5 Gabungkan X dan y untuk Modeling

5 Tahap 4A — Linear Discriminant Analysis (LDA)

5.1 Fit Model LDA

5.2 Komponen Diskriminan — Explained Variance Ratio (LD1 & LD2)

5.3 Proyeksi LDA — Scatter Plot LD1 vs LD2

5.4 Koefisien Diskriminan — Kontribusi Fitur

6 Tahap 4B — Multinomial Logistic Regression (MLR)

6.1 Fit Model MLR

6.2 Koefisien β (Log Odds)

6.3 Relative Risk Ratio (RRR)

6.4 Uji Signifikansi — Uji Wald (Serentak & Parsial)

7 Tahap 5 — Evaluasi & Pengujian Model

7.1 Prediksi pada Data Test

7.2 Confusion Matrix — LDA

7.3 Confusion Matrix — MLR

7.4 Akurasi & F1 (Macro/Weighted Average)

7.5 CV 9-Fold — Cross Validation Score

7.6 ROC-AUC (OvR per Kelas)

8 Tahap 6 — Visualisasi Hasil

8.1 LDA Scatter Plot (LD1 vs LD2) — Train & Test

8.2 RRR Bar Chart per Fitur (MLR)

8.3 Heatmap Koefisien MLR (β per Fitur × Kelas)

8.4 Confusion Matrix — Heatmap Keduanya

9 Tahap 7 — Interpretasi Model

9.1 Interpretasi LDA

9.1.1 Koefisien Diskriminan

9.1.2 Marginal Effects — Perubahan Probabilitas per Fitur

9.2 Interpretasi MLR

9.2.1 Log Odds & Uji Parsial Wald

9.2.2 Uji Parsial Wald — P-value per Fitur

10 Tahap 8 — Perbandingan & Kesimpulan

10.1 Tabel Perbandingan Lengkap

10.2 Kelebihan & Kekurangan