packages <- c("tidyverse", "caret", "MASS", "nnet", "corrplot",
"MVN", "car", "nortest", "ggplot2", "knitr", "kableExtra",
"pROC", "reshape2", "MLmetrics")
installed <- packages %in% rownames(installed.packages())
if (any(!installed)) install.packages(packages[!installed])
library(MASS)
library(nnet)
library(tidyverse)
library(caret)
library(corrplot)
library(MVN)
library(car)
library(nortest)
library(ggplot2)
library(knitr)
library(kableExtra)
library(pROC)
library(reshape2)
select <- dplyr::select
filter <- dplyr::filterwine_path <- if (file.exists("Wine.csv")) {
"Wine.csv"
} else {
file.path(dirname(knitr::current_input(dir = TRUE)), "Wine.csv")
}
if (!file.exists(wine_path)) {
stop("File Wine.csv tidak ditemukan. Pastikan Wine.csv berada di folder yang sama dengan file .Rmd ini.")
}
df <- read.csv(wine_path)
df$Customer_Segment <- as.factor(df$Customer_Segment)
cat("Dimensi data:", dim(df), "\n")## Dimensi data: 178 14
## Jumlah kelas: 3
## Label kelas: 1 2 3
## 'data.frame': 178 obs. of 14 variables:
## $ Alcohol : num 14.2 13.2 13.2 14.4 13.2 ...
## $ Malic_Acid : num 1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
## $ Ash : num 2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
## $ Ash_Alcanity : num 15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
## $ Magnesium : int 127 100 101 113 118 112 96 121 97 98 ...
## $ Total_Phenols : num 2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
## $ Flavanoids : num 3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
## $ Nonflavanoid_Phenols: num 0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
## $ Proanthocyanins : num 2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
## $ Color_Intensity : num 5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
## $ Hue : num 1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
## $ OD280 : num 3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
## $ Proline : int 1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...
## $ Customer_Segment : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
head(df) %>%
kable(caption = "6 Baris Pertama Dataset Wine") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)| Alcohol | Malic_Acid | Ash | Ash_Alcanity | Magnesium | Total_Phenols | Flavanoids | Nonflavanoid_Phenols | Proanthocyanins | Color_Intensity | Hue | OD280 | Proline | Customer_Segment |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 14.23 | 1.71 | 2.43 | 15.6 | 127 | 2.80 | 3.06 | 0.28 | 2.29 | 5.64 | 1.04 | 3.92 | 1065 | 1 |
| 13.20 | 1.78 | 2.14 | 11.2 | 100 | 2.65 | 2.76 | 0.26 | 1.28 | 4.38 | 1.05 | 3.40 | 1050 | 1 |
| 13.16 | 2.36 | 2.67 | 18.6 | 101 | 2.80 | 3.24 | 0.30 | 2.81 | 5.68 | 1.03 | 3.17 | 1185 | 1 |
| 14.37 | 1.95 | 2.50 | 16.8 | 113 | 3.85 | 3.49 | 0.24 | 2.18 | 7.80 | 0.86 | 3.45 | 1480 | 1 |
| 13.24 | 2.59 | 2.87 | 21.0 | 118 | 2.80 | 2.69 | 0.39 | 1.82 | 4.32 | 1.04 | 2.93 | 735 | 1 |
| 14.20 | 1.76 | 2.45 | 15.2 | 112 | 3.27 | 3.39 | 0.34 | 1.97 | 6.75 | 1.05 | 2.85 | 1450 | 1 |
tabel_kelas <- df %>%
count(Customer_Segment) %>%
mutate(Proporsi = round(n / sum(n) * 100, 2))
tabel_kelas %>%
kable(caption = "Distribusi Kelas Customer_Segment",
col.names = c("Kelas", "Frekuensi", "Proporsi (%)")) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Kelas | Frekuensi | Proporsi (%) |
|---|---|---|
| 1 | 59 | 33.15 |
| 2 | 71 | 39.89 |
| 3 | 48 | 26.97 |
ggplot(df, aes(x = Customer_Segment, fill = Customer_Segment)) +
geom_bar(width = 0.6, show.legend = FALSE) +
geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5, size = 4) +
scale_fill_manual(values = c("#4E79A7", "#F28E2B", "#59A14F")) +
labs(title = "Distribusi Kelas Customer_Segment", x = "Kelas", y = "Frekuensi") +
theme_minimal(base_size = 13)fitur_numerik <- df %>% select(-Customer_Segment)
summary(fitur_numerik) %>%
kable(caption = "Statistik Deskriptif Fitur Numerik") %>%
kable_styling(bootstrap_options = c("striped", "condensed"),
full_width = TRUE, font_size = 11)| Alcohol | Malic_Acid | Ash | Ash_Alcanity | Magnesium | Total_Phenols | Flavanoids | Nonflavanoid_Phenols | Proanthocyanins | Color_Intensity | Hue | OD280 | Proline | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Min. :11.03 | Min. :0.740 | Min. :1.360 | Min. :10.60 | Min. : 70.00 | Min. :0.980 | Min. :0.340 | Min. :0.1300 | Min. :0.410 | Min. : 1.280 | Min. :0.4800 | Min. :1.270 | Min. : 278.0 | |
| 1st Qu.:12.36 | 1st Qu.:1.603 | 1st Qu.:2.210 | 1st Qu.:17.20 | 1st Qu.: 88.00 | 1st Qu.:1.742 | 1st Qu.:1.205 | 1st Qu.:0.2700 | 1st Qu.:1.250 | 1st Qu.: 3.220 | 1st Qu.:0.7825 | 1st Qu.:1.938 | 1st Qu.: 500.5 | |
| Median :13.05 | Median :1.865 | Median :2.360 | Median :19.50 | Median : 98.00 | Median :2.355 | Median :2.135 | Median :0.3400 | Median :1.555 | Median : 4.690 | Median :0.9650 | Median :2.780 | Median : 673.5 | |
| Mean :13.00 | Mean :2.336 | Mean :2.367 | Mean :19.49 | Mean : 99.74 | Mean :2.295 | Mean :2.029 | Mean :0.3619 | Mean :1.591 | Mean : 5.058 | Mean :0.9574 | Mean :2.612 | Mean : 746.9 | |
| 3rd Qu.:13.68 | 3rd Qu.:3.083 | 3rd Qu.:2.558 | 3rd Qu.:21.50 | 3rd Qu.:107.00 | 3rd Qu.:2.800 | 3rd Qu.:2.875 | 3rd Qu.:0.4375 | 3rd Qu.:1.950 | 3rd Qu.: 6.200 | 3rd Qu.:1.1200 | 3rd Qu.:3.170 | 3rd Qu.: 985.0 | |
| Max. :14.83 | Max. :5.800 | Max. :3.230 | Max. :30.00 | Max. :162.00 | Max. :3.880 | Max. :5.080 | Max. :0.6600 | Max. :3.580 | Max. :13.000 | Max. :1.7100 | Max. :4.000 | Max. :1680.0 |
df_long <- df %>%
pivot_longer(cols = -Customer_Segment, names_to = "Fitur", values_to = "Nilai")
ggplot(df_long, aes(x = Customer_Segment, y = Nilai, fill = Customer_Segment)) +
geom_boxplot(outlier.colour = "red", outlier.size = 1.5, alpha = 0.8) +
scale_fill_manual(values = c("#4E79A7", "#F28E2B", "#59A14F")) +
facet_wrap(~ Fitur, scales = "free_y", ncol = 4) +
labs(title = "Boxplot Setiap Fitur per Kelas", x = "Kelas", y = "Nilai") +
theme_minimal(base_size = 11) +
theme(legend.position = "none")df_check <- df %>%
mutate(kelas_bin = ifelse(Customer_Segment == 1, 1, 0))
ggplot(df_check, aes(x = Alcohol, y = kelas_bin)) +
geom_point(alpha = 0.4, colour = "#4E79A7") +
geom_smooth(method = "glm", method.args = list(family = "binomial"),
se = TRUE, colour = "#E15759") +
labs(title = "Cek Linearitas Log-Odds: Alcohol vs Kelas 1",
x = "Alcohol", y = "Probabilitas Kelas 1") +
theme_minimal(base_size = 13)lm_proxy <- lm(as.numeric(Customer_Segment) ~ ., data = df)
dw_result <- car::durbinWatsonTest(lm_proxy)
print(dw_result)## lag Autocorrelation D-W Statistic p-value
## 1 0.2000099 1.594091 0
## Alternative hypothesis: rho != 0
##
## Interpretasi:
## Nilai DW mendekati 2 → tidak ada autokorelasi (asumsi independensi terpenuhi)
vif_values <- car::vif(lm_proxy)
vif_df <- data.frame(
Fitur = names(vif_values),
VIF = round(vif_values, 3)
) %>% arrange(desc(VIF))
vif_df %>%
mutate(Status = ifelse(VIF > 10, "⚠️ Tinggi", "✅ OK")) %>%
kable(caption = "Variance Inflation Factor (VIF)") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
row_spec(which(vif_df$VIF > 10), background = "#FFF3CD")| Fitur | VIF | Status | |
|---|---|---|---|
| Flavanoids | Flavanoids | 7.029 | ✅ OK | |
| Total_Phenols | Total_Phenols | 4.335 | ✅ OK | |
| OD280 | OD280 | 3.785 | ✅ OK | |
| Color_Intensity | Color_Intensity | 3.026 | ✅ OK | |
| Proline | Proline | 2.824 | ✅ OK | |
| Hue | Hue | 2.551 | ✅ OK | |
| Alcohol | Alcohol | 2.460 | ✅ OK | |
| Ash_Alcanity | Ash_Alcanity | 2.239 | ✅ OK | |
| Ash | Ash | 2.185 | ✅ OK | |
| Proanthocyanins | Proanthocyanins | 1.976 | ✅ OK | |
| Nonflavanoid_Phenols | Nonflavanoid_Phenols | 1.796 | ✅ OK | |
| Malic_Acid | Malic_Acid | 1.657 | ✅ OK | |
| Magnesium | Magnesium | 1.418 | ✅ OK | |
##
## Ketentuan: VIF > 10 indikasi multikolinearitas serius
deteksi_outlier <- function(x) {
Q1 <- quantile(x, 0.25)
Q3 <- quantile(x, 0.75)
IQR <- Q3 - Q1
sum(x < (Q1 - 1.5 * IQR) | x > (Q3 + 1.5 * IQR))
}
outlier_summary <- data.frame(
Fitur = names(fitur_numerik),
Jumlah_Outlier = sapply(fitur_numerik, deteksi_outlier)
) %>% arrange(desc(Jumlah_Outlier))
outlier_summary %>%
mutate(Status = ifelse(Jumlah_Outlier > 0, "⚠️ Ada outlier", "✅ Bersih")) %>%
kable(caption = "Deteksi Outlier per Fitur (Metode IQR)") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
row_spec(which(outlier_summary$Jumlah_Outlier > 0), background = "#FFF3CD")| Fitur | Jumlah_Outlier | Status | |
|---|---|---|---|
| Ash_Alcanity | Ash_Alcanity | 4 | ⚠️ Ada outlier |
| Magnesium | Magnesium | 4 | ⚠️ Ada outlier |
| Color_Intensity | Color_Intensity | 4 | ⚠️ Ada outlier |
| Malic_Acid | Malic_Acid | 3 | ⚠️ Ada outlier |
| Ash | Ash | 3 | ⚠️ Ada outlier |
| Proanthocyanins | Proanthocyanins | 2 | ⚠️ Ada outlier |
| Hue | Hue | 1 | ⚠️ Ada outlier |
| Alcohol | Alcohol | 0 | ✅ Bersih | |
| Total_Phenols | Total_Phenols | 0 | ✅ Bersih | |
| Flavanoids | Flavanoids | 0 | ✅ Bersih | |
| Nonflavanoid_Phenols | Nonflavanoid_Phenols | 0 | ✅ Bersih | |
| OD280 | OD280 | 0 | ✅ Bersih | |
| Proline | Proline | 0 | ✅ Bersih | |
shapiro_results <- sapply(fitur_numerik, function(x) shapiro.test(x)$p.value)
normalitas_df <- data.frame(
Fitur = names(shapiro_results),
p_value = round(shapiro_results, 4)
) %>%
mutate(Status = ifelse(p_value >= 0.05, "✅ Normal", "⚠️ Tidak Normal")) %>%
arrange(p_value)
normalitas_df %>%
kable(caption = "Uji Normalitas Shapiro-Wilk per Fitur (α = 0.05)") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
row_spec(which(normalitas_df$p_value < 0.05), background = "#FFF3CD")| Fitur | p_value | Status | |
|---|---|---|---|
| Malic_Acid | Malic_Acid | 0.0000 | ⚠️ Tidak Normal |
| Magnesium | Magnesium | 0.0000 | ⚠️ Tidak Normal |
| Flavanoids | Flavanoids | 0.0000 | ⚠️ Tidak Normal |
| Color_Intensity | Color_Intensity | 0.0000 | ⚠️ Tidak Normal |
| OD280 | OD280 | 0.0000 | ⚠️ Tidak Normal |
| Proline | Proline | 0.0000 | ⚠️ Tidak Normal |
| Nonflavanoid_Phenols | Nonflavanoid_Phenols | 0.0001 | ⚠️ Tidak Normal |
| Total_Phenols | Total_Phenols | 0.0044 | ⚠️ Tidak Normal |
| Proanthocyanins | Proanthocyanins | 0.0145 | ⚠️ Tidak Normal |
| Hue | Hue | 0.0174 | ⚠️ Tidak Normal |
| Alcohol | Alcohol | 0.0200 | ⚠️ Tidak Normal |
| Ash | Ash | 0.0387 | ⚠️ Tidak Normal |
| Ash_Alcanity | Ash_Alcanity | 0.2639 | ✅ Normal | |
mvn_result <- tryCatch(
mvn(data = fitur_numerik, mvnTest = "mardia"),
error = function(e) tryCatch(
mvn(data = fitur_numerik, test = "mardia"),
error = function(e2) mvn(data = fitur_numerik)
)
)
print(mvn_result$multivariateNormality)## NULL
X <- df %>% select(-Customer_Segment)
y <- df$Customer_Segment
cat("Dimensi X (fitur) :", dim(X), "\n")## Dimensi X (fitur) : 178 13
## Panjang y (target) : 178
## Distribusi y :
## y
## 1 2 3
## 59 71 48
train_index <- createDataPartition(y, p = 0.75, list = FALSE)
X_train <- X[train_index, ]
X_test <- X[-train_index, ]
y_train <- y[train_index]
y_test <- y[-train_index]
cat("=== Ukuran Data ===\n")## === Ukuran Data ===
## Train: 135 observasi | Test: 43 observasi
## === Distribusi Kelas — Train ===
## y_train
## 1 2 3
## 45 54 36
##
## === Distribusi Kelas — Test ===
## y_test
## 1 2 3
## 14 17 12
prop_train <- round(prop.table(table(y_train)) * 100, 1)
prop_test <- round(prop.table(table(y_test)) * 100, 1)
cat("\nProporsi Train (%):\n")##
## Proporsi Train (%):
## y_train
## 1 2 3
## 33.3 40.0 26.7
##
## Proporsi Test (%):
## y_test
## 1 2 3
## 32.6 39.5 27.9
preproc <- preProcess(X_train, method = c("center", "scale"))
X_train_scaled <- predict(preproc, X_train)
X_test_scaled <- predict(preproc, X_test)
mean_check <- round(colMeans(X_train_scaled), 3)
sd_check <- round(apply(X_train_scaled, 2, sd), 3)
cat("=== Verifikasi Standarisasi (Train) ===\n")## === Verifikasi Standarisasi (Train) ===
verif_df <- data.frame(Fitur = names(mean_check), Mean = mean_check, SD = sd_check)
verif_df %>%
kable(caption = "Mean dan SD Setelah Standarisasi (Train)") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE, font_size = 11)| Fitur | Mean | SD | |
|---|---|---|---|
| Alcohol | Alcohol | 0 | 1 |
| Malic_Acid | Malic_Acid | 0 | 1 |
| Ash | Ash | 0 | 1 |
| Ash_Alcanity | Ash_Alcanity | 0 | 1 |
| Magnesium | Magnesium | 0 | 1 |
| Total_Phenols | Total_Phenols | 0 | 1 |
| Flavanoids | Flavanoids | 0 | 1 |
| Nonflavanoid_Phenols | Nonflavanoid_Phenols | 0 | 1 |
| Proanthocyanins | Proanthocyanins | 0 | 1 |
| Color_Intensity | Color_Intensity | 0 | 1 |
| Hue | Hue | 0 | 1 |
| OD280 | OD280 | 0 | 1 |
| Proline | Proline | 0 | 1 |
train_data <- cbind(X_train_scaled, Customer_Segment = y_train)
test_data <- cbind(X_test_scaled, Customer_Segment = y_test)
cat("Data train siap:", nrow(train_data), "baris,", ncol(train_data), "kolom\n")## Data train siap: 135 baris, 14 kolom
## Data test siap: 43 baris, 14 kolom
##
## Preprocessing selesai. Lanjut ke Tahap 4: Fitting Model MLR & LDA.
lda_model <- lda(Customer_Segment ~ ., data = train_data)
cat("=== Model LDA berhasil di-fit ===\n\n")## === Model LDA berhasil di-fit ===
## Call:
## lda(Customer_Segment ~ ., data = train_data)
##
## Prior probabilities of groups:
## 1 2 3
## 0.3333333 0.4000000 0.2666667
##
## Group means:
## Alcohol Malic_Acid Ash Ash_Alcanity Magnesium Total_Phenols
## 1 0.8614216 -0.2652798 0.2906065 -0.7350033 0.51816969 0.89199963
## 2 -0.8807290 -0.4160982 -0.3652704 0.2438351 -0.41017334 -0.08453745
## 3 0.2443165 0.9557471 0.1846475 0.5530014 -0.03245211 -0.98819336
## Flavanoids Nonflavanoid_Phenols Proanthocyanins Color_Intensity Hue
## 1 0.93641652 -0.65222026 0.56809014 0.1834782 0.4056861
## 2 0.06768397 -0.02287984 0.01030215 -0.8350753 0.4692576
## 3 -1.27204661 0.84959508 -0.72556590 1.0232653 -1.2109941
## OD280 Proline
## 1 0.8361276 1.1462003
## 2 0.1747559 -0.7311564
## 3 -1.3072932 -0.3360158
##
## Coefficients of linear discriminants:
## LD1 LD2
## Alcohol -0.37406517 0.67904416
## Malic_Acid 0.19365178 0.36305408
## Ash -0.15212531 0.53979962
## Ash_Alcanity 0.44514061 -0.40193781
## Magnesium -0.05641899 0.02241295
## Total_Phenols 0.46830400 0.06211205
## Flavanoids -1.72429984 -0.57240210
## Nonflavanoid_Phenols -0.13439089 -0.10721773
## Proanthocyanins 0.10104462 -0.18060995
## Color_Intensity 0.83819227 0.61129435
## Hue -0.15989148 -0.44033959
## OD280 -0.84166138 0.22604124
## Proline -0.88766530 0.93254363
##
## Proportion of trace:
## LD1 LD2
## 0.6895 0.3105
# Proporsi varian yang dijelaskan tiap komponen diskriminan
prop_var <- lda_model$svd^2 / sum(lda_model$svd^2)
prop_var_df <- data.frame(
Komponen = paste0("LD", seq_along(prop_var)),
Eigenvalue = round(lda_model$svd^2, 4),
Proporsi = round(prop_var * 100, 2),
Kumulatif_Pct = round(cumsum(prop_var) * 100, 2)
)
prop_var_df %>%
kable(caption = "Explained Variance Ratio — Komponen Diskriminan LDA",
col.names = c("Komponen", "Eigenvalue", "Proporsi (%)", "Kumulatif (%)")) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Komponen | Eigenvalue | Proporsi (%) | Kumulatif (%) |
|---|---|---|---|
| LD1 | 602.9522 | 68.95 | 68.95 |
| LD2 | 271.5138 | 31.05 | 100.00 |
cat(sprintf("\nLD1 menjelaskan %.1f%% dan LD2 menjelaskan %.1f%% variansi antarkelas.\n",
prop_var_df$Proporsi[1], prop_var_df$Proporsi[2]))##
## LD1 menjelaskan 69.0% dan LD2 menjelaskan 31.1% variansi antarkelas.
# Proyeksi data train ke ruang diskriminan
lda_proj_train <- predict(lda_model, X_train_scaled)
lda_proj_test <- predict(lda_model, X_test_scaled)
df_proj <- data.frame(
LD1 = lda_proj_train$x[, 1],
LD2 = lda_proj_train$x[, 2],
Kelas = y_train
)
ggplot(df_proj, aes(x = LD1, y = LD2, colour = Kelas, shape = Kelas)) +
geom_point(size = 3, alpha = 0.85) +
stat_ellipse(level = 0.90, linewidth = 0.8, linetype = "dashed") +
scale_colour_manual(values = c("#4E79A7", "#F28E2B", "#59A14F"),
name = "Customer\nSegment") +
scale_shape_manual(values = c(16, 17, 15), name = "Customer\nSegment") +
labs(title = "Proyeksi LDA: LD1 vs LD2 (Data Train)",
subtitle = sprintf("LD1 = %.1f%% | LD2 = %.1f%% variansi antarkelas",
prop_var_df$Proporsi[1], prop_var_df$Proporsi[2]),
x = "LD1", y = "LD2") +
theme_minimal(base_size = 13) +
theme(legend.position = "right")# Koefisien (loading) tiap fitur pada LD1 dan LD2
coef_df <- as.data.frame(lda_model$scaling) %>%
rownames_to_column("Fitur") %>%
mutate(across(where(is.numeric), ~ round(., 4))) %>%
mutate(Abs_LD1 = abs(LD1)) %>%
arrange(desc(Abs_LD1)) %>%
select(-Abs_LD1)
coef_df %>%
kable(caption = "Koefisien Diskriminan LDA (Kontribusi Fitur per Komponen)") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
row_spec(1:3, background = "#E8F4FD") # highlight 3 fitur teratas| Fitur | LD1 | LD2 |
|---|---|---|
| Flavanoids | -1.7243 | -0.5724 |
| Proline | -0.8877 | 0.9325 |
| OD280 | -0.8417 | 0.2260 |
| Color_Intensity | 0.8382 | 0.6113 |
| Total_Phenols | 0.4683 | 0.0621 |
| Ash_Alcanity | 0.4451 | -0.4019 |
| Alcohol | -0.3741 | 0.6790 |
| Malic_Acid | 0.1937 | 0.3631 |
| Hue | -0.1599 | -0.4403 |
| Ash | -0.1521 | 0.5398 |
| Nonflavanoid_Phenols | -0.1344 | -0.1072 |
| Proanthocyanins | 0.1010 | -0.1806 |
| Magnesium | -0.0564 | 0.0224 |
##
## Fitur dengan kontribusi terbesar pada LD1:
## Flavanoids, Proline, OD280
# Referensi kategori: kelas 1 (baseline)
train_data$Customer_Segment <- relevel(train_data$Customer_Segment, ref = "1")
mlr_model <- multinom(Customer_Segment ~ ., data = train_data, trace = FALSE)
cat("=== Model MLR berhasil di-fit ===\n\n")## === Model MLR berhasil di-fit ===
## Call:
## multinom(formula = Customer_Segment ~ ., data = train_data, trace = FALSE)
##
## Coefficients:
## (Intercept) Alcohol Malic_Acid Ash Ash_Alcanity Magnesium
## 2 -1.156250 -11.881344 -3.2514642 -10.016165 9.321990 -0.5202924
## 3 -5.222332 -2.068507 0.9017964 4.014963 4.038532 0.9348684
## Total_Phenols Flavanoids Nonflavanoid_Phenols Proanthocyanins Color_Intensity
## 2 -2.097594 -1.927518 5.725801 0.9563483 -12.23829
## 3 -4.884002 -12.655078 1.969334 -2.6212234 12.57532
## Hue OD280 Proline
## 2 2.968839 -4.645503 -16.6173993
## 3 -10.354985 -9.819660 0.3185086
##
## Std. Errors:
## (Intercept) Alcohol Malic_Acid Ash Ash_Alcanity Magnesium
## 2 18353.62 4649.292 8035.895 8527.676 15182.19 3027.783
## 3 32757.73 40833.784 12634.577 52217.100 30516.45 78036.250
## Total_Phenols Flavanoids Nonflavanoid_Phenols Proanthocyanins Color_Intensity
## 2 7274.864 7709.677 6011.432 2625.791 20202.27
## 3 52549.791 61596.183 39871.074 59249.474 44300.84
## Hue OD280 Proline
## 2 11248.28 9895.901 11640.20
## 3 33777.78 58599.008 18731.91
##
## Residual Deviance: 0.0001872249
## AIC: 56.00019
coef_mlr <- summary(mlr_model)$coefficients
coef_mlr_df <- as.data.frame(t(coef_mlr)) %>%
rownames_to_column("Fitur") %>%
rename_with(~ paste0("LogOdds_Kelas", .), -Fitur)
coef_mlr_df %>%
mutate(across(where(is.numeric), ~ round(., 4))) %>%
kable(caption = "Koefisien β (Log Odds) — MLR per Kategori vs Baseline (Kelas 1)") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = TRUE, font_size = 11)| Fitur | LogOdds_Kelas2 | LogOdds_Kelas3 |
|---|---|---|
| (Intercept) | -1.1563 | -5.2223 |
| Alcohol | -11.8813 | -2.0685 |
| Malic_Acid | -3.2515 | 0.9018 |
| Ash | -10.0162 | 4.0150 |
| Ash_Alcanity | 9.3220 | 4.0385 |
| Magnesium | -0.5203 | 0.9349 |
| Total_Phenols | -2.0976 | -4.8840 |
| Flavanoids | -1.9275 | -12.6551 |
| Nonflavanoid_Phenols | 5.7258 | 1.9693 |
| Proanthocyanins | 0.9563 | -2.6212 |
| Color_Intensity | -12.2383 | 12.5753 |
| Hue | 2.9688 | -10.3550 |
| OD280 | -4.6455 | -9.8197 |
| Proline | -16.6174 | 0.3185 |
# RRR = exp(β)
rrr_mlr <- exp(coef_mlr)
rrr_df <- as.data.frame(t(rrr_mlr)) %>%
rownames_to_column("Fitur") %>%
rename_with(~ paste0("RRR_Kelas", .), -Fitur)
rrr_df %>%
mutate(across(where(is.numeric), ~ round(., 4))) %>%
kable(caption = "Relative Risk Ratio (RRR = exp(β)) — MLR") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Fitur | RRR_Kelas2 | RRR_Kelas3 |
|---|---|---|
| (Intercept) | 0.3147 | 0.0054 |
| Alcohol | 0.0000 | 0.1264 |
| Malic_Acid | 0.0387 | 2.4640 |
| Ash | 0.0000 | 55.4213 |
| Ash_Alcanity | 11181.2158 | 56.7430 |
| Magnesium | 0.5943 | 2.5469 |
| Total_Phenols | 0.1228 | 0.0076 |
| Flavanoids | 0.1455 | 0.0000 |
| Nonflavanoid_Phenols | 306.6788 | 7.1659 |
| Proanthocyanins | 2.6022 | 0.0727 |
| Color_Intensity | 0.0000 | 289329.4550 |
| Hue | 19.4693 | 0.0000 |
| OD280 | 0.0096 | 0.0001 |
| Proline | 0.0000 | 1.3751 |
# Bar chart RRR
rrr_long <- rrr_df %>%
filter(Fitur != "(Intercept)") %>%
pivot_longer(-Fitur, names_to = "Kelas", values_to = "RRR")
ggplot(rrr_long, aes(x = reorder(Fitur, RRR), y = RRR, fill = Kelas)) +
geom_bar(stat = "identity", position = "dodge", alpha = 0.85) +
geom_hline(yintercept = 1, linetype = "dashed", colour = "grey40") +
scale_fill_manual(values = c("#F28E2B", "#59A14F")) +
coord_flip() +
labs(title = "Relative Risk Ratio (RRR) per Fitur — MLR",
subtitle = "Garis putus-putus = RRR 1 (tidak ada efek vs baseline)",
x = "Fitur", y = "RRR") +
theme_minimal(base_size = 12)# Uji Wald parsial: z = koef / SE, p-value = 2 * (1 - Φ(|z|))
se_mlr <- summary(mlr_model)$standard.errors
z_stat <- coef_mlr / se_mlr
p_wald <- 2 * (1 - pnorm(abs(z_stat)))
wald_df <- as.data.frame(t(p_wald)) %>%
rownames_to_column("Fitur") %>%
rename_with(~ paste0("p_Kelas", .), -Fitur) %>%
mutate(across(where(is.numeric), ~ round(., 4)))
wald_df %>%
kable(caption = "P-value Uji Wald Parsial per Fitur (α = 0.05)") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
row_spec(which(apply(wald_df[, -1], 1, function(r) any(r < 0.05))),
background = "#D5F5E3")| Fitur | p_Kelas2 | p_Kelas3 |
|---|---|---|
| (Intercept) | 0.9999 | 0.9999 |
| Alcohol | 0.9980 | 1.0000 |
| Malic_Acid | 0.9997 | 0.9999 |
| Ash | 0.9991 | 0.9999 |
| Ash_Alcanity | 0.9995 | 0.9999 |
| Magnesium | 0.9999 | 1.0000 |
| Total_Phenols | 0.9998 | 0.9999 |
| Flavanoids | 0.9998 | 0.9998 |
| Nonflavanoid_Phenols | 0.9992 | 1.0000 |
| Proanthocyanins | 0.9997 | 1.0000 |
| Color_Intensity | 0.9995 | 0.9998 |
| Hue | 0.9998 | 0.9998 |
| OD280 | 0.9996 | 0.9999 |
| Proline | 0.9989 | 1.0000 |
##
## Fitur signifikan (p < 0.05 minimal satu kelas):
sig_fitur <- wald_df$Fitur[apply(wald_df[, -1], 1,
function(r) any(r < 0.05, na.rm = TRUE))]
cat(paste(sig_fitur, collapse = ", "), "\n")# LDA
pred_lda <- predict(lda_model, X_test_scaled)$class
# MLR
pred_mlr <- predict(mlr_model, X_test_scaled)
cat("Distribusi prediksi LDA:", table(pred_lda), "\n")## Distribusi prediksi LDA: 14 17 12
## Distribusi prediksi MLR: 14 17 12
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3
## 1 14 0 0
## 2 0 17 0
## 3 0 0 12
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.9178, 1)
## No Information Rate : 0.3953
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3
## Sensitivity 1.0000 1.0000 1.0000
## Specificity 1.0000 1.0000 1.0000
## Pos Pred Value 1.0000 1.0000 1.0000
## Neg Pred Value 1.0000 1.0000 1.0000
## Prevalence 0.3256 0.3953 0.2791
## Detection Rate 0.3256 0.3953 0.2791
## Detection Prevalence 0.3256 0.3953 0.2791
## Balanced Accuracy 1.0000 1.0000 1.0000
# Heatmap Confusion Matrix LDA
cm_lda_df <- as.data.frame(cm_lda$table)
colnames(cm_lda_df) <- c("Prediksi", "Aktual", "Frekuensi")
ggplot(cm_lda_df, aes(x = Aktual, y = Prediksi, fill = Frekuensi)) +
geom_tile(colour = "white") +
geom_text(aes(label = Frekuensi), size = 6, fontface = "bold") +
scale_fill_gradient(low = "#EBF5FB", high = "#1A5276") +
labs(title = "Confusion Matrix — LDA (Test Set)",
x = "Aktual", y = "Prediksi") +
theme_minimal(base_size = 13)pred_mlr_fac <- factor(pred_mlr, levels = levels(y_test))
cm_mlr <- confusionMatrix(pred_mlr_fac, y_test)
print(cm_mlr)## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3
## 1 14 0 0
## 2 0 17 0
## 3 0 0 12
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.9178, 1)
## No Information Rate : 0.3953
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3
## Sensitivity 1.0000 1.0000 1.0000
## Specificity 1.0000 1.0000 1.0000
## Pos Pred Value 1.0000 1.0000 1.0000
## Neg Pred Value 1.0000 1.0000 1.0000
## Prevalence 0.3256 0.3953 0.2791
## Detection Rate 0.3256 0.3953 0.2791
## Detection Prevalence 0.3256 0.3953 0.2791
## Balanced Accuracy 1.0000 1.0000 1.0000
# Heatmap Confusion Matrix MLR
cm_mlr_df <- as.data.frame(cm_mlr$table)
colnames(cm_mlr_df) <- c("Prediksi", "Aktual", "Frekuensi")
ggplot(cm_mlr_df, aes(x = Aktual, y = Prediksi, fill = Frekuensi)) +
geom_tile(colour = "white") +
geom_text(aes(label = Frekuensi), size = 6, fontface = "bold") +
scale_fill_gradient(low = "#FDFEFE", high = "#7D6608") +
labs(title = "Confusion Matrix — MLR (Test Set)",
x = "Aktual", y = "Prediksi") +
theme_minimal(base_size = 13)# Fungsi hitung F1 macro dan weighted
get_metrics <- function(cm_obj, model_name) {
stats <- cm_obj$byClass
support <- rowSums(cm_obj$table) # jumlah observasi per kelas (aktual)
f1_per <- 2 * stats[, "Sensitivity"] * stats[, "Pos Pred Value"] /
(stats[, "Sensitivity"] + stats[, "Pos Pred Value"])
f1_macro <- mean(f1_per, na.rm = TRUE)
f1_weighted <- weighted.mean(f1_per, w = support, na.rm = TRUE)
acc <- cm_obj$overall["Accuracy"]
data.frame(Model = model_name,
Akurasi = round(acc, 4),
F1_Macro = round(f1_macro, 4),
F1_Weighted = round(f1_weighted, 4))
}
metrics_lda <- get_metrics(cm_lda, "LDA")
metrics_mlr <- get_metrics(cm_mlr, "MLR")
metrics_all <- bind_rows(metrics_lda, metrics_mlr)
metrics_all %>%
kable(caption = "Ringkasan Akurasi & F1 — LDA vs MLR (Test Set)",
col.names = c("Model", "Akurasi", "F1 Macro", "F1 Weighted")) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
row_spec(which.max(metrics_all$Akurasi), background = "#D5F5E3")| Model | Akurasi | F1 Macro | F1 Weighted | |
|---|---|---|---|---|
| Accuracy…1 | LDA | 1 | 1 | 1 |
| Accuracy…2 | MLR | 1 | 1 | 1 |
## package 'MLmetrics' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Asus\AppData\Local\Temp\Rtmpgpf847\downloaded_packages
set.seed(42)
ctrl <- trainControl(method = "cv", number = 9,
classProbs = TRUE,
summaryFunction = multiClassSummary,
savePredictions = "final")
# Caret mensyaratkan level faktor berupa valid R variable name
# Ubah "1","2","3" → "Seg1","Seg2","Seg3"
y_train_cv <- factor(paste0("Seg", y_train),
levels = paste0("Seg", levels(y_train)))
# Data gabung untuk caret train
train_cv <- cbind(X_train_scaled, Customer_Segment = y_train_cv)
# CV LDA
cv_lda <- train(Customer_Segment ~ .,
data = train_cv,
method = "lda",
trControl = ctrl,
metric = "Accuracy")
# CV MLR
cv_mlr <- train(Customer_Segment ~ .,
data = train_cv,
method = "multinom",
trControl = ctrl,
metric = "Accuracy",
trace = FALSE)
cv_summary <- data.frame(
Model = c("LDA", "MLR"),
CV_Accuracy_Mean = c(
round(mean(cv_lda$resample$Accuracy), 4),
round(mean(cv_mlr$resample$Accuracy), 4)
),
CV_Accuracy_SD = c(
round(sd(cv_lda$resample$Accuracy), 4),
round(sd(cv_mlr$resample$Accuracy), 4)
)
)
cv_summary %>%
kable(caption = "Cross Validation 9-Fold — Akurasi LDA vs MLR",
col.names = c("Model", "Mean Accuracy", "SD Accuracy")) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Model | Mean Accuracy | SD Accuracy |
|---|---|---|
| LDA | 0.9778 | 0.0333 |
| MLR | 0.9704 | 0.0351 |
# Probabilitas dari LDA dan MLR pada test set
prob_lda <- predict(lda_model, X_test_scaled)$posterior
prob_mlr <- predict(mlr_model, X_test_scaled, type = "probs")
kelas_list <- levels(y_test)
auc_results <- data.frame(
Kelas = character(),
AUC_LDA = numeric(),
AUC_MLR = numeric()
)
par(mfrow = c(1, 3))
for (k in kelas_list) {
y_bin <- ifelse(y_test == k, 1, 0)
roc_lda_k <- roc(y_bin, prob_lda[, k], quiet = TRUE)
roc_mlr_k <- roc(y_bin, prob_mlr[, k], quiet = TRUE)
auc_results <- rbind(auc_results,
data.frame(Kelas = k,
AUC_LDA = round(auc(roc_lda_k), 4),
AUC_MLR = round(auc(roc_mlr_k), 4)))
plot(roc_lda_k, col = "#4E79A7", main = paste("ROC — Kelas", k),
lwd = 2, legacy.axes = TRUE)
plot(roc_mlr_k, col = "#F28E2B", add = TRUE, lwd = 2, lty = 2)
legend("bottomright",
legend = c(sprintf("LDA (AUC=%.3f)", auc(roc_lda_k)),
sprintf("MLR (AUC=%.3f)", auc(roc_mlr_k))),
col = c("#4E79A7", "#F28E2B"), lty = c(1, 2), lwd = 2, cex = 0.85)
}par(mfrow = c(1, 1))
auc_results %>%
kable(caption = "AUC per Kelas (OvR) — LDA vs MLR") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Kelas | AUC_LDA | AUC_MLR |
|---|---|---|
| 1 | 1 | 1 |
| 2 | 1 | 1 |
| 3 | 1 | 1 |
df_train_proj <- data.frame(
LD1 = lda_proj_train$x[, 1],
LD2 = lda_proj_train$x[, 2],
Kelas = y_train,
Set = "Train"
)
df_test_proj <- data.frame(
LD1 = lda_proj_test$x[, 1],
LD2 = lda_proj_test$x[, 2],
Kelas = y_test,
Set = "Test"
)
df_all_proj <- bind_rows(df_train_proj, df_test_proj)
ggplot(df_all_proj, aes(x = LD1, y = LD2, colour = Kelas, shape = Set)) +
geom_point(size = 3, alpha = 0.8) +
stat_ellipse(data = df_train_proj, aes(colour = Kelas),
level = 0.90, linewidth = 0.7, linetype = "dashed") +
scale_colour_manual(values = c("#4E79A7", "#F28E2B", "#59A14F"),
name = "Customer\nSegment") +
scale_shape_manual(values = c(16, 4), name = "Dataset") +
labs(title = "LDA Scatter Plot — LD1 vs LD2",
subtitle = "Titik bulat = Train | Tanda silang = Test | Elips = batas 90% CI",
x = paste0("LD1 (", prop_var_df$Proporsi[1], "%)"),
y = paste0("LD2 (", prop_var_df$Proporsi[2], "%)")) +
theme_minimal(base_size = 13)rrr_long_plot <- rrr_df %>%
filter(Fitur != "(Intercept)") %>%
pivot_longer(-Fitur, names_to = "Kelas", values_to = "RRR") %>%
mutate(Kelas = gsub("RRR_Kelas", "Kelas ", Kelas))
ggplot(rrr_long_plot, aes(x = reorder(Fitur, -RRR), y = RRR, fill = Kelas)) +
geom_bar(stat = "identity", position = "dodge", alpha = 0.85, width = 0.7) +
geom_hline(yintercept = 1, linetype = "dashed", colour = "grey30", linewidth = 0.7) +
scale_fill_manual(values = c("#F28E2B", "#59A14F")) +
labs(title = "Relative Risk Ratio (RRR) per Fitur — Multinomial Logit",
subtitle = "Baseline = Kelas 1 | RRR > 1: risiko naik | RRR < 1: risiko turun",
x = "Fitur", y = "RRR") +
theme_minimal(base_size = 12) +
theme(axis.text.x = element_text(angle = 35, hjust = 1))coef_long <- as.data.frame(t(coef_mlr)) %>%
rownames_to_column("Fitur") %>%
filter(Fitur != "(Intercept)") %>%
pivot_longer(-Fitur, names_to = "Kelas", values_to = "Beta") %>%
mutate(Kelas = paste("Kelas", Kelas))
ggplot(coef_long, aes(x = Kelas, y = Fitur, fill = Beta)) +
geom_tile(colour = "white") +
geom_text(aes(label = round(Beta, 2)), size = 3.5) +
scale_fill_gradient2(low = "#C0392B", mid = "white", high = "#1A5276",
midpoint = 0, name = "β") +
labs(title = "Heatmap Koefisien MLR (β) per Fitur & Kelas",
subtitle = "Merah = efek negatif | Biru = efek positif vs baseline (Kelas 1)",
x = "Kelas (vs Baseline Kelas 1)", y = "Fitur") +
theme_minimal(base_size = 12) +
theme(axis.text.x = element_text(size = 11))# Normalisasi per baris (proporsi prediksi benar per kelas aktual)
cm_lda_norm <- as.data.frame(cm_lda$table)
colnames(cm_lda_norm) <- c("Prediksi", "Aktual", "Frekuensi")
cm_lda_norm$Model <- "LDA"
cm_mlr_norm <- as.data.frame(cm_mlr$table)
colnames(cm_mlr_norm) <- c("Prediksi", "Aktual", "Frekuensi")
cm_mlr_norm$Model <- "MLR"
cm_both <- bind_rows(cm_lda_norm, cm_mlr_norm)
ggplot(cm_both, aes(x = Aktual, y = Prediksi, fill = Frekuensi)) +
geom_tile(colour = "white") +
geom_text(aes(label = Frekuensi), size = 6, fontface = "bold") +
scale_fill_gradient(low = "#FDFEFE", high = "#1A5276") +
facet_wrap(~ Model) +
labs(title = "Confusion Matrix — LDA vs MLR (Test Set)",
x = "Aktual", y = "Prediksi") +
theme_minimal(base_size = 13)## =================================================================
## INTERPRETASI MODEL LDA
## =================================================================
## 1. KOMPONEN DISKRIMINAN
## LD1 menjelaskan 68.95 % variansi antarkelas.
## LD2 menjelaskan 31.05 % variansi antarkelas.
## → Total: 100 % variansi dapat dijelaskan.
## 2. FITUR TERPENTING (loading |LD1| tertinggi):
top3 <- head(coef_df$Fitur, 3)
for (i in seq_along(top3)) {
ld1_val <- coef_df$LD1[coef_df$Fitur == top3[i]]
cat(sprintf(" %d. %s (LD1 = %.4f)\n", i, top3[i], ld1_val))
}## 1. Flavanoids (LD1 = -1.7243)
## 2. Proline (LD1 = -0.8877)
## 3. OD280 (LD1 = -0.8417)
baseline_post <- colMeans(predict(lda_model, X_test_scaled)$posterior)
delta_df <- lapply(names(X_test_scaled), function(feat) {
X_shifted <- X_test_scaled
X_shifted[[feat]] <- X_shifted[[feat]] + 1 # naikkan 1 SD
post_shifted <- colMeans(predict(lda_model, X_shifted)$posterior)
delta <- post_shifted - baseline_post
data.frame(Fitur = feat,
Kelas = names(delta),
Delta_Prob = round(delta, 4))
}) %>% bind_rows()
ggplot(delta_df, aes(x = reorder(Fitur, abs(Delta_Prob)), y = Delta_Prob, fill = Kelas)) +
geom_bar(stat = "identity", position = "dodge", alpha = 0.85) +
geom_hline(yintercept = 0, linetype = "dashed") +
scale_fill_manual(values = c("#4E79A7", "#F28E2B", "#59A14F")) +
coord_flip() +
labs(title = "Marginal Effects LDA — Perubahan Probabilitas per Kelas",
subtitle = "Efek kenaikan 1 SD setiap fitur terhadap probabilitas posterior",
x = "Fitur", y = "ΔProb") +
theme_minimal(base_size = 12)## sig_fitur:
## ncol sig_coef: 0
## INTERPRETASI MODEL MLR
## 1. LOG ODDS vs BASELINE (Kelas 1):
## Koefisien positif → probabilitas kelas tersebut relatif lebih
## tinggi dibanding kelas 1 saat fitur meningkat.
## 2. RELATIVE RISK RATIO (RRR = exp(β)):
## RRR > 1 → fitur meningkatkan risiko relatif masuk kelas tsb.
## RRR < 1 → fitur menurunkan risiko relatif masuk kelas tsb.
## 3. UJI WALD PARSIAL:
## Fitur signifikan (p < 0.05) minimal satu kelas:
# Tabel ringkas log-odds & RRR fitur signifikan
cat("Ringkasan koefisien fitur paling signifikan:\n")## Ringkasan koefisien fitur paling signifikan:
sig_coef <- coef_mlr[, sig_fitur, drop = FALSE]
sig_rrr <- exp(sig_coef)
sig_coef <- coef_mlr[, sig_fitur, drop = FALSE]
sig_rrr <- exp(sig_coef)
n_show <- min(4, ncol(sig_coef))
if (n_show > 0) {
cbind(
LogOdds = round(sig_coef[, 1:n_show, drop = FALSE], 3),
RRR = round(sig_rrr[, 1:n_show, drop = FALSE], 3)
) %>%
kable(caption = "Log Odds & RRR Fitur Signifikan Teratas") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
} else {
cat("Tidak ada fitur signifikan yang ditemukan.\n")
}## Tidak ada fitur signifikan yang ditemukan.
# Visualisasi p-value Wald per fitur & kelas
wald_long <- wald_df %>%
filter(Fitur != "(Intercept)") %>%
pivot_longer(-Fitur, names_to = "Kelas", values_to = "p_value") %>%
mutate(Signifikan = ifelse(p_value < 0.05, "Signifikan", "Tidak"))
ggplot(wald_long, aes(x = reorder(Fitur, p_value), y = -log10(p_value + 1e-6),
fill = Signifikan)) +
geom_bar(stat = "identity", position = "dodge", alpha = 0.85) +
geom_hline(yintercept = -log10(0.05), linetype = "dashed",
colour = "red", linewidth = 0.7) +
scale_fill_manual(values = c("#59A14F", "#E15759")) +
facet_wrap(~ Kelas) +
coord_flip() +
labs(title = "Uji Wald Parsial MLR — -log10(p-value) per Fitur & Kelas",
subtitle = "Garis merah = threshold p = 0.05",
x = "Fitur", y = "-log10(p-value)") +
theme_minimal(base_size = 11)# AUC macro
auc_macro_lda <- round(mean(auc_results$AUC_LDA), 4)
auc_macro_mlr <- round(mean(auc_results$AUC_MLR), 4)
tabel_final <- data.frame(
Kriteria = c(
"Akurasi Test",
"F1 Macro",
"F1 Weighted",
"CV 9-Fold Mean Accuracy",
"CV 9-Fold SD Accuracy",
"AUC Macro (OvR)",
"Asumsi Utama",
"Interpretabilitas"
),
LDA = c(
paste0(round(metrics_lda$Akurasi * 100, 1), "%"),
round(metrics_lda$F1_Macro, 4),
round(metrics_lda$F1_Weighted, 4),
paste0(round(cv_summary$CV_Accuracy_Mean[1] * 100, 1), "%"),
round(cv_summary$CV_Accuracy_SD[1], 4),
auc_macro_lda,
"Normalitas multivariat, Σ homogen",
"Koefisien diskriminan & plot LD"
),
MLR = c(
paste0(round(metrics_mlr$Akurasi * 100, 1), "%"),
round(metrics_mlr$F1_Macro, 4),
round(metrics_mlr$F1_Weighted, 4),
paste0(round(cv_summary$CV_Accuracy_Mean[2] * 100, 1), "%"),
round(cv_summary$CV_Accuracy_SD[2], 4),
auc_macro_mlr,
"Linearitas log-odds, independensi obs",
"Log odds, RRR, uji Wald"
)
)
tabel_final %>%
kable(caption = "Tabel Perbandingan Lengkap: LDA vs MLR") %>%
kable_styling(bootstrap_options = c("striped", "hover", "bordered"),
full_width = FALSE) %>%
column_spec(1, bold = TRUE) %>%
row_spec(0, background = "#2E86AB", color = "white")| Kriteria | LDA | MLR |
|---|---|---|
| Akurasi Test | 100% | 100% |
| F1 Macro | 1 | 1 |
| F1 Weighted | 1 | 1 |
| CV 9-Fold Mean Accuracy | 97.8% | 97% |
| CV 9-Fold SD Accuracy | 0.0333 | 0.0351 |
| AUC Macro (OvR) | 1 | 1 |
| Asumsi Utama | Normalitas multivariat, Σ homogen | Linearitas log-odds, independensi obs |
| Interpretabilitas | Koefisien diskriminan & plot LD | Log odds, RRR, uji Wald |
kelebihan_df <- data.frame(
Aspek = c("Kelebihan utama",
"Keunggulan prediksi",
"Kekurangan utama",
"Masalah asumsi",
"Interpretasi output"),
LDA = c(
"Efisien dengan data kecil, dimensi reduksi alami (LD plot)",
"Sangat baik jika asumsi normalitas & homogenitas Σ terpenuhi",
"Sensitif terhadap pelanggaran asumsi normalitas & outlier",
"Asumsi normalitas multivariat sering dilanggar di data nyata",
"Koefisien diskriminan & proyeksi LD mudah divisualisasikan"
),
MLR = c(
"Tidak mensyaratkan normalitas distribusi fitur",
"Lebih robust pada data yang tidak normal atau ada outlier",
"Butuh banyak data, rentan multikolinearitas",
"Asumsi linearitas log-odds, perlu dicek lebih ketat",
"Log odds & RRR mudah diinterpretasi secara statistik"
)
)
kelebihan_df %>%
kable(caption = "Kelebihan & Kekurangan: LDA vs MLR") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
column_spec(1, bold = TRUE, width = "20%") %>%
column_spec(2, width = "40%") %>%
column_spec(3, width = "40%")| Aspek | LDA | MLR |
|---|---|---|
| Kelebihan utama | Efisien dengan data kecil, dimensi reduksi alami (LD plot) | Tidak mensyaratkan normalitas distribusi fitur |
| Keunggulan prediksi | Sangat baik jika asumsi normalitas & homogenitas Σ terpenuhi | Lebih robust pada data yang tidak normal atau ada outlier |
| Kekurangan utama | Sensitif terhadap pelanggaran asumsi normalitas & outlier | Butuh banyak data, rentan multikolinearitas |
| Masalah asumsi | Asumsi normalitas multivariat sering dilanggar di data nyata | Asumsi linearitas log-odds, perlu dicek lebih ketat |
| Interpretasi output | Koefisien diskriminan & proyeksi LD mudah divisualisasikan | Log odds & RRR mudah diinterpretasi secara statistik |