1 Persiapan

1.1 Load Package

# install.packages(c("MASS", "nnet", "ggplot2", "dplyr", "caret",
#                    "knitr", "kableExtra", "corrplot", "tidyr", "car"))

library(MASS)
library(nnet)
library(ggplot2)
library(dplyr)
library(caret)
library(knitr)
library(kableExtra)
library(corrplot)
library(tidyr)
library(car)

1.2 Load & Preprocessing Data

penguins_raw <- read.csv("penguins_size.csv", stringsAsFactors = FALSE)
str(penguins_raw)
## 'data.frame':    344 obs. of  7 variables:
##  $ species          : chr  "Adelie" "Adelie" "Adelie" "Adelie" ...
##  $ island           : chr  "Torgersen" "Torgersen" "Torgersen" "Torgersen" ...
##  $ culmen_length_mm : num  39.1 39.5 40.3 NA 36.7 39.3 38.9 39.2 34.1 42 ...
##  $ culmen_depth_mm  : num  18.7 17.4 18 NA 19.3 20.6 17.8 19.6 18.1 20.2 ...
##  $ flipper_length_mm: int  181 186 195 NA 193 190 181 195 193 190 ...
##  $ body_mass_g      : int  3750 3800 3250 NA 3450 3650 3625 4675 3475 4250 ...
##  $ sex              : chr  "MALE" "FEMALE" "FEMALE" NA ...
penguins <- penguins_raw %>%
  mutate(across(everything(), ~ifelse(. %in% c("NA", ".", ""), NA, .))) %>%
  mutate(
    culmen_length_mm  = as.numeric(culmen_length_mm),
    culmen_depth_mm   = as.numeric(culmen_depth_mm),
    flipper_length_mm = as.numeric(flipper_length_mm),
    body_mass_g       = as.numeric(body_mass_g),
    species = as.factor(species),
    island  = as.factor(island),
    sex     = as.factor(toupper(sex))
  ) %>%
  na.omit()

cat("Dimensi data setelah cleaning:", nrow(penguins), "x", ncol(penguins), "\n")
## Dimensi data setelah cleaning: 333 x 7
cat("Distribusi spesies:\n")
## Distribusi spesies:
print(table(penguins$species))
## 
##    Adelie Chinstrap    Gentoo 
##       146        68       119

2 Statistika Deskriptif

desc_stats <- penguins %>%
  group_by(species) %>%
  summarise(
    n               = n(),
    culmen_len_mean = round(mean(culmen_length_mm), 2),
    culmen_dep_mean = round(mean(culmen_depth_mm), 2),
    flipper_mean    = round(mean(flipper_length_mm), 2),
    mass_mean       = round(mean(body_mass_g), 1)
  )

kable(desc_stats,
      col.names = c("Spesies", "n", "Culmen Length (mm)",
                    "Culmen Depth (mm)", "Flipper Length (mm)", "Body Mass (g)"),
      caption = "Tabel 1. Rata-rata Variabel Prediktor per Spesies") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)
Tabel 1. Rata-rata Variabel Prediktor per Spesies
Spesies n Culmen Length (mm) Culmen Depth (mm) Flipper Length (mm) Body Mass (g)
Adelie 146 38.82 18.35 190.10 3706.2
Chinstrap 68 48.83 18.42 195.82 3733.1
Gentoo 119 47.57 15.00 217.24 5092.4
penguins_long <- penguins %>%
  select(species, culmen_length_mm, culmen_depth_mm, flipper_length_mm, body_mass_g) %>%
  pivot_longer(-species, names_to = "variable", values_to = "value") %>%
  mutate(variable = case_when(
    variable == "culmen_length_mm"  ~ "Culmen Length (mm)",
    variable == "culmen_depth_mm"   ~ "Culmen Depth (mm)",
    variable == "flipper_length_mm" ~ "Flipper Length (mm)",
    variable == "body_mass_g"       ~ "Body Mass (g)",
    TRUE ~ variable
  ))

ggplot(penguins_long, aes(x = species, y = value, fill = species)) +
  geom_boxplot(alpha = 0.7, outlier.shape = 21) +
  facet_wrap(~variable, scales = "free_y", ncol = 2) +
  scale_fill_manual(values = c("#FF6B6B", "#4ECDC4", "#45B7D1")) +
  labs(title = "Gambar 1. Distribusi Variabel Prediktor per Spesies",
       x = "Spesies", y = "Nilai", fill = "Spesies") +
  theme_bw() +
  theme(legend.position = "bottom",
        strip.background = element_rect(fill = "#2C3E50"),
        strip.text = element_text(color = "white", face = "bold"))

ggplot(penguins, aes(x = culmen_length_mm, y = flipper_length_mm,
                     color = species, shape = species)) +
  geom_point(alpha = 0.7, size = 2.5) +
  scale_color_manual(values = c("#E74C3C", "#27AE60", "#2980B9")) +
  labs(title = "Gambar 2. Scatter Plot Culmen Length vs Flipper Length",
       subtitle = "Berdasarkan Spesies Penguin",
       x = "Culmen Length (mm)", y = "Flipper Length (mm)",
       color = "Spesies", shape = "Spesies") +
  theme_bw() +
  theme(legend.position = "bottom")

cor_matrix <- cor(penguins %>%
                    select(culmen_length_mm, culmen_depth_mm,
                           flipper_length_mm, body_mass_g))

corrplot(cor_matrix, method = "color", type = "upper",
         addCoef.col = "black", number.cex = 0.8,
         tl.col = "black", tl.srt = 45,
         col = colorRampPalette(c("#D73027", "white", "#4575B4"))(200),
         title = "Gambar 3. Matriks Korelasi Variabel Prediktor",
         mar = c(0, 0, 2, 0))

3 Analisis Diskriminan

3.1 Uji Asumsi

3.1.1 Uji Normalitas (Shapiro-Wilk)

vars       <- c("culmen_length_mm", "culmen_depth_mm", "flipper_length_mm", "body_mass_g")
var_labels <- c("Culmen Length", "Culmen Depth", "Flipper Length", "Body Mass")

normalitas_result <- data.frame()
for (sp in levels(penguins$species)) {
  for (i in seq_along(vars)) {
    x  <- penguins %>% filter(species == sp) %>% pull(vars[i])
    sw <- shapiro.test(x)
    normalitas_result <- rbind(normalitas_result, data.frame(
      Spesies    = sp,
      Variabel   = var_labels[i],
      W          = round(sw$statistic, 4),
      p_value    = round(sw$p.value, 4),
      Kesimpulan = ifelse(sw$p.value > 0.05, "Normal", "Tidak Normal")
    ))
  }
}

kable(normalitas_result, row.names = FALSE,
      caption = "Tabel 2. Uji Shapiro-Wilk Normalitas per Variabel per Spesies") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
  row_spec(which(normalitas_result$Kesimpulan == "Tidak Normal"),
           background = "#FFF3CD")
Tabel 2. Uji Shapiro-Wilk Normalitas per Variabel per Spesies
Spesies Variabel W p_value Kesimpulan
Adelie Culmen Length 0.9929 0.6848 Normal
Adelie Culmen Depth 0.9847 0.1047 Normal
Adelie Flipper Length 0.9934 0.7427 Normal
Adelie Body Mass 0.9812 0.0423 Tidak Normal
Chinstrap Culmen Length 0.9752 0.1941 Normal
Chinstrap Culmen Depth 0.9727 0.1418 Normal
Chinstrap Flipper Length 0.9889 0.8106 Normal
Chinstrap Body Mass 0.9845 0.5605 Normal
Gentoo Culmen Length 0.9738 0.0199 Tidak Normal
Gentoo Culmen Depth 0.9776 0.0438 Tidak Normal
Gentoo Flipper Length 0.9615 0.0018 Tidak Normal
Gentoo Body Mass 0.9861 0.2605 Normal

3.1.2 Uji Homogenitas Varians (Levene)

levene_results <- data.frame()
for (i in seq_along(vars)) {
  lev <- car::leveneTest(penguins[[vars[i]]] ~ penguins$species)
  levene_results <- rbind(levene_results, data.frame(
    Variabel   = var_labels[i],
    F_stat     = round(lev$`F value`[1], 4),
    p_value    = round(lev$`Pr(>F)`[1], 4),
    Kesimpulan = ifelse(lev$`Pr(>F)`[1] > 0.05, "Homogen", "Tidak Homogen")
  ))
}

kable(levene_results, row.names = FALSE,
      caption = "Tabel 3. Uji Levene Homogenitas Varians per Variabel") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
  row_spec(which(levene_results$Kesimpulan == "Tidak Homogen"),
           background = "#FFF3CD")
Tabel 3. Uji Levene Homogenitas Varians per Variabel
Variabel F_stat p_value Kesimpulan
Culmen Length 2.2855 0.1033 Homogen
Culmen Depth 1.9124 0.1494 Homogen
Flipper Length 0.4428 0.6426 Homogen
Body Mass 5.1349 0.0064 Tidak Homogen

3.2 Pembagian Data Training dan Testing

set.seed(42)
train_idx  <- createDataPartition(penguins$species, p = 0.8, list = FALSE)
train_data <- penguins[train_idx, ]
test_data  <- penguins[-train_idx, ]

cat("Data training:", nrow(train_data), "observasi\n")
## Data training: 268 observasi
cat("Data testing :", nrow(test_data), "observasi\n")
## Data testing : 65 observasi
cat("\nDistribusi spesies - Training:\n")
## 
## Distribusi spesies - Training:
print(table(train_data$species))
## 
##    Adelie Chinstrap    Gentoo 
##       117        55        96
cat("\nDistribusi spesies - Testing:\n")
## 
## Distribusi spesies - Testing:
print(table(test_data$species))
## 
##    Adelie Chinstrap    Gentoo 
##        29        13        23

3.3 Perhitungan Manual Matriks B dan W

pred_vars     <- c("culmen_length_mm", "culmen_depth_mm", "flipper_length_mm", "body_mass_g")
X_train       <- as.matrix(train_data[, pred_vars])
species_train <- train_data$species
species_list  <- levels(species_train)
g <- length(species_list)
p <- ncol(X_train)
n <- nrow(X_train)

x_bar <- colMeans(X_train)

group_means <- sapply(species_list, function(sp) {
  colMeans(X_train[species_train == sp, ])
})
colnames(group_means) <- species_list

cat("=== GRAND MEAN ===\n")
## === GRAND MEAN ===
print(round(x_bar, 3))
##  culmen_length_mm   culmen_depth_mm flipper_length_mm       body_mass_g 
##            43.826            17.146           200.806          4193.750
cat("\n=== MEAN PER SPESIES ===\n")
## 
## === MEAN PER SPESIES ===
print(round(group_means, 3))
##                     Adelie Chinstrap   Gentoo
## culmen_length_mm    38.597    48.780   47.360
## culmen_depth_mm     18.316    18.391   15.005
## flipper_length_mm  189.735   195.636  217.260
## body_mass_g       3694.658  3716.818 5075.260
B <- matrix(0, p, p)
for (sp in species_list) {
  ni   <- sum(species_train == sp)
  diff <- group_means[, sp] - x_bar
  B    <- B + ni * (diff %o% diff)
}

W <- matrix(0, p, p)
for (sp in species_list) {
  Xi     <- X_train[species_train == sp, ]
  xi_bar <- group_means[, sp]
  for (j in 1:nrow(Xi)) {
    diff <- Xi[j, ] - xi_bar
    W    <- W + (diff %o% diff)
  }
}

cat("\n=== MATRIKS BETWEEN-GROUPS (B) ===\n")
## 
## === MATRIKS BETWEEN-GROUPS (B) ===
print(round(B, 2))
##                   culmen_length_mm culmen_depth_mm flipper_length_mm
## culmen_length_mm           5747.59        -1103.06          10947.05
## culmen_depth_mm           -1103.06          685.43          -5251.42
## flipper_length_mm         10947.05        -5251.42          41801.91
## body_mass_g              474465.86      -282154.68        2174535.75
##                   body_mass_g
## culmen_length_mm     474465.9
## culmen_depth_mm     -282154.7
## flipper_length_mm   2174535.8
## body_mass_g       116252183.2
cat("\n=== MATRIKS WITHIN-GROUPS (W) ===\n")
## 
## === MATRIKS WITHIN-GROUPS (W) ===
print(round(W, 2))
##                   culmen_length_mm culmen_depth_mm flipper_length_mm
## culmen_length_mm           2369.17          469.56           2329.91
## culmen_depth_mm             469.56          354.49            957.09
## flipper_length_mm          2329.91          957.09          11374.00
## body_mass_g              212352.89        87775.93         475014.25
##                   body_mass_g
## culmen_length_mm    212352.89
## culmen_depth_mm      87775.93
## flipper_length_mm   475014.25
## body_mass_g       57555473.00

3.4 Eigenvector W⁻¹B

W_inv <- solve(W)
WinvB <- W_inv %*% B
eig   <- eigen(WinvB)

cat("=== EIGENVALUES W-1B ===\n")
## === EIGENVALUES W-1B ===
print(round(Re(eig$values), 6))
## [1] 14.253712  2.359715  0.000000  0.000000
eig_vec <- Re(eig$vectors)
rownames(eig_vec) <- pred_vars
colnames(eig_vec) <- paste0("LD", 1:p)

cat("\n=== EIGENVECTORS (Koefisien Fungsi Diskriminan) ===\n")
## 
## === EIGENVECTORS (Koefisien Fungsi Diskriminan) ===
print(round(eig_vec[, 1:2], 6))
##                         LD1       LD2
## culmen_length_mm  -0.090808 -0.995887
## culmen_depth_mm    0.991617 -0.089487
## flipper_length_mm -0.091917  0.013501
## body_mass_g       -0.001230  0.004321
eig_vals     <- Re(eig$values)
eig_vals_pos <- eig_vals[eig_vals > 1e-10]
prop_var     <- eig_vals_pos / sum(eig_vals_pos)

cat("\n=== PROPORSI VARIANSI YANG DIJELASKAN ===\n")
## 
## === PROPORSI VARIANSI YANG DIJELASKAN ===
for (i in seq_along(prop_var)) {
  cat(sprintf("LD%d: %.2f%%\n", i, prop_var[i] * 100))
}
## LD1: 85.80%
## LD2: 14.20%

3.5 Model LDA

lda_model <- lda(species ~ culmen_length_mm + culmen_depth_mm +
                              flipper_length_mm + body_mass_g,
                 data  = train_data,
                 prior = c(1, 1, 1) / 3)

print(lda_model)
## Call:
## lda(species ~ culmen_length_mm + culmen_depth_mm + flipper_length_mm + 
##     body_mass_g, data = train_data, prior = c(1, 1, 1)/3)
## 
## Prior probabilities of groups:
##    Adelie Chinstrap    Gentoo 
## 0.3333333 0.3333333 0.3333333 
## 
## Group means:
##           culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g
## Adelie            38.59744        18.31624          189.7350    3694.658
## Chinstrap         48.78000        18.39091          195.6364    3716.818
## Gentoo            47.36042        15.00521          217.2604    5075.260
## 
## Coefficients of linear discriminants:
##                            LD1          LD2
## culmen_length_mm  -0.048088861 -0.411317854
## culmen_depth_mm    0.977608936  0.063375525
## flipper_length_mm -0.090833392 -0.003771808
## body_mass_g       -0.001386521  0.001621606
## 
## Proportion of trace:
##    LD1    LD2 
## 0.8151 0.1849
lda_coef          <- as.data.frame(lda_model$scaling)
lda_coef$Variabel <- rownames(lda_coef)

kable(lda_coef[, c("Variabel", "LD1", "LD2")],
      row.names = FALSE, digits = 4,
      caption = "Tabel 4. Koefisien Linear Discriminant Functions") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Tabel 4. Koefisien Linear Discriminant Functions
Variabel LD1 LD2
culmen_length_mm -0.0481 -0.4113
culmen_depth_mm 0.9776 0.0634
flipper_length_mm -0.0908 -0.0038
body_mass_g -0.0014 0.0016
prop_trace <- round(lda_model$svd^2 / sum(lda_model$svd^2) * 100, 2)
cat("Proporsi Variansi:\n")
## Proporsi Variansi:
cat(sprintf("  LD1: %.2f%%\n", prop_trace[1]))
##   LD1: 81.51%
cat(sprintf("  LD2: %.2f%%\n", prop_trace[2]))
##   LD2: 18.49%
cat(sprintf("  Total: %.2f%%\n", sum(prop_trace)))
##   Total: 100.00%

3.6 Visualisasi Fungsi Diskriminan

lda_pred_train <- predict(lda_model, train_data)

lda_df <- data.frame(
  LD1     = lda_pred_train$x[, 1],
  LD2     = lda_pred_train$x[, 2],
  Species = train_data$species
)

ggplot(lda_df, aes(x = LD1, y = LD2, color = Species, shape = Species)) +
  geom_point(alpha = 0.75, size = 2.5) +
  stat_ellipse(aes(fill = Species), geom = "polygon", alpha = 0.1, level = 0.9) +
  scale_color_manual(values = c("#E74C3C", "#27AE60", "#2980B9")) +
  scale_fill_manual(values  = c("#E74C3C", "#27AE60", "#2980B9")) +
  labs(title = "Gambar 4. Proyeksi LDA: LD1 vs LD2",
       subtitle = sprintf("LD1: %.1f%% | LD2: %.1f%% variansi terjelaskan",
                          prop_trace[1], prop_trace[2]),
       x = paste0("LD1 (", prop_trace[1], "%)"),
       y = paste0("LD2 (", prop_trace[2], "%)")) +
  theme_bw() +
  theme(legend.position = "bottom")

ggplot(lda_df, aes(x = LD1, fill = Species)) +
  geom_histogram(bins = 25, alpha = 0.6, position = "identity") +
  scale_fill_manual(values = c("#E74C3C", "#27AE60", "#2980B9")) +
  labs(title = "Gambar 5. Distribusi Skor LD1 per Spesies",
       x = "Skor LD1", y = "Frekuensi") +
  theme_bw() +
  theme(legend.position = "bottom")

3.7 Posterior Probability & Classification Rule

pred_test <- predict(lda_model, test_data)

posterior_df           <- as.data.frame(round(pred_test$posterior, 4))
posterior_df$Predicted <- pred_test$class
posterior_df$Actual    <- test_data$species
posterior_df$Correct   <- ifelse(posterior_df$Predicted == posterior_df$Actual, "V", "X")

kable(head(posterior_df, 10),
      caption = "Tabel 5. Posterior Probability (10 Observasi Pertama Test Set)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Tabel 5. Posterior Probability (10 Observasi Pertama Test Set)
Adelie Chinstrap Gentoo Predicted Actual Correct
8 1.0000 0.0000 0 Adelie Adelie V
16 1.0000 0.0000 0 Adelie Adelie V
17 0.9997 0.0003 0 Adelie Adelie V
28 0.9494 0.0506 0 Adelie Adelie V
33 0.9933 0.0067 0 Adelie Adelie V
42 0.9974 0.0026 0 Adelie Adelie V
44 0.9869 0.0131 0 Adelie Adelie V
51 0.9975 0.0025 0 Adelie Adelie V
57 0.9992 0.0008 0 Adelie Adelie V
58 0.9984 0.0016 0 Adelie Adelie V

3.8 Evaluasi Model LDA

3.8.1 Confusion Matrix & APER - Training Set

pred_train_class <- predict(lda_model, train_data)$class
cm_train         <- confusionMatrix(pred_train_class, train_data$species)

cat("=== CONFUSION MATRIX - TRAINING SET ===\n")
## === CONFUSION MATRIX - TRAINING SET ===
print(cm_train$table)
##            Reference
## Prediction  Adelie Chinstrap Gentoo
##   Adelie       115         1      0
##   Chinstrap      2        54      0
##   Gentoo         0         0     96
n_total    <- sum(cm_train$table)
n_wrong    <- n_total - sum(diag(cm_train$table))
aper_train <- n_wrong / n_total

cat(sprintf("\nAPER (Training) = %d / %d = %.4f = %.2f%%\n",
            n_wrong, n_total, aper_train, aper_train * 100))
## 
## APER (Training) = 3 / 268 = 0.0112 = 1.12%
cat(sprintf("Akurasi (Training) = %.2f%%\n",
            cm_train$overall["Accuracy"] * 100))
## Akurasi (Training) = 98.88%

3.8.2 Confusion Matrix & APER - Test Set

pred_test_class <- predict(lda_model, test_data)$class
cm_test         <- confusionMatrix(pred_test_class, test_data$species)

cat("=== CONFUSION MATRIX - TEST SET ===\n")
## === CONFUSION MATRIX - TEST SET ===
print(cm_test$table)
##            Reference
## Prediction  Adelie Chinstrap Gentoo
##   Adelie        29         1      0
##   Chinstrap      0        12      0
##   Gentoo         0         0     23
n_total_t <- sum(cm_test$table)
n_wrong_t <- n_total_t - sum(diag(cm_test$table))
aper_test <- n_wrong_t / n_total_t

cat(sprintf("\nAPER (Testing) = %d / %d = %.4f = %.2f%%\n",
            n_wrong_t, n_total_t, aper_test, aper_test * 100))
## 
## APER (Testing) = 1 / 65 = 0.0154 = 1.54%
cat(sprintf("Akurasi (Testing) = %.2f%%\n",
            cm_test$overall["Accuracy"] * 100))
## Akurasi (Testing) = 98.46%
cm_df        <- as.data.frame(cm_test$table)
names(cm_df) <- c("Predicted", "Actual", "Freq")

ggplot(cm_df, aes(x = Actual, y = Predicted, fill = Freq)) +
  geom_tile(color = "white", linewidth = 1) +
  geom_text(aes(label = Freq), size = 8, fontface = "bold") +
  scale_fill_gradient(low = "#EBF5FB", high = "#1A5276") +
  labs(title = "Gambar 6. Confusion Matrix LDA - Test Set",
       x = "Kelas Aktual", y = "Kelas Prediksi", fill = "Frekuensi") +
  theme_bw() +
  theme(axis.text  = element_text(size = 12),
        axis.title = element_text(size = 12, face = "bold"),
        plot.title = element_text(size = 13, face = "bold"))

metrics_df <- data.frame(
  Spesies     = levels(test_data$species),
  Sensitivity = round(cm_test$byClass[, "Sensitivity"], 4),
  Specificity = round(cm_test$byClass[, "Specificity"], 4),
  Precision   = round(cm_test$byClass[, "Precision"], 4),
  F1_Score    = round(cm_test$byClass[, "F1"], 4)
)

kable(metrics_df, row.names = FALSE,
      caption = "Tabel 6. Metrik Performa LDA per Kelas (Test Set)") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Tabel 6. Metrik Performa LDA per Kelas (Test Set)
Spesies Sensitivity Specificity Precision F1_Score
Adelie 1.0000 0.9722 0.9667 0.9831
Chinstrap 0.9231 1.0000 1.0000 0.9600
Gentoo 1.0000 1.0000 1.0000 1.0000

3.8.3 Leave-One-Out Cross Validation (LOO-CV)

lda_cv  <- lda(species ~ culmen_length_mm + culmen_depth_mm +
                           flipper_length_mm + body_mass_g,
               data = penguins,
               CV   = TRUE)

cm_cv   <- table(Actual = penguins$species, Predicted = lda_cv$class)
aper_cv <- (sum(cm_cv) - sum(diag(cm_cv))) / sum(cm_cv)

cat("=== CONFUSION MATRIX - LOO Cross Validation ===\n")
## === CONFUSION MATRIX - LOO Cross Validation ===
print(cm_cv)
##            Predicted
## Actual      Adelie Chinstrap Gentoo
##   Adelie       144         2      0
##   Chinstrap      3        65      0
##   Gentoo         0         0    119
cat(sprintf("\nAPER (LOO-CV)    = %.4f = %.2f%%\n", aper_cv, aper_cv * 100))
## 
## APER (LOO-CV)    = 0.0150 = 1.50%
cat(sprintf("Akurasi (LOO-CV) = %.2f%%\n", (1 - aper_cv) * 100))
## Akurasi (LOO-CV) = 98.50%

3.9 QDA (Quadratic Discriminant Analysis)

qda_model <- qda(species ~ culmen_length_mm + culmen_depth_mm +
                               flipper_length_mm + body_mass_g,
                 data = train_data)

pred_qda <- predict(qda_model, test_data)
cm_qda   <- confusionMatrix(pred_qda$class, test_data$species)
aper_qda <- (sum(cm_qda$table) - sum(diag(cm_qda$table))) / sum(cm_qda$table)

cat("=== QDA - CONFUSION MATRIX (Test Set) ===\n")
## === QDA - CONFUSION MATRIX (Test Set) ===
print(cm_qda$table)
##            Reference
## Prediction  Adelie Chinstrap Gentoo
##   Adelie        29         1      0
##   Chinstrap      0        12      0
##   Gentoo         0         0     23
cat(sprintf("\nAPER (QDA)    = %.4f = %.2f%%\n", aper_qda, aper_qda * 100))
## 
## APER (QDA)    = 0.0154 = 1.54%
cat(sprintf("Akurasi (QDA) = %.2f%%\n", cm_qda$overall["Accuracy"] * 100))
## Akurasi (QDA) = 98.46%

3.10 Perbandingan LDA vs QDA

comparison_df <- data.frame(
  Metode  = c("LDA (Test)", "LDA (LOO-CV)", "QDA (Test)"),
  APER    = paste0(round(c(aper_test, aper_cv, aper_qda) * 100, 2), "%"),
  Akurasi = paste0(round(c(1 - aper_test, 1 - aper_cv, 1 - aper_qda) * 100, 2), "%")
)

kable(comparison_df, row.names = FALSE,
      caption = "Tabel 7. Perbandingan Performa LDA dan QDA") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Tabel 7. Perbandingan Performa LDA dan QDA
Metode APER Akurasi
LDA (Test) 1.54% 98.46%
LDA (LOO-CV) 1.5% 98.5%
QDA (Test) 1.54% 98.46%

4 Regresi Logistik Multinomial

4.1 Uji Asumsi

4.1.1 Tidak Ada Multikolinieritas (VIF)

lm_proxy <- lm(as.numeric(species) ~ culmen_length_mm + culmen_depth_mm +
                 flipper_length_mm + body_mass_g + island + sex,
               data = train_data)

vif_raw <- car::vif(lm_proxy)

if (is.matrix(vif_raw)) {
  vif_df <- data.frame(
    Variabel   = rownames(vif_raw),
    VIF        = round(vif_raw[, "GVIF^(1/(2*Df))"]^2, 4),
    Keterangan = ifelse(vif_raw[, "GVIF^(1/(2*Df))"]^2 > 10,
                        "Multikolinieritas", "Aman")
  )
} else {
  vif_df <- data.frame(
    Variabel   = names(vif_raw),
    VIF        = round(as.numeric(vif_raw), 4),
    Keterangan = ifelse(as.numeric(vif_raw) > 10, "Multikolinieritas", "Aman")
  )
}

kable(vif_df, row.names = FALSE,
      caption = "Tabel 8. Variance Inflation Factor (VIF)") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(which(vif_df$Keterangan == "Multikolinieritas"), background = "#FFF3CD")
Tabel 8. Variance Inflation Factor (VIF)
Variabel VIF Keterangan
culmen_length_mm 2.2660 Aman
culmen_depth_mm 3.1494 Aman
flipper_length_mm 6.0184 Aman
body_mass_g 6.2560 Aman
island 1.6380 Aman
sex 2.4907 Aman

4.1.2 Deteksi Outlier (Mahalanobis Distance)

X_all     <- as.matrix(penguins[, pred_vars])
mah_dist  <- mahalanobis(X_all, colMeans(X_all), cov(X_all))
threshold <- qchisq(0.999, df = 4)
n_outlier <- sum(mah_dist > threshold)

penguins$mah_dist <- mah_dist

cat(sprintf("Threshold Mahalanobis (chi-sq df=4, alpha=0.001): %.2f\n", threshold))
## Threshold Mahalanobis (chi-sq df=4, alpha=0.001): 18.47
cat(sprintf("Jumlah potensi outlier: %d dari %d observasi (%.1f%%)\n",
            n_outlier, nrow(penguins), n_outlier / nrow(penguins) * 100))
## Jumlah potensi outlier: 1 dari 333 observasi (0.3%)
ggplot(penguins, aes(x = seq_along(mah_dist), y = mah_dist, color = species)) +
  geom_point(alpha = 0.6) +
  geom_hline(yintercept = threshold, linetype = "dashed", color = "red", linewidth = 1) +
  scale_color_manual(values = c("#E74C3C", "#27AE60", "#2980B9")) +
  labs(title = "Gambar 7. Jarak Mahalanobis untuk Deteksi Outlier",
       x = "Indeks Observasi", y = "Jarak Mahalanobis", color = "Spesies") +
  theme_bw() +
  theme(legend.position = "bottom")

4.2 Persiapan Data Multinomial

penguins_mn         <- penguins %>% select(-mah_dist)
penguins_mn$species <- relevel(penguins_mn$species, ref = "Adelie")

set.seed(42)
train_idx_mn <- createDataPartition(penguins_mn$species, p = 0.8, list = FALSE)
train_mn     <- penguins_mn[train_idx_mn, ]
test_mn      <- penguins_mn[-train_idx_mn, ]

cat("Referensi kategori : Adelie\n")
## Referensi kategori : Adelie
cat("Kategori lain      : Chinstrap (g1), Gentoo (g2)\n\n")
## Kategori lain      : Chinstrap (g1), Gentoo (g2)
cat("Data training:", nrow(train_mn), "\n")
## Data training: 268
cat("Data testing :", nrow(test_mn), "\n")
## Data testing : 65

4.3 Model Multinomial

4.3.1 Model Penuh

set.seed(42)
model_penuh <- multinom(
  species ~ culmen_length_mm + culmen_depth_mm + flipper_length_mm +
            body_mass_g + island + sex,
  data  = train_mn,
  trace = FALSE,
  maxit = 500
)

summary(model_penuh)
## Call:
## multinom(formula = species ~ culmen_length_mm + culmen_depth_mm + 
##     flipper_length_mm + body_mass_g + island + sex, data = train_mn, 
##     trace = FALSE, maxit = 500)
## 
## Coefficients:
##           (Intercept) culmen_length_mm culmen_depth_mm flipper_length_mm
## Chinstrap  -139.76675         17.28758       -23.84697        -0.4478519
## Gentoo      -11.03335         10.67415       -21.37892        -0.5511787
##           body_mass_g islandDream islandTorgersen    sexMALE
## Chinstrap -0.03292373    31.86048       -5.249291 -23.287496
## Gentoo     0.01052112   -47.22957      -23.064824  -2.981905
## 
## Std. Errors:
##           (Intercept) culmen_length_mm culmen_depth_mm flipper_length_mm
## Chinstrap    6.522165         16.98786       123.14247          14.61513
## Gentoo       1.421835         39.21404        94.74673          59.71040
##           body_mass_g  islandDream islandTorgersen      sexMALE
## Chinstrap   0.5772008 2.313083e+01    4.868708e-12  0.001169866
## Gentoo      2.7187739 2.935303e-09    2.537818e+00 17.701742622
## 
## Residual Deviance: 0.000190203 
## AIC: 32.00019

4.3.2 Model Parsimoni

set.seed(42)
model_mn <- multinom(
  species ~ culmen_length_mm + culmen_depth_mm + flipper_length_mm +
            body_mass_g + island,
  data  = train_mn,
  trace = FALSE,
  maxit = 500
)

summary(model_mn)
## Call:
## multinom(formula = species ~ culmen_length_mm + culmen_depth_mm + 
##     flipper_length_mm + body_mass_g + island, data = train_mn, 
##     trace = FALSE, maxit = 500)
## 
## Coefficients:
##           (Intercept) culmen_length_mm culmen_depth_mm flipper_length_mm
## Chinstrap -107.728575         21.81565       -28.56980        -0.8293417
## Gentoo      -9.857894         13.87828       -29.35592        -0.4964888
##            body_mass_g islandDream islandTorgersen
## Chinstrap -0.048380887    18.10598       -7.400944
## Gentoo     0.007588159   -40.72572      -21.875173
## 
## Std. Errors:
##           (Intercept) culmen_length_mm culmen_depth_mm flipper_length_mm
## Chinstrap   0.4763064         1.139501        7.215415          6.899134
## Gentoo      0.8876804        63.135870       21.137519        195.277749
##           body_mass_g  islandDream islandTorgersen
## Chinstrap    0.394485 1.438934e+00    0.0002271007
## Gentoo       8.887852 2.059447e-11    0.2892103101
## 
## Residual Deviance: 0.0001977755 
## AIC: 28.0002

4.4 Uji Signifikansi Parameter

4.4.1 Uji Serentak (Likelihood Ratio Test)

model_null <- multinom(species ~ 1, data = train_mn, trace = FALSE)

lrt <- anova(model_null, model_mn, test = "Chisq")
print(lrt)
## Likelihood ratio tests of Multinomial Models
## 
## Response: species
##                                                                           Model
## 1                                                                             1
## 2 culmen_length_mm + culmen_depth_mm + flipper_length_mm + body_mass_g + island
##   Resid. df   Resid. Dev   Test    Df LR stat. Pr(Chi)
## 1       534 5.652588e+02                              
## 2       522 1.977755e-04 1 vs 2    12 565.2586       0
G_stat    <- deviance(model_null) - deviance(model_mn)
df_diff   <- length(coef(model_mn)) - length(coef(model_null))
p_value_G <- pchisq(G_stat, df = df_diff, lower.tail = FALSE)

cat(sprintf("\n=== UJI SERENTAK (LRT) ===\n"))
## 
## === UJI SERENTAK (LRT) ===
cat(sprintf("G2            = %.4f\n", G_stat))
## G2            = 565.2586
cat(sprintf("Derajat bebas = %d\n", df_diff))
## Derajat bebas = 12
cat(sprintf("P-value       = %.6f\n", p_value_G))
## P-value       = 0.000000
cat(sprintf("chi2(df=%d, alpha=0.05) = %.4f\n", df_diff, qchisq(0.95, df_diff)))
## chi2(df=12, alpha=0.05) = 21.0261
cat(sprintf("\nKeputusan : %s H0\n", ifelse(p_value_G < 0.05, "TOLAK", "GAGAL TOLAK")))
## 
## Keputusan : TOLAK H0
cat(sprintf("Kesimpulan: %s\n",
            ifelse(p_value_G < 0.05,
                   "Minimal satu prediktor berpengaruh signifikan terhadap model.",
                   "Tidak ada prediktor yang berpengaruh signifikan.")))
## Kesimpulan: Minimal satu prediktor berpengaruh signifikan terhadap model.

4.4.2 Uji Parsial (Wald Test)

coef_mn  <- summary(model_mn)$coefficients
se_mn    <- summary(model_mn)$standard.errors
z_scores <- coef_mn / se_mn
p_values <- 2 * pnorm(abs(z_scores), lower.tail = FALSE)

cat("=== KOEFISIEN MODEL LOGIT 1 (Chinstrap vs Adelie) ===\n")
## === KOEFISIEN MODEL LOGIT 1 (Chinstrap vs Adelie) ===
result_g1 <- data.frame(
  Variabel = colnames(coef_mn),
  Beta     = round(coef_mn["Chinstrap", ], 4),
  SE       = round(se_mn["Chinstrap", ], 4),
  Wald_Z   = round(z_scores["Chinstrap", ], 4),
  P_value  = round(p_values["Chinstrap", ], 4),
  Sig      = ifelse(p_values["Chinstrap", ] < 0.001, "***",
             ifelse(p_values["Chinstrap", ] < 0.01,  "**",
             ifelse(p_values["Chinstrap", ] < 0.05,  "*", "ns")))
)
print(result_g1)
##                            Variabel      Beta     SE      Wald_Z P_value Sig
## (Intercept)             (Intercept) -107.7286 0.4763   -226.1749  0.0000 ***
## culmen_length_mm   culmen_length_mm   21.8157 1.1395     19.1449  0.0000 ***
## culmen_depth_mm     culmen_depth_mm  -28.5698 7.2154     -3.9596  0.0001 ***
## flipper_length_mm flipper_length_mm   -0.8293 6.8991     -0.1202  0.9043  ns
## body_mass_g             body_mass_g   -0.0484 0.3945     -0.1226  0.9024  ns
## islandDream             islandDream   18.1060 1.4389     12.5829  0.0000 ***
## islandTorgersen     islandTorgersen   -7.4009 0.0002 -32588.8240  0.0000 ***
cat("\n=== KOEFISIEN MODEL LOGIT 2 (Gentoo vs Adelie) ===\n")
## 
## === KOEFISIEN MODEL LOGIT 2 (Gentoo vs Adelie) ===
result_g2 <- data.frame(
  Variabel = colnames(coef_mn),
  Beta     = round(coef_mn["Gentoo", ], 4),
  SE       = round(se_mn["Gentoo", ], 4),
  Wald_Z   = round(z_scores["Gentoo", ], 4),
  P_value  = round(p_values["Gentoo", ], 4),
  Sig      = ifelse(p_values["Gentoo", ] < 0.001, "***",
             ifelse(p_values["Gentoo", ] < 0.01,  "**",
             ifelse(p_values["Gentoo", ] < 0.05,  "*", "ns")))
)
print(result_g2)
##                            Variabel     Beta       SE        Wald_Z P_value Sig
## (Intercept)             (Intercept)  -9.8579   0.8877 -1.110520e+01  0.0000 ***
## culmen_length_mm   culmen_length_mm  13.8783  63.1359  2.198000e-01  0.8260  ns
## culmen_depth_mm     culmen_depth_mm -29.3559  21.1375 -1.388800e+00  0.1649  ns
## flipper_length_mm flipper_length_mm  -0.4965 195.2777 -2.500000e-03  0.9980  ns
## body_mass_g             body_mass_g   0.0076   8.8879  9.000000e-04  0.9993  ns
## islandDream             islandDream -40.7257   0.0000 -1.977508e+12  0.0000 ***
## islandTorgersen     islandTorgersen -21.8752   0.2892 -7.563760e+01  0.0000 ***
cat("\nKeterangan: *** p<0.001  ** p<0.01  * p<0.05  ns = tidak signifikan\n")
## 
## Keterangan: *** p<0.001  ** p<0.01  * p<0.05  ns = tidak signifikan
wald_all <- rbind(
  data.frame(Model = "Logit 1 (Chinstrap vs Adelie)", result_g1),
  data.frame(Model = "Logit 2 (Gentoo vs Adelie)",    result_g2)
)

kable(wald_all, row.names = FALSE,
      caption = "Tabel 9. Hasil Uji Wald (Uji Parsial) Model Multinomial") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), font_size = 11) %>%
  collapse_rows(columns = 1, valign = "middle") %>%
  row_spec(which(wald_all$Sig %in% c("***", "**", "*")), background = "#EBF5FB")
Tabel 9. Hasil Uji Wald (Uji Parsial) Model Multinomial
Model Variabel Beta SE Wald_Z P_value Sig
Logit 1 (Chinstrap vs Adelie) (Intercept) -107.7286 0.4763 -2.261749e+02 0.0000 ***
culmen_length_mm 21.8157 1.1395 1.914490e+01 0.0000 ***
culmen_depth_mm -28.5698 7.2154 -3.959600e+00 0.0001 ***
flipper_length_mm -0.8293 6.8991 -1.202000e-01 0.9043 ns
body_mass_g -0.0484 0.3945 -1.226000e-01 0.9024 ns
islandDream 18.1060 1.4389 1.258290e+01 0.0000 ***
islandTorgersen -7.4009 0.0002 -3.258882e+04 0.0000 ***
Logit 2 (Gentoo vs Adelie) (Intercept) -9.8579 0.8877 -1.110520e+01 0.0000 ***
culmen_length_mm 13.8783 63.1359 2.198000e-01 0.8260 ns
culmen_depth_mm -29.3559 21.1375 -1.388800e+00 0.1649 ns
flipper_length_mm -0.4965 195.2777 -2.500000e-03 0.9980 ns
body_mass_g 0.0076 8.8879 9.000000e-04 0.9993 ns
islandDream -40.7257 0.0000 -1.977508e+12 0.0000 ***
islandTorgersen -21.8752 0.2892 -7.563760e+01 0.0000 ***

4.5 Persamaan Model

cat("=== MODEL REGRESI LOGISTIK MULTINOMIAL ===\n\n")
## === MODEL REGRESI LOGISTIK MULTINOMIAL ===
cat("LOGIT 1: ln[P(Chinstrap|x) / P(Adelie|x)] =\n  ")
## LOGIT 1: ln[P(Chinstrap|x) / P(Adelie|x)] =
## 
cat(sprintf("%.4f", coef_mn["Chinstrap", "(Intercept)"]))
## -107.7286
for (v in colnames(coef_mn)[-1]) {
  cat(sprintf(" %+.4f*%s", coef_mn["Chinstrap", v], v))
}
##  +21.8157*culmen_length_mm -28.5698*culmen_depth_mm -0.8293*flipper_length_mm -0.0484*body_mass_g +18.1060*islandDream -7.4009*islandTorgersen
cat("\n\n")
cat("LOGIT 2: ln[P(Gentoo|x) / P(Adelie|x)] =\n  ")
## LOGIT 2: ln[P(Gentoo|x) / P(Adelie|x)] =
## 
cat(sprintf("%.4f", coef_mn["Gentoo", "(Intercept)"]))
## -9.8579
for (v in colnames(coef_mn)[-1]) {
  cat(sprintf(" %+.4f*%s", coef_mn["Gentoo", v], v))
}
##  +13.8783*culmen_length_mm -29.3559*culmen_depth_mm -0.4965*flipper_length_mm +0.0076*body_mass_g -40.7257*islandDream -21.8752*islandTorgersen
cat("\n")

4.6 Odds Ratio

or_mn    <- exp(coef(model_mn))
ci_lower <- exp(coef(model_mn) - 1.96 * summary(model_mn)$standard.errors)
ci_upper <- exp(coef(model_mn) + 1.96 * summary(model_mn)$standard.errors)

or_g1 <- data.frame(
  Variabel = colnames(or_mn),
  OR       = round(or_mn["Chinstrap", ], 4),
  CI_Lower = round(ci_lower["Chinstrap", ], 4),
  CI_Upper = round(ci_upper["Chinstrap", ], 4),
  P_value  = round(p_values["Chinstrap", ], 4)
)

or_g2 <- data.frame(
  Variabel = colnames(or_mn),
  OR       = round(or_mn["Gentoo", ], 4),
  CI_Lower = round(ci_lower["Gentoo", ], 4),
  CI_Upper = round(ci_upper["Gentoo", ], 4),
  P_value  = round(p_values["Gentoo", ], 4)
)

cat("=== ODDS RATIO - LOGIT 1 (Chinstrap vs Adelie) ===\n")
## === ODDS RATIO - LOGIT 1 (Chinstrap vs Adelie) ===
print(or_g1)
##                            Variabel           OR     CI_Lower     CI_Upper
## (Intercept)             (Intercept) 0.000000e+00 0.000000e+00 0.000000e+00
## culmen_length_mm   culmen_length_mm 2.981388e+09 3.194890e+08 2.782154e+10
## culmen_depth_mm     culmen_depth_mm 0.000000e+00 0.000000e+00 0.000000e+00
## flipper_length_mm flipper_length_mm 4.363000e-01 0.000000e+00 3.254492e+05
## body_mass_g             body_mass_g 9.528000e-01 4.397000e-01 2.064300e+00
## islandDream             islandDream 7.300084e+07 4.349935e+06 1.225104e+09
## islandTorgersen     islandTorgersen 6.000000e-04 6.000000e-04 6.000000e-04
##                   P_value
## (Intercept)        0.0000
## culmen_length_mm   0.0000
## culmen_depth_mm    0.0001
## flipper_length_mm  0.9043
## body_mass_g        0.9024
## islandDream        0.0000
## islandTorgersen    0.0000
cat("\n=== ODDS RATIO - LOGIT 2 (Gentoo vs Adelie) ===\n")
## 
## === ODDS RATIO - LOGIT 2 (Gentoo vs Adelie) ===
print(or_g2)
##                            Variabel           OR CI_Lower      CI_Upper P_value
## (Intercept)             (Intercept)       0.0001        0  3.000000e-04  0.0000
## culmen_length_mm   culmen_length_mm 1064784.1863        0  5.883006e+59  0.8260
## culmen_depth_mm     culmen_depth_mm       0.0000        0  1.751888e+05  0.1649
## flipper_length_mm flipper_length_mm       0.6087        0 1.018952e+166  0.9980
## body_mass_g             body_mass_g       1.0076        0  3.704997e+07  0.9993
## islandDream             islandDream       0.0000        0  0.000000e+00  0.0000
## islandTorgersen     islandTorgersen       0.0000        0  0.000000e+00  0.0000
or_combined <- rbind(
  data.frame(Model = "Chinstrap vs Adelie", or_g1),
  data.frame(Model = "Gentoo vs Adelie",    or_g2)
)

kable(or_combined, row.names = FALSE, digits = 4,
      caption = "Tabel 10. Odds Ratio dengan 95% Confidence Interval") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), font_size = 11) %>%
  collapse_rows(columns = 1, valign = "middle")
Tabel 10. Odds Ratio dengan 95% Confidence Interval
Model Variabel OR CI_Lower CI_Upper P_value
Chinstrap vs Adelie (Intercept) 0.000000e+00 0.000000e+00 0.000000e+00 0.0000
culmen_length_mm 2.981388e+09 3.194890e+08 2.782154e+10 0.0000
culmen_depth_mm 0.000000e+00 0.000000e+00 0.000000e+00 0.0001
flipper_length_mm 4.363000e-01 0.000000e+00 3.254492e+05 0.9043
body_mass_g 9.528000e-01 4.397000e-01 2.064300e+00 0.9024
islandDream 7.300084e+07 4.349935e+06 1.225104e+09 0.0000
islandTorgersen 6.000000e-04 6.000000e-04 6.000000e-04 0.0000
Gentoo vs Adelie (Intercept) 1.000000e-04 0.000000e+00 3.000000e-04 0.0000
culmen_length_mm 1.064784e+06 0.000000e+00 5.883006e+59 0.8260
culmen_depth_mm 0.000000e+00 0.000000e+00 1.751888e+05 0.1649
flipper_length_mm 6.087000e-01 0.000000e+00 1.018952e+166 0.9980
body_mass_g 1.007600e+00 0.000000e+00 3.704997e+07 0.9993
islandDream 0.000000e+00 0.000000e+00 0.000000e+00 0.0000
islandTorgersen 0.000000e+00 0.000000e+00 0.000000e+00 0.0000
cat("=== INTERPRETASI ODDS RATIO (variabel numerik, p < 0.05) ===\n\n")
## === INTERPRETASI ODDS RATIO (variabel numerik, p < 0.05) ===
vars_num <- c("culmen_length_mm", "culmen_depth_mm", "flipper_length_mm", "body_mass_g")

for (v in vars_num) {
  or_val <- or_mn["Chinstrap", v]
  pv     <- p_values["Chinstrap", v]
  if (pv < 0.05) {
    arah <- ifelse(or_val > 1, "meningkat", "menurun")
    cat(sprintf("[Logit 1 - Chinstrap vs Adelie]\n"))
    cat(sprintf("  Setiap kenaikan 1 unit %s, peluang Chinstrap vs Adelie\n", v))
    cat(sprintf("  %s sebesar %.4f kali (OR=%.4f, p=%.4f)\n\n", arah, or_val, or_val, pv))
  }
}
## [Logit 1 - Chinstrap vs Adelie]
##   Setiap kenaikan 1 unit culmen_length_mm, peluang Chinstrap vs Adelie
##   meningkat sebesar 2981388376.5594 kali (OR=2981388376.5594, p=0.0000)
## 
## [Logit 1 - Chinstrap vs Adelie]
##   Setiap kenaikan 1 unit culmen_depth_mm, peluang Chinstrap vs Adelie
##   menurun sebesar 0.0000 kali (OR=0.0000, p=0.0001)
for (v in vars_num) {
  or_val <- or_mn["Gentoo", v]
  pv     <- p_values["Gentoo", v]
  if (pv < 0.05) {
    arah <- ifelse(or_val > 1, "meningkat", "menurun")
    cat(sprintf("[Logit 2 - Gentoo vs Adelie]\n"))
    cat(sprintf("  Setiap kenaikan 1 unit %s, peluang Gentoo vs Adelie\n", v))
    cat(sprintf("  %s sebesar %.4f kali (OR=%.4f, p=%.4f)\n\n", arah, or_val, or_val, pv))
  }
}

4.7 Uji Kesesuaian Model (Goodness of Fit)

ll_null  <- logLik(model_null)
ll_model <- logLik(model_mn)

mcfadden_r2   <- as.numeric(1 - (ll_model / ll_null))
cox_r2        <- as.numeric(1 - exp((2 / nrow(train_mn)) * (ll_null - ll_model)))
nagelkerke_r2 <- cox_r2 / (1 - exp((2 / nrow(train_mn)) * as.numeric(ll_null)))

cat("=== UJI KESESUAIAN MODEL (Goodness of Fit) ===\n\n")
## === UJI KESESUAIAN MODEL (Goodness of Fit) ===
cat(sprintf("Log-likelihood (Null)  : %.4f\n", as.numeric(ll_null)))
## Log-likelihood (Null)  : -282.6294
cat(sprintf("Log-likelihood (Model) : %.4f\n", as.numeric(ll_model)))
## Log-likelihood (Model) : -0.0001
cat(sprintf("AIC                    : %.4f\n", AIC(model_mn)))
## AIC                    : 28.0002
cat(sprintf("BIC                    : %.4f\n", BIC(model_mn)))
## BIC                    : 78.2740
cat(sprintf("\nMcFadden R2    : %.4f (%.2f%%)\n", mcfadden_r2,   mcfadden_r2   * 100))
## 
## McFadden R2    : 1.0000 (100.00%)
cat(sprintf("Cox & Snell R2 : %.4f (%.2f%%)\n", cox_r2,         cox_r2        * 100))
## Cox & Snell R2 : 0.8787 (87.87%)
cat(sprintf("Nagelkerke R2  : %.4f (%.2f%%)\n", nagelkerke_r2,  nagelkerke_r2 * 100))
## Nagelkerke R2  : 1.0000 (100.00%)
cat("=== DEVIANCE TEST ===\n")
## === DEVIANCE TEST ===
cat(sprintf("Null Deviance     : %.4f\n", deviance(model_null)))
## Null Deviance     : 565.2588
cat(sprintf("Residual Deviance : %.4f\n", deviance(model_mn)))
## Residual Deviance : 0.0002
cat(sprintf("LR G2             : %.4f\n", deviance(model_null) - deviance(model_mn)))
## LR G2             : 565.2586
cat(sprintf("P-value           : %.6f\n",
            pchisq(deviance(model_null) - deviance(model_mn),
                   df = model_mn$edf - 2, lower.tail = FALSE)))
## P-value           : 0.000000

4.8 Evaluasi Model Multinomial

pred_mn_prob  <- predict(model_mn, test_mn, type = "probs")
pred_mn_class <- predict(model_mn, test_mn, type = "class")

cm_mn   <- confusionMatrix(factor(pred_mn_class, levels = levels(test_mn$species)),
                            test_mn$species)
aper_mn <- (sum(cm_mn$table) - sum(diag(cm_mn$table))) / sum(cm_mn$table)

cat("=== CONFUSION MATRIX - MULTINOMIAL (Test Set) ===\n")
## === CONFUSION MATRIX - MULTINOMIAL (Test Set) ===
print(cm_mn$table)
##            Reference
## Prediction  Adelie Chinstrap Gentoo
##   Adelie        29         0      0
##   Chinstrap      0        13      0
##   Gentoo         0         0     23
cat(sprintf("\nAkurasi : %.2f%%\n", cm_mn$overall["Accuracy"] * 100))
## 
## Akurasi : 100.00%
cat(sprintf("APER    : %.4f = %.2f%%\n", aper_mn, aper_mn * 100))
## APER    : 0.0000 = 0.00%
cm_mn_df        <- as.data.frame(cm_mn$table)
names(cm_mn_df) <- c("Predicted", "Actual", "Freq")

ggplot(cm_mn_df, aes(x = Actual, y = Predicted, fill = Freq)) +
  geom_tile(color = "white", linewidth = 1) +
  geom_text(aes(label = Freq), size = 8, fontface = "bold") +
  scale_fill_gradient(low = "#FDFEFE", high = "#1E8449") +
  labs(title = "Gambar 8. Confusion Matrix Multinomial Logistik - Test Set",
       x = "Kelas Aktual", y = "Kelas Prediksi", fill = "Frekuensi") +
  theme_bw() +
  theme(axis.text  = element_text(size = 12),
        axis.title = element_text(size = 12, face = "bold"),
        plot.title = element_text(size = 13, face = "bold"))

prob_df           <- as.data.frame(round(pred_mn_prob, 4))
prob_df$Predicted <- pred_mn_class
prob_df$Actual    <- test_mn$species
prob_df$Correct   <- ifelse(prob_df$Predicted == prob_df$Actual, "V", "X")

kable(head(prob_df, 10), row.names = FALSE,
      caption = "Tabel 11. Peluang Prediksi per Kelas - 10 Observasi Pertama (Test Set)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Tabel 11. Peluang Prediksi per Kelas - 10 Observasi Pertama (Test Set)
Adelie Chinstrap Gentoo Predicted Actual Correct
1 0 0 Adelie Adelie V
1 0 0 Adelie Adelie V
1 0 0 Adelie Adelie V
1 0 0 Adelie Adelie V
1 0 0 Adelie Adelie V
1 0 0 Adelie Adelie V
1 0 0 Adelie Adelie V
1 0 0 Adelie Adelie V
1 0 0 Adelie Adelie V
1 0 0 Adelie Adelie V
prob_long <- as.data.frame(pred_mn_prob) %>%
  mutate(Actual = test_mn$species) %>%
  pivot_longer(c(Adelie, Chinstrap, Gentoo),
               names_to  = "Pred_Species",
               values_to = "Probability")

ggplot(prob_long, aes(x = Pred_Species, y = Probability, fill = Pred_Species)) +
  geom_boxplot(alpha = 0.7) +
  facet_wrap(~Actual, labeller = label_both) +
  scale_fill_manual(values = c("#E74C3C", "#27AE60", "#2980B9")) +
  labs(title = "Gambar 9. Distribusi Probabilitas Prediksi per Kelas Aktual",
       x = "Spesies Prediksi", y = "Probabilitas") +
  theme_bw() +
  theme(legend.position = "none",
        strip.background = element_rect(fill = "#2C3E50"),
        strip.text = element_text(color = "white", face = "bold"))

5 Perbandingan LDA vs Regresi Logistik Multinomial

final_comparison <- data.frame(
  Aspek = c("Metode Estimasi", "Asumsi Distribusi", "Tipe Prediktor",
            "Interpretasi", "APER Test Set", "Akurasi Test Set", "AIC"),
  LDA = c(
    "Matriks B dan W, eigenvector W-1B",
    "Multivariat Normal, kovarians homogen",
    "Hanya numerik (interval/rasio)",
    "Skor LD, cutting score, posterior prob",
    sprintf("%.2f%%", aper_test * 100),
    sprintf("%.2f%%", (1 - aper_test) * 100),
    "-"
  ),
  Multinomial_Logistik = c(
    "Maximum Likelihood (Newton-Raphson)",
    "Tidak diasumsikan",
    "Numerik & kategorik",
    "Koefisien logit, OR, posterior prob",
    sprintf("%.2f%%", aper_mn * 100),
    sprintf("%.2f%%", (1 - aper_mn) * 100),
    sprintf("%.2f", AIC(model_mn))
  )
)

kable(final_comparison, row.names = FALSE,
      col.names = c("Aspek", "LDA", "Multinomial Logistik"),
      caption = "Tabel 12. Perbandingan LDA vs Regresi Logistik Multinomial") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
  column_spec(1, bold = TRUE)
Tabel 12. Perbandingan LDA vs Regresi Logistik Multinomial
Aspek LDA Multinomial Logistik
Metode Estimasi Matriks B dan W, eigenvector W-1B Maximum Likelihood (Newton-Raphson)
Asumsi Distribusi Multivariat Normal, kovarians homogen Tidak diasumsikan
Tipe Prediktor Hanya numerik (interval/rasio) Numerik & kategorik
Interpretasi Skor LD, cutting score, posterior prob Koefisien logit, OR, posterior prob
APER Test Set 1.54% 0.00%
Akurasi Test Set 98.46% 100.00%
AIC
28.00