1 Persiapan

1.1 Load Package

# install.packages(c("MASS", "nnet", "ggplot2", "dplyr", "caret",
#                    "knitr", "kableExtra", "corrplot", "tidyr", "car"))

library(MASS)
library(nnet)
library(ggplot2)
library(dplyr)
library(caret)
library(knitr)
library(kableExtra)
library(corrplot)
library(tidyr)
library(car)

1.2 Load & Preprocessing Data

penguins_raw <- read.csv("penguins_size.csv", stringsAsFactors = FALSE)
str(penguins_raw)

## 'data.frame':    344 obs. of  7 variables:
##  $ species          : chr  "Adelie" "Adelie" "Adelie" "Adelie" ...
##  $ island           : chr  "Torgersen" "Torgersen" "Torgersen" "Torgersen" ...
##  $ culmen_length_mm : num  39.1 39.5 40.3 NA 36.7 39.3 38.9 39.2 34.1 42 ...
##  $ culmen_depth_mm  : num  18.7 17.4 18 NA 19.3 20.6 17.8 19.6 18.1 20.2 ...
##  $ flipper_length_mm: int  181 186 195 NA 193 190 181 195 193 190 ...
##  $ body_mass_g      : int  3750 3800 3250 NA 3450 3650 3625 4675 3475 4250 ...
##  $ sex              : chr  "MALE" "FEMALE" "FEMALE" NA ...

penguins <- penguins_raw %>%
  mutate(across(everything(), ~ifelse(. %in% c("NA", ".", ""), NA, .))) %>%
  mutate(
    culmen_length_mm  = as.numeric(culmen_length_mm),
    culmen_depth_mm   = as.numeric(culmen_depth_mm),
    flipper_length_mm = as.numeric(flipper_length_mm),
    body_mass_g       = as.numeric(body_mass_g),
    species = as.factor(species),
    island  = as.factor(island),
    sex     = as.factor(toupper(sex))
  ) %>%
  na.omit()

cat("Dimensi data setelah cleaning:", nrow(penguins), "x", ncol(penguins), "\n")

## Dimensi data setelah cleaning: 333 x 7

cat("Distribusi spesies:\n")

## Distribusi spesies:

print(table(penguins$species))

## 
##    Adelie Chinstrap    Gentoo 
##       146        68       119

2 Statistika Deskriptif

desc_stats <- penguins %>%
  group_by(species) %>%
  summarise(
    n               = n(),
    culmen_len_mean = round(mean(culmen_length_mm), 2),
    culmen_dep_mean = round(mean(culmen_depth_mm), 2),
    flipper_mean    = round(mean(flipper_length_mm), 2),
    mass_mean       = round(mean(body_mass_g), 1)
  )

kable(desc_stats,
      col.names = c("Spesies", "n", "Culmen Length (mm)",
                    "Culmen Depth (mm)", "Flipper Length (mm)", "Body Mass (g)"),
      caption = "Tabel 1. Rata-rata Variabel Prediktor per Spesies") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)

Tabel 1. Rata-rata Variabel Prediktor per Spesies
Spesies	n	Culmen Length (mm)	Culmen Depth (mm)	Flipper Length (mm)	Body Mass (g)
Adelie	146	38.82	18.35	190.10	3706.2
Chinstrap	68	48.83	18.42	195.82	3733.1
Gentoo	119	47.57	15.00	217.24	5092.4

penguins_long <- penguins %>%
  select(species, culmen_length_mm, culmen_depth_mm, flipper_length_mm, body_mass_g) %>%
  pivot_longer(-species, names_to = "variable", values_to = "value") %>%
  mutate(variable = case_when(
    variable == "culmen_length_mm"  ~ "Culmen Length (mm)",
    variable == "culmen_depth_mm"   ~ "Culmen Depth (mm)",
    variable == "flipper_length_mm" ~ "Flipper Length (mm)",
    variable == "body_mass_g"       ~ "Body Mass (g)",
    TRUE ~ variable
  ))

ggplot(penguins_long, aes(x = species, y = value, fill = species)) +
  geom_boxplot(alpha = 0.7, outlier.shape = 21) +
  facet_wrap(~variable, scales = "free_y", ncol = 2) +
  scale_fill_manual(values = c("#FF6B6B", "#4ECDC4", "#45B7D1")) +
  labs(title = "Gambar 1. Distribusi Variabel Prediktor per Spesies",
       x = "Spesies", y = "Nilai", fill = "Spesies") +
  theme_bw() +
  theme(legend.position = "bottom",
        strip.background = element_rect(fill = "#2C3E50"),
        strip.text = element_text(color = "white", face = "bold"))

ggplot(penguins, aes(x = culmen_length_mm, y = flipper_length_mm,
                     color = species, shape = species)) +
  geom_point(alpha = 0.7, size = 2.5) +
  scale_color_manual(values = c("#E74C3C", "#27AE60", "#2980B9")) +
  labs(title = "Gambar 2. Scatter Plot Culmen Length vs Flipper Length",
       subtitle = "Berdasarkan Spesies Penguin",
       x = "Culmen Length (mm)", y = "Flipper Length (mm)",
       color = "Spesies", shape = "Spesies") +
  theme_bw() +
  theme(legend.position = "bottom")

cor_matrix <- cor(penguins %>%
                    select(culmen_length_mm, culmen_depth_mm,
                           flipper_length_mm, body_mass_g))

corrplot(cor_matrix, method = "color", type = "upper",
         addCoef.col = "black", number.cex = 0.8,
         tl.col = "black", tl.srt = 45,
         col = colorRampPalette(c("#D73027", "white", "#4575B4"))(200),
         title = "Gambar 3. Matriks Korelasi Variabel Prediktor",
         mar = c(0, 0, 2, 0))

3 Analisis Diskriminan

3.1 Uji Asumsi

3.1.1 Uji Normalitas (Shapiro-Wilk)

vars       <- c("culmen_length_mm", "culmen_depth_mm", "flipper_length_mm", "body_mass_g")
var_labels <- c("Culmen Length", "Culmen Depth", "Flipper Length", "Body Mass")

normalitas_result <- data.frame()
for (sp in levels(penguins$species)) {
  for (i in seq_along(vars)) {
    x  <- penguins %>% filter(species == sp) %>% pull(vars[i])
    sw <- shapiro.test(x)
    normalitas_result <- rbind(normalitas_result, data.frame(
      Spesies    = sp,
      Variabel   = var_labels[i],
      W          = round(sw$statistic, 4),
      p_value    = round(sw$p.value, 4),
      Kesimpulan = ifelse(sw$p.value > 0.05, "Normal", "Tidak Normal")
    ))
  }
}

kable(normalitas_result, row.names = FALSE,
      caption = "Tabel 2. Uji Shapiro-Wilk Normalitas per Variabel per Spesies") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
  row_spec(which(normalitas_result$Kesimpulan == "Tidak Normal"),
           background = "#FFF3CD")

Tabel 2. Uji Shapiro-Wilk Normalitas per Variabel per Spesies
Spesies	Variabel	W	p_value	Kesimpulan
Adelie	Culmen Length	0.9929	0.6848	Normal
Adelie	Culmen Depth	0.9847	0.1047	Normal
Adelie	Flipper Length	0.9934	0.7427	Normal
Adelie	Body Mass	0.9812	0.0423	Tidak Normal
Chinstrap	Culmen Length	0.9752	0.1941	Normal
Chinstrap	Culmen Depth	0.9727	0.1418	Normal
Chinstrap	Flipper Length	0.9889	0.8106	Normal
Chinstrap	Body Mass	0.9845	0.5605	Normal
Gentoo	Culmen Length	0.9738	0.0199	Tidak Normal
Gentoo	Culmen Depth	0.9776	0.0438	Tidak Normal
Gentoo	Flipper Length	0.9615	0.0018	Tidak Normal
Gentoo	Body Mass	0.9861	0.2605	Normal

3.1.2 Uji Homogenitas Varians (Levene)

levene_results <- data.frame()
for (i in seq_along(vars)) {
  lev <- car::leveneTest(penguins[[vars[i]]] ~ penguins$species)
  levene_results <- rbind(levene_results, data.frame(
    Variabel   = var_labels[i],
    F_stat     = round(lev$`F value`[1], 4),
    p_value    = round(lev$`Pr(>F)`[1], 4),
    Kesimpulan = ifelse(lev$`Pr(>F)`[1] > 0.05, "Homogen", "Tidak Homogen")
  ))
}

kable(levene_results, row.names = FALSE,
      caption = "Tabel 3. Uji Levene Homogenitas Varians per Variabel") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
  row_spec(which(levene_results$Kesimpulan == "Tidak Homogen"),
           background = "#FFF3CD")

Tabel 3. Uji Levene Homogenitas Varians per Variabel
Variabel	F_stat	p_value	Kesimpulan
Culmen Length	2.2855	0.1033	Homogen
Culmen Depth	1.9124	0.1494	Homogen
Flipper Length	0.4428	0.6426	Homogen
Body Mass	5.1349	0.0064	Tidak Homogen

3.2 Pembagian Data Training dan Testing

set.seed(42)
train_idx  <- createDataPartition(penguins$species, p = 0.8, list = FALSE)
train_data <- penguins[train_idx, ]
test_data  <- penguins[-train_idx, ]

cat("Data training:", nrow(train_data), "observasi\n")

## Data training: 268 observasi

cat("Data testing :", nrow(test_data), "observasi\n")

## Data testing : 65 observasi

cat("\nDistribusi spesies - Training:\n")

## 
## Distribusi spesies - Training:

print(table(train_data$species))

## 
##    Adelie Chinstrap    Gentoo 
##       117        55        96

cat("\nDistribusi spesies - Testing:\n")

## 
## Distribusi spesies - Testing:

print(table(test_data$species))

## 
##    Adelie Chinstrap    Gentoo 
##        29        13        23

3.3 Perhitungan Manual Matriks B dan W

pred_vars     <- c("culmen_length_mm", "culmen_depth_mm", "flipper_length_mm", "body_mass_g")
X_train       <- as.matrix(train_data[, pred_vars])
species_train <- train_data$species
species_list  <- levels(species_train)
g <- length(species_list)
p <- ncol(X_train)
n <- nrow(X_train)

x_bar <- colMeans(X_train)

group_means <- sapply(species_list, function(sp) {
  colMeans(X_train[species_train == sp, ])
})
colnames(group_means) <- species_list

cat("=== GRAND MEAN ===\n")

## === GRAND MEAN ===

print(round(x_bar, 3))

##  culmen_length_mm   culmen_depth_mm flipper_length_mm       body_mass_g 
##            43.826            17.146           200.806          4193.750

cat("\n=== MEAN PER SPESIES ===\n")

## 
## === MEAN PER SPESIES ===

print(round(group_means, 3))

##                     Adelie Chinstrap   Gentoo
## culmen_length_mm    38.597    48.780   47.360
## culmen_depth_mm     18.316    18.391   15.005
## flipper_length_mm  189.735   195.636  217.260
## body_mass_g       3694.658  3716.818 5075.260

B <- matrix(0, p, p)
for (sp in species_list) {
  ni   <- sum(species_train == sp)
  diff <- group_means[, sp] - x_bar
  B    <- B + ni * (diff %o% diff)
}

W <- matrix(0, p, p)
for (sp in species_list) {
  Xi     <- X_train[species_train == sp, ]
  xi_bar <- group_means[, sp]
  for (j in 1:nrow(Xi)) {
    diff <- Xi[j, ] - xi_bar
    W    <- W + (diff %o% diff)
  }
}

cat("\n=== MATRIKS BETWEEN-GROUPS (B) ===\n")

## 
## === MATRIKS BETWEEN-GROUPS (B) ===

print(round(B, 2))

##                   culmen_length_mm culmen_depth_mm flipper_length_mm
## culmen_length_mm           5747.59        -1103.06          10947.05
## culmen_depth_mm           -1103.06          685.43          -5251.42
## flipper_length_mm         10947.05        -5251.42          41801.91
## body_mass_g              474465.86      -282154.68        2174535.75
##                   body_mass_g
## culmen_length_mm     474465.9
## culmen_depth_mm     -282154.7
## flipper_length_mm   2174535.8
## body_mass_g       116252183.2

cat("\n=== MATRIKS WITHIN-GROUPS (W) ===\n")

## 
## === MATRIKS WITHIN-GROUPS (W) ===

print(round(W, 2))

##                   culmen_length_mm culmen_depth_mm flipper_length_mm
## culmen_length_mm           2369.17          469.56           2329.91
## culmen_depth_mm             469.56          354.49            957.09
## flipper_length_mm          2329.91          957.09          11374.00
## body_mass_g              212352.89        87775.93         475014.25
##                   body_mass_g
## culmen_length_mm    212352.89
## culmen_depth_mm      87775.93
## flipper_length_mm   475014.25
## body_mass_g       57555473.00

3.4 Eigenvector W⁻¹B

W_inv <- solve(W)
WinvB <- W_inv %*% B
eig   <- eigen(WinvB)

cat("=== EIGENVALUES W-1B ===\n")

## === EIGENVALUES W-1B ===

print(round(Re(eig$values), 6))

## [1] 14.253712  2.359715  0.000000  0.000000

eig_vec <- Re(eig$vectors)
rownames(eig_vec) <- pred_vars
colnames(eig_vec) <- paste0("LD", 1:p)

cat("\n=== EIGENVECTORS (Koefisien Fungsi Diskriminan) ===\n")

## 
## === EIGENVECTORS (Koefisien Fungsi Diskriminan) ===

print(round(eig_vec[, 1:2], 6))

##                         LD1       LD2
## culmen_length_mm  -0.090808 -0.995887
## culmen_depth_mm    0.991617 -0.089487
## flipper_length_mm -0.091917  0.013501
## body_mass_g       -0.001230  0.004321

eig_vals     <- Re(eig$values)
eig_vals_pos <- eig_vals[eig_vals > 1e-10]
prop_var     <- eig_vals_pos / sum(eig_vals_pos)

cat("\n=== PROPORSI VARIANSI YANG DIJELASKAN ===\n")

## 
## === PROPORSI VARIANSI YANG DIJELASKAN ===

for (i in seq_along(prop_var)) {
  cat(sprintf("LD%d: %.2f%%\n", i, prop_var[i] * 100))
}

## LD1: 85.80%
## LD2: 14.20%

3.5 Model LDA

lda_model <- lda(species ~ culmen_length_mm + culmen_depth_mm +
                              flipper_length_mm + body_mass_g,
                 data  = train_data,
                 prior = c(1, 1, 1) / 3)

print(lda_model)

## Call:
## lda(species ~ culmen_length_mm + culmen_depth_mm + flipper_length_mm + 
##     body_mass_g, data = train_data, prior = c(1, 1, 1)/3)
## 
## Prior probabilities of groups:
##    Adelie Chinstrap    Gentoo 
## 0.3333333 0.3333333 0.3333333 
## 
## Group means:
##           culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g
## Adelie            38.59744        18.31624          189.7350    3694.658
## Chinstrap         48.78000        18.39091          195.6364    3716.818
## Gentoo            47.36042        15.00521          217.2604    5075.260
## 
## Coefficients of linear discriminants:
##                            LD1          LD2
## culmen_length_mm  -0.048088861 -0.411317854
## culmen_depth_mm    0.977608936  0.063375525
## flipper_length_mm -0.090833392 -0.003771808
## body_mass_g       -0.001386521  0.001621606
## 
## Proportion of trace:
##    LD1    LD2 
## 0.8151 0.1849

lda_coef          <- as.data.frame(lda_model$scaling)
lda_coef$Variabel <- rownames(lda_coef)

kable(lda_coef[, c("Variabel", "LD1", "LD2")],
      row.names = FALSE, digits = 4,
      caption = "Tabel 4. Koefisien Linear Discriminant Functions") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Tabel 4. Koefisien Linear Discriminant Functions
Variabel	LD1	LD2
culmen_length_mm	-0.0481	-0.4113
culmen_depth_mm	0.9776	0.0634
flipper_length_mm	-0.0908	-0.0038
body_mass_g	-0.0014	0.0016

prop_trace <- round(lda_model$svd^2 / sum(lda_model$svd^2) * 100, 2)
cat("Proporsi Variansi:\n")

## Proporsi Variansi:

cat(sprintf("  LD1: %.2f%%\n", prop_trace[1]))

##   LD1: 81.51%

cat(sprintf("  LD2: %.2f%%\n", prop_trace[2]))

##   LD2: 18.49%

cat(sprintf("  Total: %.2f%%\n", sum(prop_trace)))

##   Total: 100.00%

3.6 Visualisasi Fungsi Diskriminan

lda_pred_train <- predict(lda_model, train_data)

lda_df <- data.frame(
  LD1     = lda_pred_train$x[, 1],
  LD2     = lda_pred_train$x[, 2],
  Species = train_data$species
)

ggplot(lda_df, aes(x = LD1, y = LD2, color = Species, shape = Species)) +
  geom_point(alpha = 0.75, size = 2.5) +
  stat_ellipse(aes(fill = Species), geom = "polygon", alpha = 0.1, level = 0.9) +
  scale_color_manual(values = c("#E74C3C", "#27AE60", "#2980B9")) +
  scale_fill_manual(values  = c("#E74C3C", "#27AE60", "#2980B9")) +
  labs(title = "Gambar 4. Proyeksi LDA: LD1 vs LD2",
       subtitle = sprintf("LD1: %.1f%% | LD2: %.1f%% variansi terjelaskan",
                          prop_trace[1], prop_trace[2]),
       x = paste0("LD1 (", prop_trace[1], "%)"),
       y = paste0("LD2 (", prop_trace[2], "%)")) +
  theme_bw() +
  theme(legend.position = "bottom")

ggplot(lda_df, aes(x = LD1, fill = Species)) +
  geom_histogram(bins = 25, alpha = 0.6, position = "identity") +
  scale_fill_manual(values = c("#E74C3C", "#27AE60", "#2980B9")) +
  labs(title = "Gambar 5. Distribusi Skor LD1 per Spesies",
       x = "Skor LD1", y = "Frekuensi") +
  theme_bw() +
  theme(legend.position = "bottom")

3.7 Posterior Probability & Classification Rule

pred_test <- predict(lda_model, test_data)

posterior_df           <- as.data.frame(round(pred_test$posterior, 4))
posterior_df$Predicted <- pred_test$class
posterior_df$Actual    <- test_data$species
posterior_df$Correct   <- ifelse(posterior_df$Predicted == posterior_df$Actual, "V", "X")

kable(head(posterior_df, 10),
      caption = "Tabel 5. Posterior Probability (10 Observasi Pertama Test Set)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))

Tabel 5. Posterior Probability (10 Observasi Pertama Test Set)
	Adelie	Chinstrap	Predicted	Actual	Correct
8	1.0000	0.0000	Adelie	Adelie	V
16	1.0000	0.0000	Adelie	Adelie	V
17	0.9997	0.0003	Adelie	Adelie	V
28	0.9494	0.0506	Adelie	Adelie	V
33	0.9933	0.0067	Adelie	Adelie	V
42	0.9974	0.0026	Adelie	Adelie	V
44	0.9869	0.0131	Adelie	Adelie	V
51	0.9975	0.0025	Adelie	Adelie	V
57	0.9992	0.0008	Adelie	Adelie	V
58	0.9984	0.0016	Adelie	Adelie	V

3.8 Evaluasi Model LDA

3.8.1 Confusion Matrix & APER - Training Set

pred_train_class <- predict(lda_model, train_data)$class
cm_train         <- confusionMatrix(pred_train_class, train_data$species)

cat("=== CONFUSION MATRIX - TRAINING SET ===\n")

## === CONFUSION MATRIX - TRAINING SET ===

print(cm_train$table)

##            Reference
## Prediction  Adelie Chinstrap Gentoo
##   Adelie       115         1      0
##   Chinstrap      2        54      0
##   Gentoo         0         0     96

n_total    <- sum(cm_train$table)
n_wrong    <- n_total - sum(diag(cm_train$table))
aper_train <- n_wrong / n_total

cat(sprintf("\nAPER (Training) = %d / %d = %.4f = %.2f%%\n",
            n_wrong, n_total, aper_train, aper_train * 100))

## 
## APER (Training) = 3 / 268 = 0.0112 = 1.12%

cat(sprintf("Akurasi (Training) = %.2f%%\n",
            cm_train$overall["Accuracy"] * 100))

## Akurasi (Training) = 98.88%

3.8.2 Confusion Matrix & APER - Test Set

pred_test_class <- predict(lda_model, test_data)$class
cm_test         <- confusionMatrix(pred_test_class, test_data$species)

cat("=== CONFUSION MATRIX - TEST SET ===\n")

## === CONFUSION MATRIX - TEST SET ===

print(cm_test$table)

##            Reference
## Prediction  Adelie Chinstrap Gentoo
##   Adelie        29         1      0
##   Chinstrap      0        12      0
##   Gentoo         0         0     23

n_total_t <- sum(cm_test$table)
n_wrong_t <- n_total_t - sum(diag(cm_test$table))
aper_test <- n_wrong_t / n_total_t

cat(sprintf("\nAPER (Testing) = %d / %d = %.4f = %.2f%%\n",
            n_wrong_t, n_total_t, aper_test, aper_test * 100))

## 
## APER (Testing) = 1 / 65 = 0.0154 = 1.54%

cat(sprintf("Akurasi (Testing) = %.2f%%\n",
            cm_test$overall["Accuracy"] * 100))

## Akurasi (Testing) = 98.46%

cm_df        <- as.data.frame(cm_test$table)
names(cm_df) <- c("Predicted", "Actual", "Freq")

ggplot(cm_df, aes(x = Actual, y = Predicted, fill = Freq)) +
  geom_tile(color = "white", linewidth = 1) +
  geom_text(aes(label = Freq), size = 8, fontface = "bold") +
  scale_fill_gradient(low = "#EBF5FB", high = "#1A5276") +
  labs(title = "Gambar 6. Confusion Matrix LDA - Test Set",
       x = "Kelas Aktual", y = "Kelas Prediksi", fill = "Frekuensi") +
  theme_bw() +
  theme(axis.text  = element_text(size = 12),
        axis.title = element_text(size = 12, face = "bold"),
        plot.title = element_text(size = 13, face = "bold"))

metrics_df <- data.frame(
  Spesies     = levels(test_data$species),
  Sensitivity = round(cm_test$byClass[, "Sensitivity"], 4),
  Specificity = round(cm_test$byClass[, "Specificity"], 4),
  Precision   = round(cm_test$byClass[, "Precision"], 4),
  F1_Score    = round(cm_test$byClass[, "F1"], 4)
)

kable(metrics_df, row.names = FALSE,
      caption = "Tabel 6. Metrik Performa LDA per Kelas (Test Set)") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Tabel 6. Metrik Performa LDA per Kelas (Test Set)
Spesies	Sensitivity	Specificity	Precision	F1_Score
Adelie	1.0000	0.9722	0.9667	0.9831
Chinstrap	0.9231	1.0000	1.0000	0.9600
Gentoo	1.0000	1.0000	1.0000	1.0000

3.8.3 Leave-One-Out Cross Validation (LOO-CV)

lda_cv  <- lda(species ~ culmen_length_mm + culmen_depth_mm +
                           flipper_length_mm + body_mass_g,
               data = penguins,
               CV   = TRUE)

cm_cv   <- table(Actual = penguins$species, Predicted = lda_cv$class)
aper_cv <- (sum(cm_cv) - sum(diag(cm_cv))) / sum(cm_cv)

cat("=== CONFUSION MATRIX - LOO Cross Validation ===\n")

## === CONFUSION MATRIX - LOO Cross Validation ===

print(cm_cv)

##            Predicted
## Actual      Adelie Chinstrap Gentoo
##   Adelie       144         2      0
##   Chinstrap      3        65      0
##   Gentoo         0         0    119

cat(sprintf("\nAPER (LOO-CV)    = %.4f = %.2f%%\n", aper_cv, aper_cv * 100))

## 
## APER (LOO-CV)    = 0.0150 = 1.50%

cat(sprintf("Akurasi (LOO-CV) = %.2f%%\n", (1 - aper_cv) * 100))

## Akurasi (LOO-CV) = 98.50%

3.9 QDA (Quadratic Discriminant Analysis)

qda_model <- qda(species ~ culmen_length_mm + culmen_depth_mm +
                               flipper_length_mm + body_mass_g,
                 data = train_data)

pred_qda <- predict(qda_model, test_data)
cm_qda   <- confusionMatrix(pred_qda$class, test_data$species)
aper_qda <- (sum(cm_qda$table) - sum(diag(cm_qda$table))) / sum(cm_qda$table)

cat("=== QDA - CONFUSION MATRIX (Test Set) ===\n")

## === QDA - CONFUSION MATRIX (Test Set) ===

print(cm_qda$table)

##            Reference
## Prediction  Adelie Chinstrap Gentoo
##   Adelie        29         1      0
##   Chinstrap      0        12      0
##   Gentoo         0         0     23

cat(sprintf("\nAPER (QDA)    = %.4f = %.2f%%\n", aper_qda, aper_qda * 100))

## 
## APER (QDA)    = 0.0154 = 1.54%

cat(sprintf("Akurasi (QDA) = %.2f%%\n", cm_qda$overall["Accuracy"] * 100))

## Akurasi (QDA) = 98.46%

3.10 Perbandingan LDA vs QDA

comparison_df <- data.frame(
  Metode  = c("LDA (Test)", "LDA (LOO-CV)", "QDA (Test)"),
  APER    = paste0(round(c(aper_test, aper_cv, aper_qda) * 100, 2), "%"),
  Akurasi = paste0(round(c(1 - aper_test, 1 - aper_cv, 1 - aper_qda) * 100, 2), "%")
)

kable(comparison_df, row.names = FALSE,
      caption = "Tabel 7. Perbandingan Performa LDA dan QDA") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Tabel 7. Perbandingan Performa LDA dan QDA
Metode	APER	Akurasi
LDA (Test)	1.54%	98.46%
LDA (LOO-CV)	1.5%	98.5%
QDA (Test)	1.54%	98.46%

4 Regresi Logistik Multinomial

4.1 Uji Asumsi

4.1.1 Tidak Ada Multikolinieritas (VIF)

lm_proxy <- lm(as.numeric(species) ~ culmen_length_mm + culmen_depth_mm +
                 flipper_length_mm + body_mass_g + island + sex,
               data = train_data)

vif_raw <- car::vif(lm_proxy)

if (is.matrix(vif_raw)) {
  vif_df <- data.frame(
    Variabel   = rownames(vif_raw),
    VIF        = round(vif_raw[, "GVIF^(1/(2*Df))"]^2, 4),
    Keterangan = ifelse(vif_raw[, "GVIF^(1/(2*Df))"]^2 > 10,
                        "Multikolinieritas", "Aman")
  )
} else {
  vif_df <- data.frame(
    Variabel   = names(vif_raw),
    VIF        = round(as.numeric(vif_raw), 4),
    Keterangan = ifelse(as.numeric(vif_raw) > 10, "Multikolinieritas", "Aman")
  )
}

kable(vif_df, row.names = FALSE,
      caption = "Tabel 8. Variance Inflation Factor (VIF)") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(which(vif_df$Keterangan == "Multikolinieritas"), background = "#FFF3CD")

Tabel 8. Variance Inflation Factor (VIF)
Variabel	VIF	Keterangan
culmen_length_mm	2.2660	Aman
culmen_depth_mm	3.1494	Aman
flipper_length_mm	6.0184	Aman
body_mass_g	6.2560	Aman
island	1.6380	Aman
sex	2.4907	Aman

4.1.2 Deteksi Outlier (Mahalanobis Distance)

X_all     <- as.matrix(penguins[, pred_vars])
mah_dist  <- mahalanobis(X_all, colMeans(X_all), cov(X_all))
threshold <- qchisq(0.999, df = 4)
n_outlier <- sum(mah_dist > threshold)

penguins$mah_dist <- mah_dist

cat(sprintf("Threshold Mahalanobis (chi-sq df=4, alpha=0.001): %.2f\n", threshold))

## Threshold Mahalanobis (chi-sq df=4, alpha=0.001): 18.47

cat(sprintf("Jumlah potensi outlier: %d dari %d observasi (%.1f%%)\n",
            n_outlier, nrow(penguins), n_outlier / nrow(penguins) * 100))

## Jumlah potensi outlier: 1 dari 333 observasi (0.3%)

ggplot(penguins, aes(x = seq_along(mah_dist), y = mah_dist, color = species)) +
  geom_point(alpha = 0.6) +
  geom_hline(yintercept = threshold, linetype = "dashed", color = "red", linewidth = 1) +
  scale_color_manual(values = c("#E74C3C", "#27AE60", "#2980B9")) +
  labs(title = "Gambar 7. Jarak Mahalanobis untuk Deteksi Outlier",
       x = "Indeks Observasi", y = "Jarak Mahalanobis", color = "Spesies") +
  theme_bw() +
  theme(legend.position = "bottom")

4.2 Persiapan Data Multinomial

penguins_mn         <- penguins %>% select(-mah_dist)
penguins_mn$species <- relevel(penguins_mn$species, ref = "Adelie")

set.seed(42)
train_idx_mn <- createDataPartition(penguins_mn$species, p = 0.8, list = FALSE)
train_mn     <- penguins_mn[train_idx_mn, ]
test_mn      <- penguins_mn[-train_idx_mn, ]

cat("Referensi kategori : Adelie\n")

## Referensi kategori : Adelie

cat("Kategori lain      : Chinstrap (g1), Gentoo (g2)\n\n")

## Kategori lain      : Chinstrap (g1), Gentoo (g2)

cat("Data training:", nrow(train_mn), "\n")

## Data training: 268

cat("Data testing :", nrow(test_mn), "\n")

## Data testing : 65

4.3 Model Multinomial

4.3.1 Model Penuh

set.seed(42)
model_penuh <- multinom(
  species ~ culmen_length_mm + culmen_depth_mm + flipper_length_mm +
            body_mass_g + island + sex,
  data  = train_mn,
  trace = FALSE,
  maxit = 500
)

summary(model_penuh)

## Call:
## multinom(formula = species ~ culmen_length_mm + culmen_depth_mm + 
##     flipper_length_mm + body_mass_g + island + sex, data = train_mn, 
##     trace = FALSE, maxit = 500)
## 
## Coefficients:
##           (Intercept) culmen_length_mm culmen_depth_mm flipper_length_mm
## Chinstrap  -139.76675         17.28758       -23.84697        -0.4478519
## Gentoo      -11.03335         10.67415       -21.37892        -0.5511787
##           body_mass_g islandDream islandTorgersen    sexMALE
## Chinstrap -0.03292373    31.86048       -5.249291 -23.287496
## Gentoo     0.01052112   -47.22957      -23.064824  -2.981905
## 
## Std. Errors:
##           (Intercept) culmen_length_mm culmen_depth_mm flipper_length_mm
## Chinstrap    6.522165         16.98786       123.14247          14.61513
## Gentoo       1.421835         39.21404        94.74673          59.71040
##           body_mass_g  islandDream islandTorgersen      sexMALE
## Chinstrap   0.5772008 2.313083e+01    4.868708e-12  0.001169866
## Gentoo      2.7187739 2.935303e-09    2.537818e+00 17.701742622
## 
## Residual Deviance: 0.000190203 
## AIC: 32.00019

4.3.2 Model Parsimoni

set.seed(42)
model_mn <- multinom(
  species ~ culmen_length_mm + culmen_depth_mm + flipper_length_mm +
            body_mass_g + island,
  data  = train_mn,
  trace = FALSE,
  maxit = 500
)

summary(model_mn)

## Call:
## multinom(formula = species ~ culmen_length_mm + culmen_depth_mm + 
##     flipper_length_mm + body_mass_g + island, data = train_mn, 
##     trace = FALSE, maxit = 500)
## 
## Coefficients:
##           (Intercept) culmen_length_mm culmen_depth_mm flipper_length_mm
## Chinstrap -107.728575         21.81565       -28.56980        -0.8293417
## Gentoo      -9.857894         13.87828       -29.35592        -0.4964888
##            body_mass_g islandDream islandTorgersen
## Chinstrap -0.048380887    18.10598       -7.400944
## Gentoo     0.007588159   -40.72572      -21.875173
## 
## Std. Errors:
##           (Intercept) culmen_length_mm culmen_depth_mm flipper_length_mm
## Chinstrap   0.4763064         1.139501        7.215415          6.899134
## Gentoo      0.8876804        63.135870       21.137519        195.277749
##           body_mass_g  islandDream islandTorgersen
## Chinstrap    0.394485 1.438934e+00    0.0002271007
## Gentoo       8.887852 2.059447e-11    0.2892103101
## 
## Residual Deviance: 0.0001977755 
## AIC: 28.0002

4.4 Uji Signifikansi Parameter

4.4.1 Uji Serentak (Likelihood Ratio Test)

model_null <- multinom(species ~ 1, data = train_mn, trace = FALSE)

lrt <- anova(model_null, model_mn, test = "Chisq")
print(lrt)

## Likelihood ratio tests of Multinomial Models
## 
## Response: species
##                                                                           Model
## 1                                                                             1
## 2 culmen_length_mm + culmen_depth_mm + flipper_length_mm + body_mass_g + island
##   Resid. df   Resid. Dev   Test    Df LR stat. Pr(Chi)
## 1       534 5.652588e+02                              
## 2       522 1.977755e-04 1 vs 2    12 565.2586       0

G_stat    <- deviance(model_null) - deviance(model_mn)
df_diff   <- length(coef(model_mn)) - length(coef(model_null))
p_value_G <- pchisq(G_stat, df = df_diff, lower.tail = FALSE)

cat(sprintf("\n=== UJI SERENTAK (LRT) ===\n"))

## 
## === UJI SERENTAK (LRT) ===

cat(sprintf("G2            = %.4f\n", G_stat))

## G2            = 565.2586

cat(sprintf("Derajat bebas = %d\n", df_diff))

## Derajat bebas = 12

cat(sprintf("P-value       = %.6f\n", p_value_G))

## P-value       = 0.000000

cat(sprintf("chi2(df=%d, alpha=0.05) = %.4f\n", df_diff, qchisq(0.95, df_diff)))

## chi2(df=12, alpha=0.05) = 21.0261

cat(sprintf("\nKeputusan : %s H0\n", ifelse(p_value_G < 0.05, "TOLAK", "GAGAL TOLAK")))

## 
## Keputusan : TOLAK H0

cat(sprintf("Kesimpulan: %s\n",
            ifelse(p_value_G < 0.05,
                   "Minimal satu prediktor berpengaruh signifikan terhadap model.",
                   "Tidak ada prediktor yang berpengaruh signifikan.")))

## Kesimpulan: Minimal satu prediktor berpengaruh signifikan terhadap model.

4.4.2 Uji Parsial (Wald Test)

coef_mn  <- summary(model_mn)$coefficients
se_mn    <- summary(model_mn)$standard.errors
z_scores <- coef_mn / se_mn
p_values <- 2 * pnorm(abs(z_scores), lower.tail = FALSE)

cat("=== KOEFISIEN MODEL LOGIT 1 (Chinstrap vs Adelie) ===\n")

## === KOEFISIEN MODEL LOGIT 1 (Chinstrap vs Adelie) ===

result_g1 <- data.frame(
  Variabel = colnames(coef_mn),
  Beta     = round(coef_mn["Chinstrap", ], 4),
  SE       = round(se_mn["Chinstrap", ], 4),
  Wald_Z   = round(z_scores["Chinstrap", ], 4),
  P_value  = round(p_values["Chinstrap", ], 4),
  Sig      = ifelse(p_values["Chinstrap", ] < 0.001, "***",
             ifelse(p_values["Chinstrap", ] < 0.01,  "**",
             ifelse(p_values["Chinstrap", ] < 0.05,  "*", "ns")))
)
print(result_g1)

##                            Variabel      Beta     SE      Wald_Z P_value Sig
## (Intercept)             (Intercept) -107.7286 0.4763   -226.1749  0.0000 ***
## culmen_length_mm   culmen_length_mm   21.8157 1.1395     19.1449  0.0000 ***
## culmen_depth_mm     culmen_depth_mm  -28.5698 7.2154     -3.9596  0.0001 ***
## flipper_length_mm flipper_length_mm   -0.8293 6.8991     -0.1202  0.9043  ns
## body_mass_g             body_mass_g   -0.0484 0.3945     -0.1226  0.9024  ns
## islandDream             islandDream   18.1060 1.4389     12.5829  0.0000 ***
## islandTorgersen     islandTorgersen   -7.4009 0.0002 -32588.8240  0.0000 ***

cat("\n=== KOEFISIEN MODEL LOGIT 2 (Gentoo vs Adelie) ===\n")

## 
## === KOEFISIEN MODEL LOGIT 2 (Gentoo vs Adelie) ===

result_g2 <- data.frame(
  Variabel = colnames(coef_mn),
  Beta     = round(coef_mn["Gentoo", ], 4),
  SE       = round(se_mn["Gentoo", ], 4),
  Wald_Z   = round(z_scores["Gentoo", ], 4),
  P_value  = round(p_values["Gentoo", ], 4),
  Sig      = ifelse(p_values["Gentoo", ] < 0.001, "***",
             ifelse(p_values["Gentoo", ] < 0.01,  "**",
             ifelse(p_values["Gentoo", ] < 0.05,  "*", "ns")))
)
print(result_g2)

##                            Variabel     Beta       SE        Wald_Z P_value Sig
## (Intercept)             (Intercept)  -9.8579   0.8877 -1.110520e+01  0.0000 ***
## culmen_length_mm   culmen_length_mm  13.8783  63.1359  2.198000e-01  0.8260  ns
## culmen_depth_mm     culmen_depth_mm -29.3559  21.1375 -1.388800e+00  0.1649  ns
## flipper_length_mm flipper_length_mm  -0.4965 195.2777 -2.500000e-03  0.9980  ns
## body_mass_g             body_mass_g   0.0076   8.8879  9.000000e-04  0.9993  ns
## islandDream             islandDream -40.7257   0.0000 -1.977508e+12  0.0000 ***
## islandTorgersen     islandTorgersen -21.8752   0.2892 -7.563760e+01  0.0000 ***

cat("\nKeterangan: *** p<0.001  ** p<0.01  * p<0.05  ns = tidak signifikan\n")

## 
## Keterangan: *** p<0.001  ** p<0.01  * p<0.05  ns = tidak signifikan

wald_all <- rbind(
  data.frame(Model = "Logit 1 (Chinstrap vs Adelie)", result_g1),
  data.frame(Model = "Logit 2 (Gentoo vs Adelie)",    result_g2)
)

kable(wald_all, row.names = FALSE,
      caption = "Tabel 9. Hasil Uji Wald (Uji Parsial) Model Multinomial") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), font_size = 11) %>%
  collapse_rows(columns = 1, valign = "middle") %>%
  row_spec(which(wald_all$Sig %in% c("***", "**", "*")), background = "#EBF5FB")

Tabel 9. Hasil Uji Wald (Uji Parsial) Model Multinomial
Model	Variabel	Beta	SE	Wald_Z	P_value	Sig
Logit 1 (Chinstrap vs Adelie)	(Intercept)	-107.7286	0.4763	-2.261749e+02	0.0000	***
	culmen_length_mm	21.8157	1.1395	1.914490e+01	0.0000	***
	culmen_depth_mm	-28.5698	7.2154	-3.959600e+00	0.0001	***
	flipper_length_mm	-0.8293	6.8991	-1.202000e-01	0.9043	ns
	body_mass_g	-0.0484	0.3945	-1.226000e-01	0.9024	ns
	islandDream	18.1060	1.4389	1.258290e+01	0.0000	***
	islandTorgersen	-7.4009	0.0002	-3.258882e+04	0.0000	***
Logit 2 (Gentoo vs Adelie)	(Intercept)	-9.8579	0.8877	-1.110520e+01	0.0000	***
	culmen_length_mm	13.8783	63.1359	2.198000e-01	0.8260	ns
	culmen_depth_mm	-29.3559	21.1375	-1.388800e+00	0.1649	ns
	flipper_length_mm	-0.4965	195.2777	-2.500000e-03	0.9980	ns
	body_mass_g	0.0076	8.8879	9.000000e-04	0.9993	ns
	islandDream	-40.7257	0.0000	-1.977508e+12	0.0000	***
	islandTorgersen	-21.8752	0.2892	-7.563760e+01	0.0000	***

4.5 Persamaan Model

cat("=== MODEL REGRESI LOGISTIK MULTINOMIAL ===\n\n")

## === MODEL REGRESI LOGISTIK MULTINOMIAL ===

cat("LOGIT 1: ln[P(Chinstrap|x) / P(Adelie|x)] =\n  ")

## LOGIT 1: ln[P(Chinstrap|x) / P(Adelie|x)] =
##

cat(sprintf("%.4f", coef_mn["Chinstrap", "(Intercept)"]))

## -107.7286

for (v in colnames(coef_mn)[-1]) {
  cat(sprintf(" %+.4f*%s", coef_mn["Chinstrap", v], v))
}

##  +21.8157*culmen_length_mm -28.5698*culmen_depth_mm -0.8293*flipper_length_mm -0.0484*body_mass_g +18.1060*islandDream -7.4009*islandTorgersen

cat("\n\n")

cat("LOGIT 2: ln[P(Gentoo|x) / P(Adelie|x)] =\n  ")

## LOGIT 2: ln[P(Gentoo|x) / P(Adelie|x)] =
##

cat(sprintf("%.4f", coef_mn["Gentoo", "(Intercept)"]))

## -9.8579

for (v in colnames(coef_mn)[-1]) {
  cat(sprintf(" %+.4f*%s", coef_mn["Gentoo", v], v))
}

##  +13.8783*culmen_length_mm -29.3559*culmen_depth_mm -0.4965*flipper_length_mm +0.0076*body_mass_g -40.7257*islandDream -21.8752*islandTorgersen

cat("\n")

4.6 Odds Ratio

or_mn    <- exp(coef(model_mn))
ci_lower <- exp(coef(model_mn) - 1.96 * summary(model_mn)$standard.errors)
ci_upper <- exp(coef(model_mn) + 1.96 * summary(model_mn)$standard.errors)

or_g1 <- data.frame(
  Variabel = colnames(or_mn),
  OR       = round(or_mn["Chinstrap", ], 4),
  CI_Lower = round(ci_lower["Chinstrap", ], 4),
  CI_Upper = round(ci_upper["Chinstrap", ], 4),
  P_value  = round(p_values["Chinstrap", ], 4)
)

or_g2 <- data.frame(
  Variabel = colnames(or_mn),
  OR       = round(or_mn["Gentoo", ], 4),
  CI_Lower = round(ci_lower["Gentoo", ], 4),
  CI_Upper = round(ci_upper["Gentoo", ], 4),
  P_value  = round(p_values["Gentoo", ], 4)
)

cat("=== ODDS RATIO - LOGIT 1 (Chinstrap vs Adelie) ===\n")

## === ODDS RATIO - LOGIT 1 (Chinstrap vs Adelie) ===

print(or_g1)

##                            Variabel           OR     CI_Lower     CI_Upper
## (Intercept)             (Intercept) 0.000000e+00 0.000000e+00 0.000000e+00
## culmen_length_mm   culmen_length_mm 2.981388e+09 3.194890e+08 2.782154e+10
## culmen_depth_mm     culmen_depth_mm 0.000000e+00 0.000000e+00 0.000000e+00
## flipper_length_mm flipper_length_mm 4.363000e-01 0.000000e+00 3.254492e+05
## body_mass_g             body_mass_g 9.528000e-01 4.397000e-01 2.064300e+00
## islandDream             islandDream 7.300084e+07 4.349935e+06 1.225104e+09
## islandTorgersen     islandTorgersen 6.000000e-04 6.000000e-04 6.000000e-04
##                   P_value
## (Intercept)        0.0000
## culmen_length_mm   0.0000
## culmen_depth_mm    0.0001
## flipper_length_mm  0.9043
## body_mass_g        0.9024
## islandDream        0.0000
## islandTorgersen    0.0000

cat("\n=== ODDS RATIO - LOGIT 2 (Gentoo vs Adelie) ===\n")

## 
## === ODDS RATIO - LOGIT 2 (Gentoo vs Adelie) ===

print(or_g2)

##                            Variabel           OR CI_Lower      CI_Upper P_value
## (Intercept)             (Intercept)       0.0001        0  3.000000e-04  0.0000
## culmen_length_mm   culmen_length_mm 1064784.1863        0  5.883006e+59  0.8260
## culmen_depth_mm     culmen_depth_mm       0.0000        0  1.751888e+05  0.1649
## flipper_length_mm flipper_length_mm       0.6087        0 1.018952e+166  0.9980
## body_mass_g             body_mass_g       1.0076        0  3.704997e+07  0.9993
## islandDream             islandDream       0.0000        0  0.000000e+00  0.0000
## islandTorgersen     islandTorgersen       0.0000        0  0.000000e+00  0.0000

or_combined <- rbind(
  data.frame(Model = "Chinstrap vs Adelie", or_g1),
  data.frame(Model = "Gentoo vs Adelie",    or_g2)
)

kable(or_combined, row.names = FALSE, digits = 4,
      caption = "Tabel 10. Odds Ratio dengan 95% Confidence Interval") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), font_size = 11) %>%
  collapse_rows(columns = 1, valign = "middle")

Tabel 10. Odds Ratio dengan 95% Confidence Interval
Model	Variabel	OR	CI_Lower	CI_Upper	P_value
Chinstrap vs Adelie	(Intercept)	0.000000e+00	0.000000e+00	0.000000e+00	0.0000
	culmen_length_mm	2.981388e+09	3.194890e+08	2.782154e+10	0.0000
	culmen_depth_mm	0.000000e+00	0.000000e+00	0.000000e+00	0.0001
	flipper_length_mm	4.363000e-01	0.000000e+00	3.254492e+05	0.9043
	body_mass_g	9.528000e-01	4.397000e-01	2.064300e+00	0.9024
	islandDream	7.300084e+07	4.349935e+06	1.225104e+09	0.0000
	islandTorgersen	6.000000e-04	6.000000e-04	6.000000e-04	0.0000
Gentoo vs Adelie	(Intercept)	1.000000e-04	0.000000e+00	3.000000e-04	0.0000
	culmen_length_mm	1.064784e+06	0.000000e+00	5.883006e+59	0.8260
	culmen_depth_mm	0.000000e+00	0.000000e+00	1.751888e+05	0.1649
	flipper_length_mm	6.087000e-01	0.000000e+00	1.018952e+166	0.9980
	body_mass_g	1.007600e+00	0.000000e+00	3.704997e+07	0.9993
	islandDream	0.000000e+00	0.000000e+00	0.000000e+00	0.0000
	islandTorgersen	0.000000e+00	0.000000e+00	0.000000e+00	0.0000

cat("=== INTERPRETASI ODDS RATIO (variabel numerik, p < 0.05) ===\n\n")

## === INTERPRETASI ODDS RATIO (variabel numerik, p < 0.05) ===

vars_num <- c("culmen_length_mm", "culmen_depth_mm", "flipper_length_mm", "body_mass_g")

for (v in vars_num) {
  or_val <- or_mn["Chinstrap", v]
  pv     <- p_values["Chinstrap", v]
  if (pv < 0.05) {
    arah <- ifelse(or_val > 1, "meningkat", "menurun")
    cat(sprintf("[Logit 1 - Chinstrap vs Adelie]\n"))
    cat(sprintf("  Setiap kenaikan 1 unit %s, peluang Chinstrap vs Adelie\n", v))
    cat(sprintf("  %s sebesar %.4f kali (OR=%.4f, p=%.4f)\n\n", arah, or_val, or_val, pv))
  }
}

## [Logit 1 - Chinstrap vs Adelie]
##   Setiap kenaikan 1 unit culmen_length_mm, peluang Chinstrap vs Adelie
##   meningkat sebesar 2981388376.5594 kali (OR=2981388376.5594, p=0.0000)
## 
## [Logit 1 - Chinstrap vs Adelie]
##   Setiap kenaikan 1 unit culmen_depth_mm, peluang Chinstrap vs Adelie
##   menurun sebesar 0.0000 kali (OR=0.0000, p=0.0001)

for (v in vars_num) {
  or_val <- or_mn["Gentoo", v]
  pv     <- p_values["Gentoo", v]
  if (pv < 0.05) {
    arah <- ifelse(or_val > 1, "meningkat", "menurun")
    cat(sprintf("[Logit 2 - Gentoo vs Adelie]\n"))
    cat(sprintf("  Setiap kenaikan 1 unit %s, peluang Gentoo vs Adelie\n", v))
    cat(sprintf("  %s sebesar %.4f kali (OR=%.4f, p=%.4f)\n\n", arah, or_val, or_val, pv))
  }
}

4.7 Uji Kesesuaian Model (Goodness of Fit)

ll_null  <- logLik(model_null)
ll_model <- logLik(model_mn)

mcfadden_r2   <- as.numeric(1 - (ll_model / ll_null))
cox_r2        <- as.numeric(1 - exp((2 / nrow(train_mn)) * (ll_null - ll_model)))
nagelkerke_r2 <- cox_r2 / (1 - exp((2 / nrow(train_mn)) * as.numeric(ll_null)))

cat("=== UJI KESESUAIAN MODEL (Goodness of Fit) ===\n\n")

## === UJI KESESUAIAN MODEL (Goodness of Fit) ===

cat(sprintf("Log-likelihood (Null)  : %.4f\n", as.numeric(ll_null)))

## Log-likelihood (Null)  : -282.6294

cat(sprintf("Log-likelihood (Model) : %.4f\n", as.numeric(ll_model)))

## Log-likelihood (Model) : -0.0001

cat(sprintf("AIC                    : %.4f\n", AIC(model_mn)))

## AIC                    : 28.0002

cat(sprintf("BIC                    : %.4f\n", BIC(model_mn)))

## BIC                    : 78.2740

cat(sprintf("\nMcFadden R2    : %.4f (%.2f%%)\n", mcfadden_r2,   mcfadden_r2   * 100))

## 
## McFadden R2    : 1.0000 (100.00%)

cat(sprintf("Cox & Snell R2 : %.4f (%.2f%%)\n", cox_r2,         cox_r2        * 100))

## Cox & Snell R2 : 0.8787 (87.87%)

cat(sprintf("Nagelkerke R2  : %.4f (%.2f%%)\n", nagelkerke_r2,  nagelkerke_r2 * 100))

## Nagelkerke R2  : 1.0000 (100.00%)

cat("=== DEVIANCE TEST ===\n")

## === DEVIANCE TEST ===

cat(sprintf("Null Deviance     : %.4f\n", deviance(model_null)))

## Null Deviance     : 565.2588

cat(sprintf("Residual Deviance : %.4f\n", deviance(model_mn)))

## Residual Deviance : 0.0002

cat(sprintf("LR G2             : %.4f\n", deviance(model_null) - deviance(model_mn)))

## LR G2             : 565.2586

cat(sprintf("P-value           : %.6f\n",
            pchisq(deviance(model_null) - deviance(model_mn),
                   df = model_mn$edf - 2, lower.tail = FALSE)))

## P-value           : 0.000000

4.8 Evaluasi Model Multinomial

pred_mn_prob  <- predict(model_mn, test_mn, type = "probs")
pred_mn_class <- predict(model_mn, test_mn, type = "class")

cm_mn   <- confusionMatrix(factor(pred_mn_class, levels = levels(test_mn$species)),
                            test_mn$species)
aper_mn <- (sum(cm_mn$table) - sum(diag(cm_mn$table))) / sum(cm_mn$table)

cat("=== CONFUSION MATRIX - MULTINOMIAL (Test Set) ===\n")

## === CONFUSION MATRIX - MULTINOMIAL (Test Set) ===

print(cm_mn$table)

##            Reference
## Prediction  Adelie Chinstrap Gentoo
##   Adelie        29         0      0
##   Chinstrap      0        13      0
##   Gentoo         0         0     23

cat(sprintf("\nAkurasi : %.2f%%\n", cm_mn$overall["Accuracy"] * 100))

## 
## Akurasi : 100.00%

cat(sprintf("APER    : %.4f = %.2f%%\n", aper_mn, aper_mn * 100))

## APER    : 0.0000 = 0.00%

cm_mn_df        <- as.data.frame(cm_mn$table)
names(cm_mn_df) <- c("Predicted", "Actual", "Freq")

ggplot(cm_mn_df, aes(x = Actual, y = Predicted, fill = Freq)) +
  geom_tile(color = "white", linewidth = 1) +
  geom_text(aes(label = Freq), size = 8, fontface = "bold") +
  scale_fill_gradient(low = "#FDFEFE", high = "#1E8449") +
  labs(title = "Gambar 8. Confusion Matrix Multinomial Logistik - Test Set",
       x = "Kelas Aktual", y = "Kelas Prediksi", fill = "Frekuensi") +
  theme_bw() +
  theme(axis.text  = element_text(size = 12),
        axis.title = element_text(size = 12, face = "bold"),
        plot.title = element_text(size = 13, face = "bold"))

prob_df           <- as.data.frame(round(pred_mn_prob, 4))
prob_df$Predicted <- pred_mn_class
prob_df$Actual    <- test_mn$species
prob_df$Correct   <- ifelse(prob_df$Predicted == prob_df$Actual, "V", "X")

kable(head(prob_df, 10), row.names = FALSE,
      caption = "Tabel 11. Peluang Prediksi per Kelas - 10 Observasi Pertama (Test Set)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))

Tabel 11. Peluang Prediksi per Kelas - 10 Observasi Pertama (Test Set)
Adelie	Predicted	Actual	Correct
1	Adelie	Adelie	V
1	Adelie	Adelie	V
1	Adelie	Adelie	V
1	Adelie	Adelie	V
1	Adelie	Adelie	V
1	Adelie	Adelie	V
1	Adelie	Adelie	V
1	Adelie	Adelie	V
1	Adelie	Adelie	V
1	Adelie	Adelie	V

prob_long <- as.data.frame(pred_mn_prob) %>%
  mutate(Actual = test_mn$species) %>%
  pivot_longer(c(Adelie, Chinstrap, Gentoo),
               names_to  = "Pred_Species",
               values_to = "Probability")

ggplot(prob_long, aes(x = Pred_Species, y = Probability, fill = Pred_Species)) +
  geom_boxplot(alpha = 0.7) +
  facet_wrap(~Actual, labeller = label_both) +
  scale_fill_manual(values = c("#E74C3C", "#27AE60", "#2980B9")) +
  labs(title = "Gambar 9. Distribusi Probabilitas Prediksi per Kelas Aktual",
       x = "Spesies Prediksi", y = "Probabilitas") +
  theme_bw() +
  theme(legend.position = "none",
        strip.background = element_rect(fill = "#2C3E50"),
        strip.text = element_text(color = "white", face = "bold"))

5 Perbandingan LDA vs Regresi Logistik Multinomial

final_comparison <- data.frame(
  Aspek = c("Metode Estimasi", "Asumsi Distribusi", "Tipe Prediktor",
            "Interpretasi", "APER Test Set", "Akurasi Test Set", "AIC"),
  LDA = c(
    "Matriks B dan W, eigenvector W-1B",
    "Multivariat Normal, kovarians homogen",
    "Hanya numerik (interval/rasio)",
    "Skor LD, cutting score, posterior prob",
    sprintf("%.2f%%", aper_test * 100),
    sprintf("%.2f%%", (1 - aper_test) * 100),
    "-"
  ),
  Multinomial_Logistik = c(
    "Maximum Likelihood (Newton-Raphson)",
    "Tidak diasumsikan",
    "Numerik & kategorik",
    "Koefisien logit, OR, posterior prob",
    sprintf("%.2f%%", aper_mn * 100),
    sprintf("%.2f%%", (1 - aper_mn) * 100),
    sprintf("%.2f", AIC(model_mn))
  )
)

kable(final_comparison, row.names = FALSE,
      col.names = c("Aspek", "LDA", "Multinomial Logistik"),
      caption = "Tabel 12. Perbandingan LDA vs Regresi Logistik Multinomial") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
  column_spec(1, bold = TRUE)

Tabel 12. Perbandingan LDA vs Regresi Logistik Multinomial
Aspek	LDA	Multinomial Logistik
Metode Estimasi	Matriks B dan W, eigenvector W-1B	Maximum Likelihood (Newton-Raphson)
Asumsi Distribusi	Multivariat Normal, kovarians homogen	Tidak diasumsikan
Tipe Prediktor	Hanya numerik (interval/rasio)	Numerik & kategorik
Interpretasi	Skor LD, cutting score, posterior prob	Koefisien logit, OR, posterior prob
APER Test Set	1.54%	0.00%
Akurasi Test Set	98.46%	100.00%
AIC		28.00

Analisis Diskriminan & Regresi Logistik Multinomial

Implementasi dengan Dataset Palmer Penguins

Lintang Isa Mardani (24031554104), Ivan Cahya Aryasuta (24031554172), Rizqi Wahyu Gusniadi (24031554203)

11 May 2026

1 Persiapan

1.1 Load Package

1.2 Load & Preprocessing Data

2 Statistika Deskriptif

3 Analisis Diskriminan

3.1 Uji Asumsi

3.1.1 Uji Normalitas (Shapiro-Wilk)

3.1.2 Uji Homogenitas Varians (Levene)

3.2 Pembagian Data Training dan Testing

3.3 Perhitungan Manual Matriks B dan W

3.4 Eigenvector W⁻¹B

3.5 Model LDA

3.6 Visualisasi Fungsi Diskriminan

3.7 Posterior Probability & Classification Rule

3.8 Evaluasi Model LDA

3.8.1 Confusion Matrix & APER - Training Set

3.8.2 Confusion Matrix & APER - Test Set

3.8.3 Leave-One-Out Cross Validation (LOO-CV)

3.9 QDA (Quadratic Discriminant Analysis)

3.10 Perbandingan LDA vs QDA

4 Regresi Logistik Multinomial

4.1 Uji Asumsi

4.1.1 Tidak Ada Multikolinieritas (VIF)

4.1.2 Deteksi Outlier (Mahalanobis Distance)

4.2 Persiapan Data Multinomial

4.3 Model Multinomial

4.3.1 Model Penuh

4.3.2 Model Parsimoni

4.4 Uji Signifikansi Parameter

4.4.1 Uji Serentak (Likelihood Ratio Test)

4.4.2 Uji Parsial (Wald Test)

4.5 Persamaan Model

4.6 Odds Ratio

4.7 Uji Kesesuaian Model (Goodness of Fit)

4.8 Evaluasi Model Multinomial

5 Perbandingan LDA vs Regresi Logistik Multinomial