Analisis Diskriminan: Palmer Archipelago Penguin Data

STEP 0: Library

library(MASS)       
library(ggplot2)    
library(dplyr)      
library(tidyr)     
library(MVN)        
library(biotools)   
library(gridExtra)  
library(GGally)     
library(png)

STEP 1: Load & Preprocessing Data

df_raw <- read.csv("Palmer Archipelago (Antarctica) Penguin Data/penguins_lter.csv",
                   stringsAsFactors = FALSE)

cat("Dimensi data asli:", dim(df_raw), "\n")

## Dimensi data asli: 344 17

# Rename kolom
df_raw <- df_raw %>%
  rename(
    species_raw    = Species,
    island_raw     = Island,
    culmen_length  = Culmen.Length..mm.,
    culmen_depth   = Culmen.Depth..mm.,
    flipper_length = Flipper.Length..mm.,
    body_mass      = Body.Mass..g.,
    sex            = Sex
  )

# Recode Species -> 3 label
# Adelie -> "Adelie" | Gentoo -> "Gentoo" | Chinstrap -> "Others"
# Recode Island  -> 3 label
# Biscoe -> "Biscoe" | Dream -> "Dream" | Torgersen -> "Others"

df_raw <- df_raw %>%
  mutate(
    species = case_when(
      grepl("Adelie", species_raw, ignore.case = TRUE) ~ "Adelie",
      grepl("Gentoo", species_raw, ignore.case = TRUE) ~ "Gentoo",
      TRUE                                              ~ "Others"
    ),
    island = case_when(
      island_raw == "Biscoe" ~ "Biscoe",
      island_raw == "Dream"  ~ "Dream",
      TRUE                   ~ "Others"
    )
  )

# Bersihkan NA, filter sex valid, buat factor
df <- df_raw %>%
  select(species, island, culmen_length, culmen_depth,
         flipper_length, body_mass, sex) %>%
  filter(
    !is.na(culmen_length), !is.na(culmen_depth),
    !is.na(flipper_length), !is.na(body_mass),
    sex %in% c("MALE", "FEMALE")
  ) %>%
  mutate(
    species = factor(species, levels = c("Adelie", "Gentoo", "Others")),
    island  = factor(island,  levels = c("Biscoe", "Dream", "Others")),
    sex     = factor(sex,     levels = c("FEMALE", "MALE"))
  )

# LDA skala asli (bukan log-transform)
vars_x <- c("culmen_length", "culmen_depth", "flipper_length", "body_mass")

cat("Data siap:", nrow(df), "baris\n")

## Data siap: 333 baris

cat("Distribusi species:\n")

## Distribusi species:

print(table(df$species))

## 
## Adelie Gentoo Others 
##    146    119     68

STEP 2: Eksplorasi Data (EDA)

cat("Ringkasan statistik per species:\n")

## Ringkasan statistik per species:

df %>%
  group_by(species) %>%
  summarise(across(all_of(vars_x),
                   list(mean = ~ round(mean(.),2),
                        sd   = ~ round(sd(.),2)),
                   .names = "{.col}_{.fn}")) %>%
  print()

## # A tibble: 3 × 9
##   species culmen_length_mean culmen_length_sd culmen_depth_mean culmen_depth_sd
##   <fct>                <dbl>            <dbl>             <dbl>           <dbl>
## 1 Adelie                38.8             2.66              18.4            1.22
## 2 Gentoo                47.6             3.11              15              0.99
## 3 Others                48.8             3.34              18.4            1.14
## # ℹ 4 more variables: flipper_length_mean <dbl>, flipper_length_sd <dbl>,
## #   body_mass_mean <dbl>, body_mass_sd <dbl>

# Boxplot per variabel per species
df_long <- df %>%
  select(species, all_of(vars_x)) %>%
  pivot_longer(cols = all_of(vars_x), names_to = "var", values_to = "val")

p_box <- ggplot(df_long, aes(x = species, y = val, fill = species)) +
  geom_boxplot(alpha = 0.7, outlier.color = "red", outlier.shape = 16) +
  facet_wrap(~var, scales = "free_y", ncol = 2) +
  scale_fill_manual(values = c("Adelie"="#3498db",
                               "Gentoo"="#2ecc71",
                               "Others"="#e74c3c")) +
  labs(title = "EDA: Distribusi Variabel Numerik per Species",
       x = "Species", y = "Nilai") +
  theme_minimal() + theme(legend.position = "bottom")

png("plot_eda_boxplot.png", width = 1200, height = 900, res = 110)
print(p_box)
dev.off()

## png 
##   2

img <- readPNG("plot_eda_boxplot.png")
plot.new(); rasterImage(img, 0, 0, 1, 1)

# Scatterplot matrix
png("plot_eda_scatter.png", width = 1100, height = 1000, res = 100)
print(
  ggpairs(df, columns = vars_x,
          aes(color = species, alpha = 0.5),
          upper = list(continuous = "cor"),
          lower = list(continuous = "points"),
          diag  = list(continuous = "densityDiag")) +
    scale_color_manual(values = c("Adelie"="#3498db",
                                  "Gentoo"="#2ecc71",
                                  "Others"="#e74c3c")) +
    scale_fill_manual(values  = c("Adelie"="#3498db",
                                  "Gentoo"="#2ecc71",
                                  "Others"="#e74c3c")) +
    labs(title = "EDA: Scatterplot Matrix") +
    theme_minimal()
)

## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## No shared levels found between `names(values)` of the manual scale and the
## data's colour values.

## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.

## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## No shared levels found between `names(values)` of the manual scale and the
## data's colour values.

## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.

## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## No shared levels found between `names(values)` of the manual scale and the
## data's colour values.

## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.

## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.

## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.

## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## No shared levels found between `names(values)` of the manual scale and the
## data's colour values.

## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.

## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## No shared levels found between `names(values)` of the manual scale and the
## data's colour values.

## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.

## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.

## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## No shared levels found between `names(values)` of the manual scale and the
## data's fill values.

## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## No shared levels found between `names(values)` of the manual scale and the
## data's colour values.

## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.

## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.

## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## No shared levels found between `names(values)` of the manual scale and the
## data's fill values.

## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.

dev.off()

## png 
##   2

img <- readPNG("plot_eda_scatter.png")
plot.new(); rasterImage(img, 0, 0, 1, 1)

STEP 3: Uji Asumsi Analisis Diskriminan

Analisis Diskriminan Linear (LDA) memiliki dua asumsi utama:

Multivariate Normal - setiap kelompok berdistribusi normal multivariat
Homogenitas Matriks Kovarians - matriks kovarians antar kelompok homogen

Asumsi 1: Multivariate Normal

cat("========== ASUMSI 1: MULTIVARIATE NORMAL ==========\n")

## ========== ASUMSI 1: MULTIVARIATE NORMAL ==========

cat("Uji: Shapiro-Wilk per variabel per kelompok (univariat)\n")

## Uji: Shapiro-Wilk per variabel per kelompok (univariat)

cat("H0 : Data normal\n")

## H0 : Data normal

cat("Keputusan: Tolak H0 jika p-value < 0.05\n\n")

## Keputusan: Tolak H0 jika p-value < 0.05

mvn_status_list <- c()

for (sp in levels(df$species)) {
  sub <- df %>% filter(species == sp) %>% select(all_of(vars_x))

  shapiro_pvals <- sapply(sub, function(x) shapiro.test(x)$p.value)
  ok            <- all(shapiro_pvals > 0.05)
  mvn_status_list[sp] <- ok

  cat(sprintf("Species %-8s :", sp))
  cat(sprintf(" [%s]\n", ifelse(ok, "NORMAL OK", "Tidak Normal")))
  for (v in names(shapiro_pvals)) {
    flag <- ifelse(shapiro_pvals[v] < 0.05, " <-- p < 0.05", "")
    cat(sprintf("  %-16s p = %.4f%s\n", v, shapiro_pvals[v], flag))
  }
  cat("\n")
}

## Species Adelie   : [Tidak Normal]
##   culmen_length    p = 0.6848
##   culmen_depth     p = 0.1047
##   flipper_length   p = 0.7427
##   body_mass        p = 0.0423 <-- p < 0.05
## 
## Species Gentoo   : [Tidak Normal]
##   culmen_length    p = 0.0199 <-- p < 0.05
##   culmen_depth     p = 0.0438 <-- p < 0.05
##   flipper_length   p = 0.0018 <-- p < 0.05
##   body_mass        p = 0.2605
## 
## Species Others   : [NORMAL OK]
##   culmen_length    p = 0.1941
##   culmen_depth     p = 0.1418
##   flipper_length   p = 0.8106
##   body_mass        p = 0.5605

status_mvn <- ifelse(all(mvn_status_list), "TERPENUHI", "TIDAK TERPENUHI")
cat(sprintf(">> STATUS ASUMSI 1 (MVN) : %s\n", status_mvn))

## >> STATUS ASUMSI 1 (MVN) : TIDAK TERPENUHI

Asumsi normalitas tidak sepenuhnya terpenuhi, terutama pada Gentoo.Ini adalah fenomena biologis: Gentoo memiliki distribusi ukuran tubuh yang sedikit skewed karena variasi antar individu yang lebih besar. Selain itu, Shapiro-Wilk sangat sensitif pada n >= 50, sehingga penyimpangan kecil yang tidak bermakna secara praktis pun terdeteksi. LDA tetap robust untuk pelanggaran ringan ini (Hair et al., 2010). Sebagai mitigasi: QDA akan dijalankan sebagai pembanding.

Asumsi 2: Homogenitas Matriks Kovarians (Box’s M Test)

cat("========== ASUMSI 2: HOMOGENITAS VARIANS ==========\n")

## ========== ASUMSI 2: HOMOGENITAS VARIANS ==========

cat("Uji: Box's M Test\n")

## Uji: Box's M Test

cat("H0 : Matriks kovarians antar kelompok homogen\n")

## H0 : Matriks kovarians antar kelompok homogen

cat("Keputusan: Tolak H0 jika p-value < 0.001\n")

## Keputusan: Tolak H0 jika p-value < 0.001

cat("(Threshold 0.001 karena Box's M sangat sensitif pada n besar)\n\n")

## (Threshold 0.001 karena Box's M sangat sensitif pada n besar)

boxm_res    <- boxM(df[, vars_x], df$species)
print(boxm_res)

## 
##  Box's M-test for Homogeneity of Covariance Matrices
## 
## data:  df[, vars_x]
## Chi-Sq (approx.) = 74.731, df = 20, p-value = 3.02e-08

status_boxm <- ifelse(boxm_res$p.value > 0.001, "TERPENUHI", "TIDAK TERPENUHI")
cat(sprintf("\n>> STATUS ASUMSI 2 (Box's M) : %s\n", status_boxm))

## 
## >> STATUS ASUMSI 2 (Box's M) : TIDAK TERPENUHI

Matriks kovarians tidak homogen karena Gentoo jauh lebih besar dari Adelie dan Chinstrap, sehingga variansi antar kelompok berbeda secara struktural. Ini sifat biologis dataset, bukan error.Konsekuensi: LDA kurang optimal -> QDA lebih sesuai.Kedua metode tetap dijalankan untuk perbandingan

Ringkasan Asumsi LDA

cat("RINGKASAN ASUMSI ANALISIS DISKRIMINAN\n")

## RINGKASAN ASUMSI ANALISIS DISKRIMINAN

asumsi_names    <- c("Multivariate Normal", "Homogenitas Kovarians")
asumsi_statuses <- c(status_mvn, status_boxm)

for (i in seq_along(asumsi_names)) {
  tanda <- if (asumsi_statuses[i] == "TERPENUHI") "[v]" else "[x]"
  cat(sprintf("  %s %-25s : %s\n",
              tanda, asumsi_names[i], asumsi_statuses[i]))
}

##   [x] Multivariate Normal       : TIDAK TERPENUHI
##   [x] Homogenitas Kovarians     : TIDAK TERPENUHI

semua_ok <- all(asumsi_statuses == "TERPENUHI")
if (semua_ok) {
  cat("  KESIMPULAN: SEMUA ASUMSI TERPENUHI, LDA optimal\n")
} else {
  cat("  KESIMPULAN: ADA ASUMSI TIDAK TERPENUHI\n")
}

##   KESIMPULAN: ADA ASUMSI TIDAK TERPENUHI

STEP 4: Bayes Classification Rule & Prior Probability

cat("STEP 4: BAYES CLASSIFICATION RULE\n")

## STEP 4: BAYES CLASSIFICATION RULE

# Prior probability = proporsi tiap kelas di data
prior_prob <- table(df$species) / nrow(df)
cat("Prior Probability per Species:\n")

## Prior Probability per Species:

print(round(prior_prob, 4))

## 
## Adelie Gentoo Others 
## 0.4384 0.3574 0.2042

cat("\nInterpretasi: Tanpa informasi prediktor, probabilitas awal tiap\n")

## 
## Interpretasi: Tanpa informasi prediktor, probabilitas awal tiap

# Fit LDA
lda_model <- lda(species ~ culmen_length + culmen_depth +
                   flipper_length + body_mass,
                 data  = df,
                 prior = as.vector(prior_prob))

cat("Ringkasan Model LDA:\n")

## Ringkasan Model LDA:

print(lda_model)

## Call:
## lda(species ~ culmen_length + culmen_depth + flipper_length + 
##     body_mass, data = df, prior = as.vector(prior_prob))
## 
## Prior probabilities of groups:
##    Adelie    Gentoo    Others 
## 0.4384384 0.3573574 0.2042042 
## 
## Group means:
##        culmen_length culmen_depth flipper_length body_mass
## Adelie      38.82397     18.34726       190.1027  3706.164
## Gentoo      47.56807     14.99664       217.2353  5092.437
## Others      48.83382     18.42059       195.8235  3733.088
## 
## Coefficients of linear discriminants:
##                         LD1         LD2
## culmen_length  -0.085926709  0.41660160
## culmen_depth    1.041646762  0.01042272
## flipper_length -0.084552842 -0.01424552
## body_mass      -0.001347375 -0.00168559
## 
## Proportion of trace:
##    LD1    LD2 
## 0.8655 0.1345

# Posterior probability
lda_pred <- predict(lda_model)

posterior_df <- as.data.frame(lda_pred$posterior) %>%
  mutate(across(everything(), ~ round(.x, 4))) %>%
  mutate(Predicted = lda_pred$class,
         Actual    = df$species)

cat("Contoh 10 baris pertama Posterior Probability:\n")

## Contoh 10 baris pertama Posterior Probability:

print(head(posterior_df, 10))

##    Adelie Gentoo Others Predicted Actual
## 1  1.0000      0 0.0000    Adelie Adelie
## 2  0.9997      0 0.0003    Adelie Adelie
## 3  0.9840      0 0.0160    Adelie Adelie
## 4  1.0000      0 0.0000    Adelie Adelie
## 5  1.0000      0 0.0000    Adelie Adelie
## 6  0.9999      0 0.0001    Adelie Adelie
## 7  1.0000      0 0.0000    Adelie Adelie
## 8  0.9298      0 0.0702    Adelie Adelie
## 9  1.0000      0 0.0000    Adelie Adelie
## 10 1.0000      0 0.0000    Adelie Adelie

STEP 5: Linear Discriminant Function (Fisher)

cat("STEP 5: LINEAR DISCRIMINANT FUNCTION\n")

## STEP 5: LINEAR DISCRIMINANT FUNCTION

X   <- as.matrix(df[, vars_x])
grp <- df$species
k   <- nlevels(grp)
p   <- ncol(X)
n   <- nrow(X)

# Grand mean x̄
x_bar <- colMeans(X)
cat("Grand Mean (x̄):\n")

## Grand Mean (x̄):

print(round(x_bar, 4))

##  culmen_length   culmen_depth flipper_length      body_mass 
##        43.9928        17.1649       200.9670      4207.0571

# Between-groups matrix
B <- matrix(0, p, p)
for (sp in levels(grp)) {
  xi_bar <- colMeans(X[grp == sp, ])
  ni     <- sum(grp == sp)
  d      <- matrix(xi_bar - x_bar, ncol = 1)
  B      <- B + ni * (d %*% t(d))
}
rownames(B) <- colnames(B) <- vars_x
cat("Between-Groups Matrix (B):\n")

## Between-Groups Matrix (B):

print(round(B, 4))

##                culmen_length culmen_depth flipper_length   body_mass
## culmen_length       7015.386   -1401.4088      13426.980    598663.4
## culmen_depth       -1401.409     870.7852      -6512.219   -355385.5
## flipper_length     13426.980   -6512.2188      50525.884   2674311.3
## body_mass         598663.355 -355385.5494    2674311.270 145190219.1

# Within-groups matrix W
W <- matrix(0, p, p)
for (sp in levels(grp)) {
  Xi     <- X[grp == sp, ]
  xi_bar <- colMeans(Xi)
  for (j in 1:nrow(Xi)) {
    d <- matrix(Xi[j, ] - xi_bar, ncol = 1)
    W <- W + (d %*% t(d))
  }
}
rownames(W) <- colnames(W) <- vars_x
cat("Within-Groups Matrix (W):\n")

## Within-Groups Matrix (W):

print(round(W, 4))

##                culmen_length culmen_depth flipper_length  body_mass
## culmen_length      2913.5170     583.9945       3192.340   263083.6
## culmen_depth        583.9945     416.6737       1217.732   106898.1
## flipper_length     3192.3404    1217.7323      14692.753   596616.4
## body_mass        263083.5822  106898.1170     596616.358 70069446.8

# W^-1 B dan Eigendecomposition
W_inv <- solve(W)
WinvB <- W_inv %*% B

eig     <- eigen(WinvB)
eig_val <- Re(eig$values)
eig_vec <- Re(eig$vectors)

cat("Eigenvalues (urut dari terbesar):\n")

## Eigenvalues (urut dari terbesar):

print(round(eig_val, 6))

## [1] 15.030390  2.336239  0.000000  0.000000

# Proporsi varians yang dijelaskan tiap LD
prop_var <- eig_val / sum(eig_val[eig_val > 1e-10])
cat("\nProporsi varians per fungsi diskriminan:\n")

## 
## Proporsi varians per fungsi diskriminan:

for (i in 1:min(k-1, p)) {
  cat(sprintf("  LD%d : %.4f (%.2f%%)\n", i, eig_val[i], prop_var[i]*100))
}

##   LD1 : 15.0304 (86.55%)
##   LD2 : 2.3362 (13.45%)

# Ambil r = min(k-1, p) = min(2, 4) = 2 fungsi diskriminan
r         <- min(k-1, p)
disc_coef <- eig_vec[, 1:r, drop = FALSE]
rownames(disc_coef) <- vars_x
colnames(disc_coef) <- paste0("LD", 1:r)

# Normalisasi tanda: LD1 positif ke arah Gentoo (terbesar)
for (col in 1:ncol(disc_coef)) {
  cg <- colMeans(X[grp == "Gentoo", ]) %*% disc_coef[, col, drop = FALSE]
  ca <- colMeans(X[grp == "Adelie", ]) %*% disc_coef[, col, drop = FALSE]
  if (cg < ca) disc_coef[, col] <- -disc_coef[, col]
}

cat(sprintf("Jumlah fungsi diskriminan: r = min(%d-1, %d) = %d\n", k, p, r))

## Jumlah fungsi diskriminan: r = min(3-1, 4) = 2

cat("\nKoefisien Fungsi Diskriminan (eigenvectors W^-1 B):\n")

## 
## Koefisien Fungsi Diskriminan (eigenvectors W^-1 B):

print(round(disc_coef, 4))

##                    LD1     LD2
## culmen_length   0.0819  0.9991
## culmen_depth   -0.9934  0.0250
## flipper_length  0.0806 -0.0342
## body_mass       0.0013 -0.0040

# Discriminant scores tiap observasi: Y = X * disc_coef
Y <- X %*% disc_coef
cat("Contoh Discriminant Scores (10 obs pertama):\n")

## Contoh Discriminant Scores (10 obs pertama):

print(round(head(Y, 10), 4))

##          LD1     LD2
##  [1,] 4.0413 18.1895
##  [2,] 5.8328 18.1837
##  [3,] 5.3214 20.9138
##  [4,] 3.8307 16.6094
##  [5,] 2.7675 18.5336
##  [6,] 4.7583 18.4725
##  [7,] 5.4728 14.0944
##  [8,] 4.6718 22.3493
##  [9,] 2.3874 17.2087
## [10,] 3.4944 10.5452

STEP 6: Critical Cutting Score & Klasifikasi

# Cutting score = rata-rata dua centroid
cat("STEP 6: CUTTING SCORE & KLASIFIKASI\n")

## STEP 6: CUTTING SCORE & KLASIFIKASI

# Centroid tiap kelas di ruang diskriminan
centroids <- matrix(0, nrow = k, ncol = r)
rownames(centroids) <- levels(grp)
colnames(centroids) <- paste0("LD", 1:r)

for (i in seq_along(levels(grp))) {
  sp             <- levels(grp)[i]
  xi_bar         <- colMeans(X[grp == sp, ])
  centroids[i, ] <- xi_bar %*% disc_coef
}

cat("Centroid tiap species di ruang diskriminan:\n")

## Centroid tiap species di ruang diskriminan:

print(round(centroids, 4))

##            LD1     LD2
## Adelie  5.0467 17.7711
## Gentoo 13.0607 19.8927
## Others  6.2900 27.4695

# Cutting score = rata-rata dua centroid (Slide 18)
m_AG <- 0.5 * (centroids["Adelie", "LD1"] + centroids["Gentoo", "LD1"])
m_AO <- 0.5 * (centroids["Adelie", "LD1"] + centroids["Others", "LD1"])
cat(sprintf("\nCutting Score Adelie-Gentoo (LD1) : %.4f\n", m_AG))

## 
## Cutting Score Adelie-Gentoo (LD1) : 9.0537

cat(sprintf("Cutting Score Adelie-Others (LD1) : %.4f\n", m_AO))

## Cutting Score Adelie-Others (LD1) : 5.6683

Interpretasi: Observasi diklasifikasikan ke kelas dengan centroid terdekat di ruang diskriminan (jarak Euclidean)

# Klasifikasi manual: jarak Euclidean ke centroid
classify_manual <- function(y_scores, centroids) {
  dists <- apply(centroids, 1, function(c) sqrt(sum((y_scores - c)^2)))
  names(which.min(dists))
}

pred_manual <- apply(Y, 1, classify_manual, centroids = centroids)
pred_manual <- factor(pred_manual, levels = levels(grp))

cat("Hasil Klasifikasi Manual (10 obs pertama):\n")

## Hasil Klasifikasi Manual (10 obs pertama):

print(data.frame(
  Actual    = df$species[1:10],
  Predicted = pred_manual[1:10],
  LD1       = round(Y[1:10, 1], 4),
  LD2       = round(Y[1:10, 2], 4)
))

##    Actual Predicted    LD1     LD2
## 1  Adelie    Adelie 4.0413 18.1895
## 2  Adelie    Adelie 5.8328 18.1837
## 3  Adelie    Adelie 5.3214 20.9138
## 4  Adelie    Adelie 3.8307 16.6094
## 5  Adelie    Adelie 2.7675 18.5336
## 6  Adelie    Adelie 4.7583 18.4725
## 7  Adelie    Adelie 5.4728 14.0944
## 8  Adelie    Adelie 4.6718 22.3493
## 9  Adelie    Adelie 2.3874 17.2087
## 10 Adelie    Adelie 3.4944 10.5452

STEP 7: Evaluasi LDA — Confusion Matrix & APER

cat("========== STEP 7: EVALUASI LDA ==========\n")

## ========== STEP 7: EVALUASI LDA ==========

pred_lda <- predict(lda_model)$class
conf_lda <- table(Actual = df$species, Predicted = pred_lda)

cat("Confusion Matrix (LDA):\n")

## Confusion Matrix (LDA):

print(conf_lda)

##         Predicted
## Actual   Adelie Gentoo Others
##   Adelie    145      0      1
##   Gentoo      0    119      0
##   Others      3      0     65

n_total     <- nrow(df)
n_correct   <- sum(diag(conf_lda))
n_wrong     <- n_total - n_correct
APER_lda    <- n_wrong  / n_total
acc_lda     <- n_correct / n_total

cat(sprintf("\nTotal observasi      : %d\n", n_total))

## 
## Total observasi      : 333

cat(sprintf("Benar diklasifikasi  : %d\n", n_correct))

## Benar diklasifikasi  : 329

cat(sprintf("Salah diklasifikasi  : %d\n", n_wrong))

## Salah diklasifikasi  : 4

cat(sprintf("APER (LDA)           : %.4f (%.2f%%)\n", APER_lda, APER_lda*100))

## APER (LDA)           : 0.0120 (1.20%)

cat(sprintf("Akurasi (LDA)        : %.4f (%.2f%%)\n", acc_lda,  acc_lda*100))

## Akurasi (LDA)        : 0.9880 (98.80%)

cat("\nAPER per kelas (LDA):\n")

## 
## APER per kelas (LDA):

for (sp in levels(grp)) {
  n_sp    <- sum(df$species == sp)
  n_err   <- n_sp - conf_lda[sp, sp]
  aper_sp <- n_err / n_sp
  cat(sprintf("  %-8s : %d salah dari %d (APER = %.4f)\n",
              sp, n_err, n_sp, aper_sp))
}

##   Adelie   : 1 salah dari 146 (APER = 0.0068)
##   Gentoo   : 0 salah dari 119 (APER = 0.0000)
##   Others   : 3 salah dari 68 (APER = 0.0441)

# Discriminant Plot LDA (LD1 vs LD2)
plot_df_lda <- data.frame(
  LD1     = lda_pred$x[, 1],
  LD2     = lda_pred$x[, 2],
  Actual  = df$species,
  Correct = lda_pred$class == df$species
)

# Centroid untuk plot
cent_plot <- as.data.frame(lda_model$means %*% disc_coef)
colnames(cent_plot) <- c("LD1", "LD2")
cent_plot$Actual    <- rownames(lda_model$means)

p_lda <- ggplot(plot_df_lda, aes(x = LD1, y = LD2,
                                  color = Actual, shape = Correct)) +
  geom_point(size = 2, alpha = 0.75) +
  stat_ellipse(aes(group = Actual), type = "norm",
               linetype = "dashed", linewidth = 0.7) +
  geom_point(data = cent_plot, aes(x = LD1, y = LD2),
             size = 6, shape = 18, color = "black", inherit.aes = FALSE) +
  geom_text(data = cent_plot, aes(x = LD1, y = LD2, label = Actual),
            vjust = -1, fontface = "bold", color = "black",
            inherit.aes = FALSE, size = 3.5) +
  scale_color_manual(values = c("Adelie"="#3498db",
                                "Gentoo"="#2ecc71",
                                "Others"="#e74c3c")) +
  scale_shape_manual(values = c("TRUE"=16, "FALSE"=4),
                     labels  = c("TRUE"="Benar", "FALSE"="Salah")) +
  labs(title    = sprintf("LDA: Discriminant Plot (Akurasi = %.2f%%)", acc_lda*100),
       subtitle = "Titik besar hitam = centroid | ✕ = misclassified",
       color = "Species", shape = "Klasifikasi") +
  theme_minimal()

png("plot_lda_discriminant.png", width = 1100, height = 800, res = 110)
print(p_lda)

## Warning: The following aesthetics were dropped during statistical transformation: shape.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

dev.off()

## png 
##   2

img <- readPNG("plot_lda_discriminant.png")
plot.new(); rasterImage(img, 0, 0, 1, 1)

STEP 8: QDA — Sebagai Pembanding (Opsi C)

QDA tidak mengasumsikan homogenitas matriks kovarians antar kelompok, sehingga lebih sesuai ketika Asumsi 2 tidak terpenuhi.

cat("STEP 8: QDA (Quadratic Discriminant Analysis)\n")

## STEP 8: QDA (Quadratic Discriminant Analysis)

qda_model <- qda(species ~ culmen_length + culmen_depth +
                   flipper_length + body_mass,
                 data  = df,
                 prior = as.vector(prior_prob))

cat("Ringkasan Model QDA:\n")

## Ringkasan Model QDA:

print(qda_model)

## Call:
## qda(species ~ culmen_length + culmen_depth + flipper_length + 
##     body_mass, data = df, prior = as.vector(prior_prob))
## 
## Prior probabilities of groups:
##    Adelie    Gentoo    Others 
## 0.4384384 0.3573574 0.2042042 
## 
## Group means:
##        culmen_length culmen_depth flipper_length body_mass
## Adelie      38.82397     18.34726       190.1027  3706.164
## Gentoo      47.56807     14.99664       217.2353  5092.437
## Others      48.83382     18.42059       195.8235  3733.088

# Evaluasi QDA
pred_qda <- predict(qda_model)$class
conf_qda <- table(Actual = df$species, Predicted = pred_qda)

n_correct_qda <- sum(diag(conf_qda))
n_wrong_qda   <- n_total - n_correct_qda
APER_qda      <- n_wrong_qda  / n_total
acc_qda       <- n_correct_qda / n_total

cat("Confusion Matrix (QDA):\n")

## Confusion Matrix (QDA):

print(conf_qda)

##         Predicted
## Actual   Adelie Gentoo Others
##   Adelie    144      0      2
##   Gentoo      0    119      0
##   Others      2      0     66

cat(sprintf("\nTotal observasi : %d\n", n_total))

## 
## Total observasi : 333

cat(sprintf("Benar diklasifikasi : %d\n", n_correct_qda))

## Benar diklasifikasi : 329

cat(sprintf("Salah diklasifikasi : %d\n", n_wrong_qda))

## Salah diklasifikasi : 4

cat(sprintf("APER (QDA) : %.4f (%.2f%%)\n", APER_qda, APER_qda*100))

## APER (QDA) : 0.0120 (1.20%)

cat(sprintf("Akurasi (QDA) : %.4f (%.2f%%)\n", acc_qda,  acc_qda*100))

## Akurasi (QDA) : 0.9880 (98.80%)

cat("\nAPER per kelas (QDA):\n")

## 
## APER per kelas (QDA):

for (sp in levels(grp)) {
  n_sp    <- sum(df$species == sp)
  n_err   <- n_sp - conf_qda[sp, sp]
  aper_sp <- n_err / n_sp
  cat(sprintf("  %-8s : %d salah dari %d (APER = %.4f)\n",
              sp, n_err, n_sp, aper_sp))
}

##   Adelie   : 2 salah dari 146 (APER = 0.0137)
##   Gentoo   : 0 salah dari 119 (APER = 0.0000)
##   Others   : 2 salah dari 68 (APER = 0.0294)

STEP 9: Perbandingan LDA vs QDA

cat("STEP 9: PERBANDINGAN LDA vs QDA\n\n")

## STEP 9: PERBANDINGAN LDA vs QDA

comp_df <- data.frame(
  Metode = c("LDA", "QDA"),
  Akurasi = c(round(acc_lda*100, 2), round(acc_qda*100, 2)),
  APER = c(round(APER_lda*100, 2), round(APER_qda*100, 2)),
  Asumsi_Homogenitas = c("Diperlukan", "Tidak diperlukan"),
  Sesuai_Asumsi = c(ifelse(status_boxm == "TERPENUHI", "Ya", "Tidak"), "Ya")
)
print(comp_df)

##   Metode Akurasi APER Asumsi_Homogenitas Sesuai_Asumsi
## 1    LDA    98.8  1.2         Diperlukan         Tidak
## 2    QDA    98.8  1.2   Tidak diperlukan            Ya

if (acc_qda > acc_lda) {
  selisih <- round((acc_qda - acc_lda)*100, 2)
  cat(sprintf("QDA lebih baik dari LDA (selisih akurasi = %.2f%%)\n", selisih))
  cat("Ini konsisten dengan pelanggaran Asumsi 2 (homogenitas varians).\n")
  cat("-> QDA adalah metode yang lebih tepat untuk dataset penguin ini.\n")
} else if (acc_lda > acc_qda) {
  selisih <- round((acc_lda - acc_qda)*100, 2)
  cat(sprintf("LDA lebih baik dari QDA (selisih akurasi = %.2f%%)\n", selisih))
  cat("LDA tetap robust meski asumsi homogenitas tidak terpenuhi sepenuhnya.\n")
} else {
  cat("LDA dan QDA menghasilkan akurasi yang sama.\n")
  cat("LDA lebih sederhana dan direkomendasikan untuk pelaporan.\n")
}

## LDA dan QDA menghasilkan akurasi yang sama.
## LDA lebih sederhana dan direkomendasikan untuk pelaporan.

# Bar chart perbandingan akurasi
comp_long <- comp_df %>%
  select(Metode, Akurasi, APER) %>%
  pivot_longer(cols = c(Akurasi, APER),
               names_to = "Metrik", values_to = "Nilai")

p_comp <- ggplot(comp_long, aes(x = Metode, y = Nilai, fill = Metode)) +
  geom_col(width = 0.5, alpha = 0.8) +
  geom_text(aes(label = paste0(Nilai, "%")),
            vjust = -0.5, fontface = "bold", size = 4) +
  facet_wrap(~Metrik, scales = "free_y") +
  scale_fill_manual(values = c("LDA"="#3498db", "QDA"="#e67e22")) +
  labs(title    = "Perbandingan LDA vs QDA",
       subtitle = "Akurasi lebih tinggi & APER lebih rendah = lebih baik",
       y = "Nilai (%)", x = "Metode") +
  theme_minimal() +
  theme(legend.position = "none")

png("plot_comparison_lda_qda.png", width = 900, height = 500, res = 110)
print(p_comp)
dev.off()

## png 
##   2

img <- readPNG("plot_comparison_lda_qda.png")
plot.new(); rasterImage(img, 0, 0, 1, 1)

STEP 10: Ringkasan Akhir

cat("RINGKASAN AKHIR ANALISIS DISKRIMINAN\n")

## RINGKASAN AKHIR ANALISIS DISKRIMINAN

cat("Dataset: Palmer Archipelago Penguin\n")

## Dataset: Palmer Archipelago Penguin

cat("-- ASUMSI --\n")

## -- ASUMSI --

cat(sprintf("  [%s] Multivariate Normal    : %s\n",
            ifelse(status_mvn  == "TERPENUHI","v","x"), status_mvn))

##   [x] Multivariate Normal    : TIDAK TERPENUHI

cat(sprintf("  [%s] Homogenitas Kovarians  : %s\n",
            ifelse(status_boxm == "TERPENUHI","v","x"), status_boxm))

##   [x] Homogenitas Kovarians  : TIDAK TERPENUHI

cat("  Catatan: Pelanggaran bersifat biologis, bukan error data.\n\n")

##   Catatan: Pelanggaran bersifat biologis, bukan error data.

cat("-- HASIL MODEL --\n")

## -- HASIL MODEL --

cat(sprintf("  LDA  : Akurasi = %.2f%% | APER = %.2f%%\n",
            acc_lda*100, APER_lda*100))

##   LDA  : Akurasi = 98.80% | APER = 1.20%

cat(sprintf("  QDA  : Akurasi = %.2f%% | APER = %.2f%%\n",
            acc_qda*100, APER_qda*100))

##   QDA  : Akurasi = 98.80% | APER = 1.20%

best <- if (acc_qda >= acc_lda) "QDA" else "LDA"
cat(sprintf("\n  Model terbaik untuk dataset ini: %s\n", best))

## 
##   Model terbaik untuk dataset ini: QDA

cat(sprintf("  Alasan: %s\n",
            if (best == "QDA")
              "Asumsi homogenitas kovarians tidak terpenuhi, QDA lebih sesuai."
            else
              "LDA robust meski asumsi tidak terpenuhi sempurna."))

##   Alasan: Asumsi homogenitas kovarians tidak terpenuhi, QDA lebih sesuai.

Analisis Diskriminan - Palmer Penguin

Abdullah_AlFirdaus_Nuzula (008) | MRaffiFahrezi (100) | Elvira Tiara Suci (213)

2026-04-22