library(MASS)
library(ggplot2)
library(dplyr)
library(tidyr)
library(MVN)
library(biotools)
library(gridExtra)
library(GGally)
library(png)
df_raw <- read.csv("Palmer Archipelago (Antarctica) Penguin Data/penguins_lter.csv",
stringsAsFactors = FALSE)
cat("Dimensi data asli:", dim(df_raw), "\n")
## Dimensi data asli: 344 17
# Rename kolom
df_raw <- df_raw %>%
rename(
species_raw = Species,
island_raw = Island,
culmen_length = Culmen.Length..mm.,
culmen_depth = Culmen.Depth..mm.,
flipper_length = Flipper.Length..mm.,
body_mass = Body.Mass..g.,
sex = Sex
)
# Recode Species -> 3 label
# Adelie -> "Adelie" | Gentoo -> "Gentoo" | Chinstrap -> "Others"
# Recode Island -> 3 label
# Biscoe -> "Biscoe" | Dream -> "Dream" | Torgersen -> "Others"
df_raw <- df_raw %>%
mutate(
species = case_when(
grepl("Adelie", species_raw, ignore.case = TRUE) ~ "Adelie",
grepl("Gentoo", species_raw, ignore.case = TRUE) ~ "Gentoo",
TRUE ~ "Others"
),
island = case_when(
island_raw == "Biscoe" ~ "Biscoe",
island_raw == "Dream" ~ "Dream",
TRUE ~ "Others"
)
)
# Bersihkan NA, filter sex valid, buat factor
df <- df_raw %>%
select(species, island, culmen_length, culmen_depth,
flipper_length, body_mass, sex) %>%
filter(
!is.na(culmen_length), !is.na(culmen_depth),
!is.na(flipper_length), !is.na(body_mass),
sex %in% c("MALE", "FEMALE")
) %>%
mutate(
species = factor(species, levels = c("Adelie", "Gentoo", "Others")),
island = factor(island, levels = c("Biscoe", "Dream", "Others")),
sex = factor(sex, levels = c("FEMALE", "MALE"))
)
# LDA skala asli (bukan log-transform)
vars_x <- c("culmen_length", "culmen_depth", "flipper_length", "body_mass")
cat("Data siap:", nrow(df), "baris\n")
## Data siap: 333 baris
cat("Distribusi species:\n")
## Distribusi species:
print(table(df$species))
##
## Adelie Gentoo Others
## 146 119 68
cat("Ringkasan statistik per species:\n")
## Ringkasan statistik per species:
df %>%
group_by(species) %>%
summarise(across(all_of(vars_x),
list(mean = ~ round(mean(.),2),
sd = ~ round(sd(.),2)),
.names = "{.col}_{.fn}")) %>%
print()
## # A tibble: 3 × 9
## species culmen_length_mean culmen_length_sd culmen_depth_mean culmen_depth_sd
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 Adelie 38.8 2.66 18.4 1.22
## 2 Gentoo 47.6 3.11 15 0.99
## 3 Others 48.8 3.34 18.4 1.14
## # ℹ 4 more variables: flipper_length_mean <dbl>, flipper_length_sd <dbl>,
## # body_mass_mean <dbl>, body_mass_sd <dbl>
# Boxplot per variabel per species
df_long <- df %>%
select(species, all_of(vars_x)) %>%
pivot_longer(cols = all_of(vars_x), names_to = "var", values_to = "val")
p_box <- ggplot(df_long, aes(x = species, y = val, fill = species)) +
geom_boxplot(alpha = 0.7, outlier.color = "red", outlier.shape = 16) +
facet_wrap(~var, scales = "free_y", ncol = 2) +
scale_fill_manual(values = c("Adelie"="#3498db",
"Gentoo"="#2ecc71",
"Others"="#e74c3c")) +
labs(title = "EDA: Distribusi Variabel Numerik per Species",
x = "Species", y = "Nilai") +
theme_minimal() + theme(legend.position = "bottom")
png("plot_eda_boxplot.png", width = 1200, height = 900, res = 110)
print(p_box)
dev.off()
## png
## 2
img <- readPNG("plot_eda_boxplot.png")
plot.new(); rasterImage(img, 0, 0, 1, 1)
# Scatterplot matrix
png("plot_eda_scatter.png", width = 1100, height = 1000, res = 100)
print(
ggpairs(df, columns = vars_x,
aes(color = species, alpha = 0.5),
upper = list(continuous = "cor"),
lower = list(continuous = "points"),
diag = list(continuous = "densityDiag")) +
scale_color_manual(values = c("Adelie"="#3498db",
"Gentoo"="#2ecc71",
"Others"="#e74c3c")) +
scale_fill_manual(values = c("Adelie"="#3498db",
"Gentoo"="#2ecc71",
"Others"="#e74c3c")) +
labs(title = "EDA: Scatterplot Matrix") +
theme_minimal()
)
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's colour values.
dev.off()
## png
## 2
img <- readPNG("plot_eda_scatter.png")
plot.new(); rasterImage(img, 0, 0, 1, 1)
Analisis Diskriminan Linear (LDA) memiliki dua asumsi utama:
cat("========== ASUMSI 1: MULTIVARIATE NORMAL ==========\n")
## ========== ASUMSI 1: MULTIVARIATE NORMAL ==========
cat("Uji: Shapiro-Wilk per variabel per kelompok (univariat)\n")
## Uji: Shapiro-Wilk per variabel per kelompok (univariat)
cat("H0 : Data normal\n")
## H0 : Data normal
cat("Keputusan: Tolak H0 jika p-value < 0.05\n\n")
## Keputusan: Tolak H0 jika p-value < 0.05
mvn_status_list <- c()
for (sp in levels(df$species)) {
sub <- df %>% filter(species == sp) %>% select(all_of(vars_x))
shapiro_pvals <- sapply(sub, function(x) shapiro.test(x)$p.value)
ok <- all(shapiro_pvals > 0.05)
mvn_status_list[sp] <- ok
cat(sprintf("Species %-8s :", sp))
cat(sprintf(" [%s]\n", ifelse(ok, "NORMAL OK", "Tidak Normal")))
for (v in names(shapiro_pvals)) {
flag <- ifelse(shapiro_pvals[v] < 0.05, " <-- p < 0.05", "")
cat(sprintf(" %-16s p = %.4f%s\n", v, shapiro_pvals[v], flag))
}
cat("\n")
}
## Species Adelie : [Tidak Normal]
## culmen_length p = 0.6848
## culmen_depth p = 0.1047
## flipper_length p = 0.7427
## body_mass p = 0.0423 <-- p < 0.05
##
## Species Gentoo : [Tidak Normal]
## culmen_length p = 0.0199 <-- p < 0.05
## culmen_depth p = 0.0438 <-- p < 0.05
## flipper_length p = 0.0018 <-- p < 0.05
## body_mass p = 0.2605
##
## Species Others : [NORMAL OK]
## culmen_length p = 0.1941
## culmen_depth p = 0.1418
## flipper_length p = 0.8106
## body_mass p = 0.5605
status_mvn <- ifelse(all(mvn_status_list), "TERPENUHI", "TIDAK TERPENUHI")
cat(sprintf(">> STATUS ASUMSI 1 (MVN) : %s\n", status_mvn))
## >> STATUS ASUMSI 1 (MVN) : TIDAK TERPENUHI
Asumsi normalitas tidak sepenuhnya terpenuhi, terutama pada Gentoo.Ini adalah fenomena biologis: Gentoo memiliki distribusi ukuran tubuh yang sedikit skewed karena variasi antar individu yang lebih besar. Selain itu, Shapiro-Wilk sangat sensitif pada n >= 50, sehingga penyimpangan kecil yang tidak bermakna secara praktis pun terdeteksi. LDA tetap robust untuk pelanggaran ringan ini (Hair et al., 2010). Sebagai mitigasi: QDA akan dijalankan sebagai pembanding.
cat("========== ASUMSI 2: HOMOGENITAS VARIANS ==========\n")
## ========== ASUMSI 2: HOMOGENITAS VARIANS ==========
cat("Uji: Box's M Test\n")
## Uji: Box's M Test
cat("H0 : Matriks kovarians antar kelompok homogen\n")
## H0 : Matriks kovarians antar kelompok homogen
cat("Keputusan: Tolak H0 jika p-value < 0.001\n")
## Keputusan: Tolak H0 jika p-value < 0.001
cat("(Threshold 0.001 karena Box's M sangat sensitif pada n besar)\n\n")
## (Threshold 0.001 karena Box's M sangat sensitif pada n besar)
boxm_res <- boxM(df[, vars_x], df$species)
print(boxm_res)
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: df[, vars_x]
## Chi-Sq (approx.) = 74.731, df = 20, p-value = 3.02e-08
status_boxm <- ifelse(boxm_res$p.value > 0.001, "TERPENUHI", "TIDAK TERPENUHI")
cat(sprintf("\n>> STATUS ASUMSI 2 (Box's M) : %s\n", status_boxm))
##
## >> STATUS ASUMSI 2 (Box's M) : TIDAK TERPENUHI
Matriks kovarians tidak homogen karena Gentoo jauh lebih besar dari Adelie dan Chinstrap, sehingga variansi antar kelompok berbeda secara struktural. Ini sifat biologis dataset, bukan error.Konsekuensi: LDA kurang optimal -> QDA lebih sesuai.Kedua metode tetap dijalankan untuk perbandingan
cat("RINGKASAN ASUMSI ANALISIS DISKRIMINAN\n")
## RINGKASAN ASUMSI ANALISIS DISKRIMINAN
asumsi_names <- c("Multivariate Normal", "Homogenitas Kovarians")
asumsi_statuses <- c(status_mvn, status_boxm)
for (i in seq_along(asumsi_names)) {
tanda <- if (asumsi_statuses[i] == "TERPENUHI") "[v]" else "[x]"
cat(sprintf(" %s %-25s : %s\n",
tanda, asumsi_names[i], asumsi_statuses[i]))
}
## [x] Multivariate Normal : TIDAK TERPENUHI
## [x] Homogenitas Kovarians : TIDAK TERPENUHI
semua_ok <- all(asumsi_statuses == "TERPENUHI")
if (semua_ok) {
cat(" KESIMPULAN: SEMUA ASUMSI TERPENUHI, LDA optimal\n")
} else {
cat(" KESIMPULAN: ADA ASUMSI TIDAK TERPENUHI\n")
}
## KESIMPULAN: ADA ASUMSI TIDAK TERPENUHI
cat("STEP 4: BAYES CLASSIFICATION RULE\n")
## STEP 4: BAYES CLASSIFICATION RULE
# Prior probability = proporsi tiap kelas di data
prior_prob <- table(df$species) / nrow(df)
cat("Prior Probability per Species:\n")
## Prior Probability per Species:
print(round(prior_prob, 4))
##
## Adelie Gentoo Others
## 0.4384 0.3574 0.2042
cat("\nInterpretasi: Tanpa informasi prediktor, probabilitas awal tiap\n")
##
## Interpretasi: Tanpa informasi prediktor, probabilitas awal tiap
# Fit LDA
lda_model <- lda(species ~ culmen_length + culmen_depth +
flipper_length + body_mass,
data = df,
prior = as.vector(prior_prob))
cat("Ringkasan Model LDA:\n")
## Ringkasan Model LDA:
print(lda_model)
## Call:
## lda(species ~ culmen_length + culmen_depth + flipper_length +
## body_mass, data = df, prior = as.vector(prior_prob))
##
## Prior probabilities of groups:
## Adelie Gentoo Others
## 0.4384384 0.3573574 0.2042042
##
## Group means:
## culmen_length culmen_depth flipper_length body_mass
## Adelie 38.82397 18.34726 190.1027 3706.164
## Gentoo 47.56807 14.99664 217.2353 5092.437
## Others 48.83382 18.42059 195.8235 3733.088
##
## Coefficients of linear discriminants:
## LD1 LD2
## culmen_length -0.085926709 0.41660160
## culmen_depth 1.041646762 0.01042272
## flipper_length -0.084552842 -0.01424552
## body_mass -0.001347375 -0.00168559
##
## Proportion of trace:
## LD1 LD2
## 0.8655 0.1345
# Posterior probability
lda_pred <- predict(lda_model)
posterior_df <- as.data.frame(lda_pred$posterior) %>%
mutate(across(everything(), ~ round(.x, 4))) %>%
mutate(Predicted = lda_pred$class,
Actual = df$species)
cat("Contoh 10 baris pertama Posterior Probability:\n")
## Contoh 10 baris pertama Posterior Probability:
print(head(posterior_df, 10))
## Adelie Gentoo Others Predicted Actual
## 1 1.0000 0 0.0000 Adelie Adelie
## 2 0.9997 0 0.0003 Adelie Adelie
## 3 0.9840 0 0.0160 Adelie Adelie
## 4 1.0000 0 0.0000 Adelie Adelie
## 5 1.0000 0 0.0000 Adelie Adelie
## 6 0.9999 0 0.0001 Adelie Adelie
## 7 1.0000 0 0.0000 Adelie Adelie
## 8 0.9298 0 0.0702 Adelie Adelie
## 9 1.0000 0 0.0000 Adelie Adelie
## 10 1.0000 0 0.0000 Adelie Adelie
cat("STEP 5: LINEAR DISCRIMINANT FUNCTION\n")
## STEP 5: LINEAR DISCRIMINANT FUNCTION
X <- as.matrix(df[, vars_x])
grp <- df$species
k <- nlevels(grp)
p <- ncol(X)
n <- nrow(X)
# Grand mean x̄
x_bar <- colMeans(X)
cat("Grand Mean (x̄):\n")
## Grand Mean (x̄):
print(round(x_bar, 4))
## culmen_length culmen_depth flipper_length body_mass
## 43.9928 17.1649 200.9670 4207.0571
# Between-groups matrix
B <- matrix(0, p, p)
for (sp in levels(grp)) {
xi_bar <- colMeans(X[grp == sp, ])
ni <- sum(grp == sp)
d <- matrix(xi_bar - x_bar, ncol = 1)
B <- B + ni * (d %*% t(d))
}
rownames(B) <- colnames(B) <- vars_x
cat("Between-Groups Matrix (B):\n")
## Between-Groups Matrix (B):
print(round(B, 4))
## culmen_length culmen_depth flipper_length body_mass
## culmen_length 7015.386 -1401.4088 13426.980 598663.4
## culmen_depth -1401.409 870.7852 -6512.219 -355385.5
## flipper_length 13426.980 -6512.2188 50525.884 2674311.3
## body_mass 598663.355 -355385.5494 2674311.270 145190219.1
# Within-groups matrix W
W <- matrix(0, p, p)
for (sp in levels(grp)) {
Xi <- X[grp == sp, ]
xi_bar <- colMeans(Xi)
for (j in 1:nrow(Xi)) {
d <- matrix(Xi[j, ] - xi_bar, ncol = 1)
W <- W + (d %*% t(d))
}
}
rownames(W) <- colnames(W) <- vars_x
cat("Within-Groups Matrix (W):\n")
## Within-Groups Matrix (W):
print(round(W, 4))
## culmen_length culmen_depth flipper_length body_mass
## culmen_length 2913.5170 583.9945 3192.340 263083.6
## culmen_depth 583.9945 416.6737 1217.732 106898.1
## flipper_length 3192.3404 1217.7323 14692.753 596616.4
## body_mass 263083.5822 106898.1170 596616.358 70069446.8
# W^-1 B dan Eigendecomposition
W_inv <- solve(W)
WinvB <- W_inv %*% B
eig <- eigen(WinvB)
eig_val <- Re(eig$values)
eig_vec <- Re(eig$vectors)
cat("Eigenvalues (urut dari terbesar):\n")
## Eigenvalues (urut dari terbesar):
print(round(eig_val, 6))
## [1] 15.030390 2.336239 0.000000 0.000000
# Proporsi varians yang dijelaskan tiap LD
prop_var <- eig_val / sum(eig_val[eig_val > 1e-10])
cat("\nProporsi varians per fungsi diskriminan:\n")
##
## Proporsi varians per fungsi diskriminan:
for (i in 1:min(k-1, p)) {
cat(sprintf(" LD%d : %.4f (%.2f%%)\n", i, eig_val[i], prop_var[i]*100))
}
## LD1 : 15.0304 (86.55%)
## LD2 : 2.3362 (13.45%)
# Ambil r = min(k-1, p) = min(2, 4) = 2 fungsi diskriminan
r <- min(k-1, p)
disc_coef <- eig_vec[, 1:r, drop = FALSE]
rownames(disc_coef) <- vars_x
colnames(disc_coef) <- paste0("LD", 1:r)
# Normalisasi tanda: LD1 positif ke arah Gentoo (terbesar)
for (col in 1:ncol(disc_coef)) {
cg <- colMeans(X[grp == "Gentoo", ]) %*% disc_coef[, col, drop = FALSE]
ca <- colMeans(X[grp == "Adelie", ]) %*% disc_coef[, col, drop = FALSE]
if (cg < ca) disc_coef[, col] <- -disc_coef[, col]
}
cat(sprintf("Jumlah fungsi diskriminan: r = min(%d-1, %d) = %d\n", k, p, r))
## Jumlah fungsi diskriminan: r = min(3-1, 4) = 2
cat("\nKoefisien Fungsi Diskriminan (eigenvectors W^-1 B):\n")
##
## Koefisien Fungsi Diskriminan (eigenvectors W^-1 B):
print(round(disc_coef, 4))
## LD1 LD2
## culmen_length 0.0819 0.9991
## culmen_depth -0.9934 0.0250
## flipper_length 0.0806 -0.0342
## body_mass 0.0013 -0.0040
# Discriminant scores tiap observasi: Y = X * disc_coef
Y <- X %*% disc_coef
cat("Contoh Discriminant Scores (10 obs pertama):\n")
## Contoh Discriminant Scores (10 obs pertama):
print(round(head(Y, 10), 4))
## LD1 LD2
## [1,] 4.0413 18.1895
## [2,] 5.8328 18.1837
## [3,] 5.3214 20.9138
## [4,] 3.8307 16.6094
## [5,] 2.7675 18.5336
## [6,] 4.7583 18.4725
## [7,] 5.4728 14.0944
## [8,] 4.6718 22.3493
## [9,] 2.3874 17.2087
## [10,] 3.4944 10.5452
# Cutting score = rata-rata dua centroid
cat("STEP 6: CUTTING SCORE & KLASIFIKASI\n")
## STEP 6: CUTTING SCORE & KLASIFIKASI
# Centroid tiap kelas di ruang diskriminan
centroids <- matrix(0, nrow = k, ncol = r)
rownames(centroids) <- levels(grp)
colnames(centroids) <- paste0("LD", 1:r)
for (i in seq_along(levels(grp))) {
sp <- levels(grp)[i]
xi_bar <- colMeans(X[grp == sp, ])
centroids[i, ] <- xi_bar %*% disc_coef
}
cat("Centroid tiap species di ruang diskriminan:\n")
## Centroid tiap species di ruang diskriminan:
print(round(centroids, 4))
## LD1 LD2
## Adelie 5.0467 17.7711
## Gentoo 13.0607 19.8927
## Others 6.2900 27.4695
# Cutting score = rata-rata dua centroid (Slide 18)
m_AG <- 0.5 * (centroids["Adelie", "LD1"] + centroids["Gentoo", "LD1"])
m_AO <- 0.5 * (centroids["Adelie", "LD1"] + centroids["Others", "LD1"])
cat(sprintf("\nCutting Score Adelie-Gentoo (LD1) : %.4f\n", m_AG))
##
## Cutting Score Adelie-Gentoo (LD1) : 9.0537
cat(sprintf("Cutting Score Adelie-Others (LD1) : %.4f\n", m_AO))
## Cutting Score Adelie-Others (LD1) : 5.6683
Interpretasi: Observasi diklasifikasikan ke kelas dengan centroid terdekat di ruang diskriminan (jarak Euclidean)
# Klasifikasi manual: jarak Euclidean ke centroid
classify_manual <- function(y_scores, centroids) {
dists <- apply(centroids, 1, function(c) sqrt(sum((y_scores - c)^2)))
names(which.min(dists))
}
pred_manual <- apply(Y, 1, classify_manual, centroids = centroids)
pred_manual <- factor(pred_manual, levels = levels(grp))
cat("Hasil Klasifikasi Manual (10 obs pertama):\n")
## Hasil Klasifikasi Manual (10 obs pertama):
print(data.frame(
Actual = df$species[1:10],
Predicted = pred_manual[1:10],
LD1 = round(Y[1:10, 1], 4),
LD2 = round(Y[1:10, 2], 4)
))
## Actual Predicted LD1 LD2
## 1 Adelie Adelie 4.0413 18.1895
## 2 Adelie Adelie 5.8328 18.1837
## 3 Adelie Adelie 5.3214 20.9138
## 4 Adelie Adelie 3.8307 16.6094
## 5 Adelie Adelie 2.7675 18.5336
## 6 Adelie Adelie 4.7583 18.4725
## 7 Adelie Adelie 5.4728 14.0944
## 8 Adelie Adelie 4.6718 22.3493
## 9 Adelie Adelie 2.3874 17.2087
## 10 Adelie Adelie 3.4944 10.5452
cat("========== STEP 7: EVALUASI LDA ==========\n")
## ========== STEP 7: EVALUASI LDA ==========
pred_lda <- predict(lda_model)$class
conf_lda <- table(Actual = df$species, Predicted = pred_lda)
cat("Confusion Matrix (LDA):\n")
## Confusion Matrix (LDA):
print(conf_lda)
## Predicted
## Actual Adelie Gentoo Others
## Adelie 145 0 1
## Gentoo 0 119 0
## Others 3 0 65
n_total <- nrow(df)
n_correct <- sum(diag(conf_lda))
n_wrong <- n_total - n_correct
APER_lda <- n_wrong / n_total
acc_lda <- n_correct / n_total
cat(sprintf("\nTotal observasi : %d\n", n_total))
##
## Total observasi : 333
cat(sprintf("Benar diklasifikasi : %d\n", n_correct))
## Benar diklasifikasi : 329
cat(sprintf("Salah diklasifikasi : %d\n", n_wrong))
## Salah diklasifikasi : 4
cat(sprintf("APER (LDA) : %.4f (%.2f%%)\n", APER_lda, APER_lda*100))
## APER (LDA) : 0.0120 (1.20%)
cat(sprintf("Akurasi (LDA) : %.4f (%.2f%%)\n", acc_lda, acc_lda*100))
## Akurasi (LDA) : 0.9880 (98.80%)
cat("\nAPER per kelas (LDA):\n")
##
## APER per kelas (LDA):
for (sp in levels(grp)) {
n_sp <- sum(df$species == sp)
n_err <- n_sp - conf_lda[sp, sp]
aper_sp <- n_err / n_sp
cat(sprintf(" %-8s : %d salah dari %d (APER = %.4f)\n",
sp, n_err, n_sp, aper_sp))
}
## Adelie : 1 salah dari 146 (APER = 0.0068)
## Gentoo : 0 salah dari 119 (APER = 0.0000)
## Others : 3 salah dari 68 (APER = 0.0441)
# Discriminant Plot LDA (LD1 vs LD2)
plot_df_lda <- data.frame(
LD1 = lda_pred$x[, 1],
LD2 = lda_pred$x[, 2],
Actual = df$species,
Correct = lda_pred$class == df$species
)
# Centroid untuk plot
cent_plot <- as.data.frame(lda_model$means %*% disc_coef)
colnames(cent_plot) <- c("LD1", "LD2")
cent_plot$Actual <- rownames(lda_model$means)
p_lda <- ggplot(plot_df_lda, aes(x = LD1, y = LD2,
color = Actual, shape = Correct)) +
geom_point(size = 2, alpha = 0.75) +
stat_ellipse(aes(group = Actual), type = "norm",
linetype = "dashed", linewidth = 0.7) +
geom_point(data = cent_plot, aes(x = LD1, y = LD2),
size = 6, shape = 18, color = "black", inherit.aes = FALSE) +
geom_text(data = cent_plot, aes(x = LD1, y = LD2, label = Actual),
vjust = -1, fontface = "bold", color = "black",
inherit.aes = FALSE, size = 3.5) +
scale_color_manual(values = c("Adelie"="#3498db",
"Gentoo"="#2ecc71",
"Others"="#e74c3c")) +
scale_shape_manual(values = c("TRUE"=16, "FALSE"=4),
labels = c("TRUE"="Benar", "FALSE"="Salah")) +
labs(title = sprintf("LDA: Discriminant Plot (Akurasi = %.2f%%)", acc_lda*100),
subtitle = "Titik besar hitam = centroid | ✕ = misclassified",
color = "Species", shape = "Klasifikasi") +
theme_minimal()
png("plot_lda_discriminant.png", width = 1100, height = 800, res = 110)
print(p_lda)
## Warning: The following aesthetics were dropped during statistical transformation: shape.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
dev.off()
## png
## 2
img <- readPNG("plot_lda_discriminant.png")
plot.new(); rasterImage(img, 0, 0, 1, 1)
QDA tidak mengasumsikan homogenitas matriks kovarians antar kelompok, sehingga lebih sesuai ketika Asumsi 2 tidak terpenuhi.
cat("STEP 8: QDA (Quadratic Discriminant Analysis)\n")
## STEP 8: QDA (Quadratic Discriminant Analysis)
qda_model <- qda(species ~ culmen_length + culmen_depth +
flipper_length + body_mass,
data = df,
prior = as.vector(prior_prob))
cat("Ringkasan Model QDA:\n")
## Ringkasan Model QDA:
print(qda_model)
## Call:
## qda(species ~ culmen_length + culmen_depth + flipper_length +
## body_mass, data = df, prior = as.vector(prior_prob))
##
## Prior probabilities of groups:
## Adelie Gentoo Others
## 0.4384384 0.3573574 0.2042042
##
## Group means:
## culmen_length culmen_depth flipper_length body_mass
## Adelie 38.82397 18.34726 190.1027 3706.164
## Gentoo 47.56807 14.99664 217.2353 5092.437
## Others 48.83382 18.42059 195.8235 3733.088
# Evaluasi QDA
pred_qda <- predict(qda_model)$class
conf_qda <- table(Actual = df$species, Predicted = pred_qda)
n_correct_qda <- sum(diag(conf_qda))
n_wrong_qda <- n_total - n_correct_qda
APER_qda <- n_wrong_qda / n_total
acc_qda <- n_correct_qda / n_total
cat("Confusion Matrix (QDA):\n")
## Confusion Matrix (QDA):
print(conf_qda)
## Predicted
## Actual Adelie Gentoo Others
## Adelie 144 0 2
## Gentoo 0 119 0
## Others 2 0 66
cat(sprintf("\nTotal observasi : %d\n", n_total))
##
## Total observasi : 333
cat(sprintf("Benar diklasifikasi : %d\n", n_correct_qda))
## Benar diklasifikasi : 329
cat(sprintf("Salah diklasifikasi : %d\n", n_wrong_qda))
## Salah diklasifikasi : 4
cat(sprintf("APER (QDA) : %.4f (%.2f%%)\n", APER_qda, APER_qda*100))
## APER (QDA) : 0.0120 (1.20%)
cat(sprintf("Akurasi (QDA) : %.4f (%.2f%%)\n", acc_qda, acc_qda*100))
## Akurasi (QDA) : 0.9880 (98.80%)
cat("\nAPER per kelas (QDA):\n")
##
## APER per kelas (QDA):
for (sp in levels(grp)) {
n_sp <- sum(df$species == sp)
n_err <- n_sp - conf_qda[sp, sp]
aper_sp <- n_err / n_sp
cat(sprintf(" %-8s : %d salah dari %d (APER = %.4f)\n",
sp, n_err, n_sp, aper_sp))
}
## Adelie : 2 salah dari 146 (APER = 0.0137)
## Gentoo : 0 salah dari 119 (APER = 0.0000)
## Others : 2 salah dari 68 (APER = 0.0294)
cat("STEP 9: PERBANDINGAN LDA vs QDA\n\n")
## STEP 9: PERBANDINGAN LDA vs QDA
comp_df <- data.frame(
Metode = c("LDA", "QDA"),
Akurasi = c(round(acc_lda*100, 2), round(acc_qda*100, 2)),
APER = c(round(APER_lda*100, 2), round(APER_qda*100, 2)),
Asumsi_Homogenitas = c("Diperlukan", "Tidak diperlukan"),
Sesuai_Asumsi = c(ifelse(status_boxm == "TERPENUHI", "Ya", "Tidak"), "Ya")
)
print(comp_df)
## Metode Akurasi APER Asumsi_Homogenitas Sesuai_Asumsi
## 1 LDA 98.8 1.2 Diperlukan Tidak
## 2 QDA 98.8 1.2 Tidak diperlukan Ya
if (acc_qda > acc_lda) {
selisih <- round((acc_qda - acc_lda)*100, 2)
cat(sprintf("QDA lebih baik dari LDA (selisih akurasi = %.2f%%)\n", selisih))
cat("Ini konsisten dengan pelanggaran Asumsi 2 (homogenitas varians).\n")
cat("-> QDA adalah metode yang lebih tepat untuk dataset penguin ini.\n")
} else if (acc_lda > acc_qda) {
selisih <- round((acc_lda - acc_qda)*100, 2)
cat(sprintf("LDA lebih baik dari QDA (selisih akurasi = %.2f%%)\n", selisih))
cat("LDA tetap robust meski asumsi homogenitas tidak terpenuhi sepenuhnya.\n")
} else {
cat("LDA dan QDA menghasilkan akurasi yang sama.\n")
cat("LDA lebih sederhana dan direkomendasikan untuk pelaporan.\n")
}
## LDA dan QDA menghasilkan akurasi yang sama.
## LDA lebih sederhana dan direkomendasikan untuk pelaporan.
# Bar chart perbandingan akurasi
comp_long <- comp_df %>%
select(Metode, Akurasi, APER) %>%
pivot_longer(cols = c(Akurasi, APER),
names_to = "Metrik", values_to = "Nilai")
p_comp <- ggplot(comp_long, aes(x = Metode, y = Nilai, fill = Metode)) +
geom_col(width = 0.5, alpha = 0.8) +
geom_text(aes(label = paste0(Nilai, "%")),
vjust = -0.5, fontface = "bold", size = 4) +
facet_wrap(~Metrik, scales = "free_y") +
scale_fill_manual(values = c("LDA"="#3498db", "QDA"="#e67e22")) +
labs(title = "Perbandingan LDA vs QDA",
subtitle = "Akurasi lebih tinggi & APER lebih rendah = lebih baik",
y = "Nilai (%)", x = "Metode") +
theme_minimal() +
theme(legend.position = "none")
png("plot_comparison_lda_qda.png", width = 900, height = 500, res = 110)
print(p_comp)
dev.off()
## png
## 2
img <- readPNG("plot_comparison_lda_qda.png")
plot.new(); rasterImage(img, 0, 0, 1, 1)
cat("RINGKASAN AKHIR ANALISIS DISKRIMINAN\n")
## RINGKASAN AKHIR ANALISIS DISKRIMINAN
cat("Dataset: Palmer Archipelago Penguin\n")
## Dataset: Palmer Archipelago Penguin
cat("-- ASUMSI --\n")
## -- ASUMSI --
cat(sprintf(" [%s] Multivariate Normal : %s\n",
ifelse(status_mvn == "TERPENUHI","v","x"), status_mvn))
## [x] Multivariate Normal : TIDAK TERPENUHI
cat(sprintf(" [%s] Homogenitas Kovarians : %s\n",
ifelse(status_boxm == "TERPENUHI","v","x"), status_boxm))
## [x] Homogenitas Kovarians : TIDAK TERPENUHI
cat(" Catatan: Pelanggaran bersifat biologis, bukan error data.\n\n")
## Catatan: Pelanggaran bersifat biologis, bukan error data.
cat("-- HASIL MODEL --\n")
## -- HASIL MODEL --
cat(sprintf(" LDA : Akurasi = %.2f%% | APER = %.2f%%\n",
acc_lda*100, APER_lda*100))
## LDA : Akurasi = 98.80% | APER = 1.20%
cat(sprintf(" QDA : Akurasi = %.2f%% | APER = %.2f%%\n",
acc_qda*100, APER_qda*100))
## QDA : Akurasi = 98.80% | APER = 1.20%
best <- if (acc_qda >= acc_lda) "QDA" else "LDA"
cat(sprintf("\n Model terbaik untuk dataset ini: %s\n", best))
##
## Model terbaik untuk dataset ini: QDA
cat(sprintf(" Alasan: %s\n",
if (best == "QDA")
"Asumsi homogenitas kovarians tidak terpenuhi, QDA lebih sesuai."
else
"LDA robust meski asumsi tidak terpenuhi sempurna."))
## Alasan: Asumsi homogenitas kovarians tidak terpenuhi, QDA lebih sesuai.