library(MASS) # lda()
library(dplyr) # manipulasi data
library(ggplot2) # visualisasi
library(caret) # confusionMatrix()
library(knitr) # tabel rapi
data <- read.csv("penguins_size.csv")
head(data)
## species island culmen_length_mm culmen_depth_mm flipper_length_mm
## 1 Adelie Torgersen 39.1 18.7 181
## 2 Adelie Torgersen 39.5 17.4 186
## 3 Adelie Torgersen 40.3 18.0 195
## 4 Adelie Torgersen NA NA NA
## 5 Adelie Torgersen 36.7 19.3 193
## 6 Adelie Torgersen 39.3 20.6 190
## body_mass_g sex
## 1 3750 MALE
## 2 3800 FEMALE
## 3 3250 FEMALE
## 4 NA <NA>
## 5 3450 FEMALE
## 6 3650 MALE
str(data)
## 'data.frame': 344 obs. of 7 variables:
## $ species : chr "Adelie" "Adelie" "Adelie" "Adelie" ...
## $ island : chr "Torgersen" "Torgersen" "Torgersen" "Torgersen" ...
## $ culmen_length_mm : num 39.1 39.5 40.3 NA 36.7 39.3 38.9 39.2 34.1 42 ...
## $ culmen_depth_mm : num 18.7 17.4 18 NA 19.3 20.6 17.8 19.6 18.1 20.2 ...
## $ flipper_length_mm: int 181 186 195 NA 193 190 181 195 193 190 ...
## $ body_mass_g : int 3750 3800 3250 NA 3450 3650 3625 4675 3475 4250 ...
## $ sex : chr "MALE" "FEMALE" "FEMALE" NA ...
# Hapus baris dengan NA
data <- na.omit(data)
# Hapus baris dengan nilai "." pada sex (data tidak valid)
data <- data[data$sex %in% c("MALE", "FEMALE"), ]
# Pastikan variabel dependen bertipe factor (nominal, tanpa urutan)
data$species <- as.factor(data$species)
data$sex <- as.factor(data$sex)
data$island <- as.factor(data$island)
# Cek kategori species (3 kelas = multinomial)
levels(data$species)
## [1] "Adelie" "Chinstrap" "Gentoo"
table(data$species)
##
## Adelie Chinstrap Gentoo
## 146 68 119
Catatan: Variabel dependen (
species) memiliki 3 kategori (Adelie, Chinstrap, Gentoo) yang bersifat nominal (tidak ada urutan), sehingga digunakan LDA Multinomial yang menghasilkan lebih dari satu fungsi diskriminan.
summary(data[, c("culmen_length_mm", "culmen_depth_mm",
"flipper_length_mm", "body_mass_g")])
## culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g
## Min. :32.10 Min. :13.10 Min. :172 Min. :2700
## 1st Qu.:39.50 1st Qu.:15.60 1st Qu.:190 1st Qu.:3550
## Median :44.50 Median :17.30 Median :197 Median :4050
## Mean :43.99 Mean :17.16 Mean :201 Mean :4207
## 3rd Qu.:48.60 3rd Qu.:18.70 3rd Qu.:213 3rd Qu.:4775
## Max. :59.60 Max. :21.50 Max. :231 Max. :6300
# Distribusi variabel respon
ggplot(data, aes(x = species, fill = species)) +
geom_bar() +
geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5) +
labs(title = "Distribusi Spesies Penguin",
x = "Species", y = "Frekuensi") +
theme_minimal() +
theme(legend.position = "none")
# Scatter plot antar variabel prediktor
ggplot(data, aes(x = culmen_length_mm, y = culmen_depth_mm, color = species)) +
geom_point(alpha = 0.7) +
labs(title = "Scatter Plot: Culmen Length vs Culmen Depth",
x = "Culmen Length (mm)", y = "Culmen Depth (mm)") +
theme_minimal()
ggplot(data, aes(x = flipper_length_mm, y = body_mass_g, color = species)) +
geom_point(alpha = 0.7) +
labs(title = "Scatter Plot: Flipper Length vs Body Mass",
x = "Flipper Length (mm)", y = "Body Mass (g)") +
theme_minimal()
library(MVN)
mvn_result <- mvn(data = data[, c("culmen_length_mm", "culmen_depth_mm",
"flipper_length_mm", "body_mass_g")])
mvn_result$multivariateNormality
## NULL
mvn_result$univariateNormality
## NULL
Interpretasi: Jika p-value > 0.05 maka asumsi normalitas multivariat terpenuhi.
library(heplots)
boxM_result <- boxM(data[, c("culmen_length_mm", "culmen_depth_mm",
"flipper_length_mm", "body_mass_g")],
data$species)
boxM_result
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: data[, c("culmen_length_mm", "culmen_depth_mm", "flipper_length_mm", by data$species "body_mass_g")] by data$species
## Chi-Sq (approx.) = 74.7315, df = 20, p-value = 3.02e-08
Interpretasi: Jika p-value > 0.05 maka asumsi kehomogenan matriks kovarians antar kelompok terpenuhi (cocok untuk LDA). Jika tidak terpenuhi, dapat digunakan QDA (Quadratic Discriminant Analysis).
set.seed(123)
trainIndex <- createDataPartition(data$species, p = 0.8, list = FALSE)
data_train <- data[trainIndex, ]
data_test <- data[-trainIndex, ]
cat("Jumlah data training:", nrow(data_train), "\n")
## Jumlah data training: 268
cat("Jumlah data testing:", nrow(data_test), "\n")
## Jumlah data testing: 65
# Model LDA dengan variabel prediktor numerik
model_lda <- lda(species ~ culmen_length_mm + culmen_depth_mm +
flipper_length_mm + body_mass_g,
data = data_train,
prior = c(1, 1, 1) / 3) # prior probabilitas sama
model_lda
## Call:
## lda(species ~ culmen_length_mm + culmen_depth_mm + flipper_length_mm +
## body_mass_g, data = data_train, prior = c(1, 1, 1)/3)
##
## Prior probabilities of groups:
## Adelie Chinstrap Gentoo
## 0.3333333 0.3333333 0.3333333
##
## Group means:
## culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g
## Adelie 38.87265 18.37607 190.3504 3730.769
## Chinstrap 48.94727 18.48727 195.8909 3745.909
## Gentoo 47.51562 14.95417 217.0417 5071.094
##
## Coefficients of linear discriminants:
## LD1 LD2
## culmen_length_mm -0.050253247 -0.453993983
## culmen_depth_mm 1.042369273 0.118411607
## flipper_length_mm -0.081565599 0.020235448
## body_mass_g -0.001466212 0.001453271
##
## Proportion of trace:
## LD1 LD2
## 0.8018 0.1982
cat("Prior Probability:\n")
## Prior Probability:
print(model_lda$prior)
## Adelie Chinstrap Gentoo
## 0.3333333 0.3333333 0.3333333
cat("Group Means:\n")
## Group Means:
kable(model_lda$means, digits = 3, caption = "Rata-rata Variabel per Kelompok")
| culmen_length_mm | culmen_depth_mm | flipper_length_mm | body_mass_g | |
|---|---|---|---|---|
| Adelie | 38.873 | 18.376 | 190.350 | 3730.769 |
| Chinstrap | 48.947 | 18.487 | 195.891 | 3745.909 |
| Gentoo | 47.516 | 14.954 | 217.042 | 5071.094 |
cat("Koefisien Fungsi Diskriminan Linear:\n")
## Koefisien Fungsi Diskriminan Linear:
kable(model_lda$scaling, digits = 4,
caption = "Koefisien Fungsi Diskriminan (LD1 dan LD2)")
| LD1 | LD2 | |
|---|---|---|
| culmen_length_mm | -0.0503 | -0.4540 |
| culmen_depth_mm | 1.0424 | 0.1184 |
| flipper_length_mm | -0.0816 | 0.0202 |
| body_mass_g | -0.0015 | 0.0015 |
Interpretasi: Dengan 3 kelas, LDA menghasilkan s = g - 1 = 2 fungsi diskriminan (LD1 dan LD2).
prop_trace <- model_lda$svd^2 / sum(model_lda$svd^2)
cat("Proporsi variansi yang dijelaskan tiap fungsi diskriminan:\n")
## Proporsi variansi yang dijelaskan tiap fungsi diskriminan:
cat("LD1:", round(prop_trace[1] * 100, 2), "%\n")
## LD1: 80.18 %
cat("LD2:", round(prop_trace[2] * 100, 2), "%\n")
## LD2: 19.82 %
# Hitung skor diskriminan untuk data training
lda_scores_train <- predict(model_lda, data_train)$x
lda_plot_df <- data.frame(lda_scores_train,
species = data_train$species)
ggplot(lda_plot_df, aes(x = LD1, y = LD2, color = species, shape = species)) +
geom_point(size = 2.5, alpha = 0.8) +
stat_ellipse(level = 0.95, linetype = "dashed") +
labs(title = "Plot Fungsi Diskriminan LDA",
subtitle = "Proyeksi data training pada ruang diskriminan",
x = paste0("LD1 (", round(prop_trace[1] * 100, 1), "%)"),
y = paste0("LD2 (", round(prop_trace[2] * 100, 1), "%)")) +
theme_minimal()
pred_lda <- predict(model_lda, data_test)
# Hasil prediksi
hasil_pred <- data.frame(
Aktual = data_test$species,
Prediksi = pred_lda$class,
Posterior = round(pred_lda$posterior, 4)
)
head(hasil_pred, 10)
## Aktual Prediksi Posterior.Adelie Posterior.Chinstrap Posterior.Gentoo
## 1 Adelie Adelie 1.0000 0.0000 0
## 2 Adelie Adelie 0.9996 0.0004 0
## 6 Adelie Adelie 1.0000 0.0000 0
## 16 Adelie Adelie 1.0000 0.0000 0
## 23 Adelie Adelie 1.0000 0.0000 0
## 24 Adelie Adelie 1.0000 0.0000 0
## 33 Adelie Adelie 0.9970 0.0030 0
## 38 Adelie Adelie 0.9157 0.0843 0
## 41 Adelie Adelie 1.0000 0.0000 0
## 51 Adelie Adelie 0.9984 0.0016 0
cm <- confusionMatrix(pred_lda$class, data_test$species)
cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction Adelie Chinstrap Gentoo
## Adelie 28 1 0
## Chinstrap 1 12 0
## Gentoo 0 0 23
##
## Overall Statistics
##
## Accuracy : 0.9692
## 95% CI : (0.8932, 0.9963)
## No Information Rate : 0.4462
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9516
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Adelie Class: Chinstrap Class: Gentoo
## Sensitivity 0.9655 0.9231 1.0000
## Specificity 0.9722 0.9808 1.0000
## Pos Pred Value 0.9655 0.9231 1.0000
## Neg Pred Value 0.9722 0.9808 1.0000
## Prevalence 0.4462 0.2000 0.3538
## Detection Rate 0.4308 0.1846 0.3538
## Detection Prevalence 0.4462 0.2000 0.3538
## Balanced Accuracy 0.9689 0.9519 1.0000
# Visualisasi confusion matrix
cm_table <- as.data.frame(cm$table)
names(cm_table) <- c("Prediksi", "Aktual", "Freq")
ggplot(cm_table, aes(x = Aktual, y = Prediksi, fill = Freq)) +
geom_tile(color = "white") +
geom_text(aes(label = Freq), size = 5, fontface = "bold") +
scale_fill_gradient(low = "white", high = "#2196F3") +
labs(title = "Confusion Matrix - LDA Multinomial",
x = "Kelas Aktual", y = "Kelas Prediksi") +
theme_minimal()
# APER = proporsi observasi yang salah diklasifikasikan
n_salah <- sum(pred_lda$class != data_test$species)
n_total <- nrow(data_test)
APER <- n_salah / n_total
cat("Jumlah observasi testing :", n_total, "\n")
## Jumlah observasi testing : 65
cat("Jumlah salah klasifikasi :", n_salah, "\n")
## Jumlah salah klasifikasi : 2
cat("APER (Apparent Error Rate) :", round(APER * 100, 2), "%\n")
## APER (Apparent Error Rate) : 3.08 %
cat("Akurasi Klasifikasi :", round((1 - APER) * 100, 2), "%\n")
## Akurasi Klasifikasi : 96.92 %
Interpretasi: APER sebesar 3.08% artinya terdapat 3.08% observasi pada data testing yang salah diklasifikasikan oleh model LDA.
cat("=== RINGKASAN HASIL LDA MULTINOMIAL ===\n\n")
## === RINGKASAN HASIL LDA MULTINOMIAL ===
cat("Variabel Dependen : species (3 kelas: Adelie, Chinstrap, Gentoo)\n")
## Variabel Dependen : species (3 kelas: Adelie, Chinstrap, Gentoo)
cat("Variabel Prediktor : culmen_length_mm, culmen_depth_mm,\n")
## Variabel Prediktor : culmen_length_mm, culmen_depth_mm,
cat(" flipper_length_mm, body_mass_g\n\n")
## flipper_length_mm, body_mass_g
cat("Jumlah Fungsi Diskriminan : 2 (LD1 dan LD2)\n")
## Jumlah Fungsi Diskriminan : 2 (LD1 dan LD2)
cat("Proporsi variansi LD1 :", round(prop_trace[1] * 100, 2), "%\n")
## Proporsi variansi LD1 : 80.18 %
cat("Proporsi variansi LD2 :", round(prop_trace[2] * 100, 2), "%\n\n")
## Proporsi variansi LD2 : 19.82 %
cat("Akurasi Model (data test) :", round((1 - APER) * 100, 2), "%\n")
## Akurasi Model (data test) : 96.92 %
cat("APER :", round(APER * 100, 2), "%\n")
## APER : 3.08 %