1. Library

library(MASS)       # lda()
library(dplyr)      # manipulasi data
library(ggplot2)    # visualisasi
library(caret)      # confusionMatrix()
library(knitr)      # tabel rapi

2. Import Data

data <- read.csv("penguins_size.csv")
head(data)
##   species    island culmen_length_mm culmen_depth_mm flipper_length_mm
## 1  Adelie Torgersen             39.1            18.7               181
## 2  Adelie Torgersen             39.5            17.4               186
## 3  Adelie Torgersen             40.3            18.0               195
## 4  Adelie Torgersen               NA              NA                NA
## 5  Adelie Torgersen             36.7            19.3               193
## 6  Adelie Torgersen             39.3            20.6               190
##   body_mass_g    sex
## 1        3750   MALE
## 2        3800 FEMALE
## 3        3250 FEMALE
## 4          NA   <NA>
## 5        3450 FEMALE
## 6        3650   MALE
str(data)
## 'data.frame':    344 obs. of  7 variables:
##  $ species          : chr  "Adelie" "Adelie" "Adelie" "Adelie" ...
##  $ island           : chr  "Torgersen" "Torgersen" "Torgersen" "Torgersen" ...
##  $ culmen_length_mm : num  39.1 39.5 40.3 NA 36.7 39.3 38.9 39.2 34.1 42 ...
##  $ culmen_depth_mm  : num  18.7 17.4 18 NA 19.3 20.6 17.8 19.6 18.1 20.2 ...
##  $ flipper_length_mm: int  181 186 195 NA 193 190 181 195 193 190 ...
##  $ body_mass_g      : int  3750 3800 3250 NA 3450 3650 3625 4675 3475 4250 ...
##  $ sex              : chr  "MALE" "FEMALE" "FEMALE" NA ...

3. Preprocessing Data

# Hapus baris dengan NA
data <- na.omit(data)

# Hapus baris dengan nilai "." pada sex (data tidak valid)
data <- data[data$sex %in% c("MALE", "FEMALE"), ]

# Pastikan variabel dependen bertipe factor (nominal, tanpa urutan)
data$species <- as.factor(data$species)
data$sex     <- as.factor(data$sex)
data$island  <- as.factor(data$island)

# Cek kategori species (3 kelas = multinomial)
levels(data$species)
## [1] "Adelie"    "Chinstrap" "Gentoo"
table(data$species)
## 
##    Adelie Chinstrap    Gentoo 
##       146        68       119

Catatan: Variabel dependen (species) memiliki 3 kategori (Adelie, Chinstrap, Gentoo) yang bersifat nominal (tidak ada urutan), sehingga digunakan LDA Multinomial yang menghasilkan lebih dari satu fungsi diskriminan.


4. Statistika Deskriptif

summary(data[, c("culmen_length_mm", "culmen_depth_mm",
                 "flipper_length_mm", "body_mass_g")])
##  culmen_length_mm culmen_depth_mm flipper_length_mm  body_mass_g  
##  Min.   :32.10    Min.   :13.10   Min.   :172       Min.   :2700  
##  1st Qu.:39.50    1st Qu.:15.60   1st Qu.:190       1st Qu.:3550  
##  Median :44.50    Median :17.30   Median :197       Median :4050  
##  Mean   :43.99    Mean   :17.16   Mean   :201       Mean   :4207  
##  3rd Qu.:48.60    3rd Qu.:18.70   3rd Qu.:213       3rd Qu.:4775  
##  Max.   :59.60    Max.   :21.50   Max.   :231       Max.   :6300
# Distribusi variabel respon
ggplot(data, aes(x = species, fill = species)) +
  geom_bar() +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5) +
  labs(title = "Distribusi Spesies Penguin",
       x = "Species", y = "Frekuensi") +
  theme_minimal() +
  theme(legend.position = "none")

# Scatter plot antar variabel prediktor
ggplot(data, aes(x = culmen_length_mm, y = culmen_depth_mm, color = species)) +
  geom_point(alpha = 0.7) +
  labs(title = "Scatter Plot: Culmen Length vs Culmen Depth",
       x = "Culmen Length (mm)", y = "Culmen Depth (mm)") +
  theme_minimal()

ggplot(data, aes(x = flipper_length_mm, y = body_mass_g, color = species)) +
  geom_point(alpha = 0.7) +
  labs(title = "Scatter Plot: Flipper Length vs Body Mass",
       x = "Flipper Length (mm)", y = "Body Mass (g)") +
  theme_minimal()


5. Uji Asumsi LDA

5.1 Uji Normalitas Multivariat (Mardia’s Test)

library(MVN)

mvn_result <- mvn(data = data[, c("culmen_length_mm", "culmen_depth_mm",
                                   "flipper_length_mm", "body_mass_g")])
mvn_result$multivariateNormality
## NULL
mvn_result$univariateNormality
## NULL

Interpretasi: Jika p-value > 0.05 maka asumsi normalitas multivariat terpenuhi.

5.2 Uji Kehomogenan Matriks Varians-Kovarians (Box’s M Test)

library(heplots)

boxM_result <- boxM(data[, c("culmen_length_mm", "culmen_depth_mm",
                              "flipper_length_mm", "body_mass_g")],
                    data$species)
boxM_result
## 
##  Box's M-test for Homogeneity of Covariance Matrices 
## 
## data:  data[, c("culmen_length_mm", "culmen_depth_mm", "flipper_length_mm",  by data$species     "body_mass_g")] by data$species 
## Chi-Sq (approx.) = 74.7315, df = 20, p-value = 3.02e-08

Interpretasi: Jika p-value > 0.05 maka asumsi kehomogenan matriks kovarians antar kelompok terpenuhi (cocok untuk LDA). Jika tidak terpenuhi, dapat digunakan QDA (Quadratic Discriminant Analysis).


6. Pembagian Data Training dan Testing

set.seed(123)
trainIndex <- createDataPartition(data$species, p = 0.8, list = FALSE)
data_train <- data[trainIndex, ]
data_test  <- data[-trainIndex, ]

cat("Jumlah data training:", nrow(data_train), "\n")
## Jumlah data training: 268
cat("Jumlah data testing:", nrow(data_test), "\n")
## Jumlah data testing: 65

7. Membangun Model LDA

# Model LDA dengan variabel prediktor numerik
model_lda <- lda(species ~ culmen_length_mm + culmen_depth_mm +
                           flipper_length_mm + body_mass_g,
                 data = data_train,
                 prior = c(1, 1, 1) / 3)  # prior probabilitas sama

model_lda
## Call:
## lda(species ~ culmen_length_mm + culmen_depth_mm + flipper_length_mm + 
##     body_mass_g, data = data_train, prior = c(1, 1, 1)/3)
## 
## Prior probabilities of groups:
##    Adelie Chinstrap    Gentoo 
## 0.3333333 0.3333333 0.3333333 
## 
## Group means:
##           culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g
## Adelie            38.87265        18.37607          190.3504    3730.769
## Chinstrap         48.94727        18.48727          195.8909    3745.909
## Gentoo            47.51562        14.95417          217.0417    5071.094
## 
## Coefficients of linear discriminants:
##                            LD1          LD2
## culmen_length_mm  -0.050253247 -0.453993983
## culmen_depth_mm    1.042369273  0.118411607
## flipper_length_mm -0.081565599  0.020235448
## body_mass_g       -0.001466212  0.001453271
## 
## Proportion of trace:
##    LD1    LD2 
## 0.8018 0.1982

8. Hasil Model LDA

8.1 Prior Probability

cat("Prior Probability:\n")
## Prior Probability:
print(model_lda$prior)
##    Adelie Chinstrap    Gentoo 
## 0.3333333 0.3333333 0.3333333

8.2 Group Means (Rata-rata per Kelompok)

cat("Group Means:\n")
## Group Means:
kable(model_lda$means, digits = 3, caption = "Rata-rata Variabel per Kelompok")
Rata-rata Variabel per Kelompok
culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g
Adelie 38.873 18.376 190.350 3730.769
Chinstrap 48.947 18.487 195.891 3745.909
Gentoo 47.516 14.954 217.042 5071.094

8.3 Koefisien Fungsi Diskriminan

cat("Koefisien Fungsi Diskriminan Linear:\n")
## Koefisien Fungsi Diskriminan Linear:
kable(model_lda$scaling, digits = 4,
      caption = "Koefisien Fungsi Diskriminan (LD1 dan LD2)")
Koefisien Fungsi Diskriminan (LD1 dan LD2)
LD1 LD2
culmen_length_mm -0.0503 -0.4540
culmen_depth_mm 1.0424 0.1184
flipper_length_mm -0.0816 0.0202
body_mass_g -0.0015 0.0015

Interpretasi: Dengan 3 kelas, LDA menghasilkan s = g - 1 = 2 fungsi diskriminan (LD1 dan LD2).

8.4 Proporsi Trace (Variansi yang Dijelaskan)

prop_trace <- model_lda$svd^2 / sum(model_lda$svd^2)
cat("Proporsi variansi yang dijelaskan tiap fungsi diskriminan:\n")
## Proporsi variansi yang dijelaskan tiap fungsi diskriminan:
cat("LD1:", round(prop_trace[1] * 100, 2), "%\n")
## LD1: 80.18 %
cat("LD2:", round(prop_trace[2] * 100, 2), "%\n")
## LD2: 19.82 %

9. Visualisasi Fungsi Diskriminan

# Hitung skor diskriminan untuk data training
lda_scores_train <- predict(model_lda, data_train)$x
lda_plot_df <- data.frame(lda_scores_train,
                           species = data_train$species)

ggplot(lda_plot_df, aes(x = LD1, y = LD2, color = species, shape = species)) +
  geom_point(size = 2.5, alpha = 0.8) +
  stat_ellipse(level = 0.95, linetype = "dashed") +
  labs(title = "Plot Fungsi Diskriminan LDA",
       subtitle = "Proyeksi data training pada ruang diskriminan",
       x = paste0("LD1 (", round(prop_trace[1] * 100, 1), "%)"),
       y = paste0("LD2 (", round(prop_trace[2] * 100, 1), "%)")) +
  theme_minimal()


10. Prediksi dan Evaluasi Model

10.1 Prediksi pada Data Testing

pred_lda <- predict(model_lda, data_test)

# Hasil prediksi
hasil_pred <- data.frame(
  Aktual    = data_test$species,
  Prediksi  = pred_lda$class,
  Posterior = round(pred_lda$posterior, 4)
)
head(hasil_pred, 10)
##    Aktual Prediksi Posterior.Adelie Posterior.Chinstrap Posterior.Gentoo
## 1  Adelie   Adelie           1.0000              0.0000                0
## 2  Adelie   Adelie           0.9996              0.0004                0
## 6  Adelie   Adelie           1.0000              0.0000                0
## 16 Adelie   Adelie           1.0000              0.0000                0
## 23 Adelie   Adelie           1.0000              0.0000                0
## 24 Adelie   Adelie           1.0000              0.0000                0
## 33 Adelie   Adelie           0.9970              0.0030                0
## 38 Adelie   Adelie           0.9157              0.0843                0
## 41 Adelie   Adelie           1.0000              0.0000                0
## 51 Adelie   Adelie           0.9984              0.0016                0

10.2 Confusion Matrix

cm <- confusionMatrix(pred_lda$class, data_test$species)
cm
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Adelie Chinstrap Gentoo
##   Adelie        28         1      0
##   Chinstrap      1        12      0
##   Gentoo         0         0     23
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9692          
##                  95% CI : (0.8932, 0.9963)
##     No Information Rate : 0.4462          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9516          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Adelie Class: Chinstrap Class: Gentoo
## Sensitivity                 0.9655           0.9231        1.0000
## Specificity                 0.9722           0.9808        1.0000
## Pos Pred Value              0.9655           0.9231        1.0000
## Neg Pred Value              0.9722           0.9808        1.0000
## Prevalence                  0.4462           0.2000        0.3538
## Detection Rate              0.4308           0.1846        0.3538
## Detection Prevalence        0.4462           0.2000        0.3538
## Balanced Accuracy           0.9689           0.9519        1.0000
# Visualisasi confusion matrix
cm_table <- as.data.frame(cm$table)
names(cm_table) <- c("Prediksi", "Aktual", "Freq")

ggplot(cm_table, aes(x = Aktual, y = Prediksi, fill = Freq)) +
  geom_tile(color = "white") +
  geom_text(aes(label = Freq), size = 5, fontface = "bold") +
  scale_fill_gradient(low = "white", high = "#2196F3") +
  labs(title = "Confusion Matrix - LDA Multinomial",
       x = "Kelas Aktual", y = "Kelas Prediksi") +
  theme_minimal()

10.3 Apparent Error Rate (APER)

# APER = proporsi observasi yang salah diklasifikasikan
n_salah <- sum(pred_lda$class != data_test$species)
n_total <- nrow(data_test)
APER <- n_salah / n_total

cat("Jumlah observasi testing     :", n_total, "\n")
## Jumlah observasi testing     : 65
cat("Jumlah salah klasifikasi     :", n_salah, "\n")
## Jumlah salah klasifikasi     : 2
cat("APER (Apparent Error Rate)   :", round(APER * 100, 2), "%\n")
## APER (Apparent Error Rate)   : 3.08 %
cat("Akurasi Klasifikasi          :", round((1 - APER) * 100, 2), "%\n")
## Akurasi Klasifikasi          : 96.92 %

Interpretasi: APER sebesar 3.08% artinya terdapat 3.08% observasi pada data testing yang salah diklasifikasikan oleh model LDA.


11. Kesimpulan

cat("=== RINGKASAN HASIL LDA MULTINOMIAL ===\n\n")
## === RINGKASAN HASIL LDA MULTINOMIAL ===
cat("Variabel Dependen  : species (3 kelas: Adelie, Chinstrap, Gentoo)\n")
## Variabel Dependen  : species (3 kelas: Adelie, Chinstrap, Gentoo)
cat("Variabel Prediktor : culmen_length_mm, culmen_depth_mm,\n")
## Variabel Prediktor : culmen_length_mm, culmen_depth_mm,
cat("                     flipper_length_mm, body_mass_g\n\n")
##                      flipper_length_mm, body_mass_g
cat("Jumlah Fungsi Diskriminan : 2 (LD1 dan LD2)\n")
## Jumlah Fungsi Diskriminan : 2 (LD1 dan LD2)
cat("Proporsi variansi LD1     :", round(prop_trace[1] * 100, 2), "%\n")
## Proporsi variansi LD1     : 80.18 %
cat("Proporsi variansi LD2     :", round(prop_trace[2] * 100, 2), "%\n\n")
## Proporsi variansi LD2     : 19.82 %
cat("Akurasi Model (data test) :", round((1 - APER) * 100, 2), "%\n")
## Akurasi Model (data test) : 96.92 %
cat("APER                      :", round(APER * 100, 2), "%\n")
## APER                      : 3.08 %