1. Library & Data

library(MASS)        # lda()
library(nnet)        # multinom()
library(tidyverse)   # manipulasi & visualisasi
library(kableExtra)  # tabel rapi
library(MVN)         # uji normalitas multivariat
library(biotools)    # Box's M Test
library(caret)       # confusionMatrix()

df <- read.csv("penguins_size.csv", stringsAsFactors = TRUE)

# Cleaning: pilih variabel relevan, hapus NA
df_clean <- df %>%
  select(species, culmen_length_mm, culmen_depth_mm,
         flipper_length_mm, body_mass_g) %>%
  na.omit()

# Distribusi kelas
table(df_clean$species)

## 
##    Adelie Chinstrap    Gentoo 
##       151        68       123

2. Statistika Deskriptif

df_clean %>%
  group_by(species) %>%
  summarise(
    n             = n(),
    Mean_CulLen   = round(mean(culmen_length_mm), 2),
    SD_CulLen     = round(sd(culmen_length_mm), 2),
    Mean_CulDep   = round(mean(culmen_depth_mm), 2),
    SD_CulDep     = round(sd(culmen_depth_mm), 2),
    Mean_Flip     = round(mean(flipper_length_mm), 2),
    SD_Flip       = round(sd(flipper_length_mm), 2),
    Mean_Mass     = round(mean(body_mass_g), 2),
    SD_Mass       = round(sd(body_mass_g), 2)
  ) %>%
  kable(caption = "Statistika Deskriptif per Spesies",
        col.names = c("Spesies", "n",
                      "Mean", "SD", "Mean", "SD",
                      "Mean", "SD", "Mean", "SD")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
  add_header_above(c(" " = 2,
                     "Culmen Length (mm)" = 2,
                     "Culmen Depth (mm)"  = 2,
                     "Flipper Length (mm)"= 2,
                     "Body Mass (g)"      = 2))

Statistika Deskriptif per Spesies
		Culmen Length (mm)		Culmen Depth (mm)		Flipper Length (mm)		Body Mass (g)
Spesies	n	Mean	SD	Mean	SD	Mean	SD	Mean	SD
Adelie	151	38.79	2.66	18.35	1.22	189.95	6.54	3700.66	458.57
Chinstrap	68	48.83	3.34	18.42	1.14	195.82	7.13	3733.09	384.34
Gentoo	123	47.50	3.08	14.98	0.98	217.19	6.48	5076.02	504.12

# Distribusi spesies
df_clean %>%
  count(species) %>%
  mutate(pct   = round(n / sum(n) * 100, 1),
         label = paste0(species, "\n", n, " (", pct, "%)")) %>%
  ggplot(aes(x = "", y = n, fill = species)) +
  geom_col(width = 1, color = "white") +
  coord_polar(theta = "y") +
  geom_text(aes(label = label), position = position_stack(vjust = 0.5), size = 3.5) +
  scale_fill_manual(values = c("#F4A460", "#6495ED", "#90EE90")) +
  labs(title = "Distribusi Spesies Penguin", fill = "Spesies") +
  theme_void() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 13))

# Boxplot variabel prediktor per spesies
df_clean %>%
  pivot_longer(cols = -species, names_to = "Variabel", values_to = "Nilai") %>%
  ggplot(aes(x = species, y = Nilai, fill = species)) +
  geom_boxplot(alpha = 0.7, outlier.color = "red", outlier.size = 1.5) +
  facet_wrap(~Variabel, scales = "free_y", ncol = 2,
             labeller = as_labeller(c(
               culmen_length_mm  = "Culmen Length (mm)",
               culmen_depth_mm   = "Culmen Depth (mm)",
               flipper_length_mm = "Flipper Length (mm)",
               body_mass_g       = "Body Mass (g)"))) +
  scale_fill_manual(values = c("#F4A460", "#6495ED", "#90EE90")) +
  labs(title = "Distribusi Variabel Prediktor per Spesies",
       x = "Spesies", y = "Nilai", fill = "Spesies") +
  theme_bw() +
  theme(plot.title  = element_text(hjust = 0.5, face = "bold"),
        legend.position = "none")

3. Uji Asumsi

3.1 Normalitas Multivariat (Mardia’s Test)

LDA mengasumsikan data pada setiap kelompok berdistribusi normal multivariat.

prediktor <- df_clean %>%
  select(culmen_length_mm, culmen_depth_mm, flipper_length_mm, body_mass_g)

# Handle perbedaan versi MVN
mvn_args <- names(formals(MVN::mvn))
if ("mvnTest" %in% mvn_args) {
  mvn_result <- mvn(data = prediktor, mvnTest = "mardia")
} else {
  mvn_result <- mvn(data = prediktor)
}

# Chi-Square Q-Q Plot (alternatif visual uji normalitas multivariat)
n  <- nrow(prediktor)
p  <- ncol(prediktor)
mu <- colMeans(prediktor)
S  <- cov(prediktor)
d2 <- mahalanobis(prediktor, center = mu, cov = S)
q_teoritis <- qchisq(ppoints(n), df = p)

qqplot(q_teoritis, sort(d2),
       main = "Chi-Square Q-Q Plot (Normalitas Multivariat)",
       xlab = "Kuantil Chi-Square Teoritis",
       ylab = "Jarak Mahalanobis (Data)",
       pch  = 19, col = "#4472C4", cex = 0.8)
abline(0, 1, col = "red", lwd = 2, lty = 2)
legend("topleft", legend = "Garis Referensi", col = "red", lwd = 2, lty = 2, bty = "n")

Jika titik-titik mendekati garis referensi, asumsi normalitas multivariat terpenuhi.

3.2 Homogenitas Matriks Kovarians (Box’s M Test)

boxm_result <- boxM(
  data     = df_clean[, c("culmen_length_mm", "culmen_depth_mm",
                           "flipper_length_mm", "body_mass_g")],
  grouping = df_clean$species
)
print(boxm_result)

## 
##  Box's M-test for Homogeneity of Covariance Matrices
## 
## data:  df_clean[, c("culmen_length_mm", "culmen_depth_mm", "flipper_length_mm",     "body_mass_g")]
## Chi-Sq (approx.) = 76.795, df = 20, p-value = 1.365e-08

Jika p-value < 0.05 matriks kovarians tidak homogen; LDA tetap cukup robust untuk sampel besar, namun QDA dapat dipertimbangkan sebagai alternatif.

4. Split Data

set.seed(42)
idx   <- createDataPartition(df_clean$species, p = 0.8, list = FALSE)
train <- df_clean[idx, ]
test  <- df_clean[-idx, ]

cat("Training:", nrow(train), "| Testing:", nrow(test))

## Training: 275 | Testing: 67

5. Linear Discriminant Analysis (LDA)

5.1 Pembentukan Fungsi Diskriminan

lda_model <- lda(species ~ culmen_length_mm + culmen_depth_mm +
                            flipper_length_mm + body_mass_g,
                 data  = train,
                 prior = as.vector(prop.table(table(train$species))))
print(lda_model)

## Call:
## lda(species ~ culmen_length_mm + culmen_depth_mm + flipper_length_mm + 
##     body_mass_g, data = train, prior = as.vector(prop.table(table(train$species))))
## 
## Prior probabilities of groups:
##    Adelie Chinstrap    Gentoo 
##      0.44      0.20      0.36 
## 
## Group means:
##           culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g
## Adelie            38.79752        18.39174          190.3471    3704.132
## Chinstrap         48.80909        18.36182          195.4182    3721.364
## Gentoo            47.66061        14.99091          217.9293    5101.263
## 
## Coefficients of linear discriminants:
##                            LD1          LD2
## culmen_length_mm  -0.086609377 -0.419396547
## culmen_depth_mm    1.011226772 -0.005156790
## flipper_length_mm -0.089996132  0.013219110
## body_mass_g       -0.001281395  0.001769146
## 
## Proportion of trace:
##    LD1    LD2 
## 0.8659 0.1341

5.2 Koefisien Fungsi Diskriminan

lda_model$scaling %>%
  as.data.frame() %>%
  rownames_to_column("Variabel") %>%
  kable(caption = "Koefisien Fungsi Diskriminan Linear", digits = 4) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Koefisien Fungsi Diskriminan Linear
Variabel	LD1	LD2
culmen_length_mm	-0.0866	-0.4194
culmen_depth_mm	1.0112	-0.0052
flipper_length_mm	-0.0900	0.0132
body_mass_g	-0.0013	0.0018

Dengan 3 kelas, LDA menghasilkan s = g − 1 = 2 fungsi diskriminan (LD1 dan LD2).

5.3 Proporsi Variansi yang Dijelaskan

prop_var <- lda_model$svd^2 / sum(lda_model$svd^2) * 100

data.frame(
  `Fungsi Diskriminan` = c("LD1", "LD2"),
  Eigenvalue           = round(lda_model$svd^2, 4),
  `Proporsi (%)`       = round(prop_var, 2),
  `Kumulatif (%)`      = round(cumsum(prop_var), 2)
) %>%
  kable(caption = "Proporsi Variansi yang Dijelaskan Setiap Fungsi Diskriminan") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Proporsi Variansi yang Dijelaskan Setiap Fungsi Diskriminan
Fungsi.Diskriminan	Eigenvalue	Proporsi….	Kumulatif….
LD1	2088.3517	86.59	86.59
LD2	323.5356	13.41	100.00

5.4 Visualisasi LDA

lda_pred_train <- predict(lda_model, train)

lda_plot_df <- data.frame(
  LD1     = lda_pred_train$x[, 1],
  LD2     = lda_pred_train$x[, 2],
  species = train$species
)

ggplot(lda_plot_df, aes(x = LD1, y = LD2, color = species, shape = species)) +
  geom_point(size = 2.5, alpha = 0.8) +
  stat_ellipse(aes(fill = species), geom = "polygon",
               alpha = 0.1, level = 0.95) +
  scale_color_manual(values = c("#E07B39", "#4472C4", "#5BAD5B")) +
  scale_fill_manual(values  = c("#E07B39", "#4472C4", "#5BAD5B")) +
  labs(title    = "Plot Fungsi Diskriminan Linear",
       subtitle = "Pemisahan Tiga Spesies Penguin",
       x        = paste0("LD1 (", round(prop_var[1], 1), "%)"),
       y        = paste0("LD2 (", round(prop_var[2], 1), "%)"),
       color = "Spesies", shape = "Spesies", fill = "Spesies") +
  theme_bw() +
  theme(plot.title    = element_text(hjust = 0.5, face = "bold"),
        plot.subtitle = element_text(hjust = 0.5))

5.5 Evaluasi Model LDA

# Prediksi
lda_pred_test  <- predict(lda_model, test)
lda_pred_train_class <- predict(lda_model, train)$class

# Confusion Matrix
cm_lda <- confusionMatrix(lda_pred_test$class, test$species)
print(cm_lda)

## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Adelie Chinstrap Gentoo
##   Adelie        30         1      0
##   Chinstrap      0        12      0
##   Gentoo         0         0     24
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9851          
##                  95% CI : (0.9196, 0.9996)
##     No Information Rate : 0.4478          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9763          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Adelie Class: Chinstrap Class: Gentoo
## Sensitivity                 1.0000           0.9231        1.0000
## Specificity                 0.9730           1.0000        1.0000
## Pos Pred Value              0.9677           1.0000        1.0000
## Neg Pred Value              1.0000           0.9818        1.0000
## Prevalence                  0.4478           0.1940        0.3582
## Detection Rate              0.4478           0.1791        0.3582
## Detection Prevalence        0.4627           0.1791        0.3582
## Balanced Accuracy           0.9865           0.9615        1.0000

# Tabel Confusion Matrix rapi
cm_lda$table %>%
  as.data.frame() %>%
  pivot_wider(names_from = Reference, values_from = Freq) %>%
  rename(Prediksi = Prediction) %>%
  kable(caption = "Confusion Matrix — Linear Discriminant Analysis") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  add_header_above(c(" " = 1, "Aktual" = 3))

Confusion Matrix — Linear Discriminant Analysis
	Aktual
Prediksi	Adelie	Chinstrap	Gentoo
Adelie	30	1	0
Chinstrap	0	12	0
Gentoo	0	0	24

# APER Training & Testing
n_salah_train <- sum(lda_pred_train_class != train$species)
n_salah_test  <- sum(lda_pred_test$class  != test$species)
aper_train    <- n_salah_train / nrow(train)
aper_test     <- n_salah_test  / nrow(test)

data.frame(
  Data        = c("Training", "Testing"),
  `N Total`   = c(nrow(train), nrow(test)),
  `N Salah`   = c(n_salah_train, n_salah_test),
  `APER (%)`  = round(c(aper_train, aper_test) * 100, 2),
  `Akurasi (%)` = round((1 - c(aper_train, aper_test)) * 100, 2)
) %>%
  kable(caption = "APER dan Akurasi Model LDA") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

APER dan Akurasi Model LDA
Data	N.Total	N.Salah	APER….	Akurasi….
Training	275	3	1.09	98.91
Testing	67	1	1.49	98.51

APER yang kecil menunjukkan fungsi diskriminan memiliki kemampuan klasifikasi yang baik.

6. Regresi Logistik Multinomial

6.1 Pembentukan Model

# Kategori referensi = Gentoo (terbanyak kedua, sering dijadikan baseline)
train$species <- relevel(train$species, ref = "Gentoo")
test$species  <- relevel(test$species,  ref = "Gentoo")

multinom_model <- multinom(
  species ~ culmen_length_mm + culmen_depth_mm +
            flipper_length_mm + body_mass_g,
  data  = train,
  trace = FALSE,
  Hess  = TRUE
)
summary(multinom_model)

## Call:
## multinom(formula = species ~ culmen_length_mm + culmen_depth_mm + 
##     flipper_length_mm + body_mass_g, data = train, Hess = TRUE, 
##     trace = FALSE)
## 
## Coefficients:
##           (Intercept) culmen_length_mm culmen_depth_mm flipper_length_mm
## Adelie       28.10100        -19.61744       36.333602           2.41380
## Chinstrap   -21.02514         25.72737        1.335902          -1.96246
##           body_mass_g
## Adelie     -0.0673793
## Chinstrap  -0.1910147
## 
## Std. Errors:
##           (Intercept) culmen_length_mm culmen_depth_mm flipper_length_mm
## Adelie      0.8292441         14.10153        20.24099          5.101465
## Chinstrap   0.9970848         15.28610        35.34625          5.513135
##           body_mass_g
## Adelie      0.1515416
## Chinstrap   0.1491054
## 
## Residual Deviance: 0.0007645983 
## AIC: 20.00076

6.2 Estimasi Parameter

koef <- summary(multinom_model)$coefficients
se   <- summary(multinom_model)$standard.errors

koef %>%
  as.data.frame() %>%
  rownames_to_column("Model") %>%
  kable(caption = "Estimasi Koefisien (β) Model Regresi Logistik Multinomial",
        digits = 4) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE)

Estimasi Koefisien (β) Model Regresi Logistik Multinomial
Model	(Intercept)	culmen_length_mm	culmen_depth_mm	flipper_length_mm	body_mass_g
Adelie	28.1010	-19.6174	36.3336	2.4138	-0.0674
Chinstrap	-21.0251	25.7274	1.3359	-1.9625	-0.1910

Kategori referensi adalah Gentoo. Model Logit 1 membandingkan Adelie vs Gentoo, Model Logit 2 membandingkan Chinstrap vs Gentoo.

6.3 Uji Signifikansi Parameter

6.3.1 Uji Serentak (Likelihood Ratio Test / G²)

multinom_null <- multinom(species ~ 1, data = train, trace = FALSE)
lrt <- anova(multinom_null, multinom_model, test = "Chisq")
print(lrt)

## Likelihood ratio tests of Multinomial Models
## 
## Response: species
##                                                                  Model
## 1                                                                    1
## 2 culmen_length_mm + culmen_depth_mm + flipper_length_mm + body_mass_g
##   Resid. df   Resid. Dev   Test    Df LR stat. Pr(Chi)
## 1       548 5.780024e+02                              
## 2       540 7.645983e-04 1 vs 2     8 578.0016       0

Tolak H₀ jika p-value < 0.05 → minimal satu variabel prediktor berpengaruh signifikan terhadap model.

6.3.2 Uji Parsial (Wald Test)

z_val <- koef / se
p_val <- 2 * (1 - pnorm(abs(z_val)))

# Logit 1: Adelie vs Gentoo
data.frame(
  Variabel = colnames(koef),
  Beta     = round(koef["Adelie", ], 4),
  SE       = round(se["Adelie", ], 4),
  Wald_Z   = round(z_val["Adelie", ], 4),
  P_value  = round(p_val["Adelie", ], 4),
  Sig      = ifelse(p_val["Adelie", ] < 0.001, "***",
             ifelse(p_val["Adelie", ] < 0.01,  "**",
             ifelse(p_val["Adelie", ] < 0.05,  "*", "ns")))
) %>%
  kable(caption = "Uji Parsial (Wald) — Model Logit 1: Adelie vs Gentoo",
        row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  footnote(general = "*** p<0.001  ** p<0.01  * p<0.05  ns = tidak signifikan")

Uji Parsial (Wald) — Model Logit 1: Adelie vs Gentoo
Variabel	Beta	SE	Wald_Z	P_value	Sig
(Intercept)	28.1010	0.8292	33.8875	0.0000	***
culmen_length_mm	-19.6174	14.1015	-1.3912	0.1642	ns
culmen_depth_mm	36.3336	20.2410	1.7951	0.0726	ns
flipper_length_mm	2.4138	5.1015	0.4732	0.6361	ns
body_mass_g	-0.0674	0.1515	-0.4446	0.6566	ns
Note:
* p<0.001 p<0.01 * p<0.05 ns = tidak signifikan

# Logit 2: Chinstrap vs Gentoo
data.frame(
  Variabel = colnames(koef),
  Beta     = round(koef["Chinstrap", ], 4),
  SE       = round(se["Chinstrap", ], 4),
  Wald_Z   = round(z_val["Chinstrap", ], 4),
  P_value  = round(p_val["Chinstrap", ], 4),
  Sig      = ifelse(p_val["Chinstrap", ] < 0.001, "***",
             ifelse(p_val["Chinstrap", ] < 0.01,  "**",
             ifelse(p_val["Chinstrap", ] < 0.05,  "*", "ns")))
) %>%
  kable(caption = "Uji Parsial (Wald) — Model Logit 2: Chinstrap vs Gentoo",
        row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  footnote(general = "*** p<0.001  ** p<0.01  * p<0.05  ns = tidak signifikan")

Uji Parsial (Wald) — Model Logit 2: Chinstrap vs Gentoo
Variabel	Beta	SE	Wald_Z	P_value	Sig
(Intercept)	-21.0251	0.9971	-21.0866	0.0000	***
culmen_length_mm	25.7274	15.2861	1.6831	0.0924	ns
culmen_depth_mm	1.3359	35.3462	0.0378	0.9699	ns
flipper_length_mm	-1.9625	5.5131	-0.3560	0.7219	ns
body_mass_g	-0.1910	0.1491	-1.2811	0.2002	ns
Note:
* p<0.001 p<0.01 * p<0.05 ns = tidak signifikan

6.4 Persamaan Model

b <- round(koef, 4)

cat("**Model Logit 1 (Adelie vs Gentoo):**\n")

## **Model Logit 1 (Adelie vs Gentoo):**

cat(sprintf(
  "ln(P(Adelie)/P(Gentoo)) = %.4f + %.4f*X1 + %.4f*X2 + %.4f*X3 + %.4f*X4\n",
  b["Adelie","(Intercept)"], b["Adelie","culmen_length_mm"],
  b["Adelie","culmen_depth_mm"], b["Adelie","flipper_length_mm"],
  b["Adelie","body_mass_g"]))

## ln(P(Adelie)/P(Gentoo)) = 28.1010 + -19.6174*X1 + 36.3336*X2 + 2.4138*X3 + -0.0674*X4

cat("\n**Model Logit 2 (Chinstrap vs Gentoo):**\n")

## 
## **Model Logit 2 (Chinstrap vs Gentoo):**

cat(sprintf(
  "ln(P(Chinstrap)/P(Gentoo)) = %.4f + %.4f*X1 + %.4f*X2 + %.4f*X3 + %.4f*X4\n",
  b["Chinstrap","(Intercept)"], b["Chinstrap","culmen_length_mm"],
  b["Chinstrap","culmen_depth_mm"], b["Chinstrap","flipper_length_mm"],
  b["Chinstrap","body_mass_g"]))

## ln(P(Chinstrap)/P(Gentoo)) = -21.0251 + 25.7274*X1 + 1.3359*X2 + -1.9625*X3 + -0.1910*X4

cat("\nDi mana: X1=culmen_length, X2=culmen_depth, X3=flipper_length, X4=body_mass\n")

## 
## Di mana: X1=culmen_length, X2=culmen_depth, X3=flipper_length, X4=body_mass

6.5 Odds Ratio

or_adelie   <- round(exp(koef["Adelie", ]), 4)
or_chinstrap <- round(exp(koef["Chinstrap", ]), 4)

data.frame(
  Variabel        = colnames(koef),
  OR_Adelie       = or_adelie,
  OR_Chinstrap    = or_chinstrap
) %>%
  kable(caption = "Odds Ratio — Model Regresi Logistik Multinomial",
        col.names = c("Variabel", "OR (Adelie vs Gentoo)", "OR (Chinstrap vs Gentoo)"),
        row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  footnote(general = "OR > 1: hubungan positif; OR < 1: hubungan negatif terhadap kategori tersebut dibanding Gentoo")

Odds Ratio — Model Regresi Logistik Multinomial
Variabel	OR (Adelie vs Gentoo)	OR (Chinstrap vs Gentoo)
(Intercept)	1.599955e+12	0.000000e+00
culmen_length_mm	0.000000e+00	1.490229e+11
culmen_depth_mm	6.018423e+15	3.803400e+00
flipper_length_mm	1.117630e+01	1.405000e-01
body_mass_g	9.348000e-01	8.261000e-01
Note:
OR > 1: hubungan positif; OR < 1: hubungan negatif terhadap kategori tersebut dibanding Gentoo

6.6 Evaluasi Model Multinomial

multinom_pred <- predict(multinom_model, newdata = test, type = "class")
cm_multi      <- confusionMatrix(multinom_pred, test$species)
print(cm_multi)

## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Gentoo Adelie Chinstrap
##   Gentoo        24      1         0
##   Adelie         0     28         1
##   Chinstrap      0      1        12
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9552          
##                  95% CI : (0.8747, 0.9907)
##     No Information Rate : 0.4478          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9295          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Gentoo Class: Adelie Class: Chinstrap
## Sensitivity                 1.0000        0.9333           0.9231
## Specificity                 0.9767        0.9730           0.9815
## Pos Pred Value              0.9600        0.9655           0.9231
## Neg Pred Value              1.0000        0.9474           0.9815
## Prevalence                  0.3582        0.4478           0.1940
## Detection Rate              0.3582        0.4179           0.1791
## Detection Prevalence        0.3731        0.4328           0.1940
## Balanced Accuracy           0.9884        0.9532           0.9523

# Tabel confusion matrix rapi
cm_multi$table %>%
  as.data.frame() %>%
  pivot_wider(names_from = Reference, values_from = Freq) %>%
  rename(Prediksi = Prediction) %>%
  kable(caption = "Confusion Matrix — Regresi Logistik Multinomial") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  add_header_above(c(" " = 1, "Aktual" = 3))

Confusion Matrix — Regresi Logistik Multinomial
	Aktual
Prediksi	Gentoo	Adelie	Chinstrap
Gentoo	24	1	0
Adelie	0	28	1
Chinstrap	0	1	12

# APER Multinomial
multinom_pred_train <- predict(multinom_model, newdata = train, type = "class")
n_salah_multi_train <- sum(multinom_pred_train != train$species)
n_salah_multi_test  <- sum(multinom_pred       != test$species)
aper_multi_train    <- n_salah_multi_train / nrow(train)
aper_multi_test     <- n_salah_multi_test  / nrow(test)

data.frame(
  Data          = c("Training", "Testing"),
  `N Total`     = c(nrow(train), nrow(test)),
  `N Salah`     = c(n_salah_multi_train, n_salah_multi_test),
  `APER (%)`    = round(c(aper_multi_train, aper_multi_test) * 100, 2),
  `Akurasi (%)` = round((1 - c(aper_multi_train, aper_multi_test)) * 100, 2)
) %>%
  kable(caption = "APER dan Akurasi Model Regresi Logistik Multinomial") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

APER dan Akurasi Model Regresi Logistik Multinomial
Data	N.Total	N.Salah	APER….	Akurasi….
Training	275	0	0.00	100.00
Testing	67	3	4.48	95.52

7. Perbandingan LDA vs Regresi Logistik Multinomial

data.frame(
  Aspek              = c("Metode", "Asumsi Utama", "Variabel Dependen",
                         "APER Testing (%)", "Akurasi Testing (%)", "Kelebihan"),
  LDA                = c("Linear Discriminant Analysis",
                         "Normalitas multivariat & homogenitas kovarians",
                         "Nominal ≥ 2 kategori",
                         paste0(round(aper_test * 100, 2), "%"),
                         paste0(round((1 - aper_test) * 100, 2), "%"),
                         "Interpretasi geometris jelas, visualisasi mudah"),
  Multinomial        = c("Regresi Logistik Multinomial",
                         "Tidak memerlukan normalitas prediktor",
                         "Nominal ≥ 3 kategori",
                         paste0(round(aper_multi_test * 100, 2), "%"),
                         paste0(round((1 - aper_multi_test) * 100, 2), "%"),
                         "Lebih fleksibel, menghasilkan odds ratio")
) %>%
  kable(caption = "Perbandingan LDA dan Regresi Logistik Multinomial",
        col.names = c("Aspek", "LDA", "Multinomial Logistik")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE)

Perbandingan LDA dan Regresi Logistik Multinomial
Aspek	LDA	Multinomial Logistik
Metode	Linear Discriminant Analysis	Regresi Logistik Multinomial
Asumsi Utama	Normalitas multivariat & homogenitas kovarians	Tidak memerlukan normalitas prediktor
Variabel Dependen	Nominal ≥ 2 kategori	Nominal ≥ 3 kategori
APER Testing (%)	1.49%	4.48%
Akurasi Testing (%)	98.51%	95.52%
Kelebihan	Interpretasi geometris jelas, visualisasi mudah	Lebih fleksibel, menghasilkan odds ratio

8. Kesimpulan

cat(sprintf(
"LDA Multinomial:
  - Menghasilkan 2 fungsi diskriminan (LD1: %.2f%%, LD2: %.2f%%)
  - APER Training: %.2f%% | APER Testing: %.2f%%
  - Akurasi Testing: %.2f%%

Regresi Logistik Multinomial:
  - Kategori referensi: Gentoo
  - Menghasilkan 2 persamaan logit (Adelie vs Gentoo, Chinstrap vs Gentoo)
  - APER Training: %.2f%% | APER Testing: %.2f%%
  - Akurasi Testing: %.2f%%
",
prop_var[1], prop_var[2],
aper_train * 100, aper_test * 100, (1 - aper_test) * 100,
aper_multi_train * 100, aper_multi_test * 100, (1 - aper_multi_test) * 100
))

## LDA Multinomial:
##   - Menghasilkan 2 fungsi diskriminan (LD1: 86.59%, LD2: 13.41%)
##   - APER Training: 1.09% | APER Testing: 1.49%
##   - Akurasi Testing: 98.51%
## 
## Regresi Logistik Multinomial:
##   - Kategori referensi: Gentoo
##   - Menghasilkan 2 persamaan logit (Adelie vs Gentoo, Chinstrap vs Gentoo)
##   - APER Training: 0.00% | APER Testing: 4.48%
##   - Akurasi Testing: 95.52%

Modul 4 - LDA Multinomial & Regresi Logistik Multinomial

Kelompok ? lufa

2026-04-30

1. Library & Data

2. Statistika Deskriptif

3. Uji Asumsi

3.1 Normalitas Multivariat (Mardia’s Test)

3.2 Homogenitas Matriks Kovarians (Box’s M Test)

4. Split Data

5. Linear Discriminant Analysis (LDA)

5.1 Pembentukan Fungsi Diskriminan

5.2 Koefisien Fungsi Diskriminan

5.3 Proporsi Variansi yang Dijelaskan

5.4 Visualisasi LDA

5.5 Evaluasi Model LDA

6. Regresi Logistik Multinomial

6.1 Pembentukan Model

6.2 Estimasi Parameter

6.3 Uji Signifikansi Parameter

6.3.1 Uji Serentak (Likelihood Ratio Test / G²)

6.3.2 Uji Parsial (Wald Test)

6.4 Persamaan Model

6.5 Odds Ratio

6.6 Evaluasi Model Multinomial

7. Perbandingan LDA vs Regresi Logistik Multinomial

8. Kesimpulan