library(readxl)    
## Warning: package 'readxl' was built under R version 4.5.3
library(dplyr)        
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)      
## Warning: package 'ggplot2' was built under R version 4.5.3
library(tidyr)        
library(scales) 
## Warning: package 'scales' was built under R version 4.5.3
library(MASS)         
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(nnet)         
library(cluster)      
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.5.3
## Welcome to factoextra!
## Want to learn more? See two factoextra-related books at https://www.datanovia.com/en/product/practical-guide-to-principal-component-methods-in-r/
library(MVN)         
## Warning: package 'MVN' was built under R version 4.5.3
## Registered S3 method overwritten by 'lme4':
##   method           from
##   na.action.merMod car
library(biotools)     
## Warning: package 'biotools' was built under R version 4.5.3
## ---
## biotools version 4.3
library(car)          
## Warning: package 'car' was built under R version 4.5.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.5.3
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.3
## corrplot 0.95 loaded
library(caret)
## Warning: package 'caret' was built under R version 4.5.3
## Loading required package: lattice
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.5.3
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
df_raw <- read_excel("Dataset_Pengeluaran.xlsx")

# Simpan ID
ID_rumah_tangga <- df_raw[[2]]

df <- df_raw[, 3:ncol(df_raw)]

cat("Jumlah Baris :", nrow(df), "\n")
## Jumlah Baris : 5019
cat("Jumlah Kolom :", ncol(df), "\n")
## Jumlah Kolom : 7
cat("Nama Kolom   :\n")
## Nama Kolom   :
print(names(df))
## [1] "Pengeluaran non Makanan"       "Total Pengeluaran Rumahtangga"
## [3] "Pengeluaran perkapita"         "Nilai Kalori Perkapita"       
## [5] "Nilai Protein Perkapita"       "Nilai Lemak Perkapita"        
## [7] "Nilai Karbohidrat Perkapita"

Eksplorasi Awal

str(df)
## tibble [5,019 × 7] (S3: tbl_df/tbl/data.frame)
##  $ Pengeluaran non Makanan      : num [1:5019] 4797917 3440333 1396583 3771700 2421750 ...
##  $ Total Pengeluaran Rumahtangga: num [1:5019] 9042060 6076048 3123726 10194229 4098579 ...
##  $ Pengeluaran perkapita        : num [1:5019] 2260515 2025349 1041242 1699038 683096 ...
##  $ Nilai Kalori Perkapita       : num [1:5019] 3893 2095 2186 2463 1742 ...
##  $ Nilai Protein Perkapita      : num [1:5019] 100.3 83.5 83.1 83.1 48.7 ...
##  $ Nilai Lemak Perkapita        : num [1:5019] 83.5 84.5 30.8 57.4 26 ...
##  $ Nilai Karbohidrat Perkapita  : num [1:5019] 625 254 330 359 286 ...
summary(df)
##  Pengeluaran non Makanan Total Pengeluaran Rumahtangga Pengeluaran perkapita
##  Min.   :   87583        Min.   :  324080              Min.   :  162040     
##  1st Qu.:  985375        1st Qu.: 2488527              1st Qu.:  689877     
##  Median : 1578167        Median : 3702060              Median : 1097552     
##  Mean   : 2434316        Mean   : 4739760              Mean   : 1412501     
##  3rd Qu.: 2729292        3rd Qu.: 5705784              3rd Qu.: 1724557     
##  Max.   :69011667        Max.   :73649336              Max.   :24071032     
##  Nilai Kalori Perkapita Nilai Protein Perkapita Nilai Lemak Perkapita
##  Min.   :1000           Min.   : 22.58          Min.   :  7.445      
##  1st Qu.:1786           1st Qu.: 52.28          1st Qu.: 26.451      
##  Median :2195           Median : 65.94          Median : 36.302      
##  Mean   :2305           Mean   : 71.00          Mean   : 41.826      
##  3rd Qu.:2701           3rd Qu.: 83.62          3rd Qu.: 51.149      
##  Max.   :4490           Max.   :264.19          Max.   :260.872      
##  Nilai Karbohidrat Perkapita
##  Min.   : 92.7              
##  1st Qu.:279.7              
##  Median :344.2              
##  Mean   :361.4              
##  3rd Qu.:427.1              
##  Max.   :830.5
head(df, 6)
## # A tibble: 6 × 7
##   `Pengeluaran non Makanan` Total Pengeluaran Rumahtang…¹ Pengeluaran perkapit…²
##                       <dbl>                         <dbl>                  <dbl>
## 1                  4797917.                      9042060.               2260515.
## 2                  3440333.                      6076048.               2025349.
## 3                  1396583.                      3123726.               1041242.
## 4                  3771700                      10194229.               1699038.
## 5                  2421750                       4098579.                683096.
## 6                  1730617.                      2927532.                731883.
## # ℹ abbreviated names: ¹​`Total Pengeluaran Rumahtangga`,
## #   ²​`Pengeluaran perkapita`
## # ℹ 4 more variables: `Nilai Kalori Perkapita` <dbl>,
## #   `Nilai Protein Perkapita` <dbl>, `Nilai Lemak Perkapita` <dbl>,
## #   `Nilai Karbohidrat Perkapita` <dbl>

Preprocessing Data

Cek Missing Values

cat("=== CEK MISSING VALUES ===\n")
## === CEK MISSING VALUES ===
missing_summary <- data.frame(
  Variabel      = colnames(df),
  Jumlah_NA     = colSums(is.na(df)),
  Persentase_NA = round(colMeans(is.na(df)) * 100, 2)
)

kable(missing_summary, row.names = FALSE,
      caption = "Ringkasan Missing Values per Variabel") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Ringkasan Missing Values per Variabel
Variabel Jumlah_NA Persentase_NA
Pengeluaran non Makanan 0 0
Total Pengeluaran Rumahtangga 0 0
Pengeluaran perkapita 0 0
Nilai Kalori Perkapita 0 0
Nilai Protein Perkapita 0 0
Nilai Lemak Perkapita 0 0
Nilai Karbohidrat Perkapita 0 0
# Cek apakah ada missing values
total_missing <- sum(is.na(df))

if (total_missing == 0) {
  cat("Tidak ada missing values. Dataset siap diproses.\n")
} else {
  cat("Ditemukan", total_missing, "missing values. Menghapus baris yang mengandung NA...\n")
  df <- df %>% na.omit()
  cat("Dataset setelah pembersihan:", nrow(df), "baris.\n")
}
## Tidak ada missing values. Dataset siap diproses.

Cek Outlier dengan Boxplot

library(tidyr)
df_long <- df %>%
  pivot_longer(cols = everything(),
               names_to  = "Variabel",
               values_to = "Nilai")

ggplot(df_long, aes(x = Variabel, y = Nilai, fill = Variabel)) +
  geom_boxplot(outlier.colour = "red", outlier.shape = 16,
               outlier.size = 1.5, alpha = 0.7) +
  theme_minimal() +
  theme(
    axis.text.x  = element_text(angle = 30, hjust = 1, size = 8),
    legend.position = "none"
  ) +
  labs(title = "Boxplot Deteksi Outlier per Variabel",
       x = "Variabel", y = "Nilai")

outlier_summary <- sapply(df, function(x) {
  z <- abs(scale(x))
  sum(z > 3)
})

outlier_df <- data.frame(
  Variabel        = names(outlier_summary),
  Jumlah_Outlier  = as.integer(outlier_summary),
  Persentase      = round(as.integer(outlier_summary) / nrow(df) * 100, 2)
)

kable(outlier_df, row.names = FALSE,
      caption = "Jumlah Outlier per Variabel (|Z-score| > 3)") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Jumlah Outlier per Variabel (|Z-score| > 3)
Variabel Jumlah_Outlier Persentase
Pengeluaran non Makanan 83 1.65
Total Pengeluaran Rumahtangga 82 1.63
Pengeluaran perkapita 100 1.99
Nilai Kalori Perkapita 16 0.32
Nilai Protein Perkapita 69 1.37
Nilai Lemak Perkapita 80 1.59
Nilai Karbohidrat Perkapita 42 0.84

Standardisasi Data

df_scaled <- as.data.frame(scale(df))

cat("HASIL STANDARDISASI\n")
## HASIL STANDARDISASI
verify <- data.frame(
  Variabel = colnames(df_scaled),
  Mean     = round(colMeans(df_scaled), 6),
  SD       = round(apply(df_scaled, 2, sd), 6)
)

kable(verify, row.names = FALSE,
      caption = "Rata-rata dan Standar Deviasi setelah Standardisasi") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Rata-rata dan Standar Deviasi setelah Standardisasi
Variabel Mean SD
Pengeluaran non Makanan 0 1
Total Pengeluaran Rumahtangga 0 1
Pengeluaran perkapita 0 1
Nilai Kalori Perkapita 0 1
Nilai Protein Perkapita 0 1
Nilai Lemak Perkapita 0 1
Nilai Karbohidrat Perkapita 0 1
df_scaled_long <- df_scaled %>%
  pivot_longer(cols = everything(),
               names_to  = "Variabel",
               values_to = "Nilai")

ggplot(df_scaled_long, aes(x = Variabel, y = Nilai, fill = Variabel)) +
  geom_boxplot(outlier.colour = "red", outlier.shape = 16,
               outlier.size = 1.2, alpha = 0.7) +
  theme_minimal() +
  theme(
    axis.text.x  = element_text(angle = 30, hjust = 1, size = 8),
    legend.position = "none"
  ) +
  labs(title = "Boxplot Setelah Standardisasi (Z-score)",
       x = "Variabel", y = "Nilai Terstandarisasi")

Uji Multikolinearitas (Korelasi Matrix)

cor_matrix <- cor(df_scaled)

corrplot(cor_matrix,
         method  = "color",
         type    = "upper",
         addCoef.col = "black",
         number.cex  = 0.7,
         tl.cex  = 0.75,
         tl.col  = "black",
         col     = colorRampPalette(c("#d73027", "white", "#1a9850"))(200),
         title   = "Korelasi Matrix Antar Variabel",
         mar     = c(0, 0, 1, 0))

cat("TABEL KORELASI\n")
## TABEL KORELASI
kable(round(cor_matrix, 3),
      caption = "Matriks Korelasi Pearson antar Variabel") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
  scroll_box(width = "100%")
Matriks Korelasi Pearson antar Variabel
Pengeluaran non Makanan Total Pengeluaran Rumahtangga Pengeluaran perkapita Nilai Kalori Perkapita Nilai Protein Perkapita Nilai Lemak Perkapita Nilai Karbohidrat Perkapita
Pengeluaran non Makanan 1.000 0.957 0.798 0.129 0.208 0.288 0.013
Total Pengeluaran Rumahtangga 0.957 1.000 0.768 0.185 0.265 0.365 0.052
Pengeluaran perkapita 0.798 0.768 1.000 0.435 0.511 0.510 0.276
Nilai Kalori Perkapita 0.129 0.185 0.435 1.000 0.864 0.698 0.919
Nilai Protein Perkapita 0.208 0.265 0.511 0.864 1.000 0.722 0.709
Nilai Lemak Perkapita 0.288 0.365 0.510 0.698 0.722 1.000 0.446
Nilai Karbohidrat Perkapita 0.013 0.052 0.276 0.919 0.709 0.446 1.000
# Identifikasi pasangan dengan korelasi tinggi (|r| >= 0.8)
cat("PASANGAN VARIABEL DENGAN KORELASI TINGGI (|r| >= 0.8)\n")
## PASANGAN VARIABEL DENGAN KORELASI TINGGI (|r| >= 0.8)
cor_pairs <- which(abs(cor_matrix) >= 0.8 & upper.tri(cor_matrix), arr.ind = TRUE)

if (nrow(cor_pairs) == 0) {
  cat("Tidak ada pasangan variabel dengan korelasi >= 0.8.\n")
} else {
  hasil <- data.frame(
    Variabel_1  = rownames(cor_matrix)[cor_pairs[, 1]],
    Variabel_2  = colnames(cor_matrix)[cor_pairs[, 2]],
    Korelasi    = round(cor_matrix[cor_pairs], 3)
  )
  kable(hasil, row.names = FALSE,
        caption = "Pasangan Variabel Berkorelasi Tinggi") %>%
    kable_styling(bootstrap_options = c("striped", "hover"))
}
Pasangan Variabel Berkorelasi Tinggi
Variabel_1 Variabel_2 Korelasi
Pengeluaran non Makanan Total Pengeluaran Rumahtangga 0.957
Nilai Kalori Perkapita Nilai Protein Perkapita 0.864
Nilai Kalori Perkapita Nilai Karbohidrat Perkapita 0.919

Membuat Variabel Kelas (Label)

Elbow Method (Menentukan Jumlah Cluster Optimal)

set.seed(42)

wss <- sapply(1:10, function(k) {
  kmeans(df_scaled, centers = k, nstart = 25, iter.max = 100)$tot.withinss
})

elbow_df <- data.frame(k = 1:10, WSS = wss)

ggplot(elbow_df, aes(x = k, y = WSS)) +
  geom_line(color = "#2c7bb6", linewidth = 1.2) +
  geom_point(color = "#d7191c", size = 3) +
  scale_x_continuous(breaks = 1:10) +
  theme_minimal() +
  labs(
    title = "Elbow Method - Within-Cluster Sum of Squares (WSS)",
    x = "Jumlah Cluster (k)",
    y = "Total WSS"
  )

Silhouette Score

set.seed(42)

sil_scores <- sapply(2:10, function(k) {
  km <- kmeans(df_scaled, centers = k, nstart = 25, iter.max = 100)
  ss <- silhouette(km$cluster, dist(df_scaled))
  mean(ss[, 3])
})

sil_df <- data.frame(k = 2:10, Silhouette = sil_scores)

ggplot(sil_df, aes(x = k, y = Silhouette)) +
  geom_line(color = "#1a9850", linewidth = 1.2) +
  geom_point(color = "#d7191c", size = 3) +
  scale_x_continuous(breaks = 2:10) +
  theme_minimal() +
  labs(
    title = "Silhouette Score per Jumlah Cluster",
    x = "Jumlah Cluster (k)",
    y = "Rata-rata Silhouette Score"
  )

cat("Silhouette Score per k:\n")
## Silhouette Score per k:
kable(sil_df, digits = 4, caption = "Rata-rata Silhouette Score per Jumlah Cluster") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Rata-rata Silhouette Score per Jumlah Cluster
k Silhouette
2 0.4230
3 0.3875
4 0.2920
5 0.3116
6 0.3080
7 0.2616
8 0.2489
9 0.2527
10 0.2219

K-Means Clustering

set.seed(42)

kmeans_result <- kmeans(df_scaled, centers = 3, nstart = 25, iter.max = 100)

cat("HASIL K-MEANS CLUSTERING\n")
## HASIL K-MEANS CLUSTERING
cat("Ukuran tiap cluster:\n")
## Ukuran tiap cluster:
print(kmeans_result$size)
## [1] 3175 1567  277
cat("\nTotal Within-Cluster SS :", round(kmeans_result$tot.withinss, 2), "\n")
## 
## Total Within-Cluster SS : 17715.88
cat("Between-Cluster SS      :", round(kmeans_result$betweenss, 2), "\n")
## Between-Cluster SS      : 17410.12
cat("Rasio Between/Total     :", round(kmeans_result$betweenss / kmeans_result$totss * 100, 2), "%\n")
## Rasio Between/Total     : 49.56 %

Visualisasi Cluster

fviz_cluster(kmeans_result, data = df_scaled,
             geom = "point",
             ellipse.type = "convex",
             palette = c("#E74C3C", "#2ECC71", "#3498DB"),
             ggtheme = theme_minimal(),
             main = "Visualisasi Cluster K-Means (k = 3)")

Tambahkan Kolom Kelas ke Dataset

cluster_means_perkapita <- tapply(df$`Pengeluaran perkapita`, kmeans_result$cluster, mean)
cat("Rata-rata Pengeluaran perkapita per cluster (raw):\n")
## Rata-rata Pengeluaran perkapita per cluster (raw):
print(round(cluster_means_perkapita, 0))
##       1       2       3 
##  982613 1709214 4661393
rank_order <- rank(cluster_means_perkapita)
label_map <- ifelse(rank_order == 1, "Rendah",
             ifelse(rank_order == 2, "Menengah", "Tinggi"))

cat("\nMapping label cluster:\n")
## 
## Mapping label cluster:
print(label_map)
##          1          2          3 
##   "Rendah" "Menengah"   "Tinggi"
df$Kelas        <- factor(label_map[kmeans_result$cluster],
                          levels = c("Rendah", "Menengah", "Tinggi"))
df_scaled$Kelas <- df$Kelas

cat("\nDistribusi label Kelas:\n")
## 
## Distribusi label Kelas:
print(table(df$Kelas))
## 
##   Rendah Menengah   Tinggi 
##     3175     1567      277

Profil Tiap Cluster

profil_cluster <- df %>%
  group_by(Kelas) %>%
  summarise(across(where(is.numeric), ~ round(mean(.x), 2)), .groups = "drop")

kable(t(profil_cluster), caption = "Profil Rata-rata Tiap Cluster (Data Asli)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
  scroll_box(width = "100%")
Profil Rata-rata Tiap Cluster (Data Asli)
Kelas Rendah Menengah Tinggi
Pengeluaran non Makanan 1810339 2165018 11109827
Total Pengeluaran Rumahtangga 3786087 4677145 16025072
Pengeluaran perkapita 982613.1 1709213.6 4661392.9
Nilai Kalori Perkapita 1892.62 3065.47 2736.34
Nilai Protein Perkapita 56.31 96.33 95.97
Nilai Lemak Perkapita 31.32 58.14 69.92
Nilai Karbohidrat Perkapita 303.13 476.69 378.04

Visualisasi Profil Cluster

profil_long <- profil_cluster %>%
  pivot_longer(cols = -Kelas, names_to = "Variabel", values_to = "Rata_rata")

ggplot(profil_long, aes(x = Variabel, y = Rata_rata, fill = Kelas)) +
  geom_bar(stat = "identity", position = "dodge", alpha = 0.85) +
  scale_fill_manual(values = c("Rendah" = "#E74C3C",
                               "Menengah" = "#F39C12",
                               "Tinggi" = "#2ECC71")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1, size = 8)) +
  labs(title = "Profil Rata-rata Variabel per Cluster",
       x = "Variabel", y = "Rata-rata", fill = "Kelas")

Uji Asumsi LDA

library(MVN)
library(kableExtra)
library(dplyr)

df_num     <- df_scaled[, sapply(df_scaled, is.numeric)]
kelas_list <- c("Rendah", "Menengah", "Tinggi")

mvn_ver <- as.numeric(gsub("^(\\d+)\\..*", "\\1", packageVersion("MVN")))

run_mvn <- function(data, test = "mardia") {
  if (mvn_ver >= 6) {
    mvn(data = data, mvn_test = test)
  } else {
    mvn(data = data, mvnTest = test, desc = FALSE, showOutliers = FALSE)
  }
}

ekstrak_mardia <- function(hasil_mvn) {
  tbl    <- hasil_mvn$multivariateNormality
  p_col  <- grep("p.?value", names(tbl), ignore.case = TRUE, value = TRUE)[1]

  idx_skew <- grep("Skewness", tbl$Test, ignore.case = TRUE)
  idx_kurt <- grep("Kurtosis", tbl$Test, ignore.case = TRUE)
  idx_mvn  <- grep("^MVN$",    tbl$Test, ignore.case = TRUE)

  p_skew <- if (length(idx_skew) > 0) as.numeric(tbl[idx_skew, p_col]) else NA
  p_kurt <- if (length(idx_kurt) > 0) as.numeric(tbl[idx_kurt, p_col]) else NA
  mvn_ok <- if (length(idx_mvn)  > 0) as.character(tbl[idx_mvn,  "MVN"]) else "NO"

  list(skewness = p_skew, kurtosis = p_kurt, normal = mvn_ok)
}

cat("MARDIA'S TEST NORMALITAS MULTIVARIAT \n\n")
## MARDIA'S TEST NORMALITAS MULTIVARIAT
hasil_mardia <- lapply(kelas_list, function(kls) {
  subset_data <- df_num[df$Kelas == kls, ]
  n_obs       <- nrow(subset_data)
  p_vars      <- ncol(subset_data)

  if (n_obs <= p_vars) {
    cat(sprintf("!!! Kelas %s: n=%d <= p=%d. Uji dilewati.\n", kls, n_obs, p_vars))
    return(list(kelas = kls, n = n_obs, skewness = NA, kurtosis = NA, normal = "N/A"))
  }

  res_mvn <- tryCatch(run_mvn(subset_data, test = "mardia"), error = function(e) NULL)
  if (is.null(res_mvn)) {
    return(list(kelas = kls, n = n_obs, skewness = NA, kurtosis = NA, normal = "ERR"))
  }

  res <- ekstrak_mardia(res_mvn)
  list(kelas = kls, n = n_obs,
       skewness = res$skewness, kurtosis = res$kurtosis, normal = res$normal)
})

ringkasan_mardia <- do.call(rbind, lapply(hasil_mardia, function(x) {
  data.frame(
    Kelas       = x$kelas,
    N           = x$n,
    p_Skewness  = ifelse(is.na(x$skewness), NA, round(as.numeric(x$skewness), 4)),
    p_Kurtosis  = ifelse(is.na(x$kurtosis), NA, round(as.numeric(x$kurtosis), 4)),
    Normal_MVN  = x$normal
  )
}))

kable(ringkasan_mardia, row.names = FALSE,
      caption = "Ringkasan Mardia's Test per Kelas (alpha = 0.05)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "bordered")) %>%
  column_spec(5, bold = TRUE,
              color = ifelse(ringkasan_mardia$Normal_MVN == "YES", "darkgreen",
                      ifelse(ringkasan_mardia$Normal_MVN == "N/A", "orange", "red")))
Ringkasan Mardia’s Test per Kelas (alpha = 0.05)
Kelas N p_Skewness p_Kurtosis Normal_MVN
Rendah 3175 NA NA NO
Menengah 1567 NA NA NO
Tinggi 277 NA NA NO
cat("INTERPRETASI MARDIA'S TEST\n\n")
## INTERPRETASI MARDIA'S TEST
for (i in seq_len(nrow(ringkasan_mardia))) {
  row  <- ringkasan_mardia[i, ]
  stat <- if (row$Normal_MVN == "YES") "NORMAL" else "TIDAK NORMAL"
  cat(sprintf(
    "Kelas %-8s | N = %3d | p-Skewness = %s | p-Kurtosis = %s | -> %s\n",
    row$Kelas, row$N,
    ifelse(is.na(row$p_Skewness), "   NA  ", sprintf("%.4f", row$p_Skewness)),
    ifelse(is.na(row$p_Kurtosis), "   NA  ", sprintf("%.4f", row$p_Kurtosis)),
    stat
  ))
}
## Kelas Rendah   | N = 3175 | p-Skewness =    NA   | p-Kurtosis =    NA   | -> TIDAK NORMAL
## Kelas Menengah | N = 1567 | p-Skewness =    NA   | p-Kurtosis =    NA   | -> TIDAK NORMAL
## Kelas Tinggi   | N = 277 | p-Skewness =    NA   | p-Kurtosis =    NA   | -> TIDAK NORMAL

Henze-Zirkler Test

ekstrak_hz <- function(hasil_mvn) {
  tbl   <- hasil_mvn$multivariateNormality
  p_col <- grep("p.?value", names(tbl), ignore.case = TRUE, value = TRUE)[1]
  hz_col <- grep("^HZ$",    names(tbl), ignore.case = TRUE, value = TRUE)[1]

  hz_stat <- if (!is.null(hz_col) && !is.na(hz_col)) as.numeric(tbl[[hz_col]]) else NA
  p_val   <- if (!is.null(p_col)  && !is.na(p_col))  as.numeric(tbl[[p_col]])  else NA
  mvn_ok  <- if ("MVN" %in% names(tbl)) as.character(tbl$MVN[1]) else "NO"

  list(hz_stat = hz_stat, p_value = p_val, normal = mvn_ok)
}

cat("=== HENZE-ZIRKLER TEST NORMALITAS MULTIVARIAT ===\n\n")
## === HENZE-ZIRKLER TEST NORMALITAS MULTIVARIAT ===
hasil_hz <- lapply(kelas_list, function(kls) {
  subset_data <- df_num[df$Kelas == kls, ]
  n_obs       <- nrow(subset_data)
  p_vars      <- ncol(subset_data)

  if (n_obs <= p_vars) {
    cat(sprintf("!!! Kelas %s: n=%d <= p=%d. Uji dilewati.\n", kls, n_obs, p_vars))
    return(list(kelas = kls, n = n_obs, hz_stat = NA, p_value = NA, normal = "N/A"))
  }

  res_mvn <- tryCatch(run_mvn(subset_data, test = "hz"), error = function(e) NULL)
  if (is.null(res_mvn)) {
    return(list(kelas = kls, n = n_obs, hz_stat = NA, p_value = NA, normal = "ERR"))
  }

  res <- ekstrak_hz(res_mvn)
  list(kelas = kls, n = n_obs,
       hz_stat = res$hz_stat, p_value = res$p_value, normal = res$normal)
})

ringkasan_hz <- do.call(rbind, lapply(hasil_hz, function(x) {
  data.frame(
    Kelas      = x$kelas,
    N          = x$n,
    HZ_Stat    = ifelse(is.na(x$hz_stat), NA, round(as.numeric(x$hz_stat), 4)),
    p_Value    = ifelse(is.na(x$p_value), NA, round(as.numeric(x$p_value), 4)),
    Normal_MVN = x$normal
  )
}))

kable(ringkasan_hz, row.names = FALSE,
      caption = "Ringkasan Henze-Zirkler Test per Kelas (alpha = 0.05)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "bordered")) %>%
  column_spec(5, bold = TRUE,
              color = ifelse(ringkasan_hz$Normal_MVN == "YES", "darkgreen",
                      ifelse(ringkasan_hz$Normal_MVN == "N/A", "orange", "red")))
Ringkasan Henze-Zirkler Test per Kelas (alpha = 0.05)
Kelas N HZ_Stat p_Value Normal_MVN
Rendah 3175 NA NA NO
Menengah 1567 NA NA NO
Tinggi 277 NA NA NO
cat("INTERPRETASI HENZE-ZIRKLER TEST\n\n")
## INTERPRETASI HENZE-ZIRKLER TEST
for (i in seq_len(nrow(ringkasan_hz))) {
  row  <- ringkasan_hz[i, ]
  stat <- if (row$Normal_MVN == "YES") "NORMAL" else "TIDAK NORMAL"
  cat(sprintf(
    "Kelas %-8s | N = %3d | HZ-Stat = %s | p-value = %s | -> %s\n",
    row$Kelas, row$N,
    ifelse(is.na(row$HZ_Stat), "   NA   ", sprintf("%.4f", row$HZ_Stat)),
    ifelse(is.na(row$p_Value), "   NA   ", sprintf("%.4f", row$p_Value)),
    stat
  ))
}
## Kelas Rendah   | N = 3175 | HZ-Stat =    NA    | p-value =    NA    | -> TIDAK NORMAL
## Kelas Menengah | N = 1567 | HZ-Stat =    NA    | p-value =    NA    | -> TIDAK NORMAL
## Kelas Tinggi   | N = 277 | HZ-Stat =    NA    | p-value =    NA    | -> TIDAK NORMAL

QQ-Plot Multivariat (Mahalanobis Distance)

library(ggplot2)

par(mfrow = c(1, 3), mar = c(4, 4, 3, 1))

for (kls in kelas_list) {
  subset_data <- df_num[df$Kelas == kls, ]
  n_obs       <- nrow(subset_data)
  p_vars      <- ncol(subset_data)

  if (n_obs <= p_vars) {
    plot.new()
    title(main = paste("Kelas:", kls, "\n(Sampel terlalu kecil)"))
    next
  }

  center   <- colMeans(subset_data)
  cov_mat  <- cov(subset_data)

  mah_dist <- tryCatch(
    mahalanobis(subset_data, center = center, cov = cov_mat),
    error = function(e) rep(NA, n_obs)
  )

  if (all(is.na(mah_dist))) {
    plot.new()
    title(main = paste("Kelas:", kls, "\n(Matriks singular)"))
    next
  }

  chi_q <- qchisq(ppoints(n_obs), df = p_vars)

  # Plot
  qqplot(chi_q, sort(mah_dist),
         main  = paste("QQ-Plot Mahalanobis\nKelas:", kls),
         xlab  = expression("Kuantil " * chi^2 * " Teoretis"),
         ylab  = "Jarak Mahalanobis (terurut)",
         pch   = 16, col = "#3498DB", cex = 0.7)
  abline(a = 0, b = 1, col = "#E74C3C", lwd = 2, lty = 2)
}

par(mfrow = c(1, 1))

Rekapitulasi Uji Normalitas Multivariat

rekapitulasi_norm <- data.frame(
  Kelas         = kelas_list,
  N             = ringkasan_mardia$N,
  Mardia_MVN    = ringkasan_mardia$Normal_MVN,
  HZ_MVN        = ringkasan_hz$Normal_MVN,
  Kesimpulan    = ifelse(
    ringkasan_mardia$Normal_MVN == "YES" & ringkasan_hz$Normal_MVN == "YES",
    "Normal Multivariat",
    ifelse(
      ringkasan_mardia$Normal_MVN %in% c("N/A", "ERR") |
        ringkasan_hz$Normal_MVN  %in% c("N/A", "ERR"),
      "Tidak dapat diuji",
      "Tidak Normal Multivariat"
    )
  )
)

kable(rekapitulasi_norm, row.names = FALSE,
      caption = "Rekapitulasi Uji Normalitas Multivariat (Mardia & Henze-Zirkler)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "bordered")) %>%
  column_spec(5, bold = TRUE)
Rekapitulasi Uji Normalitas Multivariat (Mardia & Henze-Zirkler)
Kelas N Mardia_MVN HZ_MVN Kesimpulan
Rendah 3175 NO NO Tidak Normal Multivariat
Menengah 1567 NO NO Tidak Normal Multivariat
Tinggi 277 NO NO Tidak Normal Multivariat

Uji Homogenitas Matriks Kovarians (Box’s M Test)

library(biotools)

cat("BOX'S M TEST - HOMOGENITAS MATRIKS KOVARIANS\n\n")
## BOX'S M TEST - HOMOGENITAS MATRIKS KOVARIANS
boxm_result <- tryCatch({
  boxM(data   = df_num,
       grouping = df$Kelas)
}, error = function(e) {
  cat("ERROR saat menjalankan Box's M test:", conditionMessage(e), "\n")
  NULL
})

if (!is.null(boxm_result)) {
  print(boxm_result)

  chi_stat <- round(boxm_result$statistic,   4)
  df_test  <- boxm_result$parameter["df"]
  p_val    <- round(boxm_result$p.value,     6)
  keputusan <- ifelse(p_val > 0.05,
                      "Gagal Tolak H0 -> Matriks Kovarians HOMOGEN",
                      "Tolak H0 -> Matriks Kovarians TIDAK HOMOGEN")

  cat(sprintf("\nChi-Squared Approx : %.4f\n", chi_stat))
  cat(sprintf("Derajat Bebas (df) : %d\n",    as.integer(df_test)))
  cat(sprintf("p-value            : %.6f\n",  p_val))
  cat(sprintf("Keputusan (alpha=0.05) : %s\n",    keputusan))
}
## 
##  Box's M-test for Homogeneity of Covariance Matrices
## 
## data:  df_num
## Chi-Sq (approx.) = 11918, df = 56, p-value < 2.2e-16
## 
## 
## Chi-Squared Approx : 11917.9773
## Derajat Bebas (df) : 56
## p-value            : 0.000000
## Keputusan (alpha=0.05) : Tolak H0 -> Matriks Kovarians TIDAK HOMOGEN
if (!is.null(boxm_result)) {
  boxm_tbl <- data.frame(
    Statistik       = c("Chi-Squared Approx", "Derajat Bebas", "p-value", "Keputusan (alpha=0.05)"),
    Nilai           = c(
      sprintf("%.4f",  as.numeric(boxm_result$statistic)),
      sprintf("%d",    as.integer(boxm_result$parameter["df"])),
      sprintf("%.6f",  boxm_result$p.value),
      ifelse(boxm_result$p.value > 0.05,
             "Gagal Tolak H0 - Kovarians HOMOGEN",
             "Tolak H0 - Kovarians TIDAK HOMOGEN")
    )
  )

  kable(boxm_tbl, row.names = FALSE,
        caption = "Hasil Box's M Test Homogenitas Matriks Kovarians") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "bordered")) %>%
    row_spec(4, bold = TRUE,
             color = ifelse(boxm_result$p.value > 0.05, "darkgreen", "red"))
}
Hasil Box’s M Test Homogenitas Matriks Kovarians
Statistik Nilai
Chi-Squared Approx 11917.9773
Derajat Bebas 56
p-value 0.000000
Keputusan (alpha=0.05) Tolak H0 - Kovarians TIDAK HOMOGEN

Visualisasi Matriks Kovarians per Kelas

library(corrplot)

par(mfrow = c(1, 3), mar = c(1, 1, 3, 1))

for (kls in kelas_list) {
  subset_data <- df_num[df$Kelas == kls, ]
  if (nrow(subset_data) > ncol(subset_data)) {
    cov_mat  <- cov(subset_data)
    cor_mat  <- cov2cor(cov_mat)   # konversi ke korelasi agar nilai dalam [-1, 1]
    corrplot(cor_mat,
             method      = "color",
             type        = "upper",
             tl.cex      = 0.65,
             tl.col      = "black",
             addCoef.col = "black",
             number.cex  = 0.5,
             col         = colorRampPalette(c("#d73027", "white", "#1a9850"))(200),
             title       = paste("Korelasi (dari Kovarians)\nKelas:", kls),
             mar         = c(0, 0, 2, 0))
  } else {
    plot.new()
    title(main = paste("Kelas:", kls, "\n(Sampel terlalu kecil)"))
  }
}

par(mfrow = c(1, 1))

Log-Determinan Matriks Kovarians per Kelas

log_det_df <- do.call(rbind, lapply(kelas_list, function(kls) {
  subset_data <- df_num[df$Kelas == kls, ]
  n_obs       <- nrow(subset_data)
  p_vars      <- ncol(subset_data)

  if (n_obs <= p_vars) {
    return(data.frame(Kelas = kls, N = n_obs, Log_Det = NA, Keterangan = "Sampel terlalu kecil"))
  }

  cov_mat  <- cov(subset_data)
  det_val  <- tryCatch(det(cov_mat), error = function(e) NA)
  log_det  <- ifelse(!is.na(det_val) & det_val > 0, round(log(det_val), 4), NA)
  ket      <- ifelse(is.na(log_det), "Matriks singular", sprintf("%.4f", log_det))

  data.frame(Kelas = kls, N = n_obs, Log_Det = log_det, Keterangan = ket)
}))

kable(log_det_df, row.names = FALSE,
      caption = "Log-Determinan Matriks Kovarians per Kelas") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "bordered"))
Log-Determinan Matriks Kovarians per Kelas
Kelas N Log_Det Keterangan
Rendah 3175 -15.7287 -15.7287
Menengah 1567 -9.5327 -9.5327
Tinggi 277 -1.8367 -1.8367

Konfirmasi Tidak Ada Multikolinearitas Sempurna

Determinan Matriks Kovarians Keseluruhan

cat("CEK DETERMINAN MATRIKS KOVARIANS GLOBAL\n\n")
## CEK DETERMINAN MATRIKS KOVARIANS GLOBAL
cov_global  <- cov(df_num)
det_global  <- det(cov_global)
cat(sprintf("Determinan matriks kovarians (keseluruhan) : %.6e\n", det_global))
## Determinan matriks kovarians (keseluruhan) : 1.247857e-04
if (det_global < 1e-10) {
  cat("Determinan sangat kecil -> kemungkinan multikolinearitas sempurna!\n")
} else {
  cat("Determinan cukup besar -> tidak ada multikolinearitas sempurna.\n")
}
## Determinan cukup besar -> tidak ada multikolinearitas sempurna.

Variance Inflation Factor (VIF)

library(car)

cat("VARIANCE INFLATION FACTOR (VIF)\n\n")
## VARIANCE INFLATION FACTOR (VIF)
df_vif          <- df_num
predictor_names <- names(df_vif)
response        <- predictor_names[1]
predictors      <- predictor_names[-1]

if (length(predictors) >= 2) {
  # Bungkus semua nama kolom dengan backtick agar spasi dalam nama tidak error
  formula_vif <- as.formula(paste(
    paste0("`", response, "`"), "~",
    paste(paste0("`", predictors, "`"), collapse = " + ")
  ))

  lm_vif     <- lm(formula_vif, data = df_vif)
  vif_values <- tryCatch(vif(lm_vif), error = function(e) NULL)

  if (!is.null(vif_values)) {
    vif_df <- data.frame(
      Variabel = names(vif_values),
      VIF      = round(as.numeric(vif_values), 3),
      Status   = ifelse(as.numeric(vif_values) > 10, "Multikolinearitas Tinggi",
                 ifelse(as.numeric(vif_values) > 5,  "Perlu Perhatian",
                                                      "Aman"))
    )

    kable(vif_df, row.names = FALSE,
          caption = "Variance Inflation Factor (VIF) - Deteksi Multikolinearitas") %>%
      kable_styling(bootstrap_options = c("striped", "hover", "bordered")) %>%
      column_spec(3, bold = TRUE)
  } else {
    cat("VIF tidak dapat dihitung (kemungkinan matriks singular).\n")
  }
} else {
  cat("Jumlah prediktor terlalu sedikit untuk menghitung VIF.\n")
}
Variance Inflation Factor (VIF) - Deteksi Multikolinearitas
Variabel VIF Status
Total Pengeluaran Rumahtangga 2.683 Aman
Pengeluaran perkapita 3.308 Aman
Nilai Kalori Perkapita 28.102 Multikolinearitas Tinggi
Nilai Protein Perkapita 5.026 Perlu Perhatian
Nilai Lemak Perkapita 3.899 Aman
Nilai Karbohidrat Perkapita 13.956 Multikolinearitas Tinggi

Nilai Eigenvalue Matriks Kovarians

cat("EIGENVALUE MATRIKS KOVARIANS GLOBAL\n\n")
## EIGENVALUE MATRIKS KOVARIANS GLOBAL
eigen_vals  <- eigen(cov_global, only.values = TRUE)$values
eigen_vals  <- round(eigen_vals, 6)

eigen_df <- data.frame(
  Komponen    = paste0("lambda", seq_along(eigen_vals)),
  Eigenvalue  = eigen_vals,
  Status      = ifelse(eigen_vals < 1e-6, "Near-zero (Singular!)", "Positif")
)

kable(eigen_df, row.names = FALSE,
      caption = "Eigenvalue Matriks Kovarians Global") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "bordered")) %>%
  column_spec(3, bold = TRUE)
Eigenvalue Matriks Kovarians Global
Komponen Eigenvalue Status
lambda1 3.935157 Positif
lambda2 2.096893 Positif
lambda3 0.512850 Positif
lambda4 0.229270 Positif
lambda5 0.169894 Positif
lambda6 0.032986 Positif
lambda7 0.022950 Positif
n_zero <- sum(eigen_vals < 1e-6)
cat(sprintf("\nJumlah eigenvalue near-zero (< 1e-6) : %d\n", n_zero))
## 
## Jumlah eigenvalue near-zero (< 1e-6) : 0
if (n_zero == 0) {
  cat("[OK] Tidak ada eigenvalue nol -> matriks kovarians non-singular -> tidak ada multikolinearitas sempurna.\n")
} else {
  cat("[!] Ada eigenvalue nol/near-zero -> kemungkinan multikolinearitas sempurna! Pertimbangkan PCA atau seleksi variabel.\n")
}
## [OK] Tidak ada eigenvalue nol -> matriks kovarians non-singular -> tidak ada multikolinearitas sempurna.

Condition Number (Angka Kondisi)

cat("ANGKA KONDISI (CONDITION NUMBER)\n\n")
## ANGKA KONDISI (CONDITION NUMBER)
lambda_max  <- max(eigen_vals)
lambda_min  <- min(eigen_vals[eigen_vals > 1e-12])
cond_number <- lambda_max / lambda_min

cat(sprintf("lambda_max            : %.4f\n", lambda_max))
## lambda_max            : 3.9352
cat(sprintf("lambda_min (> 1e-12)  : %.6f\n", lambda_min))
## lambda_min (> 1e-12)  : 0.022950
cat(sprintf("Angka Kondisi    : %.2f\n",  cond_number))
## Angka Kondisi    : 171.47
if (cond_number < 100) {
  cat("Angka kondisi < 100 -> kondisi matriks baik, tidak ada multikolinearitas parah.\n")
} else if (cond_number < 1000) {
  cat("Agka kondisi 100-1000 -> kondisi matriks cukup buruk, waspadai multikolinearitas moderat.\n")
} else {
  cat("Angka kondisi > 1000 -> matriks sangat ill-conditioned! Multikolinearitas parah.\n")
}
## Agka kondisi 100-1000 -> kondisi matriks cukup buruk, waspadai multikolinearitas moderat.

Linear Discriminant Analysis (LDA)

Split Data: Training 80% & Testing 20%

set.seed(123)   
train_index <- createDataPartition(df_scaled$Kelas,
                                   p     = 0.80,
                                   list  = FALSE)

train_data  <- df_scaled[ train_index, ]
test_data   <- df_scaled[-train_index, ]

cat("UKURAN DATA\n")
## UKURAN DATA
cat("Total data  :", nrow(df_scaled), "\n")
## Total data  : 5019
cat("Data training:", nrow(train_data), sprintf("(%.1f%%)\n",
    nrow(train_data) / nrow(df_scaled) * 100))
## Data training: 4016 (80.0%)
cat("Data testing :", nrow(test_data),  sprintf("(%.1f%%)\n",
    nrow(test_data)  / nrow(df_scaled) * 100))
## Data testing : 1003 (20.0%)
cat("\nDISTRIBUSI KELAS - TRAINING\n")
## 
## DISTRIBUSI KELAS - TRAINING
print(table(train_data$Kelas))
## 
##   Rendah Menengah   Tinggi 
##     2540     1254      222
cat("\nDISTRIBUSI KELAS - TESTING\n")
## 
## DISTRIBUSI KELAS - TESTING
print(table(test_data$Kelas))
## 
##   Rendah Menengah   Tinggi 
##      635      313       55
# Tampilkan proporsi dalam tabel
split_summary <- data.frame(
  Dataset   = c("Training", "Testing", "Total"),
  Rendah    = c(sum(train_data$Kelas == "Rendah"),
                sum(test_data$Kelas  == "Rendah"),
                sum(df_scaled$Kelas  == "Rendah")),
  Menengah  = c(sum(train_data$Kelas == "Menengah"),
                sum(test_data$Kelas  == "Menengah"),
                sum(df_scaled$Kelas  == "Menengah")),
  Tinggi    = c(sum(train_data$Kelas == "Tinggi"),
                sum(test_data$Kelas  == "Tinggi"),
                sum(df_scaled$Kelas  == "Tinggi")),
  Total     = c(nrow(train_data), nrow(test_data), nrow(df_scaled))
)

kable(split_summary, row.names = FALSE,
      caption = "Distribusi Kelas pada Data Training dan Testing") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "bordered")) %>%
  row_spec(3, bold = TRUE, background = "#f0f0f0")
Distribusi Kelas pada Data Training dan Testing
Dataset Rendah Menengah Tinggi Total
Training 2540 1254 222 4016
Testing 635 313 55 1003
Total 3175 1567 277 5019

Membangun Model LDA

predictor_cols <- setdiff(names(train_data), "Kelas")

formula_lda <- as.formula(
  paste("Kelas ~", paste(paste0("`", predictor_cols, "`"), collapse = " + "))
)

lda_model <- lda(formula_lda, data = train_data)

cat("RINGKASAN MODEL LDA\n\n")
## RINGKASAN MODEL LDA
print(lda_model)
## Call:
## lda(formula_lda, data = train_data)
## 
## Prior probabilities of groups:
##     Rendah   Menengah     Tinggi 
## 0.63247012 0.31225100 0.05527888 
## 
## Group means:
##          `Pengeluaran non Makanan` `Total Pengeluaran Rumahtangga`
## Rendah                 -0.21169678                     -0.24189365
## Menengah               -0.09628201                     -0.01371452
## Tinggi                  2.88166562                      2.81976917
##          `Pengeluaran perkapita` `Nilai Kalori Perkapita`
## Rendah                -0.3649409               -0.5868958
## Menengah               0.2402299                1.0722419
## Tinggi                 2.7090153                0.5898291
##          `Nilai Protein Perkapita` `Nilai Lemak Perkapita`
## Rendah                  -0.5537088              -0.4674411
## Menengah                 0.9324874               0.7141846
## Tinggi                   0.8571492               1.2269739
##          `Nilai Karbohidrat Perkapita`
## Rendah                      -0.5198639
## Menengah                     1.0204269
## Tinggi                       0.1329941
## 
## Coefficients of linear discriminants:
##                                       LD1         LD2
## `Pengeluaran non Makanan`       0.2939806  0.40234490
## `Total Pengeluaran Rumahtangga` 0.1796666  0.32996048
## `Pengeluaran perkapita`         0.2196298  0.64007621
## `Nilai Kalori Perkapita`        0.5841140 -0.65195504
## `Nilai Protein Perkapita`       0.3117786 -0.22746373
## `Nilai Lemak Perkapita`         0.2443962  0.04231628
## `Nilai Karbohidrat Perkapita`   0.3233888 -0.15609597
## 
## Proportion of trace:
##    LD1    LD2 
## 0.6623 0.3377

Proporsi Varians Tiap Fungsi Diskriminan

prop_var <- lda_model$svd^2 / sum(lda_model$svd^2)
cum_var  <- cumsum(prop_var)

prop_df <- data.frame(
  Fungsi_Diskriminan = paste0("LD", seq_along(prop_var)),
  Singular_Value     = round(lda_model$svd, 4),
  Proporsi_Varians   = round(prop_var * 100, 2),
  Kumulatif          = round(cum_var  * 100, 2)
)

cat("=== PROPORSI VARIANS FUNGSI DISKRIMINAN ===\n")
## === PROPORSI VARIANS FUNGSI DISKRIMINAN ===
kable(prop_df, row.names = FALSE,
      caption = "Proporsi Varians Tiap Fungsi Diskriminan (LD)",
      col.names = c("Fungsi Diskriminan", "Singular Value",
                    "Proporsi Varians (%)", "Kumulatif (%)")) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "bordered")) %>%
  column_spec(3:4, bold = TRUE)
Proporsi Varians Tiap Fungsi Diskriminan (LD)
Fungsi Diskriminan Singular Value Proporsi Varians (%) Kumulatif (%)
LD1 58.9142 66.23 66.23
LD2 42.0730 33.77 100.00
cat(sprintf("\nLD1 menjelaskan %.2f%% varians.\n", prop_df$Proporsi_Varians[1]))
## 
## LD1 menjelaskan 66.23% varians.
cat(sprintf("LD1 + LD2 menjelaskan %.2f%% varians.\n", prop_df$Kumulatif[2]))
## LD1 + LD2 menjelaskan 100.00% varians.

Koefisien Fungsi Diskriminan

koef_df <- as.data.frame(lda_model$scaling)
koef_df$Variabel <- rownames(koef_df)
rownames(koef_df) <- NULL
koef_df <- koef_df[, c("Variabel", "LD1", "LD2")]

kable(koef_df, row.names = FALSE, digits = 4,
      caption = "Koefisien Fungsi Diskriminan (Standardized)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "bordered")) %>%
  column_spec(2:3, bold = TRUE)
Koefisien Fungsi Diskriminan (Standardized)
Variabel LD1 LD2
Pengeluaran non Makanan 0.2940 0.4023
Total Pengeluaran Rumahtangga 0.1797 0.3300
Pengeluaran perkapita 0.2196 0.6401
Nilai Kalori Perkapita 0.5841 -0.6520
Nilai Protein Perkapita 0.3118 -0.2275
Nilai Lemak Perkapita 0.2444 0.0423
Nilai Karbohidrat Perkapita 0.3234 -0.1561
cat("\nInterpretasi:\n")
## 
## Interpretasi:
cat("- Nilai absolut koefisien yang besar -> variabel tersebut lebih\n")
## - Nilai absolut koefisien yang besar -> variabel tersebut lebih
cat("  berpengaruh dalam membentuk fungsi diskriminan.\n")
##   berpengaruh dalam membentuk fungsi diskriminan.
cat("- Tanda (+/-) menunjukkan arah kontribusi.\n")
## - Tanda (+/-) menunjukkan arah kontribusi.

Visualisasi: Plot Skor Diskriminan per Kelompok

A. Histogram Skor LD1 per Kelas

lda_scores_train <- predict(lda_model, train_data)

scores_df <- data.frame(
  LD1   = lda_scores_train$x[, "LD1"],
  LD2   = lda_scores_train$x[, "LD2"],
  Kelas = train_data$Kelas
)

# Histogram LD1
ggplot(scores_df, aes(x = LD1, fill = Kelas)) +
  geom_histogram(bins = 50, alpha = 0.6, position = "identity",
                 color = "white", linewidth = 0.3) +
  scale_fill_manual(values = c("Rendah"   = "#3498db",
                                "Menengah" = "#2ecc71",
                                "Tinggi"   = "#e74c3c")) +
  theme_minimal(base_size = 13) +
  labs(title = "Distribusi Skor Diskriminan LD1 per Kelas",
       subtitle = paste0("LD1 menjelaskan ", round(prop_var[1] * 100, 1), "% varians"),
       x     = "Skor LD1",
       y     = "Frekuensi",
       fill  = "Kelas") +
  theme(legend.position = "top",
        plot.title = element_text(face = "bold"))

B. Scatter Plot LD1 vs LD2 per Kelas

centroid_df <- scores_df %>%
  group_by(Kelas) %>%
  summarise(LD1 = mean(LD1), LD2 = mean(LD2), .groups = "drop")

ggplot(scores_df, aes(x = LD1, y = LD2, color = Kelas)) +
  geom_point(alpha = 0.4, size = 1.2) +
  stat_ellipse(aes(fill = Kelas), geom = "polygon",
               alpha = 0.10, level = 0.95, linewidth = 0.8) +
  geom_point(data = centroid_df, aes(x = LD1, y = LD2),
             size = 5, shape = 18, color = "black") +
  geom_text(data = centroid_df, aes(x = LD1, y = LD2, label = Kelas),
            vjust = -1, fontface = "bold", size = 4, color = "black") +
  scale_color_manual(values = c("Rendah"   = "#3498db",
                                 "Menengah" = "#2ecc71",
                                 "Tinggi"   = "#e74c3c")) +
  scale_fill_manual( values = c("Rendah"   = "#3498db",
                                 "Menengah" = "#2ecc71",
                                 "Tinggi"   = "#e74c3c")) +
  theme_minimal(base_size = 13) +
  labs(title = "Scatter Plot Skor Diskriminan LD1 vs LD2",
       subtitle = sprintf("LD1: %.1f%% | LD2: %.1f%% varians",
                          prop_var[1] * 100, prop_var[2] * 100),
       x     = paste0("LD1 (", round(prop_var[1] * 100, 1), "%)"),
       y     = paste0("LD2 (", round(prop_var[2] * 100, 1), "%)"),
       color = "Kelas", fill = "Kelas") +
  theme(legend.position = "right",
        plot.title = element_text(face = "bold"))

C. Density Plot LD1 per Kelas

ggplot(scores_df, aes(x = LD1, fill = Kelas, color = Kelas)) +
  geom_density(alpha = 0.4, linewidth = 1) +
  scale_fill_manual( values = c("Rendah"   = "#3498db",
                                 "Menengah" = "#2ecc71",
                                 "Tinggi"   = "#e74c3c")) +
  scale_color_manual(values = c("Rendah"   = "#2980b9",
                                 "Menengah" = "#27ae60",
                                 "Tinggi"   = "#c0392b")) +
  theme_minimal(base_size = 13) +
  labs(title = "Density Plot Skor Diskriminan LD1 per Kelas",
       x     = "Skor LD1",
       y     = "Densitas",
       fill  = "Kelas", color = "Kelas") +
  theme(legend.position = "top",
        plot.title = element_text(face = "bold"))

D. Kontribusi Variabel pada LD1 dan LD2 (Bar Plot Koefisien)

koef_long <- tidyr::pivot_longer(koef_df,
                                  cols      = c("LD1", "LD2"),
                                  names_to  = "Fungsi",
                                  values_to = "Koefisien")

ggplot(koef_long, aes(x = reorder(Variabel, abs(Koefisien)),
                       y = Koefisien, fill = Koefisien > 0)) +
  geom_col(show.legend = FALSE, alpha = 0.85) +
  coord_flip() +
  facet_wrap(~ Fungsi, scales = "free_x") +
  scale_fill_manual(values = c("TRUE" = "#2ecc71", "FALSE" = "#e74c3c")) +
  theme_minimal(base_size = 12) +
  labs(title = "Koefisien Variabel pada Fungsi Diskriminan LD1 dan LD2",
       x = "Variabel",
       y = "Koefisien") +
  theme(plot.title = element_text(face = "bold"),
        strip.text = element_text(face = "bold", size = 12))

Multinomial Logistic Regression (Multinom)

Membangun Model Multinom

train_data$Kelas <- relevel(factor(train_data$Kelas), ref = "Rendah")

set.seed(123)

multinom_model <- multinom(Kelas ~ ., data = train_data,
                           maxit = 500, trace = FALSE)

cat("RINGKASAN MODEL MULTINOM\n\n")
## RINGKASAN MODEL MULTINOM
print(summary(multinom_model))
## Call:
## multinom(formula = Kelas ~ ., data = train_data, maxit = 500, 
##     trace = FALSE)
## 
## Coefficients:
##          (Intercept) `Pengeluaran non Makanan` `Total Pengeluaran Rumahtangga`
## Menengah   -85.90405                  8.577093                        17.25029
## Tinggi    -834.47961                160.014082                       253.41555
##          `Pengeluaran perkapita` `Nilai Kalori Perkapita`
## Menengah                47.01235                 121.2738
## Tinggi                 223.15664                 146.7272
##          `Nilai Protein Perkapita` `Nilai Lemak Perkapita`
## Menengah                 114.79375                87.81565
## Tinggi                    65.25421               102.08418
##          `Nilai Karbohidrat Perkapita`
## Menengah                     114.07331
## Tinggi                        46.27432
## 
## Std. Errors:
##          (Intercept) `Pengeluaran non Makanan` `Total Pengeluaran Rumahtangga`
## Menengah    27.96057                  6.195044                        6.713941
## Tinggi     261.53840                 62.872281                       84.854201
##          `Pengeluaran perkapita` `Nilai Kalori Perkapita`
## Menengah                15.96062                 40.33540
## Tinggi                  71.54925                 60.15963
##          `Nilai Protein Perkapita` `Nilai Lemak Perkapita`
## Menengah                  37.17461                28.55991
## Tinggi                    27.75041                36.62291
##          `Nilai Karbohidrat Perkapita`
## Menengah                      37.19801
## Tinggi                        29.45159
## 
## Residual Deviance: 4.863707 
## AIC: 36.86371

Ekstrak Koefisien Model Multinom

koef_multinom <- coef(multinom_model)

cat("KOEFISIEN MODEL MULTINOM\n")
## KOEFISIEN MODEL MULTINOM
cat("(Relatif terhadap kelas referensi: Rendah)\n\n")
## (Relatif terhadap kelas referensi: Rendah)
kable(round(koef_multinom, 4),
      caption = "Koefisien Multinomial Logistic Regression") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "bordered")) %>%
  column_spec(1, bold = TRUE, background = "#f5f5f5")
Koefisien Multinomial Logistic Regression
(Intercept) Pengeluaran non Makanan Total Pengeluaran Rumahtangga Pengeluaran perkapita Nilai Kalori Perkapita Nilai Protein Perkapita Nilai Lemak Perkapita Nilai Karbohidrat Perkapita
Menengah -85.9041 8.5771 17.2503 47.0124 121.2738 114.7938 87.8156 114.0733
Tinggi -834.4796 160.0141 253.4155 223.1566 146.7272 65.2542 102.0842 46.2743

Uji Signifikansi Koefisien (Z-test & P-value)

se_multinom <- summary(multinom_model)$standard.errors
z_scores    <- koef_multinom / se_multinom
p_values    <- 2 * (1 - pnorm(abs(z_scores)))

cat("Z-SCORE\n")
## Z-SCORE
kable(round(z_scores, 4),
      caption = "Z-Score Tiap Koefisien") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "bordered")) %>%
  column_spec(1, bold = TRUE, background = "#f5f5f5")
Z-Score Tiap Koefisien
(Intercept) Pengeluaran non Makanan Total Pengeluaran Rumahtangga Pengeluaran perkapita Nilai Kalori Perkapita Nilai Protein Perkapita Nilai Lemak Perkapita Nilai Karbohidrat Perkapita
Menengah -3.0723 1.3845 2.5693 2.9455 3.0066 3.0880 3.0748 3.0667
Tinggi -3.1907 2.5451 2.9865 3.1189 2.4390 2.3515 2.7874 1.5712
cat("\nP-VALUE\n")
## 
## P-VALUE
kable(round(p_values, 4),
      caption = "P-Value Tiap Koefisien (alpha = 0.05)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "bordered")) %>%
  column_spec(1, bold = TRUE, background = "#f5f5f5")
P-Value Tiap Koefisien (alpha = 0.05)
(Intercept) Pengeluaran non Makanan Total Pengeluaran Rumahtangga Pengeluaran perkapita Nilai Kalori Perkapita Nilai Protein Perkapita Nilai Lemak Perkapita Nilai Karbohidrat Perkapita
Menengah 0.0021 0.1662 0.0102 0.0032 0.0026 0.0020 0.0021 0.0022
Tinggi 0.0014 0.0109 0.0028 0.0018 0.0147 0.0187 0.0053 0.1161

Tabel Signifikansi Lengkap per Variabel

hasil_list <- list()

for (kelas in rownames(koef_multinom)) {
  df_kelas <- data.frame(
    Kelas     = kelas,
    Variabel  = colnames(koef_multinom),
    Koefisien = round(koef_multinom[kelas, ], 4),
    Std_Error = round(se_multinom[kelas, ],   4),
    Z_Score   = round(z_scores[kelas, ],      4),
    P_Value   = round(p_values[kelas, ],      4)
  )
  df_kelas$Signifikan <- ifelse(df_kelas$P_Value < 0.001, "***",
                         ifelse(df_kelas$P_Value < 0.01,  "**",
                         ifelse(df_kelas$P_Value < 0.05,  "*",
                         ifelse(df_kelas$P_Value < 0.10,  ".",  ""))))
  hasil_list[[kelas]] <- df_kelas
}

tabel_sig <- bind_rows(hasil_list)
rownames(tabel_sig) <- NULL

cat("Keterangan: *** p<0.001 | ** p<0.01 | * p<0.05 | . p<0.10\n\n")
## Keterangan: *** p<0.001 | ** p<0.01 | * p<0.05 | . p<0.10
kable(tabel_sig, row.names = FALSE,
      caption = paste("Tabel Signifikansi Koefisien Multinom",
                      "(Referensi: Kelas Rendah)")) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "bordered")) %>%
  column_spec(7, bold = TRUE, color = "darkred") %>%
  row_spec(which(tabel_sig$P_Value < 0.05), background = "#fff3cd") %>%
  collapse_rows(columns = 1, valign = "top")
Tabel Signifikansi Koefisien Multinom (Referensi: Kelas Rendah)
Kelas Variabel Koefisien Std_Error Z_Score P_Value Signifikan
Menengah (Intercept) -85.9041 27.9606 -3.0723 0.0021 **
Pengeluaran non Makanan 8.5771 6.1950 1.3845 0.1662
Total Pengeluaran Rumahtangga 17.2503 6.7139 2.5693 0.0102
Pengeluaran perkapita 47.0124 15.9606 2.9455 0.0032 **
Nilai Kalori Perkapita 121.2738 40.3354 3.0066 0.0026 **
Nilai Protein Perkapita 114.7938 37.1746 3.0880 0.0020 **
Nilai Lemak Perkapita 87.8156 28.5599 3.0748 0.0021 **
Nilai Karbohidrat Perkapita 114.0733 37.1980 3.0667 0.0022 **
Tinggi (Intercept) -834.4796 261.5384 -3.1907 0.0014 **
Pengeluaran non Makanan 160.0141 62.8723 2.5451 0.0109
Total Pengeluaran Rumahtangga 253.4155 84.8542 2.9865 0.0028 **
Pengeluaran perkapita 223.1566 71.5492 3.1189 0.0018 **
Nilai Kalori Perkapita 146.7272 60.1596 2.4390 0.0147
Nilai Protein Perkapita 65.2542 27.7504 2.3515 0.0187
Nilai Lemak Perkapita 102.0842 36.6229 2.7874 0.0053 **
Nilai Karbohidrat Perkapita 46.2743 29.4516 1.5712 0.1161

Odds Ratio (Exp Koefisien)

odds_ratio <- exp(koef_multinom)

cat("ODDS RATIO (exp(koefisien))\n")
## ODDS RATIO (exp(koefisien))
cat("Interpretasi: OR > 1 -> meningkatkan peluang kelas vs Rendah\n")
## Interpretasi: OR > 1 -> meningkatkan peluang kelas vs Rendah
cat("              OR < 1 -> menurunkan peluang kelas vs Rendah\n\n")
##               OR < 1 -> menurunkan peluang kelas vs Rendah
kable(round(odds_ratio, 4),
      caption = "Odds Ratio Koefisien Multinomial Logistic Regression") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "bordered")) %>%
  column_spec(1, bold = TRUE, background = "#f5f5f5")
Odds Ratio Koefisien Multinomial Logistic Regression
(Intercept) Pengeluaran non Makanan Total Pengeluaran Rumahtangga Pengeluaran perkapita Nilai Kalori Perkapita Nilai Protein Perkapita Nilai Lemak Perkapita Nilai Karbohidrat Perkapita
Menengah 0 5.308650e+03 3.102456e+07 2.613402e+20 4.661469e+52 7.149787e+49 1.373572e+38 3.478623e+49
Tinggi 0 3.113386e+69 1.140183e+110 8.235606e+96 5.282212e+63 2.185467e+28 2.160702e+44 1.249341e+20

Perbandingan Koefisien: LDA vs Multinom

koef_lda_df <- data.frame(
  Variabel    = rownames(lda_model$scaling),
  LDA_LD1     = round(lda_model$scaling[, "LD1"], 4),
  LDA_LD2     = round(lda_model$scaling[, "LD2"], 4)
)

koef_multinom_no_int <- koef_multinom[, -1]   #

koef_multi_df <- data.frame(
  Variabel            = colnames(koef_multinom_no_int),
  Multinom_Menengah   = round(koef_multinom_no_int["Menengah", ], 4),
  Multinom_Tinggi     = round(koef_multinom_no_int["Tinggi",   ], 4)
)

koef_lda_df$Variabel <- gsub("`", "", koef_lda_df$Variabel)

perbandingan <- merge(koef_lda_df, koef_multi_df, by = "Variabel", all = TRUE)
perbandingan <- perbandingan[order(abs(perbandingan$LDA_LD1), decreasing = TRUE), ]
rownames(perbandingan) <- NULL

kable(perbandingan,
      caption = "Perbandingan Koefisien LDA vs Multinomial Logistic Regression",
      col.names = c("Variabel", "LDA LD1", "LDA LD2",
                    "Multinom (Menengah vs Rendah)",
                    "Multinom (Tinggi vs Rendah)")) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "bordered")) %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(2:3, background = "#e8f4f8") %>%
  column_spec(4:5, background = "#fef9e7")
Perbandingan Koefisien LDA vs Multinomial Logistic Regression
Variabel LDA LD1 LDA LD2 Multinom (Menengah vs Rendah) Multinom (Tinggi vs Rendah)
Nilai Kalori Perkapita 0.5841 -0.6520 NA NA
Nilai Karbohidrat Perkapita 0.3234 -0.1561 NA NA
Nilai Protein Perkapita 0.3118 -0.2275 NA NA
Pengeluaran non Makanan 0.2940 0.4023 NA NA
Nilai Lemak Perkapita 0.2444 0.0423 NA NA
Pengeluaran perkapita 0.2196 0.6401 NA NA
Total Pengeluaran Rumahtangga 0.1797 0.3300 NA NA
Nilai Kalori Perkapita NA NA 121.2738 146.7272
Nilai Karbohidrat Perkapita NA NA 114.0733 46.2743
Nilai Lemak Perkapita NA NA 87.8156 102.0842
Nilai Protein Perkapita NA NA 114.7938 65.2542
Pengeluaran non Makanan NA NA 8.5771 160.0141
Pengeluaran perkapita NA NA 47.0124 223.1566
Total Pengeluaran Rumahtangga NA NA 17.2503 253.4155

Visualisasi Perbandingan Koefisien LDA vs Multinom

library(ggplot2)
library(tidyr)

# Format panjang untuk plot
perbandingan_long <- perbandingan %>%
  pivot_longer(cols      = -Variabel,
               names_to  = "Model_Fungsi",
               values_to = "Koefisien") %>%
  mutate(
    Model = case_when(
      grepl("LDA",      Model_Fungsi) ~ "LDA",
      grepl("Multinom", Model_Fungsi) ~ "Multinom"
    ),
    Fungsi = case_when(
      Model_Fungsi == "LDA_LD1"               ~ "LD1",
      Model_Fungsi == "LDA_LD2"               ~ "LD2",
      Model_Fungsi == "Multinom_Menengah"      ~ "Menengah vs Rendah",
      Model_Fungsi == "Multinom_Tinggi"        ~ "Tinggi vs Rendah"
    )
  )

ggplot(perbandingan_long,
       aes(x = reorder(Variabel, abs(Koefisien)),
           y = Koefisien,
           fill = Model)) +
  geom_col(position = "dodge", alpha = 0.85, color = "white") +
  coord_flip() +
  facet_wrap(~ Fungsi, scales = "free_x", ncol = 2) +
  scale_fill_manual(values = c("LDA" = "#3498db", "Multinom" = "#e67e22")) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray40") +
  theme_minimal(base_size = 12) +
  labs(
    title    = "Perbandingan Koefisien: LDA vs Multinomial Logistic Regression",
    subtitle = "Biru = LDA | Oranye = Multinom",
    x        = "Variabel",
    y        = "Koefisien",
    fill     = "Model"
  ) +
  theme(
    plot.title    = element_text(face = "bold"),
    legend.position = "top",
    strip.text    = element_text(face = "bold", size = 11)
  )
## Warning: Removed 28 rows containing missing values or values outside the scale range
## (`geom_col()`).


FASE 6 — Evaluasi & Validasi Model

Prediksi Kelas pada Data Testing

library(caret)
library(kableExtra)
library(ggplot2)
library(dplyr)
library(tidyr)

pred_lda  <- predict(lda_model, newdata = test_data)
kelas_pred_lda  <- pred_lda$class
kelas_aktual    <- test_data$Kelas

# PREDIKSI MULTINOM
pred_multinom       <- predict(multinom_model, newdata = test_data, type = "class")
kelas_pred_multinom <- factor(pred_multinom,
                               levels = c("Rendah", "Menengah", "Tinggi"))

# Tabel perbandingan aktual vs prediksi
perbandingan_pred <- data.frame(
  Aktual        = kelas_aktual,
  Pred_LDA      = kelas_pred_lda,
  Pred_Multinom = kelas_pred_multinom,
  Cocok_LDA     = ifelse(kelas_aktual == kelas_pred_lda,      "Benar", "Salah"),
  Cocok_Multinom= ifelse(kelas_aktual == kelas_pred_multinom, "Salah", "Benar")
)

cat("PREVIEW HASIL PREDIKSI (20 baris pertama)\n\n")
## PREVIEW HASIL PREDIKSI (20 baris pertama)
kable(head(perbandingan_pred, 20), row.names = TRUE,
      caption = "Perbandingan Kelas Aktual vs Prediksi (LDA & Multinom)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "bordered")) %>%
  column_spec(4, bold = TRUE) %>%
  column_spec(5, bold = TRUE)
Perbandingan Kelas Aktual vs Prediksi (LDA & Multinom)
Aktual Pred_LDA Pred_Multinom Cocok_LDA Cocok_Multinom
1 Rendah Rendah Rendah Benar Salah
2 Tinggi Tinggi Tinggi Benar Salah
3 Rendah Rendah Rendah Benar Salah
4 Rendah Rendah Rendah Benar Salah
5 Rendah Rendah Rendah Benar Salah
6 Menengah Menengah Menengah Benar Salah
7 Rendah Rendah Rendah Benar Salah
8 Tinggi Rendah Tinggi Salah Salah
9 Rendah Rendah Rendah Benar Salah
10 Rendah Rendah Rendah Benar Salah
11 Rendah Rendah Rendah Benar Salah
12 Tinggi Tinggi Tinggi Benar Salah
13 Menengah Rendah Menengah Salah Salah
14 Rendah Rendah Rendah Benar Salah
15 Rendah Rendah Rendah Benar Salah
16 Rendah Rendah Rendah Benar Salah
17 Rendah Rendah Rendah Benar Salah
18 Rendah Rendah Rendah Benar Salah
19 Menengah Menengah Menengah Benar Salah
20 Menengah Menengah Menengah Benar Salah

Confusion Matrix

Confusion Matrix — LDA

cm_lda <- confusionMatrix(
  data      = kelas_pred_lda,
  reference = kelas_aktual,
  positive  = NULL
)

cat("CONFUSION MATRIX — LDA\n\n")
## CONFUSION MATRIX — LDA
print(cm_lda)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Rendah Menengah Tinggi
##   Rendah      635       59      5
##   Menengah      0      254      6
##   Tinggi        0        0     44
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9302          
##                  95% CI : (0.9126, 0.9452)
##     No Information Rate : 0.6331          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8532          
##                                           
##  Mcnemar's Test P-Value : 4.268e-15       
## 
## Statistics by Class:
## 
##                      Class: Rendah Class: Menengah Class: Tinggi
## Sensitivity                 1.0000          0.8115       0.80000
## Specificity                 0.8261          0.9913       1.00000
## Pos Pred Value              0.9084          0.9769       1.00000
## Neg Pred Value              1.0000          0.9206       0.98853
## Prevalence                  0.6331          0.3121       0.05484
## Detection Rate              0.6331          0.2532       0.04387
## Detection Prevalence        0.6969          0.2592       0.04387
## Balanced Accuracy           0.9130          0.9014       0.90000

Confusion Matrix — Multinomial Logistic Regression

cm_multinom <- confusionMatrix(
  data      = kelas_pred_multinom,
  reference = kelas_aktual,
  positive  = NULL
)

cat("CONFUSION MATRIX — MULTINOMIAL LOGISTIC REGRESSION\n\n")
## CONFUSION MATRIX — MULTINOMIAL LOGISTIC REGRESSION
print(cm_multinom)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Rendah Menengah Tinggi
##   Rendah      635        1      0
##   Menengah      0      312      3
##   Tinggi        0        0     52
## 
## Overall Statistics
##                                           
##                Accuracy : 0.996           
##                  95% CI : (0.9898, 0.9989)
##     No Information Rate : 0.6331          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.992           
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Rendah Class: Menengah Class: Tinggi
## Sensitivity                 1.0000          0.9968       0.94545
## Specificity                 0.9973          0.9957       1.00000
## Pos Pred Value              0.9984          0.9905       1.00000
## Neg Pred Value              1.0000          0.9985       0.99685
## Prevalence                  0.6331          0.3121       0.05484
## Detection Rate              0.6331          0.3111       0.05184
## Detection Prevalence        0.6341          0.3141       0.05184
## Balanced Accuracy           0.9986          0.9962       0.97273

Visualisasi Confusion Matrix

Heatmap Confusion Matrix — LDA

cm_lda_df <- as.data.frame(cm_lda$table)
names(cm_lda_df) <- c("Prediksi", "Aktual", "Frekuensi")

ggplot(cm_lda_df, aes(x = Prediksi, y = Aktual, fill = Frekuensi)) +
  geom_tile(color = "white", linewidth = 0.8) +
  geom_text(aes(label = Frekuensi), size = 7, fontface = "bold", color = "white") +
  scale_fill_gradient(low = "#AED6F1", high = "#1A5276") +
  scale_x_discrete(limits = c("Rendah", "Menengah", "Tinggi")) +
  scale_y_discrete(limits = rev(c("Rendah", "Menengah", "Tinggi"))) +
  theme_minimal(base_size = 13) +
  labs(
    title    = "Confusion Matrix — LDA",
    subtitle = sprintf("Akurasi: %.2f%%", cm_lda$overall["Accuracy"] * 100),
    x        = "Kelas Prediksi",
    y        = "Kelas Aktual",
    fill     = "Jumlah"
  ) +
  theme(
    plot.title    = element_text(face = "bold"),
    axis.text     = element_text(size = 12),
    panel.grid    = element_blank()
  )

Heatmap Confusion Matrix — Multinom

cm_multinom_df <- as.data.frame(cm_multinom$table)
names(cm_multinom_df) <- c("Prediksi", "Aktual", "Frekuensi")

ggplot(cm_multinom_df, aes(x = Prediksi, y = Aktual, fill = Frekuensi)) +
  geom_tile(color = "white", linewidth = 0.8) +
  geom_text(aes(label = Frekuensi), size = 7, fontface = "bold", color = "white") +
  scale_fill_gradient(low = "#A9DFBF", high = "#1E8449") +
  scale_x_discrete(limits = c("Rendah", "Menengah", "Tinggi")) +
  scale_y_discrete(limits = rev(c("Rendah", "Menengah", "Tinggi"))) +
  theme_minimal(base_size = 13) +
  labs(
    title    = "Confusion Matrix — Multinomial Logistic Regression",
    subtitle = sprintf("Akurasi: %.2f%%", cm_multinom$overall["Accuracy"] * 100),
    x        = "Kelas Prediksi",
    y        = "Kelas Aktual",
    fill     = "Jumlah"
  ) +
  theme(
    plot.title    = element_text(face = "bold"),
    axis.text     = element_text(size = 12),
    panel.grid    = element_blank()
  )


Akurasi, Sensitivitas, dan Spesifisitas

ekstrak_metrik <- function(cm_obj, model_name) {
  byclass <- as.data.frame(cm_obj$byClass)
  
  kelas_names <- gsub("Class: ", "", rownames(byclass))
  
  data.frame(
    Model        = model_name,
    Kelas        = kelas_names,
    Sensitivitas = round(byclass$Sensitivity,  4),
    Spesifisitas = round(byclass$Specificity,  4),
    Precision    = round(byclass$`Pos Pred Value`, 4),
    F1_Score     = round(byclass$F1,           4),
    Balanced_Acc = round(byclass$`Balanced Accuracy`, 4)
  )
}

metrik_lda      <- ekstrak_metrik(cm_lda,      "LDA")
metrik_multinom <- ekstrak_metrik(cm_multinom, "Multinom")
metrik_gabungan <- bind_rows(metrik_lda, metrik_multinom)
rownames(metrik_gabungan) <- NULL

cat("METRIK EVALUASI PER KELAS\n\n")
## METRIK EVALUASI PER KELAS
kable(metrik_gabungan, row.names = FALSE,
      caption = "Akurasi, Sensitivitas, dan Spesifisitas per Kelas") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "bordered")) %>%
  column_spec(1, bold = TRUE, background = "#f0f0f0") %>%
  column_spec(3:4, bold = TRUE) %>%
  collapse_rows(columns = 1, valign = "middle")
Akurasi, Sensitivitas, dan Spesifisitas per Kelas
Model Kelas Sensitivitas Spesifisitas Precision F1_Score Balanced_Acc
LDA Rendah 1.0000 0.8261 0.9084 0.9520 0.9130
Menengah 0.8115 0.9913 0.9769 0.8866 0.9014
Tinggi 0.8000 1.0000 1.0000 0.8889 0.9000
Multinom Rendah 1.0000 0.9973 0.9984 0.9992 0.9986
Menengah 0.9968 0.9957 0.9905 0.9936 0.9962
Tinggi 0.9455 1.0000 1.0000 0.9720 0.9727

Akurasi Keseluruhan

akurasi_lda      <- round(cm_lda$overall["Accuracy"]      * 100, 2)
akurasi_multinom <- round(cm_multinom$overall["Accuracy"] * 100, 2)

kappa_lda      <- round(cm_lda$overall["Kappa"],      4)
kappa_multinom <- round(cm_multinom$overall["Kappa"], 4)

overall_tbl <- data.frame(
  Model    = c("LDA", "Multinomial Logistic Regression"),
  Akurasi  = c(paste0(akurasi_lda, "%"),
               paste0(akurasi_multinom, "%")),
  Kappa    = c(kappa_lda, kappa_multinom),
  Interpretasi_Kappa = c(
    ifelse(kappa_lda      >= 0.8, "Sangat Baik",
    ifelse(kappa_lda      >= 0.6, "Baik",
    ifelse(kappa_lda      >= 0.4, "Cukup", "Lemah"))),
    ifelse(kappa_multinom >= 0.8, "Sangat Baik",
    ifelse(kappa_multinom >= 0.6, "Baik",
    ifelse(kappa_multinom >= 0.4, "Cukup", "Lemah")))
  )
)

kable(overall_tbl, row.names = FALSE,
      caption = "Akurasi Keseluruhan dan Cohen's Kappa") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "bordered")) %>%
  column_spec(2:3, bold = TRUE) %>%
  column_spec(4, bold = TRUE)
Akurasi Keseluruhan dan Cohen’s Kappa
Model Akurasi Kappa Interpretasi_Kappa
LDA 93.02% 0.8532 Sangat Baik
Multinomial Logistic Regression 99.6% 0.9920 Sangat Baik

metrik_plot <- metrik_gabungan %>%
  dplyr::select(Model, Kelas, Sensitivitas, Spesifisitas) %>%
  pivot_longer(cols = c("Sensitivitas", "Spesifisitas"),
               names_to  = "Metrik",
               values_to = "Nilai")

ggplot(metrik_plot, aes(x = Kelas, y = Nilai, fill = Model)) +
  geom_col(position = "dodge", alpha = 0.85, color = "white") +
  geom_text(aes(label = sprintf("%.3f", Nilai)),
            position = position_dodge(width = 0.9),
            vjust = -0.4, size = 3.5, fontface = "bold") +
  facet_wrap(~ Metrik) +
  scale_fill_manual(values = c("LDA" = "#3498db", "Multinom" = "#e67e22")) +
  scale_y_continuous(limits = c(0, 1.12), labels = scales::percent_format()) +
  theme_minimal(base_size = 13) +
  labs(
    title = "Sensitivitas & Spesifisitas per Kelas: LDA vs Multinom",
    x     = "Kelas",
    y     = "Nilai",
    fill  = "Model"
  ) +
  theme(
    plot.title      = element_text(face = "bold"),
    legend.position = "top",
    strip.text      = element_text(face = "bold", size = 12)
  )

heatmap Metrik Evaluasi per Kelas

metrik_heatmap <- metrik_gabungan %>%
  dplyr::select(Model, Kelas, Sensitivitas, Spesifisitas, Precision, F1_Score) %>%
  pivot_longer(cols = c("Sensitivitas", "Spesifisitas", "Precision", "F1_Score"),
               names_to  = "Metrik",
               values_to = "Nilai") %>%
  mutate(
    Nilai = replace_na(Nilai, 0),
    Metrik = factor(Metrik, levels = c("Sensitivitas", "Spesifisitas",
                                        "Precision", "F1_Score")),
    Kelas  = factor(Kelas,  levels = c("Rendah", "Menengah", "Tinggi")),
    Label  = sprintf("%.3f", Nilai)
  )

ggplot(metrik_heatmap, aes(x = Kelas, y = Metrik, fill = Nilai)) +
  geom_tile(color = "white", linewidth = 1.2) +
  geom_text(aes(label = Label,
                color  = ifelse(Nilai < 0.5, "black", "white")),
            size = 4.5, fontface = "bold") +
  scale_fill_gradient2(
    low      = "#E74C3C",
    mid      = "#F39C12",
    high     = "#1A5276",
    midpoint = 0.75,
    limits   = c(0, 1),
    labels   = scales::percent_format()
  ) +
  scale_color_identity() +
  facet_wrap(~ Model, ncol = 2) +
  theme_minimal(base_size = 13) +
  labs(
    title    = "Heatmap Metrik Evaluasi per Kelas",
    subtitle = "Perbandingan LDA vs Multinomial Logistic Regression",
    x        = "Kelas",
    y        = "Metrik Evaluasi",
    fill     = "Nilai"
  ) +
  theme(
    plot.title       = element_text(face = "bold", size = 14, hjust = 0.5),
    plot.subtitle    = element_text(size = 10, hjust = 0.5, color = "gray40"),
    legend.position  = "right",
    legend.key.height = unit(1.5, "cm"),
    strip.text       = element_text(face = "bold", size = 12,
                                    margin = margin(5, 5, 5, 5)),,
    axis.text.x      = element_text(size = 11, face = "bold"),
    axis.text.y      = element_text(size = 11),
    panel.grid       = element_blank(),
    panel.spacing    = unit(2, "lines")
  )


Apparent Error Rate (AER) dan Cross-validated Error Rate (CVER)

Apparent Error Rate (AER)

# AER LDA
pred_train_lda   <- predict(lda_model, newdata = train_data)$class
aer_lda          <- mean(pred_train_lda != train_data$Kelas)

# AER Multinom
pred_train_multi <- predict(multinom_model, newdata = train_data, type = "class")
pred_train_multi <- factor(pred_train_multi, levels = c("Rendah", "Menengah", "Tinggi"))
aer_multinom     <- mean(pred_train_multi != train_data$Kelas)

cat("APPARENT ERROR RATE (AER)\n\n")
## APPARENT ERROR RATE (AER)
cat(sprintf("AER — LDA               : %.4f  (%.2f%%)\n",
            aer_lda,      aer_lda      * 100))
## AER — LDA               : 0.0573  (5.73%)
cat(sprintf("AER — Multinom          : %.4f  (%.2f%%)\n",
            aer_multinom, aer_multinom * 100))
## AER — Multinom          : 0.0000  (0.00%)

Cross-validated Error Rate (CVER) — k-Fold Cross Validation

set.seed(42)

df_cv <- rbind(train_data, test_data)
df_cv$Kelas <- factor(df_cv$Kelas, levels = c("Rendah", "Menengah", "Tinggi"))

K <- 10   # 10-fold
folds <- createFolds(df_cv$Kelas, k = K, list = TRUE, returnTrain = FALSE)

#CVER LDA
err_lda_cv <- sapply(seq_len(K), function(i) {
  idx_test  <- folds[[i]]
  idx_train <- setdiff(seq_len(nrow(df_cv)), idx_test)

  tryCatch({
    formula_tmp <- as.formula(
      paste("Kelas ~",
            paste(paste0("`", setdiff(names(df_cv), "Kelas"), "`"),
                  collapse = " + "))
    )
    model_tmp  <- lda(formula_tmp, data = df_cv[idx_train, ])
    pred_tmp   <- predict(model_tmp, newdata = df_cv[idx_test, ])$class
    mean(pred_tmp != df_cv$Kelas[idx_test])
  }, error = function(e) NA)
})

cver_lda <- mean(err_lda_cv, na.rm = TRUE)

#CVER Multinom
err_multi_cv <- sapply(seq_len(K), function(i) {
  idx_test  <- folds[[i]]
  idx_train <- setdiff(seq_len(nrow(df_cv)), idx_test)

  tryCatch({
    train_tmp <- df_cv[idx_train, ]
    train_tmp$Kelas <- relevel(train_tmp$Kelas, ref = "Rendah")
    model_tmp <- multinom(Kelas ~ ., data = train_tmp,
                          maxit = 500, trace = FALSE)
    pred_tmp  <- factor(predict(model_tmp, newdata = df_cv[idx_test, ],
                                type = "class"),
                        levels = c("Rendah", "Menengah", "Tinggi"))
    mean(pred_tmp != df_cv$Kelas[idx_test])
  }, error = function(e) NA)
})

cver_multinom <- mean(err_multi_cv, na.rm = TRUE)

cat("CROSS-VALIDATED ERROR RATE (10-Fold CV)\n\n")
## CROSS-VALIDATED ERROR RATE (10-Fold CV)
cat(sprintf("CVER — LDA     : %.4f  (%.2f%%)\n",
            cver_lda,      cver_lda      * 100))
## CVER — LDA     : 0.0616  (6.16%)
cat(sprintf("CVER — Multinom: %.4f  (%.2f%%)\n",
            cver_multinom, cver_multinom * 100))
## CVER — Multinom: 0.0020  (0.20%)

Rekapitulasi AER vs CVER vs Testing Error Rate

# Testing Error Rate
ter_lda      <- 1 - cm_lda$overall["Accuracy"]
ter_multinom <- 1 - cm_multinom$overall["Accuracy"]

error_summary <- data.frame(
  Model = c("LDA", "Multinomial Logistic Regression"),
  AER   = round(c(aer_lda, aer_multinom) * 100, 2),
  CVER  = round(c(cver_lda, cver_multinom) * 100, 2),
  TER   = round(c(ter_lda, ter_multinom) * 100, 2),
  Akurasi_Testing = round(c(cm_lda$overall["Accuracy"],
                             cm_multinom$overall["Accuracy"]) * 100, 2)
)
names(error_summary) <- c("Model", "AER (%)", "CVER 10-Fold (%)",
                           "Testing Error Rate (%)", "Akurasi Testing (%)")
rownames(error_summary) <- NULL

kable(error_summary, row.names = FALSE,
      caption = "Rekapitulasi Apparent Error Rate, CVER, dan Testing Error Rate") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "bordered")) %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(2:4, bold = TRUE) %>%
  column_spec(5, bold = TRUE)
Rekapitulasi Apparent Error Rate, CVER, dan Testing Error Rate
Model AER (%) CVER 10-Fold (%) Testing Error Rate (%) Akurasi Testing (%)
LDA 5.73 6.16 6.98 93.02
Multinomial Logistic Regression 0.00 0.20 0.40 99.60

Visualisasi Error Rate: AER vs CVER vs Testing

error_long <- error_summary %>%
  dplyr::select(Model, `AER (%)`, `CVER 10-Fold (%)`, `Testing Error Rate (%)`) %>%
  pivot_longer(cols = -Model,
               names_to  = "Jenis_Error",
               values_to = "Error_Rate")

ggplot(error_long, aes(x = Model, y = Error_Rate, fill = Jenis_Error)) +
  geom_col(position = "dodge", alpha = 0.85, color = "white", width = 0.6) +
  geom_text(aes(label = sprintf("%.2f%%", Error_Rate)),
            position = position_dodge(width = 0.6),
            vjust = -0.5, size = 4, fontface = "bold") +
  scale_fill_manual(values = c(
    "AER (%)"                = "#E74C3C",
    "CVER 10-Fold (%)"       = "#F39C12",
    "Testing Error Rate (%)" = "#8E44AD"
  )) +
  scale_y_continuous(limits = c(0, max(error_long$Error_Rate) * 1.3),
                     labels = function(x) paste0(x, "%")) +
  theme_minimal(base_size = 13) +
  labs(
    title    = "Perbandingan Error Rate: AER vs CVER vs Testing Error Rate",
    subtitle = "Nilai lebih rendah = Model lebih baik",
    x        = "Model",
    y        = "Error Rate (%)",
    fill     = "Jenis Error"
  ) +
  theme(
    plot.title      = element_text(face = "bold"),
    legend.position = "top"
  )


Visualisasi Akurasi per Fold (Cross Validation)

cv_fold_df <- data.frame(
  Fold    = rep(1:K, 2),
  Model   = rep(c("LDA", "Multinom"), each = K),
  Error   = c(err_lda_cv * 100, err_multi_cv * 100),
  Akurasi = c((1 - err_lda_cv) * 100, (1 - err_multi_cv) * 100)
)

ggplot(cv_fold_df, aes(x = factor(Fold), y = Akurasi,
                        color = Model, group = Model)) +
  geom_line(linewidth = 1.2, alpha = 0.8) +
  geom_point(size = 3.5) +
  geom_text(aes(label = sprintf("%.1f%%", Akurasi)),
            vjust = -0.8, size = 3.2, fontface = "bold") +
  geom_hline(aes(yintercept = mean(Akurasi), color = Model),
             linetype = "dashed", linewidth = 0.8, alpha = 0.6,
             data = cv_fold_df %>% group_by(Model) %>%
               summarise(Akurasi = mean(Akurasi, na.rm = TRUE))) +
  scale_color_manual(values = c("LDA" = "#3498db", "Multinom" = "#e67e22")) +
  scale_y_continuous(limits = c(
    max(0, min(cv_fold_df$Akurasi, na.rm = TRUE) - 5), 105),
    labels = function(x) paste0(x, "%")) +
  theme_minimal(base_size = 13) +
  labs(
    title    = "Akurasi per Fold — 10-Fold Cross Validation",
    subtitle = "Garis putus-putus = rata-rata akurasi CV",
    x        = "Fold",
    y        = "Akurasi (%)",
    color    = "Model"
  ) +
  theme(
    plot.title      = element_text(face = "bold"),
    legend.position = "top"
  )