Preprocessing Data
Cek Missing Values
cat("=== CEK MISSING VALUES ===\n")
## === CEK MISSING VALUES ===
missing_summary <- data.frame(
Variabel = colnames(df),
Jumlah_NA = colSums(is.na(df)),
Persentase_NA = round(colMeans(is.na(df)) * 100, 2)
)
kable(missing_summary, row.names = FALSE,
caption = "Ringkasan Missing Values per Variabel") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
Ringkasan Missing Values per Variabel
|
Variabel
|
Jumlah_NA
|
Persentase_NA
|
|
Pengeluaran non Makanan
|
0
|
0
|
|
Total Pengeluaran Rumahtangga
|
0
|
0
|
|
Pengeluaran perkapita
|
0
|
0
|
|
Nilai Kalori Perkapita
|
0
|
0
|
|
Nilai Protein Perkapita
|
0
|
0
|
|
Nilai Lemak Perkapita
|
0
|
0
|
|
Nilai Karbohidrat Perkapita
|
0
|
0
|
# Cek apakah ada missing values
total_missing <- sum(is.na(df))
if (total_missing == 0) {
cat("Tidak ada missing values. Dataset siap diproses.\n")
} else {
cat("Ditemukan", total_missing, "missing values. Menghapus baris yang mengandung NA...\n")
df <- df %>% na.omit()
cat("Dataset setelah pembersihan:", nrow(df), "baris.\n")
}
## Tidak ada missing values. Dataset siap diproses.
Cek Outlier dengan Boxplot
library(tidyr)
df_long <- df %>%
pivot_longer(cols = everything(),
names_to = "Variabel",
values_to = "Nilai")
ggplot(df_long, aes(x = Variabel, y = Nilai, fill = Variabel)) +
geom_boxplot(outlier.colour = "red", outlier.shape = 16,
outlier.size = 1.5, alpha = 0.7) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 30, hjust = 1, size = 8),
legend.position = "none"
) +
labs(title = "Boxplot Deteksi Outlier per Variabel",
x = "Variabel", y = "Nilai")

outlier_summary <- sapply(df, function(x) {
z <- abs(scale(x))
sum(z > 3)
})
outlier_df <- data.frame(
Variabel = names(outlier_summary),
Jumlah_Outlier = as.integer(outlier_summary),
Persentase = round(as.integer(outlier_summary) / nrow(df) * 100, 2)
)
kable(outlier_df, row.names = FALSE,
caption = "Jumlah Outlier per Variabel (|Z-score| > 3)") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
Jumlah Outlier per Variabel (|Z-score| > 3)
|
Variabel
|
Jumlah_Outlier
|
Persentase
|
|
Pengeluaran non Makanan
|
83
|
1.65
|
|
Total Pengeluaran Rumahtangga
|
82
|
1.63
|
|
Pengeluaran perkapita
|
100
|
1.99
|
|
Nilai Kalori Perkapita
|
16
|
0.32
|
|
Nilai Protein Perkapita
|
69
|
1.37
|
|
Nilai Lemak Perkapita
|
80
|
1.59
|
|
Nilai Karbohidrat Perkapita
|
42
|
0.84
|
Standardisasi Data
df_scaled <- as.data.frame(scale(df))
cat("HASIL STANDARDISASI\n")
## HASIL STANDARDISASI
verify <- data.frame(
Variabel = colnames(df_scaled),
Mean = round(colMeans(df_scaled), 6),
SD = round(apply(df_scaled, 2, sd), 6)
)
kable(verify, row.names = FALSE,
caption = "Rata-rata dan Standar Deviasi setelah Standardisasi") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
Rata-rata dan Standar Deviasi setelah Standardisasi
|
Variabel
|
Mean
|
SD
|
|
Pengeluaran non Makanan
|
0
|
1
|
|
Total Pengeluaran Rumahtangga
|
0
|
1
|
|
Pengeluaran perkapita
|
0
|
1
|
|
Nilai Kalori Perkapita
|
0
|
1
|
|
Nilai Protein Perkapita
|
0
|
1
|
|
Nilai Lemak Perkapita
|
0
|
1
|
|
Nilai Karbohidrat Perkapita
|
0
|
1
|
df_scaled_long <- df_scaled %>%
pivot_longer(cols = everything(),
names_to = "Variabel",
values_to = "Nilai")
ggplot(df_scaled_long, aes(x = Variabel, y = Nilai, fill = Variabel)) +
geom_boxplot(outlier.colour = "red", outlier.shape = 16,
outlier.size = 1.2, alpha = 0.7) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 30, hjust = 1, size = 8),
legend.position = "none"
) +
labs(title = "Boxplot Setelah Standardisasi (Z-score)",
x = "Variabel", y = "Nilai Terstandarisasi")

Uji Multikolinearitas (Korelasi Matrix)
cor_matrix <- cor(df_scaled)
corrplot(cor_matrix,
method = "color",
type = "upper",
addCoef.col = "black",
number.cex = 0.7,
tl.cex = 0.75,
tl.col = "black",
col = colorRampPalette(c("#d73027", "white", "#1a9850"))(200),
title = "Korelasi Matrix Antar Variabel",
mar = c(0, 0, 1, 0))

cat("TABEL KORELASI\n")
## TABEL KORELASI
kable(round(cor_matrix, 3),
caption = "Matriks Korelasi Pearson antar Variabel") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
scroll_box(width = "100%")
Matriks Korelasi Pearson antar Variabel
|
|
Pengeluaran non Makanan
|
Total Pengeluaran Rumahtangga
|
Pengeluaran perkapita
|
Nilai Kalori Perkapita
|
Nilai Protein Perkapita
|
Nilai Lemak Perkapita
|
Nilai Karbohidrat Perkapita
|
|
Pengeluaran non Makanan
|
1.000
|
0.957
|
0.798
|
0.129
|
0.208
|
0.288
|
0.013
|
|
Total Pengeluaran Rumahtangga
|
0.957
|
1.000
|
0.768
|
0.185
|
0.265
|
0.365
|
0.052
|
|
Pengeluaran perkapita
|
0.798
|
0.768
|
1.000
|
0.435
|
0.511
|
0.510
|
0.276
|
|
Nilai Kalori Perkapita
|
0.129
|
0.185
|
0.435
|
1.000
|
0.864
|
0.698
|
0.919
|
|
Nilai Protein Perkapita
|
0.208
|
0.265
|
0.511
|
0.864
|
1.000
|
0.722
|
0.709
|
|
Nilai Lemak Perkapita
|
0.288
|
0.365
|
0.510
|
0.698
|
0.722
|
1.000
|
0.446
|
|
Nilai Karbohidrat Perkapita
|
0.013
|
0.052
|
0.276
|
0.919
|
0.709
|
0.446
|
1.000
|
# Identifikasi pasangan dengan korelasi tinggi (|r| >= 0.8)
cat("PASANGAN VARIABEL DENGAN KORELASI TINGGI (|r| >= 0.8)\n")
## PASANGAN VARIABEL DENGAN KORELASI TINGGI (|r| >= 0.8)
cor_pairs <- which(abs(cor_matrix) >= 0.8 & upper.tri(cor_matrix), arr.ind = TRUE)
if (nrow(cor_pairs) == 0) {
cat("Tidak ada pasangan variabel dengan korelasi >= 0.8.\n")
} else {
hasil <- data.frame(
Variabel_1 = rownames(cor_matrix)[cor_pairs[, 1]],
Variabel_2 = colnames(cor_matrix)[cor_pairs[, 2]],
Korelasi = round(cor_matrix[cor_pairs], 3)
)
kable(hasil, row.names = FALSE,
caption = "Pasangan Variabel Berkorelasi Tinggi") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
}
Pasangan Variabel Berkorelasi Tinggi
|
Variabel_1
|
Variabel_2
|
Korelasi
|
|
Pengeluaran non Makanan
|
Total Pengeluaran Rumahtangga
|
0.957
|
|
Nilai Kalori Perkapita
|
Nilai Protein Perkapita
|
0.864
|
|
Nilai Kalori Perkapita
|
Nilai Karbohidrat Perkapita
|
0.919
|
Membuat Variabel Kelas (Label)
Elbow Method (Menentukan Jumlah Cluster Optimal)
set.seed(42)
wss <- sapply(1:10, function(k) {
kmeans(df_scaled, centers = k, nstart = 25, iter.max = 100)$tot.withinss
})
elbow_df <- data.frame(k = 1:10, WSS = wss)
ggplot(elbow_df, aes(x = k, y = WSS)) +
geom_line(color = "#2c7bb6", linewidth = 1.2) +
geom_point(color = "#d7191c", size = 3) +
scale_x_continuous(breaks = 1:10) +
theme_minimal() +
labs(
title = "Elbow Method - Within-Cluster Sum of Squares (WSS)",
x = "Jumlah Cluster (k)",
y = "Total WSS"
)

Silhouette Score
set.seed(42)
sil_scores <- sapply(2:10, function(k) {
km <- kmeans(df_scaled, centers = k, nstart = 25, iter.max = 100)
ss <- silhouette(km$cluster, dist(df_scaled))
mean(ss[, 3])
})
sil_df <- data.frame(k = 2:10, Silhouette = sil_scores)
ggplot(sil_df, aes(x = k, y = Silhouette)) +
geom_line(color = "#1a9850", linewidth = 1.2) +
geom_point(color = "#d7191c", size = 3) +
scale_x_continuous(breaks = 2:10) +
theme_minimal() +
labs(
title = "Silhouette Score per Jumlah Cluster",
x = "Jumlah Cluster (k)",
y = "Rata-rata Silhouette Score"
)

cat("Silhouette Score per k:\n")
## Silhouette Score per k:
kable(sil_df, digits = 4, caption = "Rata-rata Silhouette Score per Jumlah Cluster") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
Rata-rata Silhouette Score per Jumlah Cluster
|
k
|
Silhouette
|
|
2
|
0.4230
|
|
3
|
0.3875
|
|
4
|
0.2920
|
|
5
|
0.3116
|
|
6
|
0.3080
|
|
7
|
0.2616
|
|
8
|
0.2489
|
|
9
|
0.2527
|
|
10
|
0.2219
|
K-Means Clustering
set.seed(42)
kmeans_result <- kmeans(df_scaled, centers = 3, nstart = 25, iter.max = 100)
cat("HASIL K-MEANS CLUSTERING\n")
## HASIL K-MEANS CLUSTERING
cat("Ukuran tiap cluster:\n")
## Ukuran tiap cluster:
print(kmeans_result$size)
## [1] 3175 1567 277
cat("\nTotal Within-Cluster SS :", round(kmeans_result$tot.withinss, 2), "\n")
##
## Total Within-Cluster SS : 17715.88
cat("Between-Cluster SS :", round(kmeans_result$betweenss, 2), "\n")
## Between-Cluster SS : 17410.12
cat("Rasio Between/Total :", round(kmeans_result$betweenss / kmeans_result$totss * 100, 2), "%\n")
## Rasio Between/Total : 49.56 %
Visualisasi Cluster
fviz_cluster(kmeans_result, data = df_scaled,
geom = "point",
ellipse.type = "convex",
palette = c("#E74C3C", "#2ECC71", "#3498DB"),
ggtheme = theme_minimal(),
main = "Visualisasi Cluster K-Means (k = 3)")

Tambahkan Kolom Kelas ke Dataset
cluster_means_perkapita <- tapply(df$`Pengeluaran perkapita`, kmeans_result$cluster, mean)
cat("Rata-rata Pengeluaran perkapita per cluster (raw):\n")
## Rata-rata Pengeluaran perkapita per cluster (raw):
print(round(cluster_means_perkapita, 0))
## 1 2 3
## 982613 1709214 4661393
rank_order <- rank(cluster_means_perkapita)
label_map <- ifelse(rank_order == 1, "Rendah",
ifelse(rank_order == 2, "Menengah", "Tinggi"))
cat("\nMapping label cluster:\n")
##
## Mapping label cluster:
print(label_map)
## 1 2 3
## "Rendah" "Menengah" "Tinggi"
df$Kelas <- factor(label_map[kmeans_result$cluster],
levels = c("Rendah", "Menengah", "Tinggi"))
df_scaled$Kelas <- df$Kelas
cat("\nDistribusi label Kelas:\n")
##
## Distribusi label Kelas:
print(table(df$Kelas))
##
## Rendah Menengah Tinggi
## 3175 1567 277
Profil Tiap Cluster
profil_cluster <- df %>%
group_by(Kelas) %>%
summarise(across(where(is.numeric), ~ round(mean(.x), 2)), .groups = "drop")
kable(t(profil_cluster), caption = "Profil Rata-rata Tiap Cluster (Data Asli)") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
scroll_box(width = "100%")
Profil Rata-rata Tiap Cluster (Data Asli)
|
|
|
|
|
|
Kelas
|
Rendah
|
Menengah
|
Tinggi
|
|
Pengeluaran non Makanan
|
1810339
|
2165018
|
11109827
|
|
Total Pengeluaran Rumahtangga
|
3786087
|
4677145
|
16025072
|
|
Pengeluaran perkapita
|
982613.1
|
1709213.6
|
4661392.9
|
|
Nilai Kalori Perkapita
|
1892.62
|
3065.47
|
2736.34
|
|
Nilai Protein Perkapita
|
56.31
|
96.33
|
95.97
|
|
Nilai Lemak Perkapita
|
31.32
|
58.14
|
69.92
|
|
Nilai Karbohidrat Perkapita
|
303.13
|
476.69
|
378.04
|
Visualisasi Profil Cluster
profil_long <- profil_cluster %>%
pivot_longer(cols = -Kelas, names_to = "Variabel", values_to = "Rata_rata")
ggplot(profil_long, aes(x = Variabel, y = Rata_rata, fill = Kelas)) +
geom_bar(stat = "identity", position = "dodge", alpha = 0.85) +
scale_fill_manual(values = c("Rendah" = "#E74C3C",
"Menengah" = "#F39C12",
"Tinggi" = "#2ECC71")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 30, hjust = 1, size = 8)) +
labs(title = "Profil Rata-rata Variabel per Cluster",
x = "Variabel", y = "Rata-rata", fill = "Kelas")

Uji Asumsi LDA
library(MVN)
library(kableExtra)
library(dplyr)
df_num <- df_scaled[, sapply(df_scaled, is.numeric)]
kelas_list <- c("Rendah", "Menengah", "Tinggi")
mvn_ver <- as.numeric(gsub("^(\\d+)\\..*", "\\1", packageVersion("MVN")))
run_mvn <- function(data, test = "mardia") {
if (mvn_ver >= 6) {
mvn(data = data, mvn_test = test)
} else {
mvn(data = data, mvnTest = test, desc = FALSE, showOutliers = FALSE)
}
}
ekstrak_mardia <- function(hasil_mvn) {
tbl <- hasil_mvn$multivariateNormality
p_col <- grep("p.?value", names(tbl), ignore.case = TRUE, value = TRUE)[1]
idx_skew <- grep("Skewness", tbl$Test, ignore.case = TRUE)
idx_kurt <- grep("Kurtosis", tbl$Test, ignore.case = TRUE)
idx_mvn <- grep("^MVN$", tbl$Test, ignore.case = TRUE)
p_skew <- if (length(idx_skew) > 0) as.numeric(tbl[idx_skew, p_col]) else NA
p_kurt <- if (length(idx_kurt) > 0) as.numeric(tbl[idx_kurt, p_col]) else NA
mvn_ok <- if (length(idx_mvn) > 0) as.character(tbl[idx_mvn, "MVN"]) else "NO"
list(skewness = p_skew, kurtosis = p_kurt, normal = mvn_ok)
}
cat("MARDIA'S TEST NORMALITAS MULTIVARIAT \n\n")
## MARDIA'S TEST NORMALITAS MULTIVARIAT
hasil_mardia <- lapply(kelas_list, function(kls) {
subset_data <- df_num[df$Kelas == kls, ]
n_obs <- nrow(subset_data)
p_vars <- ncol(subset_data)
if (n_obs <= p_vars) {
cat(sprintf("!!! Kelas %s: n=%d <= p=%d. Uji dilewati.\n", kls, n_obs, p_vars))
return(list(kelas = kls, n = n_obs, skewness = NA, kurtosis = NA, normal = "N/A"))
}
res_mvn <- tryCatch(run_mvn(subset_data, test = "mardia"), error = function(e) NULL)
if (is.null(res_mvn)) {
return(list(kelas = kls, n = n_obs, skewness = NA, kurtosis = NA, normal = "ERR"))
}
res <- ekstrak_mardia(res_mvn)
list(kelas = kls, n = n_obs,
skewness = res$skewness, kurtosis = res$kurtosis, normal = res$normal)
})
ringkasan_mardia <- do.call(rbind, lapply(hasil_mardia, function(x) {
data.frame(
Kelas = x$kelas,
N = x$n,
p_Skewness = ifelse(is.na(x$skewness), NA, round(as.numeric(x$skewness), 4)),
p_Kurtosis = ifelse(is.na(x$kurtosis), NA, round(as.numeric(x$kurtosis), 4)),
Normal_MVN = x$normal
)
}))
kable(ringkasan_mardia, row.names = FALSE,
caption = "Ringkasan Mardia's Test per Kelas (alpha = 0.05)") %>%
kable_styling(bootstrap_options = c("striped", "hover", "bordered")) %>%
column_spec(5, bold = TRUE,
color = ifelse(ringkasan_mardia$Normal_MVN == "YES", "darkgreen",
ifelse(ringkasan_mardia$Normal_MVN == "N/A", "orange", "red")))
Ringkasan Mardia’s Test per Kelas (alpha = 0.05)
|
Kelas
|
N
|
p_Skewness
|
p_Kurtosis
|
Normal_MVN
|
|
Rendah
|
3175
|
NA
|
NA
|
NO
|
|
Menengah
|
1567
|
NA
|
NA
|
NO
|
|
Tinggi
|
277
|
NA
|
NA
|
NO
|
cat("INTERPRETASI MARDIA'S TEST\n\n")
## INTERPRETASI MARDIA'S TEST
for (i in seq_len(nrow(ringkasan_mardia))) {
row <- ringkasan_mardia[i, ]
stat <- if (row$Normal_MVN == "YES") "NORMAL" else "TIDAK NORMAL"
cat(sprintf(
"Kelas %-8s | N = %3d | p-Skewness = %s | p-Kurtosis = %s | -> %s\n",
row$Kelas, row$N,
ifelse(is.na(row$p_Skewness), " NA ", sprintf("%.4f", row$p_Skewness)),
ifelse(is.na(row$p_Kurtosis), " NA ", sprintf("%.4f", row$p_Kurtosis)),
stat
))
}
## Kelas Rendah | N = 3175 | p-Skewness = NA | p-Kurtosis = NA | -> TIDAK NORMAL
## Kelas Menengah | N = 1567 | p-Skewness = NA | p-Kurtosis = NA | -> TIDAK NORMAL
## Kelas Tinggi | N = 277 | p-Skewness = NA | p-Kurtosis = NA | -> TIDAK NORMAL
Henze-Zirkler Test
ekstrak_hz <- function(hasil_mvn) {
tbl <- hasil_mvn$multivariateNormality
p_col <- grep("p.?value", names(tbl), ignore.case = TRUE, value = TRUE)[1]
hz_col <- grep("^HZ$", names(tbl), ignore.case = TRUE, value = TRUE)[1]
hz_stat <- if (!is.null(hz_col) && !is.na(hz_col)) as.numeric(tbl[[hz_col]]) else NA
p_val <- if (!is.null(p_col) && !is.na(p_col)) as.numeric(tbl[[p_col]]) else NA
mvn_ok <- if ("MVN" %in% names(tbl)) as.character(tbl$MVN[1]) else "NO"
list(hz_stat = hz_stat, p_value = p_val, normal = mvn_ok)
}
cat("=== HENZE-ZIRKLER TEST NORMALITAS MULTIVARIAT ===\n\n")
## === HENZE-ZIRKLER TEST NORMALITAS MULTIVARIAT ===
hasil_hz <- lapply(kelas_list, function(kls) {
subset_data <- df_num[df$Kelas == kls, ]
n_obs <- nrow(subset_data)
p_vars <- ncol(subset_data)
if (n_obs <= p_vars) {
cat(sprintf("!!! Kelas %s: n=%d <= p=%d. Uji dilewati.\n", kls, n_obs, p_vars))
return(list(kelas = kls, n = n_obs, hz_stat = NA, p_value = NA, normal = "N/A"))
}
res_mvn <- tryCatch(run_mvn(subset_data, test = "hz"), error = function(e) NULL)
if (is.null(res_mvn)) {
return(list(kelas = kls, n = n_obs, hz_stat = NA, p_value = NA, normal = "ERR"))
}
res <- ekstrak_hz(res_mvn)
list(kelas = kls, n = n_obs,
hz_stat = res$hz_stat, p_value = res$p_value, normal = res$normal)
})
ringkasan_hz <- do.call(rbind, lapply(hasil_hz, function(x) {
data.frame(
Kelas = x$kelas,
N = x$n,
HZ_Stat = ifelse(is.na(x$hz_stat), NA, round(as.numeric(x$hz_stat), 4)),
p_Value = ifelse(is.na(x$p_value), NA, round(as.numeric(x$p_value), 4)),
Normal_MVN = x$normal
)
}))
kable(ringkasan_hz, row.names = FALSE,
caption = "Ringkasan Henze-Zirkler Test per Kelas (alpha = 0.05)") %>%
kable_styling(bootstrap_options = c("striped", "hover", "bordered")) %>%
column_spec(5, bold = TRUE,
color = ifelse(ringkasan_hz$Normal_MVN == "YES", "darkgreen",
ifelse(ringkasan_hz$Normal_MVN == "N/A", "orange", "red")))
Ringkasan Henze-Zirkler Test per Kelas (alpha = 0.05)
|
Kelas
|
N
|
HZ_Stat
|
p_Value
|
Normal_MVN
|
|
Rendah
|
3175
|
NA
|
NA
|
NO
|
|
Menengah
|
1567
|
NA
|
NA
|
NO
|
|
Tinggi
|
277
|
NA
|
NA
|
NO
|
cat("INTERPRETASI HENZE-ZIRKLER TEST\n\n")
## INTERPRETASI HENZE-ZIRKLER TEST
for (i in seq_len(nrow(ringkasan_hz))) {
row <- ringkasan_hz[i, ]
stat <- if (row$Normal_MVN == "YES") "NORMAL" else "TIDAK NORMAL"
cat(sprintf(
"Kelas %-8s | N = %3d | HZ-Stat = %s | p-value = %s | -> %s\n",
row$Kelas, row$N,
ifelse(is.na(row$HZ_Stat), " NA ", sprintf("%.4f", row$HZ_Stat)),
ifelse(is.na(row$p_Value), " NA ", sprintf("%.4f", row$p_Value)),
stat
))
}
## Kelas Rendah | N = 3175 | HZ-Stat = NA | p-value = NA | -> TIDAK NORMAL
## Kelas Menengah | N = 1567 | HZ-Stat = NA | p-value = NA | -> TIDAK NORMAL
## Kelas Tinggi | N = 277 | HZ-Stat = NA | p-value = NA | -> TIDAK NORMAL
QQ-Plot Multivariat (Mahalanobis Distance)
library(ggplot2)
par(mfrow = c(1, 3), mar = c(4, 4, 3, 1))
for (kls in kelas_list) {
subset_data <- df_num[df$Kelas == kls, ]
n_obs <- nrow(subset_data)
p_vars <- ncol(subset_data)
if (n_obs <= p_vars) {
plot.new()
title(main = paste("Kelas:", kls, "\n(Sampel terlalu kecil)"))
next
}
center <- colMeans(subset_data)
cov_mat <- cov(subset_data)
mah_dist <- tryCatch(
mahalanobis(subset_data, center = center, cov = cov_mat),
error = function(e) rep(NA, n_obs)
)
if (all(is.na(mah_dist))) {
plot.new()
title(main = paste("Kelas:", kls, "\n(Matriks singular)"))
next
}
chi_q <- qchisq(ppoints(n_obs), df = p_vars)
# Plot
qqplot(chi_q, sort(mah_dist),
main = paste("QQ-Plot Mahalanobis\nKelas:", kls),
xlab = expression("Kuantil " * chi^2 * " Teoretis"),
ylab = "Jarak Mahalanobis (terurut)",
pch = 16, col = "#3498DB", cex = 0.7)
abline(a = 0, b = 1, col = "#E74C3C", lwd = 2, lty = 2)
}

par(mfrow = c(1, 1))
Rekapitulasi Uji Normalitas Multivariat
rekapitulasi_norm <- data.frame(
Kelas = kelas_list,
N = ringkasan_mardia$N,
Mardia_MVN = ringkasan_mardia$Normal_MVN,
HZ_MVN = ringkasan_hz$Normal_MVN,
Kesimpulan = ifelse(
ringkasan_mardia$Normal_MVN == "YES" & ringkasan_hz$Normal_MVN == "YES",
"Normal Multivariat",
ifelse(
ringkasan_mardia$Normal_MVN %in% c("N/A", "ERR") |
ringkasan_hz$Normal_MVN %in% c("N/A", "ERR"),
"Tidak dapat diuji",
"Tidak Normal Multivariat"
)
)
)
kable(rekapitulasi_norm, row.names = FALSE,
caption = "Rekapitulasi Uji Normalitas Multivariat (Mardia & Henze-Zirkler)") %>%
kable_styling(bootstrap_options = c("striped", "hover", "bordered")) %>%
column_spec(5, bold = TRUE)
Rekapitulasi Uji Normalitas Multivariat (Mardia & Henze-Zirkler)
|
Kelas
|
N
|
Mardia_MVN
|
HZ_MVN
|
Kesimpulan
|
|
Rendah
|
3175
|
NO
|
NO
|
Tidak Normal Multivariat
|
|
Menengah
|
1567
|
NO
|
NO
|
Tidak Normal Multivariat
|
|
Tinggi
|
277
|
NO
|
NO
|
Tidak Normal Multivariat
|
Uji Homogenitas Matriks Kovarians (Box’s M Test)
library(biotools)
cat("BOX'S M TEST - HOMOGENITAS MATRIKS KOVARIANS\n\n")
## BOX'S M TEST - HOMOGENITAS MATRIKS KOVARIANS
boxm_result <- tryCatch({
boxM(data = df_num,
grouping = df$Kelas)
}, error = function(e) {
cat("ERROR saat menjalankan Box's M test:", conditionMessage(e), "\n")
NULL
})
if (!is.null(boxm_result)) {
print(boxm_result)
chi_stat <- round(boxm_result$statistic, 4)
df_test <- boxm_result$parameter["df"]
p_val <- round(boxm_result$p.value, 6)
keputusan <- ifelse(p_val > 0.05,
"Gagal Tolak H0 -> Matriks Kovarians HOMOGEN",
"Tolak H0 -> Matriks Kovarians TIDAK HOMOGEN")
cat(sprintf("\nChi-Squared Approx : %.4f\n", chi_stat))
cat(sprintf("Derajat Bebas (df) : %d\n", as.integer(df_test)))
cat(sprintf("p-value : %.6f\n", p_val))
cat(sprintf("Keputusan (alpha=0.05) : %s\n", keputusan))
}
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: df_num
## Chi-Sq (approx.) = 11918, df = 56, p-value < 2.2e-16
##
##
## Chi-Squared Approx : 11917.9773
## Derajat Bebas (df) : 56
## p-value : 0.000000
## Keputusan (alpha=0.05) : Tolak H0 -> Matriks Kovarians TIDAK HOMOGEN
if (!is.null(boxm_result)) {
boxm_tbl <- data.frame(
Statistik = c("Chi-Squared Approx", "Derajat Bebas", "p-value", "Keputusan (alpha=0.05)"),
Nilai = c(
sprintf("%.4f", as.numeric(boxm_result$statistic)),
sprintf("%d", as.integer(boxm_result$parameter["df"])),
sprintf("%.6f", boxm_result$p.value),
ifelse(boxm_result$p.value > 0.05,
"Gagal Tolak H0 - Kovarians HOMOGEN",
"Tolak H0 - Kovarians TIDAK HOMOGEN")
)
)
kable(boxm_tbl, row.names = FALSE,
caption = "Hasil Box's M Test Homogenitas Matriks Kovarians") %>%
kable_styling(bootstrap_options = c("striped", "hover", "bordered")) %>%
row_spec(4, bold = TRUE,
color = ifelse(boxm_result$p.value > 0.05, "darkgreen", "red"))
}
Hasil Box’s M Test Homogenitas Matriks Kovarians
|
Statistik
|
Nilai
|
|
Chi-Squared Approx
|
11917.9773
|
|
Derajat Bebas
|
56
|
|
p-value
|
0.000000
|
|
Keputusan (alpha=0.05)
|
Tolak H0 - Kovarians TIDAK HOMOGEN
|
Visualisasi Matriks Kovarians per Kelas
library(corrplot)
par(mfrow = c(1, 3), mar = c(1, 1, 3, 1))
for (kls in kelas_list) {
subset_data <- df_num[df$Kelas == kls, ]
if (nrow(subset_data) > ncol(subset_data)) {
cov_mat <- cov(subset_data)
cor_mat <- cov2cor(cov_mat) # konversi ke korelasi agar nilai dalam [-1, 1]
corrplot(cor_mat,
method = "color",
type = "upper",
tl.cex = 0.65,
tl.col = "black",
addCoef.col = "black",
number.cex = 0.5,
col = colorRampPalette(c("#d73027", "white", "#1a9850"))(200),
title = paste("Korelasi (dari Kovarians)\nKelas:", kls),
mar = c(0, 0, 2, 0))
} else {
plot.new()
title(main = paste("Kelas:", kls, "\n(Sampel terlalu kecil)"))
}
}

par(mfrow = c(1, 1))
Log-Determinan Matriks Kovarians per Kelas
log_det_df <- do.call(rbind, lapply(kelas_list, function(kls) {
subset_data <- df_num[df$Kelas == kls, ]
n_obs <- nrow(subset_data)
p_vars <- ncol(subset_data)
if (n_obs <= p_vars) {
return(data.frame(Kelas = kls, N = n_obs, Log_Det = NA, Keterangan = "Sampel terlalu kecil"))
}
cov_mat <- cov(subset_data)
det_val <- tryCatch(det(cov_mat), error = function(e) NA)
log_det <- ifelse(!is.na(det_val) & det_val > 0, round(log(det_val), 4), NA)
ket <- ifelse(is.na(log_det), "Matriks singular", sprintf("%.4f", log_det))
data.frame(Kelas = kls, N = n_obs, Log_Det = log_det, Keterangan = ket)
}))
kable(log_det_df, row.names = FALSE,
caption = "Log-Determinan Matriks Kovarians per Kelas") %>%
kable_styling(bootstrap_options = c("striped", "hover", "bordered"))
Log-Determinan Matriks Kovarians per Kelas
|
Kelas
|
N
|
Log_Det
|
Keterangan
|
|
Rendah
|
3175
|
-15.7287
|
-15.7287
|
|
Menengah
|
1567
|
-9.5327
|
-9.5327
|
|
Tinggi
|
277
|
-1.8367
|
-1.8367
|
Konfirmasi Tidak Ada Multikolinearitas Sempurna
Determinan Matriks Kovarians Keseluruhan
cat("CEK DETERMINAN MATRIKS KOVARIANS GLOBAL\n\n")
## CEK DETERMINAN MATRIKS KOVARIANS GLOBAL
cov_global <- cov(df_num)
det_global <- det(cov_global)
cat(sprintf("Determinan matriks kovarians (keseluruhan) : %.6e\n", det_global))
## Determinan matriks kovarians (keseluruhan) : 1.247857e-04
if (det_global < 1e-10) {
cat("Determinan sangat kecil -> kemungkinan multikolinearitas sempurna!\n")
} else {
cat("Determinan cukup besar -> tidak ada multikolinearitas sempurna.\n")
}
## Determinan cukup besar -> tidak ada multikolinearitas sempurna.
Variance Inflation Factor (VIF)
library(car)
cat("VARIANCE INFLATION FACTOR (VIF)\n\n")
## VARIANCE INFLATION FACTOR (VIF)
df_vif <- df_num
predictor_names <- names(df_vif)
response <- predictor_names[1]
predictors <- predictor_names[-1]
if (length(predictors) >= 2) {
# Bungkus semua nama kolom dengan backtick agar spasi dalam nama tidak error
formula_vif <- as.formula(paste(
paste0("`", response, "`"), "~",
paste(paste0("`", predictors, "`"), collapse = " + ")
))
lm_vif <- lm(formula_vif, data = df_vif)
vif_values <- tryCatch(vif(lm_vif), error = function(e) NULL)
if (!is.null(vif_values)) {
vif_df <- data.frame(
Variabel = names(vif_values),
VIF = round(as.numeric(vif_values), 3),
Status = ifelse(as.numeric(vif_values) > 10, "Multikolinearitas Tinggi",
ifelse(as.numeric(vif_values) > 5, "Perlu Perhatian",
"Aman"))
)
kable(vif_df, row.names = FALSE,
caption = "Variance Inflation Factor (VIF) - Deteksi Multikolinearitas") %>%
kable_styling(bootstrap_options = c("striped", "hover", "bordered")) %>%
column_spec(3, bold = TRUE)
} else {
cat("VIF tidak dapat dihitung (kemungkinan matriks singular).\n")
}
} else {
cat("Jumlah prediktor terlalu sedikit untuk menghitung VIF.\n")
}
Variance Inflation Factor (VIF) - Deteksi Multikolinearitas
|
Variabel
|
VIF
|
Status
|
Total Pengeluaran Rumahtangga
|
2.683
|
Aman
|
Pengeluaran perkapita
|
3.308
|
Aman
|
Nilai Kalori Perkapita
|
28.102
|
Multikolinearitas Tinggi
|
Nilai Protein Perkapita
|
5.026
|
Perlu Perhatian
|
Nilai Lemak Perkapita
|
3.899
|
Aman
|
Nilai Karbohidrat Perkapita
|
13.956
|
Multikolinearitas Tinggi
|
Nilai Eigenvalue Matriks Kovarians
cat("EIGENVALUE MATRIKS KOVARIANS GLOBAL\n\n")
## EIGENVALUE MATRIKS KOVARIANS GLOBAL
eigen_vals <- eigen(cov_global, only.values = TRUE)$values
eigen_vals <- round(eigen_vals, 6)
eigen_df <- data.frame(
Komponen = paste0("lambda", seq_along(eigen_vals)),
Eigenvalue = eigen_vals,
Status = ifelse(eigen_vals < 1e-6, "Near-zero (Singular!)", "Positif")
)
kable(eigen_df, row.names = FALSE,
caption = "Eigenvalue Matriks Kovarians Global") %>%
kable_styling(bootstrap_options = c("striped", "hover", "bordered")) %>%
column_spec(3, bold = TRUE)
Eigenvalue Matriks Kovarians Global
|
Komponen
|
Eigenvalue
|
Status
|
|
lambda1
|
3.935157
|
Positif
|
|
lambda2
|
2.096893
|
Positif
|
|
lambda3
|
0.512850
|
Positif
|
|
lambda4
|
0.229270
|
Positif
|
|
lambda5
|
0.169894
|
Positif
|
|
lambda6
|
0.032986
|
Positif
|
|
lambda7
|
0.022950
|
Positif
|
n_zero <- sum(eigen_vals < 1e-6)
cat(sprintf("\nJumlah eigenvalue near-zero (< 1e-6) : %d\n", n_zero))
##
## Jumlah eigenvalue near-zero (< 1e-6) : 0
if (n_zero == 0) {
cat("[OK] Tidak ada eigenvalue nol -> matriks kovarians non-singular -> tidak ada multikolinearitas sempurna.\n")
} else {
cat("[!] Ada eigenvalue nol/near-zero -> kemungkinan multikolinearitas sempurna! Pertimbangkan PCA atau seleksi variabel.\n")
}
## [OK] Tidak ada eigenvalue nol -> matriks kovarians non-singular -> tidak ada multikolinearitas sempurna.
Condition Number (Angka Kondisi)
cat("ANGKA KONDISI (CONDITION NUMBER)\n\n")
## ANGKA KONDISI (CONDITION NUMBER)
lambda_max <- max(eigen_vals)
lambda_min <- min(eigen_vals[eigen_vals > 1e-12])
cond_number <- lambda_max / lambda_min
cat(sprintf("lambda_max : %.4f\n", lambda_max))
## lambda_max : 3.9352
cat(sprintf("lambda_min (> 1e-12) : %.6f\n", lambda_min))
## lambda_min (> 1e-12) : 0.022950
cat(sprintf("Angka Kondisi : %.2f\n", cond_number))
## Angka Kondisi : 171.47
if (cond_number < 100) {
cat("Angka kondisi < 100 -> kondisi matriks baik, tidak ada multikolinearitas parah.\n")
} else if (cond_number < 1000) {
cat("Agka kondisi 100-1000 -> kondisi matriks cukup buruk, waspadai multikolinearitas moderat.\n")
} else {
cat("Angka kondisi > 1000 -> matriks sangat ill-conditioned! Multikolinearitas parah.\n")
}
## Agka kondisi 100-1000 -> kondisi matriks cukup buruk, waspadai multikolinearitas moderat.
Linear Discriminant Analysis (LDA)
Split Data: Training 80% & Testing 20%
set.seed(123)
train_index <- createDataPartition(df_scaled$Kelas,
p = 0.80,
list = FALSE)
train_data <- df_scaled[ train_index, ]
test_data <- df_scaled[-train_index, ]
cat("UKURAN DATA\n")
## UKURAN DATA
cat("Total data :", nrow(df_scaled), "\n")
## Total data : 5019
cat("Data training:", nrow(train_data), sprintf("(%.1f%%)\n",
nrow(train_data) / nrow(df_scaled) * 100))
## Data training: 4016 (80.0%)
cat("Data testing :", nrow(test_data), sprintf("(%.1f%%)\n",
nrow(test_data) / nrow(df_scaled) * 100))
## Data testing : 1003 (20.0%)
cat("\nDISTRIBUSI KELAS - TRAINING\n")
##
## DISTRIBUSI KELAS - TRAINING
print(table(train_data$Kelas))
##
## Rendah Menengah Tinggi
## 2540 1254 222
cat("\nDISTRIBUSI KELAS - TESTING\n")
##
## DISTRIBUSI KELAS - TESTING
print(table(test_data$Kelas))
##
## Rendah Menengah Tinggi
## 635 313 55
# Tampilkan proporsi dalam tabel
split_summary <- data.frame(
Dataset = c("Training", "Testing", "Total"),
Rendah = c(sum(train_data$Kelas == "Rendah"),
sum(test_data$Kelas == "Rendah"),
sum(df_scaled$Kelas == "Rendah")),
Menengah = c(sum(train_data$Kelas == "Menengah"),
sum(test_data$Kelas == "Menengah"),
sum(df_scaled$Kelas == "Menengah")),
Tinggi = c(sum(train_data$Kelas == "Tinggi"),
sum(test_data$Kelas == "Tinggi"),
sum(df_scaled$Kelas == "Tinggi")),
Total = c(nrow(train_data), nrow(test_data), nrow(df_scaled))
)
kable(split_summary, row.names = FALSE,
caption = "Distribusi Kelas pada Data Training dan Testing") %>%
kable_styling(bootstrap_options = c("striped", "hover", "bordered")) %>%
row_spec(3, bold = TRUE, background = "#f0f0f0")
Distribusi Kelas pada Data Training dan Testing
|
Dataset
|
Rendah
|
Menengah
|
Tinggi
|
Total
|
|
Training
|
2540
|
1254
|
222
|
4016
|
|
Testing
|
635
|
313
|
55
|
1003
|
|
Total
|
3175
|
1567
|
277
|
5019
|
Membangun Model LDA
predictor_cols <- setdiff(names(train_data), "Kelas")
formula_lda <- as.formula(
paste("Kelas ~", paste(paste0("`", predictor_cols, "`"), collapse = " + "))
)
lda_model <- lda(formula_lda, data = train_data)
cat("RINGKASAN MODEL LDA\n\n")
## RINGKASAN MODEL LDA
print(lda_model)
## Call:
## lda(formula_lda, data = train_data)
##
## Prior probabilities of groups:
## Rendah Menengah Tinggi
## 0.63247012 0.31225100 0.05527888
##
## Group means:
## `Pengeluaran non Makanan` `Total Pengeluaran Rumahtangga`
## Rendah -0.21169678 -0.24189365
## Menengah -0.09628201 -0.01371452
## Tinggi 2.88166562 2.81976917
## `Pengeluaran perkapita` `Nilai Kalori Perkapita`
## Rendah -0.3649409 -0.5868958
## Menengah 0.2402299 1.0722419
## Tinggi 2.7090153 0.5898291
## `Nilai Protein Perkapita` `Nilai Lemak Perkapita`
## Rendah -0.5537088 -0.4674411
## Menengah 0.9324874 0.7141846
## Tinggi 0.8571492 1.2269739
## `Nilai Karbohidrat Perkapita`
## Rendah -0.5198639
## Menengah 1.0204269
## Tinggi 0.1329941
##
## Coefficients of linear discriminants:
## LD1 LD2
## `Pengeluaran non Makanan` 0.2939806 0.40234490
## `Total Pengeluaran Rumahtangga` 0.1796666 0.32996048
## `Pengeluaran perkapita` 0.2196298 0.64007621
## `Nilai Kalori Perkapita` 0.5841140 -0.65195504
## `Nilai Protein Perkapita` 0.3117786 -0.22746373
## `Nilai Lemak Perkapita` 0.2443962 0.04231628
## `Nilai Karbohidrat Perkapita` 0.3233888 -0.15609597
##
## Proportion of trace:
## LD1 LD2
## 0.6623 0.3377
Proporsi Varians Tiap Fungsi Diskriminan
prop_var <- lda_model$svd^2 / sum(lda_model$svd^2)
cum_var <- cumsum(prop_var)
prop_df <- data.frame(
Fungsi_Diskriminan = paste0("LD", seq_along(prop_var)),
Singular_Value = round(lda_model$svd, 4),
Proporsi_Varians = round(prop_var * 100, 2),
Kumulatif = round(cum_var * 100, 2)
)
cat("=== PROPORSI VARIANS FUNGSI DISKRIMINAN ===\n")
## === PROPORSI VARIANS FUNGSI DISKRIMINAN ===
kable(prop_df, row.names = FALSE,
caption = "Proporsi Varians Tiap Fungsi Diskriminan (LD)",
col.names = c("Fungsi Diskriminan", "Singular Value",
"Proporsi Varians (%)", "Kumulatif (%)")) %>%
kable_styling(bootstrap_options = c("striped", "hover", "bordered")) %>%
column_spec(3:4, bold = TRUE)
Proporsi Varians Tiap Fungsi Diskriminan (LD)
|
Fungsi Diskriminan
|
Singular Value
|
Proporsi Varians (%)
|
Kumulatif (%)
|
|
LD1
|
58.9142
|
66.23
|
66.23
|
|
LD2
|
42.0730
|
33.77
|
100.00
|
cat(sprintf("\nLD1 menjelaskan %.2f%% varians.\n", prop_df$Proporsi_Varians[1]))
##
## LD1 menjelaskan 66.23% varians.
cat(sprintf("LD1 + LD2 menjelaskan %.2f%% varians.\n", prop_df$Kumulatif[2]))
## LD1 + LD2 menjelaskan 100.00% varians.
Koefisien Fungsi Diskriminan
koef_df <- as.data.frame(lda_model$scaling)
koef_df$Variabel <- rownames(koef_df)
rownames(koef_df) <- NULL
koef_df <- koef_df[, c("Variabel", "LD1", "LD2")]
kable(koef_df, row.names = FALSE, digits = 4,
caption = "Koefisien Fungsi Diskriminan (Standardized)") %>%
kable_styling(bootstrap_options = c("striped", "hover", "bordered")) %>%
column_spec(2:3, bold = TRUE)
Koefisien Fungsi Diskriminan (Standardized)
|
Variabel
|
LD1
|
LD2
|
Pengeluaran non Makanan
|
0.2940
|
0.4023
|
Total Pengeluaran Rumahtangga
|
0.1797
|
0.3300
|
Pengeluaran perkapita
|
0.2196
|
0.6401
|
Nilai Kalori Perkapita
|
0.5841
|
-0.6520
|
Nilai Protein Perkapita
|
0.3118
|
-0.2275
|
Nilai Lemak Perkapita
|
0.2444
|
0.0423
|
Nilai Karbohidrat Perkapita
|
0.3234
|
-0.1561
|
cat("\nInterpretasi:\n")
##
## Interpretasi:
cat("- Nilai absolut koefisien yang besar -> variabel tersebut lebih\n")
## - Nilai absolut koefisien yang besar -> variabel tersebut lebih
cat(" berpengaruh dalam membentuk fungsi diskriminan.\n")
## berpengaruh dalam membentuk fungsi diskriminan.
cat("- Tanda (+/-) menunjukkan arah kontribusi.\n")
## - Tanda (+/-) menunjukkan arah kontribusi.
Visualisasi: Plot Skor Diskriminan per Kelompok
A. Histogram Skor LD1 per Kelas
lda_scores_train <- predict(lda_model, train_data)
scores_df <- data.frame(
LD1 = lda_scores_train$x[, "LD1"],
LD2 = lda_scores_train$x[, "LD2"],
Kelas = train_data$Kelas
)
# Histogram LD1
ggplot(scores_df, aes(x = LD1, fill = Kelas)) +
geom_histogram(bins = 50, alpha = 0.6, position = "identity",
color = "white", linewidth = 0.3) +
scale_fill_manual(values = c("Rendah" = "#3498db",
"Menengah" = "#2ecc71",
"Tinggi" = "#e74c3c")) +
theme_minimal(base_size = 13) +
labs(title = "Distribusi Skor Diskriminan LD1 per Kelas",
subtitle = paste0("LD1 menjelaskan ", round(prop_var[1] * 100, 1), "% varians"),
x = "Skor LD1",
y = "Frekuensi",
fill = "Kelas") +
theme(legend.position = "top",
plot.title = element_text(face = "bold"))

B. Scatter Plot LD1 vs LD2 per Kelas
centroid_df <- scores_df %>%
group_by(Kelas) %>%
summarise(LD1 = mean(LD1), LD2 = mean(LD2), .groups = "drop")
ggplot(scores_df, aes(x = LD1, y = LD2, color = Kelas)) +
geom_point(alpha = 0.4, size = 1.2) +
stat_ellipse(aes(fill = Kelas), geom = "polygon",
alpha = 0.10, level = 0.95, linewidth = 0.8) +
geom_point(data = centroid_df, aes(x = LD1, y = LD2),
size = 5, shape = 18, color = "black") +
geom_text(data = centroid_df, aes(x = LD1, y = LD2, label = Kelas),
vjust = -1, fontface = "bold", size = 4, color = "black") +
scale_color_manual(values = c("Rendah" = "#3498db",
"Menengah" = "#2ecc71",
"Tinggi" = "#e74c3c")) +
scale_fill_manual( values = c("Rendah" = "#3498db",
"Menengah" = "#2ecc71",
"Tinggi" = "#e74c3c")) +
theme_minimal(base_size = 13) +
labs(title = "Scatter Plot Skor Diskriminan LD1 vs LD2",
subtitle = sprintf("LD1: %.1f%% | LD2: %.1f%% varians",
prop_var[1] * 100, prop_var[2] * 100),
x = paste0("LD1 (", round(prop_var[1] * 100, 1), "%)"),
y = paste0("LD2 (", round(prop_var[2] * 100, 1), "%)"),
color = "Kelas", fill = "Kelas") +
theme(legend.position = "right",
plot.title = element_text(face = "bold"))

C. Density Plot LD1 per Kelas
ggplot(scores_df, aes(x = LD1, fill = Kelas, color = Kelas)) +
geom_density(alpha = 0.4, linewidth = 1) +
scale_fill_manual( values = c("Rendah" = "#3498db",
"Menengah" = "#2ecc71",
"Tinggi" = "#e74c3c")) +
scale_color_manual(values = c("Rendah" = "#2980b9",
"Menengah" = "#27ae60",
"Tinggi" = "#c0392b")) +
theme_minimal(base_size = 13) +
labs(title = "Density Plot Skor Diskriminan LD1 per Kelas",
x = "Skor LD1",
y = "Densitas",
fill = "Kelas", color = "Kelas") +
theme(legend.position = "top",
plot.title = element_text(face = "bold"))

D. Kontribusi Variabel pada LD1 dan LD2 (Bar Plot Koefisien)
koef_long <- tidyr::pivot_longer(koef_df,
cols = c("LD1", "LD2"),
names_to = "Fungsi",
values_to = "Koefisien")
ggplot(koef_long, aes(x = reorder(Variabel, abs(Koefisien)),
y = Koefisien, fill = Koefisien > 0)) +
geom_col(show.legend = FALSE, alpha = 0.85) +
coord_flip() +
facet_wrap(~ Fungsi, scales = "free_x") +
scale_fill_manual(values = c("TRUE" = "#2ecc71", "FALSE" = "#e74c3c")) +
theme_minimal(base_size = 12) +
labs(title = "Koefisien Variabel pada Fungsi Diskriminan LD1 dan LD2",
x = "Variabel",
y = "Koefisien") +
theme(plot.title = element_text(face = "bold"),
strip.text = element_text(face = "bold", size = 12))

Multinomial Logistic Regression (Multinom)
Membangun Model Multinom
train_data$Kelas <- relevel(factor(train_data$Kelas), ref = "Rendah")
set.seed(123)
multinom_model <- multinom(Kelas ~ ., data = train_data,
maxit = 500, trace = FALSE)
cat("RINGKASAN MODEL MULTINOM\n\n")
## RINGKASAN MODEL MULTINOM
print(summary(multinom_model))
## Call:
## multinom(formula = Kelas ~ ., data = train_data, maxit = 500,
## trace = FALSE)
##
## Coefficients:
## (Intercept) `Pengeluaran non Makanan` `Total Pengeluaran Rumahtangga`
## Menengah -85.90405 8.577093 17.25029
## Tinggi -834.47961 160.014082 253.41555
## `Pengeluaran perkapita` `Nilai Kalori Perkapita`
## Menengah 47.01235 121.2738
## Tinggi 223.15664 146.7272
## `Nilai Protein Perkapita` `Nilai Lemak Perkapita`
## Menengah 114.79375 87.81565
## Tinggi 65.25421 102.08418
## `Nilai Karbohidrat Perkapita`
## Menengah 114.07331
## Tinggi 46.27432
##
## Std. Errors:
## (Intercept) `Pengeluaran non Makanan` `Total Pengeluaran Rumahtangga`
## Menengah 27.96057 6.195044 6.713941
## Tinggi 261.53840 62.872281 84.854201
## `Pengeluaran perkapita` `Nilai Kalori Perkapita`
## Menengah 15.96062 40.33540
## Tinggi 71.54925 60.15963
## `Nilai Protein Perkapita` `Nilai Lemak Perkapita`
## Menengah 37.17461 28.55991
## Tinggi 27.75041 36.62291
## `Nilai Karbohidrat Perkapita`
## Menengah 37.19801
## Tinggi 29.45159
##
## Residual Deviance: 4.863707
## AIC: 36.86371
Ekstrak Koefisien Model Multinom
koef_multinom <- coef(multinom_model)
cat("KOEFISIEN MODEL MULTINOM\n")
## KOEFISIEN MODEL MULTINOM
cat("(Relatif terhadap kelas referensi: Rendah)\n\n")
## (Relatif terhadap kelas referensi: Rendah)
kable(round(koef_multinom, 4),
caption = "Koefisien Multinomial Logistic Regression") %>%
kable_styling(bootstrap_options = c("striped", "hover", "bordered")) %>%
column_spec(1, bold = TRUE, background = "#f5f5f5")
Koefisien Multinomial Logistic Regression
|
|
(Intercept)
|
Pengeluaran non Makanan
|
Total Pengeluaran Rumahtangga
|
Pengeluaran perkapita
|
Nilai Kalori Perkapita
|
Nilai Protein Perkapita
|
Nilai Lemak Perkapita
|
Nilai Karbohidrat Perkapita
|
|
Menengah
|
-85.9041
|
8.5771
|
17.2503
|
47.0124
|
121.2738
|
114.7938
|
87.8156
|
114.0733
|
|
Tinggi
|
-834.4796
|
160.0141
|
253.4155
|
223.1566
|
146.7272
|
65.2542
|
102.0842
|
46.2743
|
Uji Signifikansi Koefisien (Z-test & P-value)
se_multinom <- summary(multinom_model)$standard.errors
z_scores <- koef_multinom / se_multinom
p_values <- 2 * (1 - pnorm(abs(z_scores)))
cat("Z-SCORE\n")
## Z-SCORE
kable(round(z_scores, 4),
caption = "Z-Score Tiap Koefisien") %>%
kable_styling(bootstrap_options = c("striped", "hover", "bordered")) %>%
column_spec(1, bold = TRUE, background = "#f5f5f5")
Z-Score Tiap Koefisien
|
|
(Intercept)
|
Pengeluaran non Makanan
|
Total Pengeluaran Rumahtangga
|
Pengeluaran perkapita
|
Nilai Kalori Perkapita
|
Nilai Protein Perkapita
|
Nilai Lemak Perkapita
|
Nilai Karbohidrat Perkapita
|
|
Menengah
|
-3.0723
|
1.3845
|
2.5693
|
2.9455
|
3.0066
|
3.0880
|
3.0748
|
3.0667
|
|
Tinggi
|
-3.1907
|
2.5451
|
2.9865
|
3.1189
|
2.4390
|
2.3515
|
2.7874
|
1.5712
|
cat("\nP-VALUE\n")
##
## P-VALUE
kable(round(p_values, 4),
caption = "P-Value Tiap Koefisien (alpha = 0.05)") %>%
kable_styling(bootstrap_options = c("striped", "hover", "bordered")) %>%
column_spec(1, bold = TRUE, background = "#f5f5f5")
P-Value Tiap Koefisien (alpha = 0.05)
|
|
(Intercept)
|
Pengeluaran non Makanan
|
Total Pengeluaran Rumahtangga
|
Pengeluaran perkapita
|
Nilai Kalori Perkapita
|
Nilai Protein Perkapita
|
Nilai Lemak Perkapita
|
Nilai Karbohidrat Perkapita
|
|
Menengah
|
0.0021
|
0.1662
|
0.0102
|
0.0032
|
0.0026
|
0.0020
|
0.0021
|
0.0022
|
|
Tinggi
|
0.0014
|
0.0109
|
0.0028
|
0.0018
|
0.0147
|
0.0187
|
0.0053
|
0.1161
|
Tabel Signifikansi Lengkap per Variabel
hasil_list <- list()
for (kelas in rownames(koef_multinom)) {
df_kelas <- data.frame(
Kelas = kelas,
Variabel = colnames(koef_multinom),
Koefisien = round(koef_multinom[kelas, ], 4),
Std_Error = round(se_multinom[kelas, ], 4),
Z_Score = round(z_scores[kelas, ], 4),
P_Value = round(p_values[kelas, ], 4)
)
df_kelas$Signifikan <- ifelse(df_kelas$P_Value < 0.001, "***",
ifelse(df_kelas$P_Value < 0.01, "**",
ifelse(df_kelas$P_Value < 0.05, "*",
ifelse(df_kelas$P_Value < 0.10, ".", ""))))
hasil_list[[kelas]] <- df_kelas
}
tabel_sig <- bind_rows(hasil_list)
rownames(tabel_sig) <- NULL
cat("Keterangan: *** p<0.001 | ** p<0.01 | * p<0.05 | . p<0.10\n\n")
## Keterangan: *** p<0.001 | ** p<0.01 | * p<0.05 | . p<0.10
kable(tabel_sig, row.names = FALSE,
caption = paste("Tabel Signifikansi Koefisien Multinom",
"(Referensi: Kelas Rendah)")) %>%
kable_styling(bootstrap_options = c("striped", "hover", "bordered")) %>%
column_spec(7, bold = TRUE, color = "darkred") %>%
row_spec(which(tabel_sig$P_Value < 0.05), background = "#fff3cd") %>%
collapse_rows(columns = 1, valign = "top")
Tabel Signifikansi Koefisien Multinom (Referensi: Kelas Rendah)
|
Kelas
|
Variabel
|
Koefisien
|
Std_Error
|
Z_Score
|
P_Value
|
Signifikan
|
|
Menengah
|
(Intercept)
|
-85.9041
|
27.9606
|
-3.0723
|
0.0021
|
**
|
Pengeluaran non Makanan
|
8.5771
|
6.1950
|
1.3845
|
0.1662
|
|
Total Pengeluaran Rumahtangga
|
17.2503
|
6.7139
|
2.5693
|
0.0102
|
|
Pengeluaran perkapita
|
47.0124
|
15.9606
|
2.9455
|
0.0032
|
**
|
Nilai Kalori Perkapita
|
121.2738
|
40.3354
|
3.0066
|
0.0026
|
**
|
Nilai Protein Perkapita
|
114.7938
|
37.1746
|
3.0880
|
0.0020
|
**
|
Nilai Lemak Perkapita
|
87.8156
|
28.5599
|
3.0748
|
0.0021
|
**
|
Nilai Karbohidrat Perkapita
|
114.0733
|
37.1980
|
3.0667
|
0.0022
|
**
|
|
Tinggi
|
(Intercept)
|
-834.4796
|
261.5384
|
-3.1907
|
0.0014
|
**
|
Pengeluaran non Makanan
|
160.0141
|
62.8723
|
2.5451
|
0.0109
|
|
Total Pengeluaran Rumahtangga
|
253.4155
|
84.8542
|
2.9865
|
0.0028
|
**
|
Pengeluaran perkapita
|
223.1566
|
71.5492
|
3.1189
|
0.0018
|
**
|
Nilai Kalori Perkapita
|
146.7272
|
60.1596
|
2.4390
|
0.0147
|
|
Nilai Protein Perkapita
|
65.2542
|
27.7504
|
2.3515
|
0.0187
|
|
Nilai Lemak Perkapita
|
102.0842
|
36.6229
|
2.7874
|
0.0053
|
**
|
Nilai Karbohidrat Perkapita
|
46.2743
|
29.4516
|
1.5712
|
0.1161
|
|
Odds Ratio (Exp Koefisien)
odds_ratio <- exp(koef_multinom)
cat("ODDS RATIO (exp(koefisien))\n")
## ODDS RATIO (exp(koefisien))
cat("Interpretasi: OR > 1 -> meningkatkan peluang kelas vs Rendah\n")
## Interpretasi: OR > 1 -> meningkatkan peluang kelas vs Rendah
cat(" OR < 1 -> menurunkan peluang kelas vs Rendah\n\n")
## OR < 1 -> menurunkan peluang kelas vs Rendah
kable(round(odds_ratio, 4),
caption = "Odds Ratio Koefisien Multinomial Logistic Regression") %>%
kable_styling(bootstrap_options = c("striped", "hover", "bordered")) %>%
column_spec(1, bold = TRUE, background = "#f5f5f5")
Odds Ratio Koefisien Multinomial Logistic Regression
|
|
(Intercept)
|
Pengeluaran non Makanan
|
Total Pengeluaran Rumahtangga
|
Pengeluaran perkapita
|
Nilai Kalori Perkapita
|
Nilai Protein Perkapita
|
Nilai Lemak Perkapita
|
Nilai Karbohidrat Perkapita
|
|
Menengah
|
0
|
5.308650e+03
|
3.102456e+07
|
2.613402e+20
|
4.661469e+52
|
7.149787e+49
|
1.373572e+38
|
3.478623e+49
|
|
Tinggi
|
0
|
3.113386e+69
|
1.140183e+110
|
8.235606e+96
|
5.282212e+63
|
2.185467e+28
|
2.160702e+44
|
1.249341e+20
|
Perbandingan Koefisien: LDA vs Multinom
koef_lda_df <- data.frame(
Variabel = rownames(lda_model$scaling),
LDA_LD1 = round(lda_model$scaling[, "LD1"], 4),
LDA_LD2 = round(lda_model$scaling[, "LD2"], 4)
)
koef_multinom_no_int <- koef_multinom[, -1] #
koef_multi_df <- data.frame(
Variabel = colnames(koef_multinom_no_int),
Multinom_Menengah = round(koef_multinom_no_int["Menengah", ], 4),
Multinom_Tinggi = round(koef_multinom_no_int["Tinggi", ], 4)
)
koef_lda_df$Variabel <- gsub("`", "", koef_lda_df$Variabel)
perbandingan <- merge(koef_lda_df, koef_multi_df, by = "Variabel", all = TRUE)
perbandingan <- perbandingan[order(abs(perbandingan$LDA_LD1), decreasing = TRUE), ]
rownames(perbandingan) <- NULL
kable(perbandingan,
caption = "Perbandingan Koefisien LDA vs Multinomial Logistic Regression",
col.names = c("Variabel", "LDA LD1", "LDA LD2",
"Multinom (Menengah vs Rendah)",
"Multinom (Tinggi vs Rendah)")) %>%
kable_styling(bootstrap_options = c("striped", "hover", "bordered")) %>%
column_spec(1, bold = TRUE) %>%
column_spec(2:3, background = "#e8f4f8") %>%
column_spec(4:5, background = "#fef9e7")
Perbandingan Koefisien LDA vs Multinomial Logistic Regression
|
Variabel
|
LDA LD1
|
LDA LD2
|
Multinom (Menengah vs Rendah)
|
Multinom (Tinggi vs Rendah)
|
|
Nilai Kalori Perkapita
|
0.5841
|
-0.6520
|
NA
|
NA
|
|
Nilai Karbohidrat Perkapita
|
0.3234
|
-0.1561
|
NA
|
NA
|
|
Nilai Protein Perkapita
|
0.3118
|
-0.2275
|
NA
|
NA
|
|
Pengeluaran non Makanan
|
0.2940
|
0.4023
|
NA
|
NA
|
|
Nilai Lemak Perkapita
|
0.2444
|
0.0423
|
NA
|
NA
|
|
Pengeluaran perkapita
|
0.2196
|
0.6401
|
NA
|
NA
|
|
Total Pengeluaran Rumahtangga
|
0.1797
|
0.3300
|
NA
|
NA
|
Nilai Kalori Perkapita
|
NA
|
NA
|
121.2738
|
146.7272
|
Nilai Karbohidrat Perkapita
|
NA
|
NA
|
114.0733
|
46.2743
|
Nilai Lemak Perkapita
|
NA
|
NA
|
87.8156
|
102.0842
|
Nilai Protein Perkapita
|
NA
|
NA
|
114.7938
|
65.2542
|
Pengeluaran non Makanan
|
NA
|
NA
|
8.5771
|
160.0141
|
Pengeluaran perkapita
|
NA
|
NA
|
47.0124
|
223.1566
|
Total Pengeluaran Rumahtangga
|
NA
|
NA
|
17.2503
|
253.4155
|
Visualisasi Perbandingan Koefisien LDA vs Multinom
library(ggplot2)
library(tidyr)
# Format panjang untuk plot
perbandingan_long <- perbandingan %>%
pivot_longer(cols = -Variabel,
names_to = "Model_Fungsi",
values_to = "Koefisien") %>%
mutate(
Model = case_when(
grepl("LDA", Model_Fungsi) ~ "LDA",
grepl("Multinom", Model_Fungsi) ~ "Multinom"
),
Fungsi = case_when(
Model_Fungsi == "LDA_LD1" ~ "LD1",
Model_Fungsi == "LDA_LD2" ~ "LD2",
Model_Fungsi == "Multinom_Menengah" ~ "Menengah vs Rendah",
Model_Fungsi == "Multinom_Tinggi" ~ "Tinggi vs Rendah"
)
)
ggplot(perbandingan_long,
aes(x = reorder(Variabel, abs(Koefisien)),
y = Koefisien,
fill = Model)) +
geom_col(position = "dodge", alpha = 0.85, color = "white") +
coord_flip() +
facet_wrap(~ Fungsi, scales = "free_x", ncol = 2) +
scale_fill_manual(values = c("LDA" = "#3498db", "Multinom" = "#e67e22")) +
geom_hline(yintercept = 0, linetype = "dashed", color = "gray40") +
theme_minimal(base_size = 12) +
labs(
title = "Perbandingan Koefisien: LDA vs Multinomial Logistic Regression",
subtitle = "Biru = LDA | Oranye = Multinom",
x = "Variabel",
y = "Koefisien",
fill = "Model"
) +
theme(
plot.title = element_text(face = "bold"),
legend.position = "top",
strip.text = element_text(face = "bold", size = 11)
)
## Warning: Removed 28 rows containing missing values or values outside the scale range
## (`geom_col()`).

FASE 6 — Evaluasi & Validasi Model
Prediksi Kelas pada Data Testing
library(caret)
library(kableExtra)
library(ggplot2)
library(dplyr)
library(tidyr)
pred_lda <- predict(lda_model, newdata = test_data)
kelas_pred_lda <- pred_lda$class
kelas_aktual <- test_data$Kelas
# PREDIKSI MULTINOM
pred_multinom <- predict(multinom_model, newdata = test_data, type = "class")
kelas_pred_multinom <- factor(pred_multinom,
levels = c("Rendah", "Menengah", "Tinggi"))
# Tabel perbandingan aktual vs prediksi
perbandingan_pred <- data.frame(
Aktual = kelas_aktual,
Pred_LDA = kelas_pred_lda,
Pred_Multinom = kelas_pred_multinom,
Cocok_LDA = ifelse(kelas_aktual == kelas_pred_lda, "Benar", "Salah"),
Cocok_Multinom= ifelse(kelas_aktual == kelas_pred_multinom, "Salah", "Benar")
)
cat("PREVIEW HASIL PREDIKSI (20 baris pertama)\n\n")
## PREVIEW HASIL PREDIKSI (20 baris pertama)
kable(head(perbandingan_pred, 20), row.names = TRUE,
caption = "Perbandingan Kelas Aktual vs Prediksi (LDA & Multinom)") %>%
kable_styling(bootstrap_options = c("striped", "hover", "bordered")) %>%
column_spec(4, bold = TRUE) %>%
column_spec(5, bold = TRUE)
Perbandingan Kelas Aktual vs Prediksi (LDA & Multinom)
|
|
Aktual
|
Pred_LDA
|
Pred_Multinom
|
Cocok_LDA
|
Cocok_Multinom
|
|
1
|
Rendah
|
Rendah
|
Rendah
|
Benar
|
Salah
|
|
2
|
Tinggi
|
Tinggi
|
Tinggi
|
Benar
|
Salah
|
|
3
|
Rendah
|
Rendah
|
Rendah
|
Benar
|
Salah
|
|
4
|
Rendah
|
Rendah
|
Rendah
|
Benar
|
Salah
|
|
5
|
Rendah
|
Rendah
|
Rendah
|
Benar
|
Salah
|
|
6
|
Menengah
|
Menengah
|
Menengah
|
Benar
|
Salah
|
|
7
|
Rendah
|
Rendah
|
Rendah
|
Benar
|
Salah
|
|
8
|
Tinggi
|
Rendah
|
Tinggi
|
Salah
|
Salah
|
|
9
|
Rendah
|
Rendah
|
Rendah
|
Benar
|
Salah
|
|
10
|
Rendah
|
Rendah
|
Rendah
|
Benar
|
Salah
|
|
11
|
Rendah
|
Rendah
|
Rendah
|
Benar
|
Salah
|
|
12
|
Tinggi
|
Tinggi
|
Tinggi
|
Benar
|
Salah
|
|
13
|
Menengah
|
Rendah
|
Menengah
|
Salah
|
Salah
|
|
14
|
Rendah
|
Rendah
|
Rendah
|
Benar
|
Salah
|
|
15
|
Rendah
|
Rendah
|
Rendah
|
Benar
|
Salah
|
|
16
|
Rendah
|
Rendah
|
Rendah
|
Benar
|
Salah
|
|
17
|
Rendah
|
Rendah
|
Rendah
|
Benar
|
Salah
|
|
18
|
Rendah
|
Rendah
|
Rendah
|
Benar
|
Salah
|
|
19
|
Menengah
|
Menengah
|
Menengah
|
Benar
|
Salah
|
|
20
|
Menengah
|
Menengah
|
Menengah
|
Benar
|
Salah
|
Confusion Matrix
Confusion Matrix — LDA
cm_lda <- confusionMatrix(
data = kelas_pred_lda,
reference = kelas_aktual,
positive = NULL
)
cat("CONFUSION MATRIX — LDA\n\n")
## CONFUSION MATRIX — LDA
print(cm_lda)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Rendah Menengah Tinggi
## Rendah 635 59 5
## Menengah 0 254 6
## Tinggi 0 0 44
##
## Overall Statistics
##
## Accuracy : 0.9302
## 95% CI : (0.9126, 0.9452)
## No Information Rate : 0.6331
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8532
##
## Mcnemar's Test P-Value : 4.268e-15
##
## Statistics by Class:
##
## Class: Rendah Class: Menengah Class: Tinggi
## Sensitivity 1.0000 0.8115 0.80000
## Specificity 0.8261 0.9913 1.00000
## Pos Pred Value 0.9084 0.9769 1.00000
## Neg Pred Value 1.0000 0.9206 0.98853
## Prevalence 0.6331 0.3121 0.05484
## Detection Rate 0.6331 0.2532 0.04387
## Detection Prevalence 0.6969 0.2592 0.04387
## Balanced Accuracy 0.9130 0.9014 0.90000
Confusion Matrix — Multinomial Logistic Regression
cm_multinom <- confusionMatrix(
data = kelas_pred_multinom,
reference = kelas_aktual,
positive = NULL
)
cat("CONFUSION MATRIX — MULTINOMIAL LOGISTIC REGRESSION\n\n")
## CONFUSION MATRIX — MULTINOMIAL LOGISTIC REGRESSION
print(cm_multinom)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Rendah Menengah Tinggi
## Rendah 635 1 0
## Menengah 0 312 3
## Tinggi 0 0 52
##
## Overall Statistics
##
## Accuracy : 0.996
## 95% CI : (0.9898, 0.9989)
## No Information Rate : 0.6331
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.992
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Rendah Class: Menengah Class: Tinggi
## Sensitivity 1.0000 0.9968 0.94545
## Specificity 0.9973 0.9957 1.00000
## Pos Pred Value 0.9984 0.9905 1.00000
## Neg Pred Value 1.0000 0.9985 0.99685
## Prevalence 0.6331 0.3121 0.05484
## Detection Rate 0.6331 0.3111 0.05184
## Detection Prevalence 0.6341 0.3141 0.05184
## Balanced Accuracy 0.9986 0.9962 0.97273
Visualisasi Confusion Matrix
Heatmap Confusion Matrix — LDA
cm_lda_df <- as.data.frame(cm_lda$table)
names(cm_lda_df) <- c("Prediksi", "Aktual", "Frekuensi")
ggplot(cm_lda_df, aes(x = Prediksi, y = Aktual, fill = Frekuensi)) +
geom_tile(color = "white", linewidth = 0.8) +
geom_text(aes(label = Frekuensi), size = 7, fontface = "bold", color = "white") +
scale_fill_gradient(low = "#AED6F1", high = "#1A5276") +
scale_x_discrete(limits = c("Rendah", "Menengah", "Tinggi")) +
scale_y_discrete(limits = rev(c("Rendah", "Menengah", "Tinggi"))) +
theme_minimal(base_size = 13) +
labs(
title = "Confusion Matrix — LDA",
subtitle = sprintf("Akurasi: %.2f%%", cm_lda$overall["Accuracy"] * 100),
x = "Kelas Prediksi",
y = "Kelas Aktual",
fill = "Jumlah"
) +
theme(
plot.title = element_text(face = "bold"),
axis.text = element_text(size = 12),
panel.grid = element_blank()
)

Heatmap Confusion Matrix — Multinom
cm_multinom_df <- as.data.frame(cm_multinom$table)
names(cm_multinom_df) <- c("Prediksi", "Aktual", "Frekuensi")
ggplot(cm_multinom_df, aes(x = Prediksi, y = Aktual, fill = Frekuensi)) +
geom_tile(color = "white", linewidth = 0.8) +
geom_text(aes(label = Frekuensi), size = 7, fontface = "bold", color = "white") +
scale_fill_gradient(low = "#A9DFBF", high = "#1E8449") +
scale_x_discrete(limits = c("Rendah", "Menengah", "Tinggi")) +
scale_y_discrete(limits = rev(c("Rendah", "Menengah", "Tinggi"))) +
theme_minimal(base_size = 13) +
labs(
title = "Confusion Matrix — Multinomial Logistic Regression",
subtitle = sprintf("Akurasi: %.2f%%", cm_multinom$overall["Accuracy"] * 100),
x = "Kelas Prediksi",
y = "Kelas Aktual",
fill = "Jumlah"
) +
theme(
plot.title = element_text(face = "bold"),
axis.text = element_text(size = 12),
panel.grid = element_blank()
)

Akurasi, Sensitivitas, dan Spesifisitas
ekstrak_metrik <- function(cm_obj, model_name) {
byclass <- as.data.frame(cm_obj$byClass)
kelas_names <- gsub("Class: ", "", rownames(byclass))
data.frame(
Model = model_name,
Kelas = kelas_names,
Sensitivitas = round(byclass$Sensitivity, 4),
Spesifisitas = round(byclass$Specificity, 4),
Precision = round(byclass$`Pos Pred Value`, 4),
F1_Score = round(byclass$F1, 4),
Balanced_Acc = round(byclass$`Balanced Accuracy`, 4)
)
}
metrik_lda <- ekstrak_metrik(cm_lda, "LDA")
metrik_multinom <- ekstrak_metrik(cm_multinom, "Multinom")
metrik_gabungan <- bind_rows(metrik_lda, metrik_multinom)
rownames(metrik_gabungan) <- NULL
cat("METRIK EVALUASI PER KELAS\n\n")
## METRIK EVALUASI PER KELAS
kable(metrik_gabungan, row.names = FALSE,
caption = "Akurasi, Sensitivitas, dan Spesifisitas per Kelas") %>%
kable_styling(bootstrap_options = c("striped", "hover", "bordered")) %>%
column_spec(1, bold = TRUE, background = "#f0f0f0") %>%
column_spec(3:4, bold = TRUE) %>%
collapse_rows(columns = 1, valign = "middle")
Akurasi, Sensitivitas, dan Spesifisitas per Kelas
|
Model
|
Kelas
|
Sensitivitas
|
Spesifisitas
|
Precision
|
F1_Score
|
Balanced_Acc
|
|
LDA
|
Rendah
|
1.0000
|
0.8261
|
0.9084
|
0.9520
|
0.9130
|
|
Menengah
|
0.8115
|
0.9913
|
0.9769
|
0.8866
|
0.9014
|
|
Tinggi
|
0.8000
|
1.0000
|
1.0000
|
0.8889
|
0.9000
|
|
Multinom
|
Rendah
|
1.0000
|
0.9973
|
0.9984
|
0.9992
|
0.9986
|
|
Menengah
|
0.9968
|
0.9957
|
0.9905
|
0.9936
|
0.9962
|
|
Tinggi
|
0.9455
|
1.0000
|
1.0000
|
0.9720
|
0.9727
|
Akurasi Keseluruhan
akurasi_lda <- round(cm_lda$overall["Accuracy"] * 100, 2)
akurasi_multinom <- round(cm_multinom$overall["Accuracy"] * 100, 2)
kappa_lda <- round(cm_lda$overall["Kappa"], 4)
kappa_multinom <- round(cm_multinom$overall["Kappa"], 4)
overall_tbl <- data.frame(
Model = c("LDA", "Multinomial Logistic Regression"),
Akurasi = c(paste0(akurasi_lda, "%"),
paste0(akurasi_multinom, "%")),
Kappa = c(kappa_lda, kappa_multinom),
Interpretasi_Kappa = c(
ifelse(kappa_lda >= 0.8, "Sangat Baik",
ifelse(kappa_lda >= 0.6, "Baik",
ifelse(kappa_lda >= 0.4, "Cukup", "Lemah"))),
ifelse(kappa_multinom >= 0.8, "Sangat Baik",
ifelse(kappa_multinom >= 0.6, "Baik",
ifelse(kappa_multinom >= 0.4, "Cukup", "Lemah")))
)
)
kable(overall_tbl, row.names = FALSE,
caption = "Akurasi Keseluruhan dan Cohen's Kappa") %>%
kable_styling(bootstrap_options = c("striped", "hover", "bordered")) %>%
column_spec(2:3, bold = TRUE) %>%
column_spec(4, bold = TRUE)
Akurasi Keseluruhan dan Cohen’s Kappa
|
Model
|
Akurasi
|
Kappa
|
Interpretasi_Kappa
|
|
LDA
|
93.02%
|
0.8532
|
Sangat Baik
|
|
Multinomial Logistic Regression
|
99.6%
|
0.9920
|
Sangat Baik
|
metrik_plot <- metrik_gabungan %>%
dplyr::select(Model, Kelas, Sensitivitas, Spesifisitas) %>%
pivot_longer(cols = c("Sensitivitas", "Spesifisitas"),
names_to = "Metrik",
values_to = "Nilai")
ggplot(metrik_plot, aes(x = Kelas, y = Nilai, fill = Model)) +
geom_col(position = "dodge", alpha = 0.85, color = "white") +
geom_text(aes(label = sprintf("%.3f", Nilai)),
position = position_dodge(width = 0.9),
vjust = -0.4, size = 3.5, fontface = "bold") +
facet_wrap(~ Metrik) +
scale_fill_manual(values = c("LDA" = "#3498db", "Multinom" = "#e67e22")) +
scale_y_continuous(limits = c(0, 1.12), labels = scales::percent_format()) +
theme_minimal(base_size = 13) +
labs(
title = "Sensitivitas & Spesifisitas per Kelas: LDA vs Multinom",
x = "Kelas",
y = "Nilai",
fill = "Model"
) +
theme(
plot.title = element_text(face = "bold"),
legend.position = "top",
strip.text = element_text(face = "bold", size = 12)
)

heatmap Metrik Evaluasi per Kelas
metrik_heatmap <- metrik_gabungan %>%
dplyr::select(Model, Kelas, Sensitivitas, Spesifisitas, Precision, F1_Score) %>%
pivot_longer(cols = c("Sensitivitas", "Spesifisitas", "Precision", "F1_Score"),
names_to = "Metrik",
values_to = "Nilai") %>%
mutate(
Nilai = replace_na(Nilai, 0),
Metrik = factor(Metrik, levels = c("Sensitivitas", "Spesifisitas",
"Precision", "F1_Score")),
Kelas = factor(Kelas, levels = c("Rendah", "Menengah", "Tinggi")),
Label = sprintf("%.3f", Nilai)
)
ggplot(metrik_heatmap, aes(x = Kelas, y = Metrik, fill = Nilai)) +
geom_tile(color = "white", linewidth = 1.2) +
geom_text(aes(label = Label,
color = ifelse(Nilai < 0.5, "black", "white")),
size = 4.5, fontface = "bold") +
scale_fill_gradient2(
low = "#E74C3C",
mid = "#F39C12",
high = "#1A5276",
midpoint = 0.75,
limits = c(0, 1),
labels = scales::percent_format()
) +
scale_color_identity() +
facet_wrap(~ Model, ncol = 2) +
theme_minimal(base_size = 13) +
labs(
title = "Heatmap Metrik Evaluasi per Kelas",
subtitle = "Perbandingan LDA vs Multinomial Logistic Regression",
x = "Kelas",
y = "Metrik Evaluasi",
fill = "Nilai"
) +
theme(
plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
plot.subtitle = element_text(size = 10, hjust = 0.5, color = "gray40"),
legend.position = "right",
legend.key.height = unit(1.5, "cm"),
strip.text = element_text(face = "bold", size = 12,
margin = margin(5, 5, 5, 5)),,
axis.text.x = element_text(size = 11, face = "bold"),
axis.text.y = element_text(size = 11),
panel.grid = element_blank(),
panel.spacing = unit(2, "lines")
)

Apparent Error Rate (AER) dan Cross-validated Error Rate (CVER)
Apparent Error Rate (AER)
# AER LDA
pred_train_lda <- predict(lda_model, newdata = train_data)$class
aer_lda <- mean(pred_train_lda != train_data$Kelas)
# AER Multinom
pred_train_multi <- predict(multinom_model, newdata = train_data, type = "class")
pred_train_multi <- factor(pred_train_multi, levels = c("Rendah", "Menengah", "Tinggi"))
aer_multinom <- mean(pred_train_multi != train_data$Kelas)
cat("APPARENT ERROR RATE (AER)\n\n")
## APPARENT ERROR RATE (AER)
cat(sprintf("AER — LDA : %.4f (%.2f%%)\n",
aer_lda, aer_lda * 100))
## AER — LDA : 0.0573 (5.73%)
cat(sprintf("AER — Multinom : %.4f (%.2f%%)\n",
aer_multinom, aer_multinom * 100))
## AER — Multinom : 0.0000 (0.00%)
Cross-validated Error Rate (CVER) — k-Fold Cross Validation
set.seed(42)
df_cv <- rbind(train_data, test_data)
df_cv$Kelas <- factor(df_cv$Kelas, levels = c("Rendah", "Menengah", "Tinggi"))
K <- 10 # 10-fold
folds <- createFolds(df_cv$Kelas, k = K, list = TRUE, returnTrain = FALSE)
#CVER LDA
err_lda_cv <- sapply(seq_len(K), function(i) {
idx_test <- folds[[i]]
idx_train <- setdiff(seq_len(nrow(df_cv)), idx_test)
tryCatch({
formula_tmp <- as.formula(
paste("Kelas ~",
paste(paste0("`", setdiff(names(df_cv), "Kelas"), "`"),
collapse = " + "))
)
model_tmp <- lda(formula_tmp, data = df_cv[idx_train, ])
pred_tmp <- predict(model_tmp, newdata = df_cv[idx_test, ])$class
mean(pred_tmp != df_cv$Kelas[idx_test])
}, error = function(e) NA)
})
cver_lda <- mean(err_lda_cv, na.rm = TRUE)
#CVER Multinom
err_multi_cv <- sapply(seq_len(K), function(i) {
idx_test <- folds[[i]]
idx_train <- setdiff(seq_len(nrow(df_cv)), idx_test)
tryCatch({
train_tmp <- df_cv[idx_train, ]
train_tmp$Kelas <- relevel(train_tmp$Kelas, ref = "Rendah")
model_tmp <- multinom(Kelas ~ ., data = train_tmp,
maxit = 500, trace = FALSE)
pred_tmp <- factor(predict(model_tmp, newdata = df_cv[idx_test, ],
type = "class"),
levels = c("Rendah", "Menengah", "Tinggi"))
mean(pred_tmp != df_cv$Kelas[idx_test])
}, error = function(e) NA)
})
cver_multinom <- mean(err_multi_cv, na.rm = TRUE)
cat("CROSS-VALIDATED ERROR RATE (10-Fold CV)\n\n")
## CROSS-VALIDATED ERROR RATE (10-Fold CV)
cat(sprintf("CVER — LDA : %.4f (%.2f%%)\n",
cver_lda, cver_lda * 100))
## CVER — LDA : 0.0616 (6.16%)
cat(sprintf("CVER — Multinom: %.4f (%.2f%%)\n",
cver_multinom, cver_multinom * 100))
## CVER — Multinom: 0.0020 (0.20%)
Rekapitulasi AER vs CVER vs Testing Error Rate
# Testing Error Rate
ter_lda <- 1 - cm_lda$overall["Accuracy"]
ter_multinom <- 1 - cm_multinom$overall["Accuracy"]
error_summary <- data.frame(
Model = c("LDA", "Multinomial Logistic Regression"),
AER = round(c(aer_lda, aer_multinom) * 100, 2),
CVER = round(c(cver_lda, cver_multinom) * 100, 2),
TER = round(c(ter_lda, ter_multinom) * 100, 2),
Akurasi_Testing = round(c(cm_lda$overall["Accuracy"],
cm_multinom$overall["Accuracy"]) * 100, 2)
)
names(error_summary) <- c("Model", "AER (%)", "CVER 10-Fold (%)",
"Testing Error Rate (%)", "Akurasi Testing (%)")
rownames(error_summary) <- NULL
kable(error_summary, row.names = FALSE,
caption = "Rekapitulasi Apparent Error Rate, CVER, dan Testing Error Rate") %>%
kable_styling(bootstrap_options = c("striped", "hover", "bordered")) %>%
column_spec(1, bold = TRUE) %>%
column_spec(2:4, bold = TRUE) %>%
column_spec(5, bold = TRUE)
Rekapitulasi Apparent Error Rate, CVER, dan Testing Error Rate
|
Model
|
AER (%)
|
CVER 10-Fold (%)
|
Testing Error Rate (%)
|
Akurasi Testing (%)
|
|
LDA
|
5.73
|
6.16
|
6.98
|
93.02
|
|
Multinomial Logistic Regression
|
0.00
|
0.20
|
0.40
|
99.60
|
Visualisasi Error Rate: AER vs CVER vs Testing
error_long <- error_summary %>%
dplyr::select(Model, `AER (%)`, `CVER 10-Fold (%)`, `Testing Error Rate (%)`) %>%
pivot_longer(cols = -Model,
names_to = "Jenis_Error",
values_to = "Error_Rate")
ggplot(error_long, aes(x = Model, y = Error_Rate, fill = Jenis_Error)) +
geom_col(position = "dodge", alpha = 0.85, color = "white", width = 0.6) +
geom_text(aes(label = sprintf("%.2f%%", Error_Rate)),
position = position_dodge(width = 0.6),
vjust = -0.5, size = 4, fontface = "bold") +
scale_fill_manual(values = c(
"AER (%)" = "#E74C3C",
"CVER 10-Fold (%)" = "#F39C12",
"Testing Error Rate (%)" = "#8E44AD"
)) +
scale_y_continuous(limits = c(0, max(error_long$Error_Rate) * 1.3),
labels = function(x) paste0(x, "%")) +
theme_minimal(base_size = 13) +
labs(
title = "Perbandingan Error Rate: AER vs CVER vs Testing Error Rate",
subtitle = "Nilai lebih rendah = Model lebih baik",
x = "Model",
y = "Error Rate (%)",
fill = "Jenis Error"
) +
theme(
plot.title = element_text(face = "bold"),
legend.position = "top"
)

Visualisasi Akurasi per Fold (Cross Validation)
cv_fold_df <- data.frame(
Fold = rep(1:K, 2),
Model = rep(c("LDA", "Multinom"), each = K),
Error = c(err_lda_cv * 100, err_multi_cv * 100),
Akurasi = c((1 - err_lda_cv) * 100, (1 - err_multi_cv) * 100)
)
ggplot(cv_fold_df, aes(x = factor(Fold), y = Akurasi,
color = Model, group = Model)) +
geom_line(linewidth = 1.2, alpha = 0.8) +
geom_point(size = 3.5) +
geom_text(aes(label = sprintf("%.1f%%", Akurasi)),
vjust = -0.8, size = 3.2, fontface = "bold") +
geom_hline(aes(yintercept = mean(Akurasi), color = Model),
linetype = "dashed", linewidth = 0.8, alpha = 0.6,
data = cv_fold_df %>% group_by(Model) %>%
summarise(Akurasi = mean(Akurasi, na.rm = TRUE))) +
scale_color_manual(values = c("LDA" = "#3498db", "Multinom" = "#e67e22")) +
scale_y_continuous(limits = c(
max(0, min(cv_fold_df$Akurasi, na.rm = TRUE) - 5), 105),
labels = function(x) paste0(x, "%")) +
theme_minimal(base_size = 13) +
labs(
title = "Akurasi per Fold — 10-Fold Cross Validation",
subtitle = "Garis putus-putus = rata-rata akurasi CV",
x = "Fold",
y = "Akurasi (%)",
color = "Model"
) +
theme(
plot.title = element_text(face = "bold"),
legend.position = "top"
)
