Analisis ini bertujuan untuk memodelkan tingkat keparahan kanker payudara menggunakan Regresi Logistik Multinomial dan membandingkannya dengan Analisis Diskriminan (LDA) sebagai metode pembanding.
Variabel Respon (Y): Tingkat Diagnosis
Benign = Tidak ganasMalignant_Ringan = Ganas dengan ukuran tumor kecil
(radius_worst ≤ median M)Malignant_Berat = Ganas dengan ukuran tumor besar
(radius_worst > median M)Variabel Prediktor: 21 pengukuran karakteristik sel tumor (radius, tekstur, perimeter, luas, kehalusan, kekompakan, konkavitas, simetri, dll.)
| Aspek | Logistik Multinomial | Analisis Diskriminan |
|---|---|---|
| Asumsi prediktor | Tidak ada | Normal multivariat |
| Jenis prediktor | Kategorik & kontinu | Sebaiknya kontinu |
| Output utama | Odds Ratio | Fungsi diskriminan |
| Evaluasi | Akurasi, Pseudo R² | Akurasi, proporsi terklasifikasi |
library(nnet)
library(car)
library(DescTools)
library(tidyverse)
library(caret)
library(knitr)
library(kableExtra)
library(MASS)
df <- read.csv(file.choose())
cat("Jumlah baris:", nrow(df), "\n")
## Jumlah baris: 1200
cat("Jumlah kolom:", ncol(df), "\n")
## Jumlah kolom: 22
Kelas Malignant dipecah berdasarkan median radius_worst
pada grup M:
median_r <- median(df$radius_worst[df$diagnosis == "M"])
cat("Median radius_worst (Malignant):", round(median_r, 3), "\n")
## Median radius_worst (Malignant): 17.001
df$Y <- case_when(
df$diagnosis == "B" ~ "Benign",
df$diagnosis == "M" & df$radius_worst <= median_r ~ "Malignant_Ringan",
df$diagnosis == "M" & df$radius_worst > median_r ~ "Malignant_Berat"
)
df$Y <- factor(df$Y, levels = c("Benign", "Malignant_Ringan", "Malignant_Berat"))
cat("Preprocessing selesai.\n")
## Preprocessing selesai.
prediktor <- c("radius_mean", "texture_mean", "perimeter_mean", "area_mean",
"smoothness_mean", "compactness_mean", "concavity_mean",
"concave_points_mean", "symmetry_mean", "fractal_dimension_mean",
"radius_se", "texture_se", "perimeter_se", "area_se",
"smoothness_se", "radius_worst", "texture_worst",
"perimeter_worst", "area_worst", "concavity_worst",
"concave_points_worst")
cat("Jumlah prediktor:", length(prediktor), "\n")
## Jumlah prediktor: 21
tabel_Y <- table(df$Y)
persen_Y <- round(prop.table(tabel_Y) * 100, 2)
tabel_respon <- data.frame(
Kategori = names(tabel_Y),
Frekuensi = as.numeric(tabel_Y),
Persentase = paste0(formatC(as.numeric(persen_Y), format = "f", digits = 2), "%")
)
tabel_respon %>%
kable(caption = "Tabel 1. Distribusi Variabel Respon (Y)") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Kategori | Frekuensi | Persentase |
|---|---|---|
| Benign | 612 | 51.00% |
| Malignant_Ringan | 294 | 24.50% |
| Malignant_Berat | 294 | 24.50% |
barplot(tabel_Y,
main = "Distribusi Tingkat Diagnosis Kanker Payudara",
xlab = "Kategori",
ylab = "Frekuensi",
col = c("steelblue", "orange", "tomato"),
names.arg = c("Benign", "Malignant\nRingan", "Malignant\nBerat"))
Karena prediktor bersifat kontinu, digunakan ANOVA one-way untuk menguji apakah rata-rata tiap prediktor berbeda signifikan antar kategori Y.
Hipotesis:
hasil_anova <- data.frame(
Variabel = prediktor,
F_value = NA,
Pvalue = NA,
Keputusan = NA
)
for (i in seq_along(prediktor)) {
var <- prediktor[i]
aov_res <- summary(aov(df[[var]] ~ df$Y))[[1]]
hasil_anova$F_value[i] <- round(aov_res$`F value`[1], 4)
hasil_anova$Pvalue[i] <- round(aov_res$`Pr(>F)`[1], 4)
hasil_anova$Keputusan[i] <- ifelse(aov_res$`Pr(>F)`[1] < 0.05,
"Tolak H0", "Gagal Tolak H0")
}
hasil_anova %>%
arrange(Pvalue) %>%
kable(caption = "Tabel 2. Hasil Uji ANOVA One-Way per Prediktor") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
row_spec(which(hasil_anova$Pvalue < 0.05), background = "#d4edda")
| Variabel | F_value | Pvalue | Keputusan |
|---|---|---|---|
| radius_worst | 295.3993 | 0.0000 | Tolak H0 |
| area_mean | 4.7331 | 0.0090 | Tolak H0 |
| radius_se | 3.3243 | 0.0363 | Tolak H0 |
| smoothness_mean | 2.4964 | 0.0828 | Gagal Tolak H0 |
| area_worst | 1.6605 | 0.1905 | Gagal Tolak H0 |
| concavity_worst | 1.5318 | 0.2166 | Gagal Tolak H0 |
| perimeter_mean | 1.3092 | 0.2704 | Gagal Tolak H0 |
| texture_mean | 1.0362 | 0.3551 | Gagal Tolak H0 |
| texture_worst | 0.9680 | 0.3801 | Gagal Tolak H0 |
| compactness_mean | 0.7693 | 0.4636 | Gagal Tolak H0 |
| concave_points_worst | 0.6287 | 0.5334 | Gagal Tolak H0 |
| perimeter_worst | 0.5719 | 0.5646 | Gagal Tolak H0 |
| smoothness_se | 0.4482 | 0.6389 | Gagal Tolak H0 |
| radius_mean | 0.3875 | 0.6788 | Gagal Tolak H0 |
| fractal_dimension_mean | 0.3828 | 0.6821 | Gagal Tolak H0 |
| perimeter_se | 0.3664 | 0.6933 | Gagal Tolak H0 |
| area_se | 0.1974 | 0.8209 | Gagal Tolak H0 |
| texture_se | 0.1626 | 0.8500 | Gagal Tolak H0 |
| symmetry_mean | 0.1408 | 0.8687 | Gagal Tolak H0 |
| concavity_mean | 0.1139 | 0.8924 | Gagal Tolak H0 |
| concave_points_mean | 0.0900 | 0.9140 | Gagal Tolak H0 |
var_signifikan <- hasil_anova$Variabel[hasil_anova$Pvalue < 0.05]
cat("\nVariabel signifikan:", paste(var_signifikan, collapse = ", "), "\n")
##
## Variabel signifikan: area_mean, radius_se, radius_worst
cat("Jumlah:", length(var_signifikan), "variabel\n")
## Jumlah: 3 variabel
Model dibentuk menggunakan variabel signifikan dari ANOVA dengan
kategori referensi Y = Benign.
df$Y <- relevel(df$Y, ref = "Benign")
formula_model <- as.formula(paste("Y ~", paste(var_signifikan, collapse = " + ")))
cat("Formula model:\n")
## Formula model:
print(formula_model)
## Y ~ area_mean + radius_se + radius_worst
model <- multinom(formula_model, data = df, maxit = 300, trace = FALSE)
summary(model)
## Call:
## multinom(formula = formula_model, data = df, maxit = 300, trace = FALSE)
##
## Coefficients:
## (Intercept) area_mean radius_se radius_worst
## Malignant_Ringan 4.294523 -0.0006491655 -0.7033361 -0.2815694
## Malignant_Berat -6.362704 -0.0014998199 0.9877892 0.3316919
##
## Std. Errors:
## (Intercept) area_mean radius_se radius_worst
## Malignant_Ringan 0.5482204 0.0005220224 0.6491101 0.02583170
## Malignant_Berat 0.5059690 0.0005338467 0.4475778 0.02595509
##
## Residual Deviance: 1985.882
## AIC: 2001.882
Hipotesis:
model_null <- multinom(Y ~ 1, data = df, maxit = 300, trace = FALSE)
lrt <- deviance(model_null) - deviance(model)
df_lrt <- attr(logLik(model), "df") - attr(logLik(model_null), "df")
p_lrt <- pchisq(lrt, df = df_lrt, lower.tail = FALSE)
hasil_serentak <- data.frame(
Statistik = c("G² (Chi-Square)", "Derajat Bebas", "P-value", "Keputusan"),
Nilai = c(round(lrt, 4), df_lrt, round(p_lrt, 4),
ifelse(p_lrt < 0.05, "Tolak H0 - Model Signifikan", "Gagal Tolak H0"))
)
hasil_serentak %>%
kable(caption = "Tabel 3. Hasil Uji Serentak (Likelihood Ratio Test)") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Statistik | Nilai |
|---|---|
| G² (Chi-Square) | 492.3328 |
| Derajat Bebas | 6 |
| P-value | 0 |
| Keputusan | Tolak H0 - Model Signifikan |
Hipotesis:
z <- summary(model)$coefficients / summary(model)$standard.errors
p_value_wald <- (1 - pnorm(abs(z))) * 2
cat("=== Nilai Wald (Z) ===\n")
## === Nilai Wald (Z) ===
print(round(z, 4))
## (Intercept) area_mean radius_se radius_worst
## Malignant_Ringan 7.8336 -1.2436 -1.0835 -10.9001
## Malignant_Berat -12.5753 -2.8095 2.2070 12.7795
cat("\n=== P-value ===\n")
##
## === P-value ===
print(round(p_value_wald, 4))
## (Intercept) area_mean radius_se radius_worst
## Malignant_Ringan 0 0.2137 0.2786 0
## Malignant_Berat 0 0.0050 0.0273 0
ll_model <- logLik(model)
ll_null <- logLik(model_null)
n <- nrow(df)
r2_cs <- 1 - exp((2/n) * (as.numeric(ll_null) - as.numeric(ll_model)))
r2_max <- 1 - exp((2/n) * as.numeric(ll_null))
r2_nagelkerke <- r2_cs / r2_max
hasil_r2 <- data.frame(
Metode = c("Cox & Snell R²", "Nagelkerke R²"),
Nilai = c(round(r2_cs, 4), round(r2_nagelkerke, 4))
)
hasil_r2 %>%
kable(caption = "Tabel 4. Pseudo R-Square") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Metode | Nilai |
|---|---|
| Cox & Snell R² | 0.3365 |
| Nagelkerke R² | 0.3854 |
cat("Model mampu menjelaskan", round(r2_nagelkerke * 100, 2),
"% variasi variabel respon\n")
## Model mampu menjelaskan 38.54 % variasi variabel respon
Odds Ratio = exp(β). Interpretasi:
odds_ratio <- exp(coef(model))
odds_ratio %>%
round(4) %>%
kable(caption = "Tabel 5. Odds Ratio (exp(β))") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE)
| (Intercept) | area_mean | radius_se | radius_worst | |
|---|---|---|---|---|
| Malignant_Ringan | 73.2972 | 0.9994 | 0.4949 | 0.7546 |
| Malignant_Berat | 0.0017 | 0.9985 | 2.6853 | 1.3933 |
pred <- predict(model, newdata = df)
conf_matrix <- table(Prediksi = pred, Aktual = df$Y)
conf_matrix %>%
kable(caption = "Tabel 6. Confusion Matrix - Regresi Logistik Multinomial") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Benign | Malignant_Ringan | Malignant_Berat | |
|---|---|---|---|
| Benign | 425 | 209 | 179 |
| Malignant_Ringan | 92 | 85 | 0 |
| Malignant_Berat | 95 | 0 | 115 |
akurasi <- mean(pred == df$Y)
cat("\nAkurasi Model:", round(akurasi * 100, 2), "%\n")
##
## Akurasi Model: 52.08 %
Analisis Diskriminan dilakukan sebagai metode pembanding menggunakan variabel prediktor yang sama dengan model multinomial (variabel signifikan dari ANOVA).
# Siapkan data untuk LDA
df_lda_model <- df[, c("Y", var_signifikan)]
df_lda_model$Y <- factor(df_lda_model$Y)
# Fit model LDA
lda_model <- lda(Y ~ ., data = df_lda_model)
print(lda_model)
## Call:
## lda(Y ~ ., data = df_lda_model)
##
## Prior probabilities of groups:
## Benign Malignant_Ringan Malignant_Berat
## 0.510 0.245 0.245
##
## Group means:
## area_mean radius_se radius_worst
## Benign 652.1719 0.4022801 16.84441
## Malignant_Ringan 638.3248 0.3927571 13.91439
## Malignant_Berat 620.2158 0.4135611 20.46774
##
## Coefficients of linear discriminants:
## LD1 LD2
## area_mean -0.0004900696 0.006769143
## radius_se 0.8752039186 0.319213149
## radius_worst 0.3026746693 0.020365161
##
## Proportion of trace:
## LD1 LD2
## 0.9891 0.0109
# Prediksi LDA
pred_lda <- predict(lda_model, newdata = df_lda_model)
conf_lda <- table(Prediksi = pred_lda$class, Aktual = df_lda_model$Y)
conf_lda %>%
kable(caption = "Tabel 7. Confusion Matrix - Analisis Diskriminan") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Benign | Malignant_Ringan | Malignant_Berat | |
|---|---|---|---|
| Benign | 429 | 212 | 179 |
| Malignant_Ringan | 87 | 82 | 0 |
| Malignant_Berat | 96 | 0 | 115 |
akurasi_lda <- mean(pred_lda$class == df_lda_model$Y)
cat("\nAkurasi Analisis Diskriminan:", round(akurasi_lda * 100, 2), "%\n")
##
## Akurasi Analisis Diskriminan: 52.17 %
tabel_perbandingan <- data.frame(
Metode = c("Regresi Logistik Multinomial", "Analisis Diskriminan (LDA)"),
Akurasi = c(paste0(round(akurasi * 100, 2), "%"),
paste0(round(akurasi_lda * 100, 2), "%")),
Kelebihan = c("Tidak perlu asumsi normalitas, prediktor kontinu maupun kategorik",
"Stabil pada data dengan distribusi normal multivariat"),
Kesimpulan = c(ifelse(akurasi >= akurasi_lda, "✅ Lebih Baik", "❌ Lebih Rendah"),
ifelse(akurasi_lda > akurasi, "✅ Lebih Baik", "❌ Lebih Rendah"))
)
tabel_perbandingan %>%
kable(caption = "Tabel 8. Perbandingan Regresi Logistik Multinomial vs Analisis Diskriminan") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE)
| Metode | Akurasi | Kelebihan | Kesimpulan |
|---|---|---|---|
| Regresi Logistik Multinomial | 52.08% | Tidak perlu asumsi normalitas, prediktor kontinu maupun kategorik | ❌ Lebih Rendah | |
| Analisis Diskriminan (LDA) | 52.17% | Stabil pada data dengan distribusi normal multivariat | ✅ Lebih Baik | |
cat("\nKesimpulan Perbandingan:\n")
##
## Kesimpulan Perbandingan:
if (akurasi > akurasi_lda) {
cat("Regresi Logistik Multinomial menghasilkan akurasi lebih tinggi (",
round(akurasi * 100, 2), "%) dibandingkan Analisis Diskriminan (",
round(akurasi_lda * 100, 2), "%).\n")
cat("Hal ini sesuai karena LDA mengasumsikan normalitas multivariat yang\n")
cat("mungkin tidak sepenuhnya terpenuhi pada data ini.\n")
} else {
cat("Analisis Diskriminan menghasilkan akurasi lebih tinggi (",
round(akurasi_lda * 100, 2), "%) dibandingkan Regresi Logistik Multinomial (",
round(akurasi * 100, 2), "%).\n")
}
## Analisis Diskriminan menghasilkan akurasi lebih tinggi ( 52.17 %) dibandingkan Regresi Logistik Multinomial ( 52.08 %).
## ## Ringkasan Hasil Analisis:
## - Total observasi : 1200
## - Variabel signifikan (ANOVA): 3 variabel
## - Kategori referensi : Benign
## - Model logit terbentuk : 2 model (Malignant_Ringan vs Benign, Malignant_Berat vs Benign)
## - Akurasi Logistik Multinomial: 52.08 %
## - Akurasi Analisis Diskriminan: 52.17 %
## - Metode terbaik : Analisis Diskriminan (LDA)
Analisis menggunakan dataset Breast Cancer Research | R version 4.5.2