#install.packages(c("tidyverse", "ggplot2", "corrplot", "car", "nnet", "reshape2", "gridExtra"))
library(tidyverse)
## Warning: package 'readr' was built under R version 4.5.3
## Warning: package 'dplyr' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(corrplot)
## corrplot 0.95 loaded
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(nnet)
library(reshape2)
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
library(gridExtra)
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
OPEN FILE
df <- read.csv("glass.csv",
stringsAsFactors = FALSE,
fileEncoding = "UTF-8-BOM")
head(df)
## RI Na Mg Al Si K Ca Ba Fe Type
## 1 1.52101 13.64 4.49 1.10 71.78 0.06 8.75 0 0.00 1
## 2 1.51761 13.89 3.60 1.36 72.73 0.48 7.83 0 0.00 1
## 3 1.51618 13.53 3.55 1.54 72.99 0.39 7.78 0 0.00 1
## 4 1.51766 13.21 3.69 1.29 72.61 0.57 8.22 0 0.00 1
## 5 1.51742 13.27 3.62 1.24 73.08 0.55 8.07 0 0.00 1
## 6 1.51596 12.79 3.61 1.62 72.97 0.64 8.07 0 0.26 1
PRE-PROCESSING
df <- df %>%
filter(Type %in% c(1, 2, 3)) %>%
mutate(
Type = factor(Type,
levels = c(1, 2, 3),
labels = c("bldg_float",
"bldg_non_float",
"vehicle_float"))
) %>%
drop_na()
df$Type <- relevel(df$Type, ref = "bldg_float")
cat("Dimensi setelah filter 3 kategori:\n")
## Dimensi setelah filter 3 kategori:
cat(" ", nrow(df), "baris x", ncol(df), "kolom\n\n")
## 163 baris x 10 kolom
warna <- c("bldg_float" = "#2A9D8F",
"bldg_non_float" = "#E76F51",
"vehicle_float" = "#457B9D")
str(df)
## 'data.frame': 163 obs. of 10 variables:
## $ RI : num 1.52 1.52 1.52 1.52 1.52 ...
## $ Na : num 13.6 13.9 13.5 13.2 13.3 ...
## $ Mg : num 4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
## $ Al : num 1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
## $ Si : num 71.8 72.7 73 72.6 73.1 ...
## $ K : num 0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
## $ Ca : num 8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
## $ Ba : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Fe : num 0 0 0 0 0 0.26 0 0 0 0.11 ...
## $ Type: Factor w/ 3 levels "bldg_float","bldg_non_float",..: 1 1 1 1 1 1 1 1 1 1 ...
summary(df)
## RI Na Mg Al
## Min. :1.512 Min. :10.73 Min. :0.000 Min. :0.290
## 1st Qu.:1.517 1st Qu.:12.87 1st Qu.:3.425 1st Qu.:1.145
## Median :1.518 Median :13.21 Median :3.540 Median :1.290
## Mean :1.519 Mean :13.20 Mean :3.295 Mean :1.282
## 3rd Qu.:1.519 3rd Qu.:13.52 3rd Qu.:3.650 3rd Qu.:1.490
## Max. :1.534 Max. :14.86 Max. :4.490 Max. :2.120
## Si K Ca Ba
## Min. :69.81 Min. :0.0000 Min. : 7.080 Min. :0.00000
## 1st Qu.:72.19 1st Qu.:0.3850 1st Qu.: 8.210 1st Qu.:0.00000
## Median :72.74 Median :0.5700 Median : 8.550 Median :0.00000
## Mean :72.59 Mean :0.4775 Mean : 8.925 Mean :0.02982
## 3rd Qu.:73.00 3rd Qu.:0.6100 3rd Qu.: 9.010 3rd Qu.:0.00000
## Max. :74.45 Max. :1.1000 Max. :16.190 Max. :3.15000
## Fe Type
## Min. :0.00000 bldg_float :70
## 1st Qu.:0.00000 bldg_non_float:76
## Median :0.00000 vehicle_float :17
## Mean :0.06761
## 3rd Qu.:0.13000
## Max. :0.37000
tbl <- table(df$Type)
print(data.frame(
Jenis_Kaca = names(tbl),
Frekuensi = as.integer(tbl),
Persen = paste0(round(prop.table(tbl) * 100, 1), "%")
))
## Jenis_Kaca Frekuensi Persen
## 1 bldg_float 70 42.9%
## 2 bldg_non_float 76 46.6%
## 3 vehicle_float 17 10.4%
make_bp <- function(var, lab) {
ggplot(df, aes_string(x = "Type", y = var, fill = "Type")) +
geom_boxplot(alpha = 0.8, outlier.color = "red",
outlier.shape = 4, outlier.size = 1.5) +
scale_fill_manual(values = warna) +
scale_x_discrete(labels = c("Float","Non-Float","Vehicle")) +
labs(title = lab, x = "", y = lab) +
theme_bw(base_size = 9) +
theme(legend.position = "none")
}
make_bp("RI", "Refractive Index")
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

PEMODELAN
num_vars <- c("RI","Na","Mg","Al","Si","K","Ca","Ba","Fe")
num_df <- df %>% dplyr::select(all_of(num_vars))
df_bt <- df %>%
mutate(
RI_log = RI * log(RI),
Na_log = Na * log(Na + 0.001),
Mg_log = Mg * log(Mg + 0.001),
Al_log = Al * log(Al + 0.001),
Si_log = Si * log(Si + 0.001),
K_log = K * log(K + 0.001),
Ca_log = Ca * log(Ca + 0.001),
Ba_log = Ba * log(Ba + 0.001),
Fe_log = Fe * log(Fe + 0.001)
)
log_terms <- c("RI_log","Na_log","Mg_log","Al_log","Si_log",
"K_log","Ca_log","Ba_log","Fe_log")
orig_terms <- num_vars
form_bt <- as.formula(paste(
"y_bin ~",
paste(c(orig_terms, log_terms), collapse = " + ")
))
df_bt12 <- df_bt %>%
filter(Type %in% c("bldg_float","bldg_non_float")) %>%
mutate(y_bin = as.integer(Type == "bldg_non_float"))
bt12 <- glm(form_bt, data = df_bt12, family = binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
p_bt12 <- summary(bt12)$coefficients[log_terms, 4]
df_bt13 <- df_bt %>%
filter(Type %in% c("bldg_float","vehicle_float")) %>%
mutate(y_bin = as.integer(Type == "vehicle_float"))
bt13 <- glm(form_bt, data = df_bt13, family = binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
p_bt13 <- summary(bt13)$coefficients[log_terms, 4]
UJI ASUMSI
bt_tbl <- data.frame(
Variabel = num_vars,
p_T2_vs_T1 = round(p_bt12, 4),
p_T3_vs_T1 = round(p_bt13, 4),
Status_T2_vs_T1 = ifelse(p_bt12 > 0.05, "✓ Linear", "✗ Tidak"),
Status_T3_vs_T1 = ifelse(p_bt13 > 0.05, "✓ Linear", "✗ Tidak"),
row.names = NULL
)
cat("Hasil Box-Tidwell (p-value X*ln(X)):\n")
## Hasil Box-Tidwell (p-value X*ln(X)):
print(bt_tbl)
## Variabel p_T2_vs_T1 p_T3_vs_T1 Status_T2_vs_T1 Status_T3_vs_T1
## 1 RI 0.2747 0.0155 ✓ Linear ✗ Tidak
## 2 Na 0.1696 0.0926 ✓ Linear ✓ Linear
## 3 Mg 0.0424 0.6920 ✗ Tidak ✓ Linear
## 4 Al 0.2455 0.4313 ✓ Linear ✓ Linear
## 5 Si 0.3851 0.0398 ✓ Linear ✗ Tidak
## 6 K 0.8261 0.1037 ✓ Linear ✓ Linear
## 7 Ca 0.0177 0.3211 ✗ Tidak ✓ Linear
## 8 Ba 0.3265 0.8968 ✓ Linear ✓ Linear
## 9 Fe 0.2972 0.6534 ✓ Linear ✓ Linear
cat("─── ASUMSI 1: LINEARITAS ───\n")
## ─── ASUMSI 1: LINEARITAS ───
n_nl <- sum(p_bt12 < 0.05) + sum(p_bt13 < 0.05)
cat(sprintf("\n=> %d dari 18 uji tidak linear\n", n_nl))
##
## => 4 dari 18 uji tidak linear
cat(ifelse(n_nl <= 4,
"=> Mayoritas LINEAR → Asumsi TERPENUHI ✓\n\n",
"=> Banyak tidak linear, pertimbangkan transformasi\n\n"))
## => Mayoritas LINEAR → Asumsi TERPENUHI ✓
cat("─── ASUMSI 2: INDEPENDENSI OBSERVASI ───\n")
## ─── ASUMSI 2: INDEPENDENSI OBSERVASI ───
n_dup <- sum(duplicated(df))
cat("Jumlah duplikat:", n_dup, "\n")
## Jumlah duplikat: 1
cat("Setiap baris = 1 sampel kaca yang berbeda\n")
## Setiap baris = 1 sampel kaca yang berbeda
cat("Diambil dari laboratorium forensik independen\n")
## Diambil dari laboratorium forensik independen
cat("=> Asumsi Independensi:",
ifelse(n_dup == 0, "TERPENUHI ✓\n\n",
paste("Ada", n_dup, "duplikat → dihapus\n\n")))
## => Asumsi Independensi: Ada 1 duplikat → dihapus
if (n_dup > 0) df <- distinct(df)
cat("─── ASUMSI 3: TIDAK ADA MULTIKOLINEARITAS ───\n")
## ─── ASUMSI 3: TIDAK ADA MULTIKOLINEARITAS ───
cat("Metode : Korelasi Pearson + VIF\n")
## Metode : Korelasi Pearson + VIF
cat("Kriteria: |r| < 0.8 dan VIF < 10\n\n")
## Kriteria: |r| < 0.8 dan VIF < 10
cor_mat <- cor(num_df)
cat("Matriks Korelasi:\n")
## Matriks Korelasi:
print(round(cor_mat, 3))
## RI Na Mg Al Si K Ca Ba Fe
## RI 1.000 -0.069 -0.567 -0.462 -0.691 -0.622 0.882 0.327 0.109
## Na -0.069 1.000 0.340 -0.272 -0.359 -0.367 -0.279 -0.301 -0.194
## Mg -0.567 0.340 1.000 0.006 0.133 0.216 -0.828 -0.287 -0.139
## Al -0.462 -0.272 0.006 1.000 0.171 0.713 -0.376 0.200 0.054
## Si -0.691 -0.359 0.133 0.171 1.000 0.426 -0.450 -0.354 -0.099
## K -0.622 -0.367 0.216 0.713 0.426 1.000 -0.534 0.012 0.075
## Ca 0.882 -0.279 -0.828 -0.376 -0.450 -0.534 1.000 0.241 0.139
## Ba 0.327 -0.301 -0.287 0.200 -0.354 0.012 0.241 1.000 0.174
## Fe 0.109 -0.194 -0.139 0.054 -0.099 0.075 0.139 0.174 1.000
# Deteksi pasangan korelasi tinggi
cat("\nPasangan variabel dengan |r| > 0.6:\n")
##
## Pasangan variabel dengan |r| > 0.6:
cor_up <- which(abs(cor_mat) > 0.6 & upper.tri(cor_mat), arr.ind = TRUE)
if (nrow(cor_up) == 0) {
cat(" Tidak ada\n")
} else {
for (i in seq_len(nrow(cor_up))) {
r1 <- rownames(cor_mat)[cor_up[i,1]]
r2 <- colnames(cor_mat)[cor_up[i,2]]
cat(sprintf(" %s & %s : r = %.3f\n",
r1, r2, cor_mat[cor_up[i,1], cor_up[i,2]]))
}
}
## RI & Si : r = -0.691
## RI & K : r = -0.622
## Al & K : r = 0.713
## RI & Ca : r = 0.882
## Mg & Ca : r = -0.828
Cek VIF
lm_vif <- lm(as.numeric(Type) ~ RI + Na + Mg + Al + Si + K + Ca + Ba + Fe,
data = df)
vif_v <- vif(lm_vif)
vif_tbl <- data.frame(
Variabel = names(vif_v),
VIF = round(vif_v, 3),
Status = ifelse(vif_v < 5, "✓ OK (< 5)",
ifelse(vif_v < 10, "⚠ Perhatian (5–10)",
"✗ Masalah (> 10)"))
)
cat("\nVIF:\n")
##
## VIF:
print(vif_tbl)
## Variabel VIF Status
## RI RI 13.258 ✗ Masalah (> 10)
## Na Na 31.051 ✗ Masalah (> 10)
## Mg Mg 75.729 ✗ Masalah (> 10)
## Al Al 12.162 ✗ Masalah (> 10)
## Si Si 37.072 ✗ Masalah (> 10)
## K K 7.804 ⚠ Perhatian (5–10)
## Ca Ca 199.254 ✗ Masalah (> 10)
## Ba Ba 7.513 ⚠ Perhatian (5–10)
## Fe Fe 1.148 ✓ OK (< 5)
cat("=> Asumsi Multikolinearitas:",
ifelse(all(vif_v < 10), "TERPENUHI ✓\n\n", "PERLU DITANGANI\n\n"))
## => Asumsi Multikolinearitas: PERLU DITANGANI
Cek Outliers
num_vars <- c("RI", "Na", "Mg", "Al", "Si", "K", "Ca", "Ba", "Fe")
z_df <- df %>% dplyr::select(all_of(num_vars)) %>%
mutate(across(everything(), ~abs(scale(.)[,1])))
out_z <- z_df %>% summarise(across(everything(), ~sum(. > 3)))
out_tbl <- data.frame(
Variabel = names(out_z),
Outlier_Zscore = as.integer(out_z),
Status = ifelse(as.integer(out_z) == 0, "✓ Tidak ada", "⚠ Ada")
)
cat("Outlier per variabel (|z| > 3):\n")
## Outlier per variabel (|z| > 3):
print(out_tbl)
## Variabel Outlier_Zscore Status
## 1 RI 3 ⚠ Ada
## 2 Na 3 ⚠ Ada
## 3 Mg 9 ⚠ Ada
## 4 Al 1 ⚠ Ada
## 5 Si 3 ⚠ Ada
## 6 K 0 ✓ Tidak ada
## 7 Ca 7 ⚠ Ada
## 8 Ba 1 ⚠ Ada
## 9 Fe 1 ⚠ Ada
df_clean <- df %>%
filter(if_all(all_of(num_vars), ~ abs(scale(.)[,1]) <= 3))
cat(sprintf("\nBaris sebelum hapus outlier : %d\n", nrow(df)))
##
## Baris sebelum hapus outlier : 162
cat(sprintf("Baris setelah hapus outlier : %d\n", nrow(df_clean)))
## Baris setelah hapus outlier : 150
cat(sprintf("Outlier yang dihapus : %d baris\n\n", nrow(df) - nrow(df_clean)))
## Outlier yang dihapus : 12 baris
vars_final <- c("RI", "Na", "Mg", "Al", "K", "Ba", "Fe")
num_df_clean <- df_clean %>% dplyr::select(all_of(vars_final))
Cek Korelasi
cor_mat <- cor(num_df_clean)
cat("Matriks Korelasi:\n")
## Matriks Korelasi:
print(round(cor_mat, 3))
## RI Na Mg Al K Ba Fe
## RI 1.000 0.410 -0.010 -0.644 -0.692 0.105 -0.024
## Na 0.410 1.000 0.053 -0.230 -0.534 0.111 -0.169
## Mg -0.010 0.053 1.000 -0.254 -0.128 -0.111 -0.098
## Al -0.644 -0.230 -0.254 1.000 0.651 0.012 -0.051
## K -0.692 -0.534 -0.128 0.651 1.000 -0.139 0.040
## Ba 0.105 0.111 -0.111 0.012 -0.139 1.000 0.052
## Fe -0.024 -0.169 -0.098 -0.051 0.040 0.052 1.000
cat("\nPasangan variabel dengan |r| > 0.6:\n")
##
## Pasangan variabel dengan |r| > 0.6:
cor_up <- which(abs(cor_mat) > 0.6 & upper.tri(cor_mat), arr.ind = TRUE)
if (nrow(cor_up) == 0) {
cat(" Tidak ada\n")
} else {
for (i in seq_len(nrow(cor_up))) {
r1 <- rownames(cor_mat)[cor_up[i,1]]
r2 <- colnames(cor_mat)[cor_up[i,2]]
cat(sprintf(" %s & %s : r = %.3f\n",
r1, r2, cor_mat[cor_up[i,1], cor_up[i,2]]))
}
}
## RI & Al : r = -0.644
## RI & K : r = -0.692
## Al & K : r = 0.651
Cek FIV ke-2
lm_vif <- lm(as.numeric(Type) ~ RI + Na + Mg + Al + Si + K + Ca + Ba + Fe,
data = df_clean) # <-- df_clean
vif_v <- vif(lm_vif)
vif_tbl <- data.frame(
Variabel = names(vif_v),
VIF = round(vif_v, 3),
Status = ifelse(vif_v < 5, "✓ OK (< 5)",
ifelse(vif_v < 10, "⚠ Perhatian (5–10)",
"✗ Masalah (> 10)"))
)
cat("\nVIF:\n")
##
## VIF:
print(vif_tbl)
## Variabel VIF Status
## RI RI 5.885 ⚠ Perhatian (5–10)
## Na Na 18.137 ✗ Masalah (> 10)
## Mg Mg 16.832 ✗ Masalah (> 10)
## Al Al 10.642 ✗ Masalah (> 10)
## Si Si 23.731 ✗ Masalah (> 10)
## K K 6.393 ⚠ Perhatian (5–10)
## Ca Ca 46.013 ✗ Masalah (> 10)
## Ba Ba 1.368 ✓ OK (< 5)
## Fe Fe 1.108 ✓ OK (< 5)
cat("=> Asumsi Multikolinearitas:",
ifelse(all(vif_v < 10), "TERPENUHI ✓\n\n", "PERLU DITANGANI\n\n"))
## => Asumsi Multikolinearitas: PERLU DITANGANI
Iterasi
cat("=== Iterasi 1: Hapus Ca ===\n")
## === Iterasi 1: Hapus Ca ===
lm_vif2 <- lm(as.numeric(Type) ~ RI + Na + Mg + Al + Si + K + Ba + Fe,
data = df_clean) # Ca dihapus
vif_v2 <- vif(lm_vif2)
vif_tbl2 <- data.frame(
Variabel = names(vif_v2),
VIF = round(vif_v2, 3),
Status = ifelse(vif_v2 < 5, "✓ OK (< 5)",
ifelse(vif_v2 < 10, "⚠ Perhatian (5–10)",
"✗ Masalah (> 10)"))
)
print(vif_tbl2)
## Variabel VIF Status
## RI RI 4.815 ✓ OK (< 5)
## Na Na 3.298 ✓ OK (< 5)
## Mg Mg 1.323 ✓ OK (< 5)
## Al Al 2.802 ✓ OK (< 5)
## Si Si 5.142 ⚠ Perhatian (5–10)
## K K 2.810 ✓ OK (< 5)
## Ba Ba 1.060 ✓ OK (< 5)
## Fe Fe 1.091 ✓ OK (< 5)
cat("=> Masih ada VIF > 10:", sum(vif_v2 > 10), "variabel\n\n")
## => Masih ada VIF > 10: 0 variabel
cat("=== Iterasi 2: Hapus Si ===\n")
## === Iterasi 2: Hapus Si ===
lm_vif3 <- lm(as.numeric(Type) ~ RI + Na + Mg + Al + K + Ba + Fe,
data = df_clean) # Ca dan Si dihapus
vif_v3 <- vif(lm_vif3)
vif_tbl3 <- data.frame(
Variabel = names(vif_v3),
VIF = round(vif_v3, 3),
Status = ifelse(vif_v3 < 5, "✓ OK (< 5)",
ifelse(vif_v3 < 10, "⚠ Perhatian (5–10)",
"✗ Masalah (> 10)"))
)
print(vif_tbl3)
## Variabel VIF Status
## RI RI 2.392 ✓ OK (< 5)
## Na Na 1.518 ✓ OK (< 5)
## Mg Mg 1.168 ✓ OK (< 5)
## Al Al 2.355 ✓ OK (< 5)
## K K 2.778 ✓ OK (< 5)
## Ba Ba 1.057 ✓ OK (< 5)
## Fe Fe 1.060 ✓ OK (< 5)
cat("=> Masih ada VIF > 10:", sum(vif_v3 > 10), "variabel\n\n")
## => Masih ada VIF > 10: 0 variabel
cat("=== PERBANDINGAN ===\n")
## === PERBANDINGAN ===
cat(sprintf("Tanpa Ca : maks VIF = %.3f | variabel masalah = %d\n",
max(vif_v2), sum(vif_v2 > 10)))
## Tanpa Ca : maks VIF = 5.142 | variabel masalah = 0
cat(sprintf("Tanpa Ca & Si : maks VIF = %.3f | variabel masalah = %d\n",
max(vif_v3), sum(vif_v3 > 10)))
## Tanpa Ca & Si : maks VIF = 2.778 | variabel masalah = 0
# Simpan variabel final sesuai hasil terbaik
# Jika iterasi 1 cukup (semua VIF < 10):
vars_final <- c("RI", "Na", "Mg", "Al", "Si", "K", "Ba", "Fe") # tanpa Ca
df_final <- df_clean %>% dplyr::select(all_of(vars_final), Type)
cat("\nDataset final:", nrow(df_final), "baris x", ncol(df_final), "kolom\n")
##
## Dataset final: 150 baris x 9 kolom
cat("Variabel prediktor:", paste(vars_final, collapse = ", "), "\n")
## Variabel prediktor: RI, Na, Mg, Al, Si, K, Ba, Fe
# Correlation matrix tanpa Ca
num_vars_noCa <- c("RI", "Na", "Mg", "Al", "K", "Ba", "Fe")
num_df_noCa <- df_clean %>% dplyr::select(all_of(num_vars_noCa))
cor_mat_noCa <- cor(num_df_noCa)
cat("Matriks Korelasi (tanpa Ca):\n")
## Matriks Korelasi (tanpa Ca):
print(round(cor_mat_noCa, 3))
## RI Na Mg Al K Ba Fe
## RI 1.000 0.410 -0.010 -0.644 -0.692 0.105 -0.024
## Na 0.410 1.000 0.053 -0.230 -0.534 0.111 -0.169
## Mg -0.010 0.053 1.000 -0.254 -0.128 -0.111 -0.098
## Al -0.644 -0.230 -0.254 1.000 0.651 0.012 -0.051
## K -0.692 -0.534 -0.128 0.651 1.000 -0.139 0.040
## Ba 0.105 0.111 -0.111 0.012 -0.139 1.000 0.052
## Fe -0.024 -0.169 -0.098 -0.051 0.040 0.052 1.000
cat("\nPasangan variabel dengan |r| > 0.6:\n")
##
## Pasangan variabel dengan |r| > 0.6:
cor_up <- which(abs(cor_mat_noCa) > 0.6 & upper.tri(cor_mat_noCa), arr.ind = TRUE)
if (nrow(cor_up) == 0) {
cat(" Tidak ada\n")
} else {
for (i in seq_len(nrow(cor_up))) {
r1 <- rownames(cor_mat_noCa)[cor_up[i, 1]]
r2 <- colnames(cor_mat_noCa)[cor_up[i, 2]]
cat(sprintf(" %s & %s : r = %.3f\n",
r1, r2, cor_mat_noCa[cor_up[i,1], cor_up[i,2]]))
}
}
## RI & Al : r = -0.644
## RI & K : r = -0.692
## Al & K : r = 0.651
# Dataset final siap untuk model
vars_final <- c("RI", "Na", "Mg", "Al", "K", "Ba", "Fe")
df_final <- df_clean %>% dplyr::select(all_of(vars_final), Type)
cat("=> Asumsi Multikolinearitas: TERPENUHI ✓\n")
## => Asumsi Multikolinearitas: TERPENUHI ✓
cat(" Variabel Ca dan Si dihapus (VIF > 10)\n")
## Variabel Ca dan Si dihapus (VIF > 10)
cat(" Semua VIF tersisa < 5\n")
## Semua VIF tersisa < 5
cat(" Variabel final:", paste(vars_final, collapse = ", "), "\n")
## Variabel final: RI, Na, Mg, Al, K, Ba, Fe
cat(" Dataset:", nrow(df_final), "baris x", ncol(df_final), "kolom\n")
## Dataset: 150 baris x 8 kolom
head(df_final)
## RI Na Mg Al K Ba Fe Type
## 1 1.52101 13.64 4.49 1.10 0.06 0 0.00 bldg_float
## 2 1.51761 13.89 3.60 1.36 0.48 0 0.00 bldg_float
## 3 1.51618 13.53 3.55 1.54 0.39 0 0.00 bldg_float
## 4 1.51766 13.21 3.69 1.29 0.57 0 0.00 bldg_float
## 5 1.51742 13.27 3.62 1.24 0.55 0 0.00 bldg_float
## 6 1.51596 12.79 3.61 1.62 0.64 0 0.26 bldg_float
sum(is.na(df_final))
## [1] 0
#CEK ASUMSI LDA
# pakai uji normalitas univariat per variabel sebagai pendekatan
cat("Uji Normalitas per Variabel (Shapiro-Wilk):\n")
## Uji Normalitas per Variabel (Shapiro-Wilk):
sw_result <- df_final %>% dplyr::select(all_of(vars_final)) %>%
sapply(function(x) shapiro.test(x)$p.value)
sw_tbl <- data.frame(
Variabel = names(sw_result),
p_value = round(sw_result, 4),
Status = ifelse(sw_result > 0.05, "✓ Normal", "✗ Tidak Normal")
)
print(sw_tbl)
## Variabel p_value Status
## RI RI 0.0000 ✗ Tidak Normal
## Na Na 0.0251 ✗ Tidak Normal
## Mg Mg 0.0000 ✗ Tidak Normal
## Al Al 0.0080 ✗ Tidak Normal
## K K 0.0000 ✗ Tidak Normal
## Ba Ba 0.0000 ✗ Tidak Normal
## Fe Fe 0.0000 ✗ Tidak Normal
cat(sprintf("\nVariabel normal: %d dari %d\n",
sum(sw_result > 0.05), length(sw_result)))
##
## Variabel normal: 0 dari 7
cat(ifelse(all(sw_result > 0.05),
"=> Asumsi Normalitas: TERPENUHI ✓\n",
"=> Asumsi Normalitas: TIDAK TERPENUHI\n"))
## => Asumsi Normalitas: TIDAK TERPENUHI
# Asumsi 5: Kehomogenan Matriks Varians-Kovarians (Box's M Test)
library(biotools)
## Warning: package 'biotools' was built under R version 4.5.3
## ---
## biotools version 4.3
cat("Asumsi 5 Kehomogenan Varians-Kovarians (Box's M Test) \n")
## Asumsi 5 Kehomogenan Varians-Kovarians (Box's M Test)
cat("H0 : Matriks kovarians antar kelompok homogen\n")
## H0 : Matriks kovarians antar kelompok homogen
cat("H1 : Matriks kovarians antar kelompok tidak homogen\n\n")
## H1 : Matriks kovarians antar kelompok tidak homogen
boxm_result <- boxM(
data = df_final %>% dplyr::select(all_of(vars_final)),
grouping = df_final$Type
)
print(boxm_result)
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: df_final %>% dplyr::select(all_of(vars_final))
## Chi-Sq (approx.) = 208.8, df = 56, p-value < 2.2e-16
cat(sprintf("\nChi-Square = %.4f | df = %d | p-value = %.4f\n",
boxm_result$statistic, boxm_result$parameter, boxm_result$p.value))
##
## Chi-Square = 208.8028 | df = 56 | p-value = 0.0000
cat(ifelse(boxm_result$p.value > 0.05,
"=> Asumsi Homogenitas: TERPENUHI ✓ → Gunakan LDA\n\n",
"=> Asumsi Homogenitas: TIDAK TERPENUHI → Pertimbangkan QDA\n\n"))
## => Asumsi Homogenitas: TIDAK TERPENUHI → Pertimbangkan QDA
MODEL REGRESI LOGISTIK MULTINOMIAL
model_multinom <- multinom(
as.formula(paste("Type ~", paste(vars_final, collapse = " + "))),
data = df_final,
trace = FALSE
)
cat("ESTIMASI PARAMETER MODEL\n")
## ESTIMASI PARAMETER MODEL
print(summary(model_multinom))
## Call:
## multinom(formula = as.formula(paste("Type ~", paste(vars_final,
## collapse = " + "))), data = df_final, trace = FALSE)
##
## Coefficients:
## (Intercept) RI Na Mg Al K
## bldg_non_float -168.2918 97.05469 1.037690 -0.2245978 5.25030219 1.609452
## vehicle_float 625.8205 -422.13752 1.250729 -0.5492412 0.09324865 -2.336724
## Ba Fe
## bldg_non_float -1.898337 3.993628
## vehicle_float -1.643733 -2.508314
##
## Std. Errors:
## (Intercept) RI Na Mg Al K
## bldg_non_float 2.678513 4.068030 0.6266196 0.5858427 1.196128 1.803303
## vehicle_float 3.789424 5.775927 0.8832948 0.9019059 1.438085 2.141522
## Ba Fe
## bldg_non_float 3.589812 2.184576
## vehicle_float 4.283600 3.753342
##
## Residual Deviance: 231.1288
## AIC: 263.1288
# MODEL LDA
cat("MODEL LINEAR DISCRIMINANT ANALYSIS\n\n")
## MODEL LINEAR DISCRIMINANT ANALYSIS
lda_model <- lda(
Type ~ RI + Na + Mg + Al + K + Ba + Fe,
data = df_final,
prior = c(1,1,1)/3 # prior probabilitas sama
)
cat("Prior Probabilitas:\n")
## Prior Probabilitas:
print(lda_model$prior)
## bldg_float bldg_non_float vehicle_float
## 0.3333333 0.3333333 0.3333333
cat("\nMean per Kelompok:\n")
##
## Mean per Kelompok:
print(round(lda_model$means, 4))
## RI Na Mg Al K Ba Fe
## bldg_float 1.5187 13.2056 3.5456 1.1869 0.4585 0.0131 0.0587
## bldg_non_float 1.5174 13.2041 3.4092 1.4586 0.5706 0.0102 0.0761
## vehicle_float 1.5177 13.3900 3.5288 1.2194 0.4175 0.0094 0.0375
cat("\nKoefisien Fungsi Diskriminan:\n")
##
## Koefisien Fungsi Diskriminan:
print(round(lda_model$scaling, 4))
## LD1 LD2
## RI -408.7738 541.0737
## Na 0.0193 -1.2597
## Mg -0.2036 0.6497
## Al -3.8005 -1.2315
## K -2.9568 3.0662
## Ba 0.5477 2.1021
## Fe -4.2831 -0.1443
# PROPORSI VARIANSI YANG DIJELASKAN
cat("PROPORSI VARIANSI\n")
## PROPORSI VARIANSI
prop_var <- lda_model$svd^2 / sum(lda_model$svd^2)
cat(sprintf("LD1 menjelaskan %.1f%% variansi\n", prop_var[1]*100))
## LD1 menjelaskan 67.1% variansi
cat(sprintf("LD2 menjelaskan %.1f%% variansi\n", prop_var[2]*100))
## LD2 menjelaskan 32.9% variansi
UJI SERENTAK (Likelihood Ratio Test / G²)
cat("UJI SERENTAK\n")
## UJI SERENTAK
cat("H0 : β1 = β2 = ... = βk = 0 (variabel prediktor tidak signifikan terhadap model)\n")
## H0 : β1 = β2 = ... = βk = 0 (variabel prediktor tidak signifikan terhadap model)
cat("H1 : minimal ada satu βi ≠ 0 (i = 1,2,...,k)\n\n")
## H1 : minimal ada satu βi ≠ 0 (i = 1,2,...,k)
model_null <- multinom(Type ~ 1, data = df_final, trace = FALSE)
LL_null <- logLik(model_null)
LL_full <- logLik(model_multinom)
G2 <- as.numeric(-2 * (LL_null - LL_full))
df_G2 <- attr(LL_full, "df") - attr(LL_null, "df")
p_G2 <- pchisq(G2, df = df_G2, lower.tail = FALSE)
chi_tabel <- qchisq(0.95, df = df_G2)
uji_serentak <- data.frame(
Model = c("Intercept Only", "Final"),
`-2LogLikelihood` = c(round(-2 * as.numeric(LL_null), 3),
round(-2 * as.numeric(LL_full), 3)),
Chi_Square = c(NA, round(G2, 3)),
df = c(NA, df_G2),
p_value = c(NA, round(p_G2, 4))
)
print(uji_serentak)
## Model X.2LogLikelihood Chi_Square df p_value
## 1 Intercept Only 287.580 NA NA NA
## 2 Final 231.129 56.452 14 0
cat(sprintf("\nG² hitung = %.3f\n", G2))
##
## G² hitung = 56.452
cat(sprintf("χ² tabel (df=%d, α=0.05) = %.3f\n", df_G2, chi_tabel))
## χ² tabel (df=14, α=0.05) = 23.685
cat(sprintf("p-value = %.4f\n\n", p_G2))
## p-value = 0.0000
cat(ifelse(G2 > chi_tabel & p_G2 < 0.05,
"Keputusan : Tolak H0\nKesimpulan: Minimal ada satu variabel prediktor yang berpengaruh signifikan\n",
"Keputusan : Gagal Tolak H0\nKesimpulan: Tidak ada variabel prediktor yang signifikan\n"))
## Keputusan : Tolak H0
## Kesimpulan: Minimal ada satu variabel prediktor yang berpengaruh signifikan
UJI PARSIAL (Wald Test)
cat("UJI PARSIAL (Wald Test)\n")
## UJI PARSIAL (Wald Test)
cat("H0 : βj = 0\n")
## H0 : βj = 0
cat("H1 : βj ≠ 0 ; j = 1, 2, ..., p\n")
## H1 : βj ≠ 0 ; j = 1, 2, ..., p
cat("Kriteria: Tolak H0 jika |W| > 1.96 atau p-value < 0.05\n\n")
## Kriteria: Tolak H0 jika |W| > 1.96 atau p-value < 0.05
coef_mat <- summary(model_multinom)$coefficients
se_mat <- summary(model_multinom)$standard.errors
W_mat <- coef_mat / se_mat
p_mat <- 2 * (1 - pnorm(abs(W_mat)))
kategori <- rownames(coef_mat)
for (kat in kategori) {
cat(sprintf("══ Model Logit: %s vs bldg_float (referensi) ══\n", kat))
parsial_tbl <- data.frame(
Variabel = colnames(coef_mat),
B = round(coef_mat[kat, ], 4),
SE = round(se_mat[kat, ], 4),
Wald_W = round(W_mat[kat, ], 4),
df = 1,
p_value = round(p_mat[kat, ], 4),
Keputusan = ifelse(p_mat[kat, ] < 0.05, "Tolak H0", "Gagal Tolak H0"),
row.names = NULL
)
print(parsial_tbl)
cat("\n")
}
## ══ Model Logit: bldg_non_float vs bldg_float (referensi) ══
## Variabel B SE Wald_W df p_value Keputusan
## 1 (Intercept) -168.2918 2.6785 -62.8303 1 0.0000 Tolak H0
## 2 RI 97.0547 4.0680 23.8579 1 0.0000 Tolak H0
## 3 Na 1.0377 0.6266 1.6560 1 0.0977 Gagal Tolak H0
## 4 Mg -0.2246 0.5858 -0.3834 1 0.7014 Gagal Tolak H0
## 5 Al 5.2503 1.1961 4.3894 1 0.0000 Tolak H0
## 6 K 1.6095 1.8033 0.8925 1 0.3721 Gagal Tolak H0
## 7 Ba -1.8983 3.5898 -0.5288 1 0.5969 Gagal Tolak H0
## 8 Fe 3.9936 2.1846 1.8281 1 0.0675 Gagal Tolak H0
##
## ══ Model Logit: vehicle_float vs bldg_float (referensi) ══
## Variabel B SE Wald_W df p_value Keputusan
## 1 (Intercept) 625.8205 3.7894 165.1492 1 0.0000 Tolak H0
## 2 RI -422.1375 5.7759 -73.0857 1 0.0000 Tolak H0
## 3 Na 1.2507 0.8833 1.4160 1 0.1568 Gagal Tolak H0
## 4 Mg -0.5492 0.9019 -0.6090 1 0.5425 Gagal Tolak H0
## 5 Al 0.0932 1.4381 0.0648 1 0.9483 Gagal Tolak H0
## 6 K -2.3367 2.1415 -1.0912 1 0.2752 Gagal Tolak H0
## 7 Ba -1.6437 4.2836 -0.3837 1 0.7012 Gagal Tolak H0
## 8 Fe -2.5083 3.7533 -0.6683 1 0.5039 Gagal Tolak H0
ODDS RATIO
cat("─── ODDS RATIO ───\n")
## ─── ODDS RATIO ───
odds_ratio <- exp(coef(model_multinom))
for(kat in rownames(odds_ratio)) {
cat(sprintf("\n─── Odds Ratio: %s vs bldg_float ───\n", kat))
or_tbl <- data.frame(
variabel = colnames(odds_ratio),
Odds_Ratio = round(odds_ratio[kat, ], 4),
row.names = NULL
)
print(or_tbl)
}
##
## ─── Odds Ratio: bldg_non_float vs bldg_float ───
## variabel Odds_Ratio
## 1 (Intercept) 0.000000e+00
## 2 RI 1.413571e+42
## 3 Na 2.822700e+00
## 4 Mg 7.988000e-01
## 5 Al 1.906239e+02
## 6 K 5.000100e+00
## 7 Ba 1.498000e-01
## 8 Fe 5.425130e+01
##
## ─── Odds Ratio: vehicle_float vs bldg_float ───
## variabel Odds_Ratio
## 1 (Intercept) 6.171331e+271
## 2 RI 0.000000e+00
## 3 Na 3.492900e+00
## 4 Mg 5.774000e-01
## 5 Al 1.097700e+00
## 6 K 9.660000e-02
## 7 Ba 1.933000e-01
## 8 Fe 8.140000e-02
cat("\nInterpretasi:\n")
##
## Interpretasi:
cat("Odds Ratio > 1 : meningkatkan peluang kategori dibanding referensi\n")
## Odds Ratio > 1 : meningkatkan peluang kategori dibanding referensi
cat("Odds Ratio < 1 : menurunkan peluang kategori dibanding referensi\n")
## Odds Ratio < 1 : menurunkan peluang kategori dibanding referensi
cat("Odds Ratio = 1 : tidak berpengaruh\n")
## Odds Ratio = 1 : tidak berpengaruh
Prediksi Model
pred <- predict(model_multinom, df_final)
cat("─── CONFUSION MATRIX ───\n")
## ─── CONFUSION MATRIX ───
conf_matrix <- table(Prediksi = pred, Aktual = df_final$Type)
print(conf_matrix)
## Aktual
## Prediksi bldg_float bldg_non_float vehicle_float
## bldg_float 54 16 9
## bldg_non_float 14 50 5
## vehicle_float 0 0 2
Akurasi Model
accuracy <- mean(pred == df_final$Type)
cat("─── Akurasi Model ───\n")
## ─── Akurasi Model ───
cat(sprintf("Akurasi: %.4f (%.2f%%)\n", accuracy, accuracy * 100))
## Akurasi: 0.7067 (70.67%)
prob <- predict(model_multinom, df_final, type = "probs")
cat("\n─── Contoh Probabilitas (5 observasi pertama) ───\n")
##
## ─── Contoh Probabilitas (5 observasi pertama) ───
head(prob)
## bldg_float bldg_non_float vehicle_float
## 1 0.7276010 0.1393034 0.13309559
## 2 0.2998410 0.5029701 0.19718893
## 3 0.2352301 0.5321467 0.23262322
## 4 0.5365835 0.3503174 0.11309918
## 5 0.5598899 0.2875347 0.15257539
## 6 0.1309165 0.8531423 0.01594123
# PREDIKSI & CONFUSION MATRIX
cat("\nPREDIKSI & CONFUSION MATRIX\n\n")
##
## PREDIKSI & CONFUSION MATRIX
lda_pred <- predict(lda_model, df_final)
conf_mat <- table(
Aktual = df_final$Type,
Prediksi = lda_pred$class
)
cat("Confusion Matrix:\n")
## Confusion Matrix:
print(conf_mat)
## Prediksi
## Aktual bldg_float bldg_non_float vehicle_float
## bldg_float 41 12 15
## bldg_non_float 13 42 11
## vehicle_float 2 5 9
# APER (Apparent Error Rate)
n_total <- sum(conf_mat)
n_correct <- sum(diag(conf_mat))
n_wrong <- n_total - n_correct
APER <- n_wrong / n_total
akurasi <- n_correct / n_total
cat(sprintf("\nTotal observasi : %d\n", n_total))
##
## Total observasi : 150
cat(sprintf("Klasifikasi benar: %d\n", n_correct))
## Klasifikasi benar: 92
cat(sprintf("Klasifikasi salah: %d\n", n_wrong))
## Klasifikasi salah: 58
cat(sprintf("APER : %.4f (%.1f%%)\n", APER, APER*100))
## APER : 0.3867 (38.7%)
cat(sprintf("Akurasi : %.4f (%.1f%%)\n", akurasi, akurasi*100))
## Akurasi : 0.6133 (61.3%)
# VISUALISASI FUNGSI DISKRIMINAN
cat("\nVISUALISASI PLOT LDA\n")
##
## VISUALISASI PLOT LDA
# Skor diskriminan
lda_scores <- as.data.frame(lda_pred$x)
lda_scores$Type <- df_final$Type
# Plot LD1 vs LD2
ggplot(lda_scores, aes(x = LD1, y = LD2, color = Type, shape = Type)) +
geom_point(size = 2.5, alpha = 0.8) +
stat_ellipse(aes(group = Type), level = 0.95, linetype = "dashed") +
scale_color_manual(values = c("steelblue", "tomato", "seagreen")) +
labs(
title = "Linear Discriminant Analysis — Glass Classification",
subtitle = "Proyeksi pada LD1 dan LD2",
x = sprintf("LD1 (%.1f%%)", prop_var[1]*100),
y = sprintf("LD2 (%.1f%%)", prop_var[2]*100),
color = "Jenis Kaca",
shape = "Jenis Kaca"
) +
theme_minimal(base_size = 12) +
theme(legend.position = "bottom")

# PERBANDINGAN LDA vs REGRESI LOGISTIK MULTINOMIAL
cat("\n PERBANDINGAN MODEL \n")
##
## PERBANDINGAN MODEL
# Akurasi regresi logistik multinomial
pred_logit <- predict(model_multinom, df_final, type = "class")
acc_logit <- mean(pred_logit == df_final$Type) * 100
acc_lda <- akurasi * 100
perbandingan <- data.frame(
Model = c("Regresi Logistik Multinomial", "Linear Discriminant Analysis"),
Akurasi = c(round(acc_logit, 2), round(acc_lda, 2)),
APER = c(round(100 - acc_logit, 2), round(APER*100, 2))
)
cat("\n")
print(perbandingan)
## Model Akurasi APER
## 1 Regresi Logistik Multinomial 70.67 29.33
## 2 Linear Discriminant Analysis 61.33 38.67
cat(sprintf("\n=> Model terbaik: %s (akurasi %.1f%%)\n",
ifelse(acc_lda > acc_logit,
"Linear Discriminant Analysis",
"Regresi Logistik Multinomial"),
max(acc_lda, acc_logit)))
##
## => Model terbaik: Regresi Logistik Multinomial (akurasi 70.7%)