#install.packages(c("tidyverse", "ggplot2", "corrplot", "car", "nnet", "reshape2", "gridExtra"))

library(tidyverse)

## Warning: package 'readr' was built under R version 4.5.3

## Warning: package 'dplyr' was built under R version 4.5.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggplot2)
library(corrplot)

## corrplot 0.95 loaded

library(car)

## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some

library(nnet)       
library(reshape2)

## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths

library(gridExtra)

## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine

library(MASS)

## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select

OPEN FILE

df <- read.csv("glass.csv", 
               stringsAsFactors = FALSE,
               fileEncoding = "UTF-8-BOM")

head(df)

##        RI    Na   Mg   Al    Si    K   Ca Ba   Fe Type
## 1 1.52101 13.64 4.49 1.10 71.78 0.06 8.75  0 0.00    1
## 2 1.51761 13.89 3.60 1.36 72.73 0.48 7.83  0 0.00    1
## 3 1.51618 13.53 3.55 1.54 72.99 0.39 7.78  0 0.00    1
## 4 1.51766 13.21 3.69 1.29 72.61 0.57 8.22  0 0.00    1
## 5 1.51742 13.27 3.62 1.24 73.08 0.55 8.07  0 0.00    1
## 6 1.51596 12.79 3.61 1.62 72.97 0.64 8.07  0 0.26    1

PRE-PROCESSING

df <- df %>%
  filter(Type %in% c(1, 2, 3)) %>%
  mutate(
    Type = factor(Type,
                  levels = c(1, 2, 3),
                  labels = c("bldg_float",
                             "bldg_non_float",
                             "vehicle_float"))
  ) %>%
  drop_na()

df$Type <- relevel(df$Type, ref = "bldg_float")

cat("Dimensi setelah filter 3 kategori:\n")

## Dimensi setelah filter 3 kategori:

cat("  ", nrow(df), "baris x", ncol(df), "kolom\n\n")

##    163 baris x 10 kolom

warna <- c("bldg_float"     = "#2A9D8F",
           "bldg_non_float" = "#E76F51",
           "vehicle_float"  = "#457B9D")

str(df)

## 'data.frame':    163 obs. of  10 variables:
##  $ RI  : num  1.52 1.52 1.52 1.52 1.52 ...
##  $ Na  : num  13.6 13.9 13.5 13.2 13.3 ...
##  $ Mg  : num  4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
##  $ Al  : num  1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
##  $ Si  : num  71.8 72.7 73 72.6 73.1 ...
##  $ K   : num  0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
##  $ Ca  : num  8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
##  $ Ba  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Fe  : num  0 0 0 0 0 0.26 0 0 0 0.11 ...
##  $ Type: Factor w/ 3 levels "bldg_float","bldg_non_float",..: 1 1 1 1 1 1 1 1 1 1 ...

summary(df)

##        RI              Na              Mg              Al       
##  Min.   :1.512   Min.   :10.73   Min.   :0.000   Min.   :0.290  
##  1st Qu.:1.517   1st Qu.:12.87   1st Qu.:3.425   1st Qu.:1.145  
##  Median :1.518   Median :13.21   Median :3.540   Median :1.290  
##  Mean   :1.519   Mean   :13.20   Mean   :3.295   Mean   :1.282  
##  3rd Qu.:1.519   3rd Qu.:13.52   3rd Qu.:3.650   3rd Qu.:1.490  
##  Max.   :1.534   Max.   :14.86   Max.   :4.490   Max.   :2.120  
##        Si              K                Ca               Ba         
##  Min.   :69.81   Min.   :0.0000   Min.   : 7.080   Min.   :0.00000  
##  1st Qu.:72.19   1st Qu.:0.3850   1st Qu.: 8.210   1st Qu.:0.00000  
##  Median :72.74   Median :0.5700   Median : 8.550   Median :0.00000  
##  Mean   :72.59   Mean   :0.4775   Mean   : 8.925   Mean   :0.02982  
##  3rd Qu.:73.00   3rd Qu.:0.6100   3rd Qu.: 9.010   3rd Qu.:0.00000  
##  Max.   :74.45   Max.   :1.1000   Max.   :16.190   Max.   :3.15000  
##        Fe                      Type   
##  Min.   :0.00000   bldg_float    :70  
##  1st Qu.:0.00000   bldg_non_float:76  
##  Median :0.00000   vehicle_float :17  
##  Mean   :0.06761                      
##  3rd Qu.:0.13000                      
##  Max.   :0.37000

tbl <- table(df$Type)
print(data.frame(
  Jenis_Kaca = names(tbl),
  Frekuensi  = as.integer(tbl),
  Persen     = paste0(round(prop.table(tbl) * 100, 1), "%")
))

##       Jenis_Kaca Frekuensi Persen
## 1     bldg_float        70  42.9%
## 2 bldg_non_float        76  46.6%
## 3  vehicle_float        17  10.4%

make_bp <- function(var, lab) {
  ggplot(df, aes_string(x = "Type", y = var, fill = "Type")) +
    geom_boxplot(alpha = 0.8, outlier.color = "red",
                 outlier.shape = 4, outlier.size = 1.5) +
    scale_fill_manual(values = warna) +
    scale_x_discrete(labels = c("Float","Non-Float","Vehicle")) +
    labs(title = lab, x = "", y = lab) +
    theme_bw(base_size = 9) +
    theme(legend.position = "none")
}

make_bp("RI", "Refractive Index")

## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

PEMODELAN

num_vars <- c("RI","Na","Mg","Al","Si","K","Ca","Ba","Fe")
num_df   <- df %>% dplyr::select(all_of(num_vars))

df_bt <- df %>%
  mutate(
    RI_log = RI * log(RI),
    Na_log = Na * log(Na + 0.001),
    Mg_log = Mg * log(Mg + 0.001),
    Al_log = Al * log(Al + 0.001),
    Si_log = Si * log(Si + 0.001),
    K_log  = K  * log(K  + 0.001),
    Ca_log = Ca * log(Ca + 0.001),
    Ba_log = Ba * log(Ba + 0.001),
    Fe_log = Fe * log(Fe + 0.001)
  )

log_terms <- c("RI_log","Na_log","Mg_log","Al_log","Si_log",
               "K_log","Ca_log","Ba_log","Fe_log")

orig_terms <- num_vars

form_bt <- as.formula(paste(
  "y_bin ~",
  paste(c(orig_terms, log_terms), collapse = " + ")
))

df_bt12 <- df_bt %>%
  filter(Type %in% c("bldg_float","bldg_non_float")) %>%
  mutate(y_bin = as.integer(Type == "bldg_non_float"))
 
bt12 <- glm(form_bt, data = df_bt12, family = binomial)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

p_bt12 <- summary(bt12)$coefficients[log_terms, 4]

df_bt13 <- df_bt %>%
  filter(Type %in% c("bldg_float","vehicle_float")) %>%
  mutate(y_bin = as.integer(Type == "vehicle_float"))
 
bt13 <- glm(form_bt, data = df_bt13, family = binomial)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

p_bt13 <- summary(bt13)$coefficients[log_terms, 4]

UJI ASUMSI

bt_tbl <- data.frame(
  Variabel             = num_vars,
  p_T2_vs_T1           = round(p_bt12, 4),
  p_T3_vs_T1           = round(p_bt13, 4),
  Status_T2_vs_T1      = ifelse(p_bt12 > 0.05, "✓ Linear", "✗ Tidak"),
  Status_T3_vs_T1      = ifelse(p_bt13 > 0.05, "✓ Linear", "✗ Tidak"),
  row.names = NULL
)
cat("Hasil Box-Tidwell (p-value X*ln(X)):\n")

## Hasil Box-Tidwell (p-value X*ln(X)):

print(bt_tbl)

##   Variabel p_T2_vs_T1 p_T3_vs_T1 Status_T2_vs_T1 Status_T3_vs_T1
## 1       RI     0.2747     0.0155        ✓ Linear         ✗ Tidak
## 2       Na     0.1696     0.0926        ✓ Linear        ✓ Linear
## 3       Mg     0.0424     0.6920         ✗ Tidak        ✓ Linear
## 4       Al     0.2455     0.4313        ✓ Linear        ✓ Linear
## 5       Si     0.3851     0.0398        ✓ Linear         ✗ Tidak
## 6        K     0.8261     0.1037        ✓ Linear        ✓ Linear
## 7       Ca     0.0177     0.3211         ✗ Tidak        ✓ Linear
## 8       Ba     0.3265     0.8968        ✓ Linear        ✓ Linear
## 9       Fe     0.2972     0.6534        ✓ Linear        ✓ Linear

cat("─── ASUMSI 1: LINEARITAS ───\n")

## ─── ASUMSI 1: LINEARITAS ───

n_nl <- sum(p_bt12 < 0.05) + sum(p_bt13 < 0.05)
cat(sprintf("\n=> %d dari 18 uji tidak linear\n", n_nl))

## 
## => 4 dari 18 uji tidak linear

cat(ifelse(n_nl <= 4,
    "=> Mayoritas LINEAR → Asumsi TERPENUHI ✓\n\n",
    "=> Banyak tidak linear, pertimbangkan transformasi\n\n"))

## => Mayoritas LINEAR → Asumsi TERPENUHI ✓

cat("─── ASUMSI 2: INDEPENDENSI OBSERVASI ───\n")

## ─── ASUMSI 2: INDEPENDENSI OBSERVASI ───

n_dup <- sum(duplicated(df))
cat("Jumlah duplikat:", n_dup, "\n")

## Jumlah duplikat: 1

cat("Setiap baris = 1 sampel kaca yang berbeda\n")

## Setiap baris = 1 sampel kaca yang berbeda

cat("Diambil dari laboratorium forensik independen\n")

## Diambil dari laboratorium forensik independen

cat("=> Asumsi Independensi:",
    ifelse(n_dup == 0, "TERPENUHI ✓\n\n",
           paste("Ada", n_dup, "duplikat → dihapus\n\n")))

## => Asumsi Independensi: Ada 1 duplikat → dihapus

if (n_dup > 0) df <- distinct(df)

cat("─── ASUMSI 3: TIDAK ADA MULTIKOLINEARITAS ───\n")

## ─── ASUMSI 3: TIDAK ADA MULTIKOLINEARITAS ───

cat("Metode  : Korelasi Pearson + VIF\n")

## Metode  : Korelasi Pearson + VIF

cat("Kriteria: |r| < 0.8 dan VIF < 10\n\n")

## Kriteria: |r| < 0.8 dan VIF < 10

cor_mat <- cor(num_df)
cat("Matriks Korelasi:\n")

## Matriks Korelasi:

print(round(cor_mat, 3))

##        RI     Na     Mg     Al     Si      K     Ca     Ba     Fe
## RI  1.000 -0.069 -0.567 -0.462 -0.691 -0.622  0.882  0.327  0.109
## Na -0.069  1.000  0.340 -0.272 -0.359 -0.367 -0.279 -0.301 -0.194
## Mg -0.567  0.340  1.000  0.006  0.133  0.216 -0.828 -0.287 -0.139
## Al -0.462 -0.272  0.006  1.000  0.171  0.713 -0.376  0.200  0.054
## Si -0.691 -0.359  0.133  0.171  1.000  0.426 -0.450 -0.354 -0.099
## K  -0.622 -0.367  0.216  0.713  0.426  1.000 -0.534  0.012  0.075
## Ca  0.882 -0.279 -0.828 -0.376 -0.450 -0.534  1.000  0.241  0.139
## Ba  0.327 -0.301 -0.287  0.200 -0.354  0.012  0.241  1.000  0.174
## Fe  0.109 -0.194 -0.139  0.054 -0.099  0.075  0.139  0.174  1.000

# Deteksi pasangan korelasi tinggi
cat("\nPasangan variabel dengan |r| > 0.6:\n")

## 
## Pasangan variabel dengan |r| > 0.6:

cor_up <- which(abs(cor_mat) > 0.6 & upper.tri(cor_mat), arr.ind = TRUE)
if (nrow(cor_up) == 0) {
  cat("  Tidak ada\n")
} else {
  for (i in seq_len(nrow(cor_up))) {
    r1 <- rownames(cor_mat)[cor_up[i,1]]
    r2 <- colnames(cor_mat)[cor_up[i,2]]
    cat(sprintf("  %s & %s : r = %.3f\n",
                r1, r2, cor_mat[cor_up[i,1], cor_up[i,2]]))
  }
}

##   RI & Si : r = -0.691
##   RI & K : r = -0.622
##   Al & K : r = 0.713
##   RI & Ca : r = 0.882
##   Mg & Ca : r = -0.828

Cek VIF

lm_vif <- lm(as.numeric(Type) ~ RI + Na + Mg + Al + Si + K + Ca + Ba + Fe,
              data = df)
vif_v   <- vif(lm_vif)
vif_tbl <- data.frame(
  Variabel = names(vif_v),
  VIF      = round(vif_v, 3),
  Status   = ifelse(vif_v < 5,  "✓ OK (< 5)",
              ifelse(vif_v < 10, "⚠ Perhatian (5–10)",
                                 "✗ Masalah (> 10)"))
)
cat("\nVIF:\n")

## 
## VIF:

print(vif_tbl)

##    Variabel     VIF             Status
## RI       RI  13.258   ✗ Masalah (> 10)
## Na       Na  31.051   ✗ Masalah (> 10)
## Mg       Mg  75.729   ✗ Masalah (> 10)
## Al       Al  12.162   ✗ Masalah (> 10)
## Si       Si  37.072   ✗ Masalah (> 10)
## K         K   7.804 ⚠ Perhatian (5–10)
## Ca       Ca 199.254   ✗ Masalah (> 10)
## Ba       Ba   7.513 ⚠ Perhatian (5–10)
## Fe       Fe   1.148         ✓ OK (< 5)

cat("=> Asumsi Multikolinearitas:",
    ifelse(all(vif_v < 10), "TERPENUHI ✓\n\n", "PERLU DITANGANI\n\n"))

## => Asumsi Multikolinearitas: PERLU DITANGANI

Cek Outliers

num_vars <- c("RI", "Na", "Mg", "Al", "Si", "K", "Ca", "Ba", "Fe")

z_df  <- df %>% dplyr::select(all_of(num_vars)) %>%
           mutate(across(everything(), ~abs(scale(.)[,1])))
out_z <- z_df %>% summarise(across(everything(), ~sum(. > 3)))

out_tbl <- data.frame(
  Variabel       = names(out_z),
  Outlier_Zscore = as.integer(out_z),
  Status         = ifelse(as.integer(out_z) == 0, "✓ Tidak ada", "⚠ Ada")
)

cat("Outlier per variabel (|z| > 3):\n")

## Outlier per variabel (|z| > 3):

print(out_tbl)

##   Variabel Outlier_Zscore      Status
## 1       RI              3       ⚠ Ada
## 2       Na              3       ⚠ Ada
## 3       Mg              9       ⚠ Ada
## 4       Al              1       ⚠ Ada
## 5       Si              3       ⚠ Ada
## 6        K              0 ✓ Tidak ada
## 7       Ca              7       ⚠ Ada
## 8       Ba              1       ⚠ Ada
## 9       Fe              1       ⚠ Ada

df_clean <- df %>%
  filter(if_all(all_of(num_vars), ~ abs(scale(.)[,1]) <= 3))

cat(sprintf("\nBaris sebelum hapus outlier : %d\n", nrow(df)))

## 
## Baris sebelum hapus outlier : 162

cat(sprintf("Baris setelah hapus outlier : %d\n",  nrow(df_clean)))

## Baris setelah hapus outlier : 150

cat(sprintf("Outlier yang dihapus        : %d baris\n\n", nrow(df) - nrow(df_clean)))

## Outlier yang dihapus        : 12 baris

vars_final <- c("RI", "Na", "Mg", "Al", "K", "Ba", "Fe")
num_df_clean <- df_clean %>% dplyr::select(all_of(vars_final))

Cek Korelasi

cor_mat <- cor(num_df_clean)
cat("Matriks Korelasi:\n")

## Matriks Korelasi:

print(round(cor_mat, 3))

##        RI     Na     Mg     Al      K     Ba     Fe
## RI  1.000  0.410 -0.010 -0.644 -0.692  0.105 -0.024
## Na  0.410  1.000  0.053 -0.230 -0.534  0.111 -0.169
## Mg -0.010  0.053  1.000 -0.254 -0.128 -0.111 -0.098
## Al -0.644 -0.230 -0.254  1.000  0.651  0.012 -0.051
## K  -0.692 -0.534 -0.128  0.651  1.000 -0.139  0.040
## Ba  0.105  0.111 -0.111  0.012 -0.139  1.000  0.052
## Fe -0.024 -0.169 -0.098 -0.051  0.040  0.052  1.000

cat("\nPasangan variabel dengan |r| > 0.6:\n")

## 
## Pasangan variabel dengan |r| > 0.6:

cor_up <- which(abs(cor_mat) > 0.6 & upper.tri(cor_mat), arr.ind = TRUE)
if (nrow(cor_up) == 0) {
  cat("  Tidak ada\n")
} else {
  for (i in seq_len(nrow(cor_up))) {
    r1 <- rownames(cor_mat)[cor_up[i,1]]
    r2 <- colnames(cor_mat)[cor_up[i,2]]
    cat(sprintf("  %s & %s : r = %.3f\n",
                r1, r2, cor_mat[cor_up[i,1], cor_up[i,2]]))
  }
}

##   RI & Al : r = -0.644
##   RI & K : r = -0.692
##   Al & K : r = 0.651

Cek FIV ke-2

lm_vif <- lm(as.numeric(Type) ~ RI + Na + Mg + Al + Si + K + Ca + Ba + Fe,
              data = df_clean)  # <-- df_clean
vif_v   <- vif(lm_vif)
vif_tbl <- data.frame(
  Variabel = names(vif_v),
  VIF      = round(vif_v, 3),
  Status   = ifelse(vif_v < 5,  "✓ OK (< 5)",
              ifelse(vif_v < 10, "⚠ Perhatian (5–10)",
                                 "✗ Masalah (> 10)"))
)
cat("\nVIF:\n")

## 
## VIF:

print(vif_tbl)

##    Variabel    VIF             Status
## RI       RI  5.885 ⚠ Perhatian (5–10)
## Na       Na 18.137   ✗ Masalah (> 10)
## Mg       Mg 16.832   ✗ Masalah (> 10)
## Al       Al 10.642   ✗ Masalah (> 10)
## Si       Si 23.731   ✗ Masalah (> 10)
## K         K  6.393 ⚠ Perhatian (5–10)
## Ca       Ca 46.013   ✗ Masalah (> 10)
## Ba       Ba  1.368         ✓ OK (< 5)
## Fe       Fe  1.108         ✓ OK (< 5)

cat("=> Asumsi Multikolinearitas:",
    ifelse(all(vif_v < 10), "TERPENUHI ✓\n\n", "PERLU DITANGANI\n\n"))

## => Asumsi Multikolinearitas: PERLU DITANGANI

Iterasi

cat("=== Iterasi 1: Hapus Ca ===\n")

## === Iterasi 1: Hapus Ca ===

lm_vif2 <- lm(as.numeric(Type) ~ RI + Na + Mg + Al + Si + K + Ba + Fe,
               data = df_clean)   # Ca dihapus
vif_v2  <- vif(lm_vif2)

vif_tbl2 <- data.frame(
  Variabel = names(vif_v2),
  VIF      = round(vif_v2, 3),
  Status   = ifelse(vif_v2 < 5,  "✓ OK (< 5)",
              ifelse(vif_v2 < 10, "⚠ Perhatian (5–10)",
                                  "✗ Masalah (> 10)"))
)
print(vif_tbl2)

##    Variabel   VIF             Status
## RI       RI 4.815         ✓ OK (< 5)
## Na       Na 3.298         ✓ OK (< 5)
## Mg       Mg 1.323         ✓ OK (< 5)
## Al       Al 2.802         ✓ OK (< 5)
## Si       Si 5.142 ⚠ Perhatian (5–10)
## K         K 2.810         ✓ OK (< 5)
## Ba       Ba 1.060         ✓ OK (< 5)
## Fe       Fe 1.091         ✓ OK (< 5)

cat("=> Masih ada VIF > 10:", sum(vif_v2 > 10), "variabel\n\n")

## => Masih ada VIF > 10: 0 variabel

cat("=== Iterasi 2: Hapus Si ===\n")

## === Iterasi 2: Hapus Si ===

lm_vif3 <- lm(as.numeric(Type) ~ RI + Na + Mg + Al + K + Ba + Fe,
               data = df_clean)   # Ca dan Si dihapus
vif_v3  <- vif(lm_vif3)

vif_tbl3 <- data.frame(
  Variabel = names(vif_v3),
  VIF      = round(vif_v3, 3),
  Status   = ifelse(vif_v3 < 5,  "✓ OK (< 5)",
              ifelse(vif_v3 < 10, "⚠ Perhatian (5–10)",
                                  "✗ Masalah (> 10)"))
)
print(vif_tbl3)

##    Variabel   VIF     Status
## RI       RI 2.392 ✓ OK (< 5)
## Na       Na 1.518 ✓ OK (< 5)
## Mg       Mg 1.168 ✓ OK (< 5)
## Al       Al 2.355 ✓ OK (< 5)
## K         K 2.778 ✓ OK (< 5)
## Ba       Ba 1.057 ✓ OK (< 5)
## Fe       Fe 1.060 ✓ OK (< 5)

cat("=> Masih ada VIF > 10:", sum(vif_v3 > 10), "variabel\n\n")

## => Masih ada VIF > 10: 0 variabel

cat("=== PERBANDINGAN ===\n")

## === PERBANDINGAN ===

cat(sprintf("Tanpa Ca       : maks VIF = %.3f | variabel masalah = %d\n",
            max(vif_v2), sum(vif_v2 > 10)))

## Tanpa Ca       : maks VIF = 5.142 | variabel masalah = 0

cat(sprintf("Tanpa Ca & Si  : maks VIF = %.3f | variabel masalah = %d\n",
            max(vif_v3), sum(vif_v3 > 10)))

## Tanpa Ca & Si  : maks VIF = 2.778 | variabel masalah = 0

# Simpan variabel final sesuai hasil terbaik
# Jika iterasi 1 cukup (semua VIF < 10):
vars_final <- c("RI", "Na", "Mg", "Al", "Si", "K", "Ba", "Fe")   # tanpa Ca

df_final <- df_clean %>% dplyr::select(all_of(vars_final), Type)
cat("\nDataset final:", nrow(df_final), "baris x", ncol(df_final), "kolom\n")

## 
## Dataset final: 150 baris x 9 kolom

cat("Variabel prediktor:", paste(vars_final, collapse = ", "), "\n")

## Variabel prediktor: RI, Na, Mg, Al, Si, K, Ba, Fe

# Correlation matrix tanpa Ca
num_vars_noCa <- c("RI", "Na", "Mg", "Al", "K", "Ba", "Fe")

num_df_noCa <- df_clean %>% dplyr::select(all_of(num_vars_noCa))

cor_mat_noCa <- cor(num_df_noCa)

cat("Matriks Korelasi (tanpa Ca):\n")

## Matriks Korelasi (tanpa Ca):

print(round(cor_mat_noCa, 3))

##        RI     Na     Mg     Al      K     Ba     Fe
## RI  1.000  0.410 -0.010 -0.644 -0.692  0.105 -0.024
## Na  0.410  1.000  0.053 -0.230 -0.534  0.111 -0.169
## Mg -0.010  0.053  1.000 -0.254 -0.128 -0.111 -0.098
## Al -0.644 -0.230 -0.254  1.000  0.651  0.012 -0.051
## K  -0.692 -0.534 -0.128  0.651  1.000 -0.139  0.040
## Ba  0.105  0.111 -0.111  0.012 -0.139  1.000  0.052
## Fe -0.024 -0.169 -0.098 -0.051  0.040  0.052  1.000

cat("\nPasangan variabel dengan |r| > 0.6:\n")

## 
## Pasangan variabel dengan |r| > 0.6:

cor_up <- which(abs(cor_mat_noCa) > 0.6 & upper.tri(cor_mat_noCa), arr.ind = TRUE)
if (nrow(cor_up) == 0) {
  cat("  Tidak ada\n")
} else {
  for (i in seq_len(nrow(cor_up))) {
    r1 <- rownames(cor_mat_noCa)[cor_up[i, 1]]
    r2 <- colnames(cor_mat_noCa)[cor_up[i, 2]]
    cat(sprintf("  %s & %s : r = %.3f\n",
                r1, r2, cor_mat_noCa[cor_up[i,1], cor_up[i,2]]))
  }
}

##   RI & Al : r = -0.644
##   RI & K : r = -0.692
##   Al & K : r = 0.651

# Dataset final siap untuk model
vars_final <- c("RI", "Na", "Mg", "Al", "K", "Ba", "Fe")

df_final <- df_clean %>% dplyr::select(all_of(vars_final), Type)

cat("=> Asumsi Multikolinearitas: TERPENUHI ✓\n")

## => Asumsi Multikolinearitas: TERPENUHI ✓

cat("   Variabel Ca dan Si dihapus (VIF > 10)\n")

##    Variabel Ca dan Si dihapus (VIF > 10)

cat("   Semua VIF tersisa < 5\n")

##    Semua VIF tersisa < 5

cat("   Variabel final:", paste(vars_final, collapse = ", "), "\n")

##    Variabel final: RI, Na, Mg, Al, K, Ba, Fe

cat("   Dataset:", nrow(df_final), "baris x", ncol(df_final), "kolom\n")

##    Dataset: 150 baris x 8 kolom

head(df_final)

##        RI    Na   Mg   Al    K Ba   Fe       Type
## 1 1.52101 13.64 4.49 1.10 0.06  0 0.00 bldg_float
## 2 1.51761 13.89 3.60 1.36 0.48  0 0.00 bldg_float
## 3 1.51618 13.53 3.55 1.54 0.39  0 0.00 bldg_float
## 4 1.51766 13.21 3.69 1.29 0.57  0 0.00 bldg_float
## 5 1.51742 13.27 3.62 1.24 0.55  0 0.00 bldg_float
## 6 1.51596 12.79 3.61 1.62 0.64  0 0.26 bldg_float

sum(is.na(df_final))

## [1] 0

#CEK ASUMSI LDA
# pakai uji normalitas univariat per variabel sebagai pendekatan
cat("Uji Normalitas per Variabel (Shapiro-Wilk):\n")

## Uji Normalitas per Variabel (Shapiro-Wilk):

sw_result <- df_final %>% dplyr::select(all_of(vars_final)) %>%
  sapply(function(x) shapiro.test(x)$p.value)

sw_tbl <- data.frame(
  Variabel = names(sw_result),
  p_value  = round(sw_result, 4),
  Status   = ifelse(sw_result > 0.05, "✓ Normal", "✗ Tidak Normal")
)
print(sw_tbl)

##    Variabel p_value         Status
## RI       RI  0.0000 ✗ Tidak Normal
## Na       Na  0.0251 ✗ Tidak Normal
## Mg       Mg  0.0000 ✗ Tidak Normal
## Al       Al  0.0080 ✗ Tidak Normal
## K         K  0.0000 ✗ Tidak Normal
## Ba       Ba  0.0000 ✗ Tidak Normal
## Fe       Fe  0.0000 ✗ Tidak Normal

cat(sprintf("\nVariabel normal: %d dari %d\n",
            sum(sw_result > 0.05), length(sw_result)))

## 
## Variabel normal: 0 dari 7

cat(ifelse(all(sw_result > 0.05),
    "=> Asumsi Normalitas: TERPENUHI ✓\n",
    "=> Asumsi Normalitas: TIDAK TERPENUHI\n"))

## => Asumsi Normalitas: TIDAK TERPENUHI

# Asumsi 5: Kehomogenan Matriks Varians-Kovarians (Box's M Test)
library(biotools)

## Warning: package 'biotools' was built under R version 4.5.3

## ---
## biotools version 4.3

cat("Asumsi 5 Kehomogenan Varians-Kovarians (Box's M Test) \n")

## Asumsi 5 Kehomogenan Varians-Kovarians (Box's M Test)

cat("H0 : Matriks kovarians antar kelompok homogen\n")

## H0 : Matriks kovarians antar kelompok homogen

cat("H1 : Matriks kovarians antar kelompok tidak homogen\n\n")

## H1 : Matriks kovarians antar kelompok tidak homogen

boxm_result <- boxM(
  data     = df_final %>% dplyr::select(all_of(vars_final)),
  grouping = df_final$Type
)
print(boxm_result)

## 
##  Box's M-test for Homogeneity of Covariance Matrices
## 
## data:  df_final %>% dplyr::select(all_of(vars_final))
## Chi-Sq (approx.) = 208.8, df = 56, p-value < 2.2e-16

cat(sprintf("\nChi-Square = %.4f | df = %d | p-value = %.4f\n",
            boxm_result$statistic, boxm_result$parameter, boxm_result$p.value))

## 
## Chi-Square = 208.8028 | df = 56 | p-value = 0.0000

cat(ifelse(boxm_result$p.value > 0.05,
    "=> Asumsi Homogenitas: TERPENUHI ✓ → Gunakan LDA\n\n",
    "=> Asumsi Homogenitas: TIDAK TERPENUHI → Pertimbangkan QDA\n\n"))

## => Asumsi Homogenitas: TIDAK TERPENUHI → Pertimbangkan QDA

MODEL REGRESI LOGISTIK MULTINOMIAL

model_multinom <- multinom(
  as.formula(paste("Type ~", paste(vars_final, collapse = " + "))),
  data  = df_final,
  trace = FALSE
)

cat("ESTIMASI PARAMETER MODEL\n")

## ESTIMASI PARAMETER MODEL

print(summary(model_multinom))

## Call:
## multinom(formula = as.formula(paste("Type ~", paste(vars_final, 
##     collapse = " + "))), data = df_final, trace = FALSE)
## 
## Coefficients:
##                (Intercept)         RI       Na         Mg         Al         K
## bldg_non_float   -168.2918   97.05469 1.037690 -0.2245978 5.25030219  1.609452
## vehicle_float     625.8205 -422.13752 1.250729 -0.5492412 0.09324865 -2.336724
##                       Ba        Fe
## bldg_non_float -1.898337  3.993628
## vehicle_float  -1.643733 -2.508314
## 
## Std. Errors:
##                (Intercept)       RI        Na        Mg       Al        K
## bldg_non_float    2.678513 4.068030 0.6266196 0.5858427 1.196128 1.803303
## vehicle_float     3.789424 5.775927 0.8832948 0.9019059 1.438085 2.141522
##                      Ba       Fe
## bldg_non_float 3.589812 2.184576
## vehicle_float  4.283600 3.753342
## 
## Residual Deviance: 231.1288 
## AIC: 263.1288

# MODEL LDA
cat("MODEL LINEAR DISCRIMINANT ANALYSIS\n\n")

## MODEL LINEAR DISCRIMINANT ANALYSIS

lda_model <- lda(
  Type ~ RI + Na + Mg + Al + K + Ba + Fe,
  data   = df_final,
  prior  = c(1,1,1)/3   # prior probabilitas sama
)

cat("Prior Probabilitas:\n")

## Prior Probabilitas:

print(lda_model$prior)

##     bldg_float bldg_non_float  vehicle_float 
##      0.3333333      0.3333333      0.3333333

cat("\nMean per Kelompok:\n")

## 
## Mean per Kelompok:

print(round(lda_model$means, 4))

##                    RI      Na     Mg     Al      K     Ba     Fe
## bldg_float     1.5187 13.2056 3.5456 1.1869 0.4585 0.0131 0.0587
## bldg_non_float 1.5174 13.2041 3.4092 1.4586 0.5706 0.0102 0.0761
## vehicle_float  1.5177 13.3900 3.5288 1.2194 0.4175 0.0094 0.0375

cat("\nKoefisien Fungsi Diskriminan:\n")

## 
## Koefisien Fungsi Diskriminan:

print(round(lda_model$scaling, 4))

##          LD1      LD2
## RI -408.7738 541.0737
## Na    0.0193  -1.2597
## Mg   -0.2036   0.6497
## Al   -3.8005  -1.2315
## K    -2.9568   3.0662
## Ba    0.5477   2.1021
## Fe   -4.2831  -0.1443

# PROPORSI VARIANSI YANG DIJELASKAN 
cat("PROPORSI VARIANSI\n")

## PROPORSI VARIANSI

prop_var <- lda_model$svd^2 / sum(lda_model$svd^2)
cat(sprintf("LD1 menjelaskan %.1f%% variansi\n", prop_var[1]*100))

## LD1 menjelaskan 67.1% variansi

cat(sprintf("LD2 menjelaskan %.1f%% variansi\n", prop_var[2]*100))

## LD2 menjelaskan 32.9% variansi

UJI SERENTAK (Likelihood Ratio Test / G²)

cat("UJI SERENTAK\n")

## UJI SERENTAK

cat("H0 : β1 = β2 = ... = βk = 0 (variabel prediktor tidak signifikan terhadap model)\n")

## H0 : β1 = β2 = ... = βk = 0 (variabel prediktor tidak signifikan terhadap model)

cat("H1 : minimal ada satu βi ≠ 0 (i = 1,2,...,k)\n\n")

## H1 : minimal ada satu βi ≠ 0 (i = 1,2,...,k)

model_null <- multinom(Type ~ 1, data = df_final, trace = FALSE)

LL_null   <- logLik(model_null)
LL_full   <- logLik(model_multinom)
G2        <- as.numeric(-2 * (LL_null - LL_full))
df_G2     <- attr(LL_full, "df") - attr(LL_null, "df")
p_G2      <- pchisq(G2, df = df_G2, lower.tail = FALSE)
chi_tabel <- qchisq(0.95, df = df_G2)

uji_serentak <- data.frame(
  Model             = c("Intercept Only", "Final"),
  `-2LogLikelihood` = c(round(-2 * as.numeric(LL_null), 3),
                        round(-2 * as.numeric(LL_full), 3)),
  Chi_Square        = c(NA, round(G2, 3)),
  df                = c(NA, df_G2),
  p_value           = c(NA, round(p_G2, 4))
)
print(uji_serentak)

##            Model X.2LogLikelihood Chi_Square df p_value
## 1 Intercept Only          287.580         NA NA      NA
## 2          Final          231.129     56.452 14       0

cat(sprintf("\nG² hitung              = %.3f\n", G2))

## 
## G² hitung              = 56.452

cat(sprintf("χ² tabel (df=%d, α=0.05) = %.3f\n", df_G2, chi_tabel))

## χ² tabel (df=14, α=0.05) = 23.685

cat(sprintf("p-value                = %.4f\n\n", p_G2))

## p-value                = 0.0000

cat(ifelse(G2 > chi_tabel & p_G2 < 0.05,
    "Keputusan : Tolak H0\nKesimpulan: Minimal ada satu variabel prediktor yang berpengaruh signifikan\n",
    "Keputusan : Gagal Tolak H0\nKesimpulan: Tidak ada variabel prediktor yang signifikan\n"))

## Keputusan : Tolak H0
## Kesimpulan: Minimal ada satu variabel prediktor yang berpengaruh signifikan

UJI PARSIAL (Wald Test)

cat("UJI PARSIAL (Wald Test)\n")

## UJI PARSIAL (Wald Test)

cat("H0 : βj = 0\n")

## H0 : βj = 0

cat("H1 : βj ≠ 0 ; j = 1, 2, ..., p\n")

## H1 : βj ≠ 0 ; j = 1, 2, ..., p

cat("Kriteria: Tolak H0 jika |W| > 1.96 atau p-value < 0.05\n\n")

## Kriteria: Tolak H0 jika |W| > 1.96 atau p-value < 0.05

coef_mat <- summary(model_multinom)$coefficients
se_mat   <- summary(model_multinom)$standard.errors
W_mat    <- coef_mat / se_mat
p_mat    <- 2 * (1 - pnorm(abs(W_mat)))

kategori <- rownames(coef_mat)

for (kat in kategori) {
  cat(sprintf("══ Model Logit: %s vs bldg_float (referensi) ══\n", kat))
  
  parsial_tbl <- data.frame(
    Variabel  = colnames(coef_mat),
    B         = round(coef_mat[kat, ], 4),
    SE        = round(se_mat[kat, ],   4),
    Wald_W    = round(W_mat[kat, ],    4),
    df        = 1,
    p_value   = round(p_mat[kat, ],    4),
    Keputusan = ifelse(p_mat[kat, ] < 0.05, "Tolak H0", "Gagal Tolak H0"),
    row.names = NULL
  )
  print(parsial_tbl)
  cat("\n")
}

## ══ Model Logit: bldg_non_float vs bldg_float (referensi) ══
##      Variabel         B     SE   Wald_W df p_value      Keputusan
## 1 (Intercept) -168.2918 2.6785 -62.8303  1  0.0000       Tolak H0
## 2          RI   97.0547 4.0680  23.8579  1  0.0000       Tolak H0
## 3          Na    1.0377 0.6266   1.6560  1  0.0977 Gagal Tolak H0
## 4          Mg   -0.2246 0.5858  -0.3834  1  0.7014 Gagal Tolak H0
## 5          Al    5.2503 1.1961   4.3894  1  0.0000       Tolak H0
## 6           K    1.6095 1.8033   0.8925  1  0.3721 Gagal Tolak H0
## 7          Ba   -1.8983 3.5898  -0.5288  1  0.5969 Gagal Tolak H0
## 8          Fe    3.9936 2.1846   1.8281  1  0.0675 Gagal Tolak H0
## 
## ══ Model Logit: vehicle_float vs bldg_float (referensi) ══
##      Variabel         B     SE   Wald_W df p_value      Keputusan
## 1 (Intercept)  625.8205 3.7894 165.1492  1  0.0000       Tolak H0
## 2          RI -422.1375 5.7759 -73.0857  1  0.0000       Tolak H0
## 3          Na    1.2507 0.8833   1.4160  1  0.1568 Gagal Tolak H0
## 4          Mg   -0.5492 0.9019  -0.6090  1  0.5425 Gagal Tolak H0
## 5          Al    0.0932 1.4381   0.0648  1  0.9483 Gagal Tolak H0
## 6           K   -2.3367 2.1415  -1.0912  1  0.2752 Gagal Tolak H0
## 7          Ba   -1.6437 4.2836  -0.3837  1  0.7012 Gagal Tolak H0
## 8          Fe   -2.5083 3.7533  -0.6683  1  0.5039 Gagal Tolak H0

ODDS RATIO

cat("─── ODDS RATIO ───\n")

## ─── ODDS RATIO ───

odds_ratio <- exp(coef(model_multinom))
for(kat in rownames(odds_ratio)) {
  cat(sprintf("\n─── Odds Ratio: %s vs bldg_float ───\n", kat))
  
  or_tbl <- data.frame(
    variabel = colnames(odds_ratio),
    Odds_Ratio = round(odds_ratio[kat, ], 4),
    row.names = NULL
  )
  print(or_tbl)
}

## 
## ─── Odds Ratio: bldg_non_float vs bldg_float ───
##      variabel   Odds_Ratio
## 1 (Intercept) 0.000000e+00
## 2          RI 1.413571e+42
## 3          Na 2.822700e+00
## 4          Mg 7.988000e-01
## 5          Al 1.906239e+02
## 6           K 5.000100e+00
## 7          Ba 1.498000e-01
## 8          Fe 5.425130e+01
## 
## ─── Odds Ratio: vehicle_float vs bldg_float ───
##      variabel    Odds_Ratio
## 1 (Intercept) 6.171331e+271
## 2          RI  0.000000e+00
## 3          Na  3.492900e+00
## 4          Mg  5.774000e-01
## 5          Al  1.097700e+00
## 6           K  9.660000e-02
## 7          Ba  1.933000e-01
## 8          Fe  8.140000e-02

cat("\nInterpretasi:\n")

## 
## Interpretasi:

cat("Odds Ratio > 1  : meningkatkan peluang kategori dibanding referensi\n")

## Odds Ratio > 1  : meningkatkan peluang kategori dibanding referensi

cat("Odds Ratio < 1  : menurunkan peluang kategori dibanding referensi\n")

## Odds Ratio < 1  : menurunkan peluang kategori dibanding referensi

cat("Odds Ratio = 1  : tidak berpengaruh\n")

## Odds Ratio = 1  : tidak berpengaruh

Prediksi Model

pred <- predict(model_multinom, df_final)

cat("─── CONFUSION MATRIX ───\n")

## ─── CONFUSION MATRIX ───

conf_matrix <- table(Prediksi = pred, Aktual = df_final$Type)
print(conf_matrix)

##                 Aktual
## Prediksi         bldg_float bldg_non_float vehicle_float
##   bldg_float             54             16             9
##   bldg_non_float         14             50             5
##   vehicle_float           0              0             2

Akurasi Model

accuracy <- mean(pred == df_final$Type)

cat("─── Akurasi Model ───\n")

## ─── Akurasi Model ───

cat(sprintf("Akurasi: %.4f (%.2f%%)\n", accuracy, accuracy * 100))

## Akurasi: 0.7067 (70.67%)

prob <- predict(model_multinom, df_final, type = "probs")

cat("\n─── Contoh Probabilitas (5 observasi pertama) ───\n")

## 
## ─── Contoh Probabilitas (5 observasi pertama) ───

head(prob)

##   bldg_float bldg_non_float vehicle_float
## 1  0.7276010      0.1393034    0.13309559
## 2  0.2998410      0.5029701    0.19718893
## 3  0.2352301      0.5321467    0.23262322
## 4  0.5365835      0.3503174    0.11309918
## 5  0.5598899      0.2875347    0.15257539
## 6  0.1309165      0.8531423    0.01594123

# PREDIKSI & CONFUSION MATRIX 
cat("\nPREDIKSI & CONFUSION MATRIX\n\n")

## 
## PREDIKSI & CONFUSION MATRIX

lda_pred <- predict(lda_model, df_final)

conf_mat <- table(
  Aktual    = df_final$Type,
  Prediksi  = lda_pred$class
)
cat("Confusion Matrix:\n")

## Confusion Matrix:

print(conf_mat)

##                 Prediksi
## Aktual           bldg_float bldg_non_float vehicle_float
##   bldg_float             41             12            15
##   bldg_non_float         13             42            11
##   vehicle_float           2              5             9

# APER (Apparent Error Rate)
n_total    <- sum(conf_mat)
n_correct  <- sum(diag(conf_mat))
n_wrong    <- n_total - n_correct
APER       <- n_wrong / n_total
akurasi    <- n_correct / n_total

cat(sprintf("\nTotal observasi  : %d\n", n_total))

## 
## Total observasi  : 150

cat(sprintf("Klasifikasi benar: %d\n", n_correct))

## Klasifikasi benar: 92

cat(sprintf("Klasifikasi salah: %d\n", n_wrong))

## Klasifikasi salah: 58

cat(sprintf("APER             : %.4f (%.1f%%)\n", APER, APER*100))

## APER             : 0.3867 (38.7%)

cat(sprintf("Akurasi          : %.4f (%.1f%%)\n", akurasi, akurasi*100))

## Akurasi          : 0.6133 (61.3%)

# VISUALISASI FUNGSI DISKRIMINAN 
cat("\nVISUALISASI PLOT LDA\n")

## 
## VISUALISASI PLOT LDA

# Skor diskriminan
lda_scores <- as.data.frame(lda_pred$x)
lda_scores$Type <- df_final$Type

# Plot LD1 vs LD2
ggplot(lda_scores, aes(x = LD1, y = LD2, color = Type, shape = Type)) +
  geom_point(size = 2.5, alpha = 0.8) +
  stat_ellipse(aes(group = Type), level = 0.95, linetype = "dashed") +
  scale_color_manual(values = c("steelblue", "tomato", "seagreen")) +
  labs(
    title    = "Linear Discriminant Analysis — Glass Classification",
    subtitle = "Proyeksi pada LD1 dan LD2",
    x        = sprintf("LD1 (%.1f%%)", prop_var[1]*100),
    y        = sprintf("LD2 (%.1f%%)", prop_var[2]*100),
    color    = "Jenis Kaca",
    shape    = "Jenis Kaca"
  ) +
  theme_minimal(base_size = 12) +
  theme(legend.position = "bottom")

# PERBANDINGAN LDA vs REGRESI LOGISTIK MULTINOMIAL 
cat("\n PERBANDINGAN MODEL \n")

## 
##  PERBANDINGAN MODEL

# Akurasi regresi logistik multinomial
pred_logit  <- predict(model_multinom, df_final, type = "class")
acc_logit   <- mean(pred_logit == df_final$Type) * 100
acc_lda     <- akurasi * 100

perbandingan <- data.frame(
  Model    = c("Regresi Logistik Multinomial", "Linear Discriminant Analysis"),
  Akurasi  = c(round(acc_logit, 2), round(acc_lda, 2)),
  APER     = c(round(100 - acc_logit, 2), round(APER*100, 2))
)
cat("\n")

print(perbandingan)

##                          Model Akurasi  APER
## 1 Regresi Logistik Multinomial   70.67 29.33
## 2 Linear Discriminant Analysis   61.33 38.67

cat(sprintf("\n=> Model terbaik: %s (akurasi %.1f%%)\n",
            ifelse(acc_lda > acc_logit,
                   "Linear Discriminant Analysis",
                   "Regresi Logistik Multinomial"),
            max(acc_lda, acc_logit)))

## 
## => Model terbaik: Regresi Logistik Multinomial (akurasi 70.7%)

ANMUL Logistic Regression Multinom

Kekila Akmal Nifcky, Nabilah Hilmi Rosanti, Rebriane Atitha Ginting

2026-04-26

OPEN FILE

PRE-PROCESSING

PEMODELAN

UJI ASUMSI

Cek VIF

Cek Outliers

Cek Korelasi

Cek FIV ke-2

Iterasi

MODEL REGRESI LOGISTIK MULTINOMIAL

UJI SERENTAK (Likelihood Ratio Test / G²)

UJI PARSIAL (Wald Test)

ODDS RATIO

Prediksi Model

Akurasi Model