1 Load Package

packages <- c("tidyverse", "ggplot2", "corrplot", "car",
              "nortest", "moments", "gridExtra", "scales",
              "ggcorrplot", "knitr", "kableExtra", "MASS",
              "heplots", "nnet", "caret", "lmtest",
              "broom", "margins")

installed <- packages %in% rownames(installed.packages())
if (any(!installed)) install.packages(packages[!installed])

library(tidyverse)
library(ggplot2)
library(corrplot)
library(car)
library(nortest)
library(moments)
library(gridExtra)
library(scales)
library(ggcorrplot)
library(knitr)
library(kableExtra)
library(MASS)
library(heplots)
library(nnet)
library(caret)
library(lmtest)
library(broom)
library(margins)

2 Load Data

df <- read.csv("Sleep_health_and_lifestyle_dataset.csv",
               stringsAsFactors = FALSE)

cat("Dimensi data:", nrow(df), "baris x", ncol(df), "kolom\n")
## Dimensi data: 374 baris x 13 kolom
kable(head(df, 10), caption = "10 Baris Pertama Dataset") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = TRUE, font_size = 13)
10 Baris Pertama Dataset
Person.ID Gender Age Occupation Sleep.Duration Quality.of.Sleep Physical.Activity.Level Stress.Level BMI.Category Blood.Pressure Heart.Rate Daily.Steps Sleep.Disorder
1 Male 27 Software Engineer 6.1 6 42 6 Overweight 126/83 77 4200 None
2 Male 28 Doctor 6.2 6 60 8 Normal 125/80 75 10000 None
3 Male 28 Doctor 6.2 6 60 8 Normal 125/80 75 10000 None
4 Male 28 Sales Representative 5.9 4 30 8 Obese 140/90 85 3000 Sleep Apnea
5 Male 28 Sales Representative 5.9 4 30 8 Obese 140/90 85 3000 Sleep Apnea
6 Male 28 Software Engineer 5.9 4 30 8 Obese 140/90 85 3000 Insomnia
7 Male 29 Teacher 6.3 6 40 7 Obese 140/90 82 3500 Insomnia
8 Male 29 Doctor 7.8 7 75 6 Normal 120/80 70 8000 None
9 Male 29 Doctor 7.8 7 75 6 Normal 120/80 70 8000 None
10 Male 29 Doctor 7.8 7 75 6 Normal 120/80 70 8000 None
str(df)
## 'data.frame':    374 obs. of  13 variables:
##  $ Person.ID              : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Gender                 : chr  "Male" "Male" "Male" "Male" ...
##  $ Age                    : int  27 28 28 28 28 28 29 29 29 29 ...
##  $ Occupation             : chr  "Software Engineer" "Doctor" "Doctor" "Sales Representative" ...
##  $ Sleep.Duration         : num  6.1 6.2 6.2 5.9 5.9 5.9 6.3 7.8 7.8 7.8 ...
##  $ Quality.of.Sleep       : int  6 6 6 4 4 4 6 7 7 7 ...
##  $ Physical.Activity.Level: int  42 60 60 30 30 30 40 75 75 75 ...
##  $ Stress.Level           : int  6 8 8 8 8 8 7 6 6 6 ...
##  $ BMI.Category           : chr  "Overweight" "Normal" "Normal" "Obese" ...
##  $ Blood.Pressure         : chr  "126/83" "125/80" "125/80" "140/90" ...
##  $ Heart.Rate             : int  77 75 75 85 85 85 82 70 70 70 ...
##  $ Daily.Steps            : int  4200 10000 10000 3000 3000 3000 3500 8000 8000 8000 ...
##  $ Sleep.Disorder         : chr  "None" "None" "None" "Sleep Apnea" ...

3 Data Cleaning

3.1 Cek Missing Values

missing_df <- data.frame(
  Kolom   = names(df),
  Missing = colSums(is.na(df)),
  Persen  = paste0(round(colSums(is.na(df)) / nrow(df) * 100, 2), "%")
) %>% filter(Missing > 0)

kable(missing_df, caption = "Kolom dengan Missing Values", row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) %>%
  row_spec(which(missing_df$Missing > 0), background = "#fff3cd")
Kolom dengan Missing Values
Kolom Missing Persen
NA NA NA
:—– ——-: :——

Catatan: Missing values pada Sleep.Disorder bukan data rusak — responden yang tidak memiliki gangguan tidur tidak diisi nilainya, sehingga diimputasi dengan "None".

3.2 Imputasi & Standardisasi

# Imputasi Sleep.Disorder: NA -> "None"
df$Sleep.Disorder[is.na(df$Sleep.Disorder)] <- "None"

# Standardisasi BMI: "Normal" -> "Normal Weight"
df$BMI.Category[df$BMI.Category == "Normal"] <- "Normal Weight"

# Pecah Blood.Pressure menjadi Systolic & Diastolic
df <- df %>%
  separate(Blood.Pressure, into = c("Systolic", "Diastolic"),
           sep = "/", convert = TRUE)

# Hapus Person.ID (tidak relevan untuk analisis)
df <- df %>% dplyr::select(-Person.ID)

# Encoding ke faktor
df$Gender         <- as.factor(df$Gender)
df$Occupation     <- as.factor(df$Occupation)
df$BMI.Category   <- as.factor(df$BMI.Category)
df$Sleep.Disorder <- as.factor(df$Sleep.Disorder)

cat("Dimensi setelah cleaning:", nrow(df), "baris x", ncol(df), "kolom\n")
## Dimensi setelah cleaning: 374 baris x 13 kolom

3.3 Data Setelah Cleaning

kable(summary(df), caption = "Ringkasan Statistik Setelah Cleaning") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"),
                full_width = TRUE, font_size = 12) %>%
  scroll_box(width = "100%", height = "350px")
Ringkasan Statistik Setelah Cleaning
Gender Age Occupation Sleep.Duration Quality.of.Sleep Physical.Activity.Level Stress.Level BMI.Category Systolic Diastolic Heart.Rate Daily.Steps Sleep.Disorder
Female:185 Min. :27.00 Nurse :73 Min. :5.800 Min. :4.000 Min. :30.00 Min. :3.000 Normal Weight:216 Min. :115.0 Min. :75.00 Min. :65.00 Min. : 3000 Insomnia : 77
Male :189 1st Qu.:35.25 Doctor :71 1st Qu.:6.400 1st Qu.:6.000 1st Qu.:45.00 1st Qu.:4.000 Obese : 10 1st Qu.:125.0 1st Qu.:80.00 1st Qu.:68.00 1st Qu.: 5600 None :219
NA Median :43.00 Engineer :63 Median :7.200 Median :7.000 Median :60.00 Median :5.000 Overweight :148 Median :130.0 Median :85.00 Median :70.00 Median : 7000 Sleep Apnea: 78
NA Mean :42.18 Lawyer :47 Mean :7.132 Mean :7.313 Mean :59.17 Mean :5.385 NA Mean :128.6 Mean :84.65 Mean :70.17 Mean : 6817 NA
NA 3rd Qu.:50.00 Teacher :40 3rd Qu.:7.800 3rd Qu.:8.000 3rd Qu.:75.00 3rd Qu.:7.000 NA 3rd Qu.:135.0 3rd Qu.:90.00 3rd Qu.:72.00 3rd Qu.: 8000 NA
NA Max. :59.00 Accountant:37 Max. :8.500 Max. :9.000 Max. :90.00 Max. :8.000 NA Max. :142.0 Max. :95.00 Max. :86.00 Max. :10000 NA
NA NA (Other) :43 NA NA NA NA NA NA NA NA NA NA

4 EDA - Variabel Target (Sleep Disorder)

Variabel target untuk analisis LDA Multinomial adalah Sleep.Disorder dengan 3 kelas: None, Insomnia, dan Sleep Apnea.

4.1 Distribusi Kelas Target

target_df <- as.data.frame(table(df$Sleep.Disorder)) %>%
  rename(Kelas = Var1, Frekuensi = Freq) %>%
  mutate(Persen = paste0(round(Frekuensi / nrow(df) * 100, 1), "%"))

kable(target_df, caption = "Distribusi Kelas Sleep Disorder", row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) %>%
  row_spec(which(target_df$Kelas == "None"),        background = "#d4edda") %>%
  row_spec(which(target_df$Kelas == "Insomnia"),    background = "#fff3cd") %>%
  row_spec(which(target_df$Kelas == "Sleep Apnea"), background = "#f8d7da")
Distribusi Kelas Sleep Disorder
Kelas Frekuensi Persen
Insomnia 77 20.6%
None 219 58.6%
Sleep Apnea 78 20.9%
ggplot(df, aes(x = Sleep.Disorder, fill = Sleep.Disorder)) +
  geom_bar(alpha = 0.85, width = 0.6) +
  geom_text(stat = "count", aes(label = after_stat(count)),
            vjust = -0.5, size = 4.5, fontface = "bold") +
  scale_fill_manual(values = c("None"        = "#55A868",
                               "Insomnia"    = "#DD8452",
                               "Sleep Apnea" = "#C44E52")) +
  labs(title = "Distribusi Variabel Target: Sleep Disorder",
       x = "Kelas", y = "Jumlah Observasi") +
  theme_bw(base_size = 12) +
  theme(legend.position = "none")

Catatan class imbalance: Kelas None (219 obs) mendominasi dibanding Insomnia (77) dan Sleep Apnea (78). Pada tahap LDA, prior probability akan disesuaikan untuk mengatasi ketidakseimbangan ini.


5 EDA - Variabel Numerik (Prediktor LDA)

Prediktor numerik yang akan digunakan dalam LDA:

num_vars <- df %>%
  dplyr::select(Age, Sleep.Duration, Physical.Activity.Level,
         Stress.Level, Heart.Rate, Daily.Steps, Systolic, Diastolic)

5.1 Statistik Deskriptif

desc_tbl <- round(as.data.frame(do.call(cbind, lapply(num_vars, function(x) {
  c(Mean     = mean(x),
    Median   = median(x),
    SD       = sd(x),
    Min      = min(x),
    Max      = max(x),
    Skewness = skewness(x))
}))), 3)

kable(desc_tbl, caption = "Statistik Deskriptif Variabel Numerik (Prediktor LDA)") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"),
                full_width = TRUE) %>%
  scroll_box(width = "100%")
Statistik Deskriptif Variabel Numerik (Prediktor LDA)
Age Sleep.Duration Physical.Activity.Level Stress.Level Heart.Rate Daily.Steps Systolic Diastolic
Mean 42.184 7.132 59.171 5.385 70.166 6816.845 128.553 84.650
Median 43.000 7.200 60.000 5.000 70.000 7000.000 130.000 85.000
SD 8.673 0.796 20.831 1.775 4.136 1617.916 7.748 6.162
Min 27.000 5.800 30.000 3.000 65.000 3000.000 115.000 75.000
Max 59.000 8.500 90.000 8.000 86.000 10000.000 142.000 95.000
Skewness 0.256 0.037 0.074 0.154 1.220 0.178 -0.036 0.377

5.2 Distribusi Variabel Numerik

num_long <- num_vars %>%
  pivot_longer(everything(), names_to = "Variabel", values_to = "Nilai")

ggplot(num_long, aes(x = Nilai)) +
  geom_histogram(fill = "#4C72B0", color = "white", bins = 20, alpha = 0.85) +
  facet_wrap(~Variabel, scales = "free") +
  labs(title = "Distribusi Variabel Numerik (Prediktor LDA)",
       x = NULL, y = "Frekuensi") +
  theme_bw()

5.3 Distribusi Prediktor per Kelas Target

num_class <- df %>%
  dplyr::select(Sleep.Disorder, Age, Sleep.Duration, Physical.Activity.Level,
         Stress.Level, Heart.Rate, Daily.Steps, Systolic, Diastolic) %>%
  pivot_longer(-Sleep.Disorder, names_to = "Variabel", values_to = "Nilai")

ggplot(num_class, aes(x = Sleep.Disorder, y = Nilai, fill = Sleep.Disorder)) +
  geom_boxplot(alpha = 0.8, outlier.color = "red", outlier.size = 1.5) +
  scale_fill_manual(values = c("None"        = "#55A868",
                               "Insomnia"    = "#DD8452",
                               "Sleep Apnea" = "#C44E52")) +
  facet_wrap(~Variabel, scales = "free_y") +
  labs(title = "Distribusi Prediktor per Kelas Sleep Disorder",
       x = NULL, y = NULL) +
  theme_bw(base_size = 11) +
  theme(legend.position = "bottom",
        axis.text.x = element_text(angle = 20, hjust = 1))


6 EDA - Variabel Kategorik

6.1 Distribusi Frekuensi

cat_list <- list(
  Gender       = as.data.frame(table(df$Gender)),
  BMI.Category = as.data.frame(table(df$BMI.Category))
)

for (nm in names(cat_list)) {
  names(cat_list[[nm]]) <- c("Kategori","Frekuensi")
  cat_list[[nm]]$Persen <- paste0(
    round(cat_list[[nm]]$Frekuensi / nrow(df) * 100, 1), "%")
}

kable(cat_list$Gender, caption = "Distribusi Gender") %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("striped","hover"))
Distribusi Gender
Kategori Frekuensi Persen
Female 185 49.5%
Male 189 50.5%
kable(cat_list$BMI.Category, caption = "Distribusi BMI Category") %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("striped","hover"))
Distribusi BMI Category
Kategori Frekuensi Persen
Normal Weight 216 57.8%
Obese 10 2.7%
Overweight 148 39.6%

6.2 Visualisasi Kategorik

p_gen <- ggplot(df, aes(x = Gender, fill = Gender)) +
  geom_bar(show.legend = FALSE, alpha = 0.85) +
  scale_fill_manual(values = c("#4C72B0","#DD8452")) +
  labs(title = "Distribusi Gender", x = NULL, y = "Jumlah") +
  theme_bw()

p_bmi <- ggplot(df, aes(x = BMI.Category, fill = BMI.Category)) +
  geom_bar(show.legend = FALSE, alpha = 0.85) +
  scale_fill_brewer(palette = "Set2") +
  labs(title = "Distribusi BMI Category", x = NULL, y = "Jumlah") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 15, hjust = 1))

p_occ <- ggplot(df, aes(y = fct_infreq(Occupation), fill = Occupation)) +
  geom_bar(show.legend = FALSE, alpha = 0.85) +
  scale_fill_brewer(palette = "Paired") +
  labs(title = "Distribusi Occupation", x = "Jumlah", y = NULL) +
  theme_bw()

grid.arrange(p_gen, p_bmi, p_occ, ncol = 3)


7 Matriks Korelasi

cor_matrix <- cor(num_vars, use = "complete.obs")

ggcorrplot(cor_matrix,
           method   = "circle",
           type     = "lower",
           lab      = TRUE,
           lab_size = 3,
           colors   = c("#D73027","white","#4575B4"),
           title    = "Heatmap Korelasi Variabel Numerik",
           ggtheme  = theme_bw())

cor_df <- as.data.frame(round(cor_matrix, 3))
kable(cor_df, caption = "Matriks Korelasi Lengkap") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"),
                full_width = TRUE) %>%
  scroll_box(width = "100%")
Matriks Korelasi Lengkap
Age Sleep.Duration Physical.Activity.Level Stress.Level Heart.Rate Daily.Steps Systolic Diastolic
Age 1.000 0.345 0.179 -0.422 -0.226 0.058 0.606 0.594
Sleep.Duration 0.345 1.000 0.212 -0.811 -0.516 -0.040 -0.180 -0.167
Physical.Activity.Level 0.179 0.212 1.000 -0.034 0.137 0.773 0.265 0.383
Stress.Level -0.422 -0.811 -0.034 1.000 0.670 0.187 0.103 0.092
Heart.Rate -0.226 -0.516 0.137 0.670 1.000 -0.030 0.294 0.271
Daily.Steps 0.058 -0.040 0.773 0.187 -0.030 1.000 0.103 0.242
Systolic 0.606 -0.180 0.265 0.103 0.294 0.103 1.000 0.973
Diastolic 0.594 -0.167 0.383 0.092 0.271 0.242 0.973 1.000

8 Deteksi Outlier

detect_outliers <- function(x, varname) {
  Q1      <- quantile(x, 0.25, na.rm = TRUE)
  Q3      <- quantile(x, 0.75, na.rm = TRUE)
  IQR_val <- Q3 - Q1
  lower   <- Q1 - 1.5 * IQR_val
  upper   <- Q3 + 1.5 * IQR_val
  n_out   <- sum(x < lower | x > upper, na.rm = TRUE)
  data.frame(
    Variabel    = varname,
    Q1          = round(Q1, 2),
    Q3          = round(Q3, 2),
    IQR         = round(IQR_val, 2),
    Lower_Fence = round(lower, 2),
    Upper_Fence = round(upper, 2),
    N_Outlier   = n_out,
    Persen      = paste0(round(n_out / length(x) * 100, 2), "%")
  )
}

outlier_df <- bind_rows(
  lapply(seq_along(num_vars),
         function(i) detect_outliers(num_vars[[i]], names(num_vars)[i]))
)

kable(outlier_df, caption = "Ringkasan Outlier per Variabel (Metode IQR)",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"),
                full_width = TRUE) %>%
  row_spec(which(outlier_df$N_Outlier > 0), background = "#fff3cd")
Ringkasan Outlier per Variabel (Metode IQR)
Variabel Q1 Q3 IQR Lower_Fence Upper_Fence N_Outlier Persen
Age 35.25 50.0 14.75 13.12 72.12 0 0%
Sleep.Duration 6.40 7.8 1.40 4.30 9.90 0 0%
Physical.Activity.Level 45.00 75.0 30.00 0.00 120.00 0 0%
Stress.Level 4.00 7.0 3.00 -0.50 11.50 0 0%
Heart.Rate 68.00 72.0 4.00 62.00 78.00 15 4.01%
Daily.Steps 5600.00 8000.0 2400.00 2000.00 11600.00 0 0%
Systolic 125.00 135.0 10.00 110.00 150.00 0 0%
Diastolic 80.00 90.0 10.00 65.00 105.00 0 0%
num_long2 <- num_vars %>%
  pivot_longer(everything(), names_to = "Variabel", values_to = "Nilai")

ggplot(num_long2, aes(x = Variabel, y = Nilai)) +
  geom_boxplot(fill = "#4C72B0", alpha = 0.7,
               outlier.color = "red", outlier.shape = 19) +
  facet_wrap(~Variabel, scales = "free") +
  labs(title = "Deteksi Outlier - Boxplot Semua Variabel Numerik",
       x = NULL, y = NULL) +
  theme_bw() +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())


9 Cek Asumsi LDA

9.1 Asumsi 1 – Normalitas Multivariat per Kelas (Shapiro-Wilk)

LDA mengasumsikan setiap prediktor berdistribusi normal di dalam masing-masing kelas target.

pred_vars <- c("Age", "Sleep.Duration", "Physical.Activity.Level",
               "Stress.Level", "Heart.Rate", "Daily.Steps",
               "Systolic", "Diastolic")

kelas <- levels(df$Sleep.Disorder)

norm_results <- lapply(kelas, function(k) {
  sub <- df %>% filter(Sleep.Disorder == k) %>% dplyr::select(all_of(pred_vars))
  lapply(pred_vars, function(v) {
    sw <- shapiro.test(sub[[v]])
    data.frame(
      Kelas    = k,
      Variabel = v,
      W        = round(sw$statistic, 4),
      p.value  = round(sw$p.value, 4),
      Normal   = ifelse(sw$p.value > 0.05, "Ya", "Tidak")
    )
  })
})

norm_df <- bind_rows(unlist(norm_results, recursive = FALSE))

kable(norm_df,
      caption  = "Uji Normalitas Shapiro-Wilk per Kelas & Variabel (Asumsi LDA)",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = TRUE) %>%
  row_spec(which(norm_df$Normal == "Tidak"), background = "#f8d7da") %>%
  row_spec(which(norm_df$Normal == "Ya"),    background = "#d4edda") %>%
  scroll_box(width = "100%", height = "400px")
Uji Normalitas Shapiro-Wilk per Kelas & Variabel (Asumsi LDA)
Kelas Variabel W p.value Normal
Insomnia Age 0.7754 0 Tidak
Insomnia Sleep.Duration 0.7381 0 Tidak
Insomnia Physical.Activity.Level 0.5448 0 Tidak
Insomnia Stress.Level 0.7493 0 Tidak
Insomnia Heart.Rate 0.8158 0 Tidak
Insomnia Daily.Steps 0.6528 0 Tidak
Insomnia Systolic 0.7907 0 Tidak
Insomnia Diastolic 0.7996 0 Tidak
None Age 0.9108 0 Tidak
None Sleep.Duration 0.9231 0 Tidak
None Physical.Activity.Level 0.8774 0 Tidak
None Stress.Level 0.8934 0 Tidak
None Heart.Rate 0.9103 0 Tidak
None Daily.Steps 0.8244 0 Tidak
None Systolic 0.9124 0 Tidak
None Diastolic 0.8393 0 Tidak
Sleep Apnea Age 0.8469 0 Tidak
Sleep Apnea Sleep.Duration 0.7748 0 Tidak
Sleep Apnea Physical.Activity.Level 0.7578 0 Tidak
Sleep Apnea Stress.Level 0.7175 0 Tidak
Sleep Apnea Heart.Rate 0.8251 0 Tidak
Sleep Apnea Daily.Steps 0.8488 0 Tidak
Sleep Apnea Systolic 0.5007 0 Tidak
Sleep Apnea Diastolic 0.5767 0 Tidak

9.2 Asumsi 2 – Homogenitas Matriks Kovarians (Box’s M Test)

Asumsi utama LDA: matriks kovarians antar kelas harus homogen.

boxm_result <- boxM(
  df %>% dplyr::select(all_of(pred_vars)),
  df$Sleep.Disorder
)

boxm_df <- data.frame(
  Uji        = "Box's M Test",
  Chi_Square = round(boxm_result$statistic, 4),
  df         = boxm_result$parameter["df"],
  p.value    = round(boxm_result$p.value, 4),
  Kesimpulan = ifelse(boxm_result$p.value > 0.05,
                      "Kovarians Homogen → Asumsi LDA Terpenuhi ✓",
                      "Kovarians Tidak Homogen → Pertimbangkan QDA ⚠")
)

kable(boxm_df,
      caption   = "Box's M Test – Homogenitas Matriks Kovarians Antar Kelas",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(1,
           color      = "white",
           background = ifelse(boxm_result$p.value > 0.05, "#5cb85c", "#f0ad4e"))
Box’s M Test – Homogenitas Matriks Kovarians Antar Kelas
Uji Chi_Square df p.value Kesimpulan
Box’s M Test 1122.323 72 0 Kovarians Tidak Homogen → Pertimbangkan QDA ⚠

Interpretasi: Jika p-value > 0.05, matriks kovarians antar kelas homogen dan asumsi LDA terpenuhi. Jika p-value ≤ 0.05, pertimbangkan menggunakan QDA (Quadratic Discriminant Analysis) sebagai alternatif.

9.3 Asumsi 3 – Tidak Ada Multikolinearitas Tinggi (VIF)

df_vif <- df %>%
  mutate(
    Gender_bin     = as.integer(Gender == "Male"),
    BMI_Overweight = as.integer(BMI.Category == "Overweight"),
    BMI_Obese      = as.integer(BMI.Category == "Obese")
  )

model_vif <- lm(as.numeric(Sleep.Disorder) ~
                  Age + Sleep.Duration + Physical.Activity.Level +
                  Stress.Level + Heart.Rate + Daily.Steps +
                  Systolic + Diastolic + Gender_bin +
                  BMI_Overweight + BMI_Obese,
                data = df_vif)

vif_vals <- vif(model_vif)

vif_df <- data.frame(
  Variabel = names(vif_vals),
  VIF      = round(vif_vals, 3),
  Status   = ifelse(vif_vals > 10, "Multikolinear Tinggi",
             ifelse(vif_vals > 5,  "Perlu Perhatian", "Aman"))
)

kable(vif_df, caption = "Variance Inflation Factor (VIF) – Cek Asumsi LDA & MLR",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(which(vif_df$Status == "Multikolinear Tinggi"), color="white", background="#d9534f") %>%
  row_spec(which(vif_df$Status == "Perlu Perhatian"),      color="white", background="#f0ad4e") %>%
  row_spec(which(vif_df$Status == "Aman"),                 color="white", background="#5cb85c")
Variance Inflation Factor (VIF) – Cek Asumsi LDA & MLR
Variabel VIF Status
Age 8.031 Perlu Perhatian
Sleep.Duration 8.263 Perlu Perhatian
Physical.Activity.Level 5.926 Perlu Perhatian
Stress.Level 9.968 Perlu Perhatian
Heart.Rate 6.554 Perlu Perhatian
Daily.Steps 6.871 Perlu Perhatian
Systolic 70.455 Multikolinear Tinggi
Diastolic 76.025 Multikolinear Tinggi
Gender_bin 3.368 Aman
BMI_Overweight 6.580 Perlu Perhatian
BMI_Obese 3.344 Aman

Interpretasi: VIF < 5 = tidak ada multikolinearitas (aman); VIF 5-10 = sedang; VIF > 10 = tinggi, perlu penanganan sebelum masuk ke LDA.

vif_df$Variabel <- factor(vif_df$Variabel,
                           levels = vif_df$Variabel[order(vif_df$VIF)])

ggplot(vif_df, aes(x = Variabel, y = VIF, fill = Status)) +
  geom_col(alpha = 0.85) +
  geom_hline(yintercept = 5,  linetype = "dashed", color = "orange", linewidth = 0.9) +
  geom_hline(yintercept = 10, linetype = "dashed", color = "red",    linewidth = 0.9) +
  annotate("text", x = 1.5, y = 5.5,  label = "VIF = 5",  color = "orange", size = 3.5) +
  annotate("text", x = 1.5, y = 10.5, label = "VIF = 10", color = "red",    size = 3.5) +
  scale_fill_manual(
    values = c("Aman"                 = "#5cb85c",
               "Perlu Perhatian"      = "#f0ad4e",
               "Multikolinear Tinggi" = "#d9534f")) +
  coord_flip() +
  labs(title    = "Variance Inflation Factor (VIF)",
       subtitle = "Deteksi Multikolinearitas antar Prediktor",
       x = NULL, y = "VIF", fill = "Status") +
  theme_bw(base_size = 12)

9.4 Ringkasan Asumsi LDA

lda_asumsi <- data.frame(
  No     = 1:3,
  Asumsi = c("Normalitas per kelas",
             "Homogenitas kovarians (Box's M)",
             "Tidak ada multikolinearitas (VIF)"),
  Hasil  = c(
    paste0(sum(norm_df$Normal == "Tidak"), " dari ", nrow(norm_df),
           " kombinasi kelas-variabel tidak normal"),
    paste0("Chi² = ", round(boxm_result$statistic, 2),
           ", p = ", round(boxm_result$p.value, 4)),
    paste0(sum(vif_vals > 5), " variabel VIF > 5; maks VIF = ",
           round(max(vif_vals), 2))
  ),
  Status = c(
    ifelse(sum(norm_df$Normal == "Tidak") / nrow(norm_df) < 0.5,
           "⚠ Sebagian – LDA masih robust", "✗ Banyak pelanggaran"),
    ifelse(boxm_result$p.value > 0.05,
           "Homogen", "⚠ Tidak homogen – LDA masih digunakan dengan catatan"),
    ifelse(all(vif_vals <= 10), "Tidak ada VIF > 10", "✗ Ada multikolinearitas tinggi")
  )
)

kable(lda_asumsi, caption = "Ringkasan Pemeriksaan Asumsi LDA",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = TRUE) %>%
  column_spec(4, bold = TRUE,
              color = ifelse(grepl("✓", lda_asumsi$Status), "#155724",
                      ifelse(grepl("⚠", lda_asumsi$Status), "#856404", "#721c24")))
Ringkasan Pemeriksaan Asumsi LDA
No Asumsi Hasil Status
1 Normalitas per kelas 24 dari 24 kombinasi kelas-variabel tidak normal ✗ Banyak pelanggaran
2 Homogenitas kovarians (Box’s M) Chi² = 1122.32, p = 0 ⚠ Tidak homogen – LDA masih digunakan dengan catatan
3 Tidak ada multikolinearitas (VIF) 9 variabel VIF > 5; maks VIF = 76.03 ✗ Ada multikolinearitas tinggi

10 Penanganan Asumsi LDA (Data Preparation)

Berdasarkan hasil cek asumsi, ditemukan beberapa pelanggaran. Tabel berikut merangkum masalah, dampak, dan strategi penanganannya:

Masalah Temuan Strategi
Outlier Terdeteksi outlier di beberapa variabel Winsorizing (IQR capping)
Normalitas Banyak variabel tidak normal per kelas Transformasi Box-Cox
Multikolinearitas Ada VIF > 5 / > 10 PCA (komponen orthogonal, VIF = 1)
Class Imbalance None dominan vs Insomnia/Sleep Apnea SMOTE manual
Box’s M p ≈ 0 Diterima – sensitif terhadap n besar; LDA tetap digunakan dengan catatan

10.1 Penanganan Outlier – Winsorizing (IQR Capping)

Winsorizing dilakukan pada fitur numerik asli sebelum PCA agar PC scores tidak terdistorsi oleh nilai ekstrem.

winsorize <- function(x) {
  Q1 <- quantile(x, 0.25, na.rm = TRUE)
  Q3 <- quantile(x, 0.75, na.rm = TRUE)
  pmin(pmax(x, Q1 - 1.5 * (Q3 - Q1)), Q3 + 1.5 * (Q3 - Q1))
}

num_cols_orig     <- df %>% dplyr::select(where(is.numeric)) %>% names()
df[num_cols_orig] <- lapply(df[num_cols_orig], winsorize)

outlier_after <- bind_rows(lapply(num_cols_orig, function(v) {
  x <- df[[v]]
  Q1 <- quantile(x, 0.25); Q3 <- quantile(x, 0.75)
  data.frame(Variabel = v,
             N_Outlier_Setelah = sum(x < Q1 - 1.5*(Q3-Q1) | x > Q3 + 1.5*(Q3-Q1)))
}))

kable(outlier_after, caption = "Jumlah Outlier Setelah Winsorizing", row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) %>%
  row_spec(which(outlier_after$N_Outlier_Setelah == 0), background = "#d4edda")
Jumlah Outlier Setelah Winsorizing
Variabel N_Outlier_Setelah
Age 0
Sleep.Duration 0
Quality.of.Sleep 0
Physical.Activity.Level 0
Stress.Level 0
Systolic 0
Diastolic 0
Heart.Rate 0
Daily.Steps 0
cat("Total outlier tersisa:", sum(outlier_after$N_Outlier_Setelah), "\n")
## Total outlier tersisa: 0

10.2 Penanganan Normalitas – Transformasi Box-Cox

Box-Cox diterapkan pada fitur numerik asli sebelum PCA agar distribusi tiap fitur lebih mendekati normal sebelum direduksi dimensinya.

boxcox_transform <- function(x) {
  x_shift <- x - min(x) + 1
  bc      <- boxcox(x_shift ~ 1, plotit = FALSE)
  lambda  <- bc$x[which.max(bc$y)]
  if (abs(lambda) < 0.01) log(x_shift) else (x_shift^lambda - 1) / lambda
}

# Simpan salinan df SEBELUM Box-Cox untuk dipakai di cek asumsi MLR (Box-Tidwell & Mahalanobis)
df_raw <- df

lambda_log <- data.frame(Variabel = character(), Lambda = numeric(),
                         stringsAsFactors = FALSE)
for (v in num_cols_orig) {
  x_shift    <- df[[v]] - min(df[[v]]) + 1
  bc         <- boxcox(x_shift ~ 1, plotit = FALSE)
  lam        <- bc$x[which.max(bc$y)]
  lambda_log <- rbind(lambda_log, data.frame(Variabel = v, Lambda = round(lam, 3)))
  df[[v]]    <- boxcox_transform(df[[v]])
}

kable(lambda_log, caption = "Parameter λ Box-Cox per Variabel", row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)
Parameter λ Box-Cox per Variabel
Variabel Lambda
Age 0.6
Sleep.Duration 0.6
Quality.of.Sleep 1.2
Physical.Activity.Level 0.6
Stress.Level 0.5
Systolic 0.8
Diastolic 0.6
Heart.Rate 0.6
Daily.Steps 0.8
cat("Keterangan: lambda kecil ~ 0 = log transform; lambda = 1 = tidak perlu transformasi\n")
## Keterangan: lambda kecil ~ 0 = log transform; lambda = 1 = tidak perlu transformasi
df %>% dplyr::select(all_of(num_cols_orig)) %>%
  pivot_longer(everything(), names_to = "Variabel", values_to = "Nilai") %>%
  ggplot(aes(x = Nilai)) +
  geom_histogram(fill = "#55A868", color = "white", bins = 20, alpha = 0.85) +
  geom_density(aes(y = after_stat(count)), color = "#C44E52", linewidth = 0.8) +
  facet_wrap(~Variabel, scales = "free") +
  labs(title = "Distribusi Fitur Asli Setelah Box-Cox + Winsorizing",
       x = NULL, y = "Frekuensi") +
  theme_bw()

norm_after <- bind_rows(lapply(levels(df$Sleep.Disorder), function(k) {
  sub <- df %>% filter(Sleep.Disorder == k) %>% dplyr::select(all_of(num_cols_orig))
  bind_rows(lapply(names(sub), function(v) {
    sw <- shapiro.test(sub[[v]])
    data.frame(Kelas = k, Variabel = v,
               W = round(sw$statistic, 4),
               p.value = round(sw$p.value, 4),
               Normal = ifelse(sw$p.value > 0.05, "Ya", "Tidak"))
  }))
}))

kable(norm_after, caption = "Uji Normalitas Setelah Box-Cox (per Kelas & Variabel)",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE) %>%
  row_spec(which(norm_after$Normal == "Ya"),    background = "#d4edda") %>%
  row_spec(which(norm_after$Normal == "Tidak"), background = "#f8d7da") %>%
  scroll_box(width = "100%", height = "350px")
Uji Normalitas Setelah Box-Cox (per Kelas & Variabel)
Kelas Variabel W p.value Normal
Insomnia Age 0.7088 0 Tidak
Insomnia Sleep.Duration 0.7805 0 Tidak
Insomnia Quality.of.Sleep 0.8567 0 Tidak
Insomnia Physical.Activity.Level 0.6320 0 Tidak
Insomnia Stress.Level 0.7458 0 Tidak
Insomnia Systolic 0.7634 0 Tidak
Insomnia Diastolic 0.7552 0 Tidak
Insomnia Heart.Rate 0.7907 0 Tidak
Insomnia Daily.Steps 0.6528 0 Tidak
None Age 0.9544 0 Tidak
None Sleep.Duration 0.9018 0 Tidak
None Quality.of.Sleep 0.8535 0 Tidak
None Physical.Activity.Level 0.8355 0 Tidak
None Stress.Level 0.9097 0 Tidak
None Systolic 0.9106 0 Tidak
None Diastolic 0.8550 0 Tidak
None Heart.Rate 0.8930 0 Tidak
None Daily.Steps 0.8206 0 Tidak
Sleep Apnea Age 0.7984 0 Tidak
Sleep Apnea Sleep.Duration 0.7809 0 Tidak
Sleep Apnea Quality.of.Sleep 0.7787 0 Tidak
Sleep Apnea Physical.Activity.Level 0.6972 0 Tidak
Sleep Apnea Stress.Level 0.7139 0 Tidak
Sleep Apnea Systolic 0.4869 0 Tidak
Sleep Apnea Diastolic 0.5543 0 Tidak
Sleep Apnea Heart.Rate 0.7787 0 Tidak
Sleep Apnea Daily.Steps 0.8465 0 Tidak
n_normal <- sum(norm_after$Normal == "Ya")
cat("Normal:", n_normal, "dari", nrow(norm_after), "kombinasi (",
    round(n_normal / nrow(norm_after) * 100, 1), "%)\n")
## Normal: 0 dari 27 kombinasi ( 0 %)

Catatan: Jika masih ada yang tidak normal setelah Box-Cox, hal ini umum pada data nyata. LDA cukup robust terhadap pelanggaran normalitas selama n per kelas >= 25. Setelah Box-Cox, data masuk ke PCA.

10.3 Penanganan Multikolinearitas – PCA

Drop iteratif tidak cukup karena multikolinearitas pada data ini sangat kuat. Solusi definitif adalah PCA: mengubah fitur-fitur yang saling berkorelasi menjadi komponen baru yang secara matematika orthogonal, sehingga VIF antar komponen = 1.

df_clean       <- df
num_cols_clean <- df_clean %>% dplyr::select(where(is.numeric)) %>% names()

cat("Fitur numerik sebelum PCA:", paste(num_cols_clean, collapse = ", "), "\n")
## Fitur numerik sebelum PCA: Age, Sleep.Duration, Quality.of.Sleep, Physical.Activity.Level, Stress.Level, Systolic, Diastolic, Heart.Rate, Daily.Steps
# Standarisasi (wajib sebelum PCA)
num_matrix <- scale(df_clean[num_cols_clean])

# Jalankan PCA
pca_result <- prcomp(num_matrix, center = FALSE, scale. = FALSE)

# Pilih PC yang menjelaskan >= 80% variansi
explained_var <- cumsum(pca_result$sdev^2) / sum(pca_result$sdev^2)
n_pc          <- which(explained_var >= 0.80)[1]
cat("\nJumlah PC yang menjelaskan >= 80% variansi:", n_pc, "\n")
## 
## Jumlah PC yang menjelaskan >= 80% variansi: 3
cat("Variansi kumulatif PC1 s.d. PC", n_pc, ":",
    round(explained_var[n_pc] * 100, 2), "%\n")
## Variansi kumulatif PC1 s.d. PC 3 : 87.37 %
ev_df <- data.frame(
  PC             = paste0("PC", seq_along(pca_result$sdev)),
  Std.Dev        = round(pca_result$sdev, 4),
  Var.Explained  = round(pca_result$sdev^2 / sum(pca_result$sdev^2) * 100, 2),
  Cumulative.Pct = round(explained_var * 100, 2)
)

kable(ev_df, caption = "Explained Variance per Principal Component", row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = FALSE) %>%
  row_spec(seq_len(n_pc), background = "#d4edda") %>%
  row_spec(n_pc, bold = TRUE)
Explained Variance per Principal Component
PC Std.Dev Var.Explained Cumulative.Pct
PC1 1.8940 39.86 39.86
PC2 1.6273 29.42 69.28
PC3 1.2758 18.08 87.37
PC4 0.7193 5.75 93.11
PC5 0.5660 3.56 96.67
PC6 0.3920 1.71 98.38
PC7 0.2924 0.95 99.33
PC8 0.2219 0.55 99.88
PC9 0.1043 0.12 100.00
ev_plot <- ev_df %>% mutate(PC = factor(PC, levels = PC))

p_scree <- ggplot(ev_plot, aes(x = PC, y = Var.Explained)) +
  geom_col(fill = "#4C72B0", alpha = 0.8) +
  geom_line(aes(group = 1), color = "#DD8452", linewidth = 1) +
  geom_point(color = "#DD8452", size = 2.5) +
  geom_vline(xintercept = n_pc + 0.5, linetype = "dashed",
             color = "red", linewidth = 0.8) +
  annotate("text", x = n_pc + 0.7, y = max(ev_plot$Var.Explained) * 0.9,
           label = paste0("PC", n_pc, " cutoff"), color = "red", hjust = 0, size = 3.5) +
  labs(title = "Scree Plot PCA", x = "Principal Component",
       y = "Variansi Dijelaskan (%)") +
  theme_bw()

p_cum <- ggplot(ev_plot, aes(x = PC, y = Cumulative.Pct, group = 1)) +
  geom_line(color = "#55A868", linewidth = 1.2) +
  geom_point(color = "#55A868", size = 2.5) +
  geom_hline(yintercept = 80, linetype = "dashed", color = "red", linewidth = 0.8) +
  annotate("text", x = 2, y = 81.5, label = "80% threshold",
           color = "red", size = 3.5) +
  labs(title = "Cumulative Explained Variance",
       x = "Principal Component", y = "Kumulatif (%)") +
  theme_bw()

grid.arrange(p_scree, p_cum, ncol = 2)

# Gunakan n_pc komponen pertama sebagai fitur baru
pc_scores        <- as.data.frame(pca_result$x[, seq_len(n_pc)])
names(pc_scores) <- paste0("PC", seq_len(n_pc))

df_clean <- bind_cols(
  df_clean %>% dplyr::select(Sleep.Disorder, Gender, Occupation, BMI.Category),
  pc_scores
)

num_cols_clean <- paste0("PC", seq_len(n_pc))
cat("Fitur baru setelah PCA:", paste(num_cols_clean, collapse = ", "), "\n")
## Fitur baru setelah PCA: PC1, PC2, PC3
cat("Dimensi df_clean:", nrow(df_clean), "x", ncol(df_clean), "\n")
## Dimensi df_clean: 374 x 7
df_vif2 <- df_clean %>%
  mutate(Gender_bin     = as.integer(Gender == "Male"),
         BMI_Overweight = as.integer(BMI.Category == "Overweight"),
         BMI_Obese      = as.integer(BMI.Category == "Obese"))

f2        <- as.formula(paste("as.numeric(Sleep.Disorder) ~",
               paste(c(num_cols_clean, "Gender_bin", "BMI_Overweight", "BMI_Obese"),
                     collapse = "+")))
vif_vals2 <- vif(lm(f2, data = df_vif2))

vif_df2 <- data.frame(
  Variabel = names(vif_vals2),
  VIF      = round(vif_vals2, 3),
  Status   = ifelse(vif_vals2 > 10, "Multikolinear Tinggi",
             ifelse(vif_vals2 > 5,  "Perlu Perhatian", "Aman"))
)

kable(vif_df2, caption = "VIF Setelah PCA – Semua Komponen Orthogonal",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) %>%
  row_spec(which(vif_df2$Status == "Multikolinear Tinggi"), color="white", background="#d9534f") %>%
  row_spec(which(vif_df2$Status == "Perlu Perhatian"),      color="white", background="#f0ad4e") %>%
  row_spec(which(vif_df2$Status == "Aman"),                 color="white", background="#5cb85c")
VIF Setelah PCA – Semua Komponen Orthogonal
Variabel VIF Status
PC1 1.597 Aman
PC2 2.246 Aman
PC3 1.547 Aman
Gender_bin 1.501 Aman
BMI_Overweight 3.274 Aman
BMI_Obese 1.375 Aman

10.4 Penanganan Class Imbalance – SMOTE Manual

set.seed(42)

smote_manual <- function(minority_num, n_target, k = 5) {
  n_min <- nrow(minority_num)
  synthetic <- vector("list", n_target)
  for (i in seq_len(n_target)) {
    idx_base <- sample(n_min, 1)
    base     <- as.numeric(minority_num[idx_base, ])
    dists    <- apply(minority_num, 1, function(r) sqrt(sum((as.numeric(r) - base)^2)))
    dists[idx_base] <- Inf
    nn_idx   <- order(dists)[seq_len(min(k, n_min - 1))]
    neighbor <- as.numeric(minority_num[sample(nn_idx, 1), ])
    synthetic[[i]] <- base + runif(1) * (neighbor - base)
  }
  as.data.frame(do.call(rbind, synthetic))
}

rownames(df_clean) <- seq_len(nrow(df_clean))
num_mat <- as.data.frame(df_clean[num_cols_clean])

n_target_cls <- max(table(df_clean$Sleep.Disorder))
df_balanced  <- df_clean

for (kls in c("Insomnia", "Sleep Apnea")) {
  idx_kls  <- which(df_clean$Sleep.Disorder == kls)
  n_needed <- n_target_cls - length(idx_kls)
  if (n_needed > 0) {
    syn_num  <- smote_manual(num_mat[idx_kls, , drop = FALSE], n_needed, k = 5)
    names(syn_num) <- num_cols_clean
    syn_full <- syn_num
    syn_full$Sleep.Disorder <- factor(kls,  levels = levels(df_clean$Sleep.Disorder))
    syn_full$Gender         <- factor(names(which.max(table(df_clean$Gender[idx_kls]))),
                                      levels = levels(df_clean$Gender))
    syn_full$Occupation     <- factor(names(which.max(table(df_clean$Occupation[idx_kls]))),
                                      levels = levels(df_clean$Occupation))
    syn_full$BMI.Category   <- factor(names(which.max(table(df_clean$BMI.Category[idx_kls]))),
                                      levels = levels(df_clean$BMI.Category))
    syn_full <- syn_full[, names(df_balanced)]
    df_balanced <- bind_rows(df_balanced, syn_full)
  }
}

df_balanced$Sleep.Disorder <- factor(df_balanced$Sleep.Disorder,
                                      levels = levels(df_clean$Sleep.Disorder))

dist_compare <- merge(
  as.data.frame(table(df_clean$Sleep.Disorder))    %>% rename(Kelas = Var1, Sebelum = Freq),
  as.data.frame(table(df_balanced$Sleep.Disorder)) %>% rename(Kelas = Var1, Sesudah = Freq),
  by = "Kelas")
dist_compare$Ditambah <- dist_compare$Sesudah - dist_compare$Sebelum

kable(dist_compare, caption = "Distribusi Kelas Sebelum vs Sesudah SMOTE",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) %>%
  row_spec(which(dist_compare$Ditambah > 0), background = "#d4edda")
Distribusi Kelas Sebelum vs Sesudah SMOTE
Kelas Sebelum Sesudah Ditambah
Insomnia 77 219 142
None 219 219 0
Sleep Apnea 78 219 141
p_bef <- ggplot(df_clean, aes(x=Sleep.Disorder, fill=Sleep.Disorder)) +
  geom_bar(alpha=0.85) +
  geom_text(stat="count", aes(label=after_stat(count)), vjust=-0.5, fontface="bold") +
  scale_fill_manual(values=c("None"="#55A868","Insomnia"="#DD8452","Sleep Apnea"="#C44E52")) +
  labs(title="Sebelum SMOTE", x=NULL, y="Jumlah") + ylim(0,260) +
  theme_bw() + theme(legend.position="none")

p_aft <- ggplot(df_balanced, aes(x=Sleep.Disorder, fill=Sleep.Disorder)) +
  geom_bar(alpha=0.85) +
  geom_text(stat="count", aes(label=after_stat(count)), vjust=-0.5, fontface="bold") +
  scale_fill_manual(values=c("None"="#55A868","Insomnia"="#DD8452","Sleep Apnea"="#C44E52")) +
  labs(title="Sesudah SMOTE", x=NULL, y="Jumlah") + ylim(0,260) +
  theme_bw() + theme(legend.position="none")

grid.arrange(p_bef, p_aft, ncol=2)

10.5 Verifikasi Akhir Asumsi LDA

boxm_final <- boxM(
  df_balanced %>% dplyr::select(all_of(num_cols_clean)),
  df_balanced$Sleep.Disorder
)

df_vif_final <- df_balanced %>%
  mutate(Gender_bin     = as.integer(Gender == "Male"),
         BMI_Overweight = as.integer(BMI.Category == "Overweight"),
         BMI_Obese      = as.integer(BMI.Category == "Obese"))
f_final  <- as.formula(paste("as.numeric(Sleep.Disorder) ~",
  paste(c(num_cols_clean, "Gender_bin", "BMI_Overweight", "BMI_Obese"), collapse = "+")))
vif_final <- vif(lm(f_final, data = df_vif_final))

norm_final <- bind_rows(lapply(levels(df_balanced$Sleep.Disorder), function(k) {
  sub <- df_balanced %>% filter(Sleep.Disorder == k) %>%
    dplyr::select(all_of(num_cols_clean))
  bind_rows(lapply(names(sub), function(v) {
    sw <- shapiro.test(sub[[v]])
    data.frame(Kelas = k, Variabel = v,
               p.value = round(sw$p.value, 4),
               Normal = ifelse(sw$p.value > 0.05, "Ya", "Tidak"))
  }))
}))

n_normal_pct <- round(sum(norm_final$Normal == "Ya") / nrow(norm_final) * 100, 1)

verif <- data.frame(
  Aspek = c(
    "Jumlah Observasi (setelah SMOTE)",
    "Distribusi Kelas",
    "Outlier",
    "Normalitas (kombinasi normal)",
    "Homogenitas Kovarians (Box's M)",
    "Multikolinearitas (VIF)"
  ),
  Hasil = c(
    paste(nrow(df_balanced), "baris"),
    paste(paste(names(table(df_balanced$Sleep.Disorder)),
                table(df_balanced$Sleep.Disorder), sep = "="), collapse = ", "),
    "0 outlier (setelah Winsorizing)",
    paste0(sum(norm_final$Normal == "Ya"), " dari ", nrow(norm_final),
           " kombinasi normal (", n_normal_pct, "%)"),
    paste0("p = ", round(boxm_final$p.value, 4),
           " — Box's M sangat sensitif pada n besar; pelanggaran ini umum. ",
           "LDA tetap digunakan dengan catatan."),
    ifelse(all(vif_final <= 5),  "Semua VIF ≤ 5 (Aman)",
    ifelse(all(vif_final <= 10), "Semua VIF ≤ 10 (Aman)", "Ada VIF > 10"))
  ),
  Status = c(
    "OK", "OK", "OK",
    ifelse(n_normal_pct >= 50, "OK", "Diterima*"),
    "Diterima*",
    ifelse(all(vif_final <= 10), "OK", "Perhatian")
  )
)

kable(verif, caption = "Verifikasi Asumsi Akhir – Data Siap LDA",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE) %>%
  column_spec(1, bold = TRUE) %>%
  row_spec(which(verif$Status == "OK"),        background = "#d4edda") %>%
  row_spec(which(verif$Status == "Diterima*"), background = "#fff3cd") %>%
  row_spec(which(verif$Status == "Perhatian"), background = "#f8d7da")
Verifikasi Asumsi Akhir – Data Siap LDA
Aspek Hasil Status
Jumlah Observasi (setelah SMOTE) 657 baris OK
Distribusi Kelas Insomnia=219, None=219, Sleep Apnea=219 OK
Outlier 0 outlier (setelah Winsorizing) OK
Normalitas (kombinasi normal) 0 dari 9 kombinasi normal (0%) Diterima*
Homogenitas Kovarians (Box’s M) p = 0 — Box’s M sangat sensitif pada n besar; pelanggaran ini umum. LDA tetap digunakan dengan catatan. Diterima*
Multikolinearitas (VIF) Semua VIF ≤ 5 (Aman) OK

Keterangan status: - OK = asumsi terpenuhi sepenuhnya - Diterima* = pelanggaran ada namun dapat diterima secara statistik — Box’s M hampir selalu signifikan pada n > 200 karena sangat sensitif terhadap ukuran sampel besar, bukan karena perbedaan kovarians yang substansial. Normalitas yang tidak sempurna ditoleransi oleh LDA selama n per kelas cukup besar (≥ 25). - Dataset final tersimpan di objek df_balanced — siap diteruskan ke tahap pemodelan LDA/QDA.

cat("Dataset LDA final: df_balanced\n")
## Dataset LDA final: df_balanced
cat("Dimensi      :", nrow(df_balanced), "baris x", ncol(df_balanced), "kolom\n")
## Dimensi      : 657 baris x 7 kolom
cat("Fitur numerik:", paste(num_cols_clean, collapse = ", "), "\n")
## Fitur numerik: PC1, PC2, PC3
cat("Distribusi   :", paste(names(table(df_balanced$Sleep.Disorder)),
                            table(df_balanced$Sleep.Disorder),
                            sep = "=", collapse = " | "), "\n")
## Distribusi   : Insomnia=219 | None=219 | Sleep Apnea=219


11 LDA

11.1 Fitting Model LDA

11.1.1 Pemilihan Prediktor

Prediktor yang digunakan adalah PC scores hasil PCA (orthogonal, bebas multikolinearitas). Variabel kategorik (Gender, BMI.Category, Occupation) tidak disertakan dalam LDA standar.

cat("Prediktor LDA (PC scores):\n")
## Prediktor LDA (PC scores):
cat(paste(num_cols_clean, collapse = ", "), "\n\n")
## PC1, PC2, PC3
cat("Variabel target: Sleep.Disorder (",
    paste(levels(df_balanced$Sleep.Disorder), collapse = " / "), ")\n")
## Variabel target: Sleep.Disorder ( Insomnia / None / Sleep Apnea )
cat("Distribusi kelas:\n")
## Distribusi kelas:
print(table(df_balanced$Sleep.Disorder))
## 
##    Insomnia        None Sleep Apnea 
##         219         219         219

11.1.2 Fit Model LDA (Prior Proporsional)

set.seed(42)

lda_formula <- as.formula(
  paste("Sleep.Disorder ~", paste(num_cols_clean, collapse = " + "))
)

lda_model <- lda(
  lda_formula,
  data  = df_balanced
)

print(lda_model)
## Call:
## lda(lda_formula, data = df_balanced)
## 
## Prior probabilities of groups:
##    Insomnia        None Sleep Apnea 
##   0.3333333   0.3333333   0.3333333 
## 
## Group means:
##                    PC1        PC2         PC3
## Insomnia    -0.8753958  0.2024422  1.06088457
## None         0.4541202 -0.8009923 -0.38494922
## Sleep Apnea -0.7919284  2.1240026 -0.01783174
## 
## Coefficients of linear discriminants:
##            LD1        LD2
## PC1 -0.3133673 -0.1538351
## PC2  0.9235304 -0.2601952
## PC3  0.5667034  0.8305413
## 
## Proportion of trace:
##    LD1    LD2 
## 0.8337 0.1663

11.1.3 Koefisien Fungsi Diskriminan

coef_df <- as.data.frame(lda_model$scaling)
coef_df$Variabel <- rownames(coef_df)
rownames(coef_df) <- NULL
coef_df <- coef_df %>% dplyr::select(Variabel, everything())

names(coef_df)[-1] <- paste0("LD", seq_len(ncol(coef_df) - 1))

kable(coef_df %>% mutate(across(where(is.numeric), ~ round(., 4))),
      caption   = "Koefisien Fungsi Diskriminan Linear (Unstandardized)",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)
Koefisien Fungsi Diskriminan Linear (Unstandardized)
Variabel LD1 LD2
PC1 -0.3134 -0.1538
PC2 0.9235 -0.2602
PC3 0.5667 0.8305
prop_var <- lda_model$svd^2 / sum(lda_model$svd^2)

prop_df <- data.frame(
  LD               = paste0("LD", seq_along(prop_var)),
  Singular_Value   = round(lda_model$svd, 4),
  Proporsi_Varians = paste0(round(prop_var * 100, 2), "%"),
  Kumulatif        = paste0(round(cumsum(prop_var) * 100, 2), "%")
)

kable(prop_df,
      caption   = "Proporsi Varians yang Dijelaskan oleh Setiap Fungsi Diskriminan",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Proporsi Varians yang Dijelaskan oleh Setiap Fungsi Diskriminan
LD Singular_Value Proporsi_Varians Kumulatif
LD1 24.8065 83.37% 83.37%
LD2 11.0806 16.63% 100%

11.1.4 Group Means (Centroid per Kelas)

gm_df <- as.data.frame(lda_model$means)
gm_df$Kelas <- rownames(gm_df)
gm_df <- gm_df %>% dplyr::select(Kelas, everything())
rownames(gm_df) <- NULL

kable(gm_df %>% mutate(across(where(is.numeric), ~ round(., 3))),
      caption   = "Group Means – Rata-rata Prediktor per Kelas Target",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = TRUE) %>%
  scroll_box(width = "100%")
Group Means – Rata-rata Prediktor per Kelas Target
Kelas PC1 PC2 PC3
Insomnia -0.875 0.202 1.061
None 0.454 -0.801 -0.385
Sleep Apnea -0.792 2.124 -0.018

11.2 Uji Signifikansi LDA (Wilks’ Lambda & F-test)

manova_formula <- as.formula(
  paste("cbind(", paste(num_cols_clean, collapse = ", "), ") ~ Sleep.Disorder")
)

manova_result <- manova(manova_formula, data = df_balanced)

wilks_result  <- summary(manova_result, test = "Wilks")
wilks_stat    <- wilks_result$stats

wilks_df <- data.frame(
  Uji        = "Wilks' Lambda",
  Lambda     = round(wilks_stat["Sleep.Disorder", "Wilks"], 4),
  F_approx   = round(wilks_stat["Sleep.Disorder", "approx F"], 4),
  num_df     = wilks_stat["Sleep.Disorder", "num Df"],
  den_df     = wilks_stat["Sleep.Disorder", "den Df"],
  p.value    = format.pval(wilks_stat["Sleep.Disorder", "Pr(>F)"],
                            digits = 4, eps = 0.0001),
  Kesimpulan = ifelse(wilks_stat["Sleep.Disorder", "Pr(>F)"] < 0.05,
                      "Signifikan – ada perbedaan antar kelas ✓",
                      "Tidak Signifikan")
)

kable(wilks_df,
      caption   = "Uji Wilks' Lambda – Signifikansi Model LDA",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(1,
           color      = "white",
           background = ifelse(wilks_stat["Sleep.Disorder", "Pr(>F)"] < 0.05,
                               "#5cb85c", "#d9534f"))
Uji Wilks’ Lambda – Signifikansi Model LDA
Uji Lambda F_approx num_df den_df p.value Kesimpulan
Wilks’ Lambda 0.2523 215.3675 6 1304 < 1e-04 Signifikan – ada perbedaan antar kelas ✓
mv_tests <- data.frame(
  Uji = c("Pillai's Trace", "Wilks' Lambda", "Hotelling-Lawley", "Roy's Largest Root"),
  Statistik = c(
    round(summary(manova_result, test = "Pillai")$stats["Sleep.Disorder", "Pillai"], 4),
    round(wilks_stat["Sleep.Disorder", "Wilks"], 4),
    round(summary(manova_result, test = "Hotelling-Lawley")$stats["Sleep.Disorder", "Hotelling-Lawley"], 4),
    round(summary(manova_result, test = "Roy")$stats["Sleep.Disorder", "Roy"], 4)
  ),
  F_approx = c(
    round(summary(manova_result, test = "Pillai")$stats["Sleep.Disorder", "approx F"], 4),
    round(wilks_stat["Sleep.Disorder", "approx F"], 4),
    round(summary(manova_result, test = "Hotelling-Lawley")$stats["Sleep.Disorder", "approx F"], 4),
    round(summary(manova_result, test = "Roy")$stats["Sleep.Disorder", "approx F"], 4)
  ),
  p.value = c(
    format.pval(summary(manova_result, test = "Pillai")$stats["Sleep.Disorder", "Pr(>F)"], digits=4, eps=0.0001),
    format.pval(wilks_stat["Sleep.Disorder", "Pr(>F)"], digits=4, eps=0.0001),
    format.pval(summary(manova_result, test = "Hotelling-Lawley")$stats["Sleep.Disorder", "Pr(>F)"], digits=4, eps=0.0001),
    format.pval(summary(manova_result, test = "Roy")$stats["Sleep.Disorder", "Pr(>F)"], digits=4, eps=0.0001)
  )
)

kable(mv_tests,
      caption   = "Uji Signifikansi Multivariat MANOVA (Ekuivalen LDA)",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)
Uji Signifikansi Multivariat MANOVA (Ekuivalen LDA)
Uji Statistik F_approx p.value
Pillai’s Trace 0.9260 187.6632 < 1e-04
Wilks’ Lambda 0.2523 215.3675 < 1e-04
Hotelling-Lawley 2.2573 244.9191 < 1e-04
Roy’s Largest Root 1.8818 409.6150 < 1e-04

11.3 Prediksi & Confusion Matrix – LDA

11.3.1 Prediksi pada Data Training (Resubstitution)

lda_pred <- predict(lda_model, newdata = df_balanced)

df_lda_result <- df_balanced %>%
  mutate(
    Predicted_LDA  = lda_pred$class,
    Posterior_None = round(lda_pred$posterior[, "None"], 4),
    Posterior_Ins  = round(lda_pred$posterior[, "Insomnia"], 4),
    Posterior_SA   = round(lda_pred$posterior[, "Sleep Apnea"], 4)
  )

kable(head(df_lda_result %>%
             dplyr::select(Sleep.Disorder, Predicted_LDA,
                           Posterior_None, Posterior_Ins, Posterior_SA), 10),
      caption   = "10 Baris Pertama: Prediksi LDA & Posterior Probability",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE)
10 Baris Pertama: Prediksi LDA & Posterior Probability
Sleep.Disorder Predicted_LDA Posterior_None Posterior_Ins Posterior_SA
None Insomnia 0.3395 0.6502 0.0103
None None 0.9761 0.0187 0.0052
None None 0.9761 0.0187 0.0052
Sleep Apnea Insomnia 0.0003 0.9799 0.0198
Sleep Apnea Insomnia 0.0003 0.9799 0.0198
Insomnia Insomnia 0.0003 0.9799 0.0198
Insomnia Insomnia 0.0022 0.9290 0.0688
None None 0.9985 0.0014 0.0002
None None 0.9985 0.0014 0.0002
None None 0.9985 0.0014 0.0002

11.3.2 Confusion Matrix LDA

cm_lda <- table(
  Aktual   = df_balanced$Sleep.Disorder,
  Prediksi = lda_pred$class
)

kable(cm_lda,
      caption = "Confusion Matrix – LDA (Resubstitution)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE) %>%
  add_header_above(c(" " = 1, "Prediksi" = length(levels(df_balanced$Sleep.Disorder))))
Confusion Matrix – LDA (Resubstitution)
Prediksi
Insomnia None Sleep Apnea
Insomnia 204 13 2
None 37 177 5
Sleep Apnea 34 9 176
n_total   <- sum(cm_lda)
n_correct <- sum(diag(cm_lda))
accuracy_lda  <- n_correct / n_total
aper_lda      <- 1 - accuracy_lda

per_class_lda <- lapply(levels(df_balanced$Sleep.Disorder), function(k) {
  TP <- cm_lda[k, k]
  FP <- sum(cm_lda[, k]) - TP
  FN <- sum(cm_lda[k, ]) - TP
  TN <- n_total - TP - FP - FN
  data.frame(
    Kelas       = k,
    TP = TP, FP = FP, FN = FN, TN = TN,
    Sensitivity = round(TP / (TP + FN), 4),
    Specificity = round(TN / (TN + FP), 4),
    Precision   = round(TP / (TP + FP), 4),
    F1_Score    = round(2 * TP / (2 * TP + FP + FN), 4)
  )
})

per_class_lda_df <- bind_rows(per_class_lda)

kable(per_class_lda_df,
      caption   = "Metrik Evaluasi Per Kelas – LDA",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = TRUE)
Metrik Evaluasi Per Kelas – LDA
Kelas TP FP FN TN Sensitivity Specificity Precision F1_Score
Insomnia 204 71 15 367 0.9315 0.8379 0.7418 0.8259
None 177 22 42 416 0.8082 0.9498 0.8894 0.8469
Sleep Apnea 176 7 43 431 0.8037 0.9840 0.9617 0.8756
overall_lda <- data.frame(
  Model       = "LDA",
  Accuracy    = paste0(round(accuracy_lda * 100, 2), "%"),
  APER        = paste0(round(aper_lda * 100, 2), "%"),
  N_Correct   = n_correct,
  N_Incorrect = n_total - n_correct,
  N_Total     = n_total
)

kable(overall_lda,
      caption   = "Ringkasan Evaluasi Model LDA (Resubstitution)",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(1, color = "white",
           background = ifelse(accuracy_lda >= 0.8, "#5cb85c", "#f0ad4e"))
Ringkasan Evaluasi Model LDA (Resubstitution)
Model Accuracy APER N_Correct N_Incorrect N_Total
LDA 84.78% 15.22% 557 100 657

11.3.3 LOOCV – Leave-One-Out Cross-Validation LDA

set.seed(42)

lda_cv <- lda(
  lda_formula,
  data  = df_balanced,
  prior = prop.table(table(df_balanced$Sleep.Disorder)),
  CV    = TRUE
)

cm_cv <- table(
  Aktual   = df_balanced$Sleep.Disorder,
  Prediksi = lda_cv$class
)

acc_cv  <- sum(diag(cm_cv)) / sum(cm_cv)
aper_cv <- 1 - acc_cv

kable(cm_cv,
      caption = "Confusion Matrix – LDA LOOCV") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE) %>%
  add_header_above(c(" " = 1, "Prediksi (LOOCV)" = length(levels(df_balanced$Sleep.Disorder))))
Confusion Matrix – LDA LOOCV
Prediksi (LOOCV)
Insomnia None Sleep Apnea
Insomnia 204 13 2
None 37 177 5
Sleep Apnea 34 9 176
loocv_df <- data.frame(
  Model       = "LDA (LOOCV)",
  Accuracy    = paste0(round(acc_cv * 100, 2), "%"),
  APER        = paste0(round(aper_cv * 100, 2), "%"),
  N_Correct   = sum(diag(cm_cv)),
  N_Incorrect = sum(cm_cv) - sum(diag(cm_cv)),
  N_Total     = sum(cm_cv)
)

kable(loocv_df,
      caption   = "Ringkasan Evaluasi Model LDA – LOOCV",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(1, color = "white",
           background = ifelse(acc_cv >= 0.8, "#5cb85c", "#f0ad4e"))
Ringkasan Evaluasi Model LDA – LOOCV
Model Accuracy APER N_Correct N_Incorrect N_Total
LDA (LOOCV) 84.78% 15.22% 557 100 657


12 Cek Asumsi MLR

Catatan: Cek asumsi MLR dilakukan pada data asli (df) sebelum transformasi PCA, karena MLR membutuhkan interpretasi koefisien per prediktor asli.

12.1 Asumsi 1 – Linearitas Log-Odds (Box-Tidwell)

# Gunakan df_raw (sebelum Box-Cox) agar nilai positif dan log() aman
safe_log <- function(x) log(x - min(x) + 1)

df_bt <- df_raw %>%
  mutate(
    ln_Age               = safe_log(Age),
    ln_Sleep.Duration    = safe_log(Sleep.Duration),
    ln_Physical.Activity = safe_log(Physical.Activity.Level),
    ln_Stress.Level      = safe_log(Stress.Level),
    ln_Heart.Rate        = safe_log(Heart.Rate),
    ln_Daily.Steps       = safe_log(Daily.Steps + 1),
    ln_Systolic          = safe_log(Systolic),
    ln_Diastolic         = safe_log(Diastolic)
  )

df_bt$y_bin <- as.integer(df_bt$Sleep.Disorder != "None")

bt_model <- glm(
  y_bin ~ Age + ln_Age +
          Sleep.Duration + ln_Sleep.Duration +
          Physical.Activity.Level + ln_Physical.Activity +
          Stress.Level + ln_Stress.Level +
          Heart.Rate + ln_Heart.Rate +
          Daily.Steps + ln_Daily.Steps +
          Systolic + ln_Systolic +
          Diastolic + ln_Diastolic,
  data   = df_bt,
  family = binomial(link = "logit")
)

bt_summary <- summary(bt_model)$coefficients
bt_df <- data.frame(
  Term      = rownames(bt_summary),
  Estimate  = round(bt_summary[, "Estimate"], 4),
  Std.Error = round(bt_summary[, "Std. Error"], 4),
  z.value   = round(bt_summary[, "z value"], 4),
  p.value   = round(bt_summary[, "Pr(>|z|)"], 4)
) %>%
  filter(grepl("^ln_", Term)) %>%
  mutate(
    Prediktor = gsub("^ln_", "", Term),
    Status    = ifelse(p.value < 0.05, "Tidak Linier ⚠", "Linier ✓")
  ) %>%
  dplyr::select(Prediktor, Estimate, Std.Error, z.value, p.value, Status)

kable(bt_df,
      caption   = "Box-Tidwell Test – Uji Linearitas Log-Odds (Asumsi MLR)",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(which(bt_df$Status == "Tidak Linier ⚠"), background = "#fff3cd") %>%
  row_spec(which(bt_df$Status == "Linier ✓"),       background = "#d4edda")
Box-Tidwell Test – Uji Linearitas Log-Odds (Asumsi MLR)
Prediktor Estimate Std.Error z.value p.value Status
Age -1.0180 1.6284 -0.6252 0.5319 Linier ✓
Sleep.Duration -12.8333 12.0080 -1.0687 0.2852 Linier ✓
Physical.Activity 0.9870 0.9943 0.9927 0.3208 Linier ✓
Stress.Level 11.9614 7.2523 1.6493 0.0991 Linier ✓
Heart.Rate -1.3586 1.5918 -0.8535 0.3934 Linier ✓
Daily.Steps 0.3592 3.9767 0.0903 0.9280 Linier ✓
Systolic -11.7622 3.7682 -3.1215 0.0018 Tidak Linier ⚠
Diastolic 5.4851 2.5063 2.1885 0.0286 Tidak Linier ⚠

Interpretasi: Koefisien ln(X) yang tidak signifikan (p > 0.05) menunjukkan hubungan linier antara log-odds dan prediktor tersebut sudah terpenuhi.

12.2 Asumsi 2 – Independensi Observasi

# Gunakan df_raw (sebelum Box-Cox) untuk cek independensi
n_dup <- sum(duplicated(df_raw %>% dplyr::select(-Sleep.Disorder)))

ind_df <- data.frame(
  Aspek = c(
    "Desain pengambilan data",
    "Duplikasi observasi",
    "Pengamatan berulang (longitudinal)"
  ),
  Evaluasi = c(
    "Cross-sectional survey – setiap baris = satu individu unik",
    paste0("Jumlah baris duplikat: ", n_dup,
           " (berdasarkan semua prediktor)"),
    "Dataset tidak memiliki ID waktu atau pengulangan per subjek"
  ),
  Status = c(
    "✓ Terpenuhi",
    ifelse(n_dup == 0, "✓ Tidak ada duplikat", "⚠ Ada duplikat – perlu dicek"),
    "✓ Terpenuhi"
  )
)

kable(ind_df, caption = "Evaluasi Asumsi Independensi Observasi (MLR)",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
  column_spec(3, bold = TRUE,
              color = ifelse(grepl("✓", ind_df$Status), "#155724", "#856404"))
Evaluasi Asumsi Independensi Observasi (MLR)
Aspek Evaluasi Status
Desain pengambilan data Cross-sectional survey – setiap baris = satu individu unik ✓ Terpenuhi
Duplikasi observasi Jumlah baris duplikat: 265 (berdasarkan semua prediktor) ⚠ Ada duplikat – perlu dicek
Pengamatan berulang (longitudinal) Dataset tidak memiliki ID waktu atau pengulangan per subjek ✓ Terpenuhi

12.3 Asumsi 3 – Tidak Ada Multikolinearitas (VIF)

VIF sudah dihitung di bagian Cek Asumsi LDA (Asumsi 3) dan hasilnya berlaku untuk MLR juga karena prediktor yang digunakan sama. Lihat tabel VIF di atas.

12.4 Asumsi 4 – Tidak Ada Outlier Ekstrem (Mahalanobis Distance)

# Gunakan df_raw (sebelum Box-Cox) untuk Mahalanobis – skala asli lebih representatif
mah_data  <- df_raw %>% dplyr::select(all_of(pred_vars))
mah_dist  <- mahalanobis(mah_data, colMeans(mah_data), cov(mah_data))
mah_pval  <- pchisq(mah_dist, df = ncol(mah_data), lower.tail = FALSE)
n_out_mah <- sum(mah_pval < 0.001)

mah_df <- data.frame(
  Metode            = "Mahalanobis Distance",
  Threshold         = "p < 0.001 (χ² dengan df = 8)",
  N_Outlier_Ekstrem = n_out_mah,
  Persen            = paste0(round(n_out_mah / nrow(df_raw) * 100, 2), "%"),
  Status            = ifelse(n_out_mah / nrow(df_raw) < 0.05,
                             "Outlier < 5% – dapat ditoleransi ✓",
                             "Outlier ≥ 5% – perlu penanganan ⚠")
)

kable(mah_df,
      caption   = "Deteksi Outlier Multivariat – Mahalanobis Distance (Asumsi MLR)",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Deteksi Outlier Multivariat – Mahalanobis Distance (Asumsi MLR)
Metode Threshold N_Outlier_Ekstrem Persen Status
Mahalanobis Distance p < 0.001 (χ² dengan df = 8) 8 2.14% Outlier < 5% – dapat ditoleransi ✓
mah_plot_df <- data.frame(
  Index    = 1:nrow(df_raw),
  Mah.Dist = mah_dist,
  Outlier  = mah_pval < 0.001
)

ggplot(mah_plot_df, aes(x = Index, y = Mah.Dist, color = Outlier)) +
  geom_point(alpha = 0.7, size = 1.5) +
  geom_hline(yintercept = qchisq(0.999, df = ncol(mah_data)),
             linetype = "dashed", color = "red", linewidth = 0.8) +
  scale_color_manual(values = c("FALSE" = "#4C72B0", "TRUE" = "#C44E52"),
                     labels = c("Normal", "Outlier Ekstrem")) +
  labs(title    = "Mahalanobis Distance – Deteksi Outlier Multivariat",
       subtitle = "Garis merah = threshold chi-square (p = 0.001)",
       x = "Index Observasi", y = "Mahalanobis Distance", color = NULL) +
  theme_bw(base_size = 12)

12.5 Ringkasan Asumsi MLR

mlr_asumsi <- data.frame(
  No     = 1:4,
  Asumsi = c("Linearitas log-odds",
             "Independensi observasi",
             "Tidak ada multikolinearitas (VIF)",
             "Tidak ada outlier ekstrem"),
  Metode = c("Box-Tidwell test",
             "Evaluasi desain & duplikasi",
             "VIF",
             "Mahalanobis Distance"),
  Hasil  = c(
    paste0(sum(bt_df$Status == "Tidak Linier ⚠"), " variabel tidak linier"),
    ifelse(n_dup == 0, "Tidak ada duplikat",
           paste0(n_dup, " baris duplikat")),
    paste0(sum(vif_vals > 5), " variabel VIF > 5"),
    paste0(n_out_mah, " outlier ekstrem (",
           round(n_out_mah / nrow(df_raw) * 100, 1), "%)")
  ),
  Status = c(
    ifelse(sum(bt_df$Status == "Tidak Linier ⚠") == 0,
           "✓ Terpenuhi", "⚠ Sebagian tidak linier"),
    ifelse(n_dup == 0, "✓ Terpenuhi", "⚠ Perlu dicek"),
    ifelse(all(vif_vals <= 10), "✓ Tidak ada VIF > 10", "✗ Ada VIF > 10"),
    ifelse(n_out_mah / nrow(df_raw) < 0.05,
           "✓ Dapat ditoleransi", "⚠ Perlu penanganan")
  )
)

kable(mlr_asumsi, caption = "Ringkasan Pemeriksaan Asumsi MLR",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = TRUE) %>%
  column_spec(5, bold = TRUE,
              color = ifelse(grepl("✓", mlr_asumsi$Status), "#155724",
                      ifelse(grepl("⚠", mlr_asumsi$Status), "#856404", "#721c24")))
Ringkasan Pemeriksaan Asumsi MLR
No Asumsi Metode Hasil Status
1 Linearitas log-odds Box-Tidwell test 2 variabel tidak linier ⚠ Sebagian tidak linier
2 Independensi observasi Evaluasi desain & duplikasi 265 baris duplikat ⚠ Perlu dicek
3 Tidak ada multikolinearitas (VIF) VIF 9 variabel VIF > 5 ✗ Ada VIF > 10
4 Tidak ada outlier ekstrem Mahalanobis Distance 8 outlier ekstrem (2.1%) ✓ Dapat ditoleransi

13 Penanganan Asumsi MLR yang Tidak Terpenuhi

Berdasarkan hasil cek asumsi MLR di atas, tabel berikut merangkum masalah yang terdeteksi dan strategi penanganannya sebelum fitting model MLR:

Masalah Temuan Strategi Penanganan
Multikolinearitas (VIF) Ada VIF > 5 / > 10 Drop iteratif variabel VIF tertinggi
Outlier ekstrem Cek via Mahalanobis Hapus baris outlier ekstrem (p < 0.001)
Linearitas log-odds Cek via Box-Tidwell Transformasi log/sqrt pada prediktor tidak linier
Class imbalance None >> Insomnia, Sleep Apnea Oversampling SMOTE pada data MLR

13.1 Penanganan Multikolinearitas MLR – Drop Iteratif VIF

Berbeda dengan LDA (menggunakan PCA), MLR membutuhkan koefisien yang dapat diinterpretasi per prediktor asli. Oleh karena itu penanganan multikolinearitas dilakukan dengan drop iteratif variabel VIF tertinggi (bukan PCA).

# MLR butuh prediktor asli yang interpretable → mulai dari df_raw (sebelum Box-Cox)
# Terapkan Winsorizing ulang pada df_raw agar outlier sudah ditangani
df_mlr_prep <- df_raw
num_cols_mlr_raw <- df_mlr_prep %>% dplyr::select(where(is.numeric)) %>% names()
df_mlr_prep[num_cols_mlr_raw] <- lapply(df_mlr_prep[num_cols_mlr_raw], winsorize)

# Drop iteratif: hapus variabel dengan VIF > 10 satu per satu
pred_mlr <- pred_vars  # c("Age", "Sleep.Duration", ...)

repeat {
  df_vif_mlr <- df_mlr_prep %>%
    mutate(Gender_bin     = as.integer(Gender == "Male"),
           BMI_Overweight = as.integer(BMI.Category == "Overweight"),
           BMI_Obese      = as.integer(BMI.Category == "Obese"))

  all_preds <- c(pred_mlr, "Gender_bin", "BMI_Overweight", "BMI_Obese")
  f_vif <- as.formula(paste("as.numeric(Sleep.Disorder) ~",
                             paste(all_preds, collapse = "+")))
  vif_iter <- vif(lm(f_vif, data = df_vif_mlr))

  max_vif <- max(vif_iter[pred_mlr])  # cek VIF prediktor numerik saja
  if (max_vif <= 10) break

  drop_var <- names(which.max(vif_iter[pred_mlr]))
  cat("Drop:", drop_var, "| VIF =", round(max_vif, 2), "\n")
  pred_mlr <- setdiff(pred_mlr, drop_var)
}
## Drop: Diastolic | VIF = 76.06 
## Drop: Stress.Level | VIF = 10.54
cat("\nPrediktor MLR setelah drop iteratif VIF:\n")
## 
## Prediktor MLR setelah drop iteratif VIF:
cat(paste(pred_mlr, collapse = ", "), "\n")
## Age, Sleep.Duration, Physical.Activity.Level, Heart.Rate, Daily.Steps, Systolic
# Tampilkan VIF final
vif_mlr_final <- vif(lm(as.formula(paste("as.numeric(Sleep.Disorder) ~",
  paste(c(pred_mlr, "Gender_bin", "BMI_Overweight", "BMI_Obese"), collapse = "+"))),
  data = df_vif_mlr))

vif_mlr_df <- data.frame(
  Variabel = names(vif_mlr_final),
  VIF      = round(vif_mlr_final, 3),
  Status   = ifelse(vif_mlr_final > 10, "Multikolinear Tinggi", "Aman")
)

kable(vif_mlr_df, caption = "VIF Setelah Drop Iteratif – Prediktor MLR Final",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) %>%
  row_spec(which(vif_mlr_df$Status == "Multikolinear Tinggi"), color="white", background="#d9534f") %>%
  row_spec(which(vif_mlr_df$Status == "Perlu Perhatian"),      color="white", background="#f0ad4e") %>%
  row_spec(which(vif_mlr_df$Status == "Aman"),                 color="white", background="#5cb85c")
VIF Setelah Drop Iteratif – Prediktor MLR Final
Variabel VIF Status
Age 5.097 Aman
Sleep.Duration 4.478 Aman
Physical.Activity.Level 5.215 Aman
Heart.Rate 2.428 Aman
Daily.Steps 4.721 Aman
Systolic 3.877 Aman
Gender_bin 1.837 Aman
BMI_Overweight 3.516 Aman
BMI_Obese 2.098 Aman

13.2 Penanganan Outlier MLR – Hapus Baris Outlier Ekstrem

# Hitung ulang Mahalanobis pada prediktor yang tersisa setelah drop VIF
mah_data_mlr <- df_mlr_prep %>% dplyr::select(all_of(pred_mlr))
mah_dist_mlr <- mahalanobis(mah_data_mlr,
                             colMeans(mah_data_mlr),
                             cov(mah_data_mlr))
mah_pval_mlr <- pchisq(mah_dist_mlr,
                        df       = ncol(mah_data_mlr),
                        lower.tail = FALSE)

n_outlier_mlr <- sum(mah_pval_mlr < 0.001)
cat("Jumlah outlier ekstrem (p < 0.001):", n_outlier_mlr, "\n")
## Jumlah outlier ekstrem (p < 0.001): 2
cat("Persentase:", round(n_outlier_mlr / nrow(df_mlr_prep) * 100, 2), "%\n")
## Persentase: 0.53 %
# Hapus outlier jika >= 5% dari data; jika < 5% tetap dipertahankan
if (n_outlier_mlr / nrow(df_mlr_prep) >= 0.05) {
  df_mlr_prep <- df_mlr_prep[mah_pval_mlr >= 0.001, ]
  cat("Outlier dihapus. Baris tersisa:", nrow(df_mlr_prep), "\n")
} else {
  cat("Outlier < 5% dari data → tetap dipertahankan (dapat ditoleransi).\n")
}
## Outlier < 5% dari data → tetap dipertahankan (dapat ditoleransi).

13.3 Penanganan Linearitas Log-Odds MLR – Transformasi Prediktor

Prediktor yang tidak lolos Box-Tidwell (tidak linier terhadap log-odds) ditransformasi menggunakan log atau square root sesuai distribusinya.

# Identifikasi prediktor tidak linier dari hasil Box-Tidwell
nonlinear_vars <- bt_df %>%
  filter(Status == "Tidak Linier ⚠") %>%
  pull(Prediktor)

# Saring hanya yang masih ada di pred_mlr (setelah drop VIF)
nonlinear_vars <- intersect(nonlinear_vars, pred_mlr)

cat("Prediktor tidak linier yang akan ditransformasi:\n")
## Prediktor tidak linier yang akan ditransformasi:
if (length(nonlinear_vars) == 0) {
  cat("Tidak ada – semua prediktor linier terhadap log-odds.\n")
} else {
  cat(paste(nonlinear_vars, collapse = ", "), "\n")
}
## Systolic
# Terapkan transformasi log dengan shift aman (nilai dari df_raw sudah positif setelah Winsorizing)
for (v in nonlinear_vars) {
  new_name <- paste0("log_", v)
  df_mlr_prep[[new_name]] <- log(df_mlr_prep[[v]] - min(df_mlr_prep[[v]]) + 1)
  pred_mlr <- c(setdiff(pred_mlr, v), new_name)
}

cat("\nPrediktor MLR final setelah transformasi linearitas:\n")
## 
## Prediktor MLR final setelah transformasi linearitas:
cat(paste(pred_mlr, collapse = ", "), "\n")
## Age, Sleep.Duration, Physical.Activity.Level, Heart.Rate, Daily.Steps, log_Systolic

13.4 Penanganan Class Imbalance MLR – SMOTE

set.seed(42)

# Buat df_mlr_balanced menggunakan SMOTE yang sama
num_mat_mlr   <- as.data.frame(df_mlr_prep[pred_mlr])
n_target_mlr  <- max(table(df_mlr_prep$Sleep.Disorder))
df_mlr_bal    <- df_mlr_prep

for (kls in c("Insomnia", "Sleep Apnea")) {
  idx_kls  <- which(df_mlr_prep$Sleep.Disorder == kls)
  n_needed <- n_target_mlr - length(idx_kls)
  if (n_needed > 0) {
    syn_num <- smote_manual(num_mat_mlr[idx_kls, , drop = FALSE], n_needed, k = 5)
    names(syn_num) <- pred_mlr
    syn_full <- syn_num
    syn_full$Sleep.Disorder <- factor(kls, levels = levels(df_mlr_prep$Sleep.Disorder))
    syn_full$Gender         <- factor(names(which.max(table(df_mlr_prep$Gender[idx_kls]))),
                                      levels = levels(df_mlr_prep$Gender))
    syn_full$Occupation     <- factor(names(which.max(table(df_mlr_prep$Occupation[idx_kls]))),
                                      levels = levels(df_mlr_prep$Occupation))
    syn_full$BMI.Category   <- factor(names(which.max(table(df_mlr_prep$BMI.Category[idx_kls]))),
                                      levels = levels(df_mlr_prep$BMI.Category))
    # Isi kolom yang tidak ada di syn_full dengan NA lalu drop
    for (col in setdiff(names(df_mlr_bal), names(syn_full))) {
      syn_full[[col]] <- NA
    }
    syn_full <- syn_full[, names(df_mlr_bal)]
    df_mlr_bal <- bind_rows(df_mlr_bal, syn_full)
  }
}

df_mlr_bal$Sleep.Disorder <- factor(df_mlr_bal$Sleep.Disorder,
                                     levels = levels(df_mlr_prep$Sleep.Disorder))

dist_mlr <- merge(
  as.data.frame(table(df_mlr_prep$Sleep.Disorder)) %>% rename(Kelas = Var1, Sebelum = Freq),
  as.data.frame(table(df_mlr_bal$Sleep.Disorder))  %>% rename(Kelas = Var1, Sesudah = Freq),
  by = "Kelas")
dist_mlr$Ditambah <- dist_mlr$Sesudah - dist_mlr$Sebelum

kable(dist_mlr, caption = "Distribusi Kelas MLR Sebelum vs Sesudah SMOTE",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) %>%
  row_spec(which(dist_mlr$Ditambah > 0), background = "#d4edda")
Distribusi Kelas MLR Sebelum vs Sesudah SMOTE
Kelas Sebelum Sesudah Ditambah
Insomnia 77 219 142
None 219 219 0
Sleep Apnea 78 219 141
cat("Dataset MLR final: df_mlr_bal\n")
## Dataset MLR final: df_mlr_bal
cat("Dimensi:", nrow(df_mlr_bal), "baris x", ncol(df_mlr_bal), "kolom\n")
## Dimensi: 657 baris x 14 kolom
cat("Prediktor numerik:", paste(pred_mlr, collapse = ", "), "\n")
## Prediktor numerik: Age, Sleep.Duration, Physical.Activity.Level, Heart.Rate, Daily.Steps, log_Systolic

13.5 Verifikasi Akhir Asumsi MLR

# VIF final pada df_mlr_bal
df_vif_mlr_bal <- df_mlr_bal %>%
  mutate(Gender_bin     = as.integer(Gender == "Male"),
         BMI_Overweight = as.integer(BMI.Category == "Overweight"),
         BMI_Obese      = as.integer(BMI.Category == "Obese"))

f_vif_final <- as.formula(paste("as.numeric(Sleep.Disorder) ~",
  paste(c(pred_mlr, "Gender_bin", "BMI_Overweight", "BMI_Obese"), collapse = "+")))

vif_mlr_akhir <- vif(lm(f_vif_final, data = df_vif_mlr_bal))

verif_mlr <- data.frame(
  Aspek = c(
    "Jumlah Observasi (setelah SMOTE)",
    "Distribusi Kelas",
    "Outlier Ekstrem (Mahalanobis)",
    "Linearitas Log-Odds",
    "Multikolinearitas (VIF)",
    "Independensi Observasi"
  ),
  Hasil = c(
    paste(nrow(df_mlr_bal), "baris"),
    paste(paste(names(table(df_mlr_bal$Sleep.Disorder)),
                table(df_mlr_bal$Sleep.Disorder), sep = "="), collapse = ", "),
    ifelse(n_outlier_mlr / nrow(df_mlr_prep) < 0.05,
           paste0(n_outlier_mlr, " outlier ditoleransi (< 5%)"),
           "Outlier dihapus"),
    ifelse(length(nonlinear_vars) == 0,
           "Semua prediktor linier (atau sudah ditransformasi log)",
           paste0("Transformasi log diterapkan pada: ",
                  paste(nonlinear_vars, collapse = ", "))),
    ifelse(all(vif_mlr_akhir <= 10), "Semua VIF ≤ 10 (Aman)", "Ada VIF > 10"),
    "Cross-sectional, tidak ada pengulangan per subjek"
  ),
  Status = c(
    "OK", "OK",
    ifelse(n_outlier_mlr / nrow(df_mlr_prep) < 0.05, "OK", "OK – dihapus"),
    ifelse(length(nonlinear_vars) == 0, "OK", "OK – ditransformasi"),
    ifelse(all(vif_mlr_akhir <= 10), "OK", "Perhatian"),
    "OK"
  )
)

kable(verif_mlr, caption = "Verifikasi Asumsi Akhir – Data Siap MLR",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE) %>%
  column_spec(1, bold = TRUE) %>%
  row_spec(which(verif_mlr$Status == "OK" | grepl("OK", verif_mlr$Status)),
           background = "#d4edda") %>%
  row_spec(which(verif_mlr$Status == "Perhatian"), background = "#f8d7da")
Verifikasi Asumsi Akhir – Data Siap MLR
Aspek Hasil Status
Jumlah Observasi (setelah SMOTE) 657 baris OK
Distribusi Kelas Insomnia=219, None=219, Sleep Apnea=219 OK
Outlier Ekstrem (Mahalanobis) 2 outlier ditoleransi (< 5%) OK
Linearitas Log-Odds Transformasi log diterapkan pada: Systolic OK – ditransformasi
Multikolinearitas (VIF) Semua VIF ≤ 10 (Aman) OK
Independensi Observasi Cross-sectional, tidak ada pengulangan per subjek OK

Dataset MLR final tersimpan di objek df_mlr_bal — siap untuk fitting model MLR.



14 MLR

14.1 Fitting Model MLR

14.1.1 Pemilihan Reference Category

df_mlr_bal <- df_mlr_bal %>%
  mutate(Sleep.Disorder = relevel(Sleep.Disorder, ref = "None"))

ref_choice <- data.frame(
  Opsi      = 1:3,
  Reference = c("None", "Insomnia", "Sleep Apnea"),
  Alasan    = c(
    "Kelas mayoritas & 'kondisi normal' → log-odds insomnia vs sehat, SA vs sehat [DIPILIH]",
    "Baseline gangguan ringan → log-odds None vs insomnia, SA vs insomnia",
    "Baseline gangguan berat → log-odds None vs SA, insomnia vs SA"
  )
)

kable(ref_choice, caption = "Pertimbangan Pemilihan Reference Category MLR",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
  row_spec(1, background = "#d4edda")
Pertimbangan Pemilihan Reference Category MLR
Opsi Reference Alasan
1 None Kelas mayoritas & ‘kondisi normal’ → log-odds insomnia vs sehat, SA vs sehat [DIPILIH]
2 Insomnia Baseline gangguan ringan → log-odds None vs insomnia, SA vs insomnia
3 Sleep Apnea Baseline gangguan berat → log-odds None vs SA, insomnia vs SA

14.1.2 Fit Model MLR

set.seed(42)

mlr_formula <- as.formula(
  paste("Sleep.Disorder ~", paste(pred_mlr, collapse = " + "))
)

mlr_model <- multinom(
  mlr_formula,
  data  = df_mlr_bal,
  maxit = 500,
  trace = FALSE
)

cat("=== Ringkasan Model MLR ===\n")
## === Ringkasan Model MLR ===
print(summary(mlr_model))
## Call:
## multinom(formula = mlr_formula, data = df_mlr_bal, maxit = 500, 
##     trace = FALSE)
## 
## Coefficients:
##             (Intercept)       Age Sleep.Duration Physical.Activity.Level
## Insomnia       24.33020 0.2795671     -3.9729017              0.07281570
## Sleep Apnea   -76.08184 0.2721917      0.5444884             -0.02851873
##              Heart.Rate   Daily.Steps log_Systolic
## Insomnia    -0.07236269 -0.0012714836    0.2474621
## Sleep Apnea  0.73548345  0.0007025924    1.7719652
## 
## Std. Errors:
##              (Intercept)        Age Sleep.Duration Physical.Activity.Level
## Insomnia    1.015390e-04 0.01996155    0.001467243              0.01238563
## Sleep Apnea 9.517733e-05 0.01860820    0.001215787              0.01257213
##             Heart.Rate  Daily.Steps log_Systolic
## Insomnia    0.01039475 0.0001683656 0.0006016256
## Sleep Apnea 0.01145725 0.0001642198 0.0003325545
## 
## Residual Deviance: 616.8503 
## AIC: 644.8503
coef_mlr <- coef(mlr_model)
se_mlr   <- summary(mlr_model)$standard.errors

z_mlr  <- coef_mlr / se_mlr
p_mlr  <- 2 * (1 - pnorm(abs(z_mlr)))

make_coef_table <- function(kelas_name) {
  data.frame(
    Variabel  = colnames(coef_mlr),
    Estimate  = round(coef_mlr[kelas_name, ], 4),
    Std.Error = round(se_mlr[kelas_name, ], 4),
    z.value   = round(z_mlr[kelas_name, ], 4),
    p.value   = round(p_mlr[kelas_name, ], 4),
    Sig       = ifelse(p_mlr[kelas_name, ] < 0.001, "***",
                ifelse(p_mlr[kelas_name, ] < 0.01,  "**",
                ifelse(p_mlr[kelas_name, ] < 0.05,  "*",
                ifelse(p_mlr[kelas_name, ] < 0.1,   ".", ""))))
  )
}

coef_ins <- make_coef_table("Insomnia")
coef_sa  <- make_coef_table("Sleep Apnea")

kable(coef_ins,
      caption   = "Koefisien MLR – Insomnia vs None (Log-Odds)",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
  row_spec(which(coef_ins$p.value < 0.05), background = "#d4edda") %>%
  footnote(general = "Sig: *** p<0.001, ** p<0.01, * p<0.05, . p<0.1")
Koefisien MLR – Insomnia vs None (Log-Odds)
Variabel Estimate Std.Error z.value p.value Sig
(Intercept) 24.3302 0.0001 239614.2614 0 ***
Age 0.2796 0.0200 14.0053 0 ***
Sleep.Duration -3.9729 0.0015 -2707.7333 0 ***
Physical.Activity.Level 0.0728 0.0124 5.8790 0 ***
Heart.Rate -0.0724 0.0104 -6.9615 0 ***
Daily.Steps -0.0013 0.0002 -7.5519 0 ***
log_Systolic 0.2475 0.0006 411.3224 0 ***
Note:
Sig: *** p<0.001, ** p<0.01, * p<0.05, . p<0.1
kable(coef_sa,
      caption   = "Koefisien MLR – Sleep Apnea vs None (Log-Odds)",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
  row_spec(which(coef_sa$p.value < 0.05), background = "#d4edda") %>%
  footnote(general = "Sig: *** p<0.001, ** p<0.01, * p<0.05, . p<0.1")
Koefisien MLR – Sleep Apnea vs None (Log-Odds)
Variabel Estimate Std.Error z.value p.value Sig
(Intercept) -76.0818 0.0001 -799369.3525 0.0000 ***
Age 0.2722 0.0186 14.6275 0.0000 ***
Sleep.Duration 0.5445 0.0012 447.8483 0.0000 ***
Physical.Activity.Level -0.0285 0.0126 -2.2684 0.0233
Heart.Rate 0.7355 0.0115 64.1937 0.0000 ***
Daily.Steps 0.0007 0.0002 4.2784 0.0000 ***
log_Systolic 1.7720 0.0003 5328.3456 0.0000 ***
Note:
Sig: *** p<0.001, ** p<0.01, * p<0.05, . p<0.1

Uji Signifikansi MLR

14.1.3 Uji Serentak – Likelihood Ratio Test (G²)

mlr_null <- multinom(
  Sleep.Disorder ~ 1,
  data  = df_mlr_bal,
  maxit = 500,
  trace = FALSE
)

lrt_result <- lrtest(mlr_null, mlr_model)

lrt_df <- data.frame(
  Model         = c("Model Null (intercept only)", "Model Penuh"),
  LogLikelihood = round(c(logLik(mlr_null), logLik(mlr_model)), 4),
  Df            = c(attr(logLik(mlr_null), "df"), attr(logLik(mlr_model), "df"))
)

kable(lrt_df, caption = "Log-Likelihood Model Null vs Penuh", row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Log-Likelihood Model Null vs Penuh
Model LogLikelihood Df
Model Null (intercept only) -721.7883 2
Model Penuh -308.4251 14
G2_stat <- -2 * (as.numeric(logLik(mlr_null)) - as.numeric(logLik(mlr_model)))
df_G2   <- attr(logLik(mlr_model), "df") - attr(logLik(mlr_null), "df")
p_G2    <- pchisq(G2_stat, df = df_G2, lower.tail = FALSE)

g2_df <- data.frame(
  Uji          = "Likelihood Ratio Test (G²)",
  G2_Statistik = round(G2_stat, 4),
  df           = df_G2,
  p.value      = format.pval(p_G2, digits = 4, eps = 0.0001),
  Kesimpulan   = ifelse(p_G2 < 0.05,
                        "Model signifikan – prediktor berpengaruh serentak ✓",
                        "Model tidak signifikan")
)

kable(g2_df,
      caption   = "Uji Serentak – Likelihood Ratio Test (G²)",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(1, color = "white",
           background = ifelse(p_G2 < 0.05, "#5cb85c", "#d9534f"))
Uji Serentak – Likelihood Ratio Test (G²)
Uji G2_Statistik df p.value Kesimpulan
Likelihood Ratio Test (G²) 826.7263 12 < 1e-04 Model signifikan – prediktor berpengaruh serentak ✓

14.1.4 Uji Parsial – Wald Test per Prediktor

wald_ins <- coef_ins %>%
  mutate(
    OR       = round(exp(Estimate), 4),
    CI_lower = round(exp(Estimate - 1.96 * Std.Error), 4),
    CI_upper = round(exp(Estimate + 1.96 * Std.Error), 4),
    CI_95    = paste0("[", CI_lower, ", ", CI_upper, "]")
  ) %>%
  dplyr::select(Variabel, Estimate, OR, CI_95, z.value, p.value, Sig)

kable(wald_ins,
      caption   = "Uji Wald Parsial – Insomnia vs None (+ Odds Ratio)",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
  row_spec(which(wald_ins$p.value < 0.05 & wald_ins$Variabel != "(Intercept)"),
           background = "#d4edda") %>%
  footnote(general = "OR = Odds Ratio; CI_95 = 95% Confidence Interval untuk OR")
Uji Wald Parsial – Insomnia vs None (+ Odds Ratio)
Variabel Estimate OR CI_95 z.value p.value Sig
(Intercept) 24.3302 3.685289e+10 [36845672015.9903, 36860118350.7171] 239614.2614 0 ***
Age 0.2796 1.322600e+00 [1.2718, 1.3755] 14.0053 0 ***
Sleep.Duration -3.9729 1.880000e-02 [0.0188, 0.0189] -2707.7333 0 ***
Physical.Activity.Level 0.0728 1.075500e+00 [1.0497, 1.102] 5.8790 0 ***
Heart.Rate -0.0724 9.302000e-01 [0.9114, 0.9493] -6.9615 0 ***
Daily.Steps -0.0013 9.987000e-01 [0.9983, 0.9991] -7.5519 0 ***
log_Systolic 0.2475 1.280800e+00 [1.2793, 1.2823] 411.3224 0 ***
Note:
OR = Odds Ratio; CI_95 = 95% Confidence Interval untuk OR
wald_sa <- coef_sa %>%
  mutate(
    OR       = round(exp(Estimate), 4),
    CI_lower = round(exp(Estimate - 1.96 * Std.Error), 4),
    CI_upper = round(exp(Estimate + 1.96 * Std.Error), 4),
    CI_95    = paste0("[", CI_lower, ", ", CI_upper, "]")
  ) %>%
  dplyr::select(Variabel, Estimate, OR, CI_95, z.value, p.value, Sig)

kable(wald_sa,
      caption   = "Uji Wald Parsial – Sleep Apnea vs None (+ Odds Ratio)",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
  row_spec(which(wald_sa$p.value < 0.05 & wald_sa$Variabel != "(Intercept)"),
           background = "#d4edda") %>%
  footnote(general = "OR = Odds Ratio; CI_95 = 95% Confidence Interval untuk OR")
Uji Wald Parsial – Sleep Apnea vs None (+ Odds Ratio)
Variabel Estimate OR CI_95 z.value p.value Sig
(Intercept) -76.0818 0.0000 [0, 0] -799369.3525 0.0000 ***
Age 0.2722 1.3128 [1.2659, 1.3616] 14.6275 0.0000 ***
Sleep.Duration 0.5445 1.7237 [1.7197, 1.7278] 447.8483 0.0000 ***
Physical.Activity.Level -0.0285 0.9719 [0.9482, 0.9962] -2.2684 0.0233
Heart.Rate 0.7355 2.0865 [2.04, 2.1341] 64.1937 0.0000 ***
Daily.Steps 0.0007 1.0007 [1.0003, 1.0011] 4.2784 0.0000 ***
log_Systolic 1.7720 5.8826 [5.8791, 5.8861] 5328.3456 0.0000 ***
Note:
OR = Odds Ratio; CI_95 = 95% Confidence Interval untuk OR

14.1.5 AIC & BIC Model MLR

aic_bic_df <- data.frame(
  Model  = c("MLR Null", "MLR Penuh"),
  AIC    = round(c(AIC(mlr_null), AIC(mlr_model)), 4),
  BIC    = round(c(BIC(mlr_null), BIC(mlr_model)), 4),
  LogLik = round(c(as.numeric(logLik(mlr_null)),
                   as.numeric(logLik(mlr_model))), 4)
)

kable(aic_bic_df,
      caption   = "AIC & BIC – Perbandingan Model MLR Null vs Penuh",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(2, background = "#d4edda")
AIC & BIC – Perbandingan Model MLR Null vs Penuh
Model AIC BIC LogLik
MLR Null 1447.5765 1456.5519 -721.7883
MLR Penuh 644.8503 707.6779 -308.4251

14.2 Prediksi & Confusion Matrix – MLR

14.2.1 Prediksi MLR

mlr_pred_class <- predict(mlr_model, newdata = df_mlr_bal, type = "class")
mlr_pred_prob  <- predict(mlr_model, newdata = df_mlr_bal, type = "probs")

df_mlr_result <- df_mlr_bal %>%
  mutate(
    Predicted_MLR   = mlr_pred_class,
    Prob_None       = round(mlr_pred_prob[, "None"], 4),
    Prob_Insomnia   = round(mlr_pred_prob[, "Insomnia"], 4),
    Prob_SleepApnea = round(mlr_pred_prob[, "Sleep Apnea"], 4)
  )

kable(head(df_mlr_result %>%
             dplyr::select(Sleep.Disorder, Predicted_MLR,
                           Prob_None, Prob_Insomnia, Prob_SleepApnea), 10),
      caption   = "10 Baris Pertama: Prediksi MLR & Probabilitas per Kelas",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE)
10 Baris Pertama: Prediksi MLR & Probabilitas per Kelas
Sleep.Disorder Predicted_MLR Prob_None Prob_Insomnia Prob_SleepApnea
None Insomnia 0.3889 0.5829 0.0282
None None 0.5877 0.0021 0.4102
None None 0.5877 0.0021 0.4102
Sleep Apnea Insomnia 0.0916 0.8694 0.0390
Sleep Apnea Insomnia 0.0916 0.8694 0.0390
Insomnia Insomnia 0.0916 0.8694 0.0390
Insomnia Insomnia 0.2198 0.6172 0.1630
None None 0.9966 0.0004 0.0030
None None 0.9966 0.0004 0.0030
None None 0.9966 0.0004 0.0030

14.2.2 Confusion Matrix MLR

cm_mlr <- table(
  Aktual   = df_mlr_bal$Sleep.Disorder,
  Prediksi = mlr_pred_class
)

kable(cm_mlr,
      caption = "Confusion Matrix – MLR (Training Data)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE) %>%
  add_header_above(c(" " = 1, "Prediksi" = length(levels(df_mlr_bal$Sleep.Disorder))))
Confusion Matrix – MLR (Training Data)
Prediksi
None Insomnia Sleep Apnea
None 186 27 6
Insomnia 23 181 15
Sleep Apnea 13 10 196
n_total_mlr   <- sum(cm_mlr)
n_correct_mlr <- sum(diag(cm_mlr))
accuracy_mlr  <- n_correct_mlr / n_total_mlr
aper_mlr      <- 1 - accuracy_mlr

per_class_mlr <- lapply(levels(df_mlr_bal$Sleep.Disorder), function(k) {
  TP <- cm_mlr[k, k]
  FP <- sum(cm_mlr[, k]) - TP
  FN <- sum(cm_mlr[k, ]) - TP
  TN <- n_total_mlr - TP - FP - FN
  data.frame(
    Kelas       = k,
    TP = TP, FP = FP, FN = FN, TN = TN,
    Sensitivity = round(TP / (TP + FN), 4),
    Specificity = round(TN / (TN + FP), 4),
    Precision   = round(TP / (TP + FP), 4),
    F1_Score    = round(2 * TP / (2 * TP + FP + FN), 4)
  )
})

per_class_mlr_df <- bind_rows(per_class_mlr)

kable(per_class_mlr_df,
      caption   = "Metrik Evaluasi Per Kelas – MLR",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = TRUE)
Metrik Evaluasi Per Kelas – MLR
Kelas TP FP FN TN Sensitivity Specificity Precision F1_Score
None 186 36 33 402 0.8493 0.9178 0.8378 0.8435
Insomnia 181 37 38 401 0.8265 0.9155 0.8303 0.8284
Sleep Apnea 196 21 23 417 0.8950 0.9521 0.9032 0.8991
overall_mlr <- data.frame(
  Model       = "MLR",
  Accuracy    = paste0(round(accuracy_mlr * 100, 2), "%"),
  APER        = paste0(round(aper_mlr * 100, 2), "%"),
  N_Correct   = n_correct_mlr,
  N_Incorrect = n_total_mlr - n_correct_mlr,
  N_Total     = n_total_mlr
)

kable(overall_mlr,
      caption   = "Ringkasan Evaluasi Model MLR (Training Data)",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(1, color = "white",
           background = ifelse(accuracy_mlr >= 0.8, "#5cb85c", "#f0ad4e"))
Ringkasan Evaluasi Model MLR (Training Data)
Model Accuracy APER N_Correct N_Incorrect N_Total
MLR 85.69% 14.31% 563 94 657

15 Interpretasi

15.1 Interpretation MLR

15.1.1 Log-Odds dan Odds Ratio

coef_mlr <- summary(mlr_model)$coefficients
se_mlr   <- summary(mlr_model)$standard.errors

z_mlr <- coef_mlr / se_mlr
p_mlr <- 2 * (1 - pnorm(abs(z_mlr)))

logodds_table <- as.data.frame(coef_mlr) %>%
  rownames_to_column("Kelas") %>%
  pivot_longer(
    cols = -Kelas,
    names_to = "Variabel",
    values_to = "Log_Odds"
  )

se_table <- as.data.frame(se_mlr) %>%
  rownames_to_column("Kelas") %>%
  pivot_longer(
    cols = -Kelas,
    names_to = "Variabel",
    values_to = "Std_Error"
  )

p_table <- as.data.frame(p_mlr) %>%
  rownames_to_column("Kelas") %>%
  pivot_longer(
    cols = -Kelas,
    names_to = "Variabel",
    values_to = "p_value"
  )

mlr_interpret_table <- logodds_table %>%
  left_join(se_table, by = c("Kelas", "Variabel")) %>%
  left_join(p_table, by = c("Kelas", "Variabel")) %>%
  mutate(
    Odds_Ratio = exp(Log_Odds),
    Signifikan = ifelse(p_value < 0.05, "Signifikan", "Tidak Signifikan"),
    Interpretasi_OR = case_when(
      Odds_Ratio > 1 ~ "Meningkatkan odds",
      Odds_Ratio < 1 ~ "Menurunkan odds",
      TRUE ~ "Tidak mengubah odds"
    )
  )

kable(mlr_interpret_table %>%
        mutate(across(where(is.numeric), ~ round(., 4))),
      caption = "Interpretasi Log-Odds dan Odds Ratio Model MLR",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"),
                full_width = TRUE) %>%
  scroll_box(width = "100%", height = "450px")
Interpretasi Log-Odds dan Odds Ratio Model MLR
Kelas Variabel Log_Odds Std_Error p_value Odds_Ratio Signifikan Interpretasi_OR
Insomnia (Intercept) 24.3302 0.0001 0.0000 3.685295e+10 Signifikan Meningkatkan odds
Insomnia Age 0.2796 0.0200 0.0000 1.322600e+00 Signifikan Meningkatkan odds
Insomnia Sleep.Duration -3.9729 0.0015 0.0000 1.880000e-02 Signifikan Menurunkan odds
Insomnia Physical.Activity.Level 0.0728 0.0124 0.0000 1.075500e+00 Signifikan Meningkatkan odds
Insomnia Heart.Rate -0.0724 0.0104 0.0000 9.302000e-01 Signifikan Menurunkan odds
Insomnia Daily.Steps -0.0013 0.0002 0.0000 9.987000e-01 Signifikan Menurunkan odds
Insomnia log_Systolic 0.2475 0.0006 0.0000 1.280800e+00 Signifikan Meningkatkan odds
Sleep Apnea (Intercept) -76.0818 0.0001 0.0000 0.000000e+00 Signifikan Menurunkan odds
Sleep Apnea Age 0.2722 0.0186 0.0000 1.312800e+00 Signifikan Meningkatkan odds
Sleep Apnea Sleep.Duration 0.5445 0.0012 0.0000 1.723700e+00 Signifikan Meningkatkan odds
Sleep Apnea Physical.Activity.Level -0.0285 0.0126 0.0233 9.719000e-01 Signifikan Menurunkan odds
Sleep Apnea Heart.Rate 0.7355 0.0115 0.0000 2.086500e+00 Signifikan Meningkatkan odds
Sleep Apnea Daily.Steps 0.0007 0.0002 0.0000 1.000700e+00 Signifikan Meningkatkan odds
Sleep Apnea log_Systolic 1.7720 0.0003 0.0000 5.882400e+00 Signifikan Meningkatkan odds
or_sig_table <- mlr_interpret_table %>%
  filter(Variabel != "(Intercept)", p_value < 0.05) %>%
  arrange(Kelas, desc(abs(Log_Odds)))

15.1.2 Plot Odds Ratio

or_plot_df <- mlr_interpret_table %>%
  filter(Variabel != "(Intercept)")

ggplot(or_plot_df,
       aes(x = reorder(Variabel, Odds_Ratio),
           y = Odds_Ratio,
           color = Signifikan)) +
  geom_point(size = 3) +
  geom_hline(yintercept = 1,
             linetype = "dashed",
             color = "red") +
  coord_flip() +
  facet_wrap(~ Kelas, scales = "free_y") +
  labs(title = "Plot Odds Ratio Model MLR",
       subtitle = "OR > 1 meningkatkan odds, OR < 1 menurunkan odds",
       x = "Variabel",
       y = "Odds Ratio",
       color = "Status") +
  theme_bw()

15.1.3 Marginal Effects

calc_marginal_effect <- function(model, data, var, delta = NULL) {
  data1 <- data
  data2 <- data
  
  if (is.null(delta)) {
    delta <- sd(data[[var]], na.rm = TRUE)
  }
  
  data2[[var]] <- data2[[var]] + delta
  
  p1 <- predict(model, newdata = data1, type = "probs")
  p2 <- predict(model, newdata = data2, type = "probs")
  
  p1 <- as.data.frame(p1)
  p2 <- as.data.frame(p2)
  
  me <- colMeans(p2 - p1, na.rm = TRUE)
  
  data.frame(
    Variabel = var,
    Delta = delta,
    Kelas = names(me),
    AME = as.numeric(me)
  )
}

mfx_summary <- bind_rows(lapply(pred_mlr, function(v) {
  calc_marginal_effect(mlr_model, df_mlr_bal, v)
}))

kable(mfx_summary %>%
        mutate(across(where(is.numeric), ~ round(., 4))),
      caption = "Average Marginal Effects Model MLR",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"),
                full_width = TRUE) %>%
  scroll_box(width = "100%", height = "450px")
Average Marginal Effects Model MLR
Variabel Delta Kelas AME
Age 8.3229 None -0.1564
Age 8.3229 Insomnia 0.0938
Age 8.3229 Sleep Apnea 0.0626
Sleep.Duration 0.7864 None 0.1290
Sleep.Duration 0.7864 Insomnia -0.2718
Sleep.Duration 0.7864 Sleep Apnea 0.1428
Physical.Activity.Level 20.7731 None -0.0634
Physical.Activity.Level 20.7731 Insomnia 0.2128
Physical.Activity.Level 20.7731 Sleep Apnea -0.1494
Heart.Rate 3.9853 None -0.1112
Heart.Rate 3.9853 Insomnia -0.1671
Heart.Rate 3.9853 Sleep Apnea 0.2783
Daily.Steps 1801.9910 None 0.0450
Daily.Steps 1801.9910 Insomnia -0.2429
Daily.Steps 1801.9910 Sleep Apnea 0.1979
log_Systolic 0.7780 None -0.0569
log_Systolic 0.7780 Insomnia -0.0537
log_Systolic 0.7780 Sleep Apnea 0.1106

15.1.4 Plot Marginal Effects

ggplot(mfx_summary,
       aes(x = reorder(Variabel, AME),
           y = AME,
           fill = Kelas)) +
  geom_col(position = "dodge", alpha = 0.85) +
  geom_hline(yintercept = 0,
             linetype = "dashed",
             color = "red") +
  coord_flip() +
  labs(title = "Plot Average Marginal Effects",
       subtitle = "AME menunjukkan perubahan probabilitas rata-rata ketika variabel naik sebesar 1 SD",
       x = "Variabel",
       y = "Average Marginal Effect",
       fill = "Kelas") +
  theme_bw()

15.1.5 Predicted Probability Plot

pred_prob <- as.data.frame(predict(mlr_model, newdata = df_mlr_bal, type = "probs"))

pred_prob$Aktual <- df_mlr_bal$Sleep.Disorder

pred_prob$Prediksi <- colnames(pred_prob)[
  max.col(pred_prob, ties.method = "first")
]

pred_prob_long <- pred_prob %>%
  pivot_longer(
    cols = all_of(levels(df_mlr_bal$Sleep.Disorder)),
    names_to = "Kelas",
    values_to = "Probabilitas"
  )

ggplot(pred_prob_long,
       aes(x = Kelas,
           y = Probabilitas,
           fill = Kelas)) +
  geom_boxplot(alpha = 0.8) +
  labs(title = "Distribusi Predicted Probability MLR",
       x = "Kelas",
       y = "Probabilitas Prediksi") +
  theme_bw() +
  theme(legend.position = "none")

15.2 Interpretation LDA

15.2.1 Interpretasi Fungsi Diskriminan LDA

lda_importance <- coef_df %>%
  pivot_longer(
    cols = starts_with("LD"),
    names_to = "Fungsi_Diskriminan",
    values_to = "Koefisien"
  ) %>%
  mutate(
    Abs_Koefisien = abs(Koefisien),
    Arah = case_when(
      Koefisien > 0 ~ "Positif",
      Koefisien < 0 ~ "Negatif",
      TRUE ~ "Netral"
    ),
    Interpretasi = case_when(
      Koefisien > 0 ~ "Semakin tinggi variabel, skor LD cenderung meningkat",
      Koefisien < 0 ~ "Semakin tinggi variabel, skor LD cenderung menurun",
      TRUE ~ "Tidak memberi kontribusi pada fungsi LD"
    )
  ) %>%
  arrange(Fungsi_Diskriminan, desc(Abs_Koefisien))

kable(lda_importance %>%
        mutate(across(where(is.numeric), ~ round(., 4))),
      caption = "Interpretasi Kontribusi Variabel terhadap Fungsi Diskriminan LDA",
      row.names = FALSE) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = TRUE
  )
Interpretasi Kontribusi Variabel terhadap Fungsi Diskriminan LDA
Variabel Fungsi_Diskriminan Koefisien Abs_Koefisien Arah Interpretasi
PC2 LD1 0.9235 0.9235 Positif Semakin tinggi variabel, skor LD cenderung meningkat
PC3 LD1 0.5667 0.5667 Positif Semakin tinggi variabel, skor LD cenderung meningkat
PC1 LD1 -0.3134 0.3134 Negatif Semakin tinggi variabel, skor LD cenderung menurun
PC3 LD2 0.8305 0.8305 Positif Semakin tinggi variabel, skor LD cenderung meningkat
PC2 LD2 -0.2602 0.2602 Negatif Semakin tinggi variabel, skor LD cenderung menurun
PC1 LD2 -0.1538 0.1538 Negatif Semakin tinggi variabel, skor LD cenderung menurun

15.2.2 Skor LD dan Prediksi

lda_score_df <- as.data.frame(lda_pred$x)

lda_score_df <- lda_score_df %>%
  mutate(
    Aktual = df_balanced$Sleep.Disorder,
    Prediksi = lda_pred$class
  )

kable(head(lda_score_df, 10),
      caption = "10 Baris Pertama Skor LD dan Prediksi LDA",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
10 Baris Pertama Skor LD dan Prediksi LDA
LD1 LD2 Aktual Prediksi
-1.077014 1.484504 None Insomnia
-1.817252 -1.138942 None None
-1.817252 -1.138942 None None
1.423253 3.328795 Sleep Apnea Insomnia
1.423253 3.328795 Sleep Apnea Insomnia
1.423253 3.328795 Insomnia Insomnia
1.078052 2.129825 Insomnia Insomnia
-2.909839 -1.383003 None None
-2.909839 -1.383003 None None
-2.909839 -1.383003 None None

15.2.3 Plot LD1 vs LD2

if ("LD2" %in% names(lda_score_df)) {
  ggplot(lda_score_df, aes(x = LD1, y = LD2, color = Aktual, shape = Prediksi)) +
    geom_point(alpha = 0.75, size = 2.5) +
    labs(
      title = "Visualisasi LDA: LD1 vs LD2",
      subtitle = "Pemisahan kelas berdasarkan fungsi diskriminan",
      x = "Linear Discriminant 1",
      y = "Linear Discriminant 2",
      color = "Kelas Aktual",
      shape = "Kelas Prediksi"
    ) +
    theme_bw()
} else {
  ggplot(lda_score_df, aes(x = LD1, fill = Aktual)) +
    geom_density(alpha = 0.45) +
    labs(
      title = "Visualisasi LDA: Distribusi LD1",
      subtitle = "Hanya satu fungsi diskriminan tersedia",
      x = "Linear Discriminant 1",
      y = "Density",
      fill = "Kelas Aktual"
    ) +
    theme_bw()
}

15.2.4 Plot Density LD1

ggplot(lda_score_df, aes(x = LD1, fill = Aktual)) +
  geom_density(alpha = 0.45) +
  labs(
    title = "Distribusi LD1 per Kelas Sleep Disorder",
    subtitle = "Overlap menunjukkan kelas yang sulit dipisahkan",
    x = "LD1",
    y = "Density",
    fill = "Kelas Aktual"
  ) +
  theme_bw()

15.2.5 Plot Kontribusi LDA

ggplot(lda_importance,
       aes(x = reorder(Variabel, Abs_Koefisien),
           y = Abs_Koefisien,
           fill = Fungsi_Diskriminan)) +
  geom_col(position = "dodge", alpha = 0.85) +
  coord_flip() +
  labs(
    title = "Besar Kontribusi Variabel pada Fungsi Diskriminan",
    subtitle = "Nilai absolut koefisien menunjukkan kekuatan kontribusi variabel",
    x = "Variabel",
    y = "|Koefisien|",
    fill = "Fungsi"
  ) +
  theme_bw()

15.2.6 Confusion Matrix Heatmap LDA

cm_lda_df <- as.data.frame(cm_lda)

ggplot(cm_lda_df, aes(x = Prediksi, y = Aktual, fill = Freq)) +
  geom_tile() +
  geom_text(aes(label = Freq), color = "white", size = 5) +
  scale_fill_gradient(low = "#56B1F7", high = "#132B43") +
  labs(
    title = "Confusion Matrix Heatmap - LDA",
    subtitle = "Diagonal menunjukkan prediksi yang benar",
    x = "Prediksi",
    y = "Aktual",
    fill = "Frekuensi"
  ) +
  theme_bw()

16 Perbandingan LDA vs MLR

compare_df <- data.frame(
  Metrik = c(
    "Accuracy (Training)",
    "APER (Training)",
    "Accuracy (LOOCV) – LDA",
    "APER (LOOCV) – LDA",
    "AIC",
    "BIC",
    "Log-Likelihood"
  ),
  LDA = c(
    paste0(round(accuracy_lda * 100, 2), "%"),
    paste0(round(aper_lda * 100, 2), "%"),
    paste0(round(acc_cv * 100, 2), "%"),
    paste0(round(aper_cv * 100, 2), "%"),
    "–",
    "–",
    "–"
  ),
  MLR = c(
    paste0(round(accuracy_mlr * 100, 2), "%"),
    paste0(round(aper_mlr * 100, 2), "%"),
    "–",
    "–",
    round(AIC(mlr_model), 4),
    round(BIC(mlr_model), 4),
    round(as.numeric(logLik(mlr_model)), 4)
  )
)

kable(compare_df,
      caption   = "Perbandingan Metrik Evaluasi – LDA vs MLR",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE) %>%
  column_spec(2, background = "#e8f4f8") %>%
  column_spec(3, background = "#fef9e7")
Perbandingan Metrik Evaluasi – LDA vs MLR
Metrik LDA MLR
Accuracy (Training) 84.78% 85.69%
APER (Training) 15.22% 14.31%
Accuracy (LOOCV) – LDA 84.78%
APER (LOOCV) – LDA 15.22%
AIC 644.8503
BIC 707.6779
Log-Likelihood -308.4251
both_acc <- data.frame(
  Kelas           = levels(df_balanced$Sleep.Disorder),
  Sensitivity_LDA = per_class_lda_df$Sensitivity,
  Sensitivity_MLR = per_class_mlr_df$Sensitivity,
  F1_LDA          = per_class_lda_df$F1_Score,
  F1_MLR          = per_class_mlr_df$F1_Score
)

kable(both_acc,
      caption   = "Perbandingan Sensitivity & F1-Score per Kelas – LDA vs MLR",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Perbandingan Sensitivity & F1-Score per Kelas – LDA vs MLR
Kelas Sensitivity_LDA Sensitivity_MLR F1_LDA F1_MLR
Insomnia 0.9315 0.8493 0.8259 0.8435
None 0.8082 0.8265 0.8469 0.8284
Sleep Apnea 0.8037 0.8950 0.8756 0.8991
cat("=== KESIMPULAN PEMILIHAN MODEL ===\n\n")
## === KESIMPULAN PEMILIHAN MODEL ===
cat("LDA Accuracy (Resubstitution):", round(accuracy_lda * 100, 2), "%\n")
## LDA Accuracy (Resubstitution): 84.78 %
cat("LDA Accuracy (LOOCV)         :", round(acc_cv * 100, 2), "%\n")
## LDA Accuracy (LOOCV)         : 84.78 %
cat("MLR Accuracy (Training)      :", round(accuracy_mlr * 100, 2), "%\n")
## MLR Accuracy (Training)      : 85.69 %
cat("MLR AIC                      :", round(AIC(mlr_model), 2), "\n")
## MLR AIC                      : 644.85
cat("MLR BIC                      :", round(BIC(mlr_model), 2), "\n\n")
## MLR BIC                      : 707.68
if (acc_cv >= accuracy_mlr) {
  cat("→ LDA (LOOCV) menghasilkan akurasi lebih tinggi atau setara dengan MLR.\n")
  cat("  LDA lebih disarankan jika asumsi normalitas multivariat mendekati terpenuhi.\n")
} else {
  cat("→ MLR menghasilkan akurasi lebih tinggi dari LDA (LOOCV).\n")
  cat("  MLR lebih disarankan karena tidak memerlukan asumsi distribusi pada prediktor.\n")
}
## → MLR menghasilkan akurasi lebih tinggi dari LDA (LOOCV).
##   MLR lebih disarankan karena tidak memerlukan asumsi distribusi pada prediktor.