1 Load Package

packages <- c("tidyverse", "ggplot2", "corrplot", "car",
              "nortest", "moments", "gridExtra", "scales",
              "ggcorrplot", "knitr", "kableExtra", "MASS",
              "heplots", "nnet", "caret", "lmtest",
              "broom", "margins")

installed <- packages %in% rownames(installed.packages())
if (any(!installed)) install.packages(packages[!installed])

library(tidyverse)
library(ggplot2)
library(corrplot)
library(car)
library(nortest)
library(moments)
library(gridExtra)
library(scales)
library(ggcorrplot)
library(knitr)
library(kableExtra)
library(MASS)
library(heplots)
library(nnet)
library(caret)
library(lmtest)
library(broom)
library(margins)

2 Load Data

df <- read.csv("Sleep_health_and_lifestyle_dataset.csv",
               stringsAsFactors = FALSE)

cat("Dimensi data:", nrow(df), "baris x", ncol(df), "kolom\n")

## Dimensi data: 374 baris x 13 kolom

kable(head(df, 10), caption = "10 Baris Pertama Dataset") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = TRUE, font_size = 13)

10 Baris Pertama Dataset
Person.ID	Gender	Age	Occupation	Sleep.Duration	Quality.of.Sleep	Physical.Activity.Level	Stress.Level	BMI.Category	Blood.Pressure	Heart.Rate	Daily.Steps	Sleep.Disorder
1	Male	27	Software Engineer	6.1	6	42	6	Overweight	126/83	77	4200	None
2	Male	28	Doctor	6.2	6	60	8	Normal	125/80	75	10000	None
3	Male	28	Doctor	6.2	6	60	8	Normal	125/80	75	10000	None
4	Male	28	Sales Representative	5.9	4	30	8	Obese	140/90	85	3000	Sleep Apnea
5	Male	28	Sales Representative	5.9	4	30	8	Obese	140/90	85	3000	Sleep Apnea
6	Male	28	Software Engineer	5.9	4	30	8	Obese	140/90	85	3000	Insomnia
7	Male	29	Teacher	6.3	6	40	7	Obese	140/90	82	3500	Insomnia
8	Male	29	Doctor	7.8	7	75	6	Normal	120/80	70	8000	None
9	Male	29	Doctor	7.8	7	75	6	Normal	120/80	70	8000	None
10	Male	29	Doctor	7.8	7	75	6	Normal	120/80	70	8000	None

str(df)

## 'data.frame':    374 obs. of  13 variables:
##  $ Person.ID              : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Gender                 : chr  "Male" "Male" "Male" "Male" ...
##  $ Age                    : int  27 28 28 28 28 28 29 29 29 29 ...
##  $ Occupation             : chr  "Software Engineer" "Doctor" "Doctor" "Sales Representative" ...
##  $ Sleep.Duration         : num  6.1 6.2 6.2 5.9 5.9 5.9 6.3 7.8 7.8 7.8 ...
##  $ Quality.of.Sleep       : int  6 6 6 4 4 4 6 7 7 7 ...
##  $ Physical.Activity.Level: int  42 60 60 30 30 30 40 75 75 75 ...
##  $ Stress.Level           : int  6 8 8 8 8 8 7 6 6 6 ...
##  $ BMI.Category           : chr  "Overweight" "Normal" "Normal" "Obese" ...
##  $ Blood.Pressure         : chr  "126/83" "125/80" "125/80" "140/90" ...
##  $ Heart.Rate             : int  77 75 75 85 85 85 82 70 70 70 ...
##  $ Daily.Steps            : int  4200 10000 10000 3000 3000 3000 3500 8000 8000 8000 ...
##  $ Sleep.Disorder         : chr  "None" "None" "None" "Sleep Apnea" ...

3 Data Cleaning

3.1 Cek Missing Values

missing_df <- data.frame(
  Kolom   = names(df),
  Missing = colSums(is.na(df)),
  Persen  = paste0(round(colSums(is.na(df)) / nrow(df) * 100, 2), "%")
) %>% filter(Missing > 0)

kable(missing_df, caption = "Kolom dengan Missing Values", row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) %>%
  row_spec(which(missing_df$Missing > 0), background = "#fff3cd")

Kolom dengan Missing Values
Kolom	Missing	Persen
NA	NA	NA
:—–	——-:	:——

Catatan: Missing values pada Sleep.Disorder bukan data rusak — responden yang tidak memiliki gangguan tidur tidak diisi nilainya, sehingga diimputasi dengan "None".

3.2 Imputasi & Standardisasi

# Imputasi Sleep.Disorder: NA -> "None"
df$Sleep.Disorder[is.na(df$Sleep.Disorder)] <- "None"

# Standardisasi BMI: "Normal" -> "Normal Weight"
df$BMI.Category[df$BMI.Category == "Normal"] <- "Normal Weight"

# Pecah Blood.Pressure menjadi Systolic & Diastolic
df <- df %>%
  separate(Blood.Pressure, into = c("Systolic", "Diastolic"),
           sep = "/", convert = TRUE)

# Hapus Person.ID (tidak relevan untuk analisis)
df <- df %>% dplyr::select(-Person.ID)

# Encoding ke faktor
df$Gender         <- as.factor(df$Gender)
df$Occupation     <- as.factor(df$Occupation)
df$BMI.Category   <- as.factor(df$BMI.Category)
df$Sleep.Disorder <- as.factor(df$Sleep.Disorder)

cat("Dimensi setelah cleaning:", nrow(df), "baris x", ncol(df), "kolom\n")

## Dimensi setelah cleaning: 374 baris x 13 kolom

3.3 Data Setelah Cleaning

kable(summary(df), caption = "Ringkasan Statistik Setelah Cleaning") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"),
                full_width = TRUE, font_size = 12) %>%
  scroll_box(width = "100%", height = "350px")

Ringkasan Statistik Setelah Cleaning
Gender	Age	Occupation	Sleep.Duration	Quality.of.Sleep	Physical.Activity.Level	Stress.Level	BMI.Category	Systolic	Diastolic	Heart.Rate	Daily.Steps	Sleep.Disorder
Female:185	Min. :27.00	Nurse :73	Min. :5.800	Min. :4.000	Min. :30.00	Min. :3.000	Normal Weight:216	Min. :115.0	Min. :75.00	Min. :65.00	Min. : 3000	Insomnia : 77
Male :189	1st Qu.:35.25	Doctor :71	1st Qu.:6.400	1st Qu.:6.000	1st Qu.:45.00	1st Qu.:4.000	Obese : 10	1st Qu.:125.0	1st Qu.:80.00	1st Qu.:68.00	1st Qu.: 5600	None :219
NA	Median :43.00	Engineer :63	Median :7.200	Median :7.000	Median :60.00	Median :5.000	Overweight :148	Median :130.0	Median :85.00	Median :70.00	Median : 7000	Sleep Apnea: 78
NA	Mean :42.18	Lawyer :47	Mean :7.132	Mean :7.313	Mean :59.17	Mean :5.385	NA	Mean :128.6	Mean :84.65	Mean :70.17	Mean : 6817	NA
NA	3rd Qu.:50.00	Teacher :40	3rd Qu.:7.800	3rd Qu.:8.000	3rd Qu.:75.00	3rd Qu.:7.000	NA	3rd Qu.:135.0	3rd Qu.:90.00	3rd Qu.:72.00	3rd Qu.: 8000	NA
NA	Max. :59.00	Accountant:37	Max. :8.500	Max. :9.000	Max. :90.00	Max. :8.000	NA	Max. :142.0	Max. :95.00	Max. :86.00	Max. :10000	NA
NA	NA	(Other) :43	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA

4 EDA - Variabel Target (Sleep Disorder)

Variabel target untuk analisis LDA Multinomial adalah Sleep.Disorder dengan 3 kelas: None, Insomnia, dan Sleep Apnea.

4.1 Distribusi Kelas Target

target_df <- as.data.frame(table(df$Sleep.Disorder)) %>%
  rename(Kelas = Var1, Frekuensi = Freq) %>%
  mutate(Persen = paste0(round(Frekuensi / nrow(df) * 100, 1), "%"))

kable(target_df, caption = "Distribusi Kelas Sleep Disorder", row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) %>%
  row_spec(which(target_df$Kelas == "None"),        background = "#d4edda") %>%
  row_spec(which(target_df$Kelas == "Insomnia"),    background = "#fff3cd") %>%
  row_spec(which(target_df$Kelas == "Sleep Apnea"), background = "#f8d7da")

Distribusi Kelas Sleep Disorder
Kelas	Frekuensi	Persen
Insomnia	77	20.6%
None	219	58.6%
Sleep Apnea	78	20.9%

ggplot(df, aes(x = Sleep.Disorder, fill = Sleep.Disorder)) +
  geom_bar(alpha = 0.85, width = 0.6) +
  geom_text(stat = "count", aes(label = after_stat(count)),
            vjust = -0.5, size = 4.5, fontface = "bold") +
  scale_fill_manual(values = c("None"        = "#55A868",
                               "Insomnia"    = "#DD8452",
                               "Sleep Apnea" = "#C44E52")) +
  labs(title = "Distribusi Variabel Target: Sleep Disorder",
       x = "Kelas", y = "Jumlah Observasi") +
  theme_bw(base_size = 12) +
  theme(legend.position = "none")

Catatan class imbalance: Kelas None (219 obs) mendominasi dibanding Insomnia (77) dan Sleep Apnea (78). Pada tahap LDA, prior probability akan disesuaikan untuk mengatasi ketidakseimbangan ini.

5 EDA - Variabel Numerik (Prediktor LDA)

Prediktor numerik yang akan digunakan dalam LDA:

num_vars <- df %>%
  dplyr::select(Age, Sleep.Duration, Physical.Activity.Level,
         Stress.Level, Heart.Rate, Daily.Steps, Systolic, Diastolic)

5.1 Statistik Deskriptif

desc_tbl <- round(as.data.frame(do.call(cbind, lapply(num_vars, function(x) {
  c(Mean     = mean(x),
    Median   = median(x),
    SD       = sd(x),
    Min      = min(x),
    Max      = max(x),
    Skewness = skewness(x))
}))), 3)

kable(desc_tbl, caption = "Statistik Deskriptif Variabel Numerik (Prediktor LDA)") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"),
                full_width = TRUE) %>%
  scroll_box(width = "100%")

Statistik Deskriptif Variabel Numerik (Prediktor LDA)
	Age	Sleep.Duration	Physical.Activity.Level	Stress.Level	Heart.Rate	Daily.Steps	Systolic	Diastolic
Mean	42.184	7.132	59.171	5.385	70.166	6816.845	128.553	84.650
Median	43.000	7.200	60.000	5.000	70.000	7000.000	130.000	85.000
SD	8.673	0.796	20.831	1.775	4.136	1617.916	7.748	6.162
Min	27.000	5.800	30.000	3.000	65.000	3000.000	115.000	75.000
Max	59.000	8.500	90.000	8.000	86.000	10000.000	142.000	95.000
Skewness	0.256	0.037	0.074	0.154	1.220	0.178	-0.036	0.377

5.2 Distribusi Variabel Numerik

num_long <- num_vars %>%
  pivot_longer(everything(), names_to = "Variabel", values_to = "Nilai")

ggplot(num_long, aes(x = Nilai)) +
  geom_histogram(fill = "#4C72B0", color = "white", bins = 20, alpha = 0.85) +
  facet_wrap(~Variabel, scales = "free") +
  labs(title = "Distribusi Variabel Numerik (Prediktor LDA)",
       x = NULL, y = "Frekuensi") +
  theme_bw()

5.3 Distribusi Prediktor per Kelas Target

num_class <- df %>%
  dplyr::select(Sleep.Disorder, Age, Sleep.Duration, Physical.Activity.Level,
         Stress.Level, Heart.Rate, Daily.Steps, Systolic, Diastolic) %>%
  pivot_longer(-Sleep.Disorder, names_to = "Variabel", values_to = "Nilai")

ggplot(num_class, aes(x = Sleep.Disorder, y = Nilai, fill = Sleep.Disorder)) +
  geom_boxplot(alpha = 0.8, outlier.color = "red", outlier.size = 1.5) +
  scale_fill_manual(values = c("None"        = "#55A868",
                               "Insomnia"    = "#DD8452",
                               "Sleep Apnea" = "#C44E52")) +
  facet_wrap(~Variabel, scales = "free_y") +
  labs(title = "Distribusi Prediktor per Kelas Sleep Disorder",
       x = NULL, y = NULL) +
  theme_bw(base_size = 11) +
  theme(legend.position = "bottom",
        axis.text.x = element_text(angle = 20, hjust = 1))

6 EDA - Variabel Kategorik

6.1 Distribusi Frekuensi

cat_list <- list(
  Gender       = as.data.frame(table(df$Gender)),
  BMI.Category = as.data.frame(table(df$BMI.Category))
)

for (nm in names(cat_list)) {
  names(cat_list[[nm]]) <- c("Kategori","Frekuensi")
  cat_list[[nm]]$Persen <- paste0(
    round(cat_list[[nm]]$Frekuensi / nrow(df) * 100, 1), "%")
}

kable(cat_list$Gender, caption = "Distribusi Gender") %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("striped","hover"))

Distribusi Gender
Kategori	Frekuensi	Persen
Female	185	49.5%
Male	189	50.5%

kable(cat_list$BMI.Category, caption = "Distribusi BMI Category") %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("striped","hover"))

Distribusi BMI Category
Kategori	Frekuensi	Persen
Normal Weight	216	57.8%
Obese	10	2.7%
Overweight	148	39.6%

6.2 Visualisasi Kategorik

p_gen <- ggplot(df, aes(x = Gender, fill = Gender)) +
  geom_bar(show.legend = FALSE, alpha = 0.85) +
  scale_fill_manual(values = c("#4C72B0","#DD8452")) +
  labs(title = "Distribusi Gender", x = NULL, y = "Jumlah") +
  theme_bw()

p_bmi <- ggplot(df, aes(x = BMI.Category, fill = BMI.Category)) +
  geom_bar(show.legend = FALSE, alpha = 0.85) +
  scale_fill_brewer(palette = "Set2") +
  labs(title = "Distribusi BMI Category", x = NULL, y = "Jumlah") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 15, hjust = 1))

p_occ <- ggplot(df, aes(y = fct_infreq(Occupation), fill = Occupation)) +
  geom_bar(show.legend = FALSE, alpha = 0.85) +
  scale_fill_brewer(palette = "Paired") +
  labs(title = "Distribusi Occupation", x = "Jumlah", y = NULL) +
  theme_bw()

grid.arrange(p_gen, p_bmi, p_occ, ncol = 3)

7 Matriks Korelasi

cor_matrix <- cor(num_vars, use = "complete.obs")

ggcorrplot(cor_matrix,
           method   = "circle",
           type     = "lower",
           lab      = TRUE,
           lab_size = 3,
           colors   = c("#D73027","white","#4575B4"),
           title    = "Heatmap Korelasi Variabel Numerik",
           ggtheme  = theme_bw())

cor_df <- as.data.frame(round(cor_matrix, 3))
kable(cor_df, caption = "Matriks Korelasi Lengkap") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"),
                full_width = TRUE) %>%
  scroll_box(width = "100%")

Matriks Korelasi Lengkap
	Age	Sleep.Duration	Physical.Activity.Level	Stress.Level	Heart.Rate	Daily.Steps	Systolic	Diastolic
Age	1.000	0.345	0.179	-0.422	-0.226	0.058	0.606	0.594
Sleep.Duration	0.345	1.000	0.212	-0.811	-0.516	-0.040	-0.180	-0.167
Physical.Activity.Level	0.179	0.212	1.000	-0.034	0.137	0.773	0.265	0.383
Stress.Level	-0.422	-0.811	-0.034	1.000	0.670	0.187	0.103	0.092
Heart.Rate	-0.226	-0.516	0.137	0.670	1.000	-0.030	0.294	0.271
Daily.Steps	0.058	-0.040	0.773	0.187	-0.030	1.000	0.103	0.242
Systolic	0.606	-0.180	0.265	0.103	0.294	0.103	1.000	0.973
Diastolic	0.594	-0.167	0.383	0.092	0.271	0.242	0.973	1.000

8 Deteksi Outlier

detect_outliers <- function(x, varname) {
  Q1      <- quantile(x, 0.25, na.rm = TRUE)
  Q3      <- quantile(x, 0.75, na.rm = TRUE)
  IQR_val <- Q3 - Q1
  lower   <- Q1 - 1.5 * IQR_val
  upper   <- Q3 + 1.5 * IQR_val
  n_out   <- sum(x < lower | x > upper, na.rm = TRUE)
  data.frame(
    Variabel    = varname,
    Q1          = round(Q1, 2),
    Q3          = round(Q3, 2),
    IQR         = round(IQR_val, 2),
    Lower_Fence = round(lower, 2),
    Upper_Fence = round(upper, 2),
    N_Outlier   = n_out,
    Persen      = paste0(round(n_out / length(x) * 100, 2), "%")
  )
}

outlier_df <- bind_rows(
  lapply(seq_along(num_vars),
         function(i) detect_outliers(num_vars[[i]], names(num_vars)[i]))
)

kable(outlier_df, caption = "Ringkasan Outlier per Variabel (Metode IQR)",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"),
                full_width = TRUE) %>%
  row_spec(which(outlier_df$N_Outlier > 0), background = "#fff3cd")

Ringkasan Outlier per Variabel (Metode IQR)
Variabel	Q1	Q3	IQR	Lower_Fence	Upper_Fence	N_Outlier	Persen
Age	35.25	50.0	14.75	13.12	72.12	0	0%
Sleep.Duration	6.40	7.8	1.40	4.30	9.90	0	0%
Physical.Activity.Level	45.00	75.0	30.00	0.00	120.00	0	0%
Stress.Level	4.00	7.0	3.00	-0.50	11.50	0	0%
Heart.Rate	68.00	72.0	4.00	62.00	78.00	15	4.01%
Daily.Steps	5600.00	8000.0	2400.00	2000.00	11600.00	0	0%
Systolic	125.00	135.0	10.00	110.00	150.00	0	0%
Diastolic	80.00	90.0	10.00	65.00	105.00	0	0%

num_long2 <- num_vars %>%
  pivot_longer(everything(), names_to = "Variabel", values_to = "Nilai")

ggplot(num_long2, aes(x = Variabel, y = Nilai)) +
  geom_boxplot(fill = "#4C72B0", alpha = 0.7,
               outlier.color = "red", outlier.shape = 19) +
  facet_wrap(~Variabel, scales = "free") +
  labs(title = "Deteksi Outlier - Boxplot Semua Variabel Numerik",
       x = NULL, y = NULL) +
  theme_bw() +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())

9 Cek Asumsi LDA

9.1 Asumsi 1 – Normalitas Multivariat per Kelas (Shapiro-Wilk)

LDA mengasumsikan setiap prediktor berdistribusi normal di dalam masing-masing kelas target.

pred_vars <- c("Age", "Sleep.Duration", "Physical.Activity.Level",
               "Stress.Level", "Heart.Rate", "Daily.Steps",
               "Systolic", "Diastolic")

kelas <- levels(df$Sleep.Disorder)

norm_results <- lapply(kelas, function(k) {
  sub <- df %>% filter(Sleep.Disorder == k) %>% dplyr::select(all_of(pred_vars))
  lapply(pred_vars, function(v) {
    sw <- shapiro.test(sub[[v]])
    data.frame(
      Kelas    = k,
      Variabel = v,
      W        = round(sw$statistic, 4),
      p.value  = round(sw$p.value, 4),
      Normal   = ifelse(sw$p.value > 0.05, "Ya", "Tidak")
    )
  })
})

norm_df <- bind_rows(unlist(norm_results, recursive = FALSE))

kable(norm_df,
      caption  = "Uji Normalitas Shapiro-Wilk per Kelas & Variabel (Asumsi LDA)",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = TRUE) %>%
  row_spec(which(norm_df$Normal == "Tidak"), background = "#f8d7da") %>%
  row_spec(which(norm_df$Normal == "Ya"),    background = "#d4edda") %>%
  scroll_box(width = "100%", height = "400px")

Uji Normalitas Shapiro-Wilk per Kelas & Variabel (Asumsi LDA)
Kelas	Variabel	W	Normal
Insomnia	Age	0.7754	Tidak
Insomnia	Sleep.Duration	0.7381	Tidak
Insomnia	Physical.Activity.Level	0.5448	Tidak
Insomnia	Stress.Level	0.7493	Tidak
Insomnia	Heart.Rate	0.8158	Tidak
Insomnia	Daily.Steps	0.6528	Tidak
Insomnia	Systolic	0.7907	Tidak
Insomnia	Diastolic	0.7996	Tidak
None	Age	0.9108	Tidak
None	Sleep.Duration	0.9231	Tidak
None	Physical.Activity.Level	0.8774	Tidak
None	Stress.Level	0.8934	Tidak
None	Heart.Rate	0.9103	Tidak
None	Daily.Steps	0.8244	Tidak
None	Systolic	0.9124	Tidak
None	Diastolic	0.8393	Tidak
Sleep Apnea	Age	0.8469	Tidak
Sleep Apnea	Sleep.Duration	0.7748	Tidak
Sleep Apnea	Physical.Activity.Level	0.7578	Tidak
Sleep Apnea	Stress.Level	0.7175	Tidak
Sleep Apnea	Heart.Rate	0.8251	Tidak
Sleep Apnea	Daily.Steps	0.8488	Tidak
Sleep Apnea	Systolic	0.5007	Tidak
Sleep Apnea	Diastolic	0.5767	Tidak

9.2 Asumsi 2 – Homogenitas Matriks Kovarians (Box’s M Test)

Asumsi utama LDA: matriks kovarians antar kelas harus homogen.

boxm_result <- boxM(
  df %>% dplyr::select(all_of(pred_vars)),
  df$Sleep.Disorder
)

boxm_df <- data.frame(
  Uji        = "Box's M Test",
  Chi_Square = round(boxm_result$statistic, 4),
  df         = boxm_result$parameter["df"],
  p.value    = round(boxm_result$p.value, 4),
  Kesimpulan = ifelse(boxm_result$p.value > 0.05,
                      "Kovarians Homogen → Asumsi LDA Terpenuhi ✓",
                      "Kovarians Tidak Homogen → Pertimbangkan QDA ⚠")
)

kable(boxm_df,
      caption   = "Box's M Test – Homogenitas Matriks Kovarians Antar Kelas",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(1,
           color      = "white",
           background = ifelse(boxm_result$p.value > 0.05, "#5cb85c", "#f0ad4e"))

Box’s M Test – Homogenitas Matriks Kovarians Antar Kelas
Uji	Chi_Square	df	p.value	Kesimpulan
Box’s M Test	1122.323	72	0	Kovarians Tidak Homogen → Pertimbangkan QDA ⚠

Interpretasi: Jika p-value > 0.05, matriks kovarians antar kelas homogen dan asumsi LDA terpenuhi. Jika p-value ≤ 0.05, pertimbangkan menggunakan QDA (Quadratic Discriminant Analysis) sebagai alternatif.

9.3 Asumsi 3 – Tidak Ada Multikolinearitas Tinggi (VIF)

df_vif <- df %>%
  mutate(
    Gender_bin     = as.integer(Gender == "Male"),
    BMI_Overweight = as.integer(BMI.Category == "Overweight"),
    BMI_Obese      = as.integer(BMI.Category == "Obese")
  )

model_vif <- lm(as.numeric(Sleep.Disorder) ~
                  Age + Sleep.Duration + Physical.Activity.Level +
                  Stress.Level + Heart.Rate + Daily.Steps +
                  Systolic + Diastolic + Gender_bin +
                  BMI_Overweight + BMI_Obese,
                data = df_vif)

vif_vals <- vif(model_vif)

vif_df <- data.frame(
  Variabel = names(vif_vals),
  VIF      = round(vif_vals, 3),
  Status   = ifelse(vif_vals > 10, "Multikolinear Tinggi",
             ifelse(vif_vals > 5,  "Perlu Perhatian", "Aman"))
)

kable(vif_df, caption = "Variance Inflation Factor (VIF) – Cek Asumsi LDA & MLR",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(which(vif_df$Status == "Multikolinear Tinggi"), color="white", background="#d9534f") %>%
  row_spec(which(vif_df$Status == "Perlu Perhatian"),      color="white", background="#f0ad4e") %>%
  row_spec(which(vif_df$Status == "Aman"),                 color="white", background="#5cb85c")

Variance Inflation Factor (VIF) – Cek Asumsi LDA & MLR
Variabel	VIF	Status
Age	8.031	Perlu Perhatian
Sleep.Duration	8.263	Perlu Perhatian
Physical.Activity.Level	5.926	Perlu Perhatian
Stress.Level	9.968	Perlu Perhatian
Heart.Rate	6.554	Perlu Perhatian
Daily.Steps	6.871	Perlu Perhatian
Systolic	70.455	Multikolinear Tinggi
Diastolic	76.025	Multikolinear Tinggi
Gender_bin	3.368	Aman
BMI_Overweight	6.580	Perlu Perhatian
BMI_Obese	3.344	Aman

Interpretasi: VIF < 5 = tidak ada multikolinearitas (aman); VIF 5-10 = sedang; VIF > 10 = tinggi, perlu penanganan sebelum masuk ke LDA.

vif_df$Variabel <- factor(vif_df$Variabel,
                           levels = vif_df$Variabel[order(vif_df$VIF)])

ggplot(vif_df, aes(x = Variabel, y = VIF, fill = Status)) +
  geom_col(alpha = 0.85) +
  geom_hline(yintercept = 5,  linetype = "dashed", color = "orange", linewidth = 0.9) +
  geom_hline(yintercept = 10, linetype = "dashed", color = "red",    linewidth = 0.9) +
  annotate("text", x = 1.5, y = 5.5,  label = "VIF = 5",  color = "orange", size = 3.5) +
  annotate("text", x = 1.5, y = 10.5, label = "VIF = 10", color = "red",    size = 3.5) +
  scale_fill_manual(
    values = c("Aman"                 = "#5cb85c",
               "Perlu Perhatian"      = "#f0ad4e",
               "Multikolinear Tinggi" = "#d9534f")) +
  coord_flip() +
  labs(title    = "Variance Inflation Factor (VIF)",
       subtitle = "Deteksi Multikolinearitas antar Prediktor",
       x = NULL, y = "VIF", fill = "Status") +
  theme_bw(base_size = 12)

9.4 Ringkasan Asumsi LDA

lda_asumsi <- data.frame(
  No     = 1:3,
  Asumsi = c("Normalitas per kelas",
             "Homogenitas kovarians (Box's M)",
             "Tidak ada multikolinearitas (VIF)"),
  Hasil  = c(
    paste0(sum(norm_df$Normal == "Tidak"), " dari ", nrow(norm_df),
           " kombinasi kelas-variabel tidak normal"),
    paste0("Chi² = ", round(boxm_result$statistic, 2),
           ", p = ", round(boxm_result$p.value, 4)),
    paste0(sum(vif_vals > 5), " variabel VIF > 5; maks VIF = ",
           round(max(vif_vals), 2))
  ),
  Status = c(
    ifelse(sum(norm_df$Normal == "Tidak") / nrow(norm_df) < 0.5,
           "⚠ Sebagian – LDA masih robust", "✗ Banyak pelanggaran"),
    ifelse(boxm_result$p.value > 0.05,
           "Homogen", "⚠ Tidak homogen – LDA masih digunakan dengan catatan"),
    ifelse(all(vif_vals <= 10), "Tidak ada VIF > 10", "✗ Ada multikolinearitas tinggi")
  )
)

kable(lda_asumsi, caption = "Ringkasan Pemeriksaan Asumsi LDA",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = TRUE) %>%
  column_spec(4, bold = TRUE,
              color = ifelse(grepl("✓", lda_asumsi$Status), "#155724",
                      ifelse(grepl("⚠", lda_asumsi$Status), "#856404", "#721c24")))

Ringkasan Pemeriksaan Asumsi LDA
No	Asumsi	Hasil	Status
1	Normalitas per kelas	24 dari 24 kombinasi kelas-variabel tidak normal	✗ Banyak pelanggaran
2	Homogenitas kovarians (Box’s M)	Chi² = 1122.32, p = 0	⚠ Tidak homogen – LDA masih digunakan dengan catatan
3	Tidak ada multikolinearitas (VIF)	9 variabel VIF > 5; maks VIF = 76.03	✗ Ada multikolinearitas tinggi

10 Penanganan Asumsi LDA (Data Preparation)

Berdasarkan hasil cek asumsi, ditemukan beberapa pelanggaran. Tabel berikut merangkum masalah, dampak, dan strategi penanganannya:

Masalah	Temuan	Strategi
Outlier	Terdeteksi outlier di beberapa variabel	Winsorizing (IQR capping)
Normalitas	Banyak variabel tidak normal per kelas	Transformasi Box-Cox
Multikolinearitas	Ada VIF > 5 / > 10	PCA (komponen orthogonal, VIF = 1)
Class Imbalance	None dominan vs Insomnia/Sleep Apnea	SMOTE manual
Box’s M	p ≈ 0	Diterima – sensitif terhadap n besar; LDA tetap digunakan dengan catatan

10.1 Penanganan Outlier – Winsorizing (IQR Capping)

Winsorizing dilakukan pada fitur numerik asli sebelum PCA agar PC scores tidak terdistorsi oleh nilai ekstrem.

winsorize <- function(x) {
  Q1 <- quantile(x, 0.25, na.rm = TRUE)
  Q3 <- quantile(x, 0.75, na.rm = TRUE)
  pmin(pmax(x, Q1 - 1.5 * (Q3 - Q1)), Q3 + 1.5 * (Q3 - Q1))
}

num_cols_orig     <- df %>% dplyr::select(where(is.numeric)) %>% names()
df[num_cols_orig] <- lapply(df[num_cols_orig], winsorize)

outlier_after <- bind_rows(lapply(num_cols_orig, function(v) {
  x <- df[[v]]
  Q1 <- quantile(x, 0.25); Q3 <- quantile(x, 0.75)
  data.frame(Variabel = v,
             N_Outlier_Setelah = sum(x < Q1 - 1.5*(Q3-Q1) | x > Q3 + 1.5*(Q3-Q1)))
}))

kable(outlier_after, caption = "Jumlah Outlier Setelah Winsorizing", row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) %>%
  row_spec(which(outlier_after$N_Outlier_Setelah == 0), background = "#d4edda")

Jumlah Outlier Setelah Winsorizing
Variabel	N_Outlier_Setelah
Age	0
Sleep.Duration	0
Quality.of.Sleep	0
Physical.Activity.Level	0
Stress.Level	0
Systolic	0
Diastolic	0
Heart.Rate	0
Daily.Steps	0

cat("Total outlier tersisa:", sum(outlier_after$N_Outlier_Setelah), "\n")

## Total outlier tersisa: 0

10.2 Penanganan Normalitas – Transformasi Box-Cox

Box-Cox diterapkan pada fitur numerik asli sebelum PCA agar distribusi tiap fitur lebih mendekati normal sebelum direduksi dimensinya.

boxcox_transform <- function(x) {
  x_shift <- x - min(x) + 1
  bc      <- boxcox(x_shift ~ 1, plotit = FALSE)
  lambda  <- bc$x[which.max(bc$y)]
  if (abs(lambda) < 0.01) log(x_shift) else (x_shift^lambda - 1) / lambda
}

# Simpan salinan df SEBELUM Box-Cox untuk dipakai di cek asumsi MLR (Box-Tidwell & Mahalanobis)
df_raw <- df

lambda_log <- data.frame(Variabel = character(), Lambda = numeric(),
                         stringsAsFactors = FALSE)
for (v in num_cols_orig) {
  x_shift    <- df[[v]] - min(df[[v]]) + 1
  bc         <- boxcox(x_shift ~ 1, plotit = FALSE)
  lam        <- bc$x[which.max(bc$y)]
  lambda_log <- rbind(lambda_log, data.frame(Variabel = v, Lambda = round(lam, 3)))
  df[[v]]    <- boxcox_transform(df[[v]])
}

kable(lambda_log, caption = "Parameter λ Box-Cox per Variabel", row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)

Parameter λ Box-Cox per Variabel
Variabel	Lambda
Age	0.6
Sleep.Duration	0.6
Quality.of.Sleep	1.2
Physical.Activity.Level	0.6
Stress.Level	0.5
Systolic	0.8
Diastolic	0.6
Heart.Rate	0.6
Daily.Steps	0.8

cat("Keterangan: lambda kecil ~ 0 = log transform; lambda = 1 = tidak perlu transformasi\n")

## Keterangan: lambda kecil ~ 0 = log transform; lambda = 1 = tidak perlu transformasi

df %>% dplyr::select(all_of(num_cols_orig)) %>%
  pivot_longer(everything(), names_to = "Variabel", values_to = "Nilai") %>%
  ggplot(aes(x = Nilai)) +
  geom_histogram(fill = "#55A868", color = "white", bins = 20, alpha = 0.85) +
  geom_density(aes(y = after_stat(count)), color = "#C44E52", linewidth = 0.8) +
  facet_wrap(~Variabel, scales = "free") +
  labs(title = "Distribusi Fitur Asli Setelah Box-Cox + Winsorizing",
       x = NULL, y = "Frekuensi") +
  theme_bw()

norm_after <- bind_rows(lapply(levels(df$Sleep.Disorder), function(k) {
  sub <- df %>% filter(Sleep.Disorder == k) %>% dplyr::select(all_of(num_cols_orig))
  bind_rows(lapply(names(sub), function(v) {
    sw <- shapiro.test(sub[[v]])
    data.frame(Kelas = k, Variabel = v,
               W = round(sw$statistic, 4),
               p.value = round(sw$p.value, 4),
               Normal = ifelse(sw$p.value > 0.05, "Ya", "Tidak"))
  }))
}))

kable(norm_after, caption = "Uji Normalitas Setelah Box-Cox (per Kelas & Variabel)",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE) %>%
  row_spec(which(norm_after$Normal == "Ya"),    background = "#d4edda") %>%
  row_spec(which(norm_after$Normal == "Tidak"), background = "#f8d7da") %>%
  scroll_box(width = "100%", height = "350px")

Uji Normalitas Setelah Box-Cox (per Kelas & Variabel)
Kelas	Variabel	W	Normal
Insomnia	Age	0.7088	Tidak
Insomnia	Sleep.Duration	0.7805	Tidak
Insomnia	Quality.of.Sleep	0.8567	Tidak
Insomnia	Physical.Activity.Level	0.6320	Tidak
Insomnia	Stress.Level	0.7458	Tidak
Insomnia	Systolic	0.7634	Tidak
Insomnia	Diastolic	0.7552	Tidak
Insomnia	Heart.Rate	0.7907	Tidak
Insomnia	Daily.Steps	0.6528	Tidak
None	Age	0.9544	Tidak
None	Sleep.Duration	0.9018	Tidak
None	Quality.of.Sleep	0.8535	Tidak
None	Physical.Activity.Level	0.8355	Tidak
None	Stress.Level	0.9097	Tidak
None	Systolic	0.9106	Tidak
None	Diastolic	0.8550	Tidak
None	Heart.Rate	0.8930	Tidak
None	Daily.Steps	0.8206	Tidak
Sleep Apnea	Age	0.7984	Tidak
Sleep Apnea	Sleep.Duration	0.7809	Tidak
Sleep Apnea	Quality.of.Sleep	0.7787	Tidak
Sleep Apnea	Physical.Activity.Level	0.6972	Tidak
Sleep Apnea	Stress.Level	0.7139	Tidak
Sleep Apnea	Systolic	0.4869	Tidak
Sleep Apnea	Diastolic	0.5543	Tidak
Sleep Apnea	Heart.Rate	0.7787	Tidak
Sleep Apnea	Daily.Steps	0.8465	Tidak

n_normal <- sum(norm_after$Normal == "Ya")
cat("Normal:", n_normal, "dari", nrow(norm_after), "kombinasi (",
    round(n_normal / nrow(norm_after) * 100, 1), "%)\n")

## Normal: 0 dari 27 kombinasi ( 0 %)

Catatan: Jika masih ada yang tidak normal setelah Box-Cox, hal ini umum pada data nyata. LDA cukup robust terhadap pelanggaran normalitas selama n per kelas >= 25. Setelah Box-Cox, data masuk ke PCA.

10.3 Penanganan Multikolinearitas – PCA

Drop iteratif tidak cukup karena multikolinearitas pada data ini sangat kuat. Solusi definitif adalah PCA: mengubah fitur-fitur yang saling berkorelasi menjadi komponen baru yang secara matematika orthogonal, sehingga VIF antar komponen = 1.

df_clean       <- df
num_cols_clean <- df_clean %>% dplyr::select(where(is.numeric)) %>% names()

cat("Fitur numerik sebelum PCA:", paste(num_cols_clean, collapse = ", "), "\n")

## Fitur numerik sebelum PCA: Age, Sleep.Duration, Quality.of.Sleep, Physical.Activity.Level, Stress.Level, Systolic, Diastolic, Heart.Rate, Daily.Steps

# Standarisasi (wajib sebelum PCA)
num_matrix <- scale(df_clean[num_cols_clean])

# Jalankan PCA
pca_result <- prcomp(num_matrix, center = FALSE, scale. = FALSE)

# Pilih PC yang menjelaskan >= 80% variansi
explained_var <- cumsum(pca_result$sdev^2) / sum(pca_result$sdev^2)
n_pc          <- which(explained_var >= 0.80)[1]
cat("\nJumlah PC yang menjelaskan >= 80% variansi:", n_pc, "\n")

## 
## Jumlah PC yang menjelaskan >= 80% variansi: 3

cat("Variansi kumulatif PC1 s.d. PC", n_pc, ":",
    round(explained_var[n_pc] * 100, 2), "%\n")

## Variansi kumulatif PC1 s.d. PC 3 : 87.37 %

ev_df <- data.frame(
  PC             = paste0("PC", seq_along(pca_result$sdev)),
  Std.Dev        = round(pca_result$sdev, 4),
  Var.Explained  = round(pca_result$sdev^2 / sum(pca_result$sdev^2) * 100, 2),
  Cumulative.Pct = round(explained_var * 100, 2)
)

kable(ev_df, caption = "Explained Variance per Principal Component", row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = FALSE) %>%
  row_spec(seq_len(n_pc), background = "#d4edda") %>%
  row_spec(n_pc, bold = TRUE)

Explained Variance per Principal Component
PC	Std.Dev	Var.Explained	Cumulative.Pct
PC1	1.8940	39.86	39.86
PC2	1.6273	29.42	69.28
PC3	1.2758	18.08	87.37
PC4	0.7193	5.75	93.11
PC5	0.5660	3.56	96.67
PC6	0.3920	1.71	98.38
PC7	0.2924	0.95	99.33
PC8	0.2219	0.55	99.88
PC9	0.1043	0.12	100.00

ev_plot <- ev_df %>% mutate(PC = factor(PC, levels = PC))

p_scree <- ggplot(ev_plot, aes(x = PC, y = Var.Explained)) +
  geom_col(fill = "#4C72B0", alpha = 0.8) +
  geom_line(aes(group = 1), color = "#DD8452", linewidth = 1) +
  geom_point(color = "#DD8452", size = 2.5) +
  geom_vline(xintercept = n_pc + 0.5, linetype = "dashed",
             color = "red", linewidth = 0.8) +
  annotate("text", x = n_pc + 0.7, y = max(ev_plot$Var.Explained) * 0.9,
           label = paste0("PC", n_pc, " cutoff"), color = "red", hjust = 0, size = 3.5) +
  labs(title = "Scree Plot PCA", x = "Principal Component",
       y = "Variansi Dijelaskan (%)") +
  theme_bw()

p_cum <- ggplot(ev_plot, aes(x = PC, y = Cumulative.Pct, group = 1)) +
  geom_line(color = "#55A868", linewidth = 1.2) +
  geom_point(color = "#55A868", size = 2.5) +
  geom_hline(yintercept = 80, linetype = "dashed", color = "red", linewidth = 0.8) +
  annotate("text", x = 2, y = 81.5, label = "80% threshold",
           color = "red", size = 3.5) +
  labs(title = "Cumulative Explained Variance",
       x = "Principal Component", y = "Kumulatif (%)") +
  theme_bw()

grid.arrange(p_scree, p_cum, ncol = 2)

# Gunakan n_pc komponen pertama sebagai fitur baru
pc_scores        <- as.data.frame(pca_result$x[, seq_len(n_pc)])
names(pc_scores) <- paste0("PC", seq_len(n_pc))

df_clean <- bind_cols(
  df_clean %>% dplyr::select(Sleep.Disorder, Gender, Occupation, BMI.Category),
  pc_scores
)

num_cols_clean <- paste0("PC", seq_len(n_pc))
cat("Fitur baru setelah PCA:", paste(num_cols_clean, collapse = ", "), "\n")

## Fitur baru setelah PCA: PC1, PC2, PC3

cat("Dimensi df_clean:", nrow(df_clean), "x", ncol(df_clean), "\n")

## Dimensi df_clean: 374 x 7

df_vif2 <- df_clean %>%
  mutate(Gender_bin     = as.integer(Gender == "Male"),
         BMI_Overweight = as.integer(BMI.Category == "Overweight"),
         BMI_Obese      = as.integer(BMI.Category == "Obese"))

f2        <- as.formula(paste("as.numeric(Sleep.Disorder) ~",
               paste(c(num_cols_clean, "Gender_bin", "BMI_Overweight", "BMI_Obese"),
                     collapse = "+")))
vif_vals2 <- vif(lm(f2, data = df_vif2))

vif_df2 <- data.frame(
  Variabel = names(vif_vals2),
  VIF      = round(vif_vals2, 3),
  Status   = ifelse(vif_vals2 > 10, "Multikolinear Tinggi",
             ifelse(vif_vals2 > 5,  "Perlu Perhatian", "Aman"))
)

kable(vif_df2, caption = "VIF Setelah PCA – Semua Komponen Orthogonal",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) %>%
  row_spec(which(vif_df2$Status == "Multikolinear Tinggi"), color="white", background="#d9534f") %>%
  row_spec(which(vif_df2$Status == "Perlu Perhatian"),      color="white", background="#f0ad4e") %>%
  row_spec(which(vif_df2$Status == "Aman"),                 color="white", background="#5cb85c")

VIF Setelah PCA – Semua Komponen Orthogonal
Variabel	VIF	Status
PC1	1.597	Aman
PC2	2.246	Aman
PC3	1.547	Aman
Gender_bin	1.501	Aman
BMI_Overweight	3.274	Aman
BMI_Obese	1.375	Aman

10.4 Penanganan Class Imbalance – SMOTE Manual

set.seed(42)

smote_manual <- function(minority_num, n_target, k = 5) {
  n_min <- nrow(minority_num)
  synthetic <- vector("list", n_target)
  for (i in seq_len(n_target)) {
    idx_base <- sample(n_min, 1)
    base     <- as.numeric(minority_num[idx_base, ])
    dists    <- apply(minority_num, 1, function(r) sqrt(sum((as.numeric(r) - base)^2)))
    dists[idx_base] <- Inf
    nn_idx   <- order(dists)[seq_len(min(k, n_min - 1))]
    neighbor <- as.numeric(minority_num[sample(nn_idx, 1), ])
    synthetic[[i]] <- base + runif(1) * (neighbor - base)
  }
  as.data.frame(do.call(rbind, synthetic))
}

rownames(df_clean) <- seq_len(nrow(df_clean))
num_mat <- as.data.frame(df_clean[num_cols_clean])

n_target_cls <- max(table(df_clean$Sleep.Disorder))
df_balanced  <- df_clean

for (kls in c("Insomnia", "Sleep Apnea")) {
  idx_kls  <- which(df_clean$Sleep.Disorder == kls)
  n_needed <- n_target_cls - length(idx_kls)
  if (n_needed > 0) {
    syn_num  <- smote_manual(num_mat[idx_kls, , drop = FALSE], n_needed, k = 5)
    names(syn_num) <- num_cols_clean
    syn_full <- syn_num
    syn_full$Sleep.Disorder <- factor(kls,  levels = levels(df_clean$Sleep.Disorder))
    syn_full$Gender         <- factor(names(which.max(table(df_clean$Gender[idx_kls]))),
                                      levels = levels(df_clean$Gender))
    syn_full$Occupation     <- factor(names(which.max(table(df_clean$Occupation[idx_kls]))),
                                      levels = levels(df_clean$Occupation))
    syn_full$BMI.Category   <- factor(names(which.max(table(df_clean$BMI.Category[idx_kls]))),
                                      levels = levels(df_clean$BMI.Category))
    syn_full <- syn_full[, names(df_balanced)]
    df_balanced <- bind_rows(df_balanced, syn_full)
  }
}

df_balanced$Sleep.Disorder <- factor(df_balanced$Sleep.Disorder,
                                      levels = levels(df_clean$Sleep.Disorder))

dist_compare <- merge(
  as.data.frame(table(df_clean$Sleep.Disorder))    %>% rename(Kelas = Var1, Sebelum = Freq),
  as.data.frame(table(df_balanced$Sleep.Disorder)) %>% rename(Kelas = Var1, Sesudah = Freq),
  by = "Kelas")
dist_compare$Ditambah <- dist_compare$Sesudah - dist_compare$Sebelum

kable(dist_compare, caption = "Distribusi Kelas Sebelum vs Sesudah SMOTE",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) %>%
  row_spec(which(dist_compare$Ditambah > 0), background = "#d4edda")

Distribusi Kelas Sebelum vs Sesudah SMOTE
Kelas	Sebelum	Sesudah	Ditambah
Insomnia	77	219	142
None	219	219	0
Sleep Apnea	78	219	141

p_bef <- ggplot(df_clean, aes(x=Sleep.Disorder, fill=Sleep.Disorder)) +
  geom_bar(alpha=0.85) +
  geom_text(stat="count", aes(label=after_stat(count)), vjust=-0.5, fontface="bold") +
  scale_fill_manual(values=c("None"="#55A868","Insomnia"="#DD8452","Sleep Apnea"="#C44E52")) +
  labs(title="Sebelum SMOTE", x=NULL, y="Jumlah") + ylim(0,260) +
  theme_bw() + theme(legend.position="none")

p_aft <- ggplot(df_balanced, aes(x=Sleep.Disorder, fill=Sleep.Disorder)) +
  geom_bar(alpha=0.85) +
  geom_text(stat="count", aes(label=after_stat(count)), vjust=-0.5, fontface="bold") +
  scale_fill_manual(values=c("None"="#55A868","Insomnia"="#DD8452","Sleep Apnea"="#C44E52")) +
  labs(title="Sesudah SMOTE", x=NULL, y="Jumlah") + ylim(0,260) +
  theme_bw() + theme(legend.position="none")

grid.arrange(p_bef, p_aft, ncol=2)

10.5 Verifikasi Akhir Asumsi LDA

boxm_final <- boxM(
  df_balanced %>% dplyr::select(all_of(num_cols_clean)),
  df_balanced$Sleep.Disorder
)

df_vif_final <- df_balanced %>%
  mutate(Gender_bin     = as.integer(Gender == "Male"),
         BMI_Overweight = as.integer(BMI.Category == "Overweight"),
         BMI_Obese      = as.integer(BMI.Category == "Obese"))
f_final  <- as.formula(paste("as.numeric(Sleep.Disorder) ~",
  paste(c(num_cols_clean, "Gender_bin", "BMI_Overweight", "BMI_Obese"), collapse = "+")))
vif_final <- vif(lm(f_final, data = df_vif_final))

norm_final <- bind_rows(lapply(levels(df_balanced$Sleep.Disorder), function(k) {
  sub <- df_balanced %>% filter(Sleep.Disorder == k) %>%
    dplyr::select(all_of(num_cols_clean))
  bind_rows(lapply(names(sub), function(v) {
    sw <- shapiro.test(sub[[v]])
    data.frame(Kelas = k, Variabel = v,
               p.value = round(sw$p.value, 4),
               Normal = ifelse(sw$p.value > 0.05, "Ya", "Tidak"))
  }))
}))

n_normal_pct <- round(sum(norm_final$Normal == "Ya") / nrow(norm_final) * 100, 1)

verif <- data.frame(
  Aspek = c(
    "Jumlah Observasi (setelah SMOTE)",
    "Distribusi Kelas",
    "Outlier",
    "Normalitas (kombinasi normal)",
    "Homogenitas Kovarians (Box's M)",
    "Multikolinearitas (VIF)"
  ),
  Hasil = c(
    paste(nrow(df_balanced), "baris"),
    paste(paste(names(table(df_balanced$Sleep.Disorder)),
                table(df_balanced$Sleep.Disorder), sep = "="), collapse = ", "),
    "0 outlier (setelah Winsorizing)",
    paste0(sum(norm_final$Normal == "Ya"), " dari ", nrow(norm_final),
           " kombinasi normal (", n_normal_pct, "%)"),
    paste0("p = ", round(boxm_final$p.value, 4),
           " — Box's M sangat sensitif pada n besar; pelanggaran ini umum. ",
           "LDA tetap digunakan dengan catatan."),
    ifelse(all(vif_final <= 5),  "Semua VIF ≤ 5 (Aman)",
    ifelse(all(vif_final <= 10), "Semua VIF ≤ 10 (Aman)", "Ada VIF > 10"))
  ),
  Status = c(
    "OK", "OK", "OK",
    ifelse(n_normal_pct >= 50, "OK", "Diterima*"),
    "Diterima*",
    ifelse(all(vif_final <= 10), "OK", "Perhatian")
  )
)

kable(verif, caption = "Verifikasi Asumsi Akhir – Data Siap LDA",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE) %>%
  column_spec(1, bold = TRUE) %>%
  row_spec(which(verif$Status == "OK"),        background = "#d4edda") %>%
  row_spec(which(verif$Status == "Diterima*"), background = "#fff3cd") %>%
  row_spec(which(verif$Status == "Perhatian"), background = "#f8d7da")

Verifikasi Asumsi Akhir – Data Siap LDA
Aspek	Hasil	Status
Jumlah Observasi (setelah SMOTE)	657 baris	OK
Distribusi Kelas	Insomnia=219, None=219, Sleep Apnea=219	OK
Outlier	0 outlier (setelah Winsorizing)	OK
Normalitas (kombinasi normal)	0 dari 9 kombinasi normal (0%)	Diterima*
Homogenitas Kovarians (Box’s M)	p = 0 — Box’s M sangat sensitif pada n besar; pelanggaran ini umum. LDA tetap digunakan dengan catatan.	Diterima*
Multikolinearitas (VIF)	Semua VIF ≤ 5 (Aman)	OK

Keterangan status: - OK = asumsi terpenuhi sepenuhnya - Diterima* = pelanggaran ada namun dapat diterima secara statistik — Box’s M hampir selalu signifikan pada n > 200 karena sangat sensitif terhadap ukuran sampel besar, bukan karena perbedaan kovarians yang substansial. Normalitas yang tidak sempurna ditoleransi oleh LDA selama n per kelas cukup besar (≥ 25). - Dataset final tersimpan di objek df_balanced — siap diteruskan ke tahap pemodelan LDA/QDA.

cat("Dataset LDA final: df_balanced\n")

## Dataset LDA final: df_balanced

cat("Dimensi      :", nrow(df_balanced), "baris x", ncol(df_balanced), "kolom\n")

## Dimensi      : 657 baris x 7 kolom

cat("Fitur numerik:", paste(num_cols_clean, collapse = ", "), "\n")

## Fitur numerik: PC1, PC2, PC3

cat("Distribusi   :", paste(names(table(df_balanced$Sleep.Disorder)),
                            table(df_balanced$Sleep.Disorder),
                            sep = "=", collapse = " | "), "\n")

## Distribusi   : Insomnia=219 | None=219 | Sleep Apnea=219

11 LDA

11.1 Fitting Model LDA

11.1.1 Pemilihan Prediktor

Prediktor yang digunakan adalah PC scores hasil PCA (orthogonal, bebas multikolinearitas). Variabel kategorik (Gender, BMI.Category, Occupation) tidak disertakan dalam LDA standar.

cat("Prediktor LDA (PC scores):\n")

## Prediktor LDA (PC scores):

cat(paste(num_cols_clean, collapse = ", "), "\n\n")

## PC1, PC2, PC3

cat("Variabel target: Sleep.Disorder (",
    paste(levels(df_balanced$Sleep.Disorder), collapse = " / "), ")\n")

## Variabel target: Sleep.Disorder ( Insomnia / None / Sleep Apnea )

cat("Distribusi kelas:\n")

## Distribusi kelas:

print(table(df_balanced$Sleep.Disorder))

## 
##    Insomnia        None Sleep Apnea 
##         219         219         219

11.1.2 Fit Model LDA (Prior Proporsional)

set.seed(42)

lda_formula <- as.formula(
  paste("Sleep.Disorder ~", paste(num_cols_clean, collapse = " + "))
)

lda_model <- lda(
  lda_formula,
  data  = df_balanced
)

print(lda_model)

## Call:
## lda(lda_formula, data = df_balanced)
## 
## Prior probabilities of groups:
##    Insomnia        None Sleep Apnea 
##   0.3333333   0.3333333   0.3333333 
## 
## Group means:
##                    PC1        PC2         PC3
## Insomnia    -0.8753958  0.2024422  1.06088457
## None         0.4541202 -0.8009923 -0.38494922
## Sleep Apnea -0.7919284  2.1240026 -0.01783174
## 
## Coefficients of linear discriminants:
##            LD1        LD2
## PC1 -0.3133673 -0.1538351
## PC2  0.9235304 -0.2601952
## PC3  0.5667034  0.8305413
## 
## Proportion of trace:
##    LD1    LD2 
## 0.8337 0.1663

11.1.3 Koefisien Fungsi Diskriminan

coef_df <- as.data.frame(lda_model$scaling)
coef_df$Variabel <- rownames(coef_df)
rownames(coef_df) <- NULL
coef_df <- coef_df %>% dplyr::select(Variabel, everything())

names(coef_df)[-1] <- paste0("LD", seq_len(ncol(coef_df) - 1))

kable(coef_df %>% mutate(across(where(is.numeric), ~ round(., 4))),
      caption   = "Koefisien Fungsi Diskriminan Linear (Unstandardized)",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)

Koefisien Fungsi Diskriminan Linear (Unstandardized)
Variabel	LD1	LD2
PC1	-0.3134	-0.1538
PC2	0.9235	-0.2602
PC3	0.5667	0.8305

prop_var <- lda_model$svd^2 / sum(lda_model$svd^2)

prop_df <- data.frame(
  LD               = paste0("LD", seq_along(prop_var)),
  Singular_Value   = round(lda_model$svd, 4),
  Proporsi_Varians = paste0(round(prop_var * 100, 2), "%"),
  Kumulatif        = paste0(round(cumsum(prop_var) * 100, 2), "%")
)

kable(prop_df,
      caption   = "Proporsi Varians yang Dijelaskan oleh Setiap Fungsi Diskriminan",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Proporsi Varians yang Dijelaskan oleh Setiap Fungsi Diskriminan
LD	Singular_Value	Proporsi_Varians	Kumulatif
LD1	24.8065	83.37%	83.37%
LD2	11.0806	16.63%	100%

11.1.4 Group Means (Centroid per Kelas)

gm_df <- as.data.frame(lda_model$means)
gm_df$Kelas <- rownames(gm_df)
gm_df <- gm_df %>% dplyr::select(Kelas, everything())
rownames(gm_df) <- NULL

kable(gm_df %>% mutate(across(where(is.numeric), ~ round(., 3))),
      caption   = "Group Means – Rata-rata Prediktor per Kelas Target",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = TRUE) %>%
  scroll_box(width = "100%")

Group Means – Rata-rata Prediktor per Kelas Target
Kelas	PC1	PC2	PC3
Insomnia	-0.875	0.202	1.061
None	0.454	-0.801	-0.385
Sleep Apnea	-0.792	2.124	-0.018

11.2 Uji Signifikansi LDA (Wilks’ Lambda & F-test)

manova_formula <- as.formula(
  paste("cbind(", paste(num_cols_clean, collapse = ", "), ") ~ Sleep.Disorder")
)

manova_result <- manova(manova_formula, data = df_balanced)

wilks_result  <- summary(manova_result, test = "Wilks")
wilks_stat    <- wilks_result$stats

wilks_df <- data.frame(
  Uji        = "Wilks' Lambda",
  Lambda     = round(wilks_stat["Sleep.Disorder", "Wilks"], 4),
  F_approx   = round(wilks_stat["Sleep.Disorder", "approx F"], 4),
  num_df     = wilks_stat["Sleep.Disorder", "num Df"],
  den_df     = wilks_stat["Sleep.Disorder", "den Df"],
  p.value    = format.pval(wilks_stat["Sleep.Disorder", "Pr(>F)"],
                            digits = 4, eps = 0.0001),
  Kesimpulan = ifelse(wilks_stat["Sleep.Disorder", "Pr(>F)"] < 0.05,
                      "Signifikan – ada perbedaan antar kelas ✓",
                      "Tidak Signifikan")
)

kable(wilks_df,
      caption   = "Uji Wilks' Lambda – Signifikansi Model LDA",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(1,
           color      = "white",
           background = ifelse(wilks_stat["Sleep.Disorder", "Pr(>F)"] < 0.05,
                               "#5cb85c", "#d9534f"))

Uji Wilks’ Lambda – Signifikansi Model LDA
Uji	Lambda	F_approx	num_df	den_df	p.value	Kesimpulan
Wilks’ Lambda	0.2523	215.3675	6	1304	< 1e-04	Signifikan – ada perbedaan antar kelas ✓

mv_tests <- data.frame(
  Uji = c("Pillai's Trace", "Wilks' Lambda", "Hotelling-Lawley", "Roy's Largest Root"),
  Statistik = c(
    round(summary(manova_result, test = "Pillai")$stats["Sleep.Disorder", "Pillai"], 4),
    round(wilks_stat["Sleep.Disorder", "Wilks"], 4),
    round(summary(manova_result, test = "Hotelling-Lawley")$stats["Sleep.Disorder", "Hotelling-Lawley"], 4),
    round(summary(manova_result, test = "Roy")$stats["Sleep.Disorder", "Roy"], 4)
  ),
  F_approx = c(
    round(summary(manova_result, test = "Pillai")$stats["Sleep.Disorder", "approx F"], 4),
    round(wilks_stat["Sleep.Disorder", "approx F"], 4),
    round(summary(manova_result, test = "Hotelling-Lawley")$stats["Sleep.Disorder", "approx F"], 4),
    round(summary(manova_result, test = "Roy")$stats["Sleep.Disorder", "approx F"], 4)
  ),
  p.value = c(
    format.pval(summary(manova_result, test = "Pillai")$stats["Sleep.Disorder", "Pr(>F)"], digits=4, eps=0.0001),
    format.pval(wilks_stat["Sleep.Disorder", "Pr(>F)"], digits=4, eps=0.0001),
    format.pval(summary(manova_result, test = "Hotelling-Lawley")$stats["Sleep.Disorder", "Pr(>F)"], digits=4, eps=0.0001),
    format.pval(summary(manova_result, test = "Roy")$stats["Sleep.Disorder", "Pr(>F)"], digits=4, eps=0.0001)
  )
)

kable(mv_tests,
      caption   = "Uji Signifikansi Multivariat MANOVA (Ekuivalen LDA)",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)

Uji Signifikansi Multivariat MANOVA (Ekuivalen LDA)
Uji	Statistik	F_approx	p.value
Pillai’s Trace	0.9260	187.6632	< 1e-04
Wilks’ Lambda	0.2523	215.3675	< 1e-04
Hotelling-Lawley	2.2573	244.9191	< 1e-04
Roy’s Largest Root	1.8818	409.6150	< 1e-04

11.3 Prediksi & Confusion Matrix – LDA

11.3.1 Prediksi pada Data Training (Resubstitution)

lda_pred <- predict(lda_model, newdata = df_balanced)

df_lda_result <- df_balanced %>%
  mutate(
    Predicted_LDA  = lda_pred$class,
    Posterior_None = round(lda_pred$posterior[, "None"], 4),
    Posterior_Ins  = round(lda_pred$posterior[, "Insomnia"], 4),
    Posterior_SA   = round(lda_pred$posterior[, "Sleep Apnea"], 4)
  )

kable(head(df_lda_result %>%
             dplyr::select(Sleep.Disorder, Predicted_LDA,
                           Posterior_None, Posterior_Ins, Posterior_SA), 10),
      caption   = "10 Baris Pertama: Prediksi LDA & Posterior Probability",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE)

10 Baris Pertama: Prediksi LDA & Posterior Probability
Sleep.Disorder	Predicted_LDA	Posterior_None	Posterior_Ins	Posterior_SA
None	Insomnia	0.3395	0.6502	0.0103
None	None	0.9761	0.0187	0.0052
None	None	0.9761	0.0187	0.0052
Sleep Apnea	Insomnia	0.0003	0.9799	0.0198
Sleep Apnea	Insomnia	0.0003	0.9799	0.0198
Insomnia	Insomnia	0.0003	0.9799	0.0198
Insomnia	Insomnia	0.0022	0.9290	0.0688
None	None	0.9985	0.0014	0.0002
None	None	0.9985	0.0014	0.0002
None	None	0.9985	0.0014	0.0002

11.3.2 Confusion Matrix LDA

cm_lda <- table(
  Aktual   = df_balanced$Sleep.Disorder,
  Prediksi = lda_pred$class
)

kable(cm_lda,
      caption = "Confusion Matrix – LDA (Resubstitution)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE) %>%
  add_header_above(c(" " = 1, "Prediksi" = length(levels(df_balanced$Sleep.Disorder))))

Confusion Matrix – LDA (Resubstitution)
	Prediksi
	Insomnia	None	Sleep Apnea
Insomnia	204	13	2
None	37	177	5
Sleep Apnea	34	9	176

n_total   <- sum(cm_lda)
n_correct <- sum(diag(cm_lda))
accuracy_lda  <- n_correct / n_total
aper_lda      <- 1 - accuracy_lda

per_class_lda <- lapply(levels(df_balanced$Sleep.Disorder), function(k) {
  TP <- cm_lda[k, k]
  FP <- sum(cm_lda[, k]) - TP
  FN <- sum(cm_lda[k, ]) - TP
  TN <- n_total - TP - FP - FN
  data.frame(
    Kelas       = k,
    TP = TP, FP = FP, FN = FN, TN = TN,
    Sensitivity = round(TP / (TP + FN), 4),
    Specificity = round(TN / (TN + FP), 4),
    Precision   = round(TP / (TP + FP), 4),
    F1_Score    = round(2 * TP / (2 * TP + FP + FN), 4)
  )
})

per_class_lda_df <- bind_rows(per_class_lda)

kable(per_class_lda_df,
      caption   = "Metrik Evaluasi Per Kelas – LDA",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = TRUE)

Metrik Evaluasi Per Kelas – LDA
Kelas	TP	FP	FN	TN	Sensitivity	Specificity	Precision	F1_Score
Insomnia	204	71	15	367	0.9315	0.8379	0.7418	0.8259
None	177	22	42	416	0.8082	0.9498	0.8894	0.8469
Sleep Apnea	176	7	43	431	0.8037	0.9840	0.9617	0.8756

overall_lda <- data.frame(
  Model       = "LDA",
  Accuracy    = paste0(round(accuracy_lda * 100, 2), "%"),
  APER        = paste0(round(aper_lda * 100, 2), "%"),
  N_Correct   = n_correct,
  N_Incorrect = n_total - n_correct,
  N_Total     = n_total
)

kable(overall_lda,
      caption   = "Ringkasan Evaluasi Model LDA (Resubstitution)",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(1, color = "white",
           background = ifelse(accuracy_lda >= 0.8, "#5cb85c", "#f0ad4e"))

Ringkasan Evaluasi Model LDA (Resubstitution)
Model	Accuracy	APER	N_Correct	N_Incorrect	N_Total
LDA	84.78%	15.22%	557	100	657

11.3.3 LOOCV – Leave-One-Out Cross-Validation LDA

set.seed(42)

lda_cv <- lda(
  lda_formula,
  data  = df_balanced,
  prior = prop.table(table(df_balanced$Sleep.Disorder)),
  CV    = TRUE
)

cm_cv <- table(
  Aktual   = df_balanced$Sleep.Disorder,
  Prediksi = lda_cv$class
)

acc_cv  <- sum(diag(cm_cv)) / sum(cm_cv)
aper_cv <- 1 - acc_cv

kable(cm_cv,
      caption = "Confusion Matrix – LDA LOOCV") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE) %>%
  add_header_above(c(" " = 1, "Prediksi (LOOCV)" = length(levels(df_balanced$Sleep.Disorder))))

Confusion Matrix – LDA LOOCV
	Prediksi (LOOCV)
	Insomnia	None	Sleep Apnea
Insomnia	204	13	2
None	37	177	5
Sleep Apnea	34	9	176

loocv_df <- data.frame(
  Model       = "LDA (LOOCV)",
  Accuracy    = paste0(round(acc_cv * 100, 2), "%"),
  APER        = paste0(round(aper_cv * 100, 2), "%"),
  N_Correct   = sum(diag(cm_cv)),
  N_Incorrect = sum(cm_cv) - sum(diag(cm_cv)),
  N_Total     = sum(cm_cv)
)

kable(loocv_df,
      caption   = "Ringkasan Evaluasi Model LDA – LOOCV",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(1, color = "white",
           background = ifelse(acc_cv >= 0.8, "#5cb85c", "#f0ad4e"))

Ringkasan Evaluasi Model LDA – LOOCV
Model	Accuracy	APER	N_Correct	N_Incorrect	N_Total
LDA (LOOCV)	84.78%	15.22%	557	100	657

12 Cek Asumsi MLR

Catatan: Cek asumsi MLR dilakukan pada data asli (df) sebelum transformasi PCA, karena MLR membutuhkan interpretasi koefisien per prediktor asli.

12.1 Asumsi 1 – Linearitas Log-Odds (Box-Tidwell)

# Gunakan df_raw (sebelum Box-Cox) agar nilai positif dan log() aman
safe_log <- function(x) log(x - min(x) + 1)

df_bt <- df_raw %>%
  mutate(
    ln_Age               = safe_log(Age),
    ln_Sleep.Duration    = safe_log(Sleep.Duration),
    ln_Physical.Activity = safe_log(Physical.Activity.Level),
    ln_Stress.Level      = safe_log(Stress.Level),
    ln_Heart.Rate        = safe_log(Heart.Rate),
    ln_Daily.Steps       = safe_log(Daily.Steps + 1),
    ln_Systolic          = safe_log(Systolic),
    ln_Diastolic         = safe_log(Diastolic)
  )

df_bt$y_bin <- as.integer(df_bt$Sleep.Disorder != "None")

bt_model <- glm(
  y_bin ~ Age + ln_Age +
          Sleep.Duration + ln_Sleep.Duration +
          Physical.Activity.Level + ln_Physical.Activity +
          Stress.Level + ln_Stress.Level +
          Heart.Rate + ln_Heart.Rate +
          Daily.Steps + ln_Daily.Steps +
          Systolic + ln_Systolic +
          Diastolic + ln_Diastolic,
  data   = df_bt,
  family = binomial(link = "logit")
)

bt_summary <- summary(bt_model)$coefficients
bt_df <- data.frame(
  Term      = rownames(bt_summary),
  Estimate  = round(bt_summary[, "Estimate"], 4),
  Std.Error = round(bt_summary[, "Std. Error"], 4),
  z.value   = round(bt_summary[, "z value"], 4),
  p.value   = round(bt_summary[, "Pr(>|z|)"], 4)
) %>%
  filter(grepl("^ln_", Term)) %>%
  mutate(
    Prediktor = gsub("^ln_", "", Term),
    Status    = ifelse(p.value < 0.05, "Tidak Linier ⚠", "Linier ✓")
  ) %>%
  dplyr::select(Prediktor, Estimate, Std.Error, z.value, p.value, Status)

kable(bt_df,
      caption   = "Box-Tidwell Test – Uji Linearitas Log-Odds (Asumsi MLR)",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(which(bt_df$Status == "Tidak Linier ⚠"), background = "#fff3cd") %>%
  row_spec(which(bt_df$Status == "Linier ✓"),       background = "#d4edda")

Box-Tidwell Test – Uji Linearitas Log-Odds (Asumsi MLR)
Prediktor	Estimate	Std.Error	z.value	p.value	Status
Age	-1.0180	1.6284	-0.6252	0.5319	Linier ✓
Sleep.Duration	-12.8333	12.0080	-1.0687	0.2852	Linier ✓
Physical.Activity	0.9870	0.9943	0.9927	0.3208	Linier ✓
Stress.Level	11.9614	7.2523	1.6493	0.0991	Linier ✓
Heart.Rate	-1.3586	1.5918	-0.8535	0.3934	Linier ✓
Daily.Steps	0.3592	3.9767	0.0903	0.9280	Linier ✓
Systolic	-11.7622	3.7682	-3.1215	0.0018	Tidak Linier ⚠
Diastolic	5.4851	2.5063	2.1885	0.0286	Tidak Linier ⚠

Interpretasi: Koefisien ln(X) yang tidak signifikan (p > 0.05) menunjukkan hubungan linier antara log-odds dan prediktor tersebut sudah terpenuhi.

12.2 Asumsi 2 – Independensi Observasi

# Gunakan df_raw (sebelum Box-Cox) untuk cek independensi
n_dup <- sum(duplicated(df_raw %>% dplyr::select(-Sleep.Disorder)))

ind_df <- data.frame(
  Aspek = c(
    "Desain pengambilan data",
    "Duplikasi observasi",
    "Pengamatan berulang (longitudinal)"
  ),
  Evaluasi = c(
    "Cross-sectional survey – setiap baris = satu individu unik",
    paste0("Jumlah baris duplikat: ", n_dup,
           " (berdasarkan semua prediktor)"),
    "Dataset tidak memiliki ID waktu atau pengulangan per subjek"
  ),
  Status = c(
    "✓ Terpenuhi",
    ifelse(n_dup == 0, "✓ Tidak ada duplikat", "⚠ Ada duplikat – perlu dicek"),
    "✓ Terpenuhi"
  )
)

kable(ind_df, caption = "Evaluasi Asumsi Independensi Observasi (MLR)",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
  column_spec(3, bold = TRUE,
              color = ifelse(grepl("✓", ind_df$Status), "#155724", "#856404"))

Evaluasi Asumsi Independensi Observasi (MLR)
Aspek	Evaluasi	Status
Desain pengambilan data	Cross-sectional survey – setiap baris = satu individu unik	✓ Terpenuhi
Duplikasi observasi	Jumlah baris duplikat: 265 (berdasarkan semua prediktor)	⚠ Ada duplikat – perlu dicek
Pengamatan berulang (longitudinal)	Dataset tidak memiliki ID waktu atau pengulangan per subjek	✓ Terpenuhi

12.3 Asumsi 3 – Tidak Ada Multikolinearitas (VIF)

VIF sudah dihitung di bagian Cek Asumsi LDA (Asumsi 3) dan hasilnya berlaku untuk MLR juga karena prediktor yang digunakan sama. Lihat tabel VIF di atas.

12.4 Asumsi 4 – Tidak Ada Outlier Ekstrem (Mahalanobis Distance)

# Gunakan df_raw (sebelum Box-Cox) untuk Mahalanobis – skala asli lebih representatif
mah_data  <- df_raw %>% dplyr::select(all_of(pred_vars))
mah_dist  <- mahalanobis(mah_data, colMeans(mah_data), cov(mah_data))
mah_pval  <- pchisq(mah_dist, df = ncol(mah_data), lower.tail = FALSE)
n_out_mah <- sum(mah_pval < 0.001)

mah_df <- data.frame(
  Metode            = "Mahalanobis Distance",
  Threshold         = "p < 0.001 (χ² dengan df = 8)",
  N_Outlier_Ekstrem = n_out_mah,
  Persen            = paste0(round(n_out_mah / nrow(df_raw) * 100, 2), "%"),
  Status            = ifelse(n_out_mah / nrow(df_raw) < 0.05,
                             "Outlier < 5% – dapat ditoleransi ✓",
                             "Outlier ≥ 5% – perlu penanganan ⚠")
)

kable(mah_df,
      caption   = "Deteksi Outlier Multivariat – Mahalanobis Distance (Asumsi MLR)",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Deteksi Outlier Multivariat – Mahalanobis Distance (Asumsi MLR)
Metode	Threshold	N_Outlier_Ekstrem	Persen	Status
Mahalanobis Distance	p < 0.001 (χ² dengan df = 8)	8	2.14%	Outlier < 5% – dapat ditoleransi ✓

mah_plot_df <- data.frame(
  Index    = 1:nrow(df_raw),
  Mah.Dist = mah_dist,
  Outlier  = mah_pval < 0.001
)

ggplot(mah_plot_df, aes(x = Index, y = Mah.Dist, color = Outlier)) +
  geom_point(alpha = 0.7, size = 1.5) +
  geom_hline(yintercept = qchisq(0.999, df = ncol(mah_data)),
             linetype = "dashed", color = "red", linewidth = 0.8) +
  scale_color_manual(values = c("FALSE" = "#4C72B0", "TRUE" = "#C44E52"),
                     labels = c("Normal", "Outlier Ekstrem")) +
  labs(title    = "Mahalanobis Distance – Deteksi Outlier Multivariat",
       subtitle = "Garis merah = threshold chi-square (p = 0.001)",
       x = "Index Observasi", y = "Mahalanobis Distance", color = NULL) +
  theme_bw(base_size = 12)

12.5 Ringkasan Asumsi MLR

mlr_asumsi <- data.frame(
  No     = 1:4,
  Asumsi = c("Linearitas log-odds",
             "Independensi observasi",
             "Tidak ada multikolinearitas (VIF)",
             "Tidak ada outlier ekstrem"),
  Metode = c("Box-Tidwell test",
             "Evaluasi desain & duplikasi",
             "VIF",
             "Mahalanobis Distance"),
  Hasil  = c(
    paste0(sum(bt_df$Status == "Tidak Linier ⚠"), " variabel tidak linier"),
    ifelse(n_dup == 0, "Tidak ada duplikat",
           paste0(n_dup, " baris duplikat")),
    paste0(sum(vif_vals > 5), " variabel VIF > 5"),
    paste0(n_out_mah, " outlier ekstrem (",
           round(n_out_mah / nrow(df_raw) * 100, 1), "%)")
  ),
  Status = c(
    ifelse(sum(bt_df$Status == "Tidak Linier ⚠") == 0,
           "✓ Terpenuhi", "⚠ Sebagian tidak linier"),
    ifelse(n_dup == 0, "✓ Terpenuhi", "⚠ Perlu dicek"),
    ifelse(all(vif_vals <= 10), "✓ Tidak ada VIF > 10", "✗ Ada VIF > 10"),
    ifelse(n_out_mah / nrow(df_raw) < 0.05,
           "✓ Dapat ditoleransi", "⚠ Perlu penanganan")
  )
)

kable(mlr_asumsi, caption = "Ringkasan Pemeriksaan Asumsi MLR",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = TRUE) %>%
  column_spec(5, bold = TRUE,
              color = ifelse(grepl("✓", mlr_asumsi$Status), "#155724",
                      ifelse(grepl("⚠", mlr_asumsi$Status), "#856404", "#721c24")))

Ringkasan Pemeriksaan Asumsi MLR
No	Asumsi	Metode	Hasil	Status
1	Linearitas log-odds	Box-Tidwell test	2 variabel tidak linier	⚠ Sebagian tidak linier
2	Independensi observasi	Evaluasi desain & duplikasi	265 baris duplikat	⚠ Perlu dicek
3	Tidak ada multikolinearitas (VIF)	VIF	9 variabel VIF > 5	✗ Ada VIF > 10
4	Tidak ada outlier ekstrem	Mahalanobis Distance	8 outlier ekstrem (2.1%)	✓ Dapat ditoleransi

13 Penanganan Asumsi MLR yang Tidak Terpenuhi

Berdasarkan hasil cek asumsi MLR di atas, tabel berikut merangkum masalah yang terdeteksi dan strategi penanganannya sebelum fitting model MLR:

Masalah	Temuan	Strategi Penanganan
Multikolinearitas (VIF)	Ada VIF > 5 / > 10	Drop iteratif variabel VIF tertinggi
Outlier ekstrem	Cek via Mahalanobis	Hapus baris outlier ekstrem (p < 0.001)
Linearitas log-odds	Cek via Box-Tidwell	Transformasi log/sqrt pada prediktor tidak linier
Class imbalance	None >> Insomnia, Sleep Apnea	Oversampling SMOTE pada data MLR

13.1 Penanganan Multikolinearitas MLR – Drop Iteratif VIF

Berbeda dengan LDA (menggunakan PCA), MLR membutuhkan koefisien yang dapat diinterpretasi per prediktor asli. Oleh karena itu penanganan multikolinearitas dilakukan dengan drop iteratif variabel VIF tertinggi (bukan PCA).

# MLR butuh prediktor asli yang interpretable → mulai dari df_raw (sebelum Box-Cox)
# Terapkan Winsorizing ulang pada df_raw agar outlier sudah ditangani
df_mlr_prep <- df_raw
num_cols_mlr_raw <- df_mlr_prep %>% dplyr::select(where(is.numeric)) %>% names()
df_mlr_prep[num_cols_mlr_raw] <- lapply(df_mlr_prep[num_cols_mlr_raw], winsorize)

# Drop iteratif: hapus variabel dengan VIF > 10 satu per satu
pred_mlr <- pred_vars  # c("Age", "Sleep.Duration", ...)

repeat {
  df_vif_mlr <- df_mlr_prep %>%
    mutate(Gender_bin     = as.integer(Gender == "Male"),
           BMI_Overweight = as.integer(BMI.Category == "Overweight"),
           BMI_Obese      = as.integer(BMI.Category == "Obese"))

  all_preds <- c(pred_mlr, "Gender_bin", "BMI_Overweight", "BMI_Obese")
  f_vif <- as.formula(paste("as.numeric(Sleep.Disorder) ~",
                             paste(all_preds, collapse = "+")))
  vif_iter <- vif(lm(f_vif, data = df_vif_mlr))

  max_vif <- max(vif_iter[pred_mlr])  # cek VIF prediktor numerik saja
  if (max_vif <= 10) break

  drop_var <- names(which.max(vif_iter[pred_mlr]))
  cat("Drop:", drop_var, "| VIF =", round(max_vif, 2), "\n")
  pred_mlr <- setdiff(pred_mlr, drop_var)
}

## Drop: Diastolic | VIF = 76.06 
## Drop: Stress.Level | VIF = 10.54

cat("\nPrediktor MLR setelah drop iteratif VIF:\n")

## 
## Prediktor MLR setelah drop iteratif VIF:

cat(paste(pred_mlr, collapse = ", "), "\n")

## Age, Sleep.Duration, Physical.Activity.Level, Heart.Rate, Daily.Steps, Systolic

# Tampilkan VIF final
vif_mlr_final <- vif(lm(as.formula(paste("as.numeric(Sleep.Disorder) ~",
  paste(c(pred_mlr, "Gender_bin", "BMI_Overweight", "BMI_Obese"), collapse = "+"))),
  data = df_vif_mlr))

vif_mlr_df <- data.frame(
  Variabel = names(vif_mlr_final),
  VIF      = round(vif_mlr_final, 3),
  Status   = ifelse(vif_mlr_final > 10, "Multikolinear Tinggi", "Aman")
)

kable(vif_mlr_df, caption = "VIF Setelah Drop Iteratif – Prediktor MLR Final",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) %>%
  row_spec(which(vif_mlr_df$Status == "Multikolinear Tinggi"), color="white", background="#d9534f") %>%
  row_spec(which(vif_mlr_df$Status == "Perlu Perhatian"),      color="white", background="#f0ad4e") %>%
  row_spec(which(vif_mlr_df$Status == "Aman"),                 color="white", background="#5cb85c")

VIF Setelah Drop Iteratif – Prediktor MLR Final
Variabel	VIF	Status
Age	5.097	Aman
Sleep.Duration	4.478	Aman
Physical.Activity.Level	5.215	Aman
Heart.Rate	2.428	Aman
Daily.Steps	4.721	Aman
Systolic	3.877	Aman
Gender_bin	1.837	Aman
BMI_Overweight	3.516	Aman
BMI_Obese	2.098	Aman

13.2 Penanganan Outlier MLR – Hapus Baris Outlier Ekstrem

# Hitung ulang Mahalanobis pada prediktor yang tersisa setelah drop VIF
mah_data_mlr <- df_mlr_prep %>% dplyr::select(all_of(pred_mlr))
mah_dist_mlr <- mahalanobis(mah_data_mlr,
                             colMeans(mah_data_mlr),
                             cov(mah_data_mlr))
mah_pval_mlr <- pchisq(mah_dist_mlr,
                        df       = ncol(mah_data_mlr),
                        lower.tail = FALSE)

n_outlier_mlr <- sum(mah_pval_mlr < 0.001)
cat("Jumlah outlier ekstrem (p < 0.001):", n_outlier_mlr, "\n")

## Jumlah outlier ekstrem (p < 0.001): 2

cat("Persentase:", round(n_outlier_mlr / nrow(df_mlr_prep) * 100, 2), "%\n")

## Persentase: 0.53 %

# Hapus outlier jika >= 5% dari data; jika < 5% tetap dipertahankan
if (n_outlier_mlr / nrow(df_mlr_prep) >= 0.05) {
  df_mlr_prep <- df_mlr_prep[mah_pval_mlr >= 0.001, ]
  cat("Outlier dihapus. Baris tersisa:", nrow(df_mlr_prep), "\n")
} else {
  cat("Outlier < 5% dari data → tetap dipertahankan (dapat ditoleransi).\n")
}

## Outlier < 5% dari data → tetap dipertahankan (dapat ditoleransi).

13.3 Penanganan Linearitas Log-Odds MLR – Transformasi Prediktor

Prediktor yang tidak lolos Box-Tidwell (tidak linier terhadap log-odds) ditransformasi menggunakan log atau square root sesuai distribusinya.

# Identifikasi prediktor tidak linier dari hasil Box-Tidwell
nonlinear_vars <- bt_df %>%
  filter(Status == "Tidak Linier ⚠") %>%
  pull(Prediktor)

# Saring hanya yang masih ada di pred_mlr (setelah drop VIF)
nonlinear_vars <- intersect(nonlinear_vars, pred_mlr)

cat("Prediktor tidak linier yang akan ditransformasi:\n")

## Prediktor tidak linier yang akan ditransformasi:

if (length(nonlinear_vars) == 0) {
  cat("Tidak ada – semua prediktor linier terhadap log-odds.\n")
} else {
  cat(paste(nonlinear_vars, collapse = ", "), "\n")
}

## Systolic

# Terapkan transformasi log dengan shift aman (nilai dari df_raw sudah positif setelah Winsorizing)
for (v in nonlinear_vars) {
  new_name <- paste0("log_", v)
  df_mlr_prep[[new_name]] <- log(df_mlr_prep[[v]] - min(df_mlr_prep[[v]]) + 1)
  pred_mlr <- c(setdiff(pred_mlr, v), new_name)
}

cat("\nPrediktor MLR final setelah transformasi linearitas:\n")

## 
## Prediktor MLR final setelah transformasi linearitas:

cat(paste(pred_mlr, collapse = ", "), "\n")

## Age, Sleep.Duration, Physical.Activity.Level, Heart.Rate, Daily.Steps, log_Systolic

13.4 Penanganan Class Imbalance MLR – SMOTE

set.seed(42)

# Buat df_mlr_balanced menggunakan SMOTE yang sama
num_mat_mlr   <- as.data.frame(df_mlr_prep[pred_mlr])
n_target_mlr  <- max(table(df_mlr_prep$Sleep.Disorder))
df_mlr_bal    <- df_mlr_prep

for (kls in c("Insomnia", "Sleep Apnea")) {
  idx_kls  <- which(df_mlr_prep$Sleep.Disorder == kls)
  n_needed <- n_target_mlr - length(idx_kls)
  if (n_needed > 0) {
    syn_num <- smote_manual(num_mat_mlr[idx_kls, , drop = FALSE], n_needed, k = 5)
    names(syn_num) <- pred_mlr
    syn_full <- syn_num
    syn_full$Sleep.Disorder <- factor(kls, levels = levels(df_mlr_prep$Sleep.Disorder))
    syn_full$Gender         <- factor(names(which.max(table(df_mlr_prep$Gender[idx_kls]))),
                                      levels = levels(df_mlr_prep$Gender))
    syn_full$Occupation     <- factor(names(which.max(table(df_mlr_prep$Occupation[idx_kls]))),
                                      levels = levels(df_mlr_prep$Occupation))
    syn_full$BMI.Category   <- factor(names(which.max(table(df_mlr_prep$BMI.Category[idx_kls]))),
                                      levels = levels(df_mlr_prep$BMI.Category))
    # Isi kolom yang tidak ada di syn_full dengan NA lalu drop
    for (col in setdiff(names(df_mlr_bal), names(syn_full))) {
      syn_full[[col]] <- NA
    }
    syn_full <- syn_full[, names(df_mlr_bal)]
    df_mlr_bal <- bind_rows(df_mlr_bal, syn_full)
  }
}

df_mlr_bal$Sleep.Disorder <- factor(df_mlr_bal$Sleep.Disorder,
                                     levels = levels(df_mlr_prep$Sleep.Disorder))

dist_mlr <- merge(
  as.data.frame(table(df_mlr_prep$Sleep.Disorder)) %>% rename(Kelas = Var1, Sebelum = Freq),
  as.data.frame(table(df_mlr_bal$Sleep.Disorder))  %>% rename(Kelas = Var1, Sesudah = Freq),
  by = "Kelas")
dist_mlr$Ditambah <- dist_mlr$Sesudah - dist_mlr$Sebelum

kable(dist_mlr, caption = "Distribusi Kelas MLR Sebelum vs Sesudah SMOTE",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) %>%
  row_spec(which(dist_mlr$Ditambah > 0), background = "#d4edda")

Distribusi Kelas MLR Sebelum vs Sesudah SMOTE
Kelas	Sebelum	Sesudah	Ditambah
Insomnia	77	219	142
None	219	219	0
Sleep Apnea	78	219	141

cat("Dataset MLR final: df_mlr_bal\n")

## Dataset MLR final: df_mlr_bal

cat("Dimensi:", nrow(df_mlr_bal), "baris x", ncol(df_mlr_bal), "kolom\n")

## Dimensi: 657 baris x 14 kolom

cat("Prediktor numerik:", paste(pred_mlr, collapse = ", "), "\n")

## Prediktor numerik: Age, Sleep.Duration, Physical.Activity.Level, Heart.Rate, Daily.Steps, log_Systolic

13.5 Verifikasi Akhir Asumsi MLR

# VIF final pada df_mlr_bal
df_vif_mlr_bal <- df_mlr_bal %>%
  mutate(Gender_bin     = as.integer(Gender == "Male"),
         BMI_Overweight = as.integer(BMI.Category == "Overweight"),
         BMI_Obese      = as.integer(BMI.Category == "Obese"))

f_vif_final <- as.formula(paste("as.numeric(Sleep.Disorder) ~",
  paste(c(pred_mlr, "Gender_bin", "BMI_Overweight", "BMI_Obese"), collapse = "+")))

vif_mlr_akhir <- vif(lm(f_vif_final, data = df_vif_mlr_bal))

verif_mlr <- data.frame(
  Aspek = c(
    "Jumlah Observasi (setelah SMOTE)",
    "Distribusi Kelas",
    "Outlier Ekstrem (Mahalanobis)",
    "Linearitas Log-Odds",
    "Multikolinearitas (VIF)",
    "Independensi Observasi"
  ),
  Hasil = c(
    paste(nrow(df_mlr_bal), "baris"),
    paste(paste(names(table(df_mlr_bal$Sleep.Disorder)),
                table(df_mlr_bal$Sleep.Disorder), sep = "="), collapse = ", "),
    ifelse(n_outlier_mlr / nrow(df_mlr_prep) < 0.05,
           paste0(n_outlier_mlr, " outlier ditoleransi (< 5%)"),
           "Outlier dihapus"),
    ifelse(length(nonlinear_vars) == 0,
           "Semua prediktor linier (atau sudah ditransformasi log)",
           paste0("Transformasi log diterapkan pada: ",
                  paste(nonlinear_vars, collapse = ", "))),
    ifelse(all(vif_mlr_akhir <= 10), "Semua VIF ≤ 10 (Aman)", "Ada VIF > 10"),
    "Cross-sectional, tidak ada pengulangan per subjek"
  ),
  Status = c(
    "OK", "OK",
    ifelse(n_outlier_mlr / nrow(df_mlr_prep) < 0.05, "OK", "OK – dihapus"),
    ifelse(length(nonlinear_vars) == 0, "OK", "OK – ditransformasi"),
    ifelse(all(vif_mlr_akhir <= 10), "OK", "Perhatian"),
    "OK"
  )
)

kable(verif_mlr, caption = "Verifikasi Asumsi Akhir – Data Siap MLR",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = TRUE) %>%
  column_spec(1, bold = TRUE) %>%
  row_spec(which(verif_mlr$Status == "OK" | grepl("OK", verif_mlr$Status)),
           background = "#d4edda") %>%
  row_spec(which(verif_mlr$Status == "Perhatian"), background = "#f8d7da")

Verifikasi Asumsi Akhir – Data Siap MLR
Aspek	Hasil	Status
Jumlah Observasi (setelah SMOTE)	657 baris	OK
Distribusi Kelas	Insomnia=219, None=219, Sleep Apnea=219	OK
Outlier Ekstrem (Mahalanobis)	2 outlier ditoleransi (< 5%)	OK
Linearitas Log-Odds	Transformasi log diterapkan pada: Systolic	OK – ditransformasi
Multikolinearitas (VIF)	Semua VIF ≤ 10 (Aman)	OK
Independensi Observasi	Cross-sectional, tidak ada pengulangan per subjek	OK

Dataset MLR final tersimpan di objek df_mlr_bal — siap untuk fitting model MLR.

14 MLR

14.1 Fitting Model MLR

14.1.1 Pemilihan Reference Category

df_mlr_bal <- df_mlr_bal %>%
  mutate(Sleep.Disorder = relevel(Sleep.Disorder, ref = "None"))

ref_choice <- data.frame(
  Opsi      = 1:3,
  Reference = c("None", "Insomnia", "Sleep Apnea"),
  Alasan    = c(
    "Kelas mayoritas & 'kondisi normal' → log-odds insomnia vs sehat, SA vs sehat [DIPILIH]",
    "Baseline gangguan ringan → log-odds None vs insomnia, SA vs insomnia",
    "Baseline gangguan berat → log-odds None vs SA, insomnia vs SA"
  )
)

kable(ref_choice, caption = "Pertimbangan Pemilihan Reference Category MLR",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
  row_spec(1, background = "#d4edda")

Pertimbangan Pemilihan Reference Category MLR
Opsi	Reference	Alasan
1	None	Kelas mayoritas & ‘kondisi normal’ → log-odds insomnia vs sehat, SA vs sehat [DIPILIH]
2	Insomnia	Baseline gangguan ringan → log-odds None vs insomnia, SA vs insomnia
3	Sleep Apnea	Baseline gangguan berat → log-odds None vs SA, insomnia vs SA

14.1.2 Fit Model MLR

set.seed(42)

mlr_formula <- as.formula(
  paste("Sleep.Disorder ~", paste(pred_mlr, collapse = " + "))
)

mlr_model <- multinom(
  mlr_formula,
  data  = df_mlr_bal,
  maxit = 500,
  trace = FALSE
)

cat("=== Ringkasan Model MLR ===\n")

## === Ringkasan Model MLR ===

print(summary(mlr_model))

## Call:
## multinom(formula = mlr_formula, data = df_mlr_bal, maxit = 500, 
##     trace = FALSE)
## 
## Coefficients:
##             (Intercept)       Age Sleep.Duration Physical.Activity.Level
## Insomnia       24.33020 0.2795671     -3.9729017              0.07281570
## Sleep Apnea   -76.08184 0.2721917      0.5444884             -0.02851873
##              Heart.Rate   Daily.Steps log_Systolic
## Insomnia    -0.07236269 -0.0012714836    0.2474621
## Sleep Apnea  0.73548345  0.0007025924    1.7719652
## 
## Std. Errors:
##              (Intercept)        Age Sleep.Duration Physical.Activity.Level
## Insomnia    1.015390e-04 0.01996155    0.001467243              0.01238563
## Sleep Apnea 9.517733e-05 0.01860820    0.001215787              0.01257213
##             Heart.Rate  Daily.Steps log_Systolic
## Insomnia    0.01039475 0.0001683656 0.0006016256
## Sleep Apnea 0.01145725 0.0001642198 0.0003325545
## 
## Residual Deviance: 616.8503 
## AIC: 644.8503

coef_mlr <- coef(mlr_model)
se_mlr   <- summary(mlr_model)$standard.errors

z_mlr  <- coef_mlr / se_mlr
p_mlr  <- 2 * (1 - pnorm(abs(z_mlr)))

make_coef_table <- function(kelas_name) {
  data.frame(
    Variabel  = colnames(coef_mlr),
    Estimate  = round(coef_mlr[kelas_name, ], 4),
    Std.Error = round(se_mlr[kelas_name, ], 4),
    z.value   = round(z_mlr[kelas_name, ], 4),
    p.value   = round(p_mlr[kelas_name, ], 4),
    Sig       = ifelse(p_mlr[kelas_name, ] < 0.001, "***",
                ifelse(p_mlr[kelas_name, ] < 0.01,  "**",
                ifelse(p_mlr[kelas_name, ] < 0.05,  "*",
                ifelse(p_mlr[kelas_name, ] < 0.1,   ".", ""))))
  )
}

coef_ins <- make_coef_table("Insomnia")
coef_sa  <- make_coef_table("Sleep Apnea")

kable(coef_ins,
      caption   = "Koefisien MLR – Insomnia vs None (Log-Odds)",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
  row_spec(which(coef_ins$p.value < 0.05), background = "#d4edda") %>%
  footnote(general = "Sig: *** p<0.001, ** p<0.01, * p<0.05, . p<0.1")

Koefisien MLR – Insomnia vs None (Log-Odds)
Variabel	Estimate	Std.Error	z.value	p.value	Sig
(Intercept)	24.3302	0.0001	239614.2614	0	***
Age	0.2796	0.0200	14.0053	0	***
Sleep.Duration	-3.9729	0.0015	-2707.7333	0	***
Physical.Activity.Level	0.0728	0.0124	5.8790	0	***
Heart.Rate	-0.0724	0.0104	-6.9615	0	***
Daily.Steps	-0.0013	0.0002	-7.5519	0	***
log_Systolic	0.2475	0.0006	411.3224	0	***
Note:
Sig: * p<0.001, p<0.01, * p<0.05, . p<0.1

kable(coef_sa,
      caption   = "Koefisien MLR – Sleep Apnea vs None (Log-Odds)",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
  row_spec(which(coef_sa$p.value < 0.05), background = "#d4edda") %>%
  footnote(general = "Sig: *** p<0.001, ** p<0.01, * p<0.05, . p<0.1")

Koefisien MLR – Sleep Apnea vs None (Log-Odds)
Variabel	Estimate	Std.Error	z.value	p.value	Sig
(Intercept)	-76.0818	0.0001	-799369.3525	0.0000	***
Age	0.2722	0.0186	14.6275	0.0000	***
Sleep.Duration	0.5445	0.0012	447.8483	0.0000	***
Physical.Activity.Level	-0.0285	0.0126	-2.2684	0.0233
Heart.Rate	0.7355	0.0115	64.1937	0.0000	***
Daily.Steps	0.0007	0.0002	4.2784	0.0000	***
log_Systolic	1.7720	0.0003	5328.3456	0.0000	***
Note:
Sig: * p<0.001, p<0.01, * p<0.05, . p<0.1

Uji Signifikansi MLR

14.1.3 Uji Serentak – Likelihood Ratio Test (G²)

mlr_null <- multinom(
  Sleep.Disorder ~ 1,
  data  = df_mlr_bal,
  maxit = 500,
  trace = FALSE
)

lrt_result <- lrtest(mlr_null, mlr_model)

lrt_df <- data.frame(
  Model         = c("Model Null (intercept only)", "Model Penuh"),
  LogLikelihood = round(c(logLik(mlr_null), logLik(mlr_model)), 4),
  Df            = c(attr(logLik(mlr_null), "df"), attr(logLik(mlr_model), "df"))
)

kable(lrt_df, caption = "Log-Likelihood Model Null vs Penuh", row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Log-Likelihood Model Null vs Penuh
Model	LogLikelihood	Df
Model Null (intercept only)	-721.7883	2
Model Penuh	-308.4251	14

G2_stat <- -2 * (as.numeric(logLik(mlr_null)) - as.numeric(logLik(mlr_model)))
df_G2   <- attr(logLik(mlr_model), "df") - attr(logLik(mlr_null), "df")
p_G2    <- pchisq(G2_stat, df = df_G2, lower.tail = FALSE)

g2_df <- data.frame(
  Uji          = "Likelihood Ratio Test (G²)",
  G2_Statistik = round(G2_stat, 4),
  df           = df_G2,
  p.value      = format.pval(p_G2, digits = 4, eps = 0.0001),
  Kesimpulan   = ifelse(p_G2 < 0.05,
                        "Model signifikan – prediktor berpengaruh serentak ✓",
                        "Model tidak signifikan")
)

kable(g2_df,
      caption   = "Uji Serentak – Likelihood Ratio Test (G²)",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(1, color = "white",
           background = ifelse(p_G2 < 0.05, "#5cb85c", "#d9534f"))

Uji Serentak – Likelihood Ratio Test (G²)
Uji	G2_Statistik	df	p.value	Kesimpulan
Likelihood Ratio Test (G²)	826.7263	12	< 1e-04	Model signifikan – prediktor berpengaruh serentak ✓

14.1.4 Uji Parsial – Wald Test per Prediktor

wald_ins <- coef_ins %>%
  mutate(
    OR       = round(exp(Estimate), 4),
    CI_lower = round(exp(Estimate - 1.96 * Std.Error), 4),
    CI_upper = round(exp(Estimate + 1.96 * Std.Error), 4),
    CI_95    = paste0("[", CI_lower, ", ", CI_upper, "]")
  ) %>%
  dplyr::select(Variabel, Estimate, OR, CI_95, z.value, p.value, Sig)

kable(wald_ins,
      caption   = "Uji Wald Parsial – Insomnia vs None (+ Odds Ratio)",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
  row_spec(which(wald_ins$p.value < 0.05 & wald_ins$Variabel != "(Intercept)"),
           background = "#d4edda") %>%
  footnote(general = "OR = Odds Ratio; CI_95 = 95% Confidence Interval untuk OR")

Uji Wald Parsial – Insomnia vs None (+ Odds Ratio)
Variabel	Estimate	OR	CI_95	z.value	p.value	Sig
(Intercept)	24.3302	3.685289e+10	[36845672015.9903, 36860118350.7171]	239614.2614	0	***
Age	0.2796	1.322600e+00	[1.2718, 1.3755]	14.0053	0	***
Sleep.Duration	-3.9729	1.880000e-02	[0.0188, 0.0189]	-2707.7333	0	***
Physical.Activity.Level	0.0728	1.075500e+00	[1.0497, 1.102]	5.8790	0	***
Heart.Rate	-0.0724	9.302000e-01	[0.9114, 0.9493]	-6.9615	0	***
Daily.Steps	-0.0013	9.987000e-01	[0.9983, 0.9991]	-7.5519	0	***
log_Systolic	0.2475	1.280800e+00	[1.2793, 1.2823]	411.3224	0	***
Note:
OR = Odds Ratio; CI_95 = 95% Confidence Interval untuk OR

wald_sa <- coef_sa %>%
  mutate(
    OR       = round(exp(Estimate), 4),
    CI_lower = round(exp(Estimate - 1.96 * Std.Error), 4),
    CI_upper = round(exp(Estimate + 1.96 * Std.Error), 4),
    CI_95    = paste0("[", CI_lower, ", ", CI_upper, "]")
  ) %>%
  dplyr::select(Variabel, Estimate, OR, CI_95, z.value, p.value, Sig)

kable(wald_sa,
      caption   = "Uji Wald Parsial – Sleep Apnea vs None (+ Odds Ratio)",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
  row_spec(which(wald_sa$p.value < 0.05 & wald_sa$Variabel != "(Intercept)"),
           background = "#d4edda") %>%
  footnote(general = "OR = Odds Ratio; CI_95 = 95% Confidence Interval untuk OR")

Uji Wald Parsial – Sleep Apnea vs None (+ Odds Ratio)
Variabel	Estimate	OR	CI_95	z.value	p.value	Sig
(Intercept)	-76.0818	0.0000	[0, 0]	-799369.3525	0.0000	***
Age	0.2722	1.3128	[1.2659, 1.3616]	14.6275	0.0000	***
Sleep.Duration	0.5445	1.7237	[1.7197, 1.7278]	447.8483	0.0000	***
Physical.Activity.Level	-0.0285	0.9719	[0.9482, 0.9962]	-2.2684	0.0233
Heart.Rate	0.7355	2.0865	[2.04, 2.1341]	64.1937	0.0000	***
Daily.Steps	0.0007	1.0007	[1.0003, 1.0011]	4.2784	0.0000	***
log_Systolic	1.7720	5.8826	[5.8791, 5.8861]	5328.3456	0.0000	***
Note:
OR = Odds Ratio; CI_95 = 95% Confidence Interval untuk OR

14.1.5 AIC & BIC Model MLR

aic_bic_df <- data.frame(
  Model  = c("MLR Null", "MLR Penuh"),
  AIC    = round(c(AIC(mlr_null), AIC(mlr_model)), 4),
  BIC    = round(c(BIC(mlr_null), BIC(mlr_model)), 4),
  LogLik = round(c(as.numeric(logLik(mlr_null)),
                   as.numeric(logLik(mlr_model))), 4)
)

kable(aic_bic_df,
      caption   = "AIC & BIC – Perbandingan Model MLR Null vs Penuh",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(2, background = "#d4edda")

AIC & BIC – Perbandingan Model MLR Null vs Penuh
Model	AIC	BIC	LogLik
MLR Null	1447.5765	1456.5519	-721.7883
MLR Penuh	644.8503	707.6779	-308.4251

14.2 Prediksi & Confusion Matrix – MLR

14.2.1 Prediksi MLR

mlr_pred_class <- predict(mlr_model, newdata = df_mlr_bal, type = "class")
mlr_pred_prob  <- predict(mlr_model, newdata = df_mlr_bal, type = "probs")

df_mlr_result <- df_mlr_bal %>%
  mutate(
    Predicted_MLR   = mlr_pred_class,
    Prob_None       = round(mlr_pred_prob[, "None"], 4),
    Prob_Insomnia   = round(mlr_pred_prob[, "Insomnia"], 4),
    Prob_SleepApnea = round(mlr_pred_prob[, "Sleep Apnea"], 4)
  )

kable(head(df_mlr_result %>%
             dplyr::select(Sleep.Disorder, Predicted_MLR,
                           Prob_None, Prob_Insomnia, Prob_SleepApnea), 10),
      caption   = "10 Baris Pertama: Prediksi MLR & Probabilitas per Kelas",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE)

10 Baris Pertama: Prediksi MLR & Probabilitas per Kelas
Sleep.Disorder	Predicted_MLR	Prob_None	Prob_Insomnia	Prob_SleepApnea
None	Insomnia	0.3889	0.5829	0.0282
None	None	0.5877	0.0021	0.4102
None	None	0.5877	0.0021	0.4102
Sleep Apnea	Insomnia	0.0916	0.8694	0.0390
Sleep Apnea	Insomnia	0.0916	0.8694	0.0390
Insomnia	Insomnia	0.0916	0.8694	0.0390
Insomnia	Insomnia	0.2198	0.6172	0.1630
None	None	0.9966	0.0004	0.0030
None	None	0.9966	0.0004	0.0030
None	None	0.9966	0.0004	0.0030

14.2.2 Confusion Matrix MLR

cm_mlr <- table(
  Aktual   = df_mlr_bal$Sleep.Disorder,
  Prediksi = mlr_pred_class
)

kable(cm_mlr,
      caption = "Confusion Matrix – MLR (Training Data)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE) %>%
  add_header_above(c(" " = 1, "Prediksi" = length(levels(df_mlr_bal$Sleep.Disorder))))

Confusion Matrix – MLR (Training Data)
	Prediksi
	None	Insomnia	Sleep Apnea
None	186	27	6
Insomnia	23	181	15
Sleep Apnea	13	10	196

n_total_mlr   <- sum(cm_mlr)
n_correct_mlr <- sum(diag(cm_mlr))
accuracy_mlr  <- n_correct_mlr / n_total_mlr
aper_mlr      <- 1 - accuracy_mlr

per_class_mlr <- lapply(levels(df_mlr_bal$Sleep.Disorder), function(k) {
  TP <- cm_mlr[k, k]
  FP <- sum(cm_mlr[, k]) - TP
  FN <- sum(cm_mlr[k, ]) - TP
  TN <- n_total_mlr - TP - FP - FN
  data.frame(
    Kelas       = k,
    TP = TP, FP = FP, FN = FN, TN = TN,
    Sensitivity = round(TP / (TP + FN), 4),
    Specificity = round(TN / (TN + FP), 4),
    Precision   = round(TP / (TP + FP), 4),
    F1_Score    = round(2 * TP / (2 * TP + FP + FN), 4)
  )
})

per_class_mlr_df <- bind_rows(per_class_mlr)

kable(per_class_mlr_df,
      caption   = "Metrik Evaluasi Per Kelas – MLR",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = TRUE)

Metrik Evaluasi Per Kelas – MLR
Kelas	TP	FP	FN	TN	Sensitivity	Specificity	Precision	F1_Score
None	186	36	33	402	0.8493	0.9178	0.8378	0.8435
Insomnia	181	37	38	401	0.8265	0.9155	0.8303	0.8284
Sleep Apnea	196	21	23	417	0.8950	0.9521	0.9032	0.8991

overall_mlr <- data.frame(
  Model       = "MLR",
  Accuracy    = paste0(round(accuracy_mlr * 100, 2), "%"),
  APER        = paste0(round(aper_mlr * 100, 2), "%"),
  N_Correct   = n_correct_mlr,
  N_Incorrect = n_total_mlr - n_correct_mlr,
  N_Total     = n_total_mlr
)

kable(overall_mlr,
      caption   = "Ringkasan Evaluasi Model MLR (Training Data)",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(1, color = "white",
           background = ifelse(accuracy_mlr >= 0.8, "#5cb85c", "#f0ad4e"))

Ringkasan Evaluasi Model MLR (Training Data)
Model	Accuracy	APER	N_Correct	N_Incorrect	N_Total
MLR	85.69%	14.31%	563	94	657

15 Interpretasi

15.1 Interpretation MLR

15.1.1 Log-Odds dan Odds Ratio

coef_mlr <- summary(mlr_model)$coefficients
se_mlr   <- summary(mlr_model)$standard.errors

z_mlr <- coef_mlr / se_mlr
p_mlr <- 2 * (1 - pnorm(abs(z_mlr)))

logodds_table <- as.data.frame(coef_mlr) %>%
  rownames_to_column("Kelas") %>%
  pivot_longer(
    cols = -Kelas,
    names_to = "Variabel",
    values_to = "Log_Odds"
  )

se_table <- as.data.frame(se_mlr) %>%
  rownames_to_column("Kelas") %>%
  pivot_longer(
    cols = -Kelas,
    names_to = "Variabel",
    values_to = "Std_Error"
  )

p_table <- as.data.frame(p_mlr) %>%
  rownames_to_column("Kelas") %>%
  pivot_longer(
    cols = -Kelas,
    names_to = "Variabel",
    values_to = "p_value"
  )

mlr_interpret_table <- logodds_table %>%
  left_join(se_table, by = c("Kelas", "Variabel")) %>%
  left_join(p_table, by = c("Kelas", "Variabel")) %>%
  mutate(
    Odds_Ratio = exp(Log_Odds),
    Signifikan = ifelse(p_value < 0.05, "Signifikan", "Tidak Signifikan"),
    Interpretasi_OR = case_when(
      Odds_Ratio > 1 ~ "Meningkatkan odds",
      Odds_Ratio < 1 ~ "Menurunkan odds",
      TRUE ~ "Tidak mengubah odds"
    )
  )

kable(mlr_interpret_table %>%
        mutate(across(where(is.numeric), ~ round(., 4))),
      caption = "Interpretasi Log-Odds dan Odds Ratio Model MLR",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"),
                full_width = TRUE) %>%
  scroll_box(width = "100%", height = "450px")

Interpretasi Log-Odds dan Odds Ratio Model MLR
Kelas	Variabel	Log_Odds	Std_Error	p_value	Odds_Ratio	Signifikan	Interpretasi_OR
Insomnia	(Intercept)	24.3302	0.0001	0.0000	3.685295e+10	Signifikan	Meningkatkan odds
Insomnia	Age	0.2796	0.0200	0.0000	1.322600e+00	Signifikan	Meningkatkan odds
Insomnia	Sleep.Duration	-3.9729	0.0015	0.0000	1.880000e-02	Signifikan	Menurunkan odds
Insomnia	Physical.Activity.Level	0.0728	0.0124	0.0000	1.075500e+00	Signifikan	Meningkatkan odds
Insomnia	Heart.Rate	-0.0724	0.0104	0.0000	9.302000e-01	Signifikan	Menurunkan odds
Insomnia	Daily.Steps	-0.0013	0.0002	0.0000	9.987000e-01	Signifikan	Menurunkan odds
Insomnia	log_Systolic	0.2475	0.0006	0.0000	1.280800e+00	Signifikan	Meningkatkan odds
Sleep Apnea	(Intercept)	-76.0818	0.0001	0.0000	0.000000e+00	Signifikan	Menurunkan odds
Sleep Apnea	Age	0.2722	0.0186	0.0000	1.312800e+00	Signifikan	Meningkatkan odds
Sleep Apnea	Sleep.Duration	0.5445	0.0012	0.0000	1.723700e+00	Signifikan	Meningkatkan odds
Sleep Apnea	Physical.Activity.Level	-0.0285	0.0126	0.0233	9.719000e-01	Signifikan	Menurunkan odds
Sleep Apnea	Heart.Rate	0.7355	0.0115	0.0000	2.086500e+00	Signifikan	Meningkatkan odds
Sleep Apnea	Daily.Steps	0.0007	0.0002	0.0000	1.000700e+00	Signifikan	Meningkatkan odds
Sleep Apnea	log_Systolic	1.7720	0.0003	0.0000	5.882400e+00	Signifikan	Meningkatkan odds

or_sig_table <- mlr_interpret_table %>%
  filter(Variabel != "(Intercept)", p_value < 0.05) %>%
  arrange(Kelas, desc(abs(Log_Odds)))

15.1.2 Plot Odds Ratio

or_plot_df <- mlr_interpret_table %>%
  filter(Variabel != "(Intercept)")

ggplot(or_plot_df,
       aes(x = reorder(Variabel, Odds_Ratio),
           y = Odds_Ratio,
           color = Signifikan)) +
  geom_point(size = 3) +
  geom_hline(yintercept = 1,
             linetype = "dashed",
             color = "red") +
  coord_flip() +
  facet_wrap(~ Kelas, scales = "free_y") +
  labs(title = "Plot Odds Ratio Model MLR",
       subtitle = "OR > 1 meningkatkan odds, OR < 1 menurunkan odds",
       x = "Variabel",
       y = "Odds Ratio",
       color = "Status") +
  theme_bw()

15.1.3 Marginal Effects

calc_marginal_effect <- function(model, data, var, delta = NULL) {
  data1 <- data
  data2 <- data
  
  if (is.null(delta)) {
    delta <- sd(data[[var]], na.rm = TRUE)
  }
  
  data2[[var]] <- data2[[var]] + delta
  
  p1 <- predict(model, newdata = data1, type = "probs")
  p2 <- predict(model, newdata = data2, type = "probs")
  
  p1 <- as.data.frame(p1)
  p2 <- as.data.frame(p2)
  
  me <- colMeans(p2 - p1, na.rm = TRUE)
  
  data.frame(
    Variabel = var,
    Delta = delta,
    Kelas = names(me),
    AME = as.numeric(me)
  )
}

mfx_summary <- bind_rows(lapply(pred_mlr, function(v) {
  calc_marginal_effect(mlr_model, df_mlr_bal, v)
}))

kable(mfx_summary %>%
        mutate(across(where(is.numeric), ~ round(., 4))),
      caption = "Average Marginal Effects Model MLR",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"),
                full_width = TRUE) %>%
  scroll_box(width = "100%", height = "450px")

Average Marginal Effects Model MLR
Variabel	Delta	Kelas	AME
Age	8.3229	None	-0.1564
Age	8.3229	Insomnia	0.0938
Age	8.3229	Sleep Apnea	0.0626
Sleep.Duration	0.7864	None	0.1290
Sleep.Duration	0.7864	Insomnia	-0.2718
Sleep.Duration	0.7864	Sleep Apnea	0.1428
Physical.Activity.Level	20.7731	None	-0.0634
Physical.Activity.Level	20.7731	Insomnia	0.2128
Physical.Activity.Level	20.7731	Sleep Apnea	-0.1494
Heart.Rate	3.9853	None	-0.1112
Heart.Rate	3.9853	Insomnia	-0.1671
Heart.Rate	3.9853	Sleep Apnea	0.2783
Daily.Steps	1801.9910	None	0.0450
Daily.Steps	1801.9910	Insomnia	-0.2429
Daily.Steps	1801.9910	Sleep Apnea	0.1979
log_Systolic	0.7780	None	-0.0569
log_Systolic	0.7780	Insomnia	-0.0537
log_Systolic	0.7780	Sleep Apnea	0.1106

15.1.4 Plot Marginal Effects

ggplot(mfx_summary,
       aes(x = reorder(Variabel, AME),
           y = AME,
           fill = Kelas)) +
  geom_col(position = "dodge", alpha = 0.85) +
  geom_hline(yintercept = 0,
             linetype = "dashed",
             color = "red") +
  coord_flip() +
  labs(title = "Plot Average Marginal Effects",
       subtitle = "AME menunjukkan perubahan probabilitas rata-rata ketika variabel naik sebesar 1 SD",
       x = "Variabel",
       y = "Average Marginal Effect",
       fill = "Kelas") +
  theme_bw()

15.1.5 Predicted Probability Plot

pred_prob <- as.data.frame(predict(mlr_model, newdata = df_mlr_bal, type = "probs"))

pred_prob$Aktual <- df_mlr_bal$Sleep.Disorder

pred_prob$Prediksi <- colnames(pred_prob)[
  max.col(pred_prob, ties.method = "first")
]

pred_prob_long <- pred_prob %>%
  pivot_longer(
    cols = all_of(levels(df_mlr_bal$Sleep.Disorder)),
    names_to = "Kelas",
    values_to = "Probabilitas"
  )

ggplot(pred_prob_long,
       aes(x = Kelas,
           y = Probabilitas,
           fill = Kelas)) +
  geom_boxplot(alpha = 0.8) +
  labs(title = "Distribusi Predicted Probability MLR",
       x = "Kelas",
       y = "Probabilitas Prediksi") +
  theme_bw() +
  theme(legend.position = "none")

15.2 Interpretation LDA

15.2.1 Interpretasi Fungsi Diskriminan LDA

lda_importance <- coef_df %>%
  pivot_longer(
    cols = starts_with("LD"),
    names_to = "Fungsi_Diskriminan",
    values_to = "Koefisien"
  ) %>%
  mutate(
    Abs_Koefisien = abs(Koefisien),
    Arah = case_when(
      Koefisien > 0 ~ "Positif",
      Koefisien < 0 ~ "Negatif",
      TRUE ~ "Netral"
    ),
    Interpretasi = case_when(
      Koefisien > 0 ~ "Semakin tinggi variabel, skor LD cenderung meningkat",
      Koefisien < 0 ~ "Semakin tinggi variabel, skor LD cenderung menurun",
      TRUE ~ "Tidak memberi kontribusi pada fungsi LD"
    )
  ) %>%
  arrange(Fungsi_Diskriminan, desc(Abs_Koefisien))

kable(lda_importance %>%
        mutate(across(where(is.numeric), ~ round(., 4))),
      caption = "Interpretasi Kontribusi Variabel terhadap Fungsi Diskriminan LDA",
      row.names = FALSE) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = TRUE
  )

Interpretasi Kontribusi Variabel terhadap Fungsi Diskriminan LDA
Variabel	Fungsi_Diskriminan	Koefisien	Abs_Koefisien	Arah	Interpretasi
PC2	LD1	0.9235	0.9235	Positif	Semakin tinggi variabel, skor LD cenderung meningkat
PC3	LD1	0.5667	0.5667	Positif	Semakin tinggi variabel, skor LD cenderung meningkat
PC1	LD1	-0.3134	0.3134	Negatif	Semakin tinggi variabel, skor LD cenderung menurun
PC3	LD2	0.8305	0.8305	Positif	Semakin tinggi variabel, skor LD cenderung meningkat
PC2	LD2	-0.2602	0.2602	Negatif	Semakin tinggi variabel, skor LD cenderung menurun
PC1	LD2	-0.1538	0.1538	Negatif	Semakin tinggi variabel, skor LD cenderung menurun

15.2.2 Skor LD dan Prediksi

lda_score_df <- as.data.frame(lda_pred$x)

lda_score_df <- lda_score_df %>%
  mutate(
    Aktual = df_balanced$Sleep.Disorder,
    Prediksi = lda_pred$class
  )

kable(head(lda_score_df, 10),
      caption = "10 Baris Pertama Skor LD dan Prediksi LDA",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

10 Baris Pertama Skor LD dan Prediksi LDA
LD1	LD2	Aktual	Prediksi
-1.077014	1.484504	None	Insomnia
-1.817252	-1.138942	None	None
-1.817252	-1.138942	None	None
1.423253	3.328795	Sleep Apnea	Insomnia
1.423253	3.328795	Sleep Apnea	Insomnia
1.423253	3.328795	Insomnia	Insomnia
1.078052	2.129825	Insomnia	Insomnia
-2.909839	-1.383003	None	None
-2.909839	-1.383003	None	None
-2.909839	-1.383003	None	None

15.2.3 Plot LD1 vs LD2

if ("LD2" %in% names(lda_score_df)) {
  ggplot(lda_score_df, aes(x = LD1, y = LD2, color = Aktual, shape = Prediksi)) +
    geom_point(alpha = 0.75, size = 2.5) +
    labs(
      title = "Visualisasi LDA: LD1 vs LD2",
      subtitle = "Pemisahan kelas berdasarkan fungsi diskriminan",
      x = "Linear Discriminant 1",
      y = "Linear Discriminant 2",
      color = "Kelas Aktual",
      shape = "Kelas Prediksi"
    ) +
    theme_bw()
} else {
  ggplot(lda_score_df, aes(x = LD1, fill = Aktual)) +
    geom_density(alpha = 0.45) +
    labs(
      title = "Visualisasi LDA: Distribusi LD1",
      subtitle = "Hanya satu fungsi diskriminan tersedia",
      x = "Linear Discriminant 1",
      y = "Density",
      fill = "Kelas Aktual"
    ) +
    theme_bw()
}

15.2.4 Plot Density LD1

ggplot(lda_score_df, aes(x = LD1, fill = Aktual)) +
  geom_density(alpha = 0.45) +
  labs(
    title = "Distribusi LD1 per Kelas Sleep Disorder",
    subtitle = "Overlap menunjukkan kelas yang sulit dipisahkan",
    x = "LD1",
    y = "Density",
    fill = "Kelas Aktual"
  ) +
  theme_bw()

15.2.5 Plot Kontribusi LDA

ggplot(lda_importance,
       aes(x = reorder(Variabel, Abs_Koefisien),
           y = Abs_Koefisien,
           fill = Fungsi_Diskriminan)) +
  geom_col(position = "dodge", alpha = 0.85) +
  coord_flip() +
  labs(
    title = "Besar Kontribusi Variabel pada Fungsi Diskriminan",
    subtitle = "Nilai absolut koefisien menunjukkan kekuatan kontribusi variabel",
    x = "Variabel",
    y = "|Koefisien|",
    fill = "Fungsi"
  ) +
  theme_bw()

15.2.6 Confusion Matrix Heatmap LDA

cm_lda_df <- as.data.frame(cm_lda)

ggplot(cm_lda_df, aes(x = Prediksi, y = Aktual, fill = Freq)) +
  geom_tile() +
  geom_text(aes(label = Freq), color = "white", size = 5) +
  scale_fill_gradient(low = "#56B1F7", high = "#132B43") +
  labs(
    title = "Confusion Matrix Heatmap - LDA",
    subtitle = "Diagonal menunjukkan prediksi yang benar",
    x = "Prediksi",
    y = "Aktual",
    fill = "Frekuensi"
  ) +
  theme_bw()

16 Perbandingan LDA vs MLR

compare_df <- data.frame(
  Metrik = c(
    "Accuracy (Training)",
    "APER (Training)",
    "Accuracy (LOOCV) – LDA",
    "APER (LOOCV) – LDA",
    "AIC",
    "BIC",
    "Log-Likelihood"
  ),
  LDA = c(
    paste0(round(accuracy_lda * 100, 2), "%"),
    paste0(round(aper_lda * 100, 2), "%"),
    paste0(round(acc_cv * 100, 2), "%"),
    paste0(round(aper_cv * 100, 2), "%"),
    "–",
    "–",
    "–"
  ),
  MLR = c(
    paste0(round(accuracy_mlr * 100, 2), "%"),
    paste0(round(aper_mlr * 100, 2), "%"),
    "–",
    "–",
    round(AIC(mlr_model), 4),
    round(BIC(mlr_model), 4),
    round(as.numeric(logLik(mlr_model)), 4)
  )
)

kable(compare_df,
      caption   = "Perbandingan Metrik Evaluasi – LDA vs MLR",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE) %>%
  column_spec(2, background = "#e8f4f8") %>%
  column_spec(3, background = "#fef9e7")

Perbandingan Metrik Evaluasi – LDA vs MLR
Metrik	LDA	MLR
Accuracy (Training)	84.78%	85.69%
APER (Training)	15.22%	14.31%
Accuracy (LOOCV) – LDA	84.78%	–
APER (LOOCV) – LDA	15.22%	–
AIC	–	644.8503
BIC	–	707.6779
Log-Likelihood	–	-308.4251

both_acc <- data.frame(
  Kelas           = levels(df_balanced$Sleep.Disorder),
  Sensitivity_LDA = per_class_lda_df$Sensitivity,
  Sensitivity_MLR = per_class_mlr_df$Sensitivity,
  F1_LDA          = per_class_lda_df$F1_Score,
  F1_MLR          = per_class_mlr_df$F1_Score
)

kable(both_acc,
      caption   = "Perbandingan Sensitivity & F1-Score per Kelas – LDA vs MLR",
      row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Perbandingan Sensitivity & F1-Score per Kelas – LDA vs MLR
Kelas	Sensitivity_LDA	Sensitivity_MLR	F1_LDA	F1_MLR
Insomnia	0.9315	0.8493	0.8259	0.8435
None	0.8082	0.8265	0.8469	0.8284
Sleep Apnea	0.8037	0.8950	0.8756	0.8991

cat("=== KESIMPULAN PEMILIHAN MODEL ===\n\n")

## === KESIMPULAN PEMILIHAN MODEL ===

cat("LDA Accuracy (Resubstitution):", round(accuracy_lda * 100, 2), "%\n")

## LDA Accuracy (Resubstitution): 84.78 %

cat("LDA Accuracy (LOOCV)         :", round(acc_cv * 100, 2), "%\n")

## LDA Accuracy (LOOCV)         : 84.78 %

cat("MLR Accuracy (Training)      :", round(accuracy_mlr * 100, 2), "%\n")

## MLR Accuracy (Training)      : 85.69 %

cat("MLR AIC                      :", round(AIC(mlr_model), 2), "\n")

## MLR AIC                      : 644.85

cat("MLR BIC                      :", round(BIC(mlr_model), 2), "\n\n")

## MLR BIC                      : 707.68

if (acc_cv >= accuracy_mlr) {
  cat("→ LDA (LOOCV) menghasilkan akurasi lebih tinggi atau setara dengan MLR.\n")
  cat("  LDA lebih disarankan jika asumsi normalitas multivariat mendekati terpenuhi.\n")
} else {
  cat("→ MLR menghasilkan akurasi lebih tinggi dari LDA (LOOCV).\n")
  cat("  MLR lebih disarankan karena tidak memerlukan asumsi distribusi pada prediktor.\n")
}

## → MLR menghasilkan akurasi lebih tinggi dari LDA (LOOCV).
##   MLR lebih disarankan karena tidak memerlukan asumsi distribusi pada prediktor.

Analisis Klasifikasi: Data Preparation, EDA, LDA & MLR

Sleep Health and Lifestyle Dataset

KELOMPOK 1 – ANALISIS MULTIVARIAT Tamaela Nurandyapasa (24031554044) Niha Nur Mayla Putri (24031554057) Rahmawati Dian Pratiwi (24031554183)

05 May 2026