1. Persiapan Dataset

1.1 Instalasi & Pemuatan Paket

library(MASS)
library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:MASS':
## 
##     select

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(caret)

## Warning: package 'caret' was built under R version 4.5.3

## Loading required package: lattice

library(brant)

## Warning: package 'brant' was built under R version 4.5.3

library(tidyr)
library(scales)
library(knitr)
library(kableExtra)

## 
## Attaching package: 'kableExtra'

## The following object is masked from 'package:dplyr':
## 
##     group_rows

library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

1.2 Import Dataset

df <- read.csv("C:/Users/ASUS/Downloads/Sleep_health_and_lifestyle_dataset.csv", stringsAsFactors = FALSE)
cat("Dimensi dataset:", nrow(df), "baris x", ncol(df), "kolom\n")

## Dimensi dataset: 374 baris x 13 kolom

2. Pemeriksaan Struktur Data

str(df)

## 'data.frame':    374 obs. of  13 variables:
##  $ Person.ID              : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Gender                 : chr  "Male" "Male" "Male" "Male" ...
##  $ Age                    : int  27 28 28 28 28 28 29 29 29 29 ...
##  $ Occupation             : chr  "Software Engineer" "Doctor" "Doctor" "Sales Representative" ...
##  $ Sleep.Duration         : num  6.1 6.2 6.2 5.9 5.9 5.9 6.3 7.8 7.8 7.8 ...
##  $ Quality.of.Sleep       : int  6 6 6 4 4 4 6 7 7 7 ...
##  $ Physical.Activity.Level: int  42 60 60 30 30 30 40 75 75 75 ...
##  $ Stress.Level           : int  6 8 8 8 8 8 7 6 6 6 ...
##  $ BMI.Category           : chr  "Overweight" "Normal" "Normal" "Obese" ...
##  $ Blood.Pressure         : chr  "126/83" "125/80" "125/80" "140/90" ...
##  $ Heart.Rate             : int  77 75 75 85 85 85 82 70 70 70 ...
##  $ Daily.Steps            : int  4200 10000 10000 3000 3000 3000 3500 8000 8000 8000 ...
##  $ Sleep.Disorder         : chr  "None" "None" "None" "Sleep Apnea" ...

kable(head(df, 6), caption = "Enam Baris Pertama Dataset") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = FALSE)

Enam Baris Pertama Dataset
Person.ID	Gender	Age	Occupation	Sleep.Duration	Quality.of.Sleep	Physical.Activity.Level	Stress.Level	BMI.Category	Blood.Pressure	Heart.Rate	Daily.Steps	Sleep.Disorder
1	Male	27	Software Engineer	6.1	6	42	6	Overweight	126/83	77	4200	None
2	Male	28	Doctor	6.2	6	60	8	Normal	125/80	75	10000	None
3	Male	28	Doctor	6.2	6	60	8	Normal	125/80	75	10000	None
4	Male	28	Sales Representative	5.9	4	30	8	Obese	140/90	85	3000	Sleep Apnea
5	Male	28	Sales Representative	5.9	4	30	8	Obese	140/90	85	3000	Sleep Apnea
6	Male	28	Software Engineer	5.9	4	30	8	Obese	140/90	85	3000	Insomnia

3. Ringkasan Statistik Deskriptif

summary(df)

##    Person.ID         Gender               Age         Occupation       
##  Min.   :  1.00   Length:374         Min.   :27.00   Length:374        
##  1st Qu.: 94.25   Class :character   1st Qu.:35.25   Class :character  
##  Median :187.50   Mode  :character   Median :43.00   Mode  :character  
##  Mean   :187.50                      Mean   :42.18                     
##  3rd Qu.:280.75                      3rd Qu.:50.00                     
##  Max.   :374.00                      Max.   :59.00                     
##  Sleep.Duration  Quality.of.Sleep Physical.Activity.Level  Stress.Level  
##  Min.   :5.800   Min.   :4.000    Min.   :30.00           Min.   :3.000  
##  1st Qu.:6.400   1st Qu.:6.000    1st Qu.:45.00           1st Qu.:4.000  
##  Median :7.200   Median :7.000    Median :60.00           Median :5.000  
##  Mean   :7.132   Mean   :7.313    Mean   :59.17           Mean   :5.385  
##  3rd Qu.:7.800   3rd Qu.:8.000    3rd Qu.:75.00           3rd Qu.:7.000  
##  Max.   :8.500   Max.   :9.000    Max.   :90.00           Max.   :8.000  
##  BMI.Category       Blood.Pressure       Heart.Rate     Daily.Steps   
##  Length:374         Length:374         Min.   :65.00   Min.   : 3000  
##  Class :character   Class :character   1st Qu.:68.00   1st Qu.: 5600  
##  Mode  :character   Mode  :character   Median :70.00   Median : 7000  
##                                        Mean   :70.17   Mean   : 6817  
##                                        3rd Qu.:72.00   3rd Qu.: 8000  
##                                        Max.   :86.00   Max.   :10000  
##  Sleep.Disorder    
##  Length:374        
##  Class :character  
##  Mode  :character  
##                    
##                    
##

num_cols <- df %>% select(where(is.numeric))
ringkasan_df <- data.frame(
  Variabel = names(num_cols),
  Min      = sapply(num_cols, min, na.rm = TRUE),
  Q1       = sapply(num_cols, quantile, 0.25, na.rm = TRUE),
  Median   = sapply(num_cols, median, na.rm = TRUE),
  Mean     = round(sapply(num_cols, mean, na.rm = TRUE), 2),
  Q3       = sapply(num_cols, quantile, 0.75, na.rm = TRUE),
  Max      = sapply(num_cols, max, na.rm = TRUE)
)
kable(ringkasan_df, caption = "Ringkasan Statistik Variabel Numerik", row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)

Ringkasan Statistik Variabel Numerik
Variabel	Min	Q1	Median	Mean	Q3	Max
Person.ID	1.0	94.25	187.5	187.50	280.75	374.0
Age	27.0	35.25	43.0	42.18	50.00	59.0
Sleep.Duration	5.8	6.40	7.2	7.13	7.80	8.5
Quality.of.Sleep	4.0	6.00	7.0	7.31	8.00	9.0
Physical.Activity.Level	30.0	45.00	60.0	59.17	75.00	90.0
Stress.Level	3.0	4.00	5.0	5.39	7.00	8.0
Heart.Rate	65.0	68.00	70.0	70.17	72.00	86.0
Daily.Steps	3000.0	5600.00	7000.0	6816.84	8000.00	10000.0

4. Pemeriksaan Nilai Hilang

missing_df <- data.frame(
  Kolom          = names(df),
  Jumlah_Missing = sapply(df, function(x) sum(is.na(x))),
  Persentase     = round(sapply(df, function(x) mean(is.na(x)) * 100), 2)
)
kable(missing_df, caption = "Nilai Hilang per Kolom", row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) %>%
  row_spec(which(missing_df$Jumlah_Missing > 0), background = "#fff3cd")

Nilai Hilang per Kolom
Kolom	Jumlah_Missing	Persentase
Person.ID	0	0
Gender	0	0
Age	0	0
Occupation	0	0
Sleep.Duration	0	0
Quality.of.Sleep	0	0
Physical.Activity.Level	0	0
Stress.Level	0	0
BMI.Category	0	0
Blood.Pressure	0	0
Heart.Rate	0	0
Daily.Steps	0	0
Sleep.Disorder	0	0

ggplot(missing_df, aes(x = reorder(Kolom, Jumlah_Missing),
                        y = Jumlah_Missing, fill = Jumlah_Missing > 0)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  scale_fill_manual(values = c("steelblue","tomato")) +
  labs(title = "Jumlah Nilai Hilang per Kolom", x = "Kolom", y = "Jumlah Missing") +
  theme_minimal(base_size = 12)

5. Distribusi Variabel Target

5.1 Pembuatan Variabel Target Ordinal

Variabel Quality.of.Sleep (skala 4–9) dikonversi menjadi 3 kategori ordinal: - Rendah: skor ≤ 5 | Sedang: skor 6–7 | Tinggi: skor ≥ 8

df <- df %>%
  mutate(
    Kualitas_Tidur = case_when(
      Quality.of.Sleep <= 5 ~ "Rendah",
      Quality.of.Sleep <= 7 ~ "Sedang",
      TRUE                  ~ "Tinggi"
    ),
    Kualitas_Tidur = factor(Kualitas_Tidur,
                            levels  = c("Rendah","Sedang","Tinggi"),
                            ordered = TRUE)
  )

cat("Distribusi Kualitas Tidur:\n"); print(table(df$Kualitas_Tidur))

## Distribusi Kualitas Tidur:

## 
## Rendah Sedang Tinggi 
##     12    182    180

cat("\nProporsi (%):\n"); print(round(prop.table(table(df$Kualitas_Tidur)) * 100, 2))

## 
## Proporsi (%):

## 
## Rendah Sedang Tinggi 
##   3.21  48.66  48.13

5.2 Visualisasi Distribusi Target

freq_target <- as.data.frame(table(df$Kualitas_Tidur)) %>%
  rename(Kategori = Var1, Frekuensi = Freq) %>%
  mutate(Persen = round(Frekuensi / sum(Frekuensi) * 100, 1))

ggplot(freq_target, aes(x = Kategori, y = Frekuensi, fill = Kategori)) +
  geom_col(width = 0.6, show.legend = FALSE) +
  geom_text(aes(label = paste0(Frekuensi, "\n(", Persen, "%)")), vjust = -0.3, size = 4.5) +
  scale_fill_manual(values = c("Rendah"="#e74c3c","Sedang"="#f39c12","Tinggi"="#2ecc71")) +
  labs(title    = "Distribusi Kualitas Tidur (Variabel Target Ordinal)",
       subtitle = "Sleep Health & Lifestyle Dataset",
       x = "Kategori Kualitas Tidur", y = "Frekuensi") +
  theme_minimal(base_size = 13) +
  theme(plot.title = element_text(face = "bold"))

6. Visualisasi Distribusi Prediktor

6.1 Variabel Numerik

numerik_long <- df %>%
  select(Age, Sleep.Duration, Physical.Activity.Level,
         Stress.Level, Heart.Rate, Daily.Steps) %>%
  pivot_longer(everything(), names_to = "Variabel", values_to = "Nilai")

ggplot(numerik_long, aes(x = Nilai, fill = Variabel)) +
  geom_histogram(bins = 20, color = "white", alpha = 0.85, show.legend = FALSE) +
  facet_wrap(~ Variabel, scales = "free", ncol = 3) +
  scale_fill_brewer(palette = "Set2") +
  labs(title = "Distribusi Variabel Numerik Prediktor", x = "Nilai", y = "Frekuensi") +
  theme_minimal(base_size = 11) +
  theme(plot.title = element_text(face = "bold"))

6.2 Variabel Kategorik vs Target

p1 <- ggplot(df, aes(x = BMI.Category, fill = Kualitas_Tidur)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = percent_format()) +
  scale_fill_manual(values = c("Rendah"="#e74c3c","Sedang"="#f39c12","Tinggi"="#2ecc71")) +
  labs(title = "BMI vs Kualitas Tidur", x = "BMI Category",
       y = "Proporsi", fill = "Kualitas Tidur") +
  theme_minimal(base_size = 11) +
  theme(axis.text.x = element_text(angle = 20, hjust = 1))

p2 <- ggplot(df, aes(x = Gender, fill = Kualitas_Tidur)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = percent_format()) +
  scale_fill_manual(values = c("Rendah"="#e74c3c","Sedang"="#f39c12","Tinggi"="#2ecc71")) +
  labs(title = "Gender vs Kualitas Tidur", x = "Gender",
       y = "Proporsi", fill = "Kualitas Tidur") +
  theme_minimal(base_size = 11)

grid.arrange(p1, p2, ncol = 2)

7. Membangun Model Regresi Logistik Ordinal

7.1 Validasi Kolom & Persiapan Data

kolom_penting <- c("Age","Sleep.Duration","Physical.Activity.Level",
                   "Stress.Level","Heart.Rate","Daily.Steps",
                   "BMI.Category","Gender","Kualitas_Tidur")

cat("Pengecekan kolom penting:\n")

## Pengecekan kolom penting:

for (kol in kolom_penting) {
  ada <- kol %in% names(df)
  cat(sprintf("  %-35s : %s\n", kol, ifelse(ada, "Ada", "Tidak Ada")))
}

##   Age                                 : Ada
##   Sleep.Duration                      : Ada
##   Physical.Activity.Level             : Ada
##   Stress.Level                        : Ada
##   Heart.Rate                          : Ada
##   Daily.Steps                         : Ada
##   BMI.Category                        : Ada
##   Gender                              : Ada
##   Kualitas_Tidur                      : Ada

df_model <- df %>%
  select(all_of(kolom_penting)) %>%
  mutate(BMI.Category = as.factor(BMI.Category), Gender = as.factor(Gender)) %>%
  drop_na()

cat("\nJumlah observasi untuk pemodelan:", nrow(df_model), "\n")

## 
## Jumlah observasi untuk pemodelan: 374

7.2 Split Data 80:20

set.seed(42)
idx_train  <- createDataPartition(df_model$Kualitas_Tidur, p = 0.8, list = FALSE)
data_train <- df_model[ idx_train, ]
data_test  <- df_model[-idx_train, ]

cat("Training:", nrow(data_train), "| Testing:", nrow(data_test), "\n")

## Training: 300 | Testing: 74

7.3 Scaling & Fit Model `polr()`

Catatan pemilihan prediktor: Berdasarkan pemeriksaan tabel silang, variabel Sleep.Duration dan Stress.Level mengalami perfect separation terhadap variabel target — artinya nilai-nilai tertentu dari kedua variabel ini secara sempurna memisahkan kategori Kualitas Tidur tanpa tumpang tindih. Kondisi ini menyebabkan koefisien divergen ke tak hingga sehingga algoritma optimasi polr() gagal konvergen. Oleh karena itu, model dibangun menggunakan 4 prediktor yang stabil: Age, Physical.Activity.Level, Heart.Rate, dan Daily.Steps. BMI.Category juga dikeluarkan karena quasi-complete separation (hanya 1 observasi pada kombinasi Tidak_Normal + Sedang).

cols_scale <- c("Age", "Sleep.Duration", "Physical.Activity.Level",
                "Stress.Level", "Heart.Rate", "Daily.Steps")

# Hitung mean & SD dari data training
means <- sapply(data_train[, cols_scale], mean, na.rm = TRUE)
sds   <- sapply(data_train[, cols_scale], sd,   na.rm = TRUE)

cat("=== SDs ===\n"); print(sds)

## === SDs ===

##                     Age          Sleep.Duration Physical.Activity.Level 
##               8.6087217               0.7948976              20.5125516 
##            Stress.Level              Heart.Rate             Daily.Steps 
##               1.7689884               4.2942120            1639.4276335

# Hapus kolom dengan SD = 0
cols_scale <- cols_scale[sds > 0]
means      <- means[cols_scale]
sds        <- sds[cols_scale]

# Scale manual
data_train_scaled <- data_train
data_test_scaled  <- data_test

for (col in cols_scale) {
  data_train_scaled[[col]] <- (data_train[[col]] - means[col]) / sds[col]
  data_test_scaled[[col]]  <- (data_test[[col]]  - means[col]) / sds[col]
}

# Buang baris dengan nilai tidak finite
data_train_scaled <- data_train_scaled %>%
  filter(if_all(where(is.numeric), ~ is.finite(.x)))
data_test_scaled <- data_test_scaled %>%
  filter(if_all(where(is.numeric), ~ is.finite(.x)))

cat("Train rows setelah cleaning:", nrow(data_train_scaled), "\n")

## Train rows setelah cleaning: 300

cat("Test rows setelah cleaning:",  nrow(data_test_scaled),  "\n")

## Test rows setelah cleaning: 74

# Pastikan faktor tidak memiliki level kosong
data_train_scaled$BMI.Category <- droplevels(data_train_scaled$BMI.Category)
data_train_scaled$Gender       <- droplevels(data_train_scaled$Gender)
data_test_scaled$BMI.Category  <- droplevels(data_test_scaled$BMI.Category)
data_test_scaled$Gender        <- droplevels(data_test_scaled$Gender)

# Fit model dengan 4 prediktor stabil
# Sleep.Duration & Stress.Level dikeluarkan karena perfect separation
# BMI.Category dikeluarkan karena quasi-complete separation
model_olr <- polr(
  Kualitas_Tidur ~ Age + Physical.Activity.Level + Heart.Rate + Daily.Steps,
  data   = data_train_scaled,
  Hess   = TRUE,
  method = "logistic"
)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

summary(model_olr)

## Call:
## polr(formula = Kualitas_Tidur ~ Age + Physical.Activity.Level + 
##     Heart.Rate + Daily.Steps, data = data_train_scaled, Hess = TRUE, 
##     method = "logistic")
## 
## Coefficients:
##                           Value Std. Error t value
## Age                      0.5726     0.1747   3.277
## Physical.Activity.Level  2.0255     0.3228   6.276
## Heart.Rate              -2.2097     0.2383  -9.274
## Daily.Steps             -1.0344     0.2767  -3.739
## 
## Intercepts:
##               Value   Std. Error t value
## Rendah|Sedang -6.3409  0.6909    -9.1781
## Sedang|Tinggi  0.3889  0.1764     2.2040
## 
## Residual Deviance: 283.6826 
## AIC: 295.6826

8. Uji Signifikansi Koefisien (p-value)

ctable <- coef(summary(model_olr))
p_val  <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2
hasil_koef <- cbind(ctable, `p-value` = round(p_val, 4))

kable(as.data.frame(hasil_koef), caption = "Koefisien Model dengan p-value", digits = 4) %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) %>%
  row_spec(which(p_val < 0.05), background = "#d4edda")

Koefisien Model dengan p-value
	Value	Std. Error	t value	p-value
Age	0.5726	0.1747	3.2775	0.0010
Physical.Activity.Level	2.0255	0.3228	6.2757	0.0000
Heart.Rate	-2.2097	0.2383	-9.2741	0.0000
Daily.Steps	-1.0344	0.2767	-3.7388	0.0002
Rendah\|Sedang	-6.3409	0.6909	-9.1781	0.0000
Sedang\|Tinggi	0.3889	0.1764	2.2040	0.0275

# Pisahkan koefisien prediktor dari intercept
all_coef <- coef(summary(model_olr))
# Ambil hanya baris prediktor (bukan intercept Rendah|Sedang & Sedang|Tinggi)
pred_names <- names(coef(model_olr))  # hanya 4 prediktor

coef_vals <- coef(model_olr)                          # 4 nilai
se_vals   <- sqrt(diag(vcov(model_olr))[pred_names])  # 4 SE yang sesuai

or_wald <- data.frame(
  OR       = round(exp(coef_vals), 4),
  CI_Lower = round(exp(coef_vals - 1.96 * se_vals), 4),
  CI_Upper = round(exp(coef_vals + 1.96 * se_vals), 4)
)

kable(or_wald, caption = "Odds Ratio & CI 95% (Metode Wald)") %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)

Odds Ratio & CI 95% (Metode Wald)
	OR	CI_Lower	CI_Upper
Age	1.7728	1.2588	2.4967
Physical.Activity.Level	7.5801	4.0266	14.2698
Heart.Rate	0.1097	0.0688	0.1750
Daily.Steps	0.3555	0.2067	0.6113

9. Uji Asumsi Proportional Odds (Brant Test)

# H0: asumsi proportional odds terpenuhi (p-value > 0.05 = OK)
tryCatch(
  print(brant(model_olr)),
  error   = function(e) cat("Brant test error:", conditionMessage(e), "\n"),
  warning = function(w) cat("Brant test warning:", conditionMessage(w), "\n")
)

## Brant test warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

10. Prediksi & Evaluasi Model

# Gunakan data_test_scaled
pred_kelas <- predict(model_olr, newdata = data_test_scaled, type = "class")
pred_prob  <- predict(model_olr, newdata = data_test_scaled, type = "probs")

hasil_pred <- data.frame(
  Aktual      = data_test_scaled$Kualitas_Tidur,
  Prediksi    = pred_kelas,
  Prob_Rendah = round(pred_prob[, "Rendah"], 3),
  Prob_Sedang = round(pred_prob[, "Sedang"], 3),
  Prob_Tinggi = round(pred_prob[, "Tinggi"], 3)
)

kable(head(hasil_pred, 10), caption = "10 Hasil Prediksi Pertama", row.names = FALSE) %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)

10 Hasil Prediksi Pertama
Aktual	Prediksi	Prob_Rendah	Prob_Sedang	Prob_Tinggi
Sedang	Sedang	0.002	0.584	0.414
Sedang	Sedang	0.126	0.866	0.008
Sedang	Sedang	0.126	0.866	0.008
Sedang	Sedang	0.126	0.866	0.008
Sedang	Sedang	0.002	0.568	0.430
Sedang	Sedang	0.002	0.568	0.430
Rendah	Sedang	0.301	0.696	0.003
Sedang	Sedang	0.050	0.928	0.022
Sedang	Sedang	0.001	0.552	0.447
Sedang	Sedang	0.001	0.535	0.463

cm <- confusionMatrix(pred_kelas, data_test_scaled$Kualitas_Tidur)
print(cm)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Rendah Sedang Tinggi
##     Rendah      0      0      0
##     Sedang      2     30      7
##     Tinggi      0      6     29
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7973          
##                  95% CI : (0.6878, 0.8819)
##     No Information Rate : 0.4865          
##     P-Value [Acc > NIR] : 3.721e-08       
##                                           
##                   Kappa : 0.6053          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Rendah Class: Sedang Class: Tinggi
## Sensitivity                0.00000        0.8333        0.8056
## Specificity                1.00000        0.7632        0.8421
## Pos Pred Value                 NaN        0.7692        0.8286
## Neg Pred Value             0.97297        0.8286        0.8205
## Prevalence                 0.02703        0.4865        0.4865
## Detection Rate             0.00000        0.4054        0.3919
## Detection Prevalence       0.00000        0.5270        0.4730
## Balanced Accuracy          0.50000        0.7982        0.8238

11. Visualisasi Confusion Matrix

cm_tabel <- as.data.frame(cm$table) %>%
  rename(Prediksi = Prediction, Aktual = Reference, Frekuensi = Freq)

ggplot(cm_tabel, aes(x = Aktual, y = Prediksi, fill = Frekuensi)) +
  geom_tile(color = "white", linewidth = 0.8) +
  geom_text(aes(label = Frekuensi), size = 7, fontface = "bold", color = "white") +
  scale_fill_gradient(low = "#aed6f1", high = "#1a5276") +
  labs(title    = "Confusion Matrix - Regresi Logistik Ordinal",
       subtitle = "Prediksi vs Aktual pada Data Testing",
       x = "Kategori Aktual", y = "Kategori Prediksi", fill = "Frekuensi") +
  theme_minimal(base_size = 13) +
  theme(plot.title = element_text(face = "bold"))

12. Akurasi & Metrik Evaluasi

akurasi <- cm$overall["Accuracy"]
kappa   <- cm$overall["Kappa"]
cat(sprintf("Akurasi Model : %.4f (%.2f%%)\n", akurasi, akurasi * 100))

## Akurasi Model : 0.7973 (79.73%)

cat(sprintf("Cohen's Kappa : %.4f\n", kappa))

## Cohen's Kappa : 0.6053

metrik_kelas <- as.data.frame(cm$byClass) %>%
  mutate(across(where(is.numeric), ~ round(.x, 4)))

kable(metrik_kelas, caption = "Metrik Evaluasi per Kelas") %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)

Metrik Evaluasi per Kelas
	Sensitivity	Specificity	Pos Pred Value	Neg Pred Value	Precision	Recall	F1	Prevalence	Detection Rate	Detection Prevalence	Balanced Accuracy
Class: Rendah	0.0000	1.0000	NaN	0.9730	NA	0.0000	NA	0.0270	0.0000	0.000	0.5000
Class: Sedang	0.8333	0.7632	0.7692	0.8286	0.7692	0.8333	0.8000	0.4865	0.4054	0.527	0.7982
Class: Tinggi	0.8056	0.8421	0.8286	0.8205	0.8286	0.8056	0.8169	0.4865	0.3919	0.473	0.8238

metrik_plot <- data.frame(
  Kelas       = rownames(cm$byClass),
  Sensitivity = cm$byClass[, "Sensitivity"],
  Specificity = cm$byClass[, "Specificity"],
  F1_Score    = cm$byClass[, "F1"]
) %>% pivot_longer(-Kelas, names_to = "Metrik", values_to = "Nilai")

ggplot(metrik_plot, aes(x = Kelas, y = Nilai, fill = Metrik)) +
  geom_col(position = "dodge", width = 0.65) +
  geom_text(aes(label = round(Nilai, 2)),
            position = position_dodge(0.65), vjust = -0.4, size = 3.5) +
  scale_fill_brewer(palette = "Set2") +
  scale_y_continuous(limits = c(0, 1.1), labels = percent_format()) +
  labs(title    = "Metrik Evaluasi per Kelas",
       subtitle = "Sensitivity, Specificity, dan F1-Score",
       x = "Kelas", y = "Nilai", fill = "Metrik") +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(face = "bold"))

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_col()`).

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_text()`).

13. Kesimpulan

## === RINGKASAN HASIL ANALISIS ===
## Dataset  : Sleep Health & Lifestyle Dataset
## Target   : Kualitas Tidur (Ordinal: Rendah < Sedang < Tinggi)
## Metode   : Proportional Odds Logistic Regression (POLR)
## Paket R  : MASS::polr()
## 
## Prediktor dalam Model (4 variabel):
##   Age, Physical.Activity.Level, Heart.Rate, Daily.Steps
## 
## Prediktor Dikeluarkan:
##   Sleep.Duration  -> perfect separation
##   Stress.Level    -> perfect separation
##   BMI.Category    -> quasi-complete separation
## 
## Ukuran Data
##   Training : 300 observasi (80%)
##   Testing  : 74 observasi (20%)
## 
## Hasil Evaluasi
##   Akurasi       : 79.73%
##   Cohen's Kappa : 0.6053

Dataset: Sleep Health and Lifestyle - Kaggle

Regresi Logistik Ordinal: Sleep Health & Lifestyle Dataset

2026-04-30

1. Persiapan Dataset

1.1 Instalasi & Pemuatan Paket

1.2 Import Dataset

2. Pemeriksaan Struktur Data

3. Ringkasan Statistik Deskriptif

4. Pemeriksaan Nilai Hilang

5. Distribusi Variabel Target

5.1 Pembuatan Variabel Target Ordinal

5.2 Visualisasi Distribusi Target

6. Visualisasi Distribusi Prediktor

6.1 Variabel Numerik

6.2 Variabel Kategorik vs Target

7. Membangun Model Regresi Logistik Ordinal

7.1 Validasi Kolom & Persiapan Data

7.2 Split Data 80:20

7.3 Scaling & Fit Model `polr()`

8. Uji Signifikansi Koefisien (p-value)

9. Uji Asumsi Proportional Odds (Brant Test)

10. Prediksi & Evaluasi Model

11. Visualisasi Confusion Matrix

12. Akurasi & Metrik Evaluasi

13. Kesimpulan

Regresi Logistik Ordinal: Sleep Health & Lifestyle Dataset

2026-04-30

1. Persiapan Dataset

1.1 Instalasi & Pemuatan Paket

1.2 Import Dataset

2. Pemeriksaan Struktur Data

3. Ringkasan Statistik Deskriptif

4. Pemeriksaan Nilai Hilang

5. Distribusi Variabel Target

5.1 Pembuatan Variabel Target Ordinal

5.2 Visualisasi Distribusi Target

6. Visualisasi Distribusi Prediktor

6.1 Variabel Numerik

6.2 Variabel Kategorik vs Target

7. Membangun Model Regresi Logistik Ordinal

7.1 Validasi Kolom & Persiapan Data

7.2 Split Data 80:20

7.3 Scaling & Fit Model polr()

8. Uji Signifikansi Koefisien (p-value)

9. Uji Asumsi Proportional Odds (Brant Test)

10. Prediksi & Evaluasi Model

11. Visualisasi Confusion Matrix

12. Akurasi & Metrik Evaluasi

13. Kesimpulan

7.3 Scaling & Fit Model `polr()`