1. Persiapan Dataset
1.1 Instalasi & Pemuatan Paket
library(MASS)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Warning: package 'caret' was built under R version 4.5.3
## Loading required package: lattice
library(brant)
## Warning: package 'brant' was built under R version 4.5.3
library(tidyr)
library(scales)
library(knitr)
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
1.2 Import Dataset
df <- read.csv("C:/Users/ASUS/Downloads/Sleep_health_and_lifestyle_dataset.csv", stringsAsFactors = FALSE)
cat("Dimensi dataset:", nrow(df), "baris x", ncol(df), "kolom\n")
## Dimensi dataset: 374 baris x 13 kolom
2. Pemeriksaan Struktur Data
str(df)
## 'data.frame': 374 obs. of 13 variables:
## $ Person.ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Gender : chr "Male" "Male" "Male" "Male" ...
## $ Age : int 27 28 28 28 28 28 29 29 29 29 ...
## $ Occupation : chr "Software Engineer" "Doctor" "Doctor" "Sales Representative" ...
## $ Sleep.Duration : num 6.1 6.2 6.2 5.9 5.9 5.9 6.3 7.8 7.8 7.8 ...
## $ Quality.of.Sleep : int 6 6 6 4 4 4 6 7 7 7 ...
## $ Physical.Activity.Level: int 42 60 60 30 30 30 40 75 75 75 ...
## $ Stress.Level : int 6 8 8 8 8 8 7 6 6 6 ...
## $ BMI.Category : chr "Overweight" "Normal" "Normal" "Obese" ...
## $ Blood.Pressure : chr "126/83" "125/80" "125/80" "140/90" ...
## $ Heart.Rate : int 77 75 75 85 85 85 82 70 70 70 ...
## $ Daily.Steps : int 4200 10000 10000 3000 3000 3000 3500 8000 8000 8000 ...
## $ Sleep.Disorder : chr "None" "None" "None" "Sleep Apnea" ...
kable(head(df, 6), caption = "Enam Baris Pertama Dataset") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"), full_width = FALSE)
Enam Baris Pertama Dataset
|
Person.ID
|
Gender
|
Age
|
Occupation
|
Sleep.Duration
|
Quality.of.Sleep
|
Physical.Activity.Level
|
Stress.Level
|
BMI.Category
|
Blood.Pressure
|
Heart.Rate
|
Daily.Steps
|
Sleep.Disorder
|
|
1
|
Male
|
27
|
Software Engineer
|
6.1
|
6
|
42
|
6
|
Overweight
|
126/83
|
77
|
4200
|
None
|
|
2
|
Male
|
28
|
Doctor
|
6.2
|
6
|
60
|
8
|
Normal
|
125/80
|
75
|
10000
|
None
|
|
3
|
Male
|
28
|
Doctor
|
6.2
|
6
|
60
|
8
|
Normal
|
125/80
|
75
|
10000
|
None
|
|
4
|
Male
|
28
|
Sales Representative
|
5.9
|
4
|
30
|
8
|
Obese
|
140/90
|
85
|
3000
|
Sleep Apnea
|
|
5
|
Male
|
28
|
Sales Representative
|
5.9
|
4
|
30
|
8
|
Obese
|
140/90
|
85
|
3000
|
Sleep Apnea
|
|
6
|
Male
|
28
|
Software Engineer
|
5.9
|
4
|
30
|
8
|
Obese
|
140/90
|
85
|
3000
|
Insomnia
|
3. Ringkasan Statistik Deskriptif
summary(df)
## Person.ID Gender Age Occupation
## Min. : 1.00 Length:374 Min. :27.00 Length:374
## 1st Qu.: 94.25 Class :character 1st Qu.:35.25 Class :character
## Median :187.50 Mode :character Median :43.00 Mode :character
## Mean :187.50 Mean :42.18
## 3rd Qu.:280.75 3rd Qu.:50.00
## Max. :374.00 Max. :59.00
## Sleep.Duration Quality.of.Sleep Physical.Activity.Level Stress.Level
## Min. :5.800 Min. :4.000 Min. :30.00 Min. :3.000
## 1st Qu.:6.400 1st Qu.:6.000 1st Qu.:45.00 1st Qu.:4.000
## Median :7.200 Median :7.000 Median :60.00 Median :5.000
## Mean :7.132 Mean :7.313 Mean :59.17 Mean :5.385
## 3rd Qu.:7.800 3rd Qu.:8.000 3rd Qu.:75.00 3rd Qu.:7.000
## Max. :8.500 Max. :9.000 Max. :90.00 Max. :8.000
## BMI.Category Blood.Pressure Heart.Rate Daily.Steps
## Length:374 Length:374 Min. :65.00 Min. : 3000
## Class :character Class :character 1st Qu.:68.00 1st Qu.: 5600
## Mode :character Mode :character Median :70.00 Median : 7000
## Mean :70.17 Mean : 6817
## 3rd Qu.:72.00 3rd Qu.: 8000
## Max. :86.00 Max. :10000
## Sleep.Disorder
## Length:374
## Class :character
## Mode :character
##
##
##
num_cols <- df %>% select(where(is.numeric))
ringkasan_df <- data.frame(
Variabel = names(num_cols),
Min = sapply(num_cols, min, na.rm = TRUE),
Q1 = sapply(num_cols, quantile, 0.25, na.rm = TRUE),
Median = sapply(num_cols, median, na.rm = TRUE),
Mean = round(sapply(num_cols, mean, na.rm = TRUE), 2),
Q3 = sapply(num_cols, quantile, 0.75, na.rm = TRUE),
Max = sapply(num_cols, max, na.rm = TRUE)
)
kable(ringkasan_df, caption = "Ringkasan Statistik Variabel Numerik", row.names = FALSE) %>%
kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)
Ringkasan Statistik Variabel Numerik
|
Variabel
|
Min
|
Q1
|
Median
|
Mean
|
Q3
|
Max
|
|
Person.ID
|
1.0
|
94.25
|
187.5
|
187.50
|
280.75
|
374.0
|
|
Age
|
27.0
|
35.25
|
43.0
|
42.18
|
50.00
|
59.0
|
|
Sleep.Duration
|
5.8
|
6.40
|
7.2
|
7.13
|
7.80
|
8.5
|
|
Quality.of.Sleep
|
4.0
|
6.00
|
7.0
|
7.31
|
8.00
|
9.0
|
|
Physical.Activity.Level
|
30.0
|
45.00
|
60.0
|
59.17
|
75.00
|
90.0
|
|
Stress.Level
|
3.0
|
4.00
|
5.0
|
5.39
|
7.00
|
8.0
|
|
Heart.Rate
|
65.0
|
68.00
|
70.0
|
70.17
|
72.00
|
86.0
|
|
Daily.Steps
|
3000.0
|
5600.00
|
7000.0
|
6816.84
|
8000.00
|
10000.0
|
4. Pemeriksaan Nilai Hilang
missing_df <- data.frame(
Kolom = names(df),
Jumlah_Missing = sapply(df, function(x) sum(is.na(x))),
Persentase = round(sapply(df, function(x) mean(is.na(x)) * 100), 2)
)
kable(missing_df, caption = "Nilai Hilang per Kolom", row.names = FALSE) %>%
kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) %>%
row_spec(which(missing_df$Jumlah_Missing > 0), background = "#fff3cd")
Nilai Hilang per Kolom
|
Kolom
|
Jumlah_Missing
|
Persentase
|
|
Person.ID
|
0
|
0
|
|
Gender
|
0
|
0
|
|
Age
|
0
|
0
|
|
Occupation
|
0
|
0
|
|
Sleep.Duration
|
0
|
0
|
|
Quality.of.Sleep
|
0
|
0
|
|
Physical.Activity.Level
|
0
|
0
|
|
Stress.Level
|
0
|
0
|
|
BMI.Category
|
0
|
0
|
|
Blood.Pressure
|
0
|
0
|
|
Heart.Rate
|
0
|
0
|
|
Daily.Steps
|
0
|
0
|
|
Sleep.Disorder
|
0
|
0
|
ggplot(missing_df, aes(x = reorder(Kolom, Jumlah_Missing),
y = Jumlah_Missing, fill = Jumlah_Missing > 0)) +
geom_col(show.legend = FALSE) +
coord_flip() +
scale_fill_manual(values = c("steelblue","tomato")) +
labs(title = "Jumlah Nilai Hilang per Kolom", x = "Kolom", y = "Jumlah Missing") +
theme_minimal(base_size = 12)

5. Distribusi Variabel Target
5.1 Pembuatan Variabel Target Ordinal
Variabel Quality.of.Sleep (skala 4–9)
dikonversi menjadi 3 kategori ordinal: - Rendah: skor ≤
5 | Sedang: skor 6–7 | Tinggi: skor ≥
8
df <- df %>%
mutate(
Kualitas_Tidur = case_when(
Quality.of.Sleep <= 5 ~ "Rendah",
Quality.of.Sleep <= 7 ~ "Sedang",
TRUE ~ "Tinggi"
),
Kualitas_Tidur = factor(Kualitas_Tidur,
levels = c("Rendah","Sedang","Tinggi"),
ordered = TRUE)
)
cat("Distribusi Kualitas Tidur:\n"); print(table(df$Kualitas_Tidur))
## Distribusi Kualitas Tidur:
##
## Rendah Sedang Tinggi
## 12 182 180
cat("\nProporsi (%):\n"); print(round(prop.table(table(df$Kualitas_Tidur)) * 100, 2))
##
## Proporsi (%):
##
## Rendah Sedang Tinggi
## 3.21 48.66 48.13
5.2 Visualisasi Distribusi Target
freq_target <- as.data.frame(table(df$Kualitas_Tidur)) %>%
rename(Kategori = Var1, Frekuensi = Freq) %>%
mutate(Persen = round(Frekuensi / sum(Frekuensi) * 100, 1))
ggplot(freq_target, aes(x = Kategori, y = Frekuensi, fill = Kategori)) +
geom_col(width = 0.6, show.legend = FALSE) +
geom_text(aes(label = paste0(Frekuensi, "\n(", Persen, "%)")), vjust = -0.3, size = 4.5) +
scale_fill_manual(values = c("Rendah"="#e74c3c","Sedang"="#f39c12","Tinggi"="#2ecc71")) +
labs(title = "Distribusi Kualitas Tidur (Variabel Target Ordinal)",
subtitle = "Sleep Health & Lifestyle Dataset",
x = "Kategori Kualitas Tidur", y = "Frekuensi") +
theme_minimal(base_size = 13) +
theme(plot.title = element_text(face = "bold"))

6. Visualisasi Distribusi Prediktor
6.1 Variabel Numerik
numerik_long <- df %>%
select(Age, Sleep.Duration, Physical.Activity.Level,
Stress.Level, Heart.Rate, Daily.Steps) %>%
pivot_longer(everything(), names_to = "Variabel", values_to = "Nilai")
ggplot(numerik_long, aes(x = Nilai, fill = Variabel)) +
geom_histogram(bins = 20, color = "white", alpha = 0.85, show.legend = FALSE) +
facet_wrap(~ Variabel, scales = "free", ncol = 3) +
scale_fill_brewer(palette = "Set2") +
labs(title = "Distribusi Variabel Numerik Prediktor", x = "Nilai", y = "Frekuensi") +
theme_minimal(base_size = 11) +
theme(plot.title = element_text(face = "bold"))

6.2 Variabel Kategorik vs Target
p1 <- ggplot(df, aes(x = BMI.Category, fill = Kualitas_Tidur)) +
geom_bar(position = "fill") +
scale_y_continuous(labels = percent_format()) +
scale_fill_manual(values = c("Rendah"="#e74c3c","Sedang"="#f39c12","Tinggi"="#2ecc71")) +
labs(title = "BMI vs Kualitas Tidur", x = "BMI Category",
y = "Proporsi", fill = "Kualitas Tidur") +
theme_minimal(base_size = 11) +
theme(axis.text.x = element_text(angle = 20, hjust = 1))
p2 <- ggplot(df, aes(x = Gender, fill = Kualitas_Tidur)) +
geom_bar(position = "fill") +
scale_y_continuous(labels = percent_format()) +
scale_fill_manual(values = c("Rendah"="#e74c3c","Sedang"="#f39c12","Tinggi"="#2ecc71")) +
labs(title = "Gender vs Kualitas Tidur", x = "Gender",
y = "Proporsi", fill = "Kualitas Tidur") +
theme_minimal(base_size = 11)
grid.arrange(p1, p2, ncol = 2)

7. Membangun Model Regresi Logistik Ordinal
7.1 Validasi Kolom & Persiapan Data
kolom_penting <- c("Age","Sleep.Duration","Physical.Activity.Level",
"Stress.Level","Heart.Rate","Daily.Steps",
"BMI.Category","Gender","Kualitas_Tidur")
cat("Pengecekan kolom penting:\n")
## Pengecekan kolom penting:
for (kol in kolom_penting) {
ada <- kol %in% names(df)
cat(sprintf(" %-35s : %s\n", kol, ifelse(ada, "Ada", "Tidak Ada")))
}
## Age : Ada
## Sleep.Duration : Ada
## Physical.Activity.Level : Ada
## Stress.Level : Ada
## Heart.Rate : Ada
## Daily.Steps : Ada
## BMI.Category : Ada
## Gender : Ada
## Kualitas_Tidur : Ada
df_model <- df %>%
select(all_of(kolom_penting)) %>%
mutate(BMI.Category = as.factor(BMI.Category), Gender = as.factor(Gender)) %>%
drop_na()
cat("\nJumlah observasi untuk pemodelan:", nrow(df_model), "\n")
##
## Jumlah observasi untuk pemodelan: 374
7.2 Split Data 80:20
set.seed(42)
idx_train <- createDataPartition(df_model$Kualitas_Tidur, p = 0.8, list = FALSE)
data_train <- df_model[ idx_train, ]
data_test <- df_model[-idx_train, ]
cat("Training:", nrow(data_train), "| Testing:", nrow(data_test), "\n")
## Training: 300 | Testing: 74
7.3 Scaling & Fit Model polr()
Catatan pemilihan prediktor: Berdasarkan pemeriksaan
tabel silang, variabel Sleep.Duration dan
Stress.Level mengalami perfect separation terhadap
variabel target — artinya nilai-nilai tertentu dari kedua variabel ini
secara sempurna memisahkan kategori Kualitas Tidur
tanpa tumpang tindih. Kondisi ini menyebabkan koefisien divergen ke tak
hingga sehingga algoritma optimasi polr() gagal konvergen.
Oleh karena itu, model dibangun menggunakan 4 prediktor yang stabil:
Age, Physical.Activity.Level,
Heart.Rate, dan Daily.Steps.
BMI.Category juga dikeluarkan karena quasi-complete
separation (hanya 1 observasi pada kombinasi
Tidak_Normal + Sedang).
cols_scale <- c("Age", "Sleep.Duration", "Physical.Activity.Level",
"Stress.Level", "Heart.Rate", "Daily.Steps")
# Hitung mean & SD dari data training
means <- sapply(data_train[, cols_scale], mean, na.rm = TRUE)
sds <- sapply(data_train[, cols_scale], sd, na.rm = TRUE)
cat("=== SDs ===\n"); print(sds)
## === SDs ===
## Age Sleep.Duration Physical.Activity.Level
## 8.6087217 0.7948976 20.5125516
## Stress.Level Heart.Rate Daily.Steps
## 1.7689884 4.2942120 1639.4276335
# Hapus kolom dengan SD = 0
cols_scale <- cols_scale[sds > 0]
means <- means[cols_scale]
sds <- sds[cols_scale]
# Scale manual
data_train_scaled <- data_train
data_test_scaled <- data_test
for (col in cols_scale) {
data_train_scaled[[col]] <- (data_train[[col]] - means[col]) / sds[col]
data_test_scaled[[col]] <- (data_test[[col]] - means[col]) / sds[col]
}
# Buang baris dengan nilai tidak finite
data_train_scaled <- data_train_scaled %>%
filter(if_all(where(is.numeric), ~ is.finite(.x)))
data_test_scaled <- data_test_scaled %>%
filter(if_all(where(is.numeric), ~ is.finite(.x)))
cat("Train rows setelah cleaning:", nrow(data_train_scaled), "\n")
## Train rows setelah cleaning: 300
cat("Test rows setelah cleaning:", nrow(data_test_scaled), "\n")
## Test rows setelah cleaning: 74
# Pastikan faktor tidak memiliki level kosong
data_train_scaled$BMI.Category <- droplevels(data_train_scaled$BMI.Category)
data_train_scaled$Gender <- droplevels(data_train_scaled$Gender)
data_test_scaled$BMI.Category <- droplevels(data_test_scaled$BMI.Category)
data_test_scaled$Gender <- droplevels(data_test_scaled$Gender)
# Fit model dengan 4 prediktor stabil
# Sleep.Duration & Stress.Level dikeluarkan karena perfect separation
# BMI.Category dikeluarkan karena quasi-complete separation
model_olr <- polr(
Kualitas_Tidur ~ Age + Physical.Activity.Level + Heart.Rate + Daily.Steps,
data = data_train_scaled,
Hess = TRUE,
method = "logistic"
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(model_olr)
## Call:
## polr(formula = Kualitas_Tidur ~ Age + Physical.Activity.Level +
## Heart.Rate + Daily.Steps, data = data_train_scaled, Hess = TRUE,
## method = "logistic")
##
## Coefficients:
## Value Std. Error t value
## Age 0.5726 0.1747 3.277
## Physical.Activity.Level 2.0255 0.3228 6.276
## Heart.Rate -2.2097 0.2383 -9.274
## Daily.Steps -1.0344 0.2767 -3.739
##
## Intercepts:
## Value Std. Error t value
## Rendah|Sedang -6.3409 0.6909 -9.1781
## Sedang|Tinggi 0.3889 0.1764 2.2040
##
## Residual Deviance: 283.6826
## AIC: 295.6826
8. Uji Signifikansi Koefisien (p-value)
ctable <- coef(summary(model_olr))
p_val <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2
hasil_koef <- cbind(ctable, `p-value` = round(p_val, 4))
kable(as.data.frame(hasil_koef), caption = "Koefisien Model dengan p-value", digits = 4) %>%
kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) %>%
row_spec(which(p_val < 0.05), background = "#d4edda")
Koefisien Model dengan p-value
|
|
Value
|
Std. Error
|
t value
|
p-value
|
|
Age
|
0.5726
|
0.1747
|
3.2775
|
0.0010
|
|
Physical.Activity.Level
|
2.0255
|
0.3228
|
6.2757
|
0.0000
|
|
Heart.Rate
|
-2.2097
|
0.2383
|
-9.2741
|
0.0000
|
|
Daily.Steps
|
-1.0344
|
0.2767
|
-3.7388
|
0.0002
|
|
Rendah|Sedang
|
-6.3409
|
0.6909
|
-9.1781
|
0.0000
|
|
Sedang|Tinggi
|
0.3889
|
0.1764
|
2.2040
|
0.0275
|
# Pisahkan koefisien prediktor dari intercept
all_coef <- coef(summary(model_olr))
# Ambil hanya baris prediktor (bukan intercept Rendah|Sedang & Sedang|Tinggi)
pred_names <- names(coef(model_olr)) # hanya 4 prediktor
coef_vals <- coef(model_olr) # 4 nilai
se_vals <- sqrt(diag(vcov(model_olr))[pred_names]) # 4 SE yang sesuai
or_wald <- data.frame(
OR = round(exp(coef_vals), 4),
CI_Lower = round(exp(coef_vals - 1.96 * se_vals), 4),
CI_Upper = round(exp(coef_vals + 1.96 * se_vals), 4)
)
kable(or_wald, caption = "Odds Ratio & CI 95% (Metode Wald)") %>%
kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)
Odds Ratio & CI 95% (Metode Wald)
|
|
OR
|
CI_Lower
|
CI_Upper
|
|
Age
|
1.7728
|
1.2588
|
2.4967
|
|
Physical.Activity.Level
|
7.5801
|
4.0266
|
14.2698
|
|
Heart.Rate
|
0.1097
|
0.0688
|
0.1750
|
|
Daily.Steps
|
0.3555
|
0.2067
|
0.6113
|
9. Uji Asumsi Proportional Odds (Brant Test)
# H0: asumsi proportional odds terpenuhi (p-value > 0.05 = OK)
tryCatch(
print(brant(model_olr)),
error = function(e) cat("Brant test error:", conditionMessage(e), "\n"),
warning = function(w) cat("Brant test warning:", conditionMessage(w), "\n")
)
## Brant test warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
10. Prediksi & Evaluasi Model
# Gunakan data_test_scaled
pred_kelas <- predict(model_olr, newdata = data_test_scaled, type = "class")
pred_prob <- predict(model_olr, newdata = data_test_scaled, type = "probs")
hasil_pred <- data.frame(
Aktual = data_test_scaled$Kualitas_Tidur,
Prediksi = pred_kelas,
Prob_Rendah = round(pred_prob[, "Rendah"], 3),
Prob_Sedang = round(pred_prob[, "Sedang"], 3),
Prob_Tinggi = round(pred_prob[, "Tinggi"], 3)
)
kable(head(hasil_pred, 10), caption = "10 Hasil Prediksi Pertama", row.names = FALSE) %>%
kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)
10 Hasil Prediksi Pertama
|
Aktual
|
Prediksi
|
Prob_Rendah
|
Prob_Sedang
|
Prob_Tinggi
|
|
Sedang
|
Sedang
|
0.002
|
0.584
|
0.414
|
|
Sedang
|
Sedang
|
0.126
|
0.866
|
0.008
|
|
Sedang
|
Sedang
|
0.126
|
0.866
|
0.008
|
|
Sedang
|
Sedang
|
0.126
|
0.866
|
0.008
|
|
Sedang
|
Sedang
|
0.002
|
0.568
|
0.430
|
|
Sedang
|
Sedang
|
0.002
|
0.568
|
0.430
|
|
Rendah
|
Sedang
|
0.301
|
0.696
|
0.003
|
|
Sedang
|
Sedang
|
0.050
|
0.928
|
0.022
|
|
Sedang
|
Sedang
|
0.001
|
0.552
|
0.447
|
|
Sedang
|
Sedang
|
0.001
|
0.535
|
0.463
|
cm <- confusionMatrix(pred_kelas, data_test_scaled$Kualitas_Tidur)
print(cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Rendah Sedang Tinggi
## Rendah 0 0 0
## Sedang 2 30 7
## Tinggi 0 6 29
##
## Overall Statistics
##
## Accuracy : 0.7973
## 95% CI : (0.6878, 0.8819)
## No Information Rate : 0.4865
## P-Value [Acc > NIR] : 3.721e-08
##
## Kappa : 0.6053
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Rendah Class: Sedang Class: Tinggi
## Sensitivity 0.00000 0.8333 0.8056
## Specificity 1.00000 0.7632 0.8421
## Pos Pred Value NaN 0.7692 0.8286
## Neg Pred Value 0.97297 0.8286 0.8205
## Prevalence 0.02703 0.4865 0.4865
## Detection Rate 0.00000 0.4054 0.3919
## Detection Prevalence 0.00000 0.5270 0.4730
## Balanced Accuracy 0.50000 0.7982 0.8238
11. Visualisasi Confusion Matrix
cm_tabel <- as.data.frame(cm$table) %>%
rename(Prediksi = Prediction, Aktual = Reference, Frekuensi = Freq)
ggplot(cm_tabel, aes(x = Aktual, y = Prediksi, fill = Frekuensi)) +
geom_tile(color = "white", linewidth = 0.8) +
geom_text(aes(label = Frekuensi), size = 7, fontface = "bold", color = "white") +
scale_fill_gradient(low = "#aed6f1", high = "#1a5276") +
labs(title = "Confusion Matrix - Regresi Logistik Ordinal",
subtitle = "Prediksi vs Aktual pada Data Testing",
x = "Kategori Aktual", y = "Kategori Prediksi", fill = "Frekuensi") +
theme_minimal(base_size = 13) +
theme(plot.title = element_text(face = "bold"))

12. Akurasi & Metrik Evaluasi
akurasi <- cm$overall["Accuracy"]
kappa <- cm$overall["Kappa"]
cat(sprintf("Akurasi Model : %.4f (%.2f%%)\n", akurasi, akurasi * 100))
## Akurasi Model : 0.7973 (79.73%)
cat(sprintf("Cohen's Kappa : %.4f\n", kappa))
## Cohen's Kappa : 0.6053
metrik_kelas <- as.data.frame(cm$byClass) %>%
mutate(across(where(is.numeric), ~ round(.x, 4)))
kable(metrik_kelas, caption = "Metrik Evaluasi per Kelas") %>%
kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)
Metrik Evaluasi per Kelas
|
|
Sensitivity
|
Specificity
|
Pos Pred Value
|
Neg Pred Value
|
Precision
|
Recall
|
F1
|
Prevalence
|
Detection Rate
|
Detection Prevalence
|
Balanced Accuracy
|
|
Class: Rendah
|
0.0000
|
1.0000
|
NaN
|
0.9730
|
NA
|
0.0000
|
NA
|
0.0270
|
0.0000
|
0.000
|
0.5000
|
|
Class: Sedang
|
0.8333
|
0.7632
|
0.7692
|
0.8286
|
0.7692
|
0.8333
|
0.8000
|
0.4865
|
0.4054
|
0.527
|
0.7982
|
|
Class: Tinggi
|
0.8056
|
0.8421
|
0.8286
|
0.8205
|
0.8286
|
0.8056
|
0.8169
|
0.4865
|
0.3919
|
0.473
|
0.8238
|
metrik_plot <- data.frame(
Kelas = rownames(cm$byClass),
Sensitivity = cm$byClass[, "Sensitivity"],
Specificity = cm$byClass[, "Specificity"],
F1_Score = cm$byClass[, "F1"]
) %>% pivot_longer(-Kelas, names_to = "Metrik", values_to = "Nilai")
ggplot(metrik_plot, aes(x = Kelas, y = Nilai, fill = Metrik)) +
geom_col(position = "dodge", width = 0.65) +
geom_text(aes(label = round(Nilai, 2)),
position = position_dodge(0.65), vjust = -0.4, size = 3.5) +
scale_fill_brewer(palette = "Set2") +
scale_y_continuous(limits = c(0, 1.1), labels = percent_format()) +
labs(title = "Metrik Evaluasi per Kelas",
subtitle = "Sensitivity, Specificity, dan F1-Score",
x = "Kelas", y = "Nilai", fill = "Metrik") +
theme_minimal(base_size = 12) +
theme(plot.title = element_text(face = "bold"))
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_col()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_text()`).

13. Kesimpulan
## === RINGKASAN HASIL ANALISIS ===
## Dataset : Sleep Health & Lifestyle Dataset
## Target : Kualitas Tidur (Ordinal: Rendah < Sedang < Tinggi)
## Metode : Proportional Odds Logistic Regression (POLR)
## Paket R : MASS::polr()
##
## Prediktor dalam Model (4 variabel):
## Age, Physical.Activity.Level, Heart.Rate, Daily.Steps
##
## Prediktor Dikeluarkan:
## Sleep.Duration -> perfect separation
## Stress.Level -> perfect separation
## BMI.Category -> quasi-complete separation
##
## Ukuran Data
## Training : 300 observasi (80%)
## Testing : 74 observasi (20%)
##
## Hasil Evaluasi
## Akurasi : 79.73%
## Cohen's Kappa : 0.6053
Dataset: Sleep
Health and Lifestyle - Kaggle