Membaca dan Mengeksplorasi Dataset
# Load the raw dataset with base R. read.csv() keeps its default
# check.names = TRUE, which is why the column names printed by str() below
# have spaces and punctuation replaced by dots (e.g. "Sleep.Duration..hours.").
data <- read.csv("sleep_health_lifestyle_dataset.csv")
Penggunaan Library
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.4.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(biotools)
## Warning: package 'biotools' was built under R version 4.4.3
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## ---
## biotools version 4.3
library(caTools)
## Warning: package 'caTools' was built under R version 4.4.3
library(MASS)
library(UBL)
## Warning: package 'UBL' was built under R version 4.4.3
## Loading required package: MBA
## Warning: package 'MBA' was built under R version 4.4.3
## Loading required package: gstat
## Warning: package 'gstat' was built under R version 4.4.3
## Loading required package: automap
## Warning: package 'automap' was built under R version 4.4.3
## Loading required package: sp
## Warning: package 'sp' was built under R version 4.4.3
## Loading required package: randomForest
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(MVN)
## Warning: package 'MVN' was built under R version 4.4.3
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
library(MLmetrics)
## Warning: package 'MLmetrics' was built under R version 4.4.3
##
## Attaching package: 'MLmetrics'
## The following objects are masked from 'package:caret':
##
## MAE, RMSE
## The following object is masked from 'package:base':
##
## Recall
library(nnet)
## Warning: package 'nnet' was built under R version 4.4.3
library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.3
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(readr)
## Warning: package 'readr' was built under R version 4.4.3
library(smotefamily)
## Warning: package 'smotefamily' was built under R version 4.4.3
library(fastDummies)
## Warning: package 'fastDummies' was built under R version 4.4.3
Struktur dan Ringkasan Data
# Print the structure of the raw data frame: dimensions, column types and the
# first few values of every column.
str(data)
## 'data.frame': 400 obs. of 13 variables:
## $ Person.ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Gender : chr "Male" "Female" "Male" "Male" ...
## $ Age : int 29 43 44 29 67 47 22 49 25 51 ...
## $ Occupation : chr "Manual Labor" "Retired" "Retired" "Office Worker" ...
## $ Sleep.Duration..hours. : num 7.4 4.2 6.1 8.3 9.1 6.1 5.1 10.7 11.9 8.2 ...
## $ Quality.of.Sleep..scale..1.10. : num 7 4.9 6 10 9.5 6.9 6.1 6.2 7.2 4 ...
## $ Physical.Activity.Level..minutes.day.: int 41 41 107 20 19 24 26 49 27 64 ...
## $ Stress.Level..scale..1.10. : int 7 5 4 10 4 4 6 8 8 5 ...
## $ BMI.Category : chr "Obese" "Obese" "Underweight" "Obese" ...
## $ Blood.Pressure..systolic.diastolic. : chr "124/70" "131/86" "122/70" "124/72" ...
## $ Heart.Rate..bpm. : int 91 81 81 55 97 87 66 59 99 76 ...
## $ Daily.Steps : int 8539 18754 2857 6886 14945 9485 15680 18767 16397 12744 ...
## $ Sleep.Disorder : chr "None" "None" "None" "None" ...
# Per-column summary: quartiles/mean for numeric columns, length/class for
# character columns (they have not been converted to factors yet).
summary(data)
## Person.ID Gender Age Occupation
## Min. : 1.0 Length:400 Min. :18.00 Length:400
## 1st Qu.:100.8 Class :character 1st Qu.:29.00 Class :character
## Median :200.5 Mode :character Median :40.00 Mode :character
## Mean :200.5 Mean :39.95
## 3rd Qu.:300.2 3rd Qu.:49.00
## Max. :400.0 Max. :90.00
## Sleep.Duration..hours. Quality.of.Sleep..scale..1.10.
## Min. : 4.100 Min. : 1.000
## 1st Qu.: 5.900 1st Qu.: 4.700
## Median : 8.200 Median : 6.100
## Mean : 8.041 Mean : 6.126
## 3rd Qu.:10.125 3rd Qu.: 7.425
## Max. :12.000 Max. :10.000
## Physical.Activity.Level..minutes.day. Stress.Level..scale..1.10.
## Min. : 10.00 Min. : 1.000
## 1st Qu.: 35.00 1st Qu.: 3.000
## Median : 65.50 Median : 5.000
## Mean : 64.98 Mean : 5.473
## 3rd Qu.: 94.00 3rd Qu.: 8.000
## Max. :120.00 Max. :10.000
## BMI.Category Blood.Pressure..systolic.diastolic. Heart.Rate..bpm.
## Length:400 Length:400 Min. : 50.00
## Class :character Class :character 1st Qu.: 63.00
## Mode :character Mode :character Median : 77.00
## Mean : 75.99
## 3rd Qu.: 90.00
## Max. :100.00
## Daily.Steps Sleep.Disorder
## Min. : 2067 Length:400
## 1st Qu.: 6165 Class :character
## Median :11786 Mode :character
## Mean :11077
## 3rd Qu.:15878
## Max. :19958
Cek Nilai Hilang dan Duplikasi Data
# Number of NA values in each column.
colSums(is.na(data))
## Person.ID Gender
## 0 0
## Age Occupation
## 0 0
## Sleep.Duration..hours. Quality.of.Sleep..scale..1.10.
## 0 0
## Physical.Activity.Level..minutes.day. Stress.Level..scale..1.10.
## 0 0
## BMI.Category Blood.Pressure..systolic.diastolic.
## 0 0
## Heart.Rate..bpm. Daily.Steps
## 0 0
## Sleep.Disorder
## 0
# Number of empty-string entries in each column (NA comparisons are dropped
# by na.rm = TRUE). vapply() pins the result to exactly one integer per
# column, whereas sapply() silently changes its return type with the input.
vapply(data, function(x) sum(x == "", na.rm = TRUE), integer(1))
## Person.ID Gender
## 0 0
## Age Occupation
## 0 0
## Sleep.Duration..hours. Quality.of.Sleep..scale..1.10.
## 0 0
## Physical.Activity.Level..minutes.day. Stress.Level..scale..1.10.
## 0 0
## BMI.Category Blood.Pressure..systolic.diastolic.
## 0 0
## Heart.Rate..bpm. Daily.Steps
## 0 0
## Sleep.Disorder
## 0
# Number of rows that are exact duplicates of an earlier row.
sum(duplicated(data))
## [1] 0
Cek Outlier
# Flag sleep-duration values lying more than 3 standard deviations from the
# mean (z-score rule).
sleep_duration <- data$Sleep.Duration..hours.
z_scores <- scale(sleep_duration)
outliers <- which(z_scores > 3 | z_scores < (-3))
# Show the flagged rows; a zero-row result means nothing was flagged.
data[outliers, ]
## [1] Person.ID Gender
## [3] Age Occupation
## [5] Sleep.Duration..hours. Quality.of.Sleep..scale..1.10.
## [7] Physical.Activity.Level..minutes.day. Stress.Level..scale..1.10.
## [9] BMI.Category Blood.Pressure..systolic.diastolic.
## [11] Heart.Rate..bpm. Daily.Steps
## [13] Sleep.Disorder
## <0 rows> (or 0-length row.names)
Memisahkan Kolom Blood Pressure Menjadi Systolic dan Diastolic
# Split the "systolic/diastolic" text column on "/" into two columns;
# convert = TRUE lets tidyr convert the resulting pieces to a numeric type.
data <- separate(
  data,
  col = Blood.Pressure..systolic.diastolic.,
  into = c("Systolic", "Diastolic"),
  sep = "/",
  convert = TRUE
)
Statistik Deskriptif
# Summarise only the numeric columns. vapply() guarantees a logical selector
# with one entry per column, which sapply() does not.
summary(data[vapply(data, is.numeric, logical(1))])
## Person.ID Age Sleep.Duration..hours.
## Min. : 1.0 Min. :18.00 Min. : 4.100
## 1st Qu.:100.8 1st Qu.:29.00 1st Qu.: 5.900
## Median :200.5 Median :40.00 Median : 8.200
## Mean :200.5 Mean :39.95 Mean : 8.041
## 3rd Qu.:300.2 3rd Qu.:49.00 3rd Qu.:10.125
## Max. :400.0 Max. :90.00 Max. :12.000
## Quality.of.Sleep..scale..1.10. Physical.Activity.Level..minutes.day.
## Min. : 1.000 Min. : 10.00
## 1st Qu.: 4.700 1st Qu.: 35.00
## Median : 6.100 Median : 65.50
## Mean : 6.126 Mean : 64.98
## 3rd Qu.: 7.425 3rd Qu.: 94.00
## Max. :10.000 Max. :120.00
## Stress.Level..scale..1.10. Systolic Diastolic Heart.Rate..bpm.
## Min. : 1.000 Min. :109.0 Min. :60.00 Min. : 50.00
## 1st Qu.: 3.000 1st Qu.:115.0 1st Qu.:66.00 1st Qu.: 63.00
## Median : 5.000 Median :122.0 Median :73.00 Median : 77.00
## Mean : 5.473 Mean :122.2 Mean :73.04 Mean : 75.99
## 3rd Qu.: 8.000 3rd Qu.:128.0 3rd Qu.:79.00 3rd Qu.: 90.00
## Max. :10.000 Max. :145.0 Max. :96.00 Max. :100.00
## Daily.Steps
## Min. : 2067
## 1st Qu.: 6165
## Median :11786
## Mean :11077
## 3rd Qu.:15878
## Max. :19958
Visualisasi Awal (EDA)
Distribusi Durasi Tidur
# Histogram of sleep duration in 1-hour bins, with the bin count printed
# above each bar.
ggplot(data, aes(x = `Sleep.Duration..hours.`)) +
  geom_histogram(binwidth = 1, fill = "steelblue", color = "black") +
  # after_stat(count) replaces the `..count..` notation deprecated in
  # ggplot2 3.4.0.
  stat_bin(
    binwidth = 1,
    geom = "text",
    aes(label = after_stat(count)),
    vjust = -0.5
  ) +
  scale_x_continuous(breaks = seq(0, 13, 1)) +
  labs(
    title = "Distribusi Durasi Tidur",
    x = "Durasi Tidur (jam)",
    y = "Jumlah Individu"
  ) +
  theme_minimal()
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Distribusi Sleep Disorder
# Bar chart with one bar per Sleep.Disorder value; bar height is the number
# of rows carrying that value.
ggplot(data) +
  geom_bar(aes(x = Sleep.Disorder), fill = "tomato") +
  labs(
    title = "Distribusi Sleep Disorder",
    x = "Jenis Gangguan Tidur",
    y = "Jumlah Individu"
  ) +
  theme_minimal()

Encoding Kategorikal dan Cek Skewness
# Convert the categorical text columns to factors in one pass.
factor_cols <- c("Gender", "Occupation", "BMI.Category", "Sleep.Disorder")
data[factor_cols] <- lapply(data[factor_cols], as.factor)

# Numeric variables whose distributions are inspected below.
num_cols <- c(
  "Age",
  "Sleep.Duration..hours.",
  "Quality.of.Sleep..scale..1.10.",
  "Physical.Activity.Level..minutes.day.",
  "Stress.Level..scale..1.10.",
  "Heart.Rate..bpm.",
  "Systolic",
  "Diastolic",
  "Daily.Steps"
)

# Reshape to long form (one row per variable/value pair) and draw one
# 30-bin histogram per variable, each panel with its own axis scales.
ggplot(
  pivot_longer(
    data[num_cols],
    everything(),
    names_to = "Variabel",
    values_to = "Nilai"
  ),
  aes(x = Nilai)
) +
  geom_histogram(bins = 30, fill = "steelblue", color = "black") +
  facet_wrap(~ Variabel, scales = "free", ncol = 3) +
  theme_minimal()

Uji Normalitas dan Homogenitas
# Multivariate normality test on the selected numeric columns, with a
# Q-Q plot of the multivariate distances.
# NOTE(review): `data_normalized` and `num_cols_updated` are not defined
# anywhere in the visible part of this document. Judging from the output
# below (means ~0, Std.Dev 1, eight variables without Stress.Level) they are
# presumably a z-scored copy of `data` and `num_cols` minus the stress
# column -- TODO confirm and include the chunk that creates them.
mvn(data = data_normalized[num_cols_updated], multivariatePlot = "qq")

## $multivariateNormality
## Test HZ p value MVN
## 1 Henze-Zirkler 1.370701 0 NO
##
## $univariateNormality
## Test Variable Statistic p value
## 1 Anderson-Darling Age 3.2674 <0.001
## 2 Anderson-Darling Sleep.Duration..hours. 6.1299 <0.001
## 3 Anderson-Darling Quality.of.Sleep..scale..1.10. 0.4740 0.2403
## 4 Anderson-Darling Physical.Activity.Level..minutes.day. 5.1860 <0.001
## 5 Anderson-Darling Heart.Rate..bpm. 5.6207 <0.001
## 6 Anderson-Darling Systolic 1.5722 5e-04
## 7 Anderson-Darling Diastolic 2.4130 <0.001
## 8 Anderson-Darling Daily.Steps 6.9134 <0.001
## Normality
## 1 NO
## 2 NO
## 3 YES
## 4 NO
## 5 NO
## 6 NO
## 7 NO
## 8 NO
##
## $Descriptives
## n Mean Std.Dev Median
## Age 400 -3.453408e-16 1 0.17933727
## Sleep.Duration..hours. 400 8.825541e-17 1 0.06640073
## Quality.of.Sleep..scale..1.10. 400 -1.193307e-17 1 -0.01303314
## Physical.Activity.Level..minutes.day. 400 1.722025e-17 1 0.01594532
## Heart.Rate..bpm. 400 3.252693e-16 1 0.06689037
## Systolic 400 -1.714887e-15 1 0.01032986
## Diastolic 400 -1.329076e-15 1 0.02599014
## Daily.Steps 400 -4.046378e-17 1 0.13215617
## Min Max 25th 75th
## Age -1.980542 2.372819 -0.6905128 0.7282710
## Sleep.Duration..hours. -1.648516 1.655835 -0.8956255 0.8715749
## Quality.of.Sleep..scale..1.10. -2.594354 1.960918 -0.7216309 0.6576041
## Physical.Activity.Level..minutes.day. -1.702434 1.703363 -0.9283893 0.8983563
## Heart.Rate..bpm. -1.721268 1.590136 -0.8603029 0.9278555
## Systolic -1.670664 2.587065 -0.8712295 0.7265890
## Diastolic -1.503583 2.429408 -0.7789446 0.6856899
## Daily.Steps -1.679378 1.655515 -0.9154619 0.8950007
## Skew Kurtosis
## Age -0.32628119 -0.6220320
## Sleep.Duration..hours. -0.06329684 -1.2895248
## Quality.of.Sleep..scale..1.10. -0.11632665 -0.3664791
## Physical.Activity.Level..minutes.day. 0.03774603 -1.2436784
## Heart.Rate..bpm. -0.08445252 -1.2543068
## Systolic 0.16547902 -0.6016516
## Diastolic 0.29480978 -0.6645804
## Daily.Steps -0.07826105 -1.3160737
# Repeat the multivariate normality test separately within each
# Sleep.Disorder class and print only the multivariate result.
for (label in levels(data_normalized$Sleep.Disorder)) {
  cat("\nNormalitas untuk kelas:", label, "\n")
  in_class <- data_normalized$Sleep.Disorder == label
  class_data <- data_normalized[in_class, num_cols_updated]
  hasil <- mvn(class_data, multivariatePlot = "none")
  print(hasil$multivariateNormality)
}
##
## Normalitas untuk kelas: Insomnia
## Test HZ p value MVN
## 1 Henze-Zirkler 1.055827 5.368583e-05 NO
##
## Normalitas untuk kelas: None
## Test HZ p value MVN
## 1 Henze-Zirkler 1.272359 0 NO
##
## Normalitas untuk kelas: Sleep Apnea
## Test HZ p value MVN
## 1 Henze-Zirkler 0.959814 0.1254576 YES
# Box's M test: compares the covariance matrices of the numeric predictors
# across the Sleep.Disorder groups.
box_m_test <- boxM(data_normalized[, num_cols_updated], data_normalized$Sleep.Disorder)
print(box_m_test)
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: data_normalized[, num_cols_updated]
## Chi-Sq (approx.) = 55.581, df = 72, p-value = 0.9239
Uji ANOVA
# One-way ANOVA of every numeric variable against Sleep.Disorder.
for (var in num_cols_updated) {
  # Build the formula `<var> ~ Sleep.Disorder` from the column name.
  model <- aov(
    reformulate("Sleep.Disorder", response = var),
    data = data_normalized
  )
  cat("\nANOVA untuk:", var, "\n")
  print(summary(model))
}
##
## ANOVA untuk: Age
## Df Sum Sq Mean Sq F value Pr(>F)
## Sleep.Disorder 2 0.3 0.1338 0.133 0.875
## Residuals 397 398.7 1.0044
##
## ANOVA untuk: Sleep.Duration..hours.
## Df Sum Sq Mean Sq F value Pr(>F)
## Sleep.Disorder 2 0.9 0.4602 0.459 0.632
## Residuals 397 398.1 1.0027
##
## ANOVA untuk: Quality.of.Sleep..scale..1.10.
## Df Sum Sq Mean Sq F value Pr(>F)
## Sleep.Disorder 2 0.2 0.0805 0.08 0.923
## Residuals 397 398.8 1.0046
##
## ANOVA untuk: Physical.Activity.Level..minutes.day.
## Df Sum Sq Mean Sq F value Pr(>F)
## Sleep.Disorder 2 9 4.517 4.598 0.0106 *
## Residuals 397 390 0.982
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## ANOVA untuk: Heart.Rate..bpm.
## Df Sum Sq Mean Sq F value Pr(>F)
## Sleep.Disorder 2 0 0.0184 0.018 0.982
## Residuals 397 399 1.0049
##
## ANOVA untuk: Systolic
## Df Sum Sq Mean Sq F value Pr(>F)
## Sleep.Disorder 2 0.4 0.2018 0.201 0.818
## Residuals 397 398.6 1.0040
##
## ANOVA untuk: Diastolic
## Df Sum Sq Mean Sq F value Pr(>F)
## Sleep.Disorder 2 0.5 0.2702 0.269 0.764
## Residuals 397 398.5 1.0037
##
## ANOVA untuk: Daily.Steps
## Df Sum Sq Mean Sq F value Pr(>F)
## Sleep.Disorder 2 5.7 2.8382 2.865 0.0582 .
## Residuals 397 393.3 0.9907
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Pemisahan Data
# Reproducible 80/20 split; sample.split() is given the class labels and
# returns a logical vector marking the training rows.
set.seed(123)
split <- sample.split(data_normalized$Sleep.Disorder, SplitRatio = 0.8)
trainData <- data_normalized[split, , drop = FALSE]
testData <- data_normalized[!split, , drop = FALSE]
Latih dan Evaluasi Model LDA
# Fit linear discriminant analysis with Sleep.Disorder as the response and
# every other column of trainData as a predictor (`~ .`), then print the
# priors, group means and discriminant coefficients.
lda_model <- lda(Sleep.Disorder ~ ., data = trainData)
print(lda_model)
## Call:
## lda(Sleep.Disorder ~ ., data = trainData)
##
## Prior probabilities of groups:
## Insomnia None Sleep Apnea
## 0.196875 0.725000 0.078125
##
## Group means:
## GenderMale Age OccupationOffice Worker OccupationRetired
## Insomnia 0.4920635 0.01169231 0.3650794 0.1746032
## None 0.5129310 0.04077412 0.2198276 0.2241379
## Sleep Apnea 0.4800000 -0.01543298 0.3200000 0.2400000
## OccupationStudent Sleep.Duration..hours.
## Insomnia 0.2539683 -0.05509741
## None 0.2974138 -0.01653257
## Sleep Apnea 0.2800000 0.15507445
## Quality.of.Sleep..scale..1.10.
## Insomnia 0.03597419
## None -0.01761459
## Sleep Apnea -0.18107204
## Physical.Activity.Level..minutes.day. BMI.CategoryObese
## Insomnia -0.1386179 0.2063492
## None 0.1009568 0.2586207
## Sleep Apnea -0.3289690 0.2400000
## BMI.CategoryOverweight BMI.CategoryUnderweight Systolic
## Insomnia 0.2222222 0.3015873 -0.01395367
## None 0.2931034 0.2284483 0.05306806
## Sleep Apnea 0.2400000 0.2800000 -0.03809873
## Diastolic Heart.Rate..bpm. Daily.Steps
## Insomnia 0.08992091 0.01537963 0.2445957
## None 0.02066944 -0.02474418 -0.1324443
## Sleep Apnea 0.04407762 -0.06821493 0.2202155
##
## Coefficients of linear discriminants:
## LD1 LD2
## GenderMale 0.096390867 -0.14145704
## Age 0.197846581 2.54158608
## OccupationOffice Worker -1.286575714 0.07506759
## OccupationRetired -0.328392771 1.12804726
## OccupationStudent -0.276342275 0.62333798
## Sleep.Duration..hours. -0.068738595 0.50483986
## Quality.of.Sleep..scale..1.10. 0.006125075 -0.43519535
## Physical.Activity.Level..minutes.day. 0.433586450 -0.38899267
## BMI.CategoryObese 0.767470321 4.02148114
## BMI.CategoryOverweight 0.560102621 0.23964715
## BMI.CategoryUnderweight -0.394729431 -0.01010480
## Systolic 0.499464929 -2.84990010
## Diastolic -0.764019132 -0.29977483
## Heart.Rate..bpm. -0.050107041 -0.10102298
## Daily.Steps -0.579080873 0.02012286
##
## Proportion of trace:
## LD1 LD2
## 0.857 0.143
# Score the held-out rows with the LDA model and compare the predicted
# classes with the true labels.
lda_pred <- predict(lda_model, newdata = testData)
confusionMatrix(
  data = lda_pred$class,
  reference = testData$Sleep.Disorder
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Insomnia None Sleep Apnea
## Insomnia 3 1 1
## None 13 57 5
## Sleep Apnea 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.75
## 95% CI : (0.6406, 0.8401)
## No Information Rate : 0.725
## P-Value [Acc > NIR] : 0.3598434
##
## Kappa : 0.1878
##
## Mcnemar's Test P-Value : 0.0009908
##
## Statistics by Class:
##
## Class: Insomnia Class: None Class: Sleep Apnea
## Sensitivity 0.1875 0.9828 0.000
## Specificity 0.9688 0.1818 1.000
## Pos Pred Value 0.6000 0.7600 NaN
## Neg Pred Value 0.8267 0.8000 0.925
## Prevalence 0.2000 0.7250 0.075
## Detection Rate 0.0375 0.7125 0.000
## Detection Prevalence 0.0625 0.9375 0.000
## Balanced Accuracy 0.5781 0.5823 0.500
Latih dan Evaluasi Model QDA
# Fit quadratic discriminant analysis on the same training data and formula
# as the LDA model, then print the priors and group means.
qda_model <- qda(Sleep.Disorder ~ ., data = trainData)
print(qda_model)
## Call:
## qda(Sleep.Disorder ~ ., data = trainData)
##
## Prior probabilities of groups:
## Insomnia None Sleep Apnea
## 0.196875 0.725000 0.078125
##
## Group means:
## GenderMale Age OccupationOffice Worker OccupationRetired
## Insomnia 0.4920635 0.01169231 0.3650794 0.1746032
## None 0.5129310 0.04077412 0.2198276 0.2241379
## Sleep Apnea 0.4800000 -0.01543298 0.3200000 0.2400000
## OccupationStudent Sleep.Duration..hours.
## Insomnia 0.2539683 -0.05509741
## None 0.2974138 -0.01653257
## Sleep Apnea 0.2800000 0.15507445
## Quality.of.Sleep..scale..1.10.
## Insomnia 0.03597419
## None -0.01761459
## Sleep Apnea -0.18107204
## Physical.Activity.Level..minutes.day. BMI.CategoryObese
## Insomnia -0.1386179 0.2063492
## None 0.1009568 0.2586207
## Sleep Apnea -0.3289690 0.2400000
## BMI.CategoryOverweight BMI.CategoryUnderweight Systolic
## Insomnia 0.2222222 0.3015873 -0.01395367
## None 0.2931034 0.2284483 0.05306806
## Sleep Apnea 0.2400000 0.2800000 -0.03809873
## Diastolic Heart.Rate..bpm. Daily.Steps
## Insomnia 0.08992091 0.01537963 0.2445957
## None 0.02066944 -0.02474418 -0.1324443
## Sleep Apnea 0.04407762 -0.06821493 0.2202155
# Score the held-out rows with the QDA model and compare the predicted
# classes with the true labels.
qda_pred <- predict(qda_model, newdata = testData)
confusionMatrix(
  data = qda_pred$class,
  reference = testData$Sleep.Disorder
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Insomnia None Sleep Apnea
## Insomnia 3 10 1
## None 13 47 5
## Sleep Apnea 0 1 0
##
## Overall Statistics
##
## Accuracy : 0.625
## 95% CI : (0.5096, 0.7308)
## No Information Rate : 0.725
## P-Value [Acc > NIR] : 0.9810
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 0.2553
##
## Statistics by Class:
##
## Class: Insomnia Class: None Class: Sleep Apnea
## Sensitivity 0.1875 0.8103 0.0000
## Specificity 0.8281 0.1818 0.9865
## Pos Pred Value 0.2143 0.7231 0.0000
## Neg Pred Value 0.8030 0.2667 0.9241
## Prevalence 0.2000 0.7250 0.0750
## Detection Rate 0.0375 0.5875 0.0000
## Detection Prevalence 0.1750 0.8125 0.0125
## Balanced Accuracy 0.5078 0.4961 0.4932
Visualisasi Hasil LDA
# Attach the two discriminant scores of each test row, then plot them
# against each other, coloured by the true class.
testData[["LDA1"]] <- lda_pred$x[, 1]
testData[["LDA2"]] <- lda_pred$x[, 2]
ggplot(testData) +
  geom_point(aes(x = LDA1, y = LDA2, color = Sleep.Disorder), alpha = 0.7) +
  labs(title = "Visualisasi LDA", x = "LDA1", y = "LDA2") +
  theme_minimal()

Analisis Regresi Logistik Multinomial
# Load data
# Reload the dataset with readr (original column names are kept) and prepare
# it for the multinomial model in a single pipeline.
data <- read_csv("sleep_health_lifestyle_dataset.csv") %>%
  # Convert the categorical columns to factors.
  mutate(
    Gender = as.factor(Gender),
    Occupation = as.factor(Occupation),
    `BMI Category` = as.factor(`BMI Category`),
    `Sleep Disorder` = as.factor(`Sleep Disorder`)
  ) %>%
  # Split blood pressure into two numeric columns: Systolic and Diastolic.
  separate(
    `Blood Pressure (systolic/diastolic)`,
    into = c("Systolic", "Diastolic"),
    sep = "/",
    convert = TRUE
  ) %>%
  # Drop rows whose target (Sleep Disorder) is missing.
  filter(!is.na(`Sleep Disorder`))

# Inspect the final result.
glimpse(data)
## Rows: 400
## Columns: 14
## $ `Person ID` <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,…
## $ Gender <fct> Male, Female, Male, Male, Male…
## $ Age <dbl> 29, 43, 44, 29, 67, 47, 22, 49…
## $ Occupation <fct> Manual Labor, Retired, Retired…
## $ `Sleep Duration (hours)` <dbl> 7.4, 4.2, 6.1, 8.3, 9.1, 6.1, …
## $ `Quality of Sleep (scale: 1-10)` <dbl> 7.0, 4.9, 6.0, 10.0, 9.5, 6.9,…
## $ `Physical Activity Level (minutes/day)` <dbl> 41, 41, 107, 20, 19, 24, 26, 4…
## $ `Stress Level (scale: 1-10)` <dbl> 7, 5, 4, 10, 4, 4, 6, 8, 8, 5,…
## $ `BMI Category` <fct> Obese, Obese, Underweight, Obe…
## $ Systolic <int> 124, 131, 122, 124, 133, 123, …
## $ Diastolic <int> 70, 86, 70, 72, 78, 60, 70, 87…
## $ `Heart Rate (bpm)` <dbl> 91, 81, 81, 55, 97, 87, 66, 59…
## $ `Daily Steps` <dbl> 8539, 18754, 2857, 6886, 14945…
## $ `Sleep Disorder` <fct> None, None, None, None, Insomn…
# Summary of the prepared data; factor columns now show per-level counts.
summary(data)
## Person ID Gender Age Occupation
## Min. : 1.0 Female:201 Min. :18.00 Manual Labor : 96
## 1st Qu.:100.8 Male :199 1st Qu.:29.00 Office Worker: 99
## Median :200.5 Median :40.00 Retired : 95
## Mean :200.5 Mean :39.95 Student :110
## 3rd Qu.:300.2 3rd Qu.:49.00
## Max. :400.0 Max. :90.00
## Sleep Duration (hours) Quality of Sleep (scale: 1-10)
## Min. : 4.100 Min. : 1.000
## 1st Qu.: 5.900 1st Qu.: 4.700
## Median : 8.200 Median : 6.100
## Mean : 8.041 Mean : 6.126
## 3rd Qu.:10.125 3rd Qu.: 7.425
## Max. :12.000 Max. :10.000
## Physical Activity Level (minutes/day) Stress Level (scale: 1-10)
## Min. : 10.00 Min. : 1.000
## 1st Qu.: 35.00 1st Qu.: 3.000
## Median : 65.50 Median : 5.000
## Mean : 64.98 Mean : 5.473
## 3rd Qu.: 94.00 3rd Qu.: 8.000
## Max. :120.00 Max. :10.000
## BMI Category Systolic Diastolic Heart Rate (bpm)
## Normal : 91 Min. :109.0 Min. :60.00 Min. : 50.00
## Obese : 98 1st Qu.:115.0 1st Qu.:66.00 1st Qu.: 63.00
## Overweight :109 Median :122.0 Median :73.00 Median : 77.00
## Underweight:102 Mean :122.2 Mean :73.04 Mean : 75.99
## 3rd Qu.:128.0 3rd Qu.:79.00 3rd Qu.: 90.00
## Max. :145.0 Max. :96.00 Max. :100.00
## Daily Steps Sleep Disorder
## Min. : 2067 Insomnia : 79
## 1st Qu.: 6165 None :290
## Median :11786 Sleep Apnea: 31
## Mean :11077
## 3rd Qu.:15878
## Max. :19958
Build Model Multinomial Logistic Regression
# Split Data Train dan Test
# Reproducible 80/20 train/test split; sample.split() is given the class labels.
set.seed(123)
library(caTools)
split <- sample.split(data$`Sleep Disorder`, SplitRatio = 0.8)
train <- subset(data, split == TRUE)
test <- subset(data, split == FALSE)

# Multinomial logistic regression on all predictors except `Person ID`:
# the row identifier carries no information about the outcome and must not
# be used as a feature.
# NOTE: any rendered output below that predates this change has to be
# regenerated by re-knitting the document.
library(nnet)
model <- multinom(`Sleep Disorder` ~ . - `Person ID`, data = train)
## # weights: 57 (36 variable)
## initial value 351.555932
## iter 10 value 282.353428
## iter 20 value 261.649692
## iter 30 value 224.185005
## iter 40 value 223.099062
## iter 50 value 222.607334
## iter 60 value 222.495284
## final value 222.244355
## converged
# Coefficients and standard errors of the fitted multinomial model, one row
# per non-reference outcome class.
summary(model)
## Call:
## multinom(formula = `Sleep Disorder` ~ ., data = train)
##
## Coefficients:
## (Intercept) `Person ID` GenderMale Age
## None -72.7237 -0.0001723193 0.08308344 -0.3609286
## Sleep Apnea -98.7701 -0.0006566064 -0.13423185 -0.4970458
## OccupationOffice Worker OccupationRetired OccupationStudent
## None -0.88906646 -0.1078120 -0.08346798
## Sleep Apnea -0.01741841 0.6164179 0.42203309
## `Sleep Duration (hours)` `Quality of Sleep (scale: 1-10)`
## None 0.005992069 -0.03360876
## Sleep Apnea 0.108782414 -0.10355854
## `Physical Activity Level (minutes/day)`
## None 0.007184875
## Sleep Apnea -0.007680839
## `Stress Level (scale: 1-10)` `BMI Category`Obese
## None -0.002084572 -6.785851
## Sleep Apnea 0.135571838 -9.742138
## `BMI Category`Overweight `BMI Category`Underweight Systolic
## None 0.367060206 -0.3147506 0.7869565
## Sleep Apnea -0.009587405 -0.1621588 0.9910614
## Diastolic `Heart Rate (bpm)` `Daily Steps`
## None -0.06673796 -0.003848549 -7.447352e-05
## Sleep Apnea -0.01349845 -0.008306396 3.718087e-07
##
## Std. Errors:
## (Intercept) `Person ID` GenderMale Age
## None 0.0003043853 0.001308302 0.0010032387 0.01455552
## Sleep Apnea 0.0005469014 0.002158540 0.0004533345 0.02311583
## OccupationOffice Worker OccupationRetired OccupationStudent
## None 0.001129647 0.0006082703 0.0009284263
## Sleep Apnea 0.001031989 0.0003201091 0.0004660763
## `Sleep Duration (hours)` `Quality of Sleep (scale: 1-10)`
## None 0.05042876 0.05936647
## Sleep Apnea 0.01807641 0.02331854
## `Physical Activity Level (minutes/day)`
## None 0.004638267
## Sleep Apnea 0.007791381
## `Stress Level (scale: 1-10)` `BMI Category`Obese
## None 0.04378828 0.001845208
## Sleep Apnea 0.01791826 0.002697322
## `BMI Category`Overweight `BMI Category`Underweight Systolic
## None 0.0007632439 0.001207743 0.02031453
## Sleep Apnea 0.0011977052 0.001100462 0.03164274
## Diastolic `Heart Rate (bpm)` `Daily Steps`
## None 0.03087102 0.009785213 2.892987e-05
## Sleep Apnea 0.04969513 0.015853833 4.767726e-05
##
## Residual Deviance: 444.4887
## AIC: 516.4887
Evaluasi Model
# Predicted class for every held-out row.
pred <- predict(model, newdata = test)
# Cross-tabulate predictions against the true labels. The table is stored as
# `conf_table` so that it does not shadow caret::confusionMatrix(), which is
# called as a function later in this document.
conf_table <- table(Predicted = pred, Actual = test$`Sleep Disorder`)
print(conf_table)
## Actual
## Predicted Insomnia None Sleep Apnea
## Insomnia 2 1 1
## None 14 57 5
## Sleep Apnea 0 0 0
Akurasi
# Overall accuracy = correctly classified rows (diagonal) / all test rows.
accuracy <- sum(diag(conf_table)) / sum(conf_table)
print(paste("Accuracy:", round(accuracy * 100, 2), "%"))
## [1] "Accuracy: 73.75 %"
Hitung Precision, Recall, dan F1-score per kelas
# Konversi prediksi dan label aktual ke faktor dengan level yang sama
# Put predictions and true labels on the same factor levels so that classes
# that were never predicted still appear in the confusion matrix.
class_levels <- levels(test$`Sleep Disorder`)
pred <- factor(pred, levels = class_levels)
actual <- factor(test$`Sleep Disorder`, levels = class_levels)

# Build the confusion matrix and per-class statistics with caret.
conf <- confusionMatrix(pred, actual)
print(conf)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Insomnia None Sleep Apnea
## Insomnia 2 1 1
## None 14 57 5
## Sleep Apnea 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.7375
## 95% CI : (0.6271, 0.8296)
## No Information Rate : 0.725
## P-Value [Acc > NIR] : 0.4576322
##
## Kappa : 0.1286
##
## Mcnemar's Test P-Value : 0.0006229
##
## Statistics by Class:
##
## Class: Insomnia Class: None Class: Sleep Apnea
## Sensitivity 0.1250 0.9828 0.000
## Specificity 0.9688 0.1364 1.000
## Pos Pred Value 0.5000 0.7500 NaN
## Neg Pred Value 0.8158 0.7500 0.925
## Prevalence 0.2000 0.7250 0.075
## Detection Rate 0.0250 0.7125 0.000
## Detection Prevalence 0.0500 0.9500 0.000
## Balanced Accuracy 0.5469 0.5596 0.500
Hitung precision, recall, dan F1-score per kelas dengan fungsi MLmetrics
# Precision, recall and F1 for each class in turn, treating that class as
# the positive label. The MLmetrics namespace is spelled out because
# MLmetrics::Recall masks base::Recall.
classes <- levels(actual)
for (cls in classes) {
  precision <- MLmetrics::Precision(
    y_true = actual, y_pred = pred, positive = cls
  )
  recall <- MLmetrics::Recall(
    y_true = actual, y_pred = pred, positive = cls
  )
  f1 <- MLmetrics::F1_Score(
    y_true = actual, y_pred = pred, positive = cls
  )

  cat("\nKelas:", cls, "\n")
  cat("Precision:", round(precision, 3), "\n")
  cat("Recall :", round(recall, 3), "\n")
  cat("F1-Score :", round(f1, 3), "\n")
}
##
## Kelas: Insomnia
## Precision: 0.5
## Recall : 0.125
## F1-Score : 0.2
##
## Kelas: None
## Precision: 0.75
## Recall : 0.983
## F1-Score : 0.851
##
## Kelas: Sleep Apnea
## Precision: NaN
## Recall : 0
## F1-Score : NaN