Membaca dan Mengeksplorasi Dataset

# Load the sleep-health dataset into a base data.frame. read.csv() rewrites
# non-syntactic headers (spaces, parentheses, slashes) into dotted names such
# as Sleep.Duration..hours., and the code below refers to those dotted names.
data <- read.csv("sleep_health_lifestyle_dataset.csv")

Penggunaan Library

library(tidyr)
## Warning: package 'tidyr' was built under R version 4.4.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(biotools)
## Warning: package 'biotools' was built under R version 4.4.3
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## ---
## biotools version 4.3
library(caTools)
## Warning: package 'caTools' was built under R version 4.4.3
library(MASS)
library(UBL)
## Warning: package 'UBL' was built under R version 4.4.3
## Loading required package: MBA
## Warning: package 'MBA' was built under R version 4.4.3
## Loading required package: gstat
## Warning: package 'gstat' was built under R version 4.4.3
## Loading required package: automap
## Warning: package 'automap' was built under R version 4.4.3
## Loading required package: sp
## Warning: package 'sp' was built under R version 4.4.3
## Loading required package: randomForest
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
library(MVN)
## Warning: package 'MVN' was built under R version 4.4.3
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
library(MLmetrics)
## Warning: package 'MLmetrics' was built under R version 4.4.3
## 
## Attaching package: 'MLmetrics'
## The following objects are masked from 'package:caret':
## 
##     MAE, RMSE
## The following object is masked from 'package:base':
## 
##     Recall
library(nnet)
## Warning: package 'nnet' was built under R version 4.4.3
library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.3
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(readr)
## Warning: package 'readr' was built under R version 4.4.3
library(smotefamily)
## Warning: package 'smotefamily' was built under R version 4.4.3
library(fastDummies)
## Warning: package 'fastDummies' was built under R version 4.4.3

Struktur dan Ringkasan Data

# Show the type and first few values of every column.
str(data)
## 'data.frame':    400 obs. of  13 variables:
##  $ Person.ID                            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Gender                               : chr  "Male" "Female" "Male" "Male" ...
##  $ Age                                  : int  29 43 44 29 67 47 22 49 25 51 ...
##  $ Occupation                           : chr  "Manual Labor" "Retired" "Retired" "Office Worker" ...
##  $ Sleep.Duration..hours.               : num  7.4 4.2 6.1 8.3 9.1 6.1 5.1 10.7 11.9 8.2 ...
##  $ Quality.of.Sleep..scale..1.10.       : num  7 4.9 6 10 9.5 6.9 6.1 6.2 7.2 4 ...
##  $ Physical.Activity.Level..minutes.day.: int  41 41 107 20 19 24 26 49 27 64 ...
##  $ Stress.Level..scale..1.10.           : int  7 5 4 10 4 4 6 8 8 5 ...
##  $ BMI.Category                         : chr  "Obese" "Obese" "Underweight" "Obese" ...
##  $ Blood.Pressure..systolic.diastolic.  : chr  "124/70" "131/86" "122/70" "124/72" ...
##  $ Heart.Rate..bpm.                     : int  91 81 81 55 97 87 66 59 99 76 ...
##  $ Daily.Steps                          : int  8539 18754 2857 6886 14945 9485 15680 18767 16397 12744 ...
##  $ Sleep.Disorder                       : chr  "None" "None" "None" "None" ...
# Per-column summary: quantiles and mean for numeric columns, length and class
# for character columns.
summary(data)
##    Person.ID        Gender               Age         Occupation       
##  Min.   :  1.0   Length:400         Min.   :18.00   Length:400        
##  1st Qu.:100.8   Class :character   1st Qu.:29.00   Class :character  
##  Median :200.5   Mode  :character   Median :40.00   Mode  :character  
##  Mean   :200.5                      Mean   :39.95                     
##  3rd Qu.:300.2                      3rd Qu.:49.00                     
##  Max.   :400.0                      Max.   :90.00                     
##  Sleep.Duration..hours. Quality.of.Sleep..scale..1.10.
##  Min.   : 4.100         Min.   : 1.000                
##  1st Qu.: 5.900         1st Qu.: 4.700                
##  Median : 8.200         Median : 6.100                
##  Mean   : 8.041         Mean   : 6.126                
##  3rd Qu.:10.125         3rd Qu.: 7.425                
##  Max.   :12.000         Max.   :10.000                
##  Physical.Activity.Level..minutes.day. Stress.Level..scale..1.10.
##  Min.   : 10.00                        Min.   : 1.000            
##  1st Qu.: 35.00                        1st Qu.: 3.000            
##  Median : 65.50                        Median : 5.000            
##  Mean   : 64.98                        Mean   : 5.473            
##  3rd Qu.: 94.00                        3rd Qu.: 8.000            
##  Max.   :120.00                        Max.   :10.000            
##  BMI.Category       Blood.Pressure..systolic.diastolic. Heart.Rate..bpm.
##  Length:400         Length:400                          Min.   : 50.00  
##  Class :character   Class :character                    1st Qu.: 63.00  
##  Mode  :character   Mode  :character                    Median : 77.00  
##                                                         Mean   : 75.99  
##                                                         3rd Qu.: 90.00  
##                                                         Max.   :100.00  
##   Daily.Steps    Sleep.Disorder    
##  Min.   : 2067   Length:400        
##  1st Qu.: 6165   Class :character  
##  Median :11786   Mode  :character  
##  Mean   :11077                     
##  3rd Qu.:15878                     
##  Max.   :19958

Cek Nilai Hilang dan Duplikasi Data

# Count NA values in each column.
colSums(is.na(data))
##                             Person.ID                                Gender 
##                                     0                                     0 
##                                   Age                            Occupation 
##                                     0                                     0 
##                Sleep.Duration..hours.        Quality.of.Sleep..scale..1.10. 
##                                     0                                     0 
## Physical.Activity.Level..minutes.day.            Stress.Level..scale..1.10. 
##                                     0                                     0 
##                          BMI.Category   Blood.Pressure..systolic.diastolic. 
##                                     0                                     0 
##                      Heart.Rate..bpm.                           Daily.Steps 
##                                     0                                     0 
##                        Sleep.Disorder 
##                                     0
# Count empty-string entries per column; these are not NA, so the is.na()
# check does not see them. vapply() guarantees one integer per column, whereas
# sapply() changes its return type depending on the input.
vapply(data, function(x) sum(x == "", na.rm = TRUE), integer(1))
##                             Person.ID                                Gender 
##                                     0                                     0 
##                                   Age                            Occupation 
##                                     0                                     0 
##                Sleep.Duration..hours.        Quality.of.Sleep..scale..1.10. 
##                                     0                                     0 
## Physical.Activity.Level..minutes.day.            Stress.Level..scale..1.10. 
##                                     0                                     0 
##                          BMI.Category   Blood.Pressure..systolic.diastolic. 
##                                     0                                     0 
##                      Heart.Rate..bpm.                           Daily.Steps 
##                                     0                                     0 
##                        Sleep.Disorder 
##                                     0
# Number of rows that exactly repeat an earlier row.
sum(duplicated(data))
## [1] 0

Cek Outlier

# Flag sleep-duration outliers with a |z| > 3 rule.
sleep_duration <- data[["Sleep.Duration..hours."]]
# scale() centers and standardizes the vector, returning a one-column matrix.
z_scores <- scale(sleep_duration)
outliers <- which(abs(z_scores) > 3)
# Display the flagged rows (the recorded run found none).
data[outliers, ]
##  [1] Person.ID                             Gender                               
##  [3] Age                                   Occupation                           
##  [5] Sleep.Duration..hours.                Quality.of.Sleep..scale..1.10.       
##  [7] Physical.Activity.Level..minutes.day. Stress.Level..scale..1.10.           
##  [9] BMI.Category                          Blood.Pressure..systolic.diastolic.  
## [11] Heart.Rate..bpm.                      Daily.Steps                          
## [13] Sleep.Disorder                       
## <0 rows> (or 0-length row.names)

Memisahkan Kolom Blood Pressure Menjadi Systolic dan Diastolic

# Split the "systolic/diastolic" text column into two columns; convert = TRUE
# lets tidyr turn the resulting pieces into integers.
data <- separate(
  data,
  col = Blood.Pressure..systolic.diastolic.,
  into = c("Systolic", "Diastolic"),
  sep = "/",
  convert = TRUE
)

Statistik Deskriptif

# Descriptive statistics for the numeric columns only. vapply() yields a
# guaranteed logical mask, unlike sapply(), whose return type can vary.
summary(data[vapply(data, is.numeric, logical(1))])
##    Person.ID          Age        Sleep.Duration..hours.
##  Min.   :  1.0   Min.   :18.00   Min.   : 4.100        
##  1st Qu.:100.8   1st Qu.:29.00   1st Qu.: 5.900        
##  Median :200.5   Median :40.00   Median : 8.200        
##  Mean   :200.5   Mean   :39.95   Mean   : 8.041        
##  3rd Qu.:300.2   3rd Qu.:49.00   3rd Qu.:10.125        
##  Max.   :400.0   Max.   :90.00   Max.   :12.000        
##  Quality.of.Sleep..scale..1.10. Physical.Activity.Level..minutes.day.
##  Min.   : 1.000                 Min.   : 10.00                       
##  1st Qu.: 4.700                 1st Qu.: 35.00                       
##  Median : 6.100                 Median : 65.50                       
##  Mean   : 6.126                 Mean   : 64.98                       
##  3rd Qu.: 7.425                 3rd Qu.: 94.00                       
##  Max.   :10.000                 Max.   :120.00                       
##  Stress.Level..scale..1.10.    Systolic       Diastolic     Heart.Rate..bpm.
##  Min.   : 1.000             Min.   :109.0   Min.   :60.00   Min.   : 50.00  
##  1st Qu.: 3.000             1st Qu.:115.0   1st Qu.:66.00   1st Qu.: 63.00  
##  Median : 5.000             Median :122.0   Median :73.00   Median : 77.00  
##  Mean   : 5.473             Mean   :122.2   Mean   :73.04   Mean   : 75.99  
##  3rd Qu.: 8.000             3rd Qu.:128.0   3rd Qu.:79.00   3rd Qu.: 90.00  
##  Max.   :10.000             Max.   :145.0   Max.   :96.00   Max.   :100.00  
##   Daily.Steps   
##  Min.   : 2067  
##  1st Qu.: 6165  
##  Median :11786  
##  Mean   :11077  
##  3rd Qu.:15878  
##  Max.   :19958

Visualisasi Awal (EDA)

Distribusi Durasi Tidur

# Histogram of sleep duration in 1-hour bins, with each bin's count printed
# above its bar.
ggplot(data, aes(x = `Sleep.Duration..hours.`)) +
  geom_histogram(binwidth = 1, fill = "steelblue", color = "black") +
  # after_stat(count) replaces the `..count..` notation, which has been
  # deprecated since ggplot2 3.4.0 and triggered a lifecycle warning.
  stat_bin(
    binwidth = 1,
    geom = "text",
    aes(label = after_stat(count)),
    vjust = -0.5
  ) +
  scale_x_continuous(breaks = seq(0, 13, 1)) +
  labs(
    title = "Distribusi Durasi Tidur",
    x = "Durasi Tidur (jam)",
    y = "Jumlah Individu"
  ) +
  theme_minimal()

Distribusi Sleep Disorder

# Bar chart of how many individuals fall into each Sleep.Disorder category.
ggplot(data = data, mapping = aes(x = Sleep.Disorder)) +
  geom_bar(fill = "tomato") +
  labs(
    title = "Distribusi Sleep Disorder",
    x = "Jenis Gangguan Tidur",
    y = "Jumlah Individu"
  ) +
  theme_minimal()

Encoding Kategorikal dan Cek Skewness

# Convert the four categorical text columns to factors in one step.
data <- data %>%
  mutate(
    across(c(Gender, Occupation, BMI.Category, Sleep.Disorder), as.factor)
  )

# Names of the numeric measurement columns (Person.ID is deliberately left out).
num_cols <- c(
  "Age",
  "Sleep.Duration..hours.",
  "Quality.of.Sleep..scale..1.10.",
  "Physical.Activity.Level..minutes.day.",
  "Stress.Level..scale..1.10.",
  "Heart.Rate..bpm.",
  "Systolic",
  "Diastolic",
  "Daily.Steps"
)

# Reshape to long form so each variable gets its own histogram panel; free
# scales are needed because the variables are measured in different units.
data %>%
  dplyr::select(all_of(num_cols)) %>%
  pivot_longer(
    cols = everything(),
    names_to = "Variabel",
    values_to = "Nilai"
  ) %>%
  ggplot(mapping = aes(x = Nilai)) +
  geom_histogram(bins = 30, fill = "steelblue", color = "black") +
  facet_wrap(~ Variabel, scales = "free", ncol = 3) +
  theme_minimal()

Transformasi dan Normalisasi

# Transform three columns (log for Systolic and Age, square root for
# Diastolic) and drop the stress-level column from the working data.
# dplyr::select is namespaced because MASS masks select().
data_transformed <- data %>%
  mutate(
    Systolic = log(Systolic),
    Diastolic = sqrt(Diastolic),
    Age = log(Age)
  ) %>%
  dplyr::select(-`Stress.Level..scale..1.10.`)

# Numeric columns that remain after removing the stress level.
num_cols_updated <- c(
  "Age",
  "Sleep.Duration..hours.",
  "Quality.of.Sleep..scale..1.10.",
  "Physical.Activity.Level..minutes.day.",
  "Heart.Rate..bpm.",
  "Systolic",
  "Diastolic",
  "Daily.Steps"
)

# Remove the row identifier, then standardize every numeric column to mean 0
# and standard deviation 1.
# NOTE(review): the scaling statistics come from the full dataset, i.e. before
# the train/test split performed later in the file - confirm this is intended.
data_normalized <- data_transformed %>% dplyr::select(-Person.ID)
data_normalized[num_cols_updated] <- scale(data_normalized[num_cols_updated])

Uji Normalitas dan Homogenitas

# Multivariate normality check (MVN::mvn) on the standardized numeric columns,
# requesting a Q-Q plot via multivariatePlot = "qq".
mvn(data = data_normalized[num_cols_updated], multivariatePlot = "qq")

## $multivariateNormality
##            Test       HZ p value MVN
## 1 Henze-Zirkler 1.370701       0  NO
## 
## $univariateNormality
##               Test                              Variable Statistic   p value
## 1 Anderson-Darling                  Age                     3.2674  <0.001  
## 2 Anderson-Darling        Sleep.Duration..hours.            6.1299  <0.001  
## 3 Anderson-Darling    Quality.of.Sleep..scale..1.10.        0.4740  0.2403  
## 4 Anderson-Darling Physical.Activity.Level..minutes.day.    5.1860  <0.001  
## 5 Anderson-Darling           Heart.Rate..bpm.               5.6207  <0.001  
## 6 Anderson-Darling               Systolic                   1.5722   5e-04  
## 7 Anderson-Darling               Diastolic                  2.4130  <0.001  
## 8 Anderson-Darling              Daily.Steps                 6.9134  <0.001  
##   Normality
## 1    NO    
## 2    NO    
## 3    YES   
## 4    NO    
## 5    NO    
## 6    NO    
## 7    NO    
## 8    NO    
## 
## $Descriptives
##                                         n          Mean Std.Dev      Median
## Age                                   400 -3.453408e-16       1  0.17933727
## Sleep.Duration..hours.                400  8.825541e-17       1  0.06640073
## Quality.of.Sleep..scale..1.10.        400 -1.193307e-17       1 -0.01303314
## Physical.Activity.Level..minutes.day. 400  1.722025e-17       1  0.01594532
## Heart.Rate..bpm.                      400  3.252693e-16       1  0.06689037
## Systolic                              400 -1.714887e-15       1  0.01032986
## Diastolic                             400 -1.329076e-15       1  0.02599014
## Daily.Steps                           400 -4.046378e-17       1  0.13215617
##                                             Min      Max       25th      75th
## Age                                   -1.980542 2.372819 -0.6905128 0.7282710
## Sleep.Duration..hours.                -1.648516 1.655835 -0.8956255 0.8715749
## Quality.of.Sleep..scale..1.10.        -2.594354 1.960918 -0.7216309 0.6576041
## Physical.Activity.Level..minutes.day. -1.702434 1.703363 -0.9283893 0.8983563
## Heart.Rate..bpm.                      -1.721268 1.590136 -0.8603029 0.9278555
## Systolic                              -1.670664 2.587065 -0.8712295 0.7265890
## Diastolic                             -1.503583 2.429408 -0.7789446 0.6856899
## Daily.Steps                           -1.679378 1.655515 -0.9154619 0.8950007
##                                              Skew   Kurtosis
## Age                                   -0.32628119 -0.6220320
## Sleep.Duration..hours.                -0.06329684 -1.2895248
## Quality.of.Sleep..scale..1.10.        -0.11632665 -0.3664791
## Physical.Activity.Level..minutes.day.  0.03774603 -1.2436784
## Heart.Rate..bpm.                      -0.08445252 -1.2543068
## Systolic                               0.16547902 -0.6016516
## Diastolic                              0.29480978 -0.6645804
## Daily.Steps                           -0.07826105 -1.3160737
# Repeat the multivariate normality test separately within each Sleep.Disorder
# class and print only the multivariate result table.
for (label in levels(data_normalized$Sleep.Disorder)) {
  cat("\nNormalitas untuk kelas:", label, "\n")
  in_class <- data_normalized$Sleep.Disorder == label
  hasil <- mvn(
    data_normalized[in_class, num_cols_updated],
    multivariatePlot = "none"
  )
  print(hasil$multivariateNormality)
}
## 
## Normalitas untuk kelas: Insomnia 
##            Test       HZ      p value MVN
## 1 Henze-Zirkler 1.055827 5.368583e-05  NO
## 
## Normalitas untuk kelas: None 
##            Test       HZ p value MVN
## 1 Henze-Zirkler 1.272359       0  NO
## 
## Normalitas untuk kelas: Sleep Apnea 
##            Test       HZ   p value MVN
## 1 Henze-Zirkler 0.959814 0.1254576 YES
# Box's M test for homogeneity of covariance matrices of the numeric columns
# across the Sleep.Disorder classes.
box_m_test <- boxM(data_normalized[, num_cols_updated], data_normalized$Sleep.Disorder)
print(box_m_test)
## 
##  Box's M-test for Homogeneity of Covariance Matrices
## 
## data:  data_normalized[, num_cols_updated]
## Chi-Sq (approx.) = 55.581, df = 72, p-value = 0.9239

Uji ANOVA

# One-way ANOVA of each numeric variable against Sleep.Disorder, printing the
# ANOVA table for every variable in turn.
for (var in num_cols_updated) {
  # Build "<var> ~ Sleep.Disorder" without pasting strings together.
  anova_formula <- reformulate("Sleep.Disorder", response = var)
  model <- aov(anova_formula, data = data_normalized)
  cat("\nANOVA untuk:", var, "\n")
  print(summary(model))
}
## 
## ANOVA untuk: Age 
##                 Df Sum Sq Mean Sq F value Pr(>F)
## Sleep.Disorder   2    0.3  0.1338   0.133  0.875
## Residuals      397  398.7  1.0044               
## 
## ANOVA untuk: Sleep.Duration..hours. 
##                 Df Sum Sq Mean Sq F value Pr(>F)
## Sleep.Disorder   2    0.9  0.4602   0.459  0.632
## Residuals      397  398.1  1.0027               
## 
## ANOVA untuk: Quality.of.Sleep..scale..1.10. 
##                 Df Sum Sq Mean Sq F value Pr(>F)
## Sleep.Disorder   2    0.2  0.0805    0.08  0.923
## Residuals      397  398.8  1.0046               
## 
## ANOVA untuk: Physical.Activity.Level..minutes.day. 
##                 Df Sum Sq Mean Sq F value Pr(>F)  
## Sleep.Disorder   2      9   4.517   4.598 0.0106 *
## Residuals      397    390   0.982                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## ANOVA untuk: Heart.Rate..bpm. 
##                 Df Sum Sq Mean Sq F value Pr(>F)
## Sleep.Disorder   2      0  0.0184   0.018  0.982
## Residuals      397    399  1.0049               
## 
## ANOVA untuk: Systolic 
##                 Df Sum Sq Mean Sq F value Pr(>F)
## Sleep.Disorder   2    0.4  0.2018   0.201  0.818
## Residuals      397  398.6  1.0040               
## 
## ANOVA untuk: Diastolic 
##                 Df Sum Sq Mean Sq F value Pr(>F)
## Sleep.Disorder   2    0.5  0.2702   0.269  0.764
## Residuals      397  398.5  1.0037               
## 
## ANOVA untuk: Daily.Steps 
##                 Df Sum Sq Mean Sq F value Pr(>F)  
## Sleep.Disorder   2    5.7  2.8382   2.865 0.0582 .
## Residuals      397  393.3  0.9907                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Pemisahan Data

# Seeded 80/20 split on the outcome column; `split` is a logical vector that
# is TRUE for rows assigned to the training set.
set.seed(123)
split <- sample.split(data_normalized$Sleep.Disorder, SplitRatio = 0.8)
trainData <- data_normalized[split, ]
testData <- data_normalized[!split, ]

Latih dan Evaluasi Model LDA

# Fit linear discriminant analysis on the training set, using every remaining
# column as a predictor of Sleep.Disorder, then print priors, group means and
# discriminant coefficients.
lda_model <- lda(Sleep.Disorder ~ ., data = trainData)
print(lda_model)
## Call:
## lda(Sleep.Disorder ~ ., data = trainData)
## 
## Prior probabilities of groups:
##    Insomnia        None Sleep Apnea 
##    0.196875    0.725000    0.078125 
## 
## Group means:
##             GenderMale         Age OccupationOffice Worker OccupationRetired
## Insomnia     0.4920635  0.01169231               0.3650794         0.1746032
## None         0.5129310  0.04077412               0.2198276         0.2241379
## Sleep Apnea  0.4800000 -0.01543298               0.3200000         0.2400000
##             OccupationStudent Sleep.Duration..hours.
## Insomnia            0.2539683            -0.05509741
## None                0.2974138            -0.01653257
## Sleep Apnea         0.2800000             0.15507445
##             Quality.of.Sleep..scale..1.10.
## Insomnia                        0.03597419
## None                           -0.01761459
## Sleep Apnea                    -0.18107204
##             Physical.Activity.Level..minutes.day. BMI.CategoryObese
## Insomnia                               -0.1386179         0.2063492
## None                                    0.1009568         0.2586207
## Sleep Apnea                            -0.3289690         0.2400000
##             BMI.CategoryOverweight BMI.CategoryUnderweight    Systolic
## Insomnia                 0.2222222               0.3015873 -0.01395367
## None                     0.2931034               0.2284483  0.05306806
## Sleep Apnea              0.2400000               0.2800000 -0.03809873
##              Diastolic Heart.Rate..bpm. Daily.Steps
## Insomnia    0.08992091       0.01537963   0.2445957
## None        0.02066944      -0.02474418  -0.1324443
## Sleep Apnea 0.04407762      -0.06821493   0.2202155
## 
## Coefficients of linear discriminants:
##                                                LD1         LD2
## GenderMale                             0.096390867 -0.14145704
## Age                                    0.197846581  2.54158608
## OccupationOffice Worker               -1.286575714  0.07506759
## OccupationRetired                     -0.328392771  1.12804726
## OccupationStudent                     -0.276342275  0.62333798
## Sleep.Duration..hours.                -0.068738595  0.50483986
## Quality.of.Sleep..scale..1.10.         0.006125075 -0.43519535
## Physical.Activity.Level..minutes.day.  0.433586450 -0.38899267
## BMI.CategoryObese                      0.767470321  4.02148114
## BMI.CategoryOverweight                 0.560102621  0.23964715
## BMI.CategoryUnderweight               -0.394729431 -0.01010480
## Systolic                               0.499464929 -2.84990010
## Diastolic                             -0.764019132 -0.29977483
## Heart.Rate..bpm.                      -0.050107041 -0.10102298
## Daily.Steps                           -0.579080873  0.02012286
## 
## Proportion of trace:
##   LD1   LD2 
## 0.857 0.143
# Predict the held-out rows and compare the predicted classes (first argument)
# with the true labels (second argument).
lda_pred <- predict(lda_model, testData)
confusionMatrix(lda_pred$class, testData$Sleep.Disorder)
## Confusion Matrix and Statistics
## 
##              Reference
## Prediction    Insomnia None Sleep Apnea
##   Insomnia           3    1           1
##   None              13   57           5
##   Sleep Apnea        0    0           0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.75            
##                  95% CI : (0.6406, 0.8401)
##     No Information Rate : 0.725           
##     P-Value [Acc > NIR] : 0.3598434       
##                                           
##                   Kappa : 0.1878          
##                                           
##  Mcnemar's Test P-Value : 0.0009908       
## 
## Statistics by Class:
## 
##                      Class: Insomnia Class: None Class: Sleep Apnea
## Sensitivity                   0.1875      0.9828              0.000
## Specificity                   0.9688      0.1818              1.000
## Pos Pred Value                0.6000      0.7600                NaN
## Neg Pred Value                0.8267      0.8000              0.925
## Prevalence                    0.2000      0.7250              0.075
## Detection Rate                0.0375      0.7125              0.000
## Detection Prevalence          0.0625      0.9375              0.000
## Balanced Accuracy             0.5781      0.5823              0.500

Latih dan Evaluasi Model QDA

# Fit quadratic discriminant analysis on the same training set and predictors
# as the LDA model, then print priors and group means.
qda_model <- qda(Sleep.Disorder ~ ., data = trainData)
print(qda_model)
## Call:
## qda(Sleep.Disorder ~ ., data = trainData)
## 
## Prior probabilities of groups:
##    Insomnia        None Sleep Apnea 
##    0.196875    0.725000    0.078125 
## 
## Group means:
##             GenderMale         Age OccupationOffice Worker OccupationRetired
## Insomnia     0.4920635  0.01169231               0.3650794         0.1746032
## None         0.5129310  0.04077412               0.2198276         0.2241379
## Sleep Apnea  0.4800000 -0.01543298               0.3200000         0.2400000
##             OccupationStudent Sleep.Duration..hours.
## Insomnia            0.2539683            -0.05509741
## None                0.2974138            -0.01653257
## Sleep Apnea         0.2800000             0.15507445
##             Quality.of.Sleep..scale..1.10.
## Insomnia                        0.03597419
## None                           -0.01761459
## Sleep Apnea                    -0.18107204
##             Physical.Activity.Level..minutes.day. BMI.CategoryObese
## Insomnia                               -0.1386179         0.2063492
## None                                    0.1009568         0.2586207
## Sleep Apnea                            -0.3289690         0.2400000
##             BMI.CategoryOverweight BMI.CategoryUnderweight    Systolic
## Insomnia                 0.2222222               0.3015873 -0.01395367
## None                     0.2931034               0.2284483  0.05306806
## Sleep Apnea              0.2400000               0.2800000 -0.03809873
##              Diastolic Heart.Rate..bpm. Daily.Steps
## Insomnia    0.08992091       0.01537963   0.2445957
## None        0.02066944      -0.02474418  -0.1324443
## Sleep Apnea 0.04407762      -0.06821493   0.2202155
# Predict the held-out rows with the QDA model and tabulate predictions
# against the true labels.
qda_pred <- predict(qda_model, testData)
confusionMatrix(qda_pred$class, testData$Sleep.Disorder)
## Confusion Matrix and Statistics
## 
##              Reference
## Prediction    Insomnia None Sleep Apnea
##   Insomnia           3   10           1
##   None              13   47           5
##   Sleep Apnea        0    1           0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.625           
##                  95% CI : (0.5096, 0.7308)
##     No Information Rate : 0.725           
##     P-Value [Acc > NIR] : 0.9810          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : 0.2553          
## 
## Statistics by Class:
## 
##                      Class: Insomnia Class: None Class: Sleep Apnea
## Sensitivity                   0.1875      0.8103             0.0000
## Specificity                   0.8281      0.1818             0.9865
## Pos Pred Value                0.2143      0.7231             0.0000
## Neg Pred Value                0.8030      0.2667             0.9241
## Prevalence                    0.2000      0.7250             0.0750
## Detection Rate                0.0375      0.5875             0.0000
## Detection Prevalence          0.1750      0.8125             0.0125
## Balanced Accuracy             0.5078      0.4961             0.4932

Visualisasi Hasil LDA

# Attach the two discriminant scores of each test row to testData, then plot
# them colored by the true Sleep.Disorder label.
ld_scores <- lda_pred$x
testData$LDA1 <- ld_scores[, 1]
testData$LDA2 <- ld_scores[, 2]

ggplot(
  data = testData,
  mapping = aes(x = LDA1, y = LDA2, color = Sleep.Disorder)
) +
  geom_point(alpha = 0.7) +
  labs(
    title = "Visualisasi LDA",
    x = "LDA1",
    y = "LDA2"
  ) +
  theme_minimal()

Analisis Regresi Logistik Multinomial

# Reload the raw file with readr (original column names, including spaces, are
# kept) and prepare it in a single pipeline:
#   1. turn the categorical columns into factors,
#   2. split blood pressure into numeric Systolic and Diastolic columns,
#   3. drop rows whose target (Sleep Disorder) is missing.
data <- read_csv("sleep_health_lifestyle_dataset.csv") %>%
  mutate(
    across(
      c(Gender, Occupation, `BMI Category`, `Sleep Disorder`),
      as.factor
    )
  ) %>%
  separate(
    col = `Blood Pressure (systolic/diastolic)`,
    into = c("Systolic", "Diastolic"),
    sep = "/",
    convert = TRUE
  ) %>%
  filter(!is.na(`Sleep Disorder`))

# Inspect the final result
glimpse(data)
## Rows: 400
## Columns: 14
## $ `Person ID`                             <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,…
## $ Gender                                  <fct> Male, Female, Male, Male, Male…
## $ Age                                     <dbl> 29, 43, 44, 29, 67, 47, 22, 49…
## $ Occupation                              <fct> Manual Labor, Retired, Retired…
## $ `Sleep Duration (hours)`                <dbl> 7.4, 4.2, 6.1, 8.3, 9.1, 6.1, …
## $ `Quality of Sleep (scale: 1-10)`        <dbl> 7.0, 4.9, 6.0, 10.0, 9.5, 6.9,…
## $ `Physical Activity Level (minutes/day)` <dbl> 41, 41, 107, 20, 19, 24, 26, 4…
## $ `Stress Level (scale: 1-10)`            <dbl> 7, 5, 4, 10, 4, 4, 6, 8, 8, 5,…
## $ `BMI Category`                          <fct> Obese, Obese, Underweight, Obe…
## $ Systolic                                <int> 124, 131, 122, 124, 133, 123, …
## $ Diastolic                               <int> 70, 86, 70, 72, 78, 60, 70, 87…
## $ `Heart Rate (bpm)`                      <dbl> 91, 81, 81, 55, 97, 87, 66, 59…
## $ `Daily Steps`                           <dbl> 8539, 18754, 2857, 6886, 14945…
## $ `Sleep Disorder`                        <fct> None, None, None, None, Insomn…
# Summarize the prepared data; factor columns now report per-level counts.
summary(data)
##    Person ID        Gender         Age                Occupation 
##  Min.   :  1.0   Female:201   Min.   :18.00   Manual Labor : 96  
##  1st Qu.:100.8   Male  :199   1st Qu.:29.00   Office Worker: 99  
##  Median :200.5                Median :40.00   Retired      : 95  
##  Mean   :200.5                Mean   :39.95   Student      :110  
##  3rd Qu.:300.2                3rd Qu.:49.00                      
##  Max.   :400.0                Max.   :90.00                      
##  Sleep Duration (hours) Quality of Sleep (scale: 1-10)
##  Min.   : 4.100         Min.   : 1.000                
##  1st Qu.: 5.900         1st Qu.: 4.700                
##  Median : 8.200         Median : 6.100                
##  Mean   : 8.041         Mean   : 6.126                
##  3rd Qu.:10.125         3rd Qu.: 7.425                
##  Max.   :12.000         Max.   :10.000                
##  Physical Activity Level (minutes/day) Stress Level (scale: 1-10)
##  Min.   : 10.00                        Min.   : 1.000            
##  1st Qu.: 35.00                        1st Qu.: 3.000            
##  Median : 65.50                        Median : 5.000            
##  Mean   : 64.98                        Mean   : 5.473            
##  3rd Qu.: 94.00                        3rd Qu.: 8.000            
##  Max.   :120.00                        Max.   :10.000            
##       BMI Category    Systolic       Diastolic     Heart Rate (bpm)
##  Normal     : 91   Min.   :109.0   Min.   :60.00   Min.   : 50.00  
##  Obese      : 98   1st Qu.:115.0   1st Qu.:66.00   1st Qu.: 63.00  
##  Overweight :109   Median :122.0   Median :73.00   Median : 77.00  
##  Underweight:102   Mean   :122.2   Mean   :73.04   Mean   : 75.99  
##                    3rd Qu.:128.0   3rd Qu.:79.00   3rd Qu.: 90.00  
##                    Max.   :145.0   Max.   :96.00   Max.   :100.00  
##   Daily Steps        Sleep Disorder
##  Min.   : 2067   Insomnia   : 79   
##  1st Qu.: 6165   None       :290   
##  Median :11786   Sleep Apnea: 31   
##  Mean   :11077                     
##  3rd Qu.:15878                     
##  Max.   :19958

Build Model Multinomial Logistic Regression

# Split into training and test sets: seeded 80/20 split on the outcome column.
set.seed(123)
library(caTools)
split <- sample.split(data$`Sleep Disorder`, SplitRatio = 0.8)
train <- subset(data, split == TRUE)
test <- subset(data, split == FALSE)

library(nnet)
# Fit the multinomial logistic regression. `Person ID` is only a row
# identifier, so it is excluded from the predictors; the LDA/QDA section drops
# Person.ID for the same reason. train and test keep the column, so code that
# uses them elsewhere is unaffected.
# NOTE: any recorded output shown after this call was produced with
# `Person ID` still in the model; re-run to refresh it.
model <- multinom(`Sleep Disorder` ~ . - `Person ID`, data = train)
## # weights:  57 (36 variable)
## initial  value 351.555932 
## iter  10 value 282.353428
## iter  20 value 261.649692
## iter  30 value 224.185005
## iter  40 value 223.099062
## iter  50 value 222.607334
## iter  60 value 222.495284
## final  value 222.244355 
## converged
# Print the fitted coefficients and their standard errors (one row per
# non-reference class), followed by the residual deviance and AIC.
summary(model)
## Call:
## multinom(formula = `Sleep Disorder` ~ ., data = train)
## 
## Coefficients:
##             (Intercept)   `Person ID`  GenderMale        Age
## None           -72.7237 -0.0001723193  0.08308344 -0.3609286
## Sleep Apnea    -98.7701 -0.0006566064 -0.13423185 -0.4970458
##             OccupationOffice Worker OccupationRetired OccupationStudent
## None                    -0.88906646        -0.1078120       -0.08346798
## Sleep Apnea             -0.01741841         0.6164179        0.42203309
##             `Sleep Duration (hours)` `Quality of Sleep (scale: 1-10)`
## None                     0.005992069                      -0.03360876
## Sleep Apnea              0.108782414                      -0.10355854
##             `Physical Activity Level (minutes/day)`
## None                                    0.007184875
## Sleep Apnea                            -0.007680839
##             `Stress Level (scale: 1-10)` `BMI Category`Obese
## None                        -0.002084572           -6.785851
## Sleep Apnea                  0.135571838           -9.742138
##             `BMI Category`Overweight `BMI Category`Underweight  Systolic
## None                     0.367060206                -0.3147506 0.7869565
## Sleep Apnea             -0.009587405                -0.1621588 0.9910614
##               Diastolic `Heart Rate (bpm)` `Daily Steps`
## None        -0.06673796       -0.003848549 -7.447352e-05
## Sleep Apnea -0.01349845       -0.008306396  3.718087e-07
## 
## Std. Errors:
##              (Intercept) `Person ID`   GenderMale        Age
## None        0.0003043853 0.001308302 0.0010032387 0.01455552
## Sleep Apnea 0.0005469014 0.002158540 0.0004533345 0.02311583
##             OccupationOffice Worker OccupationRetired OccupationStudent
## None                    0.001129647      0.0006082703      0.0009284263
## Sleep Apnea             0.001031989      0.0003201091      0.0004660763
##             `Sleep Duration (hours)` `Quality of Sleep (scale: 1-10)`
## None                      0.05042876                       0.05936647
## Sleep Apnea               0.01807641                       0.02331854
##             `Physical Activity Level (minutes/day)`
## None                                    0.004638267
## Sleep Apnea                             0.007791381
##             `Stress Level (scale: 1-10)` `BMI Category`Obese
## None                          0.04378828         0.001845208
## Sleep Apnea                   0.01791826         0.002697322
##             `BMI Category`Overweight `BMI Category`Underweight   Systolic
## None                    0.0007632439               0.001207743 0.02031453
## Sleep Apnea             0.0011977052               0.001100462 0.03164274
##              Diastolic `Heart Rate (bpm)` `Daily Steps`
## None        0.03087102        0.009785213  2.892987e-05
## Sleep Apnea 0.04969513        0.015853833  4.767726e-05
## 
## Residual Deviance: 444.4887 
## AIC: 516.4887

Evaluasi Model

# Predict a class label for every held-out row; "class" is the default
# prediction type for a multinom fit and is spelled out here.
pred <- predict(model, newdata = test, type = "class")

# Cross-tabulate predicted labels (rows) against true labels (columns).
# NOTE(review): this table shares its name with caret's confusionMatrix()
# function. Calls to that function still resolve, because R skips
# non-function bindings when looking up a call, but a different name would
# be clearer.
confusionMatrix <- table(Predicted = pred, Actual = test[["Sleep Disorder"]])
print(confusionMatrix)
##              Actual
## Predicted     Insomnia None Sleep Apnea
##   Insomnia           2    1           1
##   None              14   57           5
##   Sleep Apnea        0    0           0

Akurasi

# Overall accuracy: correctly classified rows (the diagonal of the table)
# divided by all rows, reported as a percentage with two decimals.
n_correct <- sum(diag(confusionMatrix))
n_total <- sum(confusionMatrix)
accuracy <- n_correct / n_total
accuracy_pct <- round(accuracy * 100, 2)
print(paste("Accuracy:", accuracy_pct, "%"))
## [1] "Accuracy: 73.75 %"

Hitung Precision, Recall, dan F1-score per kelas

# Put predictions and true labels on one explicit, shared set of factor levels
# so both arguments passed to caret are aligned class-for-class.
class_levels <- levels(test$`Sleep Disorder`)
pred <- factor(pred, levels = class_levels)
actual <- factor(test$`Sleep Disorder`, levels = class_levels)

# Build the confusion matrix and per-class statistics with caret.
# The function is namespace-qualified because the bare name `confusionMatrix`
# is also bound to the table created during model evaluation; the explicit
# `caret::` prefix makes the call independent of that variable.
conf <- caret::confusionMatrix(data = pred, reference = actual)
print(conf)
## Confusion Matrix and Statistics
## 
##              Reference
## Prediction    Insomnia None Sleep Apnea
##   Insomnia           2    1           1
##   None              14   57           5
##   Sleep Apnea        0    0           0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7375          
##                  95% CI : (0.6271, 0.8296)
##     No Information Rate : 0.725           
##     P-Value [Acc > NIR] : 0.4576322       
##                                           
##                   Kappa : 0.1286          
##                                           
##  Mcnemar's Test P-Value : 0.0006229       
## 
## Statistics by Class:
## 
##                      Class: Insomnia Class: None Class: Sleep Apnea
## Sensitivity                   0.1250      0.9828              0.000
## Specificity                   0.9688      0.1364              1.000
## Pos Pred Value                0.5000      0.7500                NaN
## Neg Pred Value                0.8158      0.7500              0.925
## Prevalence                    0.2000      0.7250              0.075
## Detection Rate                0.0250      0.7125              0.000
## Detection Prevalence          0.0500      0.9500              0.000
## Balanced Accuracy             0.5469      0.5596              0.500

Hitung precision, recall, dan F1-score per kelas dengan fungsi MLmetrics

classes <- levels(actual)

# Print one "<label> <value rounded to 3 decimals>" line.
show_metric <- function(label, value) {
  cat(label, round(value, 3), "\n")
}

# One-vs-rest precision, recall and F1 for each class in turn.
# Precision (and therefore F1) comes out as NaN for a class that is never
# predicted, because there are no positive predictions to divide by.
for (cls in classes) {
  precision <- Precision(y_true = actual, y_pred = pred, positive = cls)
  recall <- Recall(y_true = actual, y_pred = pred, positive = cls)
  f1 <- F1_Score(y_true = actual, y_pred = pred, positive = cls)

  cat("\nKelas:", cls, "\n")
  show_metric("Precision:", precision)
  show_metric("Recall   :", recall)
  show_metric("F1-Score :", f1)
}
## 
## Kelas: Insomnia 
## Precision: 0.5 
## Recall   : 0.125 
## F1-Score : 0.2 
## 
## Kelas: None 
## Precision: 0.75 
## Recall   : 0.983 
## F1-Score : 0.851 
## 
## Kelas: Sleep Apnea 
## Precision: NaN 
## Recall   : 0 
## F1-Score : NaN