Introduction ———————-

Sleep is an important part of human life. While we sleep, our brains consolidate the knowledge we acquired during the day, and we recover our energy.

However, it is now very common for adults not to get enough good-quality sleep, perhaps because of their occupations, daily stress, or bad habits.

So, can machine learning help them get a better night’s sleep?

Goals

Train a machine learning model that predicts, from a person’s lifestyle, whether they have a sleep disorder (insomnia or sleep apnea) or none.

Exploring Dataset

Libraries ———————-

library(here)
library(tidyverse)

Dataset ———————-

# Read the raw sleep-health CSV; the path is built with here::here() from the
# project-relative location data/raw/.
data_raw <- here::here("data/raw/Sleep_health_and_lifestyle_dataset.csv") |>
  readr::read_csv()
# Preview the first five rows.
head(data_raw, n = 5)
## # A tibble: 5 × 13
##   `Person ID` Gender   Age Occupation        `Sleep Duration` `Quality of Sleep`
##         <dbl> <chr>  <dbl> <chr>                        <dbl>              <dbl>
## 1           1 Male      27 Software Engineer              6.1                  6
## 2           2 Male      28 Doctor                         6.2                  6
## 3           3 Male      28 Doctor                         6.2                  6
## 4           4 Male      28 Sales Representa…              5.9                  4
## 5           5 Male      28 Sales Representa…              5.9                  4
## # ℹ 7 more variables: `Physical Activity Level` <dbl>, `Stress Level` <dbl>,
## #   `BMI Category` <chr>, `Blood Pressure` <chr>, `Heart Rate` <dbl>,
## #   `Daily Steps` <dbl>, `Sleep Disorder` <chr>

Clean Dataset ———————-

Rename Dataset

# Column names exactly as read from the CSV, shown for comparison with the
# cleaned names printed after the rename step.
print("--------------Before Rename-----------------")
## [1] "--------------Before Rename-----------------"
names(data_raw)
##  [1] "Person ID"               "Gender"                 
##  [3] "Age"                     "Occupation"             
##  [5] "Sleep Duration"          "Quality of Sleep"       
##  [7] "Physical Activity Level" "Stress Level"           
##  [9] "BMI Category"            "Blood Pressure"         
## [11] "Heart Rate"              "Daily Steps"            
## [13] "Sleep Disorder"
print('--------------------After Rename------------------')
## [1] "--------------------After Rename------------------"
# Build snake_case column names (spaces -> underscores, lower case) on a copy,
# so `data_raw` keeps the headers it was read with.
# str_replace_all() is vectorised over the names, so no lapply() is needed, and
# `replacement` is spelled out instead of relying on partial argument matching.
data_renamed <- data_raw
colnames(data_renamed) <- colnames(data_raw) |>
  stringr::str_replace_all(pattern = " ", replacement = "_") |>
  tolower()

colnames(data_renamed)
##  [1] "person_id"               "gender"                 
##  [3] "age"                     "occupation"             
##  [5] "sleep_duration"          "quality_of_sleep"       
##  [7] "physical_activity_level" "stress_level"           
##  [9] "bmi_category"            "blood_pressure"         
## [11] "heart_rate"              "daily_steps"            
## [13] "sleep_disorder"

Convert character and integer columns to factors

# Columns that hold categories (text labels or numeric rating scales) and
# should therefore be stored as factors.
factor_columns <- c(
  "gender", "occupation", "bmi_category",
  "sleep_disorder", "stress_level", "quality_of_sleep"
)

# Convert every listed column in one pass; `data_renamed` itself is updated,
# and `data_renamed_factored` is the name used by the following steps.
data_renamed <- data_renamed |>
  dplyr::mutate(dplyr::across(dplyr::all_of(factor_columns), factor))

rm(factor_columns)
data_renamed_factored <- data_renamed

Recode the factor level “Normal Weight” as “Underweight”

# Relabel the "Normal Weight" level of `bmi_category` as "Underweight"; all
# other levels, and the level order, are left untouched.
# NOTE(review): confirm that "Underweight" (rather than merging into the
# existing "Normal" level) is the intended target for this label.
bmi_levels <- levels(data_renamed_factored$bmi_category)
bmi_levels[bmi_levels == "Normal Weight"] <- "Underweight"
levels(data_renamed_factored$bmi_category) <- bmi_levels
rm(bmi_levels)
summary(data_renamed_factored)
##    person_id         gender         age             occupation sleep_duration 
##  Min.   :  1.00   Female:185   Min.   :27.00   Nurse     :73   Min.   :5.800  
##  1st Qu.: 94.25   Male  :189   1st Qu.:35.25   Doctor    :71   1st Qu.:6.400  
##  Median :187.50                Median :43.00   Engineer  :63   Median :7.200  
##  Mean   :187.50                Mean   :42.18   Lawyer    :47   Mean   :7.132  
##  3rd Qu.:280.75                3rd Qu.:50.00   Teacher   :40   3rd Qu.:7.800  
##  Max.   :374.00                Max.   :59.00   Accountant:37   Max.   :8.500  
##                                                (Other)   :43                  
##  quality_of_sleep physical_activity_level stress_level      bmi_category
##  4:  5            Min.   :30.00           3:71         Normal     :195  
##  5:  7            1st Qu.:45.00           4:70         Underweight: 21  
##  6:105            Median :60.00           5:67         Obese      : 10  
##  7: 77            Mean   :59.17           6:46         Overweight :148  
##  8:109            3rd Qu.:75.00           7:50                          
##  9: 71            Max.   :90.00           8:70                          
##                                                                         
##  blood_pressure       heart_rate     daily_steps        sleep_disorder
##  Length:374         Min.   :65.00   Min.   : 3000   Insomnia   : 77   
##  Class :character   1st Qu.:68.00   1st Qu.: 5600   None       :219   
##  Mode  :character   Median :70.00   Median : 7000   Sleep Apnea: 78   
##                     Mean   :70.17   Mean   : 6817                     
##                     3rd Qu.:72.00   3rd Qu.: 8000                     
##                     Max.   :86.00   Max.   :10000                     
## 

Split blood pressure

# Names of the two columns produced by splitting `blood_pressure` on "/".
pressure_columns <- c("systolic_pressure", "diastolic_pressure")

# Split the text column into its two parts, then convert both parts from
# character to numeric.
data_renamed_factored_new_features <- data_renamed_factored |>
  tidyr::separate_wider_delim(
    cols = blood_pressure,
    delim = "/",
    names = pressure_columns
  ) |>
  dplyr::mutate(dplyr::across(dplyr::all_of(pressure_columns), as.numeric))

rm(pressure_columns)

summary(data_renamed_factored_new_features)
##    person_id         gender         age             occupation sleep_duration 
##  Min.   :  1.00   Female:185   Min.   :27.00   Nurse     :73   Min.   :5.800  
##  1st Qu.: 94.25   Male  :189   1st Qu.:35.25   Doctor    :71   1st Qu.:6.400  
##  Median :187.50                Median :43.00   Engineer  :63   Median :7.200  
##  Mean   :187.50                Mean   :42.18   Lawyer    :47   Mean   :7.132  
##  3rd Qu.:280.75                3rd Qu.:50.00   Teacher   :40   3rd Qu.:7.800  
##  Max.   :374.00                Max.   :59.00   Accountant:37   Max.   :8.500  
##                                                (Other)   :43                  
##  quality_of_sleep physical_activity_level stress_level      bmi_category
##  4:  5            Min.   :30.00           3:71         Normal     :195  
##  5:  7            1st Qu.:45.00           4:70         Underweight: 21  
##  6:105            Median :60.00           5:67         Obese      : 10  
##  7: 77            Mean   :59.17           6:46         Overweight :148  
##  8:109            3rd Qu.:75.00           7:50                          
##  9: 71            Max.   :90.00           8:70                          
##                                                                         
##  systolic_pressure diastolic_pressure   heart_rate     daily_steps   
##  Min.   :115.0     Min.   :75.00      Min.   :65.00   Min.   : 3000  
##  1st Qu.:125.0     1st Qu.:80.00      1st Qu.:68.00   1st Qu.: 5600  
##  Median :130.0     Median :85.00      Median :70.00   Median : 7000  
##  Mean   :128.6     Mean   :84.65      Mean   :70.17   Mean   : 6817  
##  3rd Qu.:135.0     3rd Qu.:90.00      3rd Qu.:72.00   3rd Qu.: 8000  
##  Max.   :142.0     Max.   :95.00      Max.   :86.00   Max.   :10000  
##                                                                      
##      sleep_disorder
##  Insomnia   : 77   
##  None       :219   
##  Sleep Apnea: 78   
##                    
##                    
##                    
## 

EDA ———————-

# Data set used for exploration and modelling: the cleaned table without the
# `person_id` column.
data_eda <- data_renamed_factored_new_features |>
  dplyr::select(-person_id)

Numeric Features

# Names of all numeric columns. `select(where(...))` replaces the superseded
# `select_if()`.
columns_numeric <- data_eda |>
  dplyr::select(where(is.numeric)) |>
  colnames()

# Draw one 30-bin histogram per numeric column. Inside a loop a ggplot object
# is not auto-printed, hence the explicit print().
for (x in columns_numeric) {
  g <- ggplot(data_eda) +
    geom_histogram(aes(.data[[x]]), bins = 30) +
    xlab(x)

  print(g)
}

# Drop the loop helpers from the workspace.
rm(columns_numeric, x, g)

Sleep Disorder For Categorical Features

# Names of all factor columns except the outcome. `select(where(...))`
# replaces the superseded `select_if()`.
columns_factor <- data_eda |>
  dplyr::select(where(is.factor), -sleep_disorder) |>
  colnames()

# For each categorical feature, draw the count of every sleep-disorder class
# with side-by-side bars filled by the feature's levels. Inside a loop a
# ggplot object is not auto-printed, hence the explicit print().
for (x in columns_factor) {
  g <- ggplot(data_eda) +
    geom_bar(
      aes(x = sleep_disorder, fill = .data[[x]]),
      position = "dodge2"
    )

  print(g)
}

# Drop the loop helpers from the workspace.
rm(columns_factor, x, g)

Train Models ———————-

Libraries

library(here)
library(tidymodels)
library(tidyverse)
library(rpart.plot)

Split Dataset

# Seed the RNG so the partition is reproducible, then split 80% / 20%,
# stratified on the outcome `sleep_disorder`.
set.seed(1)
split <- data_eda |>
  rsample::initial_split(prop = 0.80, strata = sleep_disorder)

# Extract the two partitions from the split object.
test_set <- rsample::testing(split)
train_set <- rsample::training(split)

Decision Tree

# Classification tree specification using the rpart engine.
model_decision_tree <- parsnip::decision_tree(
  mode = "classification",
  engine = "rpart"
)

# Name the outcome by its column (`sleep_disorder`) instead of
# `train_set$sleep_disorder`, so the whole formula is resolved inside `data`
# and matches the formula used for the random forest.
model_decision_tree_fit <- model_decision_tree |>
  fit(sleep_disorder ~ ., data = train_set)

# Pull out the underlying engine fit and plot the tree.
graph_model_decision_tree_fit <- extract_fit_engine(model_decision_tree_fit)
rpart.plot::rpart.plot(graph_model_decision_tree_fit)

SVM Linear

# Linear-kernel SVM classifier specification using the kernlab engine.
model_svm_linear <- parsnip::svm_linear(
  mode = "classification",
  engine = "kernlab"
)

# Name the outcome by its column (`sleep_disorder`) instead of
# `train_set$sleep_disorder`, so the whole formula is resolved inside `data`
# and matches the formula used for the random forest.
model_svm_linear_fit <- model_svm_linear |>
  fit(sleep_disorder ~ ., data = train_set)
##  Setting default kernel parameters
model_svm_linear_fit
## parsnip model object
## 
## Support Vector Machine object of class "ksvm" 
## 
## SV type: C-svc  (classification) 
##  parameter : cost C = 1 
## 
## Linear (vanilla) kernel function. 
## 
## Number of Support Vectors : 89 
## 
## Objective Function Value : -23.4147 -18.1426 -25.4937 
## Training error : 0.077181 
## Probability model included.

SVM Radial

# Radial-basis-kernel SVM classifier specification using the kernlab engine.
model_svm_radial <- parsnip::svm_rbf(
  mode = "classification",
  engine = "kernlab"
)

# Name the outcome by its column (`sleep_disorder`) instead of
# `train_set$sleep_disorder`, so the whole formula is resolved inside `data`
# and matches the formula used for the random forest.
model_svm_radial_fit <- model_svm_radial |>
  fit(sleep_disorder ~ ., data = train_set)
model_svm_radial_fit
## parsnip model object
## 
## Support Vector Machine object of class "ksvm" 
## 
## SV type: C-svc  (classification) 
##  parameter : cost C = 1 
## 
## Gaussian Radial Basis kernel function. 
##  Hyperparameter : sigma =  0.0930375075840404 
## 
## Number of Support Vectors : 93 
## 
## Objective Function Value : -31.1503 -25.7571 -31.9414 
## Training error : 0.087248 
## Probability model included.

Random Forest

# Random forest classifier specification using the randomForest engine.
model_random_forest <- rand_forest()
model_random_forest <- set_mode(model_random_forest, "classification")
model_random_forest <- set_engine(model_random_forest, "randomForest")

# Bundle the formula (outcome `sleep_disorder`, all other columns as
# predictors) with the model specification in a workflow.
model_random_forest_workflow <- workflow() |>
  add_formula(sleep_disorder ~ .) |>
  add_model(model_random_forest)

# Fit the workflow on the training partition.
model_random_forest_fit <- model_random_forest_workflow |>
  fit(train_set)

Evaluate Model ———————-

Preprocess

# Test partition split into predictors only and the true outcome only.
test_set_without_class <- dplyr::select(test_set, -sleep_disorder)
test_set_only_class <- dplyr::select(test_set, sleep_disorder)

# Metric bundle applied to every model: accuracy, Matthews correlation
# coefficient (mcc) and F-measure.
classification_metrics <- yardstick::metric_set(accuracy, mcc, f_meas)

Decision Tree

# Predict the held-out rows once and attach the true labels, so the confusion
# matrix and the metric table are both computed from the same predictions
# instead of calling predict() twice.
predictions_decision_tree <- model_decision_tree_fit |>
  predict(test_set_without_class) |>
  dplyr::bind_cols(test_set_only_class)

# Confusion matrix of predicted class against true class.
predictions_decision_tree |>
  yardstick::conf_mat(truth = sleep_disorder, estimate = .pred_class)
##              Truth
## Prediction    Insomnia None Sleep Apnea
##   Insomnia          10    0           0
##   None               6   44           1
##   Sleep Apnea        0    0          15
# Accuracy, MCC and F-measure on the test partition.
predictions_decision_tree |>
  classification_metrics(truth = sleep_disorder, estimate = .pred_class)
## # A tibble: 3 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy multiclass     0.908
## 2 mcc      multiclass     0.844
## 3 f_meas   macro          0.888

SVM Linear

# Predict the held-out rows once and attach the true labels, so the confusion
# matrix and the metric table are both computed from the same predictions
# instead of calling predict() twice.
predictions_svm_linear <- model_svm_linear_fit |>
  predict(test_set_without_class) |>
  dplyr::bind_cols(test_set_only_class)

# Confusion matrix of predicted class against true class.
predictions_svm_linear |>
  yardstick::conf_mat(truth = sleep_disorder, estimate = .pred_class)
##              Truth
## Prediction    Insomnia None Sleep Apnea
##   Insomnia          13    1           0
##   None               2   43           0
##   Sleep Apnea        1    0          16
# Accuracy, MCC and F-measure on the test partition.
predictions_svm_linear |>
  classification_metrics(truth = sleep_disorder, estimate = .pred_class)
## # A tibble: 3 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy multiclass     0.947
## 2 mcc      multiclass     0.909
## 3 f_meas   macro          0.934

SVM Radial

# Predict the held-out rows once and attach the true labels, so the confusion
# matrix and the metric table are both computed from the same predictions
# instead of calling predict() twice.
predictions_svm_radial <- model_svm_radial_fit |>
  predict(test_set_without_class) |>
  dplyr::bind_cols(test_set_only_class)

# Confusion matrix of predicted class against true class.
predictions_svm_radial |>
  yardstick::conf_mat(truth = sleep_disorder, estimate = .pred_class)
##              Truth
## Prediction    Insomnia None Sleep Apnea
##   Insomnia          11    0           0
##   None               2   44           0
##   Sleep Apnea        3    0          16
# Accuracy, MCC and F-measure on the test partition.
predictions_svm_radial |>
  classification_metrics(truth = sleep_disorder, estimate = .pred_class)
## # A tibble: 3 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy multiclass     0.934
## 2 mcc      multiclass     0.889
## 3 f_meas   macro          0.902

Random Forest

# Predict the held-out rows once and attach the true labels, so the confusion
# matrix and the metric table are both computed from the same predictions
# instead of calling predict() twice.
predictions_random_forest <- model_random_forest_fit |>
  predict(test_set_without_class) |>
  dplyr::bind_cols(test_set_only_class)

# Confusion matrix of predicted class against true class.
predictions_random_forest |>
  yardstick::conf_mat(truth = sleep_disorder, estimate = .pred_class)
##              Truth
## Prediction    Insomnia None Sleep Apnea
##   Insomnia          11    0           0
##   None               2   44           0
##   Sleep Apnea        3    0          16
# Accuracy, MCC and F-measure on the test partition.
predictions_random_forest |>
  classification_metrics(truth = sleep_disorder, estimate = .pred_class)
## # A tibble: 3 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy multiclass     0.934
## 2 mcc      multiclass     0.889
## 3 f_meas   macro          0.902

Conclusion

The model with the best test-set metrics was the SVM with a linear kernel. However, its performance could probably be improved further with hyperparameter tuning.