Sleep is an essential part of human life. While we sleep, our brains consolidate what we learned during the day and our bodies restore their energy.
However, it is now very common for adults to sleep poorly, whether because of their occupations, daily stress, or other unhealthy habits.
So, can machine learning help them get a better night’s sleep?
The goal of this analysis is to train a machine learning model capable of predicting whether someone has a sleep disorder based on their lifestyle.
library(here)
library(tidyverse)
data_raw = readr::read_csv(file = here::here('data/raw/Sleep_health_and_lifestyle_dataset.csv'))
data_raw |> head(5)
## # A tibble: 5 × 13
## `Person ID` Gender Age Occupation `Sleep Duration` `Quality of Sleep`
## <dbl> <chr> <dbl> <chr> <dbl> <dbl>
## 1 1 Male 27 Software Engineer 6.1 6
## 2 2 Male 28 Doctor 6.2 6
## 3 3 Male 28 Doctor 6.2 6
## 4 4 Male 28 Sales Representa… 5.9 4
## 5 5 Male 28 Sales Representa… 5.9 4
## # ℹ 7 more variables: `Physical Activity Level` <dbl>, `Stress Level` <dbl>,
## # `BMI Category` <chr>, `Blood Pressure` <chr>, `Heart Rate` <dbl>,
## # `Daily Steps` <dbl>, `Sleep Disorder` <chr>
print('--------------Before Rename-----------------')
## [1] "--------------Before Rename-----------------"
colnames(data_raw)
## [1] "Person ID" "Gender"
## [3] "Age" "Occupation"
## [5] "Sleep Duration" "Quality of Sleep"
## [7] "Physical Activity Level" "Stress Level"
## [9] "BMI Category" "Blood Pressure"
## [11] "Heart Rate" "Daily Steps"
## [13] "Sleep Disorder"
print('--------------------After Rename------------------')
## [1] "--------------------After Rename------------------"
# Standardise column names: replace spaces with underscores and convert to lower case
colnames(data_raw) = colnames(data_raw) |>
  stringr::str_replace_all(pattern = " ", replacement = "_") |>
  tolower()
data_renamed = data_raw
colnames(data_renamed)
## [1] "person_id" "gender"
## [3] "age" "occupation"
## [5] "sleep_duration" "quality_of_sleep"
## [7] "physical_activity_level" "stress_level"
## [9] "bmi_category" "blood_pressure"
## [11] "heart_rate" "daily_steps"
## [13] "sleep_disorder"
# Convert the categorical columns (and the ordinal score columns) to factors
transform_in_factor = c("gender", "occupation", "bmi_category", "sleep_disorder",
                        "stress_level", "quality_of_sleep")
for (x in transform_in_factor) {
  data_renamed[[x]] = factor(data_renamed[[x]])
}
rm(x, transform_in_factor)
data_renamed_factored = data_renamed
# Relabel the `Normal Weight` level of bmi_category as `Underweight`
data_renamed_factored$bmi_category <- recode(data_renamed_factored$bmi_category, `Normal Weight` = 'Underweight')
summary(data_renamed_factored)
## person_id gender age occupation sleep_duration
## Min. : 1.00 Female:185 Min. :27.00 Nurse :73 Min. :5.800
## 1st Qu.: 94.25 Male :189 1st Qu.:35.25 Doctor :71 1st Qu.:6.400
## Median :187.50 Median :43.00 Engineer :63 Median :7.200
## Mean :187.50 Mean :42.18 Lawyer :47 Mean :7.132
## 3rd Qu.:280.75 3rd Qu.:50.00 Teacher :40 3rd Qu.:7.800
## Max. :374.00 Max. :59.00 Accountant:37 Max. :8.500
## (Other) :43
## quality_of_sleep physical_activity_level stress_level bmi_category
## 4: 5 Min. :30.00 3:71 Normal :195
## 5: 7 1st Qu.:45.00 4:70 Underweight: 21
## 6:105 Median :60.00 5:67 Obese : 10
## 7: 77 Mean :59.17 6:46 Overweight :148
## 8:109 3rd Qu.:75.00 7:50
## 9: 71 Max. :90.00 8:70
##
## blood_pressure heart_rate daily_steps sleep_disorder
## Length:374 Min. :65.00 Min. : 3000 Insomnia : 77
## Class :character 1st Qu.:68.00 1st Qu.: 5600 None :219
## Mode :character Median :70.00 Median : 7000 Sleep Apnea: 78
## Mean :70.17 Mean : 6817
## 3rd Qu.:72.00 3rd Qu.: 8000
## Max. :86.00 Max. :10000
##
# Split blood_pressure ("systolic/diastolic") into two numeric columns
data_renamed_factored_new_features = data_renamed_factored |>
  separate_wider_delim(blood_pressure, "/", names = c("systolic_pressure", "diastolic_pressure"))
for (x in c("systolic_pressure", "diastolic_pressure")) {
  data_renamed_factored_new_features[[x]] = as.numeric(data_renamed_factored_new_features[[x]])
}
rm(x)
data_renamed_factored_new_features |> summary()
## person_id gender age occupation sleep_duration
## Min. : 1.00 Female:185 Min. :27.00 Nurse :73 Min. :5.800
## 1st Qu.: 94.25 Male :189 1st Qu.:35.25 Doctor :71 1st Qu.:6.400
## Median :187.50 Median :43.00 Engineer :63 Median :7.200
## Mean :187.50 Mean :42.18 Lawyer :47 Mean :7.132
## 3rd Qu.:280.75 3rd Qu.:50.00 Teacher :40 3rd Qu.:7.800
## Max. :374.00 Max. :59.00 Accountant:37 Max. :8.500
## (Other) :43
## quality_of_sleep physical_activity_level stress_level bmi_category
## 4: 5 Min. :30.00 3:71 Normal :195
## 5: 7 1st Qu.:45.00 4:70 Underweight: 21
## 6:105 Median :60.00 5:67 Obese : 10
## 7: 77 Mean :59.17 6:46 Overweight :148
## 8:109 3rd Qu.:75.00 7:50
## 9: 71 Max. :90.00 8:70
##
## systolic_pressure diastolic_pressure heart_rate daily_steps
## Min. :115.0 Min. :75.00 Min. :65.00 Min. : 3000
## 1st Qu.:125.0 1st Qu.:80.00 1st Qu.:68.00 1st Qu.: 5600
## Median :130.0 Median :85.00 Median :70.00 Median : 7000
## Mean :128.6 Mean :84.65 Mean :70.17 Mean : 6817
## 3rd Qu.:135.0 3rd Qu.:90.00 3rd Qu.:72.00 3rd Qu.: 8000
## Max. :142.0 Max. :95.00 Max. :86.00 Max. :10000
##
## sleep_disorder
## Insomnia : 77
## None :219
## Sleep Apnea: 78
##
##
##
##
data_eda = data_renamed_factored_new_features
# person_id is only an identifier, so drop it before exploring the data
data_eda = data_eda |> dplyr::select(-person_id)
# Histogram of every numeric column
columns_numeric = data_eda |> dplyr::select_if(is.numeric) |> colnames()
for (x in columns_numeric) {
  g = data_eda |>
    ggplot() +
    geom_histogram(aes(.data[[x]]), bins = 30) +
    xlab(x)
  print(g)
}
rm(columns_numeric, x, g)
# Bar charts of each categorical feature against sleep_disorder
columns_factor = data_eda |> dplyr::select_if(is.factor) |> dplyr::select(-sleep_disorder) |> colnames()
for (x in columns_factor) {
  g = data_eda |>
    ggplot() +
    geom_bar(aes(x = sleep_disorder, fill = .data[[x]]),
             position = 'dodge2')
  print(g)
}
rm(columns_factor, x, g)
library(here)
library(tidymodels)
library(tidyverse)
library(rpart.plot)
set.seed(1)
# 80/20 train/test split, stratified by the outcome to keep class proportions similar
split <- rsample::initial_split(data_eda, prop = .80, strata = sleep_disorder)
train_set <- rsample::training(split)
test_set <- rsample::testing(split)
# Decision tree (rpart engine)
model_decision_tree = parsnip::decision_tree(mode = 'classification', engine = 'rpart')
model_decision_tree_fit = model_decision_tree |> fit(sleep_disorder ~ ., data = train_set)
graph_model_decision_tree_fit <- extract_fit_engine(model_decision_tree_fit)
rpart.plot::rpart.plot(graph_model_decision_tree_fit)
# SVM with a linear kernel (kernlab engine)
model_svm_linear = parsnip::svm_linear(mode = 'classification', engine = 'kernlab')
model_svm_linear_fit = model_svm_linear |> fit(sleep_disorder ~ ., data = train_set)
## Setting default kernel parameters
model_svm_linear_fit
## parsnip model object
##
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 1
##
## Linear (vanilla) kernel function.
##
## Number of Support Vectors : 89
##
## Objective Function Value : -23.4147 -18.1426 -25.4937
## Training error : 0.077181
## Probability model included.
# SVM with a radial basis function (RBF) kernel (kernlab engine)
model_svm_radial = parsnip::svm_rbf(mode = 'classification', engine = 'kernlab')
model_svm_radial_fit = model_svm_radial |> fit(sleep_disorder ~ ., data = train_set)
model_svm_radial_fit
## parsnip model object
##
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 1
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.0930375075840404
##
## Number of Support Vectors : 93
##
## Objective Function Value : -31.1503 -25.7571 -31.9414
## Training error : 0.087248
## Probability model included.
# Random forest (randomForest engine)
model_random_forest = rand_forest() |>
set_engine('randomForest') |>
set_mode('classification')
model_random_forest_workflow = workflow() |>
add_model(model_random_forest) |>
add_formula(sleep_disorder ~ .)
model_random_forest_fit = fit(model_random_forest_workflow, train_set)
# Separate the test predictors from the true labels and define the evaluation metrics
test_set_without_class = subset(test_set, select = -c(sleep_disorder))
test_set_only_class = subset(test_set, select = c(sleep_disorder))
classification_metrics <- yardstick::metric_set(accuracy, mcc, f_meas)
predict(model_decision_tree_fit,test_set_without_class ) |>
dplyr::bind_cols(test_set_only_class) |>
yardstick::conf_mat(truth = sleep_disorder, estimate = .pred_class)
## Truth
## Prediction Insomnia None Sleep Apnea
## Insomnia 10 0 0
## None 6 44 1
## Sleep Apnea 0 0 15
predict(model_decision_tree_fit,test_set_without_class ) |>
dplyr::bind_cols(test_set_only_class) |>
classification_metrics(truth = sleep_disorder, estimate = .pred_class)
## # A tibble: 3 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy multiclass 0.908
## 2 mcc multiclass 0.844
## 3 f_meas macro 0.888
predict(model_svm_linear_fit,test_set_without_class ) |>
dplyr::bind_cols(test_set_only_class) |>
yardstick::conf_mat(truth = sleep_disorder, estimate = .pred_class)
## Truth
## Prediction Insomnia None Sleep Apnea
## Insomnia 13 1 0
## None 2 43 0
## Sleep Apnea 1 0 16
predict(model_svm_linear_fit,test_set_without_class ) |>
dplyr::bind_cols(test_set_only_class) |>
classification_metrics(truth = sleep_disorder, estimate = .pred_class)
## # A tibble: 3 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy multiclass 0.947
## 2 mcc multiclass 0.909
## 3 f_meas macro 0.934
predict(model_svm_radial_fit,test_set_without_class ) |>
dplyr::bind_cols(test_set_only_class) |>
yardstick::conf_mat(truth = sleep_disorder, estimate = .pred_class)
## Truth
## Prediction Insomnia None Sleep Apnea
## Insomnia 11 0 0
## None 2 44 0
## Sleep Apnea 3 0 16
predict(model_svm_radial_fit,test_set_without_class ) |>
dplyr::bind_cols(test_set_only_class) |>
classification_metrics(truth = sleep_disorder, estimate = .pred_class)
## # A tibble: 3 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy multiclass 0.934
## 2 mcc multiclass 0.889
## 3 f_meas macro 0.902
predict(model_random_forest_fit,test_set_without_class ) |>
dplyr::bind_cols(test_set_only_class) |>
yardstick::conf_mat(truth = sleep_disorder, estimate = .pred_class)
## Truth
## Prediction Insomnia None Sleep Apnea
## Insomnia 11 0 0
## None 2 44 0
## Sleep Apnea 3 0 16
predict(model_random_forest_fit,test_set_without_class ) |>
dplyr::bind_cols(test_set_only_class) |>
classification_metrics(truth = sleep_disorder, estimate = .pred_class)
## # A tibble: 3 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy multiclass 0.934
## 2 mcc multiclass 0.889
## 3 f_meas macro 0.902
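To compare the four models side by side, the per-model metrics above can also be collected into a single table. The snippet below is a minimal sketch that assumes the fitted model objects, the test-set splits, and the classification_metrics metric set defined above are still in scope; the models list and the model column are introduced here purely for illustration.
# Sketch: gather each model's test-set metrics into one tibble for comparison
models <- list(
  decision_tree = model_decision_tree_fit,
  svm_linear    = model_svm_linear_fit,
  svm_radial    = model_svm_radial_fit,
  random_forest = model_random_forest_fit
)

purrr::imap(models, function(fit, name) {
  predict(fit, test_set_without_class) |>
    dplyr::bind_cols(test_set_only_class) |>
    classification_metrics(truth = sleep_disorder, estimate = .pred_class) |>
    dplyr::mutate(model = name)
}) |>
  dplyr::bind_rows() |>
  dplyr::arrange(.metric, dplyr::desc(.estimate))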
The model with the best metrics was the SVM with a linear kernel. However, its performance could likely be improved further with hyperparameter tuning.
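As a rough illustration of what that tuning could look like, the sketch below tunes the cost parameter of the linear-kernel SVM with 5-fold cross-validation. It is only a sketch, assuming the train_set and classification_metrics objects defined above are still in scope; the grid size, number of folds, and names such as svm_linear_tuning_workflow are illustrative choices rather than part of the original analysis.
# Sketch: tune the SVM cost parameter via cross-validation (illustrative settings)
model_svm_linear_tune = parsnip::svm_linear(mode = 'classification', engine = 'kernlab',
                                            cost = tune())

svm_linear_tuning_workflow = workflow() |>
  add_model(model_svm_linear_tune) |>
  add_formula(sleep_disorder ~ .)

set.seed(1)
resamples_cv = rsample::vfold_cv(train_set, v = 5, strata = sleep_disorder)
svm_linear_grid = dials::grid_regular(dials::cost(), levels = 10)

svm_linear_tuning_results = tune::tune_grid(
  svm_linear_tuning_workflow,
  resamples = resamples_cv,
  grid = svm_linear_grid,
  metrics = classification_metrics
)

# Keep the cost with the best cross-validated accuracy and refit on the full training set
best_cost = tune::select_best(svm_linear_tuning_results, metric = 'accuracy')
model_svm_linear_tuned_fit = svm_linear_tuning_workflow |>
  tune::finalize_workflow(best_cost) |>
  fit(train_set)
The tuned model could then be evaluated on the test set with the same predict / classification_metrics pattern used above.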