Import Library

#install.packages("readr")
#install.packages("dplyr")
#install.packages("ggplot2")
#install.packages("corrplot")
#install.packages("pls")
#install.packages("caret")

library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(corrplot)
## corrplot 0.95 loaded
library(pls)
## 
## Attaching package: 'pls'
## The following object is masked from 'package:corrplot':
## 
##     corrplot
## The following object is masked from 'package:stats':
## 
##     loadings
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:pls':
## 
##     R2

Load dataset

# Read the sleep health and lifestyle CSV from the working directory into `df`.
df <- read_csv(file = "Sleep_health_and_lifestyle_dataset.csv")
## Rows: 374 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Gender, Occupation, BMI Category, Blood Pressure, Sleep Disorder
## dbl (8): Person ID, Age, Sleep Duration, Quality of Sleep, Physical Activity...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first rows of the freshly loaded data.
df %>% head()
## # A tibble: 6 × 13
##   `Person ID` Gender   Age Occupation        `Sleep Duration` `Quality of Sleep`
##         <dbl> <chr>  <dbl> <chr>                        <dbl>              <dbl>
## 1           1 Male      27 Software Engineer              6.1                  6
## 2           2 Male      28 Doctor                         6.2                  6
## 3           3 Male      28 Doctor                         6.2                  6
## 4           4 Male      28 Sales Representa…              5.9                  4
## 5           5 Male      28 Sales Representa…              5.9                  4
## 6           6 Male      28 Software Engineer              5.9                  4
## # ℹ 7 more variables: `Physical Activity Level` <dbl>, `Stress Level` <dbl>,
## #   `BMI Category` <chr>, `Blood Pressure` <chr>, `Heart Rate` <dbl>,
## #   `Daily Steps` <dbl>, `Sleep Disorder` <chr>

Melihat informasi dataset

# Inspect the structure of the data: column names, types and first values.
df %>% str()
## spc_tbl_ [374 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Person ID              : num [1:374] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Gender                 : chr [1:374] "Male" "Male" "Male" "Male" ...
##  $ Age                    : num [1:374] 27 28 28 28 28 28 29 29 29 29 ...
##  $ Occupation             : chr [1:374] "Software Engineer" "Doctor" "Doctor" "Sales Representative" ...
##  $ Sleep Duration         : num [1:374] 6.1 6.2 6.2 5.9 5.9 5.9 6.3 7.8 7.8 7.8 ...
##  $ Quality of Sleep       : num [1:374] 6 6 6 4 4 4 6 7 7 7 ...
##  $ Physical Activity Level: num [1:374] 42 60 60 30 30 30 40 75 75 75 ...
##  $ Stress Level           : num [1:374] 6 8 8 8 8 8 7 6 6 6 ...
##  $ BMI Category           : chr [1:374] "Overweight" "Normal" "Normal" "Obese" ...
##  $ Blood Pressure         : chr [1:374] "126/83" "125/80" "125/80" "140/90" ...
##  $ Heart Rate             : num [1:374] 77 75 75 85 85 85 82 70 70 70 ...
##  $ Daily Steps            : num [1:374] 4200 10000 10000 3000 3000 3000 3500 8000 8000 8000 ...
##  $ Sleep Disorder         : chr [1:374] "None" "None" "None" "Sleep Apnea" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   `Person ID` = col_double(),
##   ..   Gender = col_character(),
##   ..   Age = col_double(),
##   ..   Occupation = col_character(),
##   ..   `Sleep Duration` = col_double(),
##   ..   `Quality of Sleep` = col_double(),
##   ..   `Physical Activity Level` = col_double(),
##   ..   `Stress Level` = col_double(),
##   ..   `BMI Category` = col_character(),
##   ..   `Blood Pressure` = col_character(),
##   ..   `Heart Rate` = col_double(),
##   ..   `Daily Steps` = col_double(),
##   ..   `Sleep Disorder` = col_character()
##   .. )
##  - attr(*, "problems")=<pointer: 0x000001f1319ff6b0>
# Dimensions of the data (rows, columns)
df %>% dim()
## [1] 374  13
# Descriptive statistics for every column
df %>% summary()
##    Person ID            Gender         Age            Occupation 
##  Min.   :  1.00   Length   :374   Min.   :27.00   Length   :374  
##  1st Qu.: 94.25   N.unique :  2   1st Qu.:35.25   N.unique : 11  
##  Median :187.50   N.blank  :  0   Median :43.00   N.blank  :  0  
##  Mean   :187.50   Min.nchar:  4   Mean   :42.18   Min.nchar:  5  
##  3rd Qu.:280.75   Max.nchar:  6   3rd Qu.:50.00   Max.nchar: 20  
##  Max.   :374.00                   Max.   :59.00                  
##  Sleep Duration  Quality of Sleep Physical Activity Level  Stress Level  
##  Min.   :5.800   Min.   :4.000    Min.   :30.00           Min.   :3.000  
##  1st Qu.:6.400   1st Qu.:6.000    1st Qu.:45.00           1st Qu.:4.000  
##  Median :7.200   Median :7.000    Median :60.00           Median :5.000  
##  Mean   :7.132   Mean   :7.313    Mean   :59.17           Mean   :5.385  
##  3rd Qu.:7.800   3rd Qu.:8.000    3rd Qu.:75.00           3rd Qu.:7.000  
##  Max.   :8.500   Max.   :9.000    Max.   :90.00           Max.   :8.000  
##     BMI Category   Blood Pressure   Heart Rate     Daily Steps   
##  Length   :374   Length   :374    Min.   :65.00   Min.   : 3000  
##  N.unique :  4   N.unique : 25    1st Qu.:68.00   1st Qu.: 5600  
##  N.blank  :  0   N.blank  :  0    Median :70.00   Median : 7000  
##  Min.nchar:  5   Min.nchar:  6    Mean   :70.17   Mean   : 6817  
##  Max.nchar: 13   Max.nchar:  6    3rd Qu.:72.00   3rd Qu.: 8000  
##                                   Max.   :86.00   Max.   :10000  
##    Sleep Disorder
##  Length   :374   
##  N.unique :  3   
##  N.blank  :  0   
##  Min.nchar:  4   
##  Max.nchar: 11   
## 

Cek missing value

# Count the missing values in each column.
df %>%
  is.na() %>%
  colSums()
##               Person ID                  Gender                     Age 
##                       0                       0                       0 
##              Occupation          Sleep Duration        Quality of Sleep 
##                       0                       0                       0 
## Physical Activity Level            Stress Level            BMI Category 
##                       0                       0                       0 
##          Blood Pressure              Heart Rate             Daily Steps 
##                       0                       0                       0 
##          Sleep Disorder 
##                       0

Data Preprocessing

# Drop the `Person ID` column so it is not picked up as a variable by the
# numeric-column steps that follow.
df <- select(df, -`Person ID`)

# Preview the remaining columns.
df %>% head()
## # A tibble: 6 × 12
##   Gender   Age Occupation           `Sleep Duration` `Quality of Sleep`
##   <chr>  <dbl> <chr>                           <dbl>              <dbl>
## 1 Male      27 Software Engineer                 6.1                  6
## 2 Male      28 Doctor                            6.2                  6
## 3 Male      28 Doctor                            6.2                  6
## 4 Male      28 Sales Representative              5.9                  4
## 5 Male      28 Sales Representative              5.9                  4
## 6 Male      28 Software Engineer                 5.9                  4
## # ℹ 7 more variables: `Physical Activity Level` <dbl>, `Stress Level` <dbl>,
## #   `BMI Category` <chr>, `Blood Pressure` <chr>, `Heart Rate` <dbl>,
## #   `Daily Steps` <dbl>, `Sleep Disorder` <chr>
# Encode the categorical BMI Category column as an ordinal score.
#
# as.numeric(as.factor(x)) numbers the labels in alphabetical order, which
# would rank "Obese" below "Overweight" and give the two spellings of the
# normal class different codes. That arbitrary scale then flows into the
# correlation matrix and the PLS fit. An explicit map keeps the order
# meaningful: normal (1) < overweight (2) < obese (3).
#
# NOTE(review): assumes the four labels are "Normal", "Normal Weight",
# "Overweight" and "Obese", and that the two normal spellings denote the same
# class -- confirm against the raw data. Any other label stops the script
# instead of being coded silently.
# NOTE(review): outputs recorded with the alphabetical coding must be
# regenerated after this change.
df$`BMI Category` <- local({
  bmi_scores <- c(
    "Normal" = 1,
    "Normal Weight" = 1,
    "Overweight" = 2,
    "Obese" = 3
  )
  labels <- df$`BMI Category`

  # Missing labels are allowed through and stay NA in the encoded column.
  is_unmapped <- !is.na(labels) & !(labels %in% names(bmi_scores))
  if (any(is_unmapped)) {
    stop(
      "Unmapped BMI Category value(s): ",
      paste(unique(labels[is_unmapped]), collapse = ", "),
      call. = FALSE
    )
  }

  # Look each label up by name; unname() leaves a plain numeric vector.
  unname(bmi_scores[labels])
})

EDA

# Histogram of Quality of Sleep (10 bins)
df %>%
  ggplot(mapping = aes(x = `Quality of Sleep`)) +
  geom_histogram(bins = 10)

# Scatterplot of Stress Level against Quality of Sleep
df %>%
  ggplot(mapping = aes(x = `Stress Level`, y = `Quality of Sleep`)) +
  geom_point()

# Boxplot of Quality of Sleep per BMI Category.
# BMI Category is numeric at this point, so it is wrapped in factor() to get
# one box per category code.
df %>%
  ggplot(mapping = aes(x = factor(`BMI Category`), y = `Quality of Sleep`)) +
  geom_boxplot()

# Correlation heatmap of all numeric columns.
# vapply() always yields a logical vector of one flag per column, whereas
# sapply() changes its return type on edge-case input (e.g. a list for a
# zero-column frame), so the column mask stays type-stable.
numeric_data <- df[, vapply(df, is.numeric, logical(1))]

# Pairwise correlations between the numeric columns (cor() defaults).
cor_matrix <- cor(numeric_data)

# The namespace prefix is required: pls is attached after corrplot and masks
# corrplot() with its own function of the same name.
corrplot::corrplot(
  cor_matrix,
  method = "color",
  type = "upper",
  addCoef.col = "black",
  tl.col = "black",
  tl.cex = 0.7,
  number.cex = 0.5
)

### Variabel dependen dan independen

# Independent variables (predictors)
X <- select(
  df,
  `Physical Activity Level`,
  `Stress Level`,
  `BMI Category`,
  `Daily Steps`,
  `Heart Rate`
)

# Dependent variable (response), extracted as a plain vector
Y <- df[["Quality of Sleep"]]

Model PLS

# Fit a partial least squares regression of Y on every column of X.
# - data.frame(Y, X) puts response and predictors in one frame; it also turns
#   the predictor names into syntactic ones (spaces become dots).
# - scale = TRUE asks plsr() to scale the predictors before fitting.
# - validation = "CV" cross-validates on randomly drawn segments, so the RNG
#   seed is fixed first; without it the RMSEP table changes on every run.
set.seed(123)
pls_model <- plsr(
  Y ~ .,
  data = data.frame(Y, X),
  scale = TRUE,
  validation = "CV"
)

# Cross-validated RMSEP and explained variance per number of components.
summary(pls_model)
## Data:    X dimension: 374 5 
##  Y dimension: 374 1
## Fit method: kernelpls
## Number of components considered: 5
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps
## CV           1.199   0.5301   0.4702   0.4412   0.4358   0.4313
## adjCV        1.199   0.5296   0.4696   0.4408   0.4355   0.4310
## 
## TRAINING: % variance explained
##    1 comps  2 comps  3 comps  4 comps  5 comps
## X    35.14    59.11    81.82    98.10   100.00
## Y    80.68    85.26    86.78    87.02    87.33

Jumlah komponen terbaik

# Validation plot: cross-validated MSEP against the number of components
validationplot(object = pls_model, val.type = "MSEP")

# RMSEP table (CV and bias-adjusted CV) for each number of components
RMSEP(object = pls_model)
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps
## CV           1.199   0.5301   0.4702   0.4412   0.4358   0.4313
## adjCV        1.199   0.5296   0.4696   0.4408   0.4355   0.4310

Koefisien model

# Regression coefficients of the fitted model; with no `ncomp` given, the
# printout is for the model using all fitted components.
coef(object = pls_model)
## , , 5 comps
## 
##                                   Y
## Physical.Activity.Level  0.17006882
## Stress.Level            -0.99168461
## BMI.Category            -0.19837465
## Daily.Steps              0.07029009
## Heart.Rate              -0.08790154
# Collect the coefficients into a two-column table for plotting:
# predictor name and coefficient value.
pls_coef <- coef(pls_model)
coef_df <- data.frame(
  Variabel = rownames(pls_coef),
  # as.vector() flattens the coefficient array and drops its dimnames
  Koefisien = as.vector(pls_coef)
)

coef_df
##                  Variabel   Koefisien
## 1 Physical.Activity.Level  0.17006882
## 2            Stress.Level -0.99168461
## 3            BMI.Category -0.19837465
## 4             Daily.Steps  0.07029009
## 5              Heart.Rate -0.08790154

Visualisasi Koefisien Variabel

# Bar chart of the coefficient of each predictor.
ggplot(
  data = coef_df,
  mapping = aes(x = Variabel, y = Koefisien)
) +
  # stat = "identity" draws the bar heights straight from Koefisien
  geom_bar(stat = "identity") +
  # Tilt the predictor names so long labels do not overlap
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1)
  )