#install.packages("readr")
#install.packages("dplyr")
#install.packages("ggplot2")
#install.packages("corrplot")
#install.packages("pls")
#install.packages("caret")
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(corrplot)
## corrplot 0.95 loaded
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:corrplot':
##
## corrplot
## The following object is masked from 'package:stats':
##
## loadings
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:pls':
##
## R2
# Load the sleep health and lifestyle dataset into `df`.
# The path is relative, so the CSV must be in the current working directory.
df <- read_csv("Sleep_health_and_lifestyle_dataset.csv")
## Rows: 374 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Gender, Occupation, BMI Category, Blood Pressure, Sleep Disorder
## dbl (8): Person ID, Age, Sleep Duration, Quality of Sleep, Physical Activity...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first rows to check that the file was parsed as expected.
head(df)
## # A tibble: 6 × 13
## `Person ID` Gender Age Occupation `Sleep Duration` `Quality of Sleep`
## <dbl> <chr> <dbl> <chr> <dbl> <dbl>
## 1 1 Male 27 Software Engineer 6.1 6
## 2 2 Male 28 Doctor 6.2 6
## 3 3 Male 28 Doctor 6.2 6
## 4 4 Male 28 Sales Representa… 5.9 4
## 5 5 Male 28 Sales Representa… 5.9 4
## 6 6 Male 28 Software Engineer 5.9 4
## # ℹ 7 more variables: `Physical Activity Level` <dbl>, `Stress Level` <dbl>,
## # `BMI Category` <chr>, `Blood Pressure` <chr>, `Heart Rate` <dbl>,
## # `Daily Steps` <dbl>, `Sleep Disorder` <chr>
# Inspect the structure of the data: column names, types and sample values.
str(df)
## spc_tbl_ [374 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Person ID : num [1:374] 1 2 3 4 5 6 7 8 9 10 ...
## $ Gender : chr [1:374] "Male" "Male" "Male" "Male" ...
## $ Age : num [1:374] 27 28 28 28 28 28 29 29 29 29 ...
## $ Occupation : chr [1:374] "Software Engineer" "Doctor" "Doctor" "Sales Representative" ...
## $ Sleep Duration : num [1:374] 6.1 6.2 6.2 5.9 5.9 5.9 6.3 7.8 7.8 7.8 ...
## $ Quality of Sleep : num [1:374] 6 6 6 4 4 4 6 7 7 7 ...
## $ Physical Activity Level: num [1:374] 42 60 60 30 30 30 40 75 75 75 ...
## $ Stress Level : num [1:374] 6 8 8 8 8 8 7 6 6 6 ...
## $ BMI Category : chr [1:374] "Overweight" "Normal" "Normal" "Obese" ...
## $ Blood Pressure : chr [1:374] "126/83" "125/80" "125/80" "140/90" ...
## $ Heart Rate : num [1:374] 77 75 75 85 85 85 82 70 70 70 ...
## $ Daily Steps : num [1:374] 4200 10000 10000 3000 3000 3000 3500 8000 8000 8000 ...
## $ Sleep Disorder : chr [1:374] "None" "None" "None" "Sleep Apnea" ...
## - attr(*, "spec")=
## .. cols(
## .. `Person ID` = col_double(),
## .. Gender = col_character(),
## .. Age = col_double(),
## .. Occupation = col_character(),
## .. `Sleep Duration` = col_double(),
## .. `Quality of Sleep` = col_double(),
## .. `Physical Activity Level` = col_double(),
## .. `Stress Level` = col_double(),
## .. `BMI Category` = col_character(),
## .. `Blood Pressure` = col_character(),
## .. `Heart Rate` = col_double(),
## .. `Daily Steps` = col_double(),
## .. `Sleep Disorder` = col_character()
## .. )
## - attr(*, "problems")=<pointer: 0x000001f1319ff6b0>
# Check the dimensions of the data (number of rows, number of columns).
dim(df)
## [1] 374 13
# Descriptive statistics for every column.
summary(df)
## Person ID Gender Age Occupation
## Min. : 1.00 Length :374 Min. :27.00 Length :374
## 1st Qu.: 94.25 N.unique : 2 1st Qu.:35.25 N.unique : 11
## Median :187.50 N.blank : 0 Median :43.00 N.blank : 0
## Mean :187.50 Min.nchar: 4 Mean :42.18 Min.nchar: 5
## 3rd Qu.:280.75 Max.nchar: 6 3rd Qu.:50.00 Max.nchar: 20
## Max. :374.00 Max. :59.00
## Sleep Duration Quality of Sleep Physical Activity Level Stress Level
## Min. :5.800 Min. :4.000 Min. :30.00 Min. :3.000
## 1st Qu.:6.400 1st Qu.:6.000 1st Qu.:45.00 1st Qu.:4.000
## Median :7.200 Median :7.000 Median :60.00 Median :5.000
## Mean :7.132 Mean :7.313 Mean :59.17 Mean :5.385
## 3rd Qu.:7.800 3rd Qu.:8.000 3rd Qu.:75.00 3rd Qu.:7.000
## Max. :8.500 Max. :9.000 Max. :90.00 Max. :8.000
## BMI Category Blood Pressure Heart Rate Daily Steps
## Length :374 Length :374 Min. :65.00 Min. : 3000
## N.unique : 4 N.unique : 25 1st Qu.:68.00 1st Qu.: 5600
## N.blank : 0 N.blank : 0 Median :70.00 Median : 7000
## Min.nchar: 5 Min.nchar: 6 Mean :70.17 Mean : 6817
## Max.nchar: 13 Max.nchar: 6 3rd Qu.:72.00 3rd Qu.: 8000
## Max. :86.00 Max. :10000
## Sleep Disorder
## Length :374
## N.unique : 3
## N.blank : 0
## Min.nchar: 4
## Max.nchar: 11
##
# Count missing values (NA) per column.
colSums(is.na(df))
## Person ID Gender Age
## 0 0 0
## Occupation Sleep Duration Quality of Sleep
## 0 0 0
## Physical Activity Level Stress Level BMI Category
## 0 0 0
## Blood Pressure Heart Rate Daily Steps
## 0 0 0
## Sleep Disorder
## 0
# Drop the Person ID column (a running row identifier, not a predictor),
# then preview the remaining columns.
df <- select(df, -`Person ID`)
head(df)
## # A tibble: 6 × 12
## Gender Age Occupation `Sleep Duration` `Quality of Sleep`
## <chr> <dbl> <chr> <dbl> <dbl>
## 1 Male 27 Software Engineer 6.1 6
## 2 Male 28 Doctor 6.2 6
## 3 Male 28 Doctor 6.2 6
## 4 Male 28 Sales Representative 5.9 4
## 5 Male 28 Sales Representative 5.9 4
## 6 Male 28 Software Engineer 5.9 4
## # ℹ 7 more variables: `Physical Activity Level` <dbl>, `Stress Level` <dbl>,
## # `BMI Category` <chr>, `Blood Pressure` <chr>, `Heart Rate` <dbl>,
## # `Daily Steps` <dbl>, `Sleep Disorder` <chr>
# Encode the categorical BMI Category as an ordinal score
# (1 = normal, 2 = overweight, 3 = obese) so it can be used as a numeric
# column in the correlation matrix and the PLS model.
#
# as.numeric(as.factor(...)) is deliberately not used: factor levels are
# sorted alphabetically, which codes "Obese" below "Overweight" and makes
# "Normal" and "Normal Weight" two different steps of the scale.
# NOTE(review): "Normal" and "Normal Weight" are assumed to denote the same
# category -- confirm against the data dictionary.
bmi_scores <- c(
  "Normal" = 1,
  "Normal Weight" = 1,
  "Overweight" = 2,
  "Obese" = 3
)

# Fail loudly on labels that have no score instead of silently producing NA.
bmi_known <- df$`BMI Category` %in% names(bmi_scores)
if (!all(bmi_known)) {
  stop(
    "Unexpected BMI Category value(s): ",
    paste(unique(df$`BMI Category`[!bmi_known]), collapse = ", "),
    call. = FALSE
  )
}

df$`BMI Category` <- unname(bmi_scores[df$`BMI Category`])
# NOTE(review): any printed results in this file that involve BMI Category
# were produced with the old alphabetical coding; re-run to refresh them.
# Histogram of Quality of Sleep (10 bins) to see how the response is
# distributed.
ggplot(data = df, mapping = aes(x = `Quality of Sleep`)) +
  geom_histogram(bins = 10)
# Scatter plot of Stress Level (x) against Quality of Sleep (y).
ggplot(
  data = df,
  mapping = aes(x = `Stress Level`, y = `Quality of Sleep`)
) +
  geom_point()
# Box plot of Quality of Sleep for each BMI Category.
# BMI Category is numeric at this point, so it is wrapped in factor() to get
# one box per category instead of a continuous x axis.
ggplot(
  data = df,
  mapping = aes(x = factor(`BMI Category`), y = `Quality of Sleep`)
) +
  geom_boxplot()
# Correlation heatmap of all numeric columns.
# vapply() (rather than sapply()) guarantees a logical vector as column mask.
numeric_data <- df[, vapply(df, is.numeric, logical(1))]
cor_matrix <- cor(numeric_data)

# The corrplot:: prefix is required: pls is attached after corrplot and masks
# corrplot() with its own function of the same name.
# Only the upper triangle is drawn, with the coefficients printed in black.
corrplot::corrplot(
  cor_matrix,
  method = "color",
  type = "upper",
  addCoef.col = "black",
  tl.col = "black",
  tl.cex = 0.7,
  number.cex = 0.5
)
### Dependent and independent variables
# Independent variables (predictors): five columns of df.
X <- select(
  df,
  `Physical Activity Level`,
  `Stress Level`,
  `BMI Category`,
  `Daily Steps`,
  `Heart Rate`
)

# Dependent variable (response): Quality of Sleep as a plain vector.
Y <- df[["Quality of Sleep"]]
# Fit a partial least squares regression of Y on every column of X.
# - data.frame(Y, X) turns the column names into syntactic ones
#   (e.g. `Stress Level` becomes Stress.Level), which is how the predictors
#   are labelled in the model output.
# - scale = TRUE scales the predictors, which are on very different scales
#   (Daily Steps is in the thousands, Stress Level is a single digit).
# - validation = "CV" cross-validates on randomly drawn segments, so the RNG
#   is seeded first; without it the RMSEP estimates change on every run.
set.seed(123)
pls_model <- plsr(
  Y ~ .,
  data = data.frame(Y, X),
  scale = TRUE,
  validation = "CV"
)
# Model summary: cross-validated RMSEP and the percentage of variance in X
# and Y explained for each number of components.
summary(pls_model)
## Data: X dimension: 374 5
## Y dimension: 374 1
## Fit method: kernelpls
## Number of components considered: 5
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps
## CV 1.199 0.5301 0.4702 0.4412 0.4358 0.4313
## adjCV 1.199 0.5296 0.4696 0.4408 0.4355 0.4310
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps
## X 35.14 59.11 81.82 98.10 100.00
## Y 80.68 85.26 86.78 87.02 87.33
# Validation plot: cross-validated MSEP against the number of components.
validationplot(pls_model, val.type = "MSEP")
# Cross-validated RMSEP (CV and bias-adjusted adjCV) per number of components.
RMSEP(pls_model)
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps
## CV 1.199 0.5301 0.4702 0.4412 0.4358 0.4313
## adjCV 1.199 0.5296 0.4696 0.4408 0.4355 0.4310
# Regression coefficients of the fitted model. No ncomp is given; the output
# below shows the fit that uses all 5 components.
coef(pls_model)
## , , 5 comps
##
## Y
## Physical.Activity.Level 0.17006882
## Stress.Level -0.99168461
## BMI.Category -0.19837465
## Daily.Steps 0.07029009
## Heart.Rate -0.08790154
# Collect the PLS coefficients in a two-column data frame
# (Variabel = predictor name, Koefisien = coefficient) for plotting.
coef_df <- local({
  # coef() returns a predictors x responses x components array; the first
  # dimnames entry holds the predictor names.
  beta <- coef(pls_model)
  data.frame(
    Variabel = dimnames(beta)[[1]],
    Koefisien = as.numeric(beta)
  )
})
coef_df
## Variabel Koefisien
## 1 Physical.Activity.Level 0.17006882
## 2 Stress.Level -0.99168461
## 3 BMI.Category -0.19837465
## 4 Daily.Steps 0.07029009
## 5 Heart.Rate -0.08790154
# Bar chart of the PLS coefficient of each predictor.
# The x axis labels are rotated 45 degrees so the long names do not overlap.
ggplot(data = coef_df, mapping = aes(x = Variabel, y = Koefisien)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))