library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ purrr 1.0.2
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the sleep health and lifestyle survey data into a tibble.
# NOTE(review): the path is absolute and specific to one Windows machine;
# the script only runs where the CSV exists at exactly this location.
sleep_health_and_lifestyle_dataset <- read_csv(
  file = "C:/Users/acer/Desktop/Sleep_health_and_lifestyle_dataset.csv"
)
## Rows: 374 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Gender, Occupation, BMI Category, Blood Pressure, Sleep Disorder
## dbl (8): Person ID, Age, Sleep Duration, Quality of Sleep, Physical Activity...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the data in the spreadsheet-style viewer for a quick visual check.
View(sleep_health_and_lifestyle_dataset)
# Record and report the dimensions of the dataset (rows, then columns).
no_of_rows <- dim(sleep_health_and_lifestyle_dataset)[1L]
print(no_of_rows)
## [1] 374
no_of_columns <- dim(sleep_health_and_lifestyle_dataset)[2L]
print(no_of_columns)
## [1] 13
# Preview the first ten observations.
head(sleep_health_and_lifestyle_dataset, n = 10)
## # A tibble: 10 × 13
## `Person ID` Gender Age Occupation `Sleep Duration` `Quality of Sleep`
## <dbl> <chr> <dbl> <chr> <dbl> <dbl>
## 1 1 Male 27 Software Engine… 6.1 6
## 2 2 Male 28 Doctor 6.2 6
## 3 3 Male 28 Doctor 6.2 6
## 4 4 Male 28 Sales Represent… 5.9 4
## 5 5 Male 28 Sales Represent… 5.9 4
## 6 6 Male 28 Software Engine… 5.9 4
## 7 7 Male 29 Teacher 6.3 6
## 8 8 Male 29 Doctor 7.8 7
## 9 9 Male 29 Doctor 7.8 7
## 10 10 Male 29 Doctor 7.8 7
## # ℹ 7 more variables: `Physical Activity Level` <dbl>, `Stress Level` <dbl>,
## # `BMI Category` <chr>, `Blood Pressure` <chr>, `Heart Rate` <dbl>,
## # `Daily Steps` <dbl>, `Sleep Disorder` <chr>
# Preview the last fifteen observations.
tail(sleep_health_and_lifestyle_dataset, n = 15)
## # A tibble: 15 × 13
## `Person ID` Gender Age Occupation `Sleep Duration` `Quality of Sleep`
## <dbl> <chr> <dbl> <chr> <dbl> <dbl>
## 1 360 Female 59 Nurse 8.1 9
## 2 361 Female 59 Nurse 8.2 9
## 3 362 Female 59 Nurse 8.2 9
## 4 363 Female 59 Nurse 8.2 9
## 5 364 Female 59 Nurse 8.2 9
## 6 365 Female 59 Nurse 8 9
## 7 366 Female 59 Nurse 8 9
## 8 367 Female 59 Nurse 8.1 9
## 9 368 Female 59 Nurse 8 9
## 10 369 Female 59 Nurse 8.1 9
## 11 370 Female 59 Nurse 8.1 9
## 12 371 Female 59 Nurse 8 9
## 13 372 Female 59 Nurse 8.1 9
## 14 373 Female 59 Nurse 8.1 9
## 15 374 Female 59 Nurse 8.1 9
## # ℹ 7 more variables: `Physical Activity Level` <dbl>, `Stress Level` <dbl>,
## # `BMI Category` <chr>, `Blood Pressure` <chr>, `Heart Rate` <dbl>,
## # `Daily Steps` <dbl>, `Sleep Disorder` <chr>
# STATISTICAL TECHNIQUES
# Central tendency of respondent age.
mean(sleep_health_and_lifestyle_dataset[["Age"]])
## [1] 42.18449
median(sleep_health_and_lifestyle_dataset[["Age"]])
## [1] 43
# Spread of respondent age: sample standard deviation and sample variance.
standard_deviation <- sd(sleep_health_and_lifestyle_dataset[["Age"]])
print(standard_deviation)
## [1] 8.673133
variance <- var(sleep_health_and_lifestyle_dataset[["Age"]])
print(variance)
## [1] 75.22324
# Total daily steps per occupation, ordered from the highest total to the
# lowest, then opened in the data viewer.
Occupation_Daily_steps <- sleep_health_and_lifestyle_dataset %>%
  group_by(Occupation) %>%
  summarise(Total_daily_steps = sum(`Daily Steps`)) %>%
  arrange(desc(Total_daily_steps))
View(Occupation_Daily_steps)
# Per-gender summary: average sleep duration, total daily steps, average
# sleep quality and average stress level. The result is opened in the viewer.
Gender_base <- sleep_health_and_lifestyle_dataset %>%
  group_by(Gender) %>%
  summarise(
    Average_sleep_duration = mean(`Sleep Duration`),
    Total_steps_taken = sum(`Daily Steps`),
    Average_quality_of_sleep = mean(`Quality of Sleep`),
    Average_Stress_level = mean(`Stress Level`)
  )
View(Gender_base)
# FILTERING
# Keep only respondents whose stress level is strictly greater than 5.
# dplyr::filter() is used instead of base `[` subsetting: with `[`, a missing
# `Stress Level` makes the condition NA and yields an all-NA row in the
# result, whereas filter() drops rows whose condition is not TRUE.
high_stress_data <- sleep_health_and_lifestyle_dataset %>%
  dplyr::filter(`Stress Level` > 5)
print(high_stress_data)
## # A tibble: 166 × 13
## `Person ID` Gender Age Occupation `Sleep Duration` `Quality of Sleep`
## <dbl> <chr> <dbl> <chr> <dbl> <dbl>
## 1 1 Male 27 Software Engine… 6.1 6
## 2 2 Male 28 Doctor 6.2 6
## 3 3 Male 28 Doctor 6.2 6
## 4 4 Male 28 Sales Represent… 5.9 4
## 5 5 Male 28 Sales Represent… 5.9 4
## 6 6 Male 28 Software Engine… 5.9 4
## 7 7 Male 29 Teacher 6.3 6
## 8 8 Male 29 Doctor 7.8 7
## 9 9 Male 29 Doctor 7.8 7
## 10 10 Male 29 Doctor 7.8 7
## # ℹ 156 more rows
## # ℹ 7 more variables: `Physical Activity Level` <dbl>, `Stress Level` <dbl>,
## # `BMI Category` <chr>, `Blood Pressure` <chr>, `Heart Rate` <dbl>,
## # `Daily Steps` <dbl>, `Sleep Disorder` <chr>
# DATA CLEANING
# REMOVING DUPLICATES
# Keep only the first occurrence of each fully identical row.
# NOTE(review): every column, including `Person ID`, takes part in the
# comparison; the printed row count is 374 both before and after this step,
# so it appears to remove nothing -- confirm whether the ID should be excluded.
sleep_health_and_lifestyle_dataset <- sleep_health_and_lifestyle_dataset[
  !duplicated(sleep_health_and_lifestyle_dataset), ,
  drop = FALSE
]
print(sleep_health_and_lifestyle_dataset)
## # A tibble: 374 × 13
## `Person ID` Gender Age Occupation `Sleep Duration` `Quality of Sleep`
## <dbl> <chr> <dbl> <chr> <dbl> <dbl>
## 1 1 Male 27 Software Engine… 6.1 6
## 2 2 Male 28 Doctor 6.2 6
## 3 3 Male 28 Doctor 6.2 6
## 4 4 Male 28 Sales Represent… 5.9 4
## 5 5 Male 28 Sales Represent… 5.9 4
## 6 6 Male 28 Software Engine… 5.9 4
## 7 7 Male 29 Teacher 6.3 6
## 8 8 Male 29 Doctor 7.8 7
## 9 9 Male 29 Doctor 7.8 7
## 10 10 Male 29 Doctor 7.8 7
## # ℹ 364 more rows
## # ℹ 7 more variables: `Physical Activity Level` <dbl>, `Stress Level` <dbl>,
## # `BMI Category` <chr>, `Blood Pressure` <chr>, `Heart Rate` <dbl>,
## # `Daily Steps` <dbl>, `Sleep Disorder` <chr>
# DATA TRANSFORMING
# Scaling numerical variables
# scale() returns a one-column matrix (carrying centring/scaling attributes),
# so assigning its result directly creates a matrix column that summary()
# reports as `Age_scaled.V1`. as.numeric() drops the matrix structure and
# stores the standardised ages as a plain numeric vector.
sleep_health_and_lifestyle_dataset$Age_scaled <- as.numeric(
  scale(sleep_health_and_lifestyle_dataset$Age)
)
# Encoding categorical variables
# Convert Gender from character to a factor.
sleep_health_and_lifestyle_dataset$Gender <- as.factor(
  sleep_health_and_lifestyle_dataset$Gender
)
#Data exploration and visualization
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(ggplot2)
library(tidyverse)
# Inspect the transformed dataset in the viewer, then print per-column
# summary statistics.
View(sleep_health_and_lifestyle_dataset)
summary(object = sleep_health_and_lifestyle_dataset)
## Person ID Gender Age Occupation
## Min. : 1.00 Female:185 Min. :27.00 Length:374
## 1st Qu.: 94.25 Male :189 1st Qu.:35.25 Class :character
## Median :187.50 Median :43.00 Mode :character
## Mean :187.50 Mean :42.18
## 3rd Qu.:280.75 3rd Qu.:50.00
## Max. :374.00 Max. :59.00
## Sleep Duration Quality of Sleep Physical Activity Level Stress Level
## Min. :5.800 Min. :4.000 Min. :30.00 Min. :3.000
## 1st Qu.:6.400 1st Qu.:6.000 1st Qu.:45.00 1st Qu.:4.000
## Median :7.200 Median :7.000 Median :60.00 Median :5.000
## Mean :7.132 Mean :7.313 Mean :59.17 Mean :5.385
## 3rd Qu.:7.800 3rd Qu.:8.000 3rd Qu.:75.00 3rd Qu.:7.000
## Max. :8.500 Max. :9.000 Max. :90.00 Max. :8.000
## BMI Category Blood Pressure Heart Rate Daily Steps
## Length:374 Length:374 Min. :65.00 Min. : 3000
## Class :character Class :character 1st Qu.:68.00 1st Qu.: 5600
## Mode :character Mode :character Median :70.00 Median : 7000
## Mean :70.17 Mean : 6817
## 3rd Qu.:72.00 3rd Qu.: 8000
## Max. :86.00 Max. :10000
## Sleep Disorder Age_scaled.V1
## Length:374 Min. :-1.7507504
## Class :character 1st Qu.:-0.7995371
## Mode :character Median : 0.0940269
## Mean : 0.0000000
## 3rd Qu.: 0.9011170
## Max. : 1.9388042
# CORRELATION Analysis
# Pairwise correlations between sleep duration, physical activity level and
# quality of sleep (cor() defaults, i.e. Pearson).
cor_matrix <- sleep_health_and_lifestyle_dataset %>%
  dplyr::select(
    `Sleep Duration`,
    `Physical Activity Level`,
    `Quality of Sleep`
  ) %>%
  cor()
print(cor_matrix)
## Sleep Duration Physical Activity Level Quality of Sleep
## Sleep Duration 1.0000000 0.2123603 0.8832130
## Physical Activity Level 0.2123603 1.0000000 0.1928965
## Quality of Sleep 0.8832130 0.1928965 1.0000000
# Scatter plot
# Sleep duration on the x axis against age on the y axis, drawn as large
# purple points.
ggplot(
  data = sleep_health_and_lifestyle_dataset,
  mapping = aes(x = `Sleep Duration`, y = `Age`)
) +
  geom_point(colour = "purple", size = 4)

# Histogram
# Distribution of heart rate; three fill colours are supplied for the bars.
hist(
  x = sleep_health_and_lifestyle_dataset$`Heart Rate`,
  col = c("blue", "purple", "orange")
)

# Calculate the mean stress level for each combination of factors
# (physical activity level, quality of sleep and sleep duration).
# `.groups = "drop"` returns an ungrouped tibble: without it summarise()
# keeps the first two grouping columns, emits a message about the grouped
# output, and any later verb applied to the result would silently run
# per group.
mean_Stress_Level <- sleep_health_and_lifestyle_dataset %>%
  group_by(`Physical Activity Level`, `Quality of Sleep`, `Sleep Duration`) %>%
  summarise(Mean_Stress_Level = mean(`Stress Level`), .groups = "drop")
mean_Stress_Level
## # A tibble: 61 × 4
## Physical Activity Lev…¹ `Quality of Sleep` `Sleep Duration` Mean_Stress_Level
## <dbl> <dbl> <dbl> <dbl>
## 1 30 4 5.9 8
## 2 30 6 6 8
## 3 30 6 6.1 8
## 4 30 6 6.2 8
## 5 30 7 6.8 6
## 6 30 9 8.3 3
## 7 30 9 8.4 3
## 8 30 9 8.5 3
## 9 32 4 5.8 8
## 10 35 5 6.4 7
## # ℹ 51 more rows
## # ℹ abbreviated name: ¹`Physical Activity Level`