The Sleep Health and Lifestyle Dataset captures details about sleep patterns, lifestyle habits, and related health indicators. I’m basically trying to practice my R skills with this data set.
Purpose of the project:
The purpose of this analysis is to identify patterns and relationships in the variables.
To explore the relationship between sleep quality and lifestyle, demographics and health indicators in adult participants.
We have the following variables:
Person ID: Unique identifier for each individual.
Gender: Gender of the person.
Age: Age in years.
Occupation: Job category.
Sleep Duration: Number of hours slept per day.
Quantity of Sleep: Sleep quality rating from 1 (poor) to 10 (high).
Physical Activity Level: Daily physical activity duration, minutes per day.
Stress Level: Stress rating from 1 (low) to 10 (high).
BMI Category: Body Mass Index classification (Underweight, Normal, Overweight).
Blood Pressure: Measured in mmHg, formatted as systolic/diastolic.
Heart Rate: Resting heart rate in beats per minute.
Daily Steps: Number of steps taken daily.
Sleep Disorder: Presence of sleep disorder: None, Insomnia, or Sleep Apnea.
#library needed for data wrangling
library("tidyverse")
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#read in the csv file using the read.csv function
shl_dframe <- read.csv("C:/Users/MODEL24/Downloads/Sleep_health_and_lifestyle_dataset.csv")
#see the head of the data
head(shl_dframe)
## Person.ID Gender Age Occupation Sleep.Duration Quality.of.Sleep
## 1 1 Male 27 Software Engineer 6.1 6
## 2 2 Male 28 Doctor 6.2 6
## 3 3 Male 28 Doctor 6.2 6
## 4 4 Male 28 Sales Representative 5.9 4
## 5 5 Male 28 Sales Representative 5.9 4
## 6 6 Male 28 Software Engineer 5.9 4
## Physical.Activity.Level Stress.Level BMI.Category Blood.Pressure Heart.Rate
## 1 42 6 Overweight 126/83 77
## 2 60 8 Normal 125/80 75
## 3 60 8 Normal 125/80 75
## 4 30 8 Obese 140/90 85
## 5 30 8 Obese 140/90 85
## 6 30 8 Obese 140/90 85
## Daily.Steps Sleep.Disorder
## 1 4200 None
## 2 10000 None
## 3 10000 None
## 4 3000 Sleep Apnea
## 5 3000 Sleep Apnea
## 6 3000 Insomnia
#to see the head of the data
head(shl_dframe)
## Person.ID Gender Age Occupation Sleep.Duration Quality.of.Sleep
## 1 1 Male 27 Software Engineer 6.1 6
## 2 2 Male 28 Doctor 6.2 6
## 3 3 Male 28 Doctor 6.2 6
## 4 4 Male 28 Sales Representative 5.9 4
## 5 5 Male 28 Sales Representative 5.9 4
## 6 6 Male 28 Software Engineer 5.9 4
## Physical.Activity.Level Stress.Level BMI.Category Blood.Pressure Heart.Rate
## 1 42 6 Overweight 126/83 77
## 2 60 8 Normal 125/80 75
## 3 60 8 Normal 125/80 75
## 4 30 8 Obese 140/90 85
## 5 30 8 Obese 140/90 85
## 6 30 8 Obese 140/90 85
## Daily.Steps Sleep.Disorder
## 1 4200 None
## 2 10000 None
## 3 10000 None
## 4 3000 Sleep Apnea
## 5 3000 Sleep Apnea
## 6 3000 Insomnia
#to check the names of columns in the data
colnames(shl_dframe)
## [1] "Person.ID" "Gender"
## [3] "Age" "Occupation"
## [5] "Sleep.Duration" "Quality.of.Sleep"
## [7] "Physical.Activity.Level" "Stress.Level"
## [9] "BMI.Category" "Blood.Pressure"
## [11] "Heart.Rate" "Daily.Steps"
## [13] "Sleep.Disorder"
#to see the total number of NA across the data frame; if it returns 0, it means we have no NA
sum(is.na(shl_dframe))
## [1] 0
#to have a glipmse of the columns
glimpse(shl_dframe)
## Rows: 374
## Columns: 13
## $ Person.ID <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,…
## $ Gender <chr> "Male", "Male", "Male", "Male", "Male", "Male"…
## $ Age <int> 27, 28, 28, 28, 28, 28, 29, 29, 29, 29, 29, 29…
## $ Occupation <chr> "Software Engineer", "Doctor", "Doctor", "Sale…
## $ Sleep.Duration <dbl> 6.1, 6.2, 6.2, 5.9, 5.9, 5.9, 6.3, 7.8, 7.8, 7…
## $ Quality.of.Sleep <int> 6, 6, 6, 4, 4, 4, 6, 7, 7, 7, 6, 7, 6, 6, 6, 6…
## $ Physical.Activity.Level <int> 42, 60, 60, 30, 30, 30, 40, 75, 75, 75, 30, 75…
## $ Stress.Level <int> 6, 8, 8, 8, 8, 8, 7, 6, 6, 6, 8, 6, 8, 8, 8, 8…
## $ BMI.Category <chr> "Overweight", "Normal", "Normal", "Obese", "Ob…
## $ Blood.Pressure <chr> "126/83", "125/80", "125/80", "140/90", "140/9…
## $ Heart.Rate <int> 77, 75, 75, 85, 85, 85, 82, 70, 70, 70, 70, 70…
## $ Daily.Steps <int> 4200, 10000, 10000, 3000, 3000, 3000, 3500, 80…
## $ Sleep.Disorder <chr> "None", "None", "None", "Sleep Apnea", "Sleep …
#we observe from the glimpse that the data types should be modified
pre_process_shl_dframe <- function(dataframe) {
#we make the gender, BMI, and sleep disorder factor columns
dataframe$Gender <- as.factor(dataframe$Gender)
dataframe$BMI.Category <- as.factor(dataframe$BMI.Category)
dataframe$Sleep.Disorder <- as.factor(dataframe$Sleep.Disorder)
return(dataframe)
}
#we apply the function on the data frame and store it as a new dataframe
new_shl_dframe <- pre_process_shl_dframe(shl_dframe)
#to check the class of each columns
sapply(new_shl_dframe, class)
## Person.ID Gender Age
## "integer" "factor" "integer"
## Occupation Sleep.Duration Quality.of.Sleep
## "character" "numeric" "integer"
## Physical.Activity.Level Stress.Level BMI.Category
## "integer" "integer" "factor"
## Blood.Pressure Heart.Rate Daily.Steps
## "character" "integer" "integer"
## Sleep.Disorder
## "factor"
#to see key statistics based on the type of data
summary(new_shl_dframe)
## Person.ID Gender Age Occupation
## Min. : 1.00 Female:185 Min. :27.00 Length:374
## 1st Qu.: 94.25 Male :189 1st Qu.:35.25 Class :character
## Median :187.50 Median :43.00 Mode :character
## Mean :187.50 Mean :42.18
## 3rd Qu.:280.75 3rd Qu.:50.00
## Max. :374.00 Max. :59.00
## Sleep.Duration Quality.of.Sleep Physical.Activity.Level Stress.Level
## Min. :5.800 Min. :4.000 Min. :30.00 Min. :3.000
## 1st Qu.:6.400 1st Qu.:6.000 1st Qu.:45.00 1st Qu.:4.000
## Median :7.200 Median :7.000 Median :60.00 Median :5.000
## Mean :7.132 Mean :7.313 Mean :59.17 Mean :5.385
## 3rd Qu.:7.800 3rd Qu.:8.000 3rd Qu.:75.00 3rd Qu.:7.000
## Max. :8.500 Max. :9.000 Max. :90.00 Max. :8.000
## BMI.Category Blood.Pressure Heart.Rate Daily.Steps
## Normal :195 Length:374 Min. :65.00 Min. : 3000
## Normal Weight: 21 Class :character 1st Qu.:68.00 1st Qu.: 5600
## Obese : 10 Mode :character Median :70.00 Median : 7000
## Overweight :148 Mean :70.17 Mean : 6817
## 3rd Qu.:72.00 3rd Qu.: 8000
## Max. :86.00 Max. :10000
## Sleep.Disorder
## Insomnia : 77
## None :219
## Sleep Apnea: 78
##
##
##
#to see the specific characters in the column
unique(new_shl_dframe$Occupation)
## [1] "Software Engineer" "Doctor" "Sales Representative"
## [4] "Teacher" "Nurse" "Engineer"
## [7] "Accountant" "Scientist" "Lawyer"
## [10] "Salesperson" "Manager"
#to check for duplicates
all(duplicated(new_shl_dframe)) #tells us all of it is FALSE
## [1] FALSE
#the first column, person ID, is not necessary for analysis, so we remove it
new_shl_dframe <- new_shl_dframe[, -1]
#to extract the gender, age and occupation column and filter the selected data based on a condition, say age < 50
new_shl_dframe %>%
select(Gender, Age, Occupation) %>%
filter(Age < 50) %>%
nrow() #this tells the number of observations in the filtered data set.
## [1] 278
#to see information about people that are obese
new_shl_dframe %>%
filter(BMI.Category == "Obese")
## Gender Age Occupation Sleep.Duration Quality.of.Sleep
## 1 Male 28 Sales Representative 5.9 4
## 2 Male 28 Sales Representative 5.9 4
## 3 Male 28 Software Engineer 5.9 4
## 4 Male 29 Teacher 6.3 6
## 5 Male 35 Lawyer 7.4 7
## 6 Female 38 Lawyer 7.4 7
## 7 Male 48 Doctor 7.3 7
## 8 Male 48 Doctor 7.3 7
## 9 Male 49 Doctor 8.1 9
## 10 Male 49 Doctor 8.1 9
## Physical.Activity.Level Stress.Level BMI.Category Blood.Pressure Heart.Rate
## 1 30 8 Obese 140/90 85
## 2 30 8 Obese 140/90 85
## 3 30 8 Obese 140/90 85
## 4 40 7 Obese 140/90 82
## 5 60 5 Obese 135/88 84
## 6 60 5 Obese 135/88 84
## 7 65 5 Obese 142/92 83
## 8 65 5 Obese 142/92 83
## 9 85 3 Obese 139/91 86
## 10 85 3 Obese 139/91 86
## Daily.Steps Sleep.Disorder
## 1 3000 Sleep Apnea
## 2 3000 Sleep Apnea
## 3 3000 Insomnia
## 4 3500 Insomnia
## 5 3300 Sleep Apnea
## 6 3300 Sleep Apnea
## 7 3500 Insomnia
## 8 3500 Insomnia
## 9 3700 Sleep Apnea
## 10 3700 Sleep Apnea
#to check those that are obese, are doctors, and have insomnia
new_shl_dframe %>%
filter(BMI.Category == "Obese" & Occupation == "Doctor" & Sleep.Disorder == "Insomnia")
## Gender Age Occupation Sleep.Duration Quality.of.Sleep Physical.Activity.Level
## 1 Male 48 Doctor 7.3 7 65
## 2 Male 48 Doctor 7.3 7 65
## Stress.Level BMI.Category Blood.Pressure Heart.Rate Daily.Steps
## 1 5 Obese 142/92 83 3500
## 2 5 Obese 142/92 83 3500
## Sleep.Disorder
## 1 Insomnia
## 2 Insomnia
To see the average sleep duration and stress level of each occupation in the data set:
new_shl_dframe %>%
group_by(Occupation) %>% #we group the whole data set by the occupations;
summarise(average_sleep_duration = mean(Sleep.Duration), #we find the average sleep duration of each occupation;
average_stress_level = mean(Stress.Level), #we find the average stress level of each occupation
total_employees = n() #we find the total number of people in each occupation
) %>%
arrange(desc(average_stress_level)) #arranges the table in descending order of average stress level
## # A tibble: 11 × 4
## Occupation average_sleep_duration average_stress_level total_employees
## <chr> <dbl> <dbl> <int>
## 1 Sales Representa… 5.9 8 2
## 2 Salesperson 6.40 7 32
## 3 Scientist 6 7 4
## 4 Doctor 6.97 6.73 71
## 5 Software Engineer 6.75 6 4
## 6 Nurse 7.06 5.55 73
## 7 Lawyer 7.41 5.06 47
## 8 Manager 6.9 5 1
## 9 Accountant 7.11 4.59 37
## 10 Teacher 6.69 4.53 40
## 11 Engineer 7.99 3.89 63
To see how the variables differs in both genders, for those with sleep disorders
#comparing males and females
new_shl_dframe %>%
filter(Sleep.Disorder != "None") %>% #this returns people with sleep disorder
group_by(Gender) %>% #this groups the whole data based on gender
summarise(Physical_Activity_Level = mean(Physical.Activity.Level),
Stress_Level = mean(Stress.Level),
Heart_Rate = mean(Heart.Rate)) %>%
mutate(activity_per_stress_ratio = Physical_Activity_Level/Stress_Level) #mutate adds a column to the data set
## # A tibble: 2 × 5
## Gender Physical_Activity_Level Stress_Level Heart_Rate activity_per_stress_r…¹
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 Female 66.3 5.36 70.6 12.4
## 2 Male 50.2 6.58 74.1 7.63
## # ℹ abbreviated name: ¹activity_per_stress_ratio
To have a view of people in the high risk group; someone over 40 years old and has a BMI over 25
new_shl_dframe %>%
mutate(is_high_risk = ifelse(Age > 40 & #we create a column that tells if they're high risk or not
(BMI.Category == c("Overweight","Obese")), TRUE, FALSE)) %>%
group_by(Sleep.Disorder, is_high_risk) %>% #we then group by the sleep disorder and the new column
summarise(count = n()) %>% #we use this to see how many people are in each category
mutate(percentage = count/sum(count) * 100) #to see the percentage of each sleep disorder category
## `summarise()` has grouped output by 'Sleep.Disorder'. You can override using
## the `.groups` argument.
## # A tibble: 6 × 4
## # Groups: Sleep.Disorder [3]
## Sleep.Disorder is_high_risk count percentage
## <fct> <lgl> <int> <dbl>
## 1 Insomnia FALSE 45 58.4
## 2 Insomnia TRUE 32 41.6
## 3 None FALSE 213 97.3
## 4 None TRUE 6 2.74
## 5 Sleep Apnea FALSE 48 61.5
## 6 Sleep Apnea TRUE 30 38.5
The above code tells us the number of people in each sleep disorder and if they’re high risk Sleep disorder analysis: to see the information of people diagnosed with insomnia
new_shl_dframe %>%
filter(Sleep.Disorder %in% "Insomnia") %>% #this extracts the individuals with insomnia
select(Gender, Age, Occupation, BMI.Category, Physical.Activity.Level, Heart.Rate, Daily.Steps, Sleep.Duration) %>%
arrange(Sleep.Duration) %>%
head(, 15) #this gives the first 15 observations
## Gender Age Occupation BMI.Category Physical.Activity.Level Heart.Rate
## 1 Male 28 Software Engineer Obese 30 85
## 2 Male 33 Doctor Normal 30 72
## 3 Female 50 Nurse Overweight 90 75
## 4 Male 29 Teacher Obese 40 82
## 5 Male 43 Salesperson Overweight 45 72
## 6 Male 44 Salesperson Overweight 45 72
## Daily.Steps Sleep.Duration
## 1 3000 5.9
## 2 5000 6.0
## 3 10000 6.1
## 4 3500 6.3
## 5 6000 6.3
## 6 6000 6.3
The above code gives us information about the lifestyle and physical health of people diagnosed with insomnia; the data is arranged in ascending order of the sleep duration
Creating statistical table
new_shl_dframe %>%
group_by(Occupation) %>%
summarise(freq = n()) %>%
pivot_wider(names_from = Occupation, values_from = freq)
## # A tibble: 1 × 11
## Accountant Doctor Engineer Lawyer Manager Nurse `Sales Representative`
## <int> <int> <int> <int> <int> <int> <int>
## 1 37 71 63 47 1 73 2
## # ℹ 4 more variables: Salesperson <int>, Scientist <int>,
## # `Software Engineer` <int>, Teacher <int>
This gives a frequency table showing the number of individuals in each occupation.
To have the systolic and diastolic values in separate columns
new_shl_dframe %>%
separate(Blood.Pressure, into=c("Systolic", "Diastolic "), sep="/") %>%
head(,10)
## Gender Age Occupation Sleep.Duration Quality.of.Sleep
## 1 Male 27 Software Engineer 6.1 6
## 2 Male 28 Doctor 6.2 6
## 3 Male 28 Doctor 6.2 6
## 4 Male 28 Sales Representative 5.9 4
## 5 Male 28 Sales Representative 5.9 4
## 6 Male 28 Software Engineer 5.9 4
## Physical.Activity.Level Stress.Level BMI.Category Systolic Diastolic
## 1 42 6 Overweight 126 83
## 2 60 8 Normal 125 80
## 3 60 8 Normal 125 80
## 4 30 8 Obese 140 90
## 5 30 8 Obese 140 90
## 6 30 8 Obese 140 90
## Heart.Rate Daily.Steps Sleep.Disorder
## 1 77 4200 None
## 2 75 10000 None
## 3 75 10000 None
## 4 85 3000 Sleep Apnea
## 5 85 3000 Sleep Apnea
## 6 85 3000 Insomnia
Correlation Analysis
#to check for correlation between numeric variables and identify key drivers of sleep quality.
correlation_matrix <- new_shl_dframe %>%
select(Age, Sleep.Duration, Quality.of.Sleep, #select the numeric variables
Physical.Activity.Level, Stress.Level,
Heart.Rate, Daily.Steps) %>%
cor(method="pearson")
correlation_matrix
## Age Sleep.Duration Quality.of.Sleep
## Age 1.0000000 0.34470936 0.47373388
## Sleep.Duration 0.3447094 1.00000000 0.88321300
## Quality.of.Sleep 0.4737339 0.88321300 1.00000000
## Physical.Activity.Level 0.1789927 0.21236031 0.19289645
## Stress.Level -0.4223445 -0.81102303 -0.89875203
## Heart.Rate -0.2256062 -0.51645489 -0.65986473
## Daily.Steps 0.0579734 -0.03953254 0.01679141
## Physical.Activity.Level Stress.Level Heart.Rate
## Age 0.17899272 -0.42234448 -0.22560619
## Sleep.Duration 0.21236031 -0.81102303 -0.51645489
## Quality.of.Sleep 0.19289645 -0.89875203 -0.65986473
## Physical.Activity.Level 1.00000000 -0.03413446 0.13697098
## Stress.Level -0.03413446 1.00000000 0.67002646
## Heart.Rate 0.13697098 0.67002646 1.00000000
## Daily.Steps 0.77272305 0.18682895 -0.03030858
## Daily.Steps
## Age 0.05797340
## Sleep.Duration -0.03953254
## Quality.of.Sleep 0.01679141
## Physical.Activity.Level 0.77272305
## Stress.Level 0.18682895
## Heart.Rate -0.03030858
## Daily.Steps 1.00000000
ANOVA:to check how the categorical variables relate to the numeric variables
#we want to see which occupation is most affected by poor sleep
#using anova, we check if sleep quality differs by occupation; is it engineers that have the poorest quality of sleep
anova_occupation <- aov(Quality.of.Sleep ~ Occupation, data = new_shl_dframe)
#we check the summary of the anova to see the p-value
summary(anova_occupation) #this gives the F value and p-value too
## Df Sum Sq Mean Sq F value Pr(>F)
## Occupation 10 241.9 24.191 30.02 <2e-16 ***
## Residuals 363 292.5 0.806
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The high F-value shows there is a strong difference in the average sleep quality between the occupations; the low p-value tells us the difference is real and not a coincidence
#to visualize the variables with strong correlation
plot( new_shl_dframe$Stress.Level, new_shl_dframe$Quality.of.Sleep, main = "Quality of Sleep vs Stress Level")
The scatter plot shows a downward trend which confirms that the more the
stress, the less the quality
plot( new_shl_dframe$Heart.Rate, new_shl_dframe$Quality.of.Sleep, main = "Heart Rate vs Stress Level")
The plot shows that a high heart rate leads to a low quality of
sleep
#To create a bar chart for the categorical variables in the data set:
new_shl_dframe %>%
ggplot(aes(x=" ", fill = Sleep.Disorder)) +
geom_bar(position = "dodge") +
theme_minimal() +
labs(title = "Count of Individuals by Sleep Disorder Type", y="Number of People")
#each category in the column is represented by a different color
#this bar chart shows the count of the sleep disorder in the data set
new_shl_dframe %>%
ggplot(aes(x=" ", fill = Gender)) +
geom_bar(position = "dodge") +
theme_bw()
#this bar chart shows the count of males and females
new_shl_dframe %>%
ggplot(aes(x=" ", fill = BMI.Category)) + #plots each category with different colors
geom_bar(position = "dodge") + #plots the usual bar chart
labs(title = "Count of Each BMI Category", x="BMI Category",
y="Count", color="BMI Category") + #adds label to the plot
theme(plot.title = element_text(size = 15, face = "bold",#this alters the look of the plot title
colour="blue",hjust=0.5)) #hjust adjust the position of the text
## Ignoring unknown labels:
## • colour : "BMI Category"
#this bar chart shows the count of the different BMI category in the data set;
#each color represents each category
#we already identified heart rate and stress as factors that affect the quality of sleep,
#we now want to see how it varies from one disorder to the other
heartrate_by_disorder <- new_shl_dframe %>%
group_by(Sleep.Disorder) %>%
summarise(Avg_Heart_Rate = mean(Heart.Rate),
Avg_Stress = mean(Stress.Level),
Count = n() )
#To see the line of best fit of the plot of stress against sleep quality
new_shl_dframe %>%
ggplot(aes(x = Stress.Level, y = Quality.of.Sleep)) +
geom_point(alpha = 0.6, color = "blue") + # Adds the data points; alpha reduces color saturation
geom_smooth(method = "lm", color = "red") + # Adds the line of best fit (lm = linear model)
labs(title = "Relationship between Stress Level and Sleep Quality",
x = "Stress Level (Score)",
y = "Quality of Sleep (Score)") +
theme_minimal() # A cleaner visual style
## `geom_smooth()` using formula = 'y ~ x'