knitr::opts_chunk$set(echo = TRUE)
#tidyr, dplyr, and tidyverse for data manipulation and cleaning. ggplot2 and ggpmisc for data visualization. forcats for reordering factor levels.
library(tidyr)
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4 ✔ tibble 3.2.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(forcats)
# Load the sampled sleep survey from a CSV in the local Downloads folder.
# NOTE(review): the path is machine-specific; adjust it before running on
# another computer.
sleepData <- read.csv(
  file = "~/Downloads/Sleep_Data_Sampled - Sleep_Data_Sampled.csv"
)

# Print the object's class to confirm the file was read into a data frame.
class(sleepData)
## [1] "data.frame"
# Inspect which BMI categories occur in the raw data. factor() turns the
# column into a categorical variable and levels() lists its distinct values.
sleepData$BMI.Category %>%
  factor() %>%
  levels()
## [1] "Normal" "Normal Weight" "Obese" "Overweight"

# Clean the data in a single pipeline:
#   1. Recode "Normal Weight" as "Normal" so both spellings form one category
#      and plot labels stay short.
#   2. Drop every row that has a missing value (NA) in any column, so that
#      summaries and plots are based on complete observations only.
#   3. Drop exact duplicate rows so each row is a unique observation.
# NOTE(review): distinct() compares all columns, including Person.ID; if
# Person.ID is unique per row this step cannot remove anything - confirm
# whether duplicates should be detected without the ID column.
sleepData <- sleepData %>%
  mutate(
    BMI.Category = if_else(
      BMI.Category == "Normal Weight",
      "Normal",
      BMI.Category
    )
  ) %>%
  drop_na() %>%
  distinct()

# Quick per-column overview: min / quartiles / mean / max for numeric columns,
# and length / class / mode for character columns. Useful for spotting
# outliers, data-entry errors, or unexpected ranges.
summary(sleepData)
## Person.ID Gender Age Occupation
## Min. : 1 Length:15000 Min. :27.00 Length:15000
## 1st Qu.: 3751 Class :character 1st Qu.:40.00 Class :character
## Median : 7500 Mode :character Median :44.00 Mode :character
## Mean : 7500 Mean :44.13
## 3rd Qu.:11250 3rd Qu.:48.00
## Max. :15000 Max. :59.00
## Sleep.Duration Quality.of.Sleep Physical.Activity.Level Stress.Level
## Min. :5.800 Min. :4.000 Min. :30.00 Min. :3.000
## 1st Qu.:6.500 1st Qu.:6.000 1st Qu.:45.00 1st Qu.:4.000
## Median :7.000 Median :7.000 Median :60.00 Median :6.000
## Mean :6.997 Mean :7.131 Mean :59.92 Mean :5.655
## 3rd Qu.:7.450 3rd Qu.:8.000 3rd Qu.:75.00 3rd Qu.:6.000
## Max. :8.500 Max. :9.000 Max. :90.00 Max. :8.000
## BMI.Category Blood.Pressure Heart.Rate Daily.Steps
## Length:15000 Length:15000 Min. :65.00 Min. : 3000
## Class :character Class :character 1st Qu.:68.00 1st Qu.: 6000
## Mode :character Mode :character Median :70.00 Median : 6500
## Mean :70.86 Mean : 6795
## 3rd Qu.:72.00 3rd Qu.: 7600
## Max. :86.00 Max. :10000
## Sleep.Disorder
## Length:15000
## Class :character
## Mode :character
##
##
##
# Stacked bar chart of sleep-disorder share within each BMI category.
# position = "fill" rescales every bar to a total of 1, and scales::percent
# formats that proportion axis as percentages.
sleepData %>%
  ggplot(mapping = aes(x = BMI.Category, fill = Sleep.Disorder)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    title = "BMI Category and Sleep Disorders (Percentage)",
    x = "BMI Category",
    y = "Percentage",
    fill = "Sleep Disorder"
  ) +
  theme_classic() +
  theme(legend.position = "bottom")
Figure 1: BMI Category and Sleep Disorder
library(ggplot2)
library(ggpmisc)
# 2-D binned heatmap of sleep duration against sleep quality, with a linear
# trend line (and its confidence band) plus the fitted equation and R^2.
ggplot(sleepData, aes(x = Sleep.Duration, y = Quality.of.Sleep)) +
  # Bin the observations on a 10 x 10 grid; fill encodes the count per bin.
  geom_bin2d(bins = 10) +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  # Linear fit. The formula is spelled out so it visibly matches the one
  # passed to stat_poly_eq() below. (The original call contained a stray
  # empty argument: `se = TRUE, ,linewidth = 1`.)
  geom_smooth(
    method = "lm",
    formula = y ~ x,
    color = "red",
    se = TRUE,
    linewidth = 1
  ) +
  # Annotate the plot with the regression equation and R^2.
  # stat_poly_eq() places labels in normalised plot coordinates (0..1) or by
  # keyword, not in data units, so the label is anchored with "left"/"top"
  # instead of the out-of-range value 10.
  stat_poly_eq(
    aes(label = paste(after_stat(eq.label), after_stat(rr.label), sep = "~~~")),
    formula = y ~ x,
    parse = TRUE,
    label.x = "left",
    label.y = "top"
  ) +
  labs(
    title = "Heatmap with Trendline and R²: Sleep Duration vs. Quality of Sleep",
    x = "Sleep Duration (hours)",
    y = "Quality of Sleep (1-10)",
    fill = "Count"
  ) +
  theme_classic()
Figure 2: Sleep Duration vs. Quality of Sleep
library(ggplot2)
library(ggpmisc)
# 2-D binned heatmap of stress level against sleep quality, with a linear
# trend line (and its confidence band) plus the fitted equation and R^2.
ggplot(sleepData, aes(x = Stress.Level, y = Quality.of.Sleep)) +
  # Bin the observations on a 10 x 10 grid; fill encodes the count per bin.
  geom_bin2d(bins = 10) +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  # Linear fit. `linewidth` replaces `size`, which ggplot2 3.4.0 deprecated
  # for line geoms; the formula is spelled out to match stat_poly_eq() below.
  geom_smooth(
    method = "lm",
    formula = y ~ x,
    color = "red",
    se = TRUE,
    linewidth = 1
  ) +
  # Annotate the plot with the regression equation and R^2.
  # stat_poly_eq() places labels in normalised plot coordinates (0..1) or by
  # keyword, not in data units, so the label is anchored with "left"/"top"
  # instead of the out-of-range value 10.
  stat_poly_eq(
    aes(label = paste(after_stat(eq.label), after_stat(rr.label), sep = "~~~")),
    formula = y ~ x,
    parse = TRUE,
    label.x = "left",
    label.y = "top"
  ) +
  labs(
    title = "Heatmap with Trendline and R²: Stress Level vs. Quality of Sleep",
    x = "Stress Level (1-10)",
    y = "Quality of Sleep (1-10)",
    fill = "Count"
  ) +
  theme_classic()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Figure 3: Stress Level vs. Quality of Sleep
# Sleep duration per occupation, split by gender: violins show the full
# distribution, narrow boxplots the quartiles, white points the group means.
#
# Every layer uses the same position_dodge(width = 0.9) so that each gender's
# violin, box and mean marker share one x offset. With position = "dodge" and
# no width, the 0.2-wide boxes were dodged by a different amount than the
# violins and ggplot2 warned "Width not defined" for the mean points.
ggplot(sleepData, aes(x = Occupation, y = Sleep.Duration, fill = Gender)) +
  # Untrimmed, borderless violins for the distribution shape.
  geom_violin(
    trim = FALSE,
    alpha = 0.6,
    color = NA,
    position = position_dodge(width = 0.9)
  ) +
  # Narrow boxplot drawn inside each violin; outliers are hidden.
  geom_boxplot(
    width = 0.2,
    outlier.shape = NA,
    alpha = 0.9,
    position = position_dodge(width = 0.9)
  ) +
  # Mean marker per occupation and gender. The constant fill = "white"
  # overrides the fill mapping in this layer, so Gender is passed as an
  # explicit group to keep one mean (and one dodge slot) per gender.
  stat_summary(
    aes(group = Gender),
    fun = mean,
    geom = "point",
    shape = 21,
    size = 3,
    fill = "white",
    position = position_dodge(width = 0.9)
  ) +
  scale_fill_brewer(palette = "Set3") +
  labs(
    title = "Sleep Duration by Occupation and Gender",
    subtitle = "Violin + Box Plot with Mean Points, Grouped by Gender",
    x = "Occupation",
    y = "Sleep Duration (hours)"
  ) +
  theme_classic() +
  theme(
    # Rotate occupation labels so they do not overlap.
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "bottom",
    plot.title = element_text(size = 16, face = "bold"),
    plot.subtitle = element_text(size = 12, color = "gray40")
  )
## Warning: Width not defined
## ℹ Set with `position_dodge(width = ...)`
Figure 4: Sleep Duration by Occupation
library(ggplot2)
library(forcats) # Required for reordering factors
# Sleep quality per occupation, split by gender, with occupations ordered by
# their median sleep quality. Violins show the full distribution, narrow
# boxplots the quartiles, white points the group means.
#
# Every layer uses the same position_dodge(width = 0.9) so that each gender's
# violin, box and mean marker share one x offset. With position = "dodge" and
# no width, the 0.2-wide boxes were dodged by a different amount than the
# violins and ggplot2 warned "Width not defined" for the mean points.
ggplot(
  sleepData %>%
    # Reorder the occupation levels by median sleep quality.
    mutate(
      Occupation = fct_reorder(Occupation, Quality.of.Sleep, .fun = median)
    ),
  aes(x = Occupation, y = Quality.of.Sleep, fill = Gender)
) +
  # Untrimmed, borderless violins for the distribution shape.
  geom_violin(
    trim = FALSE,
    alpha = 0.6,
    color = NA,
    position = position_dodge(width = 0.9)
  ) +
  # Narrow boxplot drawn inside each violin; outliers are hidden.
  geom_boxplot(
    width = 0.2,
    outlier.shape = NA,
    alpha = 0.9,
    position = position_dodge(width = 0.9)
  ) +
  # Mean marker per occupation and gender. The constant fill = "white"
  # overrides the fill mapping in this layer, so Gender is passed as an
  # explicit group to keep one mean (and one dodge slot) per gender.
  stat_summary(
    aes(group = Gender),
    fun = mean,
    geom = "point",
    shape = 21,
    size = 3,
    fill = "white",
    position = position_dodge(width = 0.9)
  ) +
  scale_fill_brewer(palette = "Set3") +
  labs(
    title = "Sleep Quality by Occupation and Gender",
    subtitle = "Violin + Box Plot with Mean Points, Ordered by Median Sleep Quality",
    x = "Occupation",
    y = "Sleep Quality (1-10)"
  ) +
  theme_classic() +
  theme(
    # Rotate occupation labels so they do not overlap.
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(size = 16, face = "bold"),
    plot.subtitle = element_text(size = 12, color = "gray40"),
    legend.position = "bottom"
  )
## Warning: Width not defined
## ℹ Set with `position_dodge(width = ...)`
Figure 4: Sleep Quality by Occupation
library(ggplot2)
library(dplyr)
# Percentage breakdown of sleep disorders at each recorded sleep duration.
sleepData %>%
  # One row per (duration, disorder) pair with its number of observations `n`.
  count(Sleep.Duration, Sleep.Disorder) %>%
  # Within each duration, convert the counts to a percentage of that duration.
  group_by(Sleep.Duration) %>%
  mutate(Percent = n / sum(n) * 100) %>%
  ggplot(aes(x = Sleep.Duration, y = Percent, fill = Sleep.Disorder)) +
  # Bar heights are the precomputed percentages, stacked per duration.
  geom_col(position = "stack") +
  labs(
    title = "Distribution of Sleep Disorders by Sleep Duration (Percentage)",
    x = "Sleep Duration (hours)",
    y = "Percentage",
    fill = "Sleep Disorder"
  ) +
  theme_classic() +
  theme(
    legend.position = "bottom",
    # Rotate x-axis labels for readability.
    axis.text.x = element_text(angle = 45, hjust = 1)
  )
Figure 5: Sleep Duration and Sleep Disorders
library(ggplot2)
library(dplyr)
# Percentage breakdown of sleep disorders for every age in the data.
sleepData %>%
  group_by(Age, Sleep.Disorder) %>%
  # Count observations per (age, disorder); "drop_last" leaves the result
  # grouped by Age, so the percentage below is computed within each age.
  summarise(n = n(), .groups = "drop_last") %>%
  mutate(Percent = n / sum(n) * 100) %>%
  # Age is converted to a factor so each age gets its own discrete bar.
  ggplot(aes(x = factor(Age), y = Percent, fill = Sleep.Disorder)) +
  # Bar heights are the precomputed percentages, stacked per age.
  geom_col(position = "stack") +
  labs(
    title = "Age Distribution by Sleep Disorder (Percentage)",
    x = "Age",
    y = "Percentage",
    fill = "Sleep Disorder"
  ) +
  theme_classic() +
  theme(
    legend.position = "bottom",
    # Rotate x-axis labels for readability.
    axis.text.x = element_text(angle = 45, hjust = 1)
  )
Figure 6: Age vs. Sleep Disorder
library(ggplot2)
library(forcats) # Required for reordering factors
# Sleep quality per sleep-disorder group, with the groups ordered by their
# median sleep quality. Violins show the distribution, narrow boxplots the
# quartiles, and white points the group means.
sleepData %>%
  # Reorder the disorder levels by median sleep quality.
  mutate(
    Sleep.Disorder = fct_reorder(Sleep.Disorder, Quality.of.Sleep, .fun = median)
  ) %>%
  ggplot(aes(x = Sleep.Disorder, y = Quality.of.Sleep, fill = Sleep.Disorder)) +
  # Untrimmed, borderless violins for the distribution shape.
  geom_violin(trim = FALSE, alpha = 0.6, color = NA) +
  # Narrow boxplot drawn inside each violin; outliers are hidden.
  geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.9) +
  # Mean of each group as a white-filled point.
  stat_summary(
    fun = mean,
    geom = "point",
    shape = 21,
    size = 3,
    fill = "white"
  ) +
  # Fixed colour per disorder; "Healthy" gets the dark colour.
  # NOTE(review): assumes Sleep.Disorder only contains these three labels -
  # confirm against the data.
  scale_fill_manual(
    values = c(
      "Healthy" = "#2C3E50",
      "Insomnia" = "#F39C12",
      "Sleep Apnea" = "#E74C3C"
    )
  ) +
  labs(
    title = "Sleep Quality by Sleep Disorder",
    subtitle = "Violin + Box Plot with Mean Points, Ordered by Median Sleep Quality",
    x = "Sleep Disorder",
    y = "Sleep Quality (1-10)"
  ) +
  theme_classic() +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    plot.subtitle = element_text(size = 12, color = "gray40"),
    # Rotate x-axis labels for readability.
    axis.text.x = element_text(angle = 45, hjust = 1),
    # The x axis already names each disorder, so the fill legend is hidden.
    legend.position = "none"
  )
Figure 8: Sleep Quality by Sleep Disorder