knitr::opts_chunk$set(echo = TRUE)
# tidyr, dplyr, and tidyverse for data manipulation and cleaning; ggplot2 and ggpmisc for data visualization; forcats for reordering factor levels.

library(tidyr)
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.2.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(forcats)

# Load the sampled sleep dataset from a CSV file into `sleepData`.
# NOTE(review): the path points into the author's ~/Downloads folder, so the
# script only runs on a machine that has the file at exactly that location.
sleepData <- read.csv("~/Downloads/Sleep_Data_Sampled - Sleep_Data_Sampled.csv")

class(sleepData) # Confirm that the file was read into a data frame.
## [1] "data.frame"
# List the distinct BMI categories present in the data. factor() converts the
# BMI.Category column into a categorical variable and levels() returns its
# unique values, which makes inconsistent labels easy to spot before plotting.
levels(factor(sleepData$BMI.Category))
## [1] "Normal"        "Normal Weight" "Obese"         "Overweight"
# Clean the data in a single pipeline:
#   1. fold the "Normal Weight" label into "Normal" so both spellings share
#      one BMI category (and plot labels stay short);
#   2. drop every row that has an NA in any column;
#   3. drop exact duplicate rows so each row is a unique observation.
sleepData <- sleepData %>%
  mutate(
    BMI.Category = ifelse(
      BMI.Category == "Normal Weight",
      "Normal",
      BMI.Category
    )
  ) %>%
  drop_na() %>%
  distinct()

# Print a quick per-column summary of the cleaned data.
# Numeric columns report Min, 1st Quartile, Median, Mean, 3rd Quartile and Max,
# which helps spot outliers and data-entry errors.
# Character columns report only their length, class and mode (no per-category
# counts); convert a column to a factor first if category counts are needed.
summary(sleepData)
##    Person.ID        Gender               Age         Occupation       
##  Min.   :    1   Length:15000       Min.   :27.00   Length:15000      
##  1st Qu.: 3751   Class :character   1st Qu.:40.00   Class :character  
##  Median : 7500   Mode  :character   Median :44.00   Mode  :character  
##  Mean   : 7500                      Mean   :44.13                     
##  3rd Qu.:11250                      3rd Qu.:48.00                     
##  Max.   :15000                      Max.   :59.00                     
##  Sleep.Duration  Quality.of.Sleep Physical.Activity.Level  Stress.Level  
##  Min.   :5.800   Min.   :4.000    Min.   :30.00           Min.   :3.000  
##  1st Qu.:6.500   1st Qu.:6.000    1st Qu.:45.00           1st Qu.:4.000  
##  Median :7.000   Median :7.000    Median :60.00           Median :6.000  
##  Mean   :6.997   Mean   :7.131    Mean   :59.92           Mean   :5.655  
##  3rd Qu.:7.450   3rd Qu.:8.000    3rd Qu.:75.00           3rd Qu.:6.000  
##  Max.   :8.500   Max.   :9.000    Max.   :90.00           Max.   :8.000  
##  BMI.Category       Blood.Pressure       Heart.Rate     Daily.Steps   
##  Length:15000       Length:15000       Min.   :65.00   Min.   : 3000  
##  Class :character   Class :character   1st Qu.:68.00   1st Qu.: 6000  
##  Mode  :character   Mode  :character   Median :70.00   Median : 6500  
##                                        Mean   :70.86   Mean   : 6795  
##                                        3rd Qu.:72.00   3rd Qu.: 7600  
##                                        Max.   :86.00   Max.   :10000  
##  Sleep.Disorder    
##  Length:15000      
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
# Figure 1: share of each sleep-disorder status within every BMI category.
# The "fill" position rescales every bar to a total of 1, so the y axis is a
# proportion that the percent labeller prints as 0%-100%.
sleepData %>%
  ggplot(mapping = aes(x = BMI.Category, fill = Sleep.Disorder)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    title = "BMI Category and Sleep Disorders (Percentage)",
    x = "BMI Category",
    y = "Percentage",
    fill = "Sleep Disorder"
  ) +
  theme_classic() +
  theme(legend.position = "bottom")
 Figure 1: BMI Category and Sleep Disorder

Figure 1: BMI Category and Sleep Disorder

library(ggplot2)
library(ggpmisc)


# Figure 2: 2-D binned heatmap of Sleep Duration vs. Quality of Sleep with a
# linear trendline and its fitted equation and R².
ggplot(sleepData, aes(x = Sleep.Duration, y = Quality.of.Sleep)) +
  # Count observations in a 10 x 10 grid; darker tiles contain more rows.
  geom_bin2d(bins = 10) +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  # Least-squares line with its confidence band. The original call contained a
  # stray empty argument (`se = TRUE, , linewidth = 1`) that only worked because
  # it happened to bind to `mapping`; it is removed here.
  geom_smooth(method = "lm", color = "red", se = TRUE, linewidth = 1) +
  # Annotate the panel with the fitted equation and R², parsed as plotmath.
  stat_poly_eq(
    aes(label = paste(after_stat(eq.label), after_stat(rr.label), sep = "~~~")),
    formula = y ~ x,
    parse = TRUE,
    label.x = "left",
    # NOTE(review): with the default npc-based geom, ggpmisc documents numeric
    # label positions as 0..1; confirm that 10 places the label as intended.
    label.y = 10
  ) +
  labs(
    title = "Heatmap with Trendline and R²: Sleep Duration vs. Quality of Sleep",
    x = "Sleep Duration (hours)",
    y = "Quality of Sleep (1-10)",
    fill = "Count"
  ) +
  theme_classic()
 Figure 2: Sleep Duration vs. Quality of Sleep

Figure 2: Sleep Duration vs. Quality of Sleep

library(ggplot2)
library(ggpmisc)

# Figure 3: 2-D binned heatmap of Stress Level vs. Quality of Sleep with a
# linear trendline and its fitted equation and R².
ggplot(sleepData, aes(x = Stress.Level, y = Quality.of.Sleep)) +
  # Count observations in a 10 x 10 grid; darker tiles contain more rows.
  geom_bin2d(bins = 10) +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  # Line thickness is set with `linewidth`; `size` for lines has been
  # deprecated since ggplot2 3.4.0 and triggers a lifecycle warning.
  geom_smooth(method = "lm", color = "red", se = TRUE, linewidth = 1) +
  # Annotate the panel with the fitted equation and R², parsed as plotmath.
  stat_poly_eq(
    aes(label = paste(after_stat(eq.label), after_stat(rr.label), sep = "~~~")),
    formula = y ~ x,
    parse = TRUE,
    label.x = "left",
    # NOTE(review): with the default npc-based geom, ggpmisc documents numeric
    # label positions as 0..1; confirm that 10 places the label as intended.
    label.y = 10
  ) +
  labs(
    title = "Heatmap with Trendline and R²: Stress Level vs. Quality of Sleep",
    x = "Stress Level (1-10)",
    y = "Quality of Sleep (1-10)",
    fill = "Count"
  ) +
  theme_classic()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
 Figure 3: Stress level vs. Quality of Sleep

Figure 3: Stress level vs. Quality of Sleep

# Figure 4: distribution of sleep duration per occupation, split by gender.
# Each occupation shows one violin per gender with a narrow box plot and a
# white mean marker drawn inside it.
#
# All three layers share the same explicit dodge width. With the string
# shorthand position = "dodge" the dodge distance is taken from each layer's
# own width (0.9 for violins, 0.2 for the boxes, undefined for points), so the
# boxes and mean markers do not line up with their violins and ggplot2 warns
# "Width not defined".
ggplot(sleepData, aes(x = Occupation, y = Sleep.Duration, fill = Gender)) +
  # Full (untrimmed) density shape per occupation and gender.
  geom_violin(
    trim = FALSE, alpha = 0.6, color = NA,
    position = position_dodge(width = 0.9)
  ) +
  # Narrow box plot centred inside each violin; outliers hidden.
  geom_boxplot(
    width = 0.2, outlier.shape = NA, alpha = 0.9,
    position = position_dodge(width = 0.9)
  ) +
  # Group mean as a white filled circle.
  stat_summary(
    fun = mean, geom = "point", shape = 21, size = 3, fill = "white",
    position = position_dodge(width = 0.9)
  ) +
  scale_fill_brewer(palette = "Set3") +
  labs(
    title = "Sleep Duration by Occupation and Gender",
    subtitle = "Violin + Box Plot with Mean Points, Grouped by Gender",
    x = "Occupation",
    y = "Sleep Duration (hours)"
  ) +
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1), # Rotated for legibility
    legend.position = "bottom",
    plot.title = element_text(size = 16, face = "bold"),
    plot.subtitle = element_text(size = 12, color = "gray40")
  )
## Warning: Width not defined
## ℹ Set with `position_dodge(width = ...)`
 Figure 4: Sleep Duration by Occupation

Figure 4: Sleep Duration by Occupation

library(ggplot2)
library(forcats)  # Required for reordering factors

# Distribution of sleep quality per occupation, split by gender.
# Occupations are ordered along the x axis by their median sleep quality.
#
# All three layers share the same explicit dodge width. With the string
# shorthand position = "dodge" the dodge distance is taken from each layer's
# own width (0.9 for violins, 0.2 for the boxes, undefined for points), so the
# boxes and mean markers do not line up with their violins and ggplot2 warns
# "Width not defined".
ggplot(sleepData %>%
         mutate(Occupation = fct_reorder(Occupation, Quality.of.Sleep, .fun = median)),
       aes(x = Occupation, y = Quality.of.Sleep, fill = Gender)) +
  # Full (untrimmed) density shape per occupation and gender.
  geom_violin(
    trim = FALSE, alpha = 0.6, color = NA,
    position = position_dodge(width = 0.9)
  ) +
  # Narrow box plot centred inside each violin; outliers hidden.
  geom_boxplot(
    width = 0.2, outlier.shape = NA, alpha = 0.9,
    position = position_dodge(width = 0.9)
  ) +
  # Group mean as a white filled circle.
  stat_summary(
    fun = mean, geom = "point", shape = 21, size = 3, fill = "white",
    position = position_dodge(width = 0.9)
  ) +
  scale_fill_brewer(palette = "Set3") +
  labs(
    title = "Sleep Quality by Occupation and Gender",
    subtitle = "Violin + Box Plot with Mean Points, Ordered by Median Sleep Quality",
    x = "Occupation",
    y = "Sleep Quality (1-10)"
  ) +
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1), # Rotated for legibility
    plot.title = element_text(size = 16, face = "bold"),
    plot.subtitle = element_text(size = 12, color = "gray40"),
    legend.position = "bottom"
  )
## Warning: Width not defined
## ℹ Set with `position_dodge(width = ...)`
 Figure 4: Sleep Quality by Occupation

Figure 4: Sleep Quality by Occupation

library(ggplot2)
library(dplyr)

# Figure 5: for every observed sleep duration, the percentage of people with
# each sleep-disorder status, drawn as stacked bars.
sleepData %>%
  # One row per (duration, disorder) pair with its row count in `n`.
  count(Sleep.Duration, Sleep.Disorder) %>%
  # Convert counts to percentages within each sleep duration.
  mutate(Percent = n / sum(n) * 100, .by = Sleep.Duration) %>%
  ggplot(aes(x = Sleep.Duration, y = Percent, fill = Sleep.Disorder)) +
  # geom_col() plots the precomputed Percent values and stacks them by default.
  geom_col() +
  labs(
    title = "Distribution of Sleep Disorders by Sleep Duration (Percentage)",
    x = "Sleep Duration (hours)",
    y = "Percentage",
    fill = "Sleep Disorder"
  ) +
  theme_classic() +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(angle = 45, hjust = 1) # Rotated for legibility
  )
 Figure 5: Sleep duration and Sleep Disorders

Figure 5: Sleep duration and Sleep Disorders

library(ggplot2)
library(dplyr)

# Figure 6: for every age, the percentage of people with each sleep-disorder
# status, drawn as stacked bars.
sleepData %>%
  # One row per (age, disorder) pair with its row count in `n`.
  count(Age, Sleep.Disorder) %>%
  # Convert counts to percentages within each age.
  mutate(Percent = n / sum(n) * 100, .by = Age) %>%
  # factor(Age) gives every age its own discrete bar and axis label.
  ggplot(aes(x = factor(Age), y = Percent, fill = Sleep.Disorder)) +
  # geom_col() plots the precomputed Percent values and stacks them by default.
  geom_col() +
  labs(
    title = "Age Distribution by Sleep Disorder (Percentage)",
    x = "Age",
    y = "Percentage",
    fill = "Sleep Disorder"
  ) +
  theme_classic() +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(angle = 45, hjust = 1) # Rotated for legibility
  )
 Figure 6: Age vs. Sleep Disorder

Figure 6: Age vs. Sleep Disorder

library(ggplot2)
library(forcats)  # Required for reordering factors

# Distribution of sleep quality for each sleep-disorder status.
# Categories are ordered along the x axis by their median sleep quality.
sleepData %>%
  mutate(
    Sleep.Disorder = fct_reorder(Sleep.Disorder, Quality.of.Sleep, .fun = median)
  ) %>%
  ggplot(aes(x = Sleep.Disorder, y = Quality.of.Sleep, fill = Sleep.Disorder)) +
  # Full (untrimmed) density shape per category.
  geom_violin(trim = FALSE, alpha = 0.6, color = NA) +
  # Narrow box plot inside each violin; outliers hidden.
  geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.9) +
  # Category mean as a white filled circle.
  stat_summary(fun = mean, geom = "point", shape = 21, size = 3, fill = "white") +
  # Fixed colour per category, with a dark tone reserved for "Healthy".
  scale_fill_manual(
    values = c(
      "Healthy" = "#2C3E50",
      "Insomnia" = "#F39C12",
      "Sleep Apnea" = "#E74C3C"
    )
  ) +
  labs(
    title = "Sleep Quality by Sleep Disorder",
    subtitle = "Violin + Box Plot with Mean Points, Ordered by Median Sleep Quality",
    x = "Sleep Disorder",
    y = "Sleep Quality (1-10)"
  ) +
  theme_classic() +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    plot.subtitle = element_text(size = 12, color = "gray40"),
    axis.text.x = element_text(angle = 45, hjust = 1), # Rotated for legibility
    legend.position = "none" # The x axis already names each category
  )
 Figure 8: Sleep Quality by Sleep Disorder

Figure 8: Sleep Quality by Sleep Disorder