Introduction

Using the data from lock5stat.com, I will address the following questions.

  1. Is there a significant difference in the average GPA between male and female college students?
  2. Is there a significant difference in the average number of early classes between the first two class years and other class years?
  3. Do students who identify as “larks” have significantly better cognitive skills (cognition z-score) compared to “owls”?
  4. Is there a significant difference in the average number of classes missed in a semester between students who had at least one early class (EarlyClass=1) and those who didn’t (EarlyClass=0)?
  5. Is there a significant difference in the average happiness level between students with at least moderate depression and normal depression status?
  6. Is there a significant difference in average sleep quality scores between students who reported having at least one all-nighter (AllNighter=1) and those who didn’t (AllNighter=0)?
  7. Do students who abstain from alcohol use have significantly better stress scores than those who report heavy alcohol use?
  8. Is there a significant difference in the average number of drinks per week between students of different genders?
  9. Is there a significant difference in the average weekday bedtime between students with high and normal stress (Stress=high vs. Stress=normal)?
  10. Is there a significant difference in the average hours of sleep on weekends between students in their first two class years and other students?

Analysis

Here, we will explore these questions in detail.

# Read the SleepStudy data set straight from the Lock5 site and preview the
# first six rows.
college <- read.csv("https://www.lock5stat.com/datasets3e/SleepStudy.csv")
head(college)
##   Gender ClassYear LarkOwl NumEarlyClass EarlyClass  GPA ClassesMissed
## 1      0         4 Neither             0          0 3.60             0
## 2      0         4 Neither             2          1 3.24             0
## 3      0         4     Owl             0          0 2.97            12
## 4      0         1    Lark             5          1 3.76             0
## 5      0         4     Owl             0          0 3.20             4
## 6      1         4 Neither             0          0 3.50             0
##   CognitionZscore PoorSleepQuality DepressionScore AnxietyScore StressScore
## 1           -0.26                4               4            3           8
## 2            1.39                6               1            0           3
## 3            0.38               18              18           18           9
## 4            1.39                9               1            4           6
## 5            1.22                9               7           25          14
## 6           -0.04                6              14            8          28
##   DepressionStatus AnxietyStatus Stress DASScore Happiness AlcoholUse Drinks
## 1           normal        normal normal       15        28   Moderate     10
## 2           normal        normal normal        4        25   Moderate      6
## 3         moderate        severe normal       45        17      Light      3
## 4           normal        normal normal       11        32      Light      2
## 5           normal        severe normal       46        15   Moderate      4
## 6         moderate      moderate   high       50        22    Abstain      0
##   WeekdayBed WeekdayRise WeekdaySleep WeekendBed WeekendRise WeekendSleep
## 1      25.75        8.70         7.70      25.75        9.50         5.88
## 2      25.70        8.20         6.80      26.00       10.00         7.25
## 3      27.44        6.55         3.00      28.00       12.59        10.09
## 4      23.50        7.17         6.77      27.00        8.00         7.25
## 5      25.90        8.67         6.09      23.75        9.50         7.00
## 6      23.80        8.95         9.05      26.00       10.75         9.00
##   AverageSleep AllNighter
## 1         7.18          0
## 2         6.93          0
## 3         5.02          0
## 4         6.90          0
## 5         6.35          0
## 6         9.04          0

Q1) Is there a significant difference in the average GPA between male and female college students?

# Install and load necessary packages
# Only install the tidyverse when it is missing, so re-running the report does
# not reinstall the package every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Fetch the SleepStudy data set from the Lock5 site
url <- "https://www.lock5stat.com/datasets3e/SleepStudy.csv"
sleep_data <- read.csv(file = url)

# Step 1: Show which codes the raw Gender column contains
print("Unique Gender values in the original dataset:")
## [1] "Unique Gender values in the original dataset:"
print(unique(sleep_data[["Gender"]]))
## [1] 0 1
# Step 2: Keep complete Gender/GPA rows and relabel the 0/1 Gender codes
clean_data <- sleep_data |>
  # Discard rows where either Gender or GPA is missing
  filter(!is.na(Gender), !is.na(GPA)) |>
  # 0 becomes "Female", 1 becomes "Male"; any other code turns into NA
  mutate(
    Gender = case_when(
      Gender == 0 ~ "Female",
      Gender == 1 ~ "Male",
      .default = NA_character_
    )
  ) |>
  # Drop rows whose Gender code was not recognised
  filter(!is.na(Gender))

# Step 3: Confirm which labels remain after cleaning
print("Unique Gender values after cleaning:")
## [1] "Unique Gender values after cleaning:"
print(unique(clean_data[["Gender"]]))
## [1] "Female" "Male"
# Step 4: Pooled-variance (var.equal = TRUE) two-sample t-test of GPA by
# Gender. The test is skipped unless exactly two groups are present.
if (length(unique(clean_data$Gender)) != 2) {
  print("Error: Gender column does not have exactly 2 levels (Male and Female).")
} else {
  t_test_result <- t.test(
    GPA ~ Gender,
    data = clean_data,
    var.equal = TRUE
  )

  # Show the fitted test
  print("T-Test Results:")
  print(t_test_result)
}
## [1] "T-Test Results:"
## 
##  Two Sample t-test
## 
## data:  GPA by Gender
## t = 3.9962, df = 251, p-value = 8.465e-05
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
##  0.1020292 0.3003212
## sample estimates:
## mean in group Female   mean in group Male 
##             3.324901             3.123725
# Step 5: Side-by-side boxplots of GPA per gender with fixed fill colours.
# Nothing is drawn unless exactly two gender groups exist.
if (length(unique(clean_data$Gender)) != 2) {
  print("Error: Cannot create plot because Gender column does not have exactly 2 levels.")
} else {
  ggplot(data = clean_data, mapping = aes(x = Gender, y = GPA, fill = Gender)) +
    geom_boxplot() +
    scale_fill_manual(values = c("Female" = "pink", "Male" = "blue")) +
    theme_minimal() +
    labs(
      x = "Gender",
      y = "GPA",
      title = "GPA Comparison Between Male and Female Students"
    )
}

Q2) Is there a significant difference in the average number of early classes between the first two class years and other class years?

# Install and load necessary packages
# Only install the tidyverse when it is missing, so re-running the report does
# not reinstall the package every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(tidyverse)

# Load the data
url <- "https://www.lock5stat.com/datasets3e/SleepStudy.csv"
sleep_data <- read.csv(url)

# Step 1: Inspect unique values in ClassYear to understand the data
print("Unique ClassYear values in the original dataset:")
## [1] "Unique ClassYear values in the original dataset:"
print(unique(sleep_data$ClassYear))
## [1] 4 1 2 3
# Step 2: Map numeric ClassYear values to descriptive labels and build the two
# comparison groups
clean_data <- sleep_data %>%
  # Remove rows with missing ClassYear or NumEarlyClass values
  filter(!is.na(ClassYear) & !is.na(NumEarlyClass)) %>%
  # Map numeric ClassYear to descriptive labels
  mutate(ClassYear = case_when(
    ClassYear == 1 ~ "First-Year",
    ClassYear == 2 ~ "Sophomore",
    ClassYear == 3 ~ "Junior",
    ClassYear == 4 ~ "Senior",
    TRUE ~ NA_character_ # Handle unexpected values as NA
  )) %>%
  # Create a new grouping for "First Two Years" and "Other Years".
  # Both groups are listed explicitly so that an unrecognised ClassYear code
  # (NA after the mapping above) is not silently counted as "Other Years".
  mutate(ClassYearGroup = case_when(
    ClassYear %in% c("First-Year", "Sophomore") ~ "First Two Years",
    ClassYear %in% c("Junior", "Senior") ~ "Other Years",
    TRUE ~ NA_character_
  )) %>%
  # Remove rows that could not be assigned to either group
  filter(!is.na(ClassYearGroup))

# Step 3: Verify the new grouping
print("Unique ClassYearGroup values:")
## [1] "Unique ClassYearGroup values:"
print(unique(clean_data$ClassYearGroup))
## [1] "Other Years"     "First Two Years"
# Step 4: Pooled-variance (var.equal = TRUE) two-sample t-test of
# NumEarlyClass by ClassYearGroup. Skipped unless exactly two groups exist.
if (length(unique(clean_data$ClassYearGroup)) != 2) {
  print("Error: ClassYearGroup does not have exactly 2 levels.")
} else {
  t_test_result <- t.test(
    NumEarlyClass ~ ClassYearGroup,
    data = clean_data,
    var.equal = TRUE
  )

  # Show the fitted test
  print("T-Test Results:")
  print(t_test_result)
}
## [1] "T-Test Results:"
## 
##  Two Sample t-test
## 
## data:  NumEarlyClass by ClassYearGroup
## t = 4.0419, df = 251, p-value = 7.056e-05
## alternative hypothesis: true difference in means between group First Two Years and group Other Years is not equal to 0
## 95 percent confidence interval:
##  0.391789 1.136443
## sample estimates:
## mean in group First Two Years     mean in group Other Years 
##                      2.070423                      1.306306
# Step 5: Boxplots of the number of early classes per class-year group.
# Nothing is drawn unless exactly two groups exist.
if (length(unique(clean_data$ClassYearGroup)) != 2) {
  print("Error: Cannot create plot because ClassYearGroup does not have exactly 2 levels.")
} else {
  ggplot(
    data = clean_data,
    mapping = aes(x = ClassYearGroup, y = NumEarlyClass, fill = ClassYearGroup)
  ) +
    geom_boxplot() +
    scale_fill_manual(
      values = c("Other Years" = "orange", "First Two Years" = "lightblue")
    ) +
    theme_minimal() +
    labs(
      x = "Class Year Group",
      y = "Number of Early Classes",
      title = "Comparison of Early Classes Between First Two Years and Other Years"
    )
}

Q3) Do students who identify as “larks” have significantly better cognitive skills (cognition z-score) compared to “owls”?

# Install and load necessary packages
# Only install the tidyverse when it is missing, so re-running the report does
# not reinstall the package every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(tidyverse)

# Fetch the SleepStudy data set from the Lock5 site
url <- "https://www.lock5stat.com/datasets3e/SleepStudy.csv"
sleep_data <- read.csv(file = url)

# Step 1: Show which chronotype labels the raw LarkOwl column contains
print("Unique LarkOwl values in the original dataset:")
## [1] "Unique LarkOwl values in the original dataset:"
print(unique(sleep_data[["LarkOwl"]]))
## [1] "Neither" "Owl"     "Lark"
# Step 2: Keep complete rows for larks and owls only
clean_data <- sleep_data |>
  # Require both variables to be present and drop the "Neither" chronotype
  filter(
    !is.na(LarkOwl),
    !is.na(CognitionZscore),
    LarkOwl %in% c("Lark", "Owl")
  ) |>
  # Store the grouping variable as plain character strings
  mutate(LarkOwl = as.character(LarkOwl))

# Step 3: Confirm which chronotypes remain after cleaning
print("Unique LarkOwl values after cleaning:")
## [1] "Unique LarkOwl values after cleaning:"
print(unique(clean_data[["LarkOwl"]]))
## [1] "Owl"  "Lark"
# Step 4: Pooled-variance (var.equal = TRUE) two-sample t-test of
# CognitionZscore by LarkOwl. Skipped unless exactly two groups exist.
# NOTE(review): the research question is directional (larks better than
# owls), but no `alternative` is passed, so t.test() runs its default
# two-sided test - confirm whether a one-sided alternative was intended.
if (length(unique(clean_data$LarkOwl)) != 2) {
  print("Error: LarkOwl column does not have exactly 2 levels (e.g., Lark and Owl).")
} else {
  t_test_result <- t.test(
    CognitionZscore ~ LarkOwl,
    data = clean_data,
    var.equal = TRUE
  )

  # Show the fitted test
  print("T-Test Results:")
  print(t_test_result)
}
## [1] "T-Test Results:"
## 
##  Two Sample t-test
## 
## data:  CognitionZscore by LarkOwl
## t = 0.82293, df = 88, p-value = 0.4128
## alternative hypothesis: true difference in means between group Lark and group Owl is not equal to 0
## 95 percent confidence interval:
##  -0.1819703  0.4391928
## sample estimates:
## mean in group Lark  mean in group Owl 
##         0.09024390        -0.03836735
# Step 5: Boxplots of the cognition z-score for larks and owls.
# Nothing is drawn unless exactly two chronotype groups exist.
if (length(unique(clean_data$LarkOwl)) != 2) {
  print("Error: Cannot create plot because LarkOwl column does not have exactly 2 levels.")
} else {
  ggplot(
    data = clean_data,
    mapping = aes(x = LarkOwl, y = CognitionZscore, fill = LarkOwl)
  ) +
    geom_boxplot() +
    scale_fill_manual(values = c("Owl" = "lightyellow", "Lark" = "lightblue")) +
    theme_minimal() +
    labs(
      x = "Chronotype (Lark vs. Owl)",
      y = "Cognition Z-Score",
      title = "Comparison of Cognitive Skills Between Larks and Owls"
    )
}

Q4) Is there a significant difference in the average number of classes missed in a semester between students who had at least one early class (EarlyClass=1) and those who didn’t (EarlyClass=0)?

# Install and load necessary packages
# Only install the tidyverse when it is missing, so re-running the report does
# not reinstall the package every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(tidyverse)

# Fetch the SleepStudy data set from the Lock5 site
url <- "https://www.lock5stat.com/datasets3e/SleepStudy.csv"
sleep_data <- read.csv(file = url)

# Step 1: Show which codes the raw EarlyClass column contains
print("Unique EarlyClass values in the original dataset:")
## [1] "Unique EarlyClass values in the original dataset:"
print(unique(sleep_data[["EarlyClass"]]))
## [1] 0 1
# Step 2: Keep complete rows with a valid 0/1 EarlyClass indicator
clean_data <- sleep_data |>
  # Require both variables to be present and EarlyClass to be 0 or 1
  filter(
    !is.na(EarlyClass),
    !is.na(ClassesMissed),
    EarlyClass %in% c(0, 1)
  ) |>
  # Treat the indicator as a categorical grouping variable
  mutate(EarlyClass = as.factor(EarlyClass))

# Step 3: Confirm which indicator values remain after cleaning
print("Unique EarlyClass values after cleaning:")
## [1] "Unique EarlyClass values after cleaning:"
print(unique(clean_data[["EarlyClass"]]))
## [1] 0 1
## Levels: 0 1
# Step 4: Pooled-variance (var.equal = TRUE) two-sample t-test of
# ClassesMissed by EarlyClass. Skipped unless exactly two groups exist.
if (length(unique(clean_data$EarlyClass)) != 2) {
  print("Error: EarlyClass column does not have exactly 2 levels (e.g., 0 and 1).")
} else {
  t_test_result <- t.test(
    ClassesMissed ~ EarlyClass,
    data = clean_data,
    var.equal = TRUE
  )

  # Show the fitted test
  print("T-Test Results:")
  print(t_test_result)
}
## [1] "T-Test Results:"
## 
##  Two Sample t-test
## 
## data:  ClassesMissed by EarlyClass
## t = 1.5319, df = 251, p-value = 0.1268
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -0.1882095  1.5061367
## sample estimates:
## mean in group 0 mean in group 1 
##        2.647059        1.988095
# Step 5: Boxplots of classes missed for students with and without an early
# class. Nothing is drawn unless exactly two groups exist.
if (length(unique(clean_data$EarlyClass)) != 2) {
  print("Error: Cannot create plot because EarlyClass column does not have exactly 2 levels.")
} else {
  ggplot(
    data = clean_data,
    mapping = aes(x = EarlyClass, y = ClassesMissed, fill = EarlyClass)
  ) +
    geom_boxplot() +
    scale_fill_manual(values = c("1" = "orange", "0" = "lightblue")) +
    theme_minimal() +
    labs(
      x = "Early Class (0 = No, 1 = Yes)",
      y = "Number of Classes Missed",
      title = "Comparison of Classes Missed Between Students With and Without Early Classes"
    )
}

Q5) Is there a significant difference in the average happiness level between students with at least moderate depression and normal depression status?

# Install and load necessary packages
# Only install the tidyverse when it is missing, so re-running the report does
# not reinstall the package every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(tidyverse)

# Fetch the SleepStudy data set from the Lock5 site
url <- "https://www.lock5stat.com/datasets3e/SleepStudy.csv"
sleep_data <- read.csv(file = url)

# Step 1: Show which labels the raw DepressionStatus column contains
print("Unique DepressionStatus values in the original dataset:")
## [1] "Unique DepressionStatus values in the original dataset:"
print(unique(sleep_data[["DepressionStatus"]]))
## [1] "normal"   "moderate" "severe"
# Step 2: Keep complete rows and collapse the status into two groups
clean_data <- sleep_data |>
  # Require both DepressionStatus and Happiness to be present
  filter(!is.na(DepressionStatus), !is.na(Happiness)) |>
  mutate(
    # Lower-case the labels so the comparisons below ignore capitalisation
    DepressionStatus = tolower(DepressionStatus),
    # "moderate" and "severe" are pooled; anything unrecognised becomes NA
    DepressionGroup = case_when(
      DepressionStatus %in% c("moderate", "severe") ~ "Moderate or Severe",
      DepressionStatus == "normal" ~ "Normal",
      .default = NA_character_
    )
  ) |>
  # Drop rows that could not be assigned to a group
  filter(!is.na(DepressionGroup))

# Step 3: Confirm which groups remain after cleaning
print("Unique DepressionGroup values:")
## [1] "Unique DepressionGroup values:"
print(unique(clean_data[["DepressionGroup"]]))
## [1] "Normal"             "Moderate or Severe"
# Step 4: Pooled-variance (var.equal = TRUE) two-sample t-test of Happiness
# by DepressionGroup. Skipped unless exactly two groups exist.
if (length(unique(clean_data$DepressionGroup)) != 2) {
  print("Error: DepressionGroup does not have exactly 2 levels (e.g., Normal and Moderate/Severe).")
} else {
  t_test_result <- t.test(
    Happiness ~ DepressionGroup,
    data = clean_data,
    var.equal = TRUE
  )

  # Show the fitted test
  print("T-Test Results:")
  print(t_test_result)
}
## [1] "T-Test Results:"
## 
##  Two Sample t-test
## 
## data:  Happiness by DepressionGroup
## t = -6.4426, df = 251, p-value = 5.954e-10
## alternative hypothesis: true difference in means between group Moderate or Severe and group Normal is not equal to 0
## 95 percent confidence interval:
##  -7.107907 -3.779653
## sample estimates:
## mean in group Moderate or Severe             mean in group Normal 
##                         21.61364                         27.05742
# Step 5: Boxplots of happiness for the two depression groups.
# Nothing is drawn unless exactly two groups exist.
if (length(unique(clean_data$DepressionGroup)) != 2) {
  print("Error: Cannot create plot because DepressionGroup does not have exactly 2 levels.")
} else {
  ggplot(
    data = clean_data,
    mapping = aes(x = DepressionGroup, y = Happiness, fill = DepressionGroup)
  ) +
    geom_boxplot() +
    scale_fill_manual(
      values = c("Moderate or Severe" = "lightblue", "Normal" = "lightyellow")
    ) +
    theme_minimal() +
    labs(
      x = "Depression Group",
      y = "Happiness Level",
      title = "Comparison of Happiness Levels Between Normal and Moderate/Severe Depression Groups"
    )
}

Q6) Is there a significant difference in average sleep quality scores between students who reported having at least one all-nighter (AllNighter=1) and those who didn’t (AllNighter=0)?

# Install and load necessary packages
# Only install the tidyverse when it is missing, so re-running the report does
# not reinstall the package every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(tidyverse)

# Fetch the SleepStudy data set from the Lock5 site
url <- "https://www.lock5stat.com/datasets3e/SleepStudy.csv"
sleep_data <- read.csv(file = url)

# Step 1: Show which codes the raw AllNighter column contains
print("Unique AllNighter values in the original dataset:")
## [1] "Unique AllNighter values in the original dataset:"
print(unique(sleep_data[["AllNighter"]]))
## [1] 0 1
# Step 2: List every column so the sleep quality variable can be identified
print("Column names in the dataset:")
## [1] "Column names in the dataset:"
print(names(sleep_data))
##  [1] "Gender"           "ClassYear"        "LarkOwl"          "NumEarlyClass"   
##  [5] "EarlyClass"       "GPA"              "ClassesMissed"    "CognitionZscore" 
##  [9] "PoorSleepQuality" "DepressionScore"  "AnxietyScore"     "StressScore"     
## [13] "DepressionStatus" "AnxietyStatus"    "Stress"           "DASScore"        
## [17] "Happiness"        "AlcoholUse"       "Drinks"           "WeekdayBed"      
## [21] "WeekdayRise"      "WeekdaySleep"     "WeekendBed"       "WeekendRise"     
## [25] "WeekendSleep"     "AverageSleep"     "AllNighter"
# Step 3: Keep complete rows with a valid 0/1 AllNighter indicator;
# PoorSleepQuality serves as the sleep quality measure
clean_data <- sleep_data |>
  # Require both variables to be present and AllNighter to be 0 or 1
  filter(
    !is.na(AllNighter),
    !is.na(PoorSleepQuality),
    AllNighter %in% c(0, 1)
  ) |>
  # Treat the indicator as a categorical grouping variable
  mutate(AllNighter = as.factor(AllNighter))

# Step 4: Confirm which indicator values remain after cleaning
print("Unique AllNighter values after cleaning:")
## [1] "Unique AllNighter values after cleaning:"
print(unique(clean_data[["AllNighter"]]))
## [1] 0 1
## Levels: 0 1
# Step 5: Pooled-variance (var.equal = TRUE) two-sample t-test of
# PoorSleepQuality by AllNighter. Skipped unless exactly two groups exist.
if (length(unique(clean_data$AllNighter)) != 2) {
  print("Error: AllNighter column does not have exactly 2 levels (e.g., 0 and 1).")
} else {
  t_test_result <- t.test(
    PoorSleepQuality ~ AllNighter,
    data = clean_data,
    var.equal = TRUE
  )

  # Show the fitted test
  print("T-Test Results:")
  print(t_test_result)
}
## [1] "T-Test Results:"
## 
##  Two Sample t-test
## 
## data:  PoorSleepQuality by AllNighter
## t = -1.664, df = 251, p-value = 0.09737
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -1.9486940  0.1638431
## sample estimates:
## mean in group 0 mean in group 1 
##        6.136986        7.029412
# Step 6: Create a boxplot to visualize the difference
# The plotted variable is PoorSleepQuality, so the y-axis is labelled as a
# poor-sleep score rather than a generic "sleep quality" score.
# NOTE(review): presumably higher values mean poorer sleep - verify against
# the data set's variable description.
if (length(unique(clean_data$AllNighter)) == 2) { # Ensure we have exactly 2 groups
  ggplot(clean_data, aes(x = AllNighter, y = PoorSleepQuality, fill = AllNighter)) +
    geom_boxplot() +
    scale_fill_manual(values = c("0" = "lightyellow", "1" = "orange")) +
    labs(title = "Comparison of Sleep Quality Between Students With and Without All-Nighters",
         x = "All-Nighter (0 = No, 1 = Yes)",
         y = "Poor Sleep Quality Score") +
    theme_minimal()
} else {
  print("Error: Cannot create plot because AllNighter column does not have exactly 2 levels.")
}

Q7) Do students who abstain from alcohol use have significantly better stress scores than those who report heavy alcohol use?

# Install and load necessary packages
# Only install the tidyverse when it is missing, so re-running the report does
# not reinstall the package every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(tidyverse)

# Fetch the SleepStudy data set from the Lock5 site
url <- "https://www.lock5stat.com/datasets3e/SleepStudy.csv"
sleep_data <- read.csv(file = url)

# Step 1: Show which labels the raw AlcoholUse column contains
print("Unique AlcoholUse values in the original dataset:")
## [1] "Unique AlcoholUse values in the original dataset:"
print(unique(sleep_data[["AlcoholUse"]]))
## [1] "Moderate" "Light"    "Abstain"  "Heavy"
# Step 2: Keep complete rows for abstainers and heavy drinkers only
clean_data <- sleep_data |>
  # Require both AlcoholUse and StressScore to be present
  filter(!is.na(AlcoholUse), !is.na(StressScore)) |>
  # "Abstain" (or "None") and "Heavy" form the two groups; every other
  # category becomes NA so it can be removed below
  mutate(
    AlcoholGroup = case_when(
      AlcoholUse == "Heavy" ~ "Heavy",
      AlcoholUse %in% c("Abstain", "None") ~ "Abstain",
      .default = NA_character_
    )
  ) |>
  # Drop rows outside the two groups of interest
  filter(!is.na(AlcoholGroup))

# Step 3: Confirm which groups remain after cleaning
print("Unique AlcoholGroup values after cleaning:")
## [1] "Unique AlcoholGroup values after cleaning:"
print(unique(clean_data[["AlcoholGroup"]]))
## [1] "Abstain" "Heavy"
# Step 4: Pooled-variance (var.equal = TRUE) two-sample t-test of StressScore
# by AlcoholGroup. Skipped unless exactly two groups exist.
# NOTE(review): the research question is directional (abstainers have better
# stress scores), but no `alternative` is passed, so t.test() runs its default
# two-sided test - confirm whether a one-sided alternative was intended.
if (length(unique(clean_data$AlcoholGroup)) != 2) {
  print("Error: AlcoholGroup does not have exactly 2 levels (e.g., Abstain and Heavy).")
} else {
  t_test_result <- t.test(
    StressScore ~ AlcoholGroup,
    data = clean_data,
    var.equal = TRUE
  )

  # Show the fitted test
  print("T-Test Results:")
  print(t_test_result)
}
## [1] "T-Test Results:"
## 
##  Two Sample t-test
## 
## data:  StressScore by AlcoholGroup
## t = -0.63251, df = 48, p-value = 0.5301
## alternative hypothesis: true difference in means between group Abstain and group Heavy is not equal to 0
## 95 percent confidence interval:
##  -6.129928  3.196104
## sample estimates:
## mean in group Abstain   mean in group Heavy 
##              8.970588             10.437500
# Step 5: Boxplots of stress scores for abstainers and heavy drinkers.
# Nothing is drawn unless exactly two groups exist.
if (length(unique(clean_data$AlcoholGroup)) != 2) {
  print("Error: Cannot create plot because AlcoholGroup column does not have exactly 2 levels.")
} else {
  ggplot(
    data = clean_data,
    mapping = aes(x = AlcoholGroup, y = StressScore, fill = AlcoholGroup)
  ) +
    geom_boxplot() +
    scale_fill_manual(values = c("Heavy" = "orange", "Abstain" = "lightgreen")) +
    theme_minimal() +
    labs(
      x = "Alcohol Use Group",
      y = "Stress Score",
      title = "Stress Scores Between Students Who Abstain and Report Heavy Alcohol Use"
    )
}

Q8) Is there a significant difference in the average number of drinks per week between students of different genders?

# Install and load necessary packages
# Only install the tidyverse when it is missing, so re-running the report does
# not reinstall the package every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(tidyverse)

# Fetch the SleepStudy data set from the Lock5 site
url <- "https://www.lock5stat.com/datasets3e/SleepStudy.csv"
sleep_data <- read.csv(file = url)

# Step 1: Show which codes the raw Gender column contains
print("Unique Gender values in the original dataset:")
## [1] "Unique Gender values in the original dataset:"
print(unique(sleep_data[["Gender"]]))
## [1] 0 1
# Step 2: Keep complete Gender/Drinks rows and relabel the 0/1 Gender codes
clean_data <- sleep_data |>
  # Discard rows where either Gender or Drinks is missing
  filter(!is.na(Gender), !is.na(Drinks)) |>
  # 0 becomes "Female", 1 becomes "Male"; any other code turns into NA
  mutate(
    Gender = case_when(
      Gender == 0 ~ "Female",
      Gender == 1 ~ "Male",
      .default = NA_character_
    )
  ) |>
  # Drop rows whose Gender code was not recognised
  filter(!is.na(Gender))

# Step 3: Confirm which labels remain after cleaning
print("Unique Gender values after cleaning:")
## [1] "Unique Gender values after cleaning:"
print(unique(clean_data[["Gender"]]))
## [1] "Female" "Male"
# Step 4: Perform a statistical test based on the number of genders
#   exactly 2 groups   -> pooled-variance two-sample t-test
#   more than 2 groups -> one-way ANOVA
#   fewer than 2       -> no comparison is possible, so report it instead of
#                         handing a single-group (or empty) data set to aov()
if (length(unique(clean_data$Gender)) == 2) {
  # Perform a two-sample t-test if there are exactly two genders
  t_test_result <- t.test(Drinks ~ Gender, data = clean_data, var.equal = TRUE)

  # Display the t-test results
  print("T-Test Results:")
  print(t_test_result)
} else if (length(unique(clean_data$Gender)) > 2) {
  # Perform a one-way ANOVA if there are more than two genders
  anova_result <- aov(Drinks ~ Gender, data = clean_data)

  # Display the ANOVA results
  print("ANOVA Results:")
  print(summary(anova_result))
} else {
  print("Error: Gender column has fewer than 2 levels, so groups cannot be compared.")
}
## [1] "T-Test Results:"
## 
##  Two Sample t-test
## 
## data:  Drinks by Gender
## t = -6.8358, df = 251, p-value = 6.16e-11
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
##  -4.251794 -2.349816
## sample estimates:
## mean in group Female   mean in group Male 
##             4.238411             7.539216
# Step 5: Boxplots of drinks per week for each gender, using ggplot2's
# default fill colours. Nothing is drawn when no gender group is present.
if (length(unique(clean_data$Gender)) == 0) {
  print("Error: Cannot create plot because Gender column is invalid or empty.")
} else {
  ggplot(
    data = clean_data,
    mapping = aes(x = Gender, y = Drinks, fill = Gender)
  ) +
    geom_boxplot() +
    theme_minimal() +
    labs(
      x = "Gender",
      y = "Drinks Per Week",
      title = "Comparison of Drinks Per Week Between Genders"
    )
}

Q9) Is there a significant difference in the average weekday bedtime between students with high and normal stress (Stress=high vs. Stress=normal)?

# Install and load necessary packages
# Only install the tidyverse when it is missing, so re-running the report does
# not reinstall the package every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(tidyverse)

# Fetch the SleepStudy data set from the Lock5 site
url <- "https://www.lock5stat.com/datasets3e/SleepStudy.csv"
sleep_data <- read.csv(file = url)

# Step 1: Show which labels the raw Stress column contains
print("Unique Stress values in the original dataset:")
## [1] "Unique Stress values in the original dataset:"
print(unique(sleep_data[["Stress"]]))
## [1] "normal" "high"
# Step 2: Keep complete rows and give the stress levels readable labels
clean_data <- sleep_data |>
  # Require both Stress and WeekdayBed to be present
  filter(!is.na(Stress), !is.na(WeekdayBed)) |>
  # The raw labels are lower-case; anything else becomes NA
  mutate(
    StressGroup = case_when(
      Stress == "normal" ~ "Normal Stress",
      Stress == "high" ~ "High Stress",
      .default = NA_character_
    )
  ) |>
  # Drop rows that could not be assigned to a group
  filter(!is.na(StressGroup))

# Step 3: Confirm which groups remain after cleaning
print("Unique StressGroup values after cleaning:")
## [1] "Unique StressGroup values after cleaning:"
print(unique(clean_data[["StressGroup"]]))
## [1] "Normal Stress" "High Stress"
# Step 4: Pooled-variance (var.equal = TRUE) two-sample t-test of WeekdayBed
# by StressGroup. Skipped unless exactly two groups exist.
if (length(unique(clean_data$StressGroup)) != 2) {
  print("Error: StressGroup column does not have exactly 2 levels (e.g., High Stress and Normal Stress).")
} else {
  t_test_result <- t.test(
    WeekdayBed ~ StressGroup,
    data = clean_data,
    var.equal = TRUE
  )

  # Show the fitted test
  print("T-Test Results:")
  print(t_test_result)
}
## [1] "T-Test Results:"
## 
##  Two Sample t-test
## 
## data:  WeekdayBed by StressGroup
## t = -1.0891, df = 251, p-value = 0.2771
## alternative hypothesis: true difference in means between group High Stress and group Normal Stress is not equal to 0
## 95 percent confidence interval:
##  -0.4786176  0.1377546
## sample estimates:
##   mean in group High Stress mean in group Normal Stress 
##                    24.71500                    24.88543
# Step 5: Boxplots of weekday bedtime for the two stress groups.
# Nothing is drawn unless exactly two groups exist.
if (length(unique(clean_data$StressGroup)) != 2) {
  print("Error: Cannot create plot because StressGroup column does not have exactly 2 levels.")
} else {
  ggplot(
    data = clean_data,
    mapping = aes(x = StressGroup, y = WeekdayBed, fill = StressGroup)
  ) +
    geom_boxplot() +
    scale_fill_manual(
      values = c("Normal Stress" = "lightgreen", "High Stress" = "red")
    ) +
    theme_minimal() +
    labs(
      x = "Stress Group",
      y = "Weekday Bedtime",
      title = "Comparison of Weekday Bedtime Between High and Normal Stress Groups"
    )
}

Q10) Is there a significant difference in the average hours of sleep on weekends between students in their first two class years and other students?

# Install and load necessary packages
# Only install the tidyverse when it is missing, so re-running the report does
# not reinstall the package every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(tidyverse)

# Fetch the SleepStudy data set from the Lock5 site
url <- "https://www.lock5stat.com/datasets3e/SleepStudy.csv"
sleep_data <- read.csv(file = url)

# Step 1: Show which codes the raw ClassYear column contains
print("Unique ClassYear values in the original dataset:")
## [1] "Unique ClassYear values in the original dataset:"
print(unique(sleep_data[["ClassYear"]]))
## [1] 4 1 2 3
# Step 2: Keep complete rows, label the class years and form two groups
clean_data <- sleep_data |>
  # Require both ClassYear and WeekendSleep to be present
  filter(!is.na(ClassYear), !is.na(WeekendSleep)) |>
  mutate(
    # Codes 1-4 get descriptive labels; any other code turns into NA
    ClassYear = case_when(
      ClassYear == 1 ~ "First-Year",
      ClassYear == 2 ~ "Sophomore",
      ClassYear == 3 ~ "Junior",
      ClassYear == 4 ~ "Senior",
      .default = NA_character_
    ),
    # First-years and sophomores versus juniors and seniors
    ClassGroup = case_when(
      ClassYear %in% c("Junior", "Senior") ~ "Other Years",
      ClassYear %in% c("First-Year", "Sophomore") ~ "First Two Years",
      .default = NA_character_
    )
  ) |>
  # Drop rows that could not be assigned to a group
  filter(!is.na(ClassGroup))

# Step 3: Confirm which groups remain after cleaning
print("Unique ClassGroup values after cleaning:")
## [1] "Unique ClassGroup values after cleaning:"
print(unique(clean_data[["ClassGroup"]]))
## [1] "Other Years"     "First Two Years"
# Step 4: Pooled-variance (var.equal = TRUE) two-sample t-test of
# WeekendSleep by ClassGroup. Skipped unless exactly two groups exist.
if (length(unique(clean_data$ClassGroup)) != 2) {
  print("Error: ClassGroup column does not have exactly 2 levels (e.g., First Two Years and Other Years).")
} else {
  t_test_result <- t.test(
    WeekendSleep ~ ClassGroup,
    data = clean_data,
    var.equal = TRUE
  )

  # Show the fitted test
  print("T-Test Results:")
  print(t_test_result)
}
## [1] "T-Test Results:"
## 
##  Two Sample t-test
## 
## data:  WeekendSleep by ClassGroup
## t = -0.047839, df = 251, p-value = 0.9619
## alternative hypothesis: true difference in means between group First Two Years and group Other Years is not equal to 0
## 95 percent confidence interval:
##  -0.3500149  0.3334142
## sample estimates:
## mean in group First Two Years     mean in group Other Years 
##                      8.213592                      8.221892
# Step 5: Boxplots of weekend sleep hours for the two class-year groups.
# Nothing is drawn unless exactly two groups exist.
if (length(unique(clean_data$ClassGroup)) != 2) {
  print("Error: Cannot create plot because ClassGroup column does not have exactly 2 levels.")
} else {
  ggplot(
    data = clean_data,
    mapping = aes(x = ClassGroup, y = WeekendSleep, fill = ClassGroup)
  ) +
    geom_boxplot() +
    scale_fill_manual(
      values = c("Other Years" = "orange", "First Two Years" = "lightgreen")
    ) +
    theme_minimal() +
    labs(
      x = "Class Group",
      y = "Weekend Sleep (Hours)",
      title = "Comparison of Weekend Sleep Between First Two Years and Other Students"
    )
}