Using the SleepStudy data from lock5stat.com, this report addresses the following questions.
Each question is explored in detail below.
# Load the SleepStudy dataset from its URL and preview the first rows
college <- read.csv("https://www.lock5stat.com/datasets3e/SleepStudy.csv")
head(college)
## Gender ClassYear LarkOwl NumEarlyClass EarlyClass GPA ClassesMissed
## 1 0 4 Neither 0 0 3.60 0
## 2 0 4 Neither 2 1 3.24 0
## 3 0 4 Owl 0 0 2.97 12
## 4 0 1 Lark 5 1 3.76 0
## 5 0 4 Owl 0 0 3.20 4
## 6 1 4 Neither 0 0 3.50 0
## CognitionZscore PoorSleepQuality DepressionScore AnxietyScore StressScore
## 1 -0.26 4 4 3 8
## 2 1.39 6 1 0 3
## 3 0.38 18 18 18 9
## 4 1.39 9 1 4 6
## 5 1.22 9 7 25 14
## 6 -0.04 6 14 8 28
## DepressionStatus AnxietyStatus Stress DASScore Happiness AlcoholUse Drinks
## 1 normal normal normal 15 28 Moderate 10
## 2 normal normal normal 4 25 Moderate 6
## 3 moderate severe normal 45 17 Light 3
## 4 normal normal normal 11 32 Light 2
## 5 normal severe normal 46 15 Moderate 4
## 6 moderate moderate high 50 22 Abstain 0
## WeekdayBed WeekdayRise WeekdaySleep WeekendBed WeekendRise WeekendSleep
## 1 25.75 8.70 7.70 25.75 9.50 5.88
## 2 25.70 8.20 6.80 26.00 10.00 7.25
## 3 27.44 6.55 3.00 28.00 12.59 10.09
## 4 23.50 7.17 6.77 27.00 8.00 7.25
## 5 25.90 8.67 6.09 23.75 9.50 7.00
## 6 23.80 8.95 9.05 26.00 10.75 9.00
## AverageSleep AllNighter
## 1 7.18 0
## 2 6.93 0
## 3 5.02 0
## 4 6.90 0
## 5 6.35 0
## 6 9.04 0
# Install (only if missing) and load necessary packages
# requireNamespace() checks availability without attaching the package, so
# the script no longer reinstalls tidyverse on every run.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the SleepStudy CSV straight from its URL
url <- "https://www.lock5stat.com/datasets3e/SleepStudy.csv"
sleep_data <- read.csv(file = url)
# Step 1: List the distinct codes found in the raw Gender column
print("Unique Gender values in the original dataset:")
## [1] "Unique Gender values in the original dataset:"
print(unique(sleep_data[["Gender"]]))
## [1] 0 1
# Step 2: Keep complete Gender/GPA rows and label the numeric Gender codes
clean_data <- sleep_data %>%
  # Drop any row where Gender or GPA is missing
  filter(!(is.na(Gender) | is.na(GPA))) %>%
  # Recode 0 as "Female" and 1 as "Male"; any other code becomes NA
  mutate(
    Gender = case_when(
      Gender == 0 ~ "Female",
      Gender == 1 ~ "Male",
      .default = NA_character_
    )
  ) %>%
  # Discard rows whose Gender code was not recognized
  filter(!is.na(Gender))
# Step 3: Confirm which Gender labels remain
print("Unique Gender values after cleaning:")
## [1] "Unique Gender values after cleaning:"
print(unique(clean_data[["Gender"]]))
## [1] "Female" "Male"
# Step 4: Pooled-variance two-sample t-test of GPA by Gender
if (n_distinct(clean_data$Gender) != 2) {
  # The comparison needs exactly two groups
  print("Error: Gender column does not have exactly 2 levels (Male and Female).")
} else {
  t_test_result <- t.test(
    GPA ~ Gender,
    data = clean_data,
    var.equal = TRUE
  )
  # Show the fitted test
  print("T-Test Results:")
  print(t_test_result)
}
## [1] "T-Test Results:"
##
## Two Sample t-test
##
## data: GPA by Gender
## t = 3.9962, df = 251, p-value = 8.465e-05
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
## 0.1020292 0.3003212
## sample estimates:
## mean in group Female mean in group Male
## 3.324901 3.123725
# Step 5: GPA boxplots per gender, filled with fixed colors
if (length(unique(clean_data$Gender)) != 2) {
  print("Error: Cannot create plot because Gender column does not have exactly 2 levels.")
} else {
  ggplot(data = clean_data, mapping = aes(x = Gender, y = GPA, fill = Gender)) +
    geom_boxplot() +
    scale_fill_manual(values = c(Male = "blue", Female = "pink")) +
    ggtitle("GPA Comparison Between Male and Female Students") +
    xlab("Gender") +
    ylab("GPA") +
    theme_minimal()
}
# Install (only if missing) and load necessary packages
# requireNamespace() checks availability without attaching the package, so
# the script no longer reinstalls tidyverse on every run.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(tidyverse)
# Read the SleepStudy CSV straight from its URL
url <- "https://www.lock5stat.com/datasets3e/SleepStudy.csv"
sleep_data <- read.csv(file = url)
# Step 1: List the distinct codes found in the raw ClassYear column
print("Unique ClassYear values in the original dataset:")
## [1] "Unique ClassYear values in the original dataset:"
print(unique(sleep_data[["ClassYear"]]))
## [1] 4 1 2 3
# Step 2: Map numeric ClassYear values to descriptive labels and build the
# two comparison groups
clean_data <- sleep_data %>%
  # Remove rows with missing ClassYear or NumEarlyClass values
  filter(!is.na(ClassYear) & !is.na(NumEarlyClass)) %>%
  # Map numeric ClassYear to descriptive labels; unexpected codes become NA
  mutate(ClassYear = case_when(
    ClassYear == 1 ~ "First-Year",
    ClassYear == 2 ~ "Sophomore",
    ClassYear == 3 ~ "Junior",
    ClassYear == 4 ~ "Senior",
    TRUE ~ NA_character_
  )) %>%
  # Assign each group explicitly, so an unrecognized ClassYear (NA from the
  # step above) is not silently counted as "Other Years"
  mutate(ClassYearGroup = case_when(
    ClassYear %in% c("First-Year", "Sophomore") ~ "First Two Years",
    ClassYear %in% c("Junior", "Senior") ~ "Other Years",
    TRUE ~ NA_character_
  )) %>%
  # Remove rows that could not be assigned to either group
  filter(!is.na(ClassYearGroup))
# Step 3: Verify the new grouping
print("Unique ClassYearGroup values:")
## [1] "Unique ClassYearGroup values:"
print(unique(clean_data$ClassYearGroup))
## [1] "Other Years" "First Two Years"
# Step 4: Pooled-variance two-sample t-test of NumEarlyClass by ClassYearGroup
if (n_distinct(clean_data$ClassYearGroup) != 2) {
  # The comparison needs exactly two groups
  print("Error: ClassYearGroup does not have exactly 2 levels.")
} else {
  t_test_result <- t.test(
    NumEarlyClass ~ ClassYearGroup,
    data = clean_data,
    var.equal = TRUE
  )
  # Show the fitted test
  print("T-Test Results:")
  print(t_test_result)
}
## [1] "T-Test Results:"
##
## Two Sample t-test
##
## data: NumEarlyClass by ClassYearGroup
## t = 4.0419, df = 251, p-value = 7.056e-05
## alternative hypothesis: true difference in means between group First Two Years and group Other Years is not equal to 0
## 95 percent confidence interval:
## 0.391789 1.136443
## sample estimates:
## mean in group First Two Years mean in group Other Years
## 2.070423 1.306306
# Step 5: Boxplots of early-class counts for the two class-year groups
if (length(unique(clean_data$ClassYearGroup)) != 2) {
  print("Error: Cannot create plot because ClassYearGroup does not have exactly 2 levels.")
} else {
  ggplot(
    data = clean_data,
    mapping = aes(x = ClassYearGroup, y = NumEarlyClass, fill = ClassYearGroup)
  ) +
    geom_boxplot() +
    scale_fill_manual(
      values = c("First Two Years" = "lightblue", "Other Years" = "orange")
    ) +
    ggtitle("Comparison of Early Classes Between First Two Years and Other Years") +
    xlab("Class Year Group") +
    ylab("Number of Early Classes") +
    theme_minimal()
}
# Install (only if missing) and load necessary packages
# requireNamespace() checks availability without attaching the package, so
# the script no longer reinstalls tidyverse on every run.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(tidyverse)
# Read the SleepStudy CSV straight from its URL
url <- "https://www.lock5stat.com/datasets3e/SleepStudy.csv"
sleep_data <- read.csv(file = url)
# Step 1: List the distinct labels found in the raw LarkOwl column
print("Unique LarkOwl values in the original dataset:")
## [1] "Unique LarkOwl values in the original dataset:"
print(unique(sleep_data[["LarkOwl"]]))
## [1] "Neither" "Owl" "Lark"
# Step 2: Restrict the data to Larks and Owls with a known cognition score
clean_data <- sleep_data %>%
  # Keep rows labeled "Lark" or "Owl" that have a CognitionZscore;
  # %in% is FALSE for NA, so missing LarkOwl values are dropped here as well
  filter(
    !is.na(CognitionZscore),
    LarkOwl %in% c("Lark", "Owl")
  ) %>%
  # Make sure the grouping column is plain character data
  mutate(LarkOwl = as.character(LarkOwl))
# Step 3: Confirm which LarkOwl labels remain
print("Unique LarkOwl values after cleaning:")
## [1] "Unique LarkOwl values after cleaning:"
print(unique(clean_data[["LarkOwl"]]))
## [1] "Owl" "Lark"
# Step 4: Pooled-variance two-sample t-test of CognitionZscore by LarkOwl
if (n_distinct(clean_data$LarkOwl) != 2) {
  # The comparison needs exactly two groups
  print("Error: LarkOwl column does not have exactly 2 levels (e.g., Lark and Owl).")
} else {
  t_test_result <- t.test(
    CognitionZscore ~ LarkOwl,
    data = clean_data,
    var.equal = TRUE
  )
  # Show the fitted test
  print("T-Test Results:")
  print(t_test_result)
}
## [1] "T-Test Results:"
##
## Two Sample t-test
##
## data: CognitionZscore by LarkOwl
## t = 0.82293, df = 88, p-value = 0.4128
## alternative hypothesis: true difference in means between group Lark and group Owl is not equal to 0
## 95 percent confidence interval:
## -0.1819703 0.4391928
## sample estimates:
## mean in group Lark mean in group Owl
## 0.09024390 -0.03836735
# Step 5: Boxplots of cognition z-scores for Larks and Owls
if (length(unique(clean_data$LarkOwl)) != 2) {
  print("Error: Cannot create plot because LarkOwl column does not have exactly 2 levels.")
} else {
  ggplot(
    data = clean_data,
    mapping = aes(x = LarkOwl, y = CognitionZscore, fill = LarkOwl)
  ) +
    geom_boxplot() +
    scale_fill_manual(values = c(Lark = "lightblue", Owl = "lightyellow")) +
    ggtitle("Comparison of Cognitive Skills Between Larks and Owls") +
    xlab("Chronotype (Lark vs. Owl)") +
    ylab("Cognition Z-Score") +
    theme_minimal()
}
# Install (only if missing) and load necessary packages
# requireNamespace() checks availability without attaching the package, so
# the script no longer reinstalls tidyverse on every run.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(tidyverse)
# Read the SleepStudy CSV straight from its URL
url <- "https://www.lock5stat.com/datasets3e/SleepStudy.csv"
sleep_data <- read.csv(file = url)
# Step 1: List the distinct codes found in the raw EarlyClass column
print("Unique EarlyClass values in the original dataset:")
## [1] "Unique EarlyClass values in the original dataset:"
print(unique(sleep_data[["EarlyClass"]]))
## [1] 0 1
# Step 2: Restrict the data to complete rows with a 0/1 EarlyClass flag
clean_data <- sleep_data %>%
  # Keep rows with an EarlyClass of 0 or 1 and a known ClassesMissed;
  # %in% is FALSE for NA, so missing EarlyClass values are dropped here as well
  filter(
    EarlyClass %in% c(0, 1),
    !is.na(ClassesMissed)
  ) %>%
  # Store the flag as a factor so it acts as a grouping variable
  mutate(EarlyClass = as.factor(EarlyClass))
# Step 3: Confirm which EarlyClass levels remain
print("Unique EarlyClass values after cleaning:")
## [1] "Unique EarlyClass values after cleaning:"
print(unique(clean_data[["EarlyClass"]]))
## [1] 0 1
## Levels: 0 1
# Step 4: Pooled-variance two-sample t-test of ClassesMissed by EarlyClass
if (n_distinct(clean_data$EarlyClass) != 2) {
  # The comparison needs exactly two groups
  print("Error: EarlyClass column does not have exactly 2 levels (e.g., 0 and 1).")
} else {
  t_test_result <- t.test(
    ClassesMissed ~ EarlyClass,
    data = clean_data,
    var.equal = TRUE
  )
  # Show the fitted test
  print("T-Test Results:")
  print(t_test_result)
}
## [1] "T-Test Results:"
##
## Two Sample t-test
##
## data: ClassesMissed by EarlyClass
## t = 1.5319, df = 251, p-value = 0.1268
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -0.1882095 1.5061367
## sample estimates:
## mean in group 0 mean in group 1
## 2.647059 1.988095
# Step 5: Boxplots of classes missed, split by the EarlyClass flag
if (length(unique(clean_data$EarlyClass)) != 2) {
  print("Error: Cannot create plot because EarlyClass column does not have exactly 2 levels.")
} else {
  ggplot(
    data = clean_data,
    mapping = aes(x = EarlyClass, y = ClassesMissed, fill = EarlyClass)
  ) +
    geom_boxplot() +
    scale_fill_manual(values = c("0" = "lightblue", "1" = "orange")) +
    ggtitle("Comparison of Classes Missed Between Students With and Without Early Classes") +
    xlab("Early Class (0 = No, 1 = Yes)") +
    ylab("Number of Classes Missed") +
    theme_minimal()
}
# Install (only if missing) and load necessary packages
# requireNamespace() checks availability without attaching the package, so
# the script no longer reinstalls tidyverse on every run.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(tidyverse)
# Read the SleepStudy CSV straight from its URL
url <- "https://www.lock5stat.com/datasets3e/SleepStudy.csv"
sleep_data <- read.csv(file = url)
# Step 1: List the distinct labels found in the raw DepressionStatus column
print("Unique DepressionStatus values in the original dataset:")
## [1] "Unique DepressionStatus values in the original dataset:"
print(unique(sleep_data[["DepressionStatus"]]))
## [1] "normal" "moderate" "severe"
# Step 2: Keep complete rows and collapse DepressionStatus into two groups
clean_data <- sleep_data %>%
  # Drop any row where DepressionStatus or Happiness is missing
  filter(!(is.na(DepressionStatus) | is.na(Happiness))) %>%
  mutate(
    # Lowercase the status first so the comparisons below ignore capitalization
    DepressionStatus = tolower(DepressionStatus),
    # "normal" stays its own group; "moderate" and "severe" are pooled;
    # any other status becomes NA
    DepressionGroup = case_when(
      DepressionStatus %in% c("moderate", "severe") ~ "Moderate or Severe",
      DepressionStatus == "normal" ~ "Normal",
      .default = NA_character_
    )
  ) %>%
  # Discard rows that did not fall into either group
  filter(!is.na(DepressionGroup))
# Step 3: Confirm which DepressionGroup labels remain
print("Unique DepressionGroup values:")
## [1] "Unique DepressionGroup values:"
print(unique(clean_data[["DepressionGroup"]]))
## [1] "Normal" "Moderate or Severe"
# Step 4: Pooled-variance two-sample t-test of Happiness by DepressionGroup
if (n_distinct(clean_data$DepressionGroup) != 2) {
  # The comparison needs exactly two groups
  print("Error: DepressionGroup does not have exactly 2 levels (e.g., Normal and Moderate/Severe).")
} else {
  t_test_result <- t.test(
    Happiness ~ DepressionGroup,
    data = clean_data,
    var.equal = TRUE
  )
  # Show the fitted test
  print("T-Test Results:")
  print(t_test_result)
}
## [1] "T-Test Results:"
##
## Two Sample t-test
##
## data: Happiness by DepressionGroup
## t = -6.4426, df = 251, p-value = 5.954e-10
## alternative hypothesis: true difference in means between group Moderate or Severe and group Normal is not equal to 0
## 95 percent confidence interval:
## -7.107907 -3.779653
## sample estimates:
## mean in group Moderate or Severe mean in group Normal
## 21.61364 27.05742
# Step 5: Boxplots of happiness for the two depression groups
if (length(unique(clean_data$DepressionGroup)) != 2) {
  print("Error: Cannot create plot because DepressionGroup does not have exactly 2 levels.")
} else {
  ggplot(
    data = clean_data,
    mapping = aes(x = DepressionGroup, y = Happiness, fill = DepressionGroup)
  ) +
    geom_boxplot() +
    scale_fill_manual(
      values = c("Normal" = "lightyellow", "Moderate or Severe" = "lightblue")
    ) +
    ggtitle("Comparison of Happiness Levels Between Normal and Moderate/Severe Depression Groups") +
    xlab("Depression Group") +
    ylab("Happiness Level") +
    theme_minimal()
}
# Install (only if missing) and load necessary packages
# requireNamespace() checks availability without attaching the package, so
# the script no longer reinstalls tidyverse on every run.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(tidyverse)
# Read the SleepStudy CSV straight from its URL
url <- "https://www.lock5stat.com/datasets3e/SleepStudy.csv"
sleep_data <- read.csv(file = url)
# Step 1: List the distinct codes found in the raw AllNighter column
print("Unique AllNighter values in the original dataset:")
## [1] "Unique AllNighter values in the original dataset:"
print(unique(sleep_data[["AllNighter"]]))
## [1] 0 1
# Step 2: Print every column name to locate the sleep quality variable
print("Column names in the dataset:")
## [1] "Column names in the dataset:"
print(names(sleep_data))
## [1] "Gender" "ClassYear" "LarkOwl" "NumEarlyClass"
## [5] "EarlyClass" "GPA" "ClassesMissed" "CognitionZscore"
## [9] "PoorSleepQuality" "DepressionScore" "AnxietyScore" "StressScore"
## [13] "DepressionStatus" "AnxietyStatus" "Stress" "DASScore"
## [17] "Happiness" "AlcoholUse" "Drinks" "WeekdayBed"
## [21] "WeekdayRise" "WeekdaySleep" "WeekendBed" "WeekendRise"
## [25] "WeekendSleep" "AverageSleep" "AllNighter"
# Step 3: Restrict the data to complete rows with a 0/1 AllNighter flag
# (PoorSleepQuality is the sleep quality variable used here)
clean_data <- sleep_data %>%
  # Keep rows with an AllNighter of 0 or 1 and a known PoorSleepQuality;
  # %in% is FALSE for NA, so missing AllNighter values are dropped here as well
  filter(
    AllNighter %in% c(0, 1),
    !is.na(PoorSleepQuality)
  ) %>%
  # Store the flag as a factor so it acts as a grouping variable
  mutate(AllNighter = as.factor(AllNighter))
# Step 4: Confirm which AllNighter levels remain
print("Unique AllNighter values after cleaning:")
## [1] "Unique AllNighter values after cleaning:"
print(unique(clean_data[["AllNighter"]]))
## [1] 0 1
## Levels: 0 1
# Step 5: Pooled-variance two-sample t-test of PoorSleepQuality by AllNighter
if (n_distinct(clean_data$AllNighter) != 2) {
  # The comparison needs exactly two groups
  print("Error: AllNighter column does not have exactly 2 levels (e.g., 0 and 1).")
} else {
  t_test_result <- t.test(
    PoorSleepQuality ~ AllNighter,
    data = clean_data,
    var.equal = TRUE
  )
  # Show the fitted test
  print("T-Test Results:")
  print(t_test_result)
}
## [1] "T-Test Results:"
##
## Two Sample t-test
##
## data: PoorSleepQuality by AllNighter
## t = -1.664, df = 251, p-value = 0.09737
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -1.9486940 0.1638431
## sample estimates:
## mean in group 0 mean in group 1
## 6.136986 7.029412
# Step 6: Create a boxplot to visualize the difference
if (length(unique(clean_data$AllNighter)) == 2) { # Ensure we have exactly 2 groups
  ggplot(clean_data, aes(x = AllNighter, y = PoorSleepQuality, fill = AllNighter)) +
    geom_boxplot() +
    scale_fill_manual(values = c("0" = "lightyellow", "1" = "orange")) +
    # The y axis is labeled after the plotted column (PoorSleepQuality) so the
    # score is not read as a plain sleep-quality rating
    labs(title = "Comparison of Sleep Quality Between Students With and Without All-Nighters",
         x = "All-Nighter (0 = No, 1 = Yes)",
         y = "Poor Sleep Quality Score") +
    theme_minimal()
} else {
  print("Error: Cannot create plot because AllNighter column does not have exactly 2 levels.")
}
# Install (only if missing) and load necessary packages
# requireNamespace() checks availability without attaching the package, so
# the script no longer reinstalls tidyverse on every run.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(tidyverse)
# Read the SleepStudy CSV straight from its URL
url <- "https://www.lock5stat.com/datasets3e/SleepStudy.csv"
sleep_data <- read.csv(file = url)
# Step 1: List the distinct labels found in the raw AlcoholUse column
print("Unique AlcoholUse values in the original dataset:")
## [1] "Unique AlcoholUse values in the original dataset:"
print(unique(sleep_data[["AlcoholUse"]]))
## [1] "Moderate" "Light" "Abstain" "Heavy"
# Step 2: Keep complete rows and retain only abstainers and heavy drinkers
clean_data <- sleep_data %>%
  # Drop any row where AlcoholUse or StressScore is missing
  filter(!(is.na(AlcoholUse) | is.na(StressScore))) %>%
  # Build the two comparison groups; every other category becomes NA
  mutate(
    AlcoholGroup = case_when(
      AlcoholUse == "Heavy" ~ "Heavy",
      AlcoholUse %in% c("Abstain", "None") ~ "Abstain", # accept either abstaining label
      .default = NA_character_
    )
  ) %>%
  # Discard rows outside the two groups
  filter(!is.na(AlcoholGroup))
# Step 3: Confirm which AlcoholGroup labels remain
print("Unique AlcoholGroup values after cleaning:")
## [1] "Unique AlcoholGroup values after cleaning:"
print(unique(clean_data[["AlcoholGroup"]]))
## [1] "Abstain" "Heavy"
# Step 4: Pooled-variance two-sample t-test of StressScore by AlcoholGroup
if (n_distinct(clean_data$AlcoholGroup) != 2) {
  # The comparison needs exactly two groups
  print("Error: AlcoholGroup does not have exactly 2 levels (e.g., Abstain and Heavy).")
} else {
  t_test_result <- t.test(
    StressScore ~ AlcoholGroup,
    data = clean_data,
    var.equal = TRUE
  )
  # Show the fitted test
  print("T-Test Results:")
  print(t_test_result)
}
## [1] "T-Test Results:"
##
## Two Sample t-test
##
## data: StressScore by AlcoholGroup
## t = -0.63251, df = 48, p-value = 0.5301
## alternative hypothesis: true difference in means between group Abstain and group Heavy is not equal to 0
## 95 percent confidence interval:
## -6.129928 3.196104
## sample estimates:
## mean in group Abstain mean in group Heavy
## 8.970588 10.437500
# Step 5: Boxplots of stress scores for abstainers and heavy drinkers
if (length(unique(clean_data$AlcoholGroup)) != 2) {
  print("Error: Cannot create plot because AlcoholGroup column does not have exactly 2 levels.")
} else {
  ggplot(
    data = clean_data,
    mapping = aes(x = AlcoholGroup, y = StressScore, fill = AlcoholGroup)
  ) +
    geom_boxplot() +
    scale_fill_manual(values = c(Abstain = "lightgreen", Heavy = "orange")) +
    ggtitle("Stress Scores Between Students Who Abstain and Report Heavy Alcohol Use") +
    xlab("Alcohol Use Group") +
    ylab("Stress Score") +
    theme_minimal()
}
# Install (only if missing) and load necessary packages
# requireNamespace() checks availability without attaching the package, so
# the script no longer reinstalls tidyverse on every run.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(tidyverse)
# Read the SleepStudy CSV straight from its URL
url <- "https://www.lock5stat.com/datasets3e/SleepStudy.csv"
sleep_data <- read.csv(file = url)
# Step 1: List the distinct codes found in the raw Gender column
print("Unique Gender values in the original dataset:")
## [1] "Unique Gender values in the original dataset:"
print(unique(sleep_data[["Gender"]]))
## [1] 0 1
# Step 2: Keep complete Gender/Drinks rows and label the numeric Gender codes
clean_data <- sleep_data %>%
  # Drop any row where Gender or Drinks is missing
  filter(!(is.na(Gender) | is.na(Drinks))) %>%
  # Recode 0 as "Female" and 1 as "Male"; any other code becomes NA
  mutate(
    Gender = case_when(
      Gender == 0 ~ "Female",
      Gender == 1 ~ "Male",
      .default = NA_character_
    )
  ) %>%
  # Discard rows whose Gender code was not recognized
  filter(!is.na(Gender))
# Step 3: Confirm which Gender labels remain
print("Unique Gender values after cleaning:")
## [1] "Unique Gender values after cleaning:"
print(unique(clean_data[["Gender"]]))
## [1] "Female" "Male"
# Step 4: Perform a statistical test based on the number of genders
gender_group_count <- length(unique(clean_data$Gender))
if (gender_group_count == 2) {
  # Exactly two genders: pooled-variance two-sample t-test
  t_test_result <- t.test(Drinks ~ Gender, data = clean_data, var.equal = TRUE)
  # Display the t-test results
  print("T-Test Results:")
  print(t_test_result)
} else if (gender_group_count > 2) {
  # More than two genders: one-way ANOVA
  anova_result <- aov(Drinks ~ Gender, data = clean_data)
  # Display the ANOVA results
  print("ANOVA Results:")
  print(summary(anova_result))
} else {
  # Fewer than two groups: no between-group comparison is possible, so report
  # that instead of attempting the ANOVA
  print("Error: Gender column does not have at least 2 levels.")
}
## [1] "T-Test Results:"
##
## Two Sample t-test
##
## data: Drinks by Gender
## t = -6.8358, df = 251, p-value = 6.16e-11
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
## -4.251794 -2.349816
## sample estimates:
## mean in group Female mean in group Male
## 4.238411 7.539216
# Step 5: Create a boxplot to visualize the difference
if (length(unique(clean_data$Gender)) > 0) {
  ggplot(clean_data, aes(x = Gender, y = Drinks, fill = Gender)) +
    geom_boxplot() +
    # Use the same Male/Female fill colors as the GPA-by-Gender boxplot so
    # Gender is colored consistently across the report
    scale_fill_manual(values = c("Male" = "blue", "Female" = "pink")) +
    labs(title = "Comparison of Drinks Per Week Between Genders",
         x = "Gender",
         y = "Drinks Per Week") +
    theme_minimal()
} else {
  print("Error: Cannot create plot because Gender column is invalid or empty.")
}
# Install (only if missing) and load necessary packages
# requireNamespace() checks availability without attaching the package, so
# the script no longer reinstalls tidyverse on every run.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(tidyverse)
# Read the SleepStudy CSV straight from its URL
url <- "https://www.lock5stat.com/datasets3e/SleepStudy.csv"
sleep_data <- read.csv(file = url)
# Step 1: List the distinct labels found in the raw Stress column
print("Unique Stress values in the original dataset:")
## [1] "Unique Stress values in the original dataset:"
print(unique(sleep_data[["Stress"]]))
## [1] "normal" "high"
# Step 2: Keep complete Stress/WeekdayBed rows and label the stress groups
clean_data <- sleep_data %>%
  # Drop any row where Stress or WeekdayBed is missing
  filter(!(is.na(Stress) | is.na(WeekdayBed))) %>%
  # Turn the lowercase Stress labels into display labels; anything else is NA
  mutate(
    StressGroup = case_when(
      Stress == "normal" ~ "Normal Stress",
      Stress == "high" ~ "High Stress",
      .default = NA_character_
    )
  ) %>%
  # Discard rows outside the two groups
  filter(!is.na(StressGroup))
# Step 3: Confirm which StressGroup labels remain
print("Unique StressGroup values after cleaning:")
## [1] "Unique StressGroup values after cleaning:"
print(unique(clean_data[["StressGroup"]]))
## [1] "Normal Stress" "High Stress"
# Step 4: Pooled-variance two-sample t-test of WeekdayBed by StressGroup
if (n_distinct(clean_data$StressGroup) != 2) {
  # The comparison needs exactly two groups
  print("Error: StressGroup column does not have exactly 2 levels (e.g., High Stress and Normal Stress).")
} else {
  t_test_result <- t.test(
    WeekdayBed ~ StressGroup,
    data = clean_data,
    var.equal = TRUE
  )
  # Show the fitted test
  print("T-Test Results:")
  print(t_test_result)
}
## [1] "T-Test Results:"
##
## Two Sample t-test
##
## data: WeekdayBed by StressGroup
## t = -1.0891, df = 251, p-value = 0.2771
## alternative hypothesis: true difference in means between group High Stress and group Normal Stress is not equal to 0
## 95 percent confidence interval:
## -0.4786176 0.1377546
## sample estimates:
## mean in group High Stress mean in group Normal Stress
## 24.71500 24.88543
# Step 5: Boxplots of weekday bedtime for the two stress groups
if (length(unique(clean_data$StressGroup)) != 2) {
  print("Error: Cannot create plot because StressGroup column does not have exactly 2 levels.")
} else {
  ggplot(
    data = clean_data,
    mapping = aes(x = StressGroup, y = WeekdayBed, fill = StressGroup)
  ) +
    geom_boxplot() +
    scale_fill_manual(
      values = c("High Stress" = "red", "Normal Stress" = "lightgreen")
    ) +
    ggtitle("Comparison of Weekday Bedtime Between High and Normal Stress Groups") +
    xlab("Stress Group") +
    ylab("Weekday Bedtime") +
    theme_minimal()
}
# Install (only if missing) and load necessary packages
# requireNamespace() checks availability without attaching the package, so
# the script no longer reinstalls tidyverse on every run.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(tidyverse)
# Read the SleepStudy CSV straight from its URL
url <- "https://www.lock5stat.com/datasets3e/SleepStudy.csv"
sleep_data <- read.csv(file = url)
# Step 1: List the distinct codes found in the raw ClassYear column
print("Unique ClassYear values in the original dataset:")
## [1] "Unique ClassYear values in the original dataset:"
print(unique(sleep_data[["ClassYear"]]))
## [1] 4 1 2 3
# Step 2: Keep complete rows, label ClassYear, and form two class-year groups
clean_data <- sleep_data %>%
  # Drop any row where ClassYear or WeekendSleep is missing
  filter(!(is.na(ClassYear) | is.na(WeekendSleep))) %>%
  mutate(
    # Replace the numeric year codes with labels; unknown codes become NA
    ClassYear = case_when(
      ClassYear == 1 ~ "First-Year",
      ClassYear == 2 ~ "Sophomore",
      ClassYear == 3 ~ "Junior",
      ClassYear == 4 ~ "Senior",
      .default = NA_character_
    ),
    # Pool the labels just created into first-two-years versus later years
    ClassGroup = case_when(
      ClassYear %in% c("Junior", "Senior") ~ "Other Years",
      ClassYear %in% c("First-Year", "Sophomore") ~ "First Two Years",
      .default = NA_character_
    )
  ) %>%
  # Discard rows that did not fall into either group
  filter(!is.na(ClassGroup))
# Step 3: Confirm which ClassGroup labels remain
print("Unique ClassGroup values after cleaning:")
## [1] "Unique ClassGroup values after cleaning:"
print(unique(clean_data[["ClassGroup"]]))
## [1] "Other Years" "First Two Years"
# Step 4: Pooled-variance two-sample t-test of WeekendSleep by ClassGroup
if (n_distinct(clean_data$ClassGroup) != 2) {
  # The comparison needs exactly two groups
  print("Error: ClassGroup column does not have exactly 2 levels (e.g., First Two Years and Other Years).")
} else {
  t_test_result <- t.test(
    WeekendSleep ~ ClassGroup,
    data = clean_data,
    var.equal = TRUE
  )
  # Show the fitted test
  print("T-Test Results:")
  print(t_test_result)
}
## [1] "T-Test Results:"
##
## Two Sample t-test
##
## data: WeekendSleep by ClassGroup
## t = -0.047839, df = 251, p-value = 0.9619
## alternative hypothesis: true difference in means between group First Two Years and group Other Years is not equal to 0
## 95 percent confidence interval:
## -0.3500149 0.3334142
## sample estimates:
## mean in group First Two Years mean in group Other Years
## 8.213592 8.221892
# Step 5: Boxplots of weekend sleep for the two class-year groups
if (length(unique(clean_data$ClassGroup)) != 2) {
  print("Error: Cannot create plot because ClassGroup column does not have exactly 2 levels.")
} else {
  ggplot(
    data = clean_data,
    mapping = aes(x = ClassGroup, y = WeekendSleep, fill = ClassGroup)
  ) +
    geom_boxplot() +
    scale_fill_manual(
      values = c("First Two Years" = "lightgreen", "Other Years" = "orange")
    ) +
    ggtitle("Comparison of Weekend Sleep Between First Two Years and Other Students") +
    xlab("Class Group") +
    ylab("Weekend Sleep (Hours)") +
    theme_minimal()
}