## Introduction

This report presents an analysis of sleep patterns among college students, using the “SleepStudy” dataset obtained from https://www.lock5stat.com/datapage3e.html. The dataset comprises 253 observations on 27 variables and provides insight into the sleep habits, psychological well-being, and lifestyle choices of college students. We use the following questions to explore and analyze the data.
1. Is there a significant difference in the average GPA between male and female college students?
2. Is there a significant difference in the average number of early classes between the first two class years and other class years?
3. Do students who identify as “larks” have significantly better cognitive skills (cognition z-score) compared to “owls”?
4. Is there a significant difference in the average number of classes missed in a semester between students who had at least one early class (EarlyClass=1) and those who didn’t (EarlyClass=0)?
5. Is there a significant difference in the average happiness level between students with at least moderate depression and normal depression status?
6. Is there a significant difference in average sleep quality scores between students who reported having at least one all-nighter (AllNighter=1) and those who didn’t (AllNighter=0)?
7. Do students who abstain from alcohol use have significantly better stress scores than those who report heavy alcohol use?
8. Is there a significant difference in the average number of drinks per week between students of different genders?
9. Is there a significant difference in the average weekday bedtime between students with high and low stress (Stress=High vs. Stress=Normal)?
10. Is there a significant difference in the average hours of sleep on weekends between first two year students and other students?
We will explore the questions in detail.
# Attach the packages used throughout the analysis before their first use:
# dplyr supplies %>%, group_by(), summarize() and filter(); ggplot2 supplies
# ggplot() and its layers.
suppressPackageStartupMessages(library(dplyr))
library(ggplot2)

# Download the SleepStudy data from the Lock5 site and preview the first rows.
sleep <- read.csv("https://www.lock5stat.com/datasets3e/SleepStudy.csv")
head(sleep)
## Gender ClassYear LarkOwl NumEarlyClass EarlyClass GPA ClassesMissed
## 1 0 4 Neither 0 0 3.60 0
## 2 0 4 Neither 2 1 3.24 0
## 3 0 4 Owl 0 0 2.97 12
## 4 0 1 Lark 5 1 3.76 0
## 5 0 4 Owl 0 0 3.20 4
## 6 1 4 Neither 0 0 3.50 0
## CognitionZscore PoorSleepQuality DepressionScore AnxietyScore StressScore
## 1 -0.26 4 4 3 8
## 2 1.39 6 1 0 3
## 3 0.38 18 18 18 9
## 4 1.39 9 1 4 6
## 5 1.22 9 7 25 14
## 6 -0.04 6 14 8 28
## DepressionStatus AnxietyStatus Stress DASScore Happiness AlcoholUse Drinks
## 1 normal normal normal 15 28 Moderate 10
## 2 normal normal normal 4 25 Moderate 6
## 3 moderate severe normal 45 17 Light 3
## 4 normal normal normal 11 32 Light 2
## 5 normal severe normal 46 15 Moderate 4
## 6 moderate moderate high 50 22 Abstain 0
## WeekdayBed WeekdayRise WeekdaySleep WeekendBed WeekendRise WeekendSleep
## 1 25.75 8.70 7.70 25.75 9.50 5.88
## 2 25.70 8.20 6.80 26.00 10.00 7.25
## 3 27.44 6.55 3.00 28.00 12.59 10.09
## 4 23.50 7.17 6.77 27.00 8.00 7.25
## 5 25.90 8.67 6.09 23.75 9.50 7.00
## 6 23.80 8.95 9.05 26.00 10.75 9.00
## AverageSleep AllNighter
## 1 7.18 0
## 2 6.93 0
## 3 5.02 0
## 4 6.90 0
## 5 6.35 0
## 6 9.04 0
# Question 1: Welch two-sample t-test of mean GPA between the two Gender codes
# (this report's plots label code 0 as "Female" and code 1 as "Male").
t.test(GPA ~ Gender, data = sleep)
##
## Welch Two Sample t-test
##
## data: GPA by Gender
## t = 3.9139, df = 200.9, p-value = 0.0001243
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## 0.09982254 0.30252780
## sample estimates:
## mean in group 0 mean in group 1
## 3.324901 3.123725
# Mean GPA for each Gender code, ignoring missing GPAs; reused by the bar chart.
mean_gpa <- summarize(
  group_by(sleep, Gender),
  Mean_GPA = mean(GPA, na.rm = TRUE)
)
print(mean_gpa)
## # A tibble: 2 × 2
## Gender Mean_GPA
## <int> <dbl>
## 1 0 3.32
## 2 1 3.12
# Bar chart of the per-gender mean GPA. Gender is converted to a factor so the
# 0/1 codes form a discrete axis that can be relabelled.
ggplot(
  mean_gpa,
  aes(x = factor(Gender), y = Mean_GPA, fill = factor(Gender))
) +
  geom_col(width = 0.5, show.legend = FALSE) + # bars narrower than the default
  scale_x_discrete(labels = c("0" = "Female", "1" = "Male")) +
  labs(title = "Mean GPA by Gender", x = "Gender", y = "Mean GPA") +
  theme_minimal()
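The average GPA for female students (3.32) is higher than for male
students (3.12). The Welch t-test gives p = 0.0001, so the difference is
statistically significant at the 5% level.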
# Question 2: label each student as being in class years 1-2 or in any other
# year, then compare the mean number of early classes between the two groups.
sleep$YearGroup2 <- ifelse(
  is.element(sleep$ClassYear, c(1, 2)),
  "FirstTwo",
  "Other"
)
t.test(NumEarlyClass ~ YearGroup2, data = sleep)
##
## Welch Two Sample t-test
##
## data: NumEarlyClass by YearGroup2
## t = 4.1813, df = 250.69, p-value = 4.009e-05
## alternative hypothesis: true difference in means between group FirstTwo and group Other is not equal to 0
## 95 percent confidence interval:
## 0.4042016 1.1240309
## sample estimates:
## mean in group FirstTwo mean in group Other
## 2.070423 1.306306
# Mean number of early classes for each individual class year.
average_early_classes_by_year <- summarize(
  group_by(sleep, ClassYear),
  AverageEarlyClasses = mean(NumEarlyClass, na.rm = TRUE)
)
print(average_early_classes_by_year)
## # A tibble: 4 × 2
## ClassYear AverageEarlyClasses
## <int> <dbl>
## 1 1 2.36
## 2 2 1.93
## 3 3 1.43
## 4 4 1.19
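Freshmen have the highest average number of early classes (2.36), and the average decreases as students progress through their academic years. The t-test confirms that students in their first two years take significantly more early classes on average than other students (2.07 vs. 1.31, p < 0.001).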
# Question 3: compare cognition z-scores of self-identified larks and owls.
# Standardize the LarkOwl labels (lower case, surrounding whitespace removed)
# BEFORE filtering, so the lower-case comparison below can match the raw
# values ("Lark", "Owl", "Neither").
sleep$LarkOwl <- trimws(tolower(sleep$LarkOwl))
# Keep only the lark and owl rows. `%in%` never matches NA, so no separate
# NA filter is needed.
lark_owl_data <- sleep %>%
  filter(LarkOwl %in% c("lark", "owl"))
# Welch two-sample t-test of mean CognitionZscore between the two groups.
t.test(CognitionZscore ~ LarkOwl, data = lark_owl_data)
##
## Welch Two Sample t-test
##
## data: CognitionZscore by LarkOwl
## t = 0.80571, df = 75.331, p-value = 0.4229
## alternative hypothesis: true difference in means between group lark and group owl is not equal to 0
## 95 percent confidence interval:
## -0.1893561 0.4465786
## sample estimates:
## mean in group lark mean in group owl
## 0.09024390 -0.03836735
# Mean cognition z-score within each of the two LarkOwl groups.
average_cognition <- summarize(
  group_by(lark_owl_data, LarkOwl),
  MeanCognitionZscore = mean(CognitionZscore, na.rm = TRUE)
)
# Render the summary as a captioned table via knitr::kable.
knitr::kable(
  average_cognition,
  caption = "Average Cognition Z-score by Lark/Owl Groups",
  format = "html"
)
| LarkOwl | MeanCognitionZscore |
|---|---|
| lark | 0.0902439 |
| owl | -0.0383673 |
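Students who identify as “larks” have a slightly higher average cognition z-score (0.09) compared to students who identify as “owls” (-0.04). However, the difference is not statistically significant (p = 0.4229), so the data do not show that larks have better cognitive skills than owls.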
# Question 4: Welch two-sample t-test of mean ClassesMissed between students
# with EarlyClass = 0 and EarlyClass = 1.
t.test(ClassesMissed ~ EarlyClass, data = sleep)
##
## Welch Two Sample t-test
##
## data: ClassesMissed by EarlyClass
## t = 1.4755, df = 152.78, p-value = 0.1421
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -0.2233558 1.5412830
## sample estimates:
## mean in group 0 mean in group 1
## 2.647059 1.988095
# Mean number of missed classes for each EarlyClass value (0 or 1).
average_missed_class <- summarise(
  group_by(sleep, EarlyClass),
  MeanClassMissed = mean(ClassesMissed, na.rm = TRUE)
)
print(average_missed_class)
## # A tibble: 2 × 2
## EarlyClass MeanClassMissed
## <int> <dbl>
## 1 0 2.65
## 2 1 1.99
In this sample, students without early classes missed more classes on average (2.65) than those with early classes (1.99), but the difference is not statistically significant (p = 0.1421).
# Mean GPA for each distinct count of early classes.
gpa_early_classes <- summarise(
  group_by(sleep, NumEarlyClass),
  AverageGPA = mean(GPA, na.rm = TRUE)
)
print(gpa_early_classes)
## # A tibble: 6 × 2
## NumEarlyClass AverageGPA
## <int> <dbl>
## 1 0 3.20
## 2 1 3.37
## 3 2 3.22
## 4 3 3.23
## 5 4 3.50
## 6 5 3.32
# Scatter plot of GPA against the number of early classes, drawn with large
# solid blue points.
with(
  sleep,
  plot(
    NumEarlyClass, GPA,
    pch = 19, # solid circles
    cex = 1.5, # point size
    col = "blue", # point colour
    xlab = "Number of Early Classes",
    ylab = "GPA",
    main = "Relationship Between Number of Early Classes and GPA"
  )
)
# Overlay the least-squares trend line of GPA on NumEarlyClass.
abline(lm(GPA ~ NumEarlyClass, data = sleep), lwd = 2, col = "red")
Students with 4 early classes have the highest average GPA (3.4991),
suggesting that having 4 early classes might be associated with higher
GPA, at least for this dataset.
# Question 6: Welch two-sample t-test of mean PoorSleepQuality between students
# with AllNighter = 0 and AllNighter = 1.
t.test(PoorSleepQuality ~ AllNighter, data = sleep)
##
## Welch Two Sample t-test
##
## data: PoorSleepQuality by AllNighter
## t = -1.7068, df = 44.708, p-value = 0.09479
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -1.9456958 0.1608449
## sample estimates:
## mean in group 0 mean in group 1
## 6.136986 7.029412
# Mean PoorSleepQuality score for each AllNighter value (0 or 1). The summary
# column is named AverageSleep but holds the mean PoorSleepQuality score.
average_sleep_quality <- summarise(
  group_by(sleep, AllNighter),
  AverageSleep = mean(PoorSleepQuality, na.rm = TRUE)
)
# Show the two group means.
print(average_sleep_quality)
## # A tibble: 2 × 2
## AllNighter AverageSleep
## <int> <dbl>
## 1 0 6.14
## 2 1 7.03
# Welch two-sample t-test of mean ClassesMissed between the two Gender codes
# (0 = female, 1 = male, per the boxplot label used in this report).
# NOTE(review): the question list in the introduction does not include a
# classes-missed-by-gender question -- confirm which question this addresses.
t_test_result <- t.test(ClassesMissed ~ Gender, data = sleep)
# Print the stored test object (statistic, df, p-value, CI, group means).
print(t_test_result)
##
## Welch Two Sample t-test
##
## data: ClassesMissed by Gender
## t = -1.9776, df = 172.73, p-value = 0.04957
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -1.727480044 -0.001646043
## sample estimates:
## mean in group 0 mean in group 1
## 1.860927 2.725490
# Boxplot of the number of missed classes for each gender.
# Gender is coded 0 = female, 1 = male; `names` relabels the two boxes so the
# axis shows the group names instead of the raw codes. Colours follow the
# same convention as the ggplot gender boxplot in this report
# (female = lightpink, male = lightblue).
boxplot(ClassesMissed ~ Gender, data = sleep,
  main = "Classes Missed by Gender",
  xlab = "Gender",
  ylab = "Number of Classes Missed",
  names = c("Female", "Male"),
  col = c("lightpink", "lightblue"))
The t-test indicates a statistically significant difference between the average number of classes missed by male and female students, with male students missing more classes on average than female students.
# Question 8: Welch two-sample t-test of mean Drinks (drinks per week) between
# the two Gender codes; Gender is wrapped in factor() so it is treated as a
# grouping variable.
t_test_result <- t.test(Drinks ~ factor(Gender), data = sleep)
# Print the stored test object (statistic, df, p-value, CI, group means).
print(t_test_result)
##
## Welch Two Sample t-test
##
## data: Drinks by factor(Gender)
## t = -6.1601, df = 142.75, p-value = 7.002e-09
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -4.360009 -2.241601
## sample estimates:
## mean in group 0 mean in group 1
## 4.238411 7.539216
# ggplot2 provides ggplot() and the layers used for this figure.
library(ggplot2)
# Side-by-side boxplots of weekly drinks for each Gender code; the codes are
# shown as "Female"/"Male" and filled pink/blue in that order.
ggplot(sleep, aes(x = factor(Gender), y = Drinks, fill = factor(Gender))) +
  geom_boxplot() +
  scale_x_discrete(labels = c("Female", "Male")) +
  scale_fill_manual(values = c("lightpink", "lightblue")) +
  labs(
    x = "Gender",
    y = "Number of Drinks per Week",
    title = "Number of Drinks per Week by Gender"
  ) +
  theme_minimal()
The t-test shows a statistically significant difference between males and females in their average number of drinks per week (p-value = 7.002e-09). The boxplot visually confirms this, showing that males tend to have a higher number of drinks per week than females (mean of 7.54 vs. 4.24), with noticeable differences in the distribution.
# Question 9: Welch two-sample t-test of mean WeekdayBed between the "high" and
# "normal" Stress groups.
t.test(WeekdayBed ~ Stress, data = sleep)
##
## Welch Two Sample t-test
##
## data: WeekdayBed by Stress
## t = -1.0746, df = 87.048, p-value = 0.2855
## alternative hypothesis: true difference in means between group high and group normal is not equal to 0
## 95 percent confidence interval:
## -0.4856597 0.1447968
## sample estimates:
## mean in group high mean in group normal
## 24.71500 24.88543
The t-test results indicate that there is no significant difference in weekday bedtimes between students with high stress and those with normal stress. The p-value is 0.2855, which is greater than 0.05, meaning any observed difference in bedtimes is likely due to random variation.
# Step 1: tag each student as belonging to class years 1-2 or to any other year.
sleep$YearGroup <- ifelse(
  is.element(sleep$ClassYear, c(1, 2)),
  "FirstTwoYears",
  "OtherYears"
)
# Step 2: Welch two-sample t-test of mean WeekendSleep between the two tags.
t_test_result <- t.test(WeekendSleep ~ YearGroup, data = sleep)
# Show the test output.
print(t_test_result)
##
## Welch Two Sample t-test
##
## data: WeekendSleep by YearGroup
## t = -0.047888, df = 237.36, p-value = 0.9618
## alternative hypothesis: true difference in means between group FirstTwoYears and group OtherYears is not equal to 0
## 95 percent confidence interval:
## -0.3497614 0.3331607
## sample estimates:
## mean in group FirstTwoYears mean in group OtherYears
## 8.213592 8.221892
# Step 3: violin plot showing how WeekendSleep is distributed in each year group.
library(ggplot2)
ggplot(sleep, aes(x = YearGroup, y = WeekendSleep, fill = YearGroup)) +
  geom_violin() +
  scale_fill_manual(values = c("skyblue", "lightgreen")) +
  labs(
    x = "Year Group",
    y = "Weekend Sleep Hours",
    title = "Distribution of Weekend Sleep Hours Between First Two Years and Other Students"
  ) +
  theme_minimal()
The t-test results show no significant difference in weekend sleep hours between first two-year students and other students. The p-value is 0.9618, so the tiny observed difference (8.21 vs. 8.22 hours) is consistent with random chance. Therefore, we fail to reject the null hypothesis: the data provide no evidence that weekend sleep hours differ between the two groups.
# Appendix
The following R code was used for data analysis and visualization for the different questions in this report.
# Mean GPA for each Gender code, ignoring missing GPAs.
mean_gpa <- summarize(
  group_by(sleep, Gender),
  Mean_GPA = mean(GPA, na.rm = TRUE)
)
print(mean_gpa)
# Bar chart of the per-gender means with the 0/1 codes relabelled.
ggplot(
  mean_gpa,
  aes(x = factor(Gender), y = Mean_GPA, fill = factor(Gender))
) +
  geom_col(width = 0.5, show.legend = FALSE) + # bars narrower than the default
  scale_x_discrete(labels = c("0" = "Female", "1" = "Male")) +
  labs(title = "Mean GPA by Gender", x = "Gender", y = "Mean GPA") +
  theme_minimal()
# Mean number of early classes for each individual class year.
average_early_classes_by_year <- summarize(
  group_by(sleep, ClassYear),
  AverageEarlyClasses = mean(NumEarlyClass, na.rm = TRUE)
)
print(average_early_classes_by_year)
# Standardize the LarkOwl labels: lower case, surrounding whitespace removed.
sleep$LarkOwl <- trimws(tolower(sleep$LarkOwl))
# Keep only the lark and owl rows (`%in%` never matches NA, so rows with a
# missing LarkOwl are dropped by the same test).
lark_owl_data <- filter(sleep, LarkOwl %in% c("lark", "owl"))
# Mean cognition z-score within each of the two groups.
average_cognition <- summarize(
  group_by(lark_owl_data, LarkOwl),
  MeanCognitionZscore = mean(CognitionZscore, na.rm = TRUE)
)
# Render the summary as a captioned table via knitr::kable.
knitr::kable(
  average_cognition,
  caption = "Average Cognition Z-score by Lark/Owl Groups",
  format = "html"
)
# Mean number of missed classes for each EarlyClass value (0 or 1).
average_missed_class <- summarise(
  group_by(sleep, EarlyClass),
  MeanClassMissed = mean(ClassesMissed, na.rm = TRUE)
)
print(average_missed_class)
# Mean GPA for each distinct count of early classes.
gpa_early_classes <- summarise(
  group_by(sleep, NumEarlyClass),
  AverageGPA = mean(GPA, na.rm = TRUE)
)
print(gpa_early_classes)
# Scatter plot of GPA against the number of early classes, drawn with large
# solid blue points.
with(
  sleep,
  plot(
    NumEarlyClass, GPA,
    pch = 19, # solid circles
    cex = 1.5, # point size
    col = "blue", # point colour
    xlab = "Number of Early Classes",
    ylab = "GPA",
    main = "Relationship Between Number of Early Classes and GPA"
  )
)
# Overlay the least-squares trend line of GPA on NumEarlyClass.
abline(lm(GPA ~ NumEarlyClass, data = sleep), lwd = 2, col = "red")
# Mean PoorSleepQuality score for each AllNighter value (0 or 1). The summary
# column is named AverageSleep but holds the mean PoorSleepQuality score.
average_sleep_quality <- summarise(
  group_by(sleep, AllNighter),
  AverageSleep = mean(PoorSleepQuality, na.rm = TRUE)
)
# Show the two group means.
print(average_sleep_quality)
# Welch two-sample t-test of mean ClassesMissed between the two Gender codes
# (0 = female, 1 = male).
t_test_result <- t.test(ClassesMissed ~ Gender, data = sleep)
# Print the stored test object (statistic, df, p-value, CI, group means).
print(t_test_result)
# Boxplot of the number of missed classes for each gender. `names` relabels
# the two boxes so the axis shows the group names instead of the raw 0/1
# codes, and the colours follow the same convention as the ggplot gender
# boxplot in this report (female = lightpink, male = lightblue).
boxplot(ClassesMissed ~ Gender, data = sleep,
  main = "Classes Missed by Gender",
  xlab = "Gender",
  ylab = "Number of Classes Missed",
  names = c("Female", "Male"),
  col = c("lightpink", "lightblue"))
# Welch two-sample t-test of mean weekly Drinks between the two Gender codes.
t_test_result <- t.test(Drinks ~ factor(Gender), data = sleep)
# Show the test output.
print(t_test_result)
# ggplot2 provides ggplot() and the layers used for this figure.
library(ggplot2)
# Side-by-side boxplots of weekly drinks for each Gender code; the codes are
# shown as "Female"/"Male" and filled pink/blue in that order.
ggplot(sleep, aes(x = factor(Gender), y = Drinks, fill = factor(Gender))) +
  geom_boxplot() +
  scale_x_discrete(labels = c("Female", "Male")) +
  scale_fill_manual(values = c("lightpink", "lightblue")) +
  labs(
    x = "Gender",
    y = "Number of Drinks per Week",
    title = "Number of Drinks per Week by Gender"
  ) +
  theme_minimal()
# Welch two-sample t-test of mean WeekdayBed between the Stress groups.
t.test(WeekdayBed ~ Stress, data = sleep)
# Step 1: tag each student as belonging to class years 1-2 or to any other year.
sleep$YearGroup <- ifelse(
  is.element(sleep$ClassYear, c(1, 2)),
  "FirstTwoYears",
  "OtherYears"
)
# Step 2: Welch two-sample t-test of mean WeekendSleep between the two tags.
t_test_result <- t.test(WeekendSleep ~ YearGroup, data = sleep)
# Show the test output.
print(t_test_result)
# Step 3: Create a violin plot to visualize the distribution of WeekendSleep for each group
library(ggplot2)
ggplot(sleep, aes(x = YearGroup, y = WeekendSleep, fill = YearGroup)) +
geom_violin() +
labs(title = "Distribution of Weekend Sleep Hours Between First Two Years and Other Students",
x = "Year Group",
y = "Weekend Sleep Hours") +
theme_minimal()+