Project2: Exploring Sleep Patterns in College Students

## Introduction

This report presents an analysis of sleep patterns among college students, using the “SleepStudy” dataset obtained from https://www.lock5stat.com/datapage3e.html. The dataset comprises 253 observations on 27 variables and provides insight into the sleep habits, psychological well-being, and lifestyle choices of college students. We will use the following questions to explore and analyze the data.

10. Is there a significant difference in the average hours of sleep on weekends between first two year students and other students?

Analysis

We will explore the questions in detail.

# Read the SleepStudy CSV straight from its URL and preview the first rows.
sleep <- read.csv(
  file = "https://www.lock5stat.com/datasets3e/SleepStudy.csv"
)
head(sleep, n = 6)

##   Gender ClassYear LarkOwl NumEarlyClass EarlyClass  GPA ClassesMissed
## 1      0         4 Neither             0          0 3.60             0
## 2      0         4 Neither             2          1 3.24             0
## 3      0         4     Owl             0          0 2.97            12
## 4      0         1    Lark             5          1 3.76             0
## 5      0         4     Owl             0          0 3.20             4
## 6      1         4 Neither             0          0 3.50             0
##   CognitionZscore PoorSleepQuality DepressionScore AnxietyScore StressScore
## 1           -0.26                4               4            3           8
## 2            1.39                6               1            0           3
## 3            0.38               18              18           18           9
## 4            1.39                9               1            4           6
## 5            1.22                9               7           25          14
## 6           -0.04                6              14            8          28
##   DepressionStatus AnxietyStatus Stress DASScore Happiness AlcoholUse Drinks
## 1           normal        normal normal       15        28   Moderate     10
## 2           normal        normal normal        4        25   Moderate      6
## 3         moderate        severe normal       45        17      Light      3
## 4           normal        normal normal       11        32      Light      2
## 5           normal        severe normal       46        15   Moderate      4
## 6         moderate      moderate   high       50        22    Abstain      0
##   WeekdayBed WeekdayRise WeekdaySleep WeekendBed WeekendRise WeekendSleep
## 1      25.75        8.70         7.70      25.75        9.50         5.88
## 2      25.70        8.20         6.80      26.00       10.00         7.25
## 3      27.44        6.55         3.00      28.00       12.59        10.09
## 4      23.50        7.17         6.77      27.00        8.00         7.25
## 5      25.90        8.67         6.09      23.75        9.50         7.00
## 6      23.80        8.95         9.05      26.00       10.75         9.00
##   AverageSleep AllNighter
## 1         7.18          0
## 2         6.93          0
## 3         5.02          0
## 4         6.90          0
## 5         6.35          0
## 6         9.04          0

Q1. Is there a significant difference in the average GPA between male and female college students?

# Welch two-sample t-test comparing mean GPA between Gender codes 0 and 1.
t.test(GPA ~ Gender, data = sleep)

## 
##  Welch Two Sample t-test
## 
## data:  GPA by Gender
## t = 3.9139, df = 200.9, p-value = 0.0001243
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  0.09982254 0.30252780
## sample estimates:
## mean in group 0 mean in group 1 
##        3.324901        3.123725

# Mean GPA for each Gender code, ignoring missing GPA values.
mean_gpa <- summarise(
  group_by(sleep, Gender),
  Mean_GPA = mean(GPA, na.rm = TRUE)
)
print(mean_gpa)

## # A tibble: 2 × 2
##   Gender Mean_GPA
##    <int>    <dbl>
## 1      0     3.32
## 2      1     3.12

# Bar chart of the per-gender mean GPA; the 0/1 codes on the x-axis are
# relabelled as Female/Male and the redundant fill legend is hidden.
ggplot(
  data = mean_gpa,
  mapping = aes(x = factor(Gender), y = Mean_GPA, fill = factor(Gender))
) +
  geom_col(width = 0.5, show.legend = FALSE) +
  scale_x_discrete(labels = c("0" = "Female", "1" = "Male")) +
  labs(title = "Mean GPA by Gender", x = "Gender", y = "Mean GPA") +
  theme_minimal()

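The average GPA for female students (3.32) is higher than for male students (3.12), and the t-test (p-value = 0.0001243) indicates that this difference is statistically significant.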

Q2. Is there a significant difference in the average number of early classes between the first two class years and other class years?

# Label each student as "FirstTwo" (class year 1 or 2) or "Other".
sleep[["YearGroup2"]] <- ifelse(
  test = sleep[["ClassYear"]] %in% c(1, 2),
  yes = "FirstTwo",
  no = "Other"
)

# Compare the mean number of early classes between the two year groups.
t.test(NumEarlyClass ~ YearGroup2, data = sleep)

## 
##  Welch Two Sample t-test
## 
## data:  NumEarlyClass by YearGroup2
## t = 4.1813, df = 250.69, p-value = 4.009e-05
## alternative hypothesis: true difference in means between group FirstTwo and group Other is not equal to 0
## 95 percent confidence interval:
##  0.4042016 1.1240309
## sample estimates:
## mean in group FirstTwo    mean in group Other 
##               2.070423               1.306306

# Average number of early classes within each class year.
average_early_classes_by_year <- summarise(
  group_by(sleep, ClassYear),
  AverageEarlyClasses = mean(NumEarlyClass, na.rm = TRUE)
)

print(average_early_classes_by_year)

## # A tibble: 4 × 2
##   ClassYear AverageEarlyClasses
##       <int>               <dbl>
## 1         1                2.36
## 2         2                1.93
## 3         3                1.43
## 4         4                1.19

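Freshmen have the highest average number of early classes (2.36), which decreases as students progress through their academic years. The t-test confirms that students in their first two years take significantly more early classes on average (2.07) than students in later years (1.31), with a p-value of 4.009e-05.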

Q3. Do students who identify as “larks” have significantly better cognitive skills (cognition z-score) compared to “owls”?

# Standardize the LarkOwl labels (lower case, no surrounding whitespace)
# BEFORE filtering: the raw values are capitalized ("Lark", "Owl",
# "Neither"), so matching on lower-case names only works after this step.
sleep$LarkOwl <- trimws(tolower(sleep$LarkOwl))

# Keep only the lark and owl groups. `%in%` never returns NA, so rows with a
# missing LarkOwl value are dropped here as well and no separate NA filter
# is needed.
lark_owl_data <- sleep %>%
  filter(LarkOwl %in% c("lark", "owl"))

# Welch two-sample t-test of the cognition z-score between larks and owls.
t.test(CognitionZscore ~ LarkOwl, data = lark_owl_data)

## 
##  Welch Two Sample t-test
## 
## data:  CognitionZscore by LarkOwl
## t = 0.80571, df = 75.331, p-value = 0.4229
## alternative hypothesis: true difference in means between group lark and group owl is not equal to 0
## 95 percent confidence interval:
##  -0.1893561  0.4465786
## sample estimates:
## mean in group lark  mean in group owl 
##         0.09024390        -0.03836735

# Summarize the average cognition Z-score by group
# Mean cognition z-score for each LarkOwl group in the filtered data.
average_cognition <- summarise(
  group_by(lark_owl_data, LarkOwl),
  MeanCognitionZscore = mean(CognitionZscore, na.rm = TRUE)
)

# Render the summary as a captioned HTML table.
knitr::kable(
  average_cognition,
  caption = "Average Cognition Z-score by Lark/Owl Groups",
  format = "html"
)

Average Cognition Z-score by Lark/Owl Groups
LarkOwl	MeanCognitionZscore
lark	0.0902439
owl	-0.0383673

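Students who identify as “larks” have a slightly higher average cognition z-score (0.09) compared to students who identify as “owls” (-0.04). However, the t-test (p-value = 0.4229) shows that this difference is not statistically significant.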

Q4. Is there a significant difference in the average number of classes missed in a semester between students who had at least one early class (EarlyClass=1) and those who didn’t (EarlyClass=0)?

# Welch two-sample t-test comparing mean ClassesMissed between students
# without (EarlyClass = 0) and with (EarlyClass = 1) an early class.
t.test(ClassesMissed ~ EarlyClass, data = sleep)

## 
##  Welch Two Sample t-test
## 
## data:  ClassesMissed by EarlyClass
## t = 1.4755, df = 152.78, p-value = 0.1421
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -0.2233558  1.5412830
## sample estimates:
## mean in group 0 mean in group 1 
##        2.647059        1.988095

# Average number of missed classes for each EarlyClass code.
average_missed_class <- summarize(
  group_by(sleep, EarlyClass),
  MeanClassMissed = mean(ClassesMissed, na.rm = TRUE)
)

print(average_missed_class)

## # A tibble: 2 × 2
##   EarlyClass MeanClassMissed
##        <int>           <dbl>
## 1          0            2.65
## 2          1            1.99

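Students without early classes missed more classes on average (2.65) than those with at least one early class (1.99), but the t-test (p-value = 0.1421) shows that this difference is not statistically significant.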

Q5. Is there a significant relationship between GPA and the number of early classes (NumEarlyClass)?

# Average GPA for each distinct number of early classes.
gpa_early_classes <- summarize(
  group_by(sleep, NumEarlyClass),
  AverageGPA = mean(GPA, na.rm = TRUE)
)

print(gpa_early_classes)

## # A tibble: 6 × 2
##   NumEarlyClass AverageGPA
##           <int>      <dbl>
## 1             0       3.20
## 2             1       3.37
## 3             2       3.22
## 4             3       3.23
## 5             4       3.50
## 6             5       3.32

# Scatter plot of GPA against the number of early classes, drawn with
# large solid blue points.
plot(
  x = sleep$NumEarlyClass,
  y = sleep$GPA,
  pch = 19,
  cex = 1.5,
  col = "blue",
  xlab = "Number of Early Classes",
  ylab = "GPA",
  main = "Relationship Between Number of Early Classes and GPA"
)
# Overlay the fitted line from a simple linear regression of GPA on
# NumEarlyClass as a red trend line.
abline(lm(GPA ~ NumEarlyClass, data = sleep), lwd = 2, col = "red")

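Students with 4 early classes have the highest average GPA (3.50), but the group averages do not rise or fall steadily with the number of early classes. Because no correlation or regression test is reported here, these averages alone do not establish a significant relationship between GPA and the number of early classes in this dataset.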

Q6. Is there a significant difference in average sleep quality scores between students who reported having at least one all-nighter (AllNighter=1) and those who didn’t (AllNighter=0)?

# Welch two-sample t-test comparing mean PoorSleepQuality between students
# without (AllNighter = 0) and with (AllNighter = 1) an all-nighter.
t.test(PoorSleepQuality ~ AllNighter, data = sleep)

## 
##  Welch Two Sample t-test
## 
## data:  PoorSleepQuality by AllNighter
## t = -1.7068, df = 44.708, p-value = 0.09479
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -1.9456958  0.1608449
## sample estimates:
## mean in group 0 mean in group 1 
##        6.136986        7.029412

# Mean PoorSleepQuality score for each AllNighter code (0 or 1). Note that
# the summary column is named AverageSleep but holds the mean
# PoorSleepQuality score, not hours of sleep.
average_sleep_quality <- summarize(
  group_by(sleep, AllNighter),
  AverageSleep = mean(PoorSleepQuality, na.rm = TRUE)
)

# Show the summary table
print(average_sleep_quality)

## # A tibble: 2 × 2
##   AllNighter AverageSleep
##        <int>        <dbl>
## 1          0         6.14
## 2          1         7.03

Q7. Is there a significant difference in the average number of missed classes between male and female students?

# Welch two-sample t-test comparing mean ClassesMissed between Gender
# codes 0 and 1; the result object is stored so it can be printed below.
t_test_result <- t.test(ClassesMissed ~ Gender, data = sleep)

# Print the full t-test summary (statistic, df, p-value, CI, group means)
print(t_test_result)

## 
##  Welch Two Sample t-test
## 
## data:  ClassesMissed by Gender
## t = -1.9776, df = 172.73, p-value = 0.04957
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -1.727480044 -0.001646043
## sample estimates:
## mean in group 0 mean in group 1 
##        1.860927        2.725490

# Boxplot of the number of missed classes for each Gender code.
# Colours follow the box order (0 = Female first, 1 = Male second) and use
# lightpink for Female and lightblue for Male, the same gender-to-colour
# mapping as the drinks-per-week boxplot, so the plots read consistently.
boxplot(ClassesMissed ~ Gender, data = sleep,
        main = "Classes Missed by Gender",
        xlab = "Gender (1 = Male, 0 = Female)", 
        ylab = "Number of Classes Missed",
        col = c("lightpink", "lightblue"))

The t-test indicates a statistically significant difference between the average number of classes missed by male and female students, with male students missing more classes on average than female students.

Q8. Is there a significant difference in the average number of drinks per week between students of different genders?

# Welch two-sample t-test comparing mean weekly Drinks between Gender
# codes 0 and 1 (Gender is converted to a factor inside the formula).
t_test_result <- t.test(Drinks ~ factor(Gender), data = sleep)

# Print the full t-test summary (statistic, df, p-value, CI, group means)
print(t_test_result)

## 
##  Welch Two Sample t-test
## 
## data:  Drinks by factor(Gender)
## t = -6.1601, df = 142.75, p-value = 7.002e-09
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -4.360009 -2.241601
## sample estimates:
## mean in group 0 mean in group 1 
##        4.238411        7.539216

# Load necessary library
library(ggplot2)

# Boxplot of weekly drinks by gender (Gender is coded 0 = Female, 1 = Male).
# Axis labels, legend labels and fill colours are keyed by the code rather
# than by position, so they cannot be mismatched if the factor level order
# changes, and the legend shows "Gender: Female/Male" instead of the raw
# "factor(Gender)" title with 0/1 entries.
ggplot(sleep, aes(x = factor(Gender), y = Drinks, fill = factor(Gender))) +
  geom_boxplot() +
  labs(title = "Number of Drinks per Week by Gender",
       x = "Gender",
       y = "Number of Drinks per Week") +
  scale_x_discrete(labels = c("0" = "Female", "1" = "Male")) +
  theme_minimal() +
  scale_fill_manual(
    name = "Gender",
    values = c("0" = "lightpink", "1" = "lightblue"),
    labels = c("0" = "Female", "1" = "Male")
  )

The t-test shows a statistically significant difference between males and females in their average number of drinks per week (p-value = 7.002e-09). The boxplot visually confirms this, showing that males tend to have a higher number of drinks per week (mean 7.54) than females (mean 4.24), with noticeable differences in the distribution.

Q9. Is there a significant difference in the average weekday bedtime between students with high and normal stress (Stress=High vs. Stress=Normal)?

# Welch two-sample t-test comparing mean WeekdayBed between the two Stress
# groups ("high" and "normal").
t.test(WeekdayBed ~ Stress, data = sleep)

## 
##  Welch Two Sample t-test
## 
## data:  WeekdayBed by Stress
## t = -1.0746, df = 87.048, p-value = 0.2855
## alternative hypothesis: true difference in means between group high and group normal is not equal to 0
## 95 percent confidence interval:
##  -0.4856597  0.1447968
## sample estimates:
##   mean in group high mean in group normal 
##             24.71500             24.88543

The t-test results indicate that there is no significant difference in weekday bedtimes between students with high stress and those with normal stress. The p-value is 0.2855, which is greater than 0.05, meaning any observed difference in bedtimes is likely due to random variation.

Q10. Is there a significant difference in the average hours of sleep on weekends between first two year students and other students?

# Step 1: label each student as "FirstTwoYears" (class year 1 or 2) or
# "OtherYears".
sleep[["YearGroup"]] <- ifelse(
  test = sleep[["ClassYear"]] %in% c(1, 2),
  yes = "FirstTwoYears",
  no = "OtherYears"
)

# Step 2: compare mean WeekendSleep between the two year groups.
t_test_result <- t.test(WeekendSleep ~ YearGroup, data = sleep)

# Show the test summary
print(t_test_result)

## 
##  Welch Two Sample t-test
## 
## data:  WeekendSleep by YearGroup
## t = -0.047888, df = 237.36, p-value = 0.9618
## alternative hypothesis: true difference in means between group FirstTwoYears and group OtherYears is not equal to 0
## 95 percent confidence interval:
##  -0.3497614  0.3331607
## sample estimates:
## mean in group FirstTwoYears    mean in group OtherYears 
##                    8.213592                    8.221892

# Step 3: violin plot of weekend sleep hours for the two year groups.
library(ggplot2)
ggplot(
  data = sleep,
  mapping = aes(x = YearGroup, y = WeekendSleep, fill = YearGroup)
) +
  geom_violin() +
  scale_fill_manual(values = c("skyblue", "lightgreen")) +
  labs(
    x = "Year Group",
    y = "Weekend Sleep Hours",
    title = "Distribution of Weekend Sleep Hours Between First Two Years and Other Students"
  ) +
  theme_minimal()

The t-test results show no significant difference in weekend sleep hours between first two-year students and other students. The p-value is 0.9618, indicating that any observed difference is likely due to random chance. Therefore, we fail to reject the null hypothesis and conclude that both groups have similar weekend sleep hours.

# Appendix

The following R code was used for data analysis and visualization for the different questions in this report.

# Mean GPA for each Gender code, ignoring missing GPA values.
mean_gpa <- summarise(
  group_by(sleep, Gender),
  Mean_GPA = mean(GPA, na.rm = TRUE)
)

print(mean_gpa)

# Bar chart of the per-gender mean GPA; the 0/1 codes on the x-axis are
# relabelled as Female/Male and the redundant fill legend is hidden.
ggplot(
  data = mean_gpa,
  mapping = aes(x = factor(Gender), y = Mean_GPA, fill = factor(Gender))
) +
  geom_col(width = 0.5, show.legend = FALSE) +
  scale_x_discrete(labels = c("0" = "Female", "1" = "Male")) +
  labs(title = "Mean GPA by Gender", x = "Gender", y = "Mean GPA") +
  theme_minimal()

# Average number of early classes within each class year.
average_early_classes_by_year <- summarise(
  group_by(sleep, ClassYear),
  AverageEarlyClasses = mean(NumEarlyClass, na.rm = TRUE)
)

print(average_early_classes_by_year)

# Normalize LarkOwl labels: lower-case them, then strip surrounding
# whitespace.
sleep[["LarkOwl"]] <- trimws(tolower(sleep[["LarkOwl"]]))

# Keep only non-missing rows labelled "lark" or "owl".
lark_owl_data <- filter(
  sleep,
  LarkOwl %in% c("lark", "owl"),
  !is.na(LarkOwl)
)

# Mean cognition z-score for each LarkOwl group.
average_cognition <- summarise(
  group_by(lark_owl_data, LarkOwl),
  MeanCognitionZscore = mean(CognitionZscore, na.rm = TRUE)
)

# Render the summary as a captioned HTML table.
knitr::kable(
  average_cognition,
  caption = "Average Cognition Z-score by Lark/Owl Groups",
  format = "html"
)


# Average number of missed classes for each EarlyClass code.
average_missed_class <- summarize(
  group_by(sleep, EarlyClass),
  MeanClassMissed = mean(ClassesMissed, na.rm = TRUE)
)

print(average_missed_class)

# Average GPA for each distinct number of early classes.
gpa_early_classes <- summarize(
  group_by(sleep, NumEarlyClass),
  AverageGPA = mean(GPA, na.rm = TRUE)
)

print(gpa_early_classes)

# Scatter plot of GPA against the number of early classes, drawn with
# large solid blue points.
plot(
  x = sleep$NumEarlyClass,
  y = sleep$GPA,
  pch = 19,
  cex = 1.5,
  col = "blue",
  xlab = "Number of Early Classes",
  ylab = "GPA",
  main = "Relationship Between Number of Early Classes and GPA"
)
# Overlay the fitted line from a simple linear regression of GPA on
# NumEarlyClass as a red trend line.
abline(lm(GPA ~ NumEarlyClass, data = sleep), lwd = 2, col = "red")

# Mean PoorSleepQuality score for each AllNighter code (0 or 1). Note that
# the summary column is named AverageSleep but holds the mean
# PoorSleepQuality score, not hours of sleep.
average_sleep_quality <- summarize(
  group_by(sleep, AllNighter),
  AverageSleep = mean(PoorSleepQuality, na.rm = TRUE)
)

# Show the summary table
print(average_sleep_quality)

# Two-sample t-test (Welch by default, since var.equal is not set)
# comparing mean ClassesMissed between Gender codes 0 and 1; the result
# object is stored so it can be printed below.
t_test_result <- t.test(ClassesMissed ~ Gender, data = sleep)

# Print the full t-test summary
print(t_test_result)

# Boxplot of the number of missed classes for each Gender code.
# Colours follow the box order (0 = Female first, 1 = Male second) and use
# lightpink for Female and lightblue for Male, the same gender-to-colour
# mapping as the drinks-per-week boxplot, so the plots read consistently.
boxplot(ClassesMissed ~ Gender, data = sleep,
        main = "Classes Missed by Gender",
        xlab = "Gender (1 = Male, 0 = Female)", 
        ylab = "Number of Classes Missed",
        col = c("lightpink", "lightblue"))

# Two-sample t-test (Welch by default, since var.equal is not set)
# comparing mean weekly Drinks between Gender codes 0 and 1 (Gender is
# converted to a factor inside the formula).
t_test_result <- t.test(Drinks ~ factor(Gender), data = sleep)

# Print the full t-test summary
print(t_test_result)

# Load necessary library
library(ggplot2)

# Boxplot of weekly drinks by gender (Gender is coded 0 = Female, 1 = Male).
# Axis labels, legend labels and fill colours are keyed by the code rather
# than by position, so they cannot be mismatched if the factor level order
# changes, and the legend shows "Gender: Female/Male" instead of the raw
# "factor(Gender)" title with 0/1 entries.
ggplot(sleep, aes(x = factor(Gender), y = Drinks, fill = factor(Gender))) +
  geom_boxplot() +
  labs(title = "Number of Drinks per Week by Gender",
       x = "Gender",
       y = "Number of Drinks per Week") +
  scale_x_discrete(labels = c("0" = "Female", "1" = "Male")) +
  theme_minimal() +
  scale_fill_manual(
    name = "Gender",
    values = c("0" = "lightpink", "1" = "lightblue"),
    labels = c("0" = "Female", "1" = "Male")
  )


# Two-sample t-test (Welch by default, since var.equal is not set)
# comparing mean WeekdayBed between the Stress groups.
t.test(WeekdayBed ~ Stress, data = sleep)

# Step 1: label each student as "FirstTwoYears" (class year 1 or 2) or
# "OtherYears".
sleep[["YearGroup"]] <- ifelse(
  test = sleep[["ClassYear"]] %in% c(1, 2),
  yes = "FirstTwoYears",
  no = "OtherYears"
)

# Step 2: compare mean WeekendSleep between the two year groups.
t_test_result <- t.test(WeekendSleep ~ YearGroup, data = sleep)

# Show the test summary
print(t_test_result)

# Step 3: Create a violin plot to visualize the distribution of WeekendSleep for each group
library(ggplot2)
ggplot(sleep, aes(x = YearGroup, y = WeekendSleep, fill = YearGroup)) +
  geom_violin() +
  labs(title = "Distribution of Weekend Sleep Hours Between First Two Years and Other Students",
       x = "Year Group", 
       y = "Weekend Sleep Hours") +
  theme_minimal()+