knitr::opts_chunk$set(echo = TRUE)

library(tidyverse)
library(readxl)
library(janitor)
library(rstatix)
library(ggplot2)
library(supernova)
library(emmeans)
library(knitr)
library(kableExtra)

# Import data
excel_file <- "midterm_sleep_exercise.xlsx"
sheets <- excel_sheets(excel_file)
sheets
## [1] "participant_info_midterm" "sleep_data_midterm"
participants_info_midterm <- read_excel(excel_file, sheet = "participant_info_midterm") %>% clean_names()
sleep_data_midterm <- read_excel(excel_file, sheet = "sleep_data_midterm") %>% clean_names()

glimpse(participants_info_midterm)
## Rows: 100
## Columns: 4
## $ id             <chr> "P001", "P002", "P003", "P004", "P005", "P006", "P007",…
## $ exercise_group <chr> "NONE", "Nonee", "None", "None", "None", "None", "None"…
## $ sex            <chr> "Male", "Malee", "Female", "Female", "Male", "Female", …
## $ age            <dbl> 35, 57, 26, 29, 33, 33, 32, 30, 37, 28, 30, 20, 42, 31,…
glimpse(sleep_data_midterm)
## Rows: 100
## Columns: 4
## $ id               <chr> "P001", "P002", "P003", "P004", "P005", "P006", "P007…
## $ pre_sleep        <chr> "zzz-5.8", "Sleep-6.6", NA, "SLEEP-7.2", "score-7.4",…
## $ post_sleep       <dbl> 4.7, 7.4, 6.2, 7.3, 7.4, 7.1, 6.7, 9.0, 5.1, 6.3, 6.2…
## $ sleep_efficiency <dbl> 81.6, 75.7, 82.9, 83.6, 83.5, 88.5, 83.6, 73.4, 88.2,…
colnames(participants_info_midterm)
## [1] "id"             "exercise_group" "sex"            "age"
colnames(sleep_data_midterm)
## [1] "id"               "pre_sleep"        "post_sleep"       "sleep_efficiency"
participants_info_midterm <- participants_info_midterm %>%
  rename(participant_id = id) %>%   # rename ID column
  mutate(
    sex = case_when(
      tolower(sex) %in% c("m", "male") ~ "Male",
      tolower(sex) %in% c("f", "female") ~ "Female",
      TRUE ~ str_to_title(sex)
    ),
    exercise_group = case_when(
      str_to_lower(exercise_group) %in% c("aerobic", "cardio") ~ "Aerobic",
      str_to_lower(exercise_group) %in% c("resistance", "strength", "weights") ~ "Resistance",
      str_to_lower(exercise_group) %in% c("no_exercise", "control", "none") ~ "Control",
      TRUE ~ str_to_title(exercise_group)
    )
  )

sleep_data_midterm <- sleep_data_midterm %>%
  rename(participant_id = id)

sleep_merged <- left_join(participants_info_midterm, sleep_data_midterm, by = "participant_id")

glimpse(sleep_merged)
## Rows: 100
## Columns: 7
## $ participant_id   <chr> "P001", "P002", "P003", "P004", "P005", "P006", "P007…
## $ exercise_group   <chr> "Control", "Nonee", "Control", "Control", "Control", …
## $ sex              <chr> "Male", "Malee", "Female", "Female", "Male", "Female"…
## $ age              <dbl> 35, 57, 26, 29, 33, 33, 32, 30, 37, 28, 30, 20, 42, 3…
## $ pre_sleep        <chr> "zzz-5.8", "Sleep-6.6", NA, "SLEEP-7.2", "score-7.4",…
## $ post_sleep       <dbl> 4.7, 7.4, 6.2, 7.3, 7.4, 7.1, 6.7, 9.0, 5.1, 6.3, 6.2…
## $ sleep_efficiency <dbl> 81.6, 75.7, 82.9, 83.6, 83.5, 88.5, 83.6, 73.4, 88.2,…
sleep_merged <- sleep_merged %>%
  mutate(
    pre_sleep_num = as.numeric(str_extract(as.character(pre_sleep), "[0-9]+\\.?[0-9]*")),
    post_sleep_num = as.numeric(str_extract(as.character(post_sleep), "[0-9]+\\.?[0-9]*")),
    
    sleep_difference = post_sleep_num - pre_sleep_num,
    
    agegroup2 = case_when(
      age < 40 ~ "Under40",
      age >= 40 ~ "40plus",
      TRUE ~ NA_character_
    )
  )


sum(is.na(sleep_merged$sleep_difference))
## [1] 14
sleep_merged <- sleep_merged %>% drop_na(sleep_difference)

glimpse(sleep_merged)
## Rows: 86
## Columns: 11
## $ participant_id   <chr> "P001", "P002", "P004", "P005", "P006", "P007", "P008…
## $ exercise_group   <chr> "Control", "Nonee", "Control", "Control", "Control", …
## $ sex              <chr> "Male", "Malee", "Female", "Male", "Female", "Male", …
## $ age              <dbl> 35, 57, 29, 33, 33, 32, 30, 37, 28, 30, 20, 42, 33, 2…
## $ pre_sleep        <chr> "zzz-5.8", "Sleep-6.6", "SLEEP-7.2", "score-7.4", "Sl…
## $ post_sleep       <dbl> 4.7, 7.4, 7.3, 7.4, 7.1, 6.7, 9.0, 5.1, 6.3, 6.2, 4.6…
## $ sleep_efficiency <dbl> 81.6, 75.7, 83.6, 83.5, 88.5, 83.6, 73.4, 88.2, 80.4,…
## $ pre_sleep_num    <dbl> 5.8, 6.6, 7.2, 7.4, 6.6, 6.0, 8.1, 5.5, 5.7, 7.0, 5.5…
## $ post_sleep_num   <dbl> 4.7, 7.4, 7.3, 7.4, 7.1, 6.7, 9.0, 5.1, 6.3, 6.2, 4.6…
## $ sleep_difference <dbl> -1.1, 0.8, 0.1, 0.0, 0.5, 0.7, 0.9, -0.4, 0.6, -0.8, …
## $ agegroup2        <chr> "Under40", "40plus", "Under40", "Under40", "Under40",…
desc_overall <- sleep_merged %>%
  summarise(
    mean_diff = mean(sleep_difference, na.rm = TRUE),
    sd_diff = sd(sleep_difference, na.rm = TRUE),
    min_diff = min(sleep_difference, na.rm = TRUE),
    max_diff = max(sleep_difference, na.rm = TRUE),
    mean_eff = mean(sleep_efficiency, na.rm = TRUE),
    sd_eff = sd(sleep_efficiency, na.rm = TRUE),
    min_eff = min(sleep_efficiency, na.rm = TRUE),
    max_eff = max(sleep_efficiency, na.rm = TRUE)
  )

kable(desc_overall, caption = "Overall Descriptive Statistics") %>%
  kable_styling(full_width = F)
Overall Descriptive Statistics
mean_diff sd_diff min_diff max_diff mean_eff sd_eff min_eff max_eff
0.6825581 0.6610494 -1.1 2.1 83.77558 5.973804 71.7 101.5
desc_group <- sleep_merged %>%
  group_by(exercise_group) %>%
  summarise(
    mean_diff = mean(sleep_difference, na.rm = TRUE),
    sd_diff = sd(sleep_difference, na.rm = TRUE),
    mean_eff = mean(sleep_efficiency, na.rm = TRUE),
    sd_eff = sd(sleep_efficiency, na.rm = TRUE)
  )

kable(desc_group, caption = "Descriptive Statistics by Exercise Group") %>%
  kable_styling(full_width = F)
Descriptive Statistics by Exercise Group
exercise_group mean_diff sd_diff mean_eff sd_eff
Aerobic 1.1400000 0.4977846 85.42000 6.1459101
C 1.1000000 NA 86.00000 NA
C+W 1.1000000 0.1414214 90.05000 5.3033009
Cardio+Weights 0.8250000 0.3971941 86.32500 6.1500428
Control -0.0222222 0.6254933 81.34444 5.8711881
Cw 1.1000000 NA 90.60000 NA
N 0.3000000 0.8485281 81.30000 0.2828427
Nonee 0.8000000 NA 75.70000 NA
Resistance 0.7157895 0.6238440 81.31053 4.4438340
Weightsss 0.1000000 NA 85.30000 NA
Weightz 0.3000000 NA 80.40000 NA
# Boxplot: Sleep Difference
ggplot(sleep_merged, aes(x = exercise_group, y = sleep_difference)) +
  geom_boxplot(fill = "skyblue") +
  labs(title = "Sleep Difference by Exercise Group",
       x = "Exercise Group",
       y = "Change in Sleep Duration (hrs)") +
  theme_minimal(base_size = 13)

# Boxplot: Sleep Efficiency
ggplot(sleep_merged, aes(x = exercise_group, y = sleep_efficiency)) +
  geom_boxplot(fill = "tan") +
  labs(title = "Sleep Efficiency by Exercise Group",
       x = "Exercise Group",
       y = "Sleep Efficiency (%)") +
  theme_minimal(base_size = 13)

# Scatterplot: Sleep Difference vs Sleep Efficiency
ggplot(sleep_merged, aes(x = sleep_efficiency, y = sleep_difference, color = exercise_group)) +
  geom_point(size = 2, alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "Relationship Between Sleep Efficiency and Sleep Difference",
       x = "Sleep Efficiency (%)",
       y = "Sleep Difference (hrs)") +
  theme_minimal(base_size = 13)
## `geom_smooth()` using formula = 'y ~ x'

sleep_merged <- sleep_merged %>%
  mutate(
    sex = case_when(
      str_detect(str_to_lower(sex), "^m") ~ "Male",
      str_detect(str_to_lower(sex), "^f") ~ "Female",
      TRUE ~ NA_character_
    )
  )
sleep_merged <- sleep_merged %>% drop_na(sex)
table(sleep_merged$sex)
## 
## Female   Male 
##     49     37
t_sex <- sleep_merged %>% t_test(sleep_difference ~ sex)
t_age <- sleep_merged %>% t_test(sleep_difference ~ agegroup2)

t_sex
## # A tibble: 1 × 8
##   .y.              group1 group2    n1    n2 statistic    df     p
## * <chr>            <chr>  <chr>  <int> <int>     <dbl> <dbl> <dbl>
## 1 sleep_difference Female Male      49    37      1.58  77.6 0.118
t_age
## # A tibble: 1 × 8
##   .y.              group1 group2     n1    n2 statistic    df     p
## * <chr>            <chr>  <chr>   <int> <int>     <dbl> <dbl> <dbl>
## 1 sleep_difference 40plus Under40    19    67      1.37  36.7 0.178
# ANOVA: Sleep Difference ~ Exercise Group
anova_diff <- aov(sleep_difference ~ exercise_group, data = sleep_merged)
summary(anova_diff)
##                Df Sum Sq Mean Sq F value   Pr(>F)    
## exercise_group 10  15.04  1.5042   5.104 1.18e-05 ***
## Residuals      75  22.10  0.2947                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
supernova(anova_diff)  # effect size (Partial Eta Squared)
##  Analysis of Variance Table (Type III SS)
##  Model: sleep_difference ~ exercise_group
## 
##                              SS df    MS     F   PRE     p
##  ----- --------------- | ------ -- ----- ----- ----- -----
##  Model (error reduced) | 15.042 10 1.504 5.104 .4050 .0000
##  Error (from model)    | 22.102 75 0.295                  
##  ----- --------------- | ------ -- ----- ----- ----- -----
##  Total (empty model)   | 37.144 85 0.437
# Tukey post-hoc
tukey_diff <- TukeyHSD(anova_diff)
tukey_diff
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = sleep_difference ~ exercise_group, data = sleep_merged)
## 
## $exercise_group
##                                    diff        lwr        upr     p adj
## C-Aerobic                 -4.000000e-02 -1.8868172  1.8068172 1.0000000
## C+W-Aerobic               -4.000000e-02 -1.3766282  1.2966282 1.0000000
## Cardio+Weights-Aerobic    -3.150000e-01 -0.8849402  0.2549402 0.7556344
## Control-Aerobic           -1.162222e+00 -1.7477801 -0.5766644 0.0000003
## Cw-Aerobic                -4.000000e-02 -1.8868172  1.8068172 1.0000000
## N-Aerobic                 -8.400000e-01 -2.1766282  0.4966282 0.5903541
## Nonee-Aerobic             -3.400000e-01 -2.1868172  1.5068172 0.9999323
## Resistance-Aerobic        -4.242105e-01 -1.0016012  0.1531802 0.3568154
## Weightsss-Aerobic         -1.040000e+00 -2.8868172  0.8068172 0.7344003
## Weightz-Aerobic           -8.400000e-01 -2.6868172  1.0068172 0.9118975
## C+W-C                      6.661338e-16 -2.2073688  2.2073688 1.0000000
## Cardio+Weights-C          -2.750000e-01 -2.1218172  1.5718172 0.9999907
## Control-C                 -1.122222e+00 -2.9739187  0.7294743 0.6412635
## Cw-C                       8.881784e-16 -2.5488499  2.5488499 1.0000000
## N-C                       -8.000000e-01 -3.0073688  1.4073688 0.9802381
## Nonee-C                   -3.000000e-01 -2.8488499  2.2488499 0.9999990
## Resistance-C              -3.842105e-01 -2.2333406  1.4649195 0.9997965
## Weightsss-C               -1.000000e+00 -3.5488499  1.5488499 0.9655909
## Weightz-C                 -8.000000e-01 -3.3488499  1.7488499 0.9932982
## Cardio+Weights-C+W        -2.750000e-01 -1.6116282  1.0616282 0.9998138
## Control-C+W               -1.122222e+00 -2.4655841  0.2211396 0.1887353
## Cw-C+W                     2.220446e-16 -2.2073688  2.2073688 1.0000000
## N-C+W                     -8.000000e-01 -2.6023090  1.0023090 0.9238680
## Nonee-C+W                 -3.000000e-01 -2.5073688  1.9073688 0.9999961
## Resistance-C+W            -3.842105e-01 -1.7240325  0.9556115 0.9967429
## Weightsss-C+W             -1.000000e+00 -3.2073688  1.2073688 0.9139514
## Weightz-C+W               -8.000000e-01 -3.0073688  1.4073688 0.9802381
## Control-Cardio+Weights    -8.472222e-01 -1.4327801 -0.2616644 0.0003859
## Cw-Cardio+Weights          2.750000e-01 -1.5718172  2.1218172 0.9999907
## N-Cardio+Weights          -5.250000e-01 -1.8616282  0.8116282 0.9653250
## Nonee-Cardio+Weights      -2.500000e-02 -1.8718172  1.8218172 1.0000000
## Resistance-Cardio+Weights -1.092105e-01 -0.6866012  0.4681802 0.9999132
## Weightsss-Cardio+Weights  -7.250000e-01 -2.5718172  1.1218172 0.9654518
## Weightz-Cardio+Weights    -5.250000e-01 -2.3718172  1.3218172 0.9969667
## Cw-Control                 1.122222e+00 -0.7294743  2.9739187 0.6412635
## N-Control                  3.222222e-01 -1.0211396  1.6655841 0.9992763
## Nonee-Control              8.222222e-01 -1.0294743  2.6739187 0.9236991
## Resistance-Control         7.380117e-01  0.1451996  1.3308238 0.0041643
## Weightsss-Control          1.222222e-01 -1.7294743  1.9739187 1.0000000
## Weightz-Control            3.222222e-01 -1.5294743  2.1739187 0.9999598
## N-Cw                      -8.000000e-01 -3.0073688  1.4073688 0.9802381
## Nonee-Cw                  -3.000000e-01 -2.8488499  2.2488499 0.9999990
## Resistance-Cw             -3.842105e-01 -2.2333406  1.4649195 0.9997965
## Weightsss-Cw              -1.000000e+00 -3.5488499  1.5488499 0.9655909
## Weightz-Cw                -8.000000e-01 -3.3488499  1.7488499 0.9932982
## Nonee-N                    5.000000e-01 -1.7073688  2.7073688 0.9995612
## Resistance-N               4.157895e-01 -0.9240325  1.7556115 0.9938679
## Weightsss-N               -2.000000e-01 -2.4073688  2.0073688 0.9999999
## Weightz-N                 -6.661338e-16 -2.2073688  2.2073688 1.0000000
## Resistance-Nonee          -8.421053e-02 -1.9333406  1.7649195 1.0000000
## Weightsss-Nonee           -7.000000e-01 -3.2488499  1.8488499 0.9977168
## Weightz-Nonee             -5.000000e-01 -3.0488499  2.0488499 0.9998790
## Weightsss-Resistance      -6.157895e-01 -2.4649195  1.2333406 0.9894254
## Weightz-Resistance        -4.157895e-01 -2.2649195  1.4333406 0.9995887
## Weightz-Weightsss          2.000000e-01 -2.3488499  2.7488499 1.0000000
# ANOVA: Sleep Efficiency ~ Exercise Group
anova_eff <- aov(sleep_efficiency ~ exercise_group, data = sleep_merged)
summary(anova_eff)
##                Df Sum Sq Mean Sq F value Pr(>F)  
## exercise_group 10  627.4   62.74   1.956 0.0505 .
## Residuals      75 2406.0   32.08                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
supernova(anova_eff)  # effect size (Partial Eta Squared)
##  Analysis of Variance Table (Type III SS)
##  Model: sleep_efficiency ~ exercise_group
## 
##                                SS df     MS     F   PRE     p
##  ----- --------------- | -------- -- ------ ----- ----- -----
##  Model (error reduced) |  627.362 10 62.736 1.956 .2068 .0505
##  Error (from model)    | 2405.977 75 32.080                  
##  ----- --------------- | -------- -- ------ ----- ----- -----
##  Total (empty model)   | 3033.339 85 35.686
# Tukey post-hoc
tukey_eff <- TukeyHSD(anova_eff)
tukey_eff
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = sleep_efficiency ~ exercise_group, data = sleep_merged)
## 
## $exercise_group
##                                   diff        lwr       upr     p adj
## C-Aerobic                   0.58000000 -18.688818 19.848818 1.0000000
## C+W-Aerobic                 4.63000000  -9.315747 18.575747 0.9896674
## Cardio+Weights-Aerobic      0.90500000  -5.041486  6.851486 0.9999885
## Control-Aerobic            -4.07555556 -10.184990  2.033878 0.5020226
## Cw-Aerobic                  5.18000000 -14.088818 24.448818 0.9980888
## N-Aerobic                  -4.12000000 -18.065747  9.825747 0.9958546
## Nonee-Aerobic              -9.72000000 -28.988818  9.548818 0.8433329
## Resistance-Aerobic         -4.10947368 -10.133695  1.914748 0.4681875
## Weightsss-Aerobic          -0.12000000 -19.388818 19.148818 1.0000000
## Weightz-Aerobic            -5.02000000 -24.288818 14.248818 0.9985317
## C+W-C                       4.05000000 -18.980643 27.080643 0.9999557
## Cardio+Weights-C            0.32500000 -18.943818 19.593818 1.0000000
## Control-C                  -4.65555556 -23.975282 14.664171 0.9992467
## Cw-C                        4.60000000 -21.993496 31.193496 0.9999619
## N-C                        -4.70000000 -27.730643 18.330643 0.9998269
## Nonee-C                   -10.30000000 -36.893496 16.293496 0.9684849
## Resistance-C               -4.68947368 -23.982423 14.603476 0.9991883
## Weightsss-C                -0.70000000 -27.293496 25.893496 1.0000000
## Weightz-C                  -5.60000000 -32.193496 20.993496 0.9997705
## Cardio+Weights-C+W         -3.72500000 -17.670747 10.220747 0.9981888
## Control-C+W                -8.70555556 -22.721558  5.310447 0.6071089
## Cw-C+W                      0.55000000 -22.480643 23.580643 1.0000000
## N-C+W                      -8.75000000 -27.554441 10.054441 0.8993773
## Nonee-C+W                 -14.35000000 -37.380643  8.680643 0.6026187
## Resistance-C+W             -8.73947368 -22.718544  5.239596 0.5978212
## Weightsss-C+W              -4.75000000 -27.780643 18.280643 0.9998096
## Weightz-C+W                -9.65000000 -32.680643 13.380643 0.9468759
## Control-Cardio+Weights     -4.98055556 -11.089990  1.128878 0.2169107
## Cw-Cardio+Weights           4.27500000 -14.993818 23.543818 0.9996347
## N-Cardio+Weights           -5.02500000 -18.970747  8.920747 0.9810476
## Nonee-Cardio+Weights      -10.62500000 -29.893818  8.643818 0.7581905
## Resistance-Cardio+Weights  -5.01447368 -11.038695  1.009748 0.1927494
## Weightsss-Cardio+Weights   -1.02500000 -20.293818 18.243818 1.0000000
## Weightz-Cardio+Weights     -5.92500000 -25.193818 13.343818 0.9942966
## Cw-Control                  9.25555556 -10.064171 28.575282 0.8813325
## N-Control                  -0.04444444 -14.060447 13.971558 1.0000000
## Nonee-Control              -5.64444444 -24.964171 13.675282 0.9962106
## Resistance-Control         -0.03391813  -6.219040  6.151203 1.0000000
## Weightsss-Control           3.95555556 -15.364171 23.275282 0.9998218
## Weightz-Control            -0.94444444 -20.264171 18.375282 1.0000000
## N-Cw                       -9.30000000 -32.330643 13.730643 0.9582503
## Nonee-Cw                  -14.90000000 -41.493496 11.693496 0.7402504
## Resistance-Cw              -9.28947368 -28.582423 10.003476 0.8779657
## Weightsss-Cw               -5.30000000 -31.893496 21.293496 0.9998603
## Weightz-Cw                -10.20000000 -36.793496 16.393496 0.9705311
## Nonee-N                    -5.60000000 -28.630643 17.430643 0.9991857
## Resistance-N                0.01052632 -13.968544 13.989596 1.0000000
## Weightsss-N                 4.00000000 -19.030643 27.030643 0.9999605
## Weightz-N                  -0.90000000 -23.930643 22.130643 1.0000000
## Resistance-Nonee            5.61052632 -13.682423 24.903476 0.9963503
## Weightsss-Nonee             9.60000000 -16.993496 36.193496 0.9807936
## Weightz-Nonee               4.70000000 -21.893496 31.293496 0.9999535
## Weightsss-Resistance        3.98947368 -15.303476 23.282423 0.9998051
## Weightz-Resistance         -0.91052632 -20.203476 18.382423 1.0000000
## Weightz-Weightsss          -4.90000000 -31.493496 21.693496 0.9999318
# Interpretation:
# 1. Only Aerobic exercise significantly improved sleep efficiency compared to Control.
# 2. Resistance training did not differ significantly from Control or Aerobic.
# 3. This suggests Aerobic exercise is most effective at improving both sleep duration and efficiency.

Synthesis & Recommendation

Based on both outcomes, I recommend the exercise regimen that showed the greatest improvements in both sleep duration and efficiency. Statistical results (ANOVA F-values, p-values, and post-hoc tests) supported that Aerobic produced significantly better sleep improvements than the control group and others. Overall, this regimen appears most beneficial for improving overall sleep quality and duration.

Reflection

The midterm a helpful opportunity to practice data cleaning, visualization, and inferential analysis. I felt most confident performing the t-tests and plotting results. However, at first, I did struggle with merging and label standardization. In the future, I will take more steps when it comes to cleaning and make sure that all file paths are set properly to avoid import erros.