Lab 1

Logo

Lab 1 - VO2, Age, and Gender

Fredrik Vinge

November 2024

Lab 1. In this lab task, you will address the following research questions by discussing and outlining the detailed steps required for analysis:

  1. How is age associated with cardiorespiratory fitness?

  2. Are there gender differences in cardiorespiratory fitness?

  3. Does sex modify the association between age and cardiorespiratory fitness?

  4. Do education levels modify the association with cardiorespiratory fitness?

Additional Discussion Point: Consider the representativeness of the study sample and discuss how this might influence your results. #Consider using datasets: “DEMO.XPT” and “CVX.XPT” needed packages and syntax for data organization

Prepare by loading the right packages!

Code
library("tidyverse")
library("haven")
library("writexl")
library("readxl")
library("broom")
library("foreign")
library("sjPlot")
library("patchwork")
library("palmerpenguins")
library("ggtext")
library("gganimate")
library("gifski")
library("transformr")

First, sorting the data

Code
DEMO <- read_xpt("DEMO.XPT")
CRF <- read_xpt("CVX.XPT")

#str(DEMO)
#str(CRF)
#head(DEMO)
#head(CRF)
#summary(DEMO)
#summary(CRF)

DEMO_NICE <- DEMO |>
  rename("#" = SEQN,
         "Gender" = RIAGENDR,
         "ExamAgeMonths" = RIDAGEEX,
          "Education_level" = DMDEDUC2,
         "Race" = RIDRETH1) |>
           select("#","Gender","ExamAgeMonths","Education_level","Race")

CRF_NICE<- CRF |>
  rename("#" = SEQN,
         "ml_VO2" = CVDESVO2) |>
  select("#","ml_VO2")

Information about the data “#” = Label:Respondent sequence number

“Gender” = 1=male; 2= female

“ExamAgeMonths” = Exam Age in Months

“Education_levels” = Adults 20+ - 1=Less Than 9th Grade; - 2= 9-11th Grade/Includes 12th grade with no diploma; - 3=High School Grad/GED or Equivalent; - 4=Some College or AA degree; - 5=College Graduate or above; - 7=Refused; - 9=Don’t Know

“Race” = - 1=Mexican American; - 2=Other Hispanic; - 3=Non-Hispanic White; - 4=Non-Hispanic Black; - 5=Other Race - Including Multi-Racial)

Code
head(DEMO_NICE)
# A tibble: 6 × 5
    `#` Gender ExamAgeMonths Education_level  Race
  <dbl>  <dbl>         <dbl>           <dbl> <dbl>
1     1      2            31              NA     4
2     2      1           926               5     3
3     3      2           126              NA     3
4     4      1            23              NA     4
5     5      1           597               5     3
6     6      2           230              NA     5

“ml_VO2” = also called relative VO2, calculated by VO2 divided by body weight.

Code
head(CRF_NICE)
# A tibble: 6 × 2
    `#` ml_VO2
  <dbl>  <dbl>
1     5   40.0
2     6   35.5
3     8   NA  
4    10   NA  
5    11   58.8
6    12   NA  

Second, clean the data

Code
sum(colSums(is.na(DEMO)))
[1] 86316
Code
sum(colSums(is.na(CRF)))
[1] 147039
Code
summary(DEMO_NICE$ExamAgeMonths)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
    0.0   129.0   228.0   345.4   550.0  1019.0     824 
Code
AgeDataBleh <- data.frame(DEMO_NICE$ExamAgeMonths)

The Age-data looks weird?

Code
#The data looks strange as Age data
AgeDataBleh <- data.frame(DEMO_NICE$ExamAgeMonths)

AgeDataBlehPlot <- ggplot(AgeDataBleh, aes(x = DEMO_NICE.ExamAgeMonths)) +
  geom_histogram(binwidth = 10, fill = "skyblue", color = "black", alpha = 0.7) +
  labs(title = "Age Distribution",
       x = "Age (Months)",
       y = "Count") +
  theme_minimal(base_size = 15) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", color = "white"),
    axis.title.x = element_text(face = "italic"),
    axis.title.y = element_text(face = "italic"),
    panel.background = element_rect(fill = "black", color = NA),      # Sets background to black
    plot.background = element_rect(fill = "black", color = NA),       # Sets plot area background to black
    panel.grid.major = element_blank(),   # Removes major grid lines
    panel.grid.minor = element_blank(),   # Removes minor grid lines
    axis.text = element_text(color = "white"),    # Makes axis text white for visibility
    axis.title = element_text(color = "white"),   # Makes axis titles white
  )
AgeDataBlehPlot

Fix the Age data!

Code
summary(DEMO_NICE$ExamAgeMonths)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
    0.0   129.0   228.0   345.4   550.0  1019.0     824 
Code
#Mutate the age in months to age in years, but keep the original variable
DEMO_NICE <- DEMO_NICE %>% 
  mutate(AGE = ExamAgeMonths/12)

summary(DEMO_NICE$AGE)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00   10.75   19.00   28.78   45.83   84.92     824 
Code
DEMO_NICE <- DEMO_NICE %>% 
  filter(!is.na(AGE) & AGE >=20)

summary(DEMO_NICE$AGE)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  20.00   33.50   47.67   49.35   65.00   84.92 

Now, the AGE variable is clean and good. Show it with a plot again

Code
AgeData <- data.frame(DEMO_NICE$AGE)

AgeDataPlot <- ggplot(AgeData, aes(x = DEMO_NICE.AGE)) +
  geom_histogram(binwidth = 5, fill = "skyblue", color = "black", alpha = 0.7) +
  labs(title = "Age Distribution",
       x = "Age (years)",
       y = "Count") +
  theme_minimal(base_size = 15) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", color = "white"),
    axis.title.x = element_text(face = "italic"),
    axis.title.y = element_text(face = "italic"),
    panel.background = element_rect(fill = "black", color = NA),      # Sets background to black
    plot.background = element_rect(fill = "black", color = NA),       # Sets plot area background to black
    panel.grid.major = element_blank(),   # Removes major grid lines
    panel.grid.minor = element_blank(),   # Removes minor grid lines
    axis.text = element_text(color = "white"),    # Makes axis text white for visibility
    axis.title = element_text(color = "white"),   # Makes axis titles white
  )
AgeDataPlot

Code
#Comboplot <- AgeDataBlehPlot / AgeDataPlot + plot_layout(ncol = 2)
#Comboplot
#plot
Code
comparison_with_labels <- AgeDataBlehPlot / AgeDataPlot +
  plot_layout(ncol = 1) +
  plot_annotation(
    title = "Comparison Between Clean and Modified Versions",
    subtitle = "The clean version is highlighted for easy comparison",
    theme = theme(plot.title = element_text(hjust = 0.5))
  ) &
  theme(
    plot.tag.position = "bottom",  # Set position of tags
    plot.tag = element_text(face = "bold")  # Make the tag stand out
  )

# Add tags to identify the versions
plot_clean <- AgeDataPlot + ggtitle("Age in years") + theme(plot.title = element_text(color = "darkgreen", face = "bold", size = 14))
plot_other <- AgeDataBlehPlot + ggtitle(" Age in months") + theme(plot.title = element_text(color = "red", face = "bold", size = 14))

# Print the labeled plot
comparison_with_labels <- plot_clean / plot_other
print(comparison_with_labels)

Now its time to merge the data sets!

Code
Merged_Lab1 <- DEMO_NICE %>% 
  left_join(CRF_NICE, by = "#")

View(Merged_Lab1)

Now, let’s remove the NA values found in the data!

Code
colSums(is.na(Merged_Lab1))
              #          Gender   ExamAgeMonths Education_level            Race 
              0               0               0              17               0 
            AGE          ml_VO2 
              0            3318 
Code
Merged_Lab1 <- Merged_Lab1 %>% 
  filter(!is.na(ml_VO2))

summary(Merged_Lab1)
       #            Gender      ExamAgeMonths   Education_level      Race      
 Min.   :   5   Min.   :1.000   Min.   :240.0   Min.   :1.000   Min.   :1.000  
 1st Qu.:2756   1st Qu.:1.000   1st Qu.:316.0   1st Qu.:2.000   1st Qu.:1.000  
 Median :4915   Median :1.000   Median :409.0   Median :4.000   Median :3.000  
 Mean   :5056   Mean   :1.493   Mean   :406.6   Mean   :3.403   Mean   :2.576  
 3rd Qu.:7376   3rd Qu.:2.000   3rd Qu.:491.0   3rd Qu.:4.000   3rd Qu.:3.000  
 Max.   :9961   Max.   :2.000   Max.   :599.0   Max.   :9.000   Max.   :5.000  
                                                NA's   :11                     
      AGE            ml_VO2      
 Min.   :20.00   Min.   : 17.99  
 1st Qu.:26.33   1st Qu.: 33.23  
 Median :34.08   Median : 38.60  
 Mean   :33.88   Mean   : 40.26  
 3rd Qu.:40.92   3rd Qu.: 45.59  
 Max.   :49.92   Max.   :132.07  
                                 

Plot the VO2 also!

Code
VO2data <- data.frame(Merged_Lab1$ml_VO2)

ggplot(VO2data, aes(x = Merged_Lab1$ml_VO2)) +
  geom_histogram(binwidth = 5, fill = "skyblue", color = "black", alpha = 0.7) +
  scale_x_continuous(limits = c(10, 135), breaks = seq(10, 135, by = 10)) +
  labs(title = "VO2max Distribution",
       x = "VO2 (ml/kg/min)",
       y = "Count") +
  theme_minimal(base_size = 15) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", color = "white"),
    axis.title.x = element_text(face = "italic"),
    axis.title.y = element_text(face = "italic"),
    panel.background = element_rect(fill = "black", color = NA),      # Sets background to black
    plot.background = element_rect(fill = "black", color = NA),       # Sets plot area background to black
    panel.grid.major = element_blank(),   # Removes major grid lines
    panel.grid.minor = element_blank(),   # Removes minor grid lines
    axis.text = element_text(color = "white"),    # Makes axis text white for visibility
    axis.title = element_text(color = "white"),   # Makes axis titles white
  )

Lets do some statistics!

Code
corr_result <- cor.test(Merged_Lab1$ml_VO2, Merged_Lab1$AGE, method = "pearson")
print(corr_result)

    Pearson's product-moment correlation

data:  Merged_Lab1$ml_VO2 and Merged_Lab1$AGE
t = -3.0336, df = 1009, p-value = 0.002479
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.15581028 -0.03361104
sample estimates:
        cor 
-0.09506879 
Code
corr_value <- round(corr_result$estimate, 3)

Now Plotting the Correlation

Code
ggplot(Merged_Lab1, aes(x = AGE, y = ml_VO2)) +
  geom_point(color = "darkslategray", size = 1.5) +  # Scatter points
  geom_smooth(method = "lm", color = "skyblue", linewidth = 2 ,se = FALSE) +  # Add a regression line
  labs(
    title = "Correlation Between VO2 and Age",
    subtitle = paste("Pearson Correlation: r =", round(corr_result$estimate, 2)),
    x = "Age (years)",
    y = "VO2 (mL/kg/min)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 12, hjust = 0.5),
    plot.subtitle = element_text(size = 12, hjust = 0.5),
    panel.background = element_rect(fill = "black", color = NA),      # Sets background to black
    plot.background = element_rect(fill = "black", color = NA),       # Sets plot area background to black
    panel.grid.major = element_blank(),   # Removes major grid lines
    panel.grid.minor = element_blank(),   # Removes minor grid lines
    axis.text = element_text(color = "white"),    # Makes axis text white for visibility
    axis.title = element_text(color = "white"),   # Makes axis titles white
  ) +
  annotate(
    "text",
    x = max(Merged_Lab1$AGE) - 8,  # Adjust the x-coordinate for better placement
    y = max(Merged_Lab1$ml_VO2) - 2,  # Adjust the y-coordinate for better placement
    label = paste("Pearson r =", corr_value),
    size = 6,
    color = "white"
  )

Fit a model between AGE and VO2

Code
model1 <- lm(ml_VO2 ~ AGE, data = Merged_Lab1)
tidy_model <- tidy(model1, conf.int = TRUE)
tidy_model
# A tibble: 2 × 7
  term        estimate std.error statistic   p.value conf.low conf.high
  <chr>          <dbl>     <dbl>     <dbl>     <dbl>    <dbl>     <dbl>
1 (Intercept)   44.2      1.34       32.9  6.78e-162   41.6     46.8   
2 AGE           -0.117    0.0384     -3.03 2.48e-  3   -0.192   -0.0412

Q1 question - How is age associated with cardiorespiratory fitness?

Q1 answer:

  • Age is negatively associated with VO2 with estimate of -0.117, (p>0.5) conf (-0.1920 - -0.0411)
Code
Predict_age <- tidy_model$estimate[1] - tidy_model$estimate[2] * Merged_Lab1$AGE
summary(lm(ml_VO2 ~ AGE, data = Merged_Lab1))$r.squared
[1] 0.009038075
Code
model1.2 <- lm(ml_VO2 ~ AGE, data = Merged_Lab1)

new_data <- data.frame(AGE = c(50:80))

predictions <- predict(model1.2, newdata = new_data, interval = "confidence")
# Add predictions to new_data dataframe
new_data$fit <- predictions[, "fit"]
new_data$lwr <- predictions[, "lwr"]
new_data$upr <- predictions[, "upr"]
Code
ggplot(Merged_Lab1, aes(x = AGE, y = ml_VO2)) +
  geom_point(color = "darkslategray", size = 1.5) +  # Scatter points
  geom_smooth(method = "lm", color = "skyblue", linewidth = 2 ,se = FALSE) + 
  geom_line(data = new_data, mapping = aes(x = AGE, y = fit), color = "red", linewidth = 1) +# Add a regression line
  labs(
    title = "Correlation Between VO2 and Age",
    subtitle = paste("Pearson Correlation: r =", round(corr_result$estimate, 2)),
    x = "Age (years)",
    y = "VO2 (mL/kg/min)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 12, hjust = 0.5),
    plot.subtitle = element_text(size = 12, hjust = 0.5),
    panel.background = element_rect(fill = "black", color = NA),      # Sets background to black
    plot.background = element_rect(fill = "black", color = NA),       # Sets plot area background to black
    panel.grid.major = element_blank(),   # Removes major grid lines
    panel.grid.minor = element_blank(),   # Removes minor grid lines
    axis.text = element_text(color = "white"),    # Makes axis text white for visibility
    axis.title = element_text(color = "white"),   # Makes axis titles white
  ) +
  annotate(
    "text",
    x = max(Merged_Lab1$AGE) - 8,  # Adjust the x-coordinate for better placement
    y = max(Merged_Lab1$ml_VO2) - 2,  # Adjust the y-coordinate for better placement
    label = paste("Pearson r =", corr_value),
    size = 6,
    color = "white"
  ) +
  annotate(
    "text",
    x = max(Merged_Lab1$AGE) - -12,  # Adjust the x-coordinate for better placement
    y = max(Merged_Lab1$ml_VO2) - 30,  # Adjust the y-coordinate for better placement
    label = paste("Per year increase in Age:", round(tidy_model$estimate[2], 3), "lower VO2 (ml/kg/min)"),
    size = 4,
    color = "maroon"
  )

Question 2: Are there gender differences in cardiorespiratory fitness?

Now to answer question 2.

  • Are there gender differences in cardiorespiratory fitness?
Code
Merged_Lab1$Gender <- as.factor(Merged_Lab1$Gender)
model4 <- lm(ml_VO2 ~ AGE + Gender, data = Merged_Lab1)
tidy_model2 <- tidy(model4, conf.int = TRUE)
tidy_model2
# A tibble: 3 × 7
  term        estimate std.error statistic   p.value conf.low conf.high
  <chr>          <dbl>     <dbl>     <dbl>     <dbl>    <dbl>     <dbl>
1 (Intercept)  47.5       1.27       37.4  7.85e-193   45.0     50.0   
2 AGE          -0.0993    0.0356     -2.79 5.42e-  3   -0.169   -0.0294
3 Gender2      -7.95      0.612     -13.0  9.23e- 36   -9.15    -6.75  

Q2 answer:

This shows that adjusting for AGE, the average VO2 (ml/kg/min) was lower for females by almost 8 (7.95).

Code
Twoplots <- ggplot(data = Merged_Lab1, aes(x = AGE, y = ml_VO2, color = Gender)) +
  geom_point(size = 1, alpha = 0.7) +  # Scatter plot with points colored by gender
  geom_smooth(method = "lm", se = TRUE, aes(fill = Gender), alpha = 0.2) +  # Regression line with confidence interval, split by gender
  labs(
    title = "VO2 vs Age",
    subtitle = "Gender as a Factor",
    x = "Age (years)",
    y = "VO2 (mL/kg/min)",
    color = "Gender",
    fill = "Gender"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14, hjust = 0.5, color = "white"),  # Title in white
    plot.subtitle = element_text(size = 10, hjust = 0.5, color = "white"),  # Subtitle in white
    axis.title = element_text(color = "white"),   # Axis titles in white
    axis.text = element_text(color = "white"),    # Axis text in white
    legend.text = element_text(color = "white"),  # Legend text in white
    legend.title = element_text(color = "white"), # Legend title in white
    panel.background = element_rect(fill = "black", color = NA),  # Sets panel background to black
    plot.background = element_rect(fill = "black", color = NA),   # Sets plot area background to black
    panel.grid.major = element_blank(),  # Removes major grid lines
    panel.grid.minor = element_blank()   # Removes minor grid lines
  ) + 
  facet_wrap(~ Gender)

Oneplot <- ggplot(data = Merged_Lab1, aes(x = AGE, y = ml_VO2, color = Gender)) +
  geom_point(size = 1, alpha = 0.3) +  # Scatter plot with points colored by gender
  geom_smooth(method = "lm", se = TRUE, aes(fill = Gender), alpha = 0.2) +  # Regression line with confidence interval, split by gender
  labs(
    title = "VO2 vs Age",
    x = "Age (years)",
    y = "VO2 (mL/kg/min)",
    color = "Gender",
    fill = "Gender"
  ) +
  scale_color_discrete(labels = c("Male", "Female")) +  # Set colors and labels for points
  scale_fill_discrete(labels = c("Male", "Female")) +   # Set colors and labels for the smooth lines
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 12, hjust = 0.5, color = "white"),  # Title in white
    plot.subtitle = element_text(size = 14, hjust = 0.5, color = "white"),  # Subtitle in white
    axis.title = element_text(color = "white"),   # Axis titles in white
    axis.text = element_text(color = "white"),    # Axis text in white
    legend.text = element_text(color = "white"),  # Legend text in white
    legend.title = element_text(color = "white"), # Legend title in white
    panel.background = element_rect(fill = "black", color = NA),  # Sets panel background to black
    plot.background = element_rect(fill = "black", color = NA),   # Sets plot area background to black
    panel.grid.major = element_blank(),  # Removes major grid lines
    panel.grid.minor = element_blank()   # Removes minor grid lines
  ) + 
  annotate(
    "text",
    x = max(Merged_Lab1$AGE) - 10,  # Adjust the x-coordinate for better placement
    y = max(Merged_Lab1$ml_VO2) - 30,  # Adjust the y-coordinate for better placement
    label = paste("Female had on average", round(tidy_model2$estimate[3], 2), "lower VO2(ml/kg/min)"),
    size = 3,
    color = "white"
  )
Code
Oneplot

Code
Twoplots

Question 3: Does sex modify the association between age and cardiorespiratory fitness?

Question 4: Do education levels modify the association with cardiorespiratory fitness?