Lab 1. In this lab task, you will address the following research questions by discussing and outlining the detailed steps required for analysis:
How is age associated with cardiorespiratory fitness?
Are there gender differences in cardiorespiratory fitness?
Does sex modify the association between age and cardiorespiratory fitness?
Do education levels modify the association with cardiorespiratory fitness?
Additional Discussion Point: Consider the representativeness of the study sample and discuss how this might influence your results. Consider using the datasets “DEMO.XPT” and “CVX.XPT”, together with the packages and syntax needed for data organization.
Information about the data: “#” = Label: Respondent sequence number
“Gender” = 1=male; 2= female
“ExamAgeMonths” = Exam Age in Months
“Education_levels” = Adults 20+ - 1=Less Than 9th Grade; - 2= 9-11th Grade/Includes 12th grade with no diploma; - 3=High School Grad/GED or Equivalent; - 4=Some College or AA degree; - 5=College Graduate or above; - 7=Refused; - 9=Don’t Know
“ml_VO2” = also called relative VO2, calculated by VO2 divided by body weight.
Code
# Preview the first six rows of the fitness table (respondent id and ml_VO2).
head(CRF_NICE, n = 6L)
# A tibble: 6 × 2
`#` ml_VO2
<dbl> <dbl>
1 5 40.0
2 6 35.5
3 8 NA
4 10 NA
5 11 58.8
6 12 NA
Second, clean the data
Code
# Total number of missing cells in DEMO: per-column NA counts, then summed.
sum(
  colSums(is.na(DEMO))
)
[1] 86316
Code
# Total number of missing cells in CRF: per-column NA counts, then summed.
sum(
  colSums(is.na(CRF))
)
[1] 147039
Code
# Five-number summary (plus mean and NA count) of exam age in months.
summary(DEMO_NICE[["ExamAgeMonths"]])
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.0 129.0 228.0 345.4 550.0 1019.0 824
Code
# One-column data frame of the raw age values. The column name is spelled out
# explicitly; it is the same name data.frame() would derive automatically.
AgeDataBleh <- data.frame(DEMO_NICE.ExamAgeMonths = DEMO_NICE$ExamAgeMonths)
The age data look strange: age is recorded in months rather than years.
Code
# The age variable is recorded in months, which makes the distribution hard
# to read. Plot it as-is so it can be compared with the cleaned version.
AgeDataBleh <- data.frame(DEMO_NICE.ExamAgeMonths = DEMO_NICE$ExamAgeMonths)

# Histogram of age in months on a black background with white text.
AgeDataBlehPlot <- ggplot(AgeDataBleh, aes(x = DEMO_NICE.ExamAgeMonths)) +
  geom_histogram(
    binwidth = 10,
    fill = "skyblue",
    color = "black",
    alpha = 0.7
  ) +
  labs(
    title = "Age Distribution",
    x = "Age (Months)",
    y = "Count"
  ) +
  theme_minimal(base_size = 15) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", color = "white"),
    axis.title.x = element_text(face = "italic"),
    axis.title.y = element_text(face = "italic"),
    # Black panel and plot backgrounds
    panel.background = element_rect(fill = "black", color = NA),
    plot.background = element_rect(fill = "black", color = NA),
    # No grid lines
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    # White axis text and titles so they stay visible on black.
    # No trailing comma after the last element: an empty argument makes
    # theme() fail on ggplot2 versions that do not ignore empty dots.
    axis.text = element_text(color = "white"),
    axis.title = element_text(color = "white")
  )

AgeDataBlehPlot
Fix the Age data!
Code
# Re-check the distribution of age in months before converting to years.
summary(DEMO_NICE[["ExamAgeMonths"]])
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.0 129.0 228.0 345.4 550.0 1019.0 824
Code
# Derive age in years from exam age in months; the original ExamAgeMonths
# column is kept alongside the new AGE column.
DEMO_NICE <- mutate(DEMO_NICE, AGE = ExamAgeMonths / 12)
summary(DEMO_NICE[["AGE"]])
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 10.75 19.00 28.78 45.83 84.92 824
Code
# Keep only respondents with a known age of at least 20 years.
DEMO_NICE <- filter(DEMO_NICE, !is.na(AGE), AGE >= 20)
summary(DEMO_NICE[["AGE"]])
Min. 1st Qu. Median Mean 3rd Qu. Max.
20.00 33.50 47.67 49.35 65.00 84.92
Now the AGE variable is clean. Let's plot it again.
Code
# One-column data frame of the cleaned age (years) for plotting.
AgeData <- data.frame(DEMO_NICE.AGE = DEMO_NICE$AGE)

# Histogram of age in years, styled like the age-in-months plot
# (black background, white text) so the two are directly comparable.
AgeDataPlot <- ggplot(AgeData, aes(x = DEMO_NICE.AGE)) +
  geom_histogram(
    binwidth = 5,
    fill = "skyblue",
    color = "black",
    alpha = 0.7
  ) +
  labs(
    title = "Age Distribution",
    x = "Age (years)",
    y = "Count"
  ) +
  theme_minimal(base_size = 15) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", color = "white"),
    axis.title.x = element_text(face = "italic"),
    axis.title.y = element_text(face = "italic"),
    # Black panel and plot backgrounds
    panel.background = element_rect(fill = "black", color = NA),
    plot.background = element_rect(fill = "black", color = NA),
    # No grid lines
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    # White axis text and titles so they stay visible on black.
    # No trailing comma after the last element: an empty argument makes
    # theme() fail on ggplot2 versions that do not ignore empty dots.
    axis.text = element_text(color = "white"),
    axis.title = element_text(color = "white")
  )

AgeDataPlot
# Retitle each histogram so the two versions can be told apart when stacked.
plot_clean <- AgeDataPlot +
  ggtitle("Age in years") +
  theme(
    plot.title = element_text(color = "darkgreen", face = "bold", size = 14)
  )

plot_other <- AgeDataBlehPlot +
  ggtitle(" Age in months") +
  theme(
    plot.title = element_text(color = "red", face = "bold", size = 14)
  )

# Stack the cleaned plot above the original one and print the result.
# An earlier annotated composition assigned to the same name was overwritten
# before printing and never rendered, so that dead assignment is dropped;
# the printed figure is unchanged.
comparison_with_labels <- plot_clean / plot_other
print(comparison_with_labels)
Now it's time to merge the data sets!
Code
# Attach the fitness measurements to the cleaned demographics, matching on
# the respondent sequence number ("#"). A left join keeps every DEMO_NICE row;
# respondents without a matching CRF_NICE row get NA for the joined columns.
Merged_Lab1 <- DEMO_NICE %>%
  left_join(CRF_NICE, by = "#")

# View() opens an interactive data viewer, so only call it in an interactive
# session; it has no place in a rendered (non-interactive) document.
if (interactive()) {
  View(Merged_Lab1)
}
Now, let’s remove the NA values found in the data!
# Gender ExamAgeMonths Education_level Race
Min. : 5 Min. :1.000 Min. :240.0 Min. :1.000 Min. :1.000
1st Qu.:2756 1st Qu.:1.000 1st Qu.:316.0 1st Qu.:2.000 1st Qu.:1.000
Median :4915 Median :1.000 Median :409.0 Median :4.000 Median :3.000
Mean :5056 Mean :1.493 Mean :406.6 Mean :3.403 Mean :2.576
3rd Qu.:7376 3rd Qu.:2.000 3rd Qu.:491.0 3rd Qu.:4.000 3rd Qu.:3.000
Max. :9961 Max. :2.000 Max. :599.0 Max. :9.000 Max. :5.000
NA's :11
AGE ml_VO2
Min. :20.00 Min. : 17.99
1st Qu.:26.33 1st Qu.: 33.23
Median :34.08 Median : 38.60
Mean :33.88 Mean : 40.26
3rd Qu.:40.92 3rd Qu.: 45.59
Max. :49.92 Max. :132.07
Plot the VO2 also!
Code
# One-column data frame of relative VO2 for plotting. The column name is the
# one data.frame() would derive automatically, spelled out explicitly.
VO2data <- data.frame(Merged_Lab1.ml_VO2 = Merged_Lab1$ml_VO2)

# Histogram of relative VO2. The aesthetic refers to the column of the data
# frame passed to ggplot() instead of reaching into Merged_Lab1 with `$`,
# which bypasses the `data` argument and is discouraged inside aes().
ggplot(VO2data, aes(x = Merged_Lab1.ml_VO2)) +
  geom_histogram(
    binwidth = 5,
    fill = "skyblue",
    color = "black",
    alpha = 0.7
  ) +
  scale_x_continuous(limits = c(10, 135), breaks = seq(10, 135, by = 10)) +
  labs(
    title = "VO2max Distribution",
    x = "VO2 (ml/kg/min)",
    y = "Count"
  ) +
  theme_minimal(base_size = 15) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", color = "white"),
    axis.title.x = element_text(face = "italic"),
    axis.title.y = element_text(face = "italic"),
    # Black panel and plot backgrounds
    panel.background = element_rect(fill = "black", color = NA),
    plot.background = element_rect(fill = "black", color = NA),
    # No grid lines
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    # White axis text and titles so they stay visible on black.
    # No trailing comma after the last element: an empty argument makes
    # theme() fail on ggplot2 versions that do not ignore empty dots.
    axis.text = element_text(color = "white"),
    axis.title = element_text(color = "white")
  )
This shows that, after adjusting for AGE, the average VO2 (ml/kg/min) was lower for females by almost 8 (7.95).
Code
# VO2 against age with a per-gender linear fit, drawn two ways.
#
# Gender is coded 1 = male, 2 = female. It is mapped as factor(Gender) so that
# colour and fill are discrete and geom_smooth() fits one line per gender.
# A numeric Gender would be treated as continuous, which is incompatible with
# the discrete scales used in `Oneplot`. If Gender is already a factor,
# factor() leaves the mapping unchanged.

# Faceted version: one panel per gender.
Twoplots <- ggplot(
  data = Merged_Lab1,
  aes(x = AGE, y = ml_VO2, color = factor(Gender))
) +
  # Scatter plot with points colored by gender
  geom_point(size = 1, alpha = 0.7) +
  # Regression line with confidence interval, split by gender
  geom_smooth(
    method = "lm",
    se = TRUE,
    aes(fill = factor(Gender)),
    alpha = 0.2
  ) +
  labs(
    title = "VO2 vs Age",
    subtitle = "Gender as a Factor",
    x = "Age (years)",
    y = "VO2 (mL/kg/min)",
    color = "Gender",
    fill = "Gender"
  ) +
  theme_minimal() +
  theme(
    # White text throughout so it stays visible on the black background
    plot.title = element_text(
      face = "bold", size = 14, hjust = 0.5, color = "white"
    ),
    plot.subtitle = element_text(size = 10, hjust = 0.5, color = "white"),
    axis.title = element_text(color = "white"),
    axis.text = element_text(color = "white"),
    legend.text = element_text(color = "white"),
    legend.title = element_text(color = "white"),
    # Black panel and plot backgrounds, no grid lines
    panel.background = element_rect(fill = "black", color = NA),
    plot.background = element_rect(fill = "black", color = NA),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  facet_wrap(~ Gender)

# Single-panel version: both genders overlaid, with the model estimate
# annotated on the plot.
Oneplot <- ggplot(
  data = Merged_Lab1,
  aes(x = AGE, y = ml_VO2, color = factor(Gender))
) +
  # Scatter plot with points colored by gender
  geom_point(size = 1, alpha = 0.3) +
  # Regression line with confidence interval, split by gender
  geom_smooth(
    method = "lm",
    se = TRUE,
    aes(fill = factor(Gender)),
    alpha = 0.2
  ) +
  labs(
    title = "VO2 vs Age",
    x = "Age (years)",
    y = "VO2 (mL/kg/min)",
    color = "Gender",
    fill = "Gender"
  ) +
  # Legend labels follow the level order of the coding (1 = male, 2 = female)
  scale_color_discrete(labels = c("Male", "Female")) +
  scale_fill_discrete(labels = c("Male", "Female")) +
  theme_minimal() +
  theme(
    # White text throughout so it stays visible on the black background
    plot.title = element_text(
      face = "bold", size = 12, hjust = 0.5, color = "white"
    ),
    plot.subtitle = element_text(size = 14, hjust = 0.5, color = "white"),
    axis.title = element_text(color = "white"),
    axis.text = element_text(color = "white"),
    legend.text = element_text(color = "white"),
    legend.title = element_text(color = "white"),
    # Black panel and plot backgrounds, no grid lines
    panel.background = element_rect(fill = "black", color = NA),
    plot.background = element_rect(fill = "black", color = NA),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  annotate(
    "text",
    # Offsets pull the label in from the top-right corner; na.rm = TRUE keeps
    # the position defined even if missing values remain in the data.
    x = max(Merged_Lab1$AGE, na.rm = TRUE) - 10,
    y = max(Merged_Lab1$ml_VO2, na.rm = TRUE) - 30,
    # NOTE(review): tidy_model2 is defined outside this chunk; the third
    # estimate is presumably the gender coefficient - confirm against the model.
    label = paste(
      "Female had on average",
      round(tidy_model2$estimate[3], 2),
      "lower VO2(ml/kg/min)"
    ),
    size = 3,
    color = "white"
  )
Code
# Render the overlaid (single-panel) VO2-vs-age plot.
print(Oneplot)
Code
# Render the faceted (one panel per gender) VO2-vs-age plot.
print(Twoplots)
Question 3: Does sex modify the association between age and cardiorespiratory fitness?
Question 4: Do education levels modify the association with cardiorespiratory fitness?