# Bring the NHANES survey data into the workspace
data(NHANES)

# Build the analysis sample: adults aged 18-80, restricted to the
# variables used in this lab, keeping only rows with no missing values
nhanes_adult <- NHANES %>%
  filter(Age >= 18 & Age <= 80) %>%
  select(
    Age, Weight, Height, BMI,
    BPSysAve, BPDiaAve, Pulse,
    PhysActive, SleepHrsNight, TotChol
  ) %>%
  na.omit()
# Report how many adults remain in the analysis sample
kable(
  data.frame(
    Metric = "Sample Size",
    Value = paste(nrow(nhanes_adult), "adults")
  )
)
| Metric | Value |
|---|---|
| Sample Size | 6778 adults |
# Preview the first 8 rows of the analysis data, rounded to one decimal
kable(
  head(nhanes_adult, n = 8),
  digits = 1,
  caption = "NHANES Adult Data Sample"
)
| Age | Weight | Height | BMI | BPSysAve | BPDiaAve | Pulse | PhysActive | SleepHrsNight | TotChol |
|---|---|---|---|---|---|---|---|---|---|
| 34 | 87.4 | 164.7 | 32.2 | 113 | 85 | 70 | No | 4 | 3.5 |
| 34 | 87.4 | 164.7 | 32.2 | 113 | 85 | 70 | No | 4 | 3.5 |
| 34 | 87.4 | 164.7 | 32.2 | 113 | 85 | 70 | No | 4 | 3.5 |
| 49 | 86.7 | 168.4 | 30.6 | 112 | 75 | 86 | No | 8 | 6.7 |
| 45 | 75.7 | 166.7 | 27.2 | 118 | 64 | 62 | Yes | 8 | 5.8 |
| 45 | 75.7 | 166.7 | 27.2 | 118 | 64 | 62 | Yes | 8 | 5.8 |
| 45 | 75.7 | 166.7 | 27.2 | 118 | 64 | 62 | Yes | 8 | 5.8 |
| 66 | 68.0 | 169.5 | 23.7 | 111 | 63 | 60 | Yes | 7 | 5.0 |
Now it’s your turn to practice! Use the same NHANES dataset and follow the examples above.
Total Points: 25 points
Research Question: Is there a correlation between weight and height among US adults?
Your tasks:
cor.test() and display with tidy() (3 points)
# YOUR CODE HERE
# a. Scatterplot of weight against height with a fitted linear trend
#    and its confidence band
ggplot(data = nhanes_adult, mapping = aes(x = Height, y = Weight)) +
  geom_point(color = "steelblue", alpha = 0.3) +
  geom_smooth(method = "lm", color = "red", se = TRUE) +
  labs(
    x = "Height (cm)",
    y = "Weight (kg)",
    title = "Height vs Weight",
    subtitle = "NHANES Data, Adults 18-80 years"
  ) +
  theme_minimal()
# b. Correlation test with tidy() display
# Pearson correlation between height and weight
cor_ht_wt <- cor.test(nhanes_adult$Height, nhanes_adult$Weight)

# Results in a clean table.
# The p-value is converted to text before display: rounding a very small
# p-value to 3 digits would print it as 0, which is never a valid p-value,
# so anything below 0.001 is reported as "<0.001" instead.
tidy(cor_ht_wt) %>%
  select(estimate, statistic, p.value, conf.low, conf.high) %>%
  mutate(
    p.value = ifelse(p.value < 0.001, "<0.001", sprintf("%.3f", p.value))
  ) %>%
  kable(
    digits = 3,
    col.names = c("r", "t-statistic", "p-value", "95% CI Lower", "95% CI Upper"),
    caption = "Pearson Correlation: Height and Weight"
  )
| r | t-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|
| 0.453 | 41.836 | 0 | 0.434 | 0.472 |
# c. Statistical significance
##**r = 0.453 moderate positive correlation
##**p < 0.001 statistically significant
##**95% CI [0.434, 0.472] excludes zero, consistent with a significant correlation
# d. r² and interpretation (write as comment)
# cor.test() returns the estimate as a named vector (name "cor"); drop the
# name so it cannot leak into the summary table as a row name.
r_squared <- unname(cor_ht_wt$estimate)^2

# Summary table of correlation strength. The Value column mixes plain
# numbers with a percentage, so every entry is formatted as text explicitly
# (fixed number of decimals) instead of relying on implicit coercion.
data.frame(
  Measure = c(
    "Correlation (r)",
    "Coefficient of Determination (r²)",
    "Variance Explained"
  ),
  Value = c(
    sprintf("%.3f", unname(cor_ht_wt$estimate)),
    sprintf("%.3f", r_squared),
    sprintf("%.1f%%", r_squared * 100)
  )
) %>%
  kable(caption = "Summary of Correlation Strength")
| Measure | Value |
|---|---|
| Correlation (r) | 0.453 |
| Coefficient of Determination (r²) | 0.205 |
| Variance Explained | 20.5% |
Interpretation: There is a statistically significant, moderate positive correlation between height and weight (r = 0.453). As height increases, weight tends to increase as well. Height explains only 20.5% of the variation in weight, indicating that other factors play a role. —
Research Question: What are the relationships among BMI, weight, and height?
Your tasks:
# YOUR CODE HERE
# a. Correlation matrix
# Keep only the three variables of interest
cardio_vars <- select(nhanes_adult, BMI, Weight, Height)

# Pairwise Pearson correlations, computed on complete rows only
cor_matrix <- cor(cardio_vars, use = "complete.obs")

# Show the matrix rounded to three decimals
kable(
  cor_matrix,
  digits = 3,
  caption = "BMI, Height, & Weight Correlation Matrix"
)
| | BMI | Weight | Height |
|---|---|---|---|
| BMI | 1.000 | 0.881 | -0.009 |
| Weight | 0.881 | 1.000 | 0.453 |
| Height | -0.009 | 0.453 | 1.000 |
# b. Visualize with corrplot
# Lower-triangle circle plot with the coefficients printed on top and a
# blue-white-red diverging palette
corrplot(
  cor_matrix,
  type = "lower",
  method = "circle",
  col = colorRampPalette(c("#3498db", "white", "#e74c3c"))(200),
  addCoef.col = "black",
  number.cex = 0.7,
  tl.col = "black",
  tl.srt = 45,
  title = "BMI, Height, & Weight Correlations",
  mar = c(0, 0, 2, 0)
)
# c. Strongest correlation:
# Pull the two notable coefficients out of the matrix and label their strength
data.frame(
  Relationship = c("Weight & BMI", "Height & Weight"),
  Correlation = round(
    c(cor_matrix["Weight", "BMI"], cor_matrix["Height", "Weight"]),
    3
  ),
  Strength = c("Strong", "Moderate")
) %>%
  kable(caption = "Notable Correlations Summary")
| Relationship | Correlation | Strength |
|---|---|---|
| Weight & BMI | 0.881 | Strong |
| Height & Weight | 0.453 | Moderate |
# d. Explanation (write as comment)
# Weight and BMI show the strongest correlation (r = 0.881), which reflects the biological connection between them. Height and weight are not as strongly correlated but still show a moderate positive trend (r = 0.453), indicating that factors other than height alone influence weight.
Research Question: Is there a relationship between hours of sleep and age?
Your tasks:
tidy() (2 points)
# YOUR CODE HERE
# a. Scatterplot of nightly sleep hours against age with a fitted linear
#    trend and its confidence band
ggplot(data = nhanes_adult, mapping = aes(x = Age, y = SleepHrsNight)) +
  geom_point(color = "darkgreen", alpha = 0.3) +
  geom_smooth(method = "lm", color = "red", fill = "pink", se = TRUE) +
  labs(
    x = "Age (years)",
    y = "Hours of Sleep (hrs)",
    title = "Age vs Hours of Sleep"
  ) +
  theme_minimal()
# b. Correlation with tidy()
# Pearson correlation between age and nightly hours of sleep
cor_age_sleep <- cor.test(nhanes_adult$Age, nhanes_adult$SleepHrsNight)

# Display results.
# The p-value is converted to text before display so that a very small
# value is reported as "<0.001" rather than being rounded to 0; larger
# values keep three decimals.
tidy(cor_age_sleep) %>%
  select(estimate, statistic, p.value, conf.low, conf.high) %>%
  mutate(
    p.value = ifelse(p.value < 0.001, "<0.001", sprintf("%.3f", p.value))
  ) %>%
  kable(
    digits = 3,
    col.names = c("r", "t-statistic", "p-value", "95% CI Lower", "95% CI Upper"),
    caption = "Pearson Correlation: Age and Hours of Sleep"
  )
| r | t-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|
| 0.024 | 1.989 | 0.047 | 0 | 0.048 |
# Effect size for the age / sleep correlation.
# cor.test() returns the estimate as a named vector (name "cor"); drop the
# name so it is not picked up as a row name in the table below.
r_squared_age <- unname(cor_age_sleep$estimate)^2

# Format both entries as fixed-point text: mixing a number with a
# percentage string coerces the column to character, and a tiny r² would
# otherwise be printed in scientific notation (e.g. 6e-04).
data.frame(
  Measure = c("r²", "Variance Explained"),
  Value = c(
    sprintf("%.4f", r_squared_age),
    sprintf("%.2f%%", r_squared_age * 100)
  )
) %>%
  kable(caption = "Effect Size")
| | Measure | Value |
|---|---|---|
| cor | r² | 6e-04 |
| | Variance Explained | 0.06% |
# c. Interpretation (write as comment)
# The correlation between age and hours of sleep was r = 0.024, indicating a very weak positive correlation. The association is statistically significant at the 0.05 level, but only marginally (p = 0.047); the lower bound of the 95% CI [0.000, 0.048] rounds to zero. Age explains only 0.06% of the variation in hours of sleep, so the relationship is practically negligible and other factors play a far more important role in how many hours people sleep.
Challenge: Investigate the relationship between two variables of your choice from the NHANES dataset. Include: - Scatterplot - Correlation test with clean display - Assumption checks - Thoughtful interpretation
# YOUR CODE HERE
# Scatterplot investigating the relationship between age and total
# cholesterol, with a fitted linear trend and its confidence band
ggplot(nhanes_adult, aes(x = Age, y = TotChol)) +
  geom_point(alpha = 0.3, color = "darkgreen") +
  geom_smooth(method = "lm", se = TRUE, color = "red", fill = "pink") +
  labs(
    title = "Age vs Total Cholesterol",
    x = "Age (years)",
    # Units added so this axis is labelled like the other plots in the lab;
    # the NHANES package documents TotChol in mmol/L
    y = "Total Cholesterol (mmol/L)"
  ) +
  theme_minimal()
# Correlation with tidy()
# Pearson correlation between age and total cholesterol
cor_age_chol <- cor.test(nhanes_adult$Age, nhanes_adult$TotChol)

# Display results.
# The p-value is converted to text before display: rounding a very small
# p-value to 3 digits would print it as 0, which is never a valid p-value,
# so anything below 0.001 is reported as "<0.001" instead.
tidy(cor_age_chol) %>%
  select(estimate, statistic, p.value, conf.low, conf.high) %>%
  mutate(
    p.value = ifelse(p.value < 0.001, "<0.001", sprintf("%.3f", p.value))
  ) %>%
  kable(
    digits = 3,
    col.names = c("r", "t-statistic", "p-value", "95% CI Lower", "95% CI Upper"),
    caption = "Pearson Correlation: Age and Total Cholesterol"
  )
| r | t-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|
| 0.16 | 13.329 | 0 | 0.137 | 0.183 |
# Effect size for the age / total cholesterol correlation.
# NOTE: this reuses the name r_squared_age from the age / sleep analysis;
# from here on it holds r² for age vs total cholesterol.
# cor.test() returns the estimate as a named vector (name "cor"); drop the
# name so it is not picked up as a row name in the table below.
r_squared_age <- unname(cor_age_chol$estimate)^2

# Format both entries as fixed-point text: mixing a number with a
# percentage string coerces the column to character, so the decimals are
# fixed explicitly rather than left to implicit conversion.
data.frame(
  Measure = c("r²", "Variance Explained"),
  Value = c(
    sprintf("%.4f", r_squared_age),
    sprintf("%.2f%%", r_squared_age * 100)
  )
) %>%
  kable(caption = "Effect Size")
| | Measure | Value |
|---|---|---|
| cor | r² | 0.0255 |
| | Variance Explained | 2.55% |
# Check assumptions
##**Assumption 1: Linearity** (already checked with scatterplot ✓)
##**Assumption 2: Bivariate Normality**
# Q-Q plots for normality
# par() returns the previous settings when it changes them; keep them so
# the layout can be restored exactly afterwards instead of assuming it was
# a single panel.
old_par <- par(mfrow = c(1, 2))

# Left panel: age
qqnorm(nhanes_adult$Age, main = "Q-Q Plot: Age")
qqline(nhanes_adult$Age, col = "red")

# Right panel: total cholesterol
qqnorm(nhanes_adult$TotChol, main = "Q-Q Plot: Total Cholesterol")
qqline(nhanes_adult$TotChol, col = "red")

# Restore the graphics settings that were in effect before this chunk
par(old_par)
Assessment: Both age and total cholesterol are approximately normally distributed. Some deviation in the tails, but with large sample size, the correlation test is robust to minor violations.
Assumption 3: No Extreme Outliers (scatterplot shows no extreme outliers ✓)
Interpretation: The correlation between age and total cholesterol is r = 0.16, indicating a weak positive correlation. The p-value (< 0.0001) shows statistical significance. The 95% CI, [0.137, 0.183], further confirms this, as it does not include zero. Age explains only 2.55% of the variation, showing that other variables play a more substantial role in total cholesterol levels. ```
Save your work with your name:
Correlation_Lab_YourName.Rmd
Knit to HTML to create your report
Publish to RPubs:
Submit to Brightspace:
Due: End of class today
Grading: This lab is worth 15% of your in-class lab grade. The lowest 2 lab grades are dropped.
cor.test() - Calculate correlation and test significance
tidy() - Clean display of statistical test results
cor() - Calculate correlation matrix
corrplot() - Visualize correlation matrix
ggplot() + geom_point() - Scatterplots
geom_smooth(method="lm") - Add fitted regression line
qqnorm() / qqline() - Check normality
?cor.test in console
Remember:
✓ Correlation measures LINEAR relationships only
✓ Always visualize your data first
✓ Correlation ≠ Causation
✓ Check your assumptions
✓ Consider confounding and alternative explanations
This lab activity was created for EPI 553: Principles of
Statistical Inference II
University at Albany, College of Integrated Health
Sciences
Spring 2026
# Seed R's random number generator with a fixed value (553)
set.seed(553) ```