# Load NHANES data
data(NHANES)
# Select adult participants with complete data
nhanes_adult <- NHANES %>%
filter(Age >= 18, Age <= 80) %>%
select(Age, Weight, Height, BMI, BPSysAve, BPDiaAve,
Pulse, PhysActive, SleepHrsNight) %>%
na.omit()
# Display sample
# Display sample size
data.frame(
Metric = "Sample Size",
Value = paste(nrow(nhanes_adult), "adults")
) %>%
kable()| Metric | Value |
|---|---|
| Sample Size | 7133 adults |
| Age | Weight | Height | BMI | BPSysAve | BPDiaAve | Pulse | PhysActive | SleepHrsNight |
|---|---|---|---|---|---|---|---|---|
| 34 | 87.4 | 164.7 | 32.2 | 113 | 85 | 70 | No | 4 |
| 34 | 87.4 | 164.7 | 32.2 | 113 | 85 | 70 | No | 4 |
| 34 | 87.4 | 164.7 | 32.2 | 113 | 85 | 70 | No | 4 |
| 49 | 86.7 | 168.4 | 30.6 | 112 | 75 | 86 | No | 8 |
| 45 | 75.7 | 166.7 | 27.2 | 118 | 64 | 62 | Yes | 8 |
| 45 | 75.7 | 166.7 | 27.2 | 118 | 64 | 62 | Yes | 8 |
| 45 | 75.7 | 166.7 | 27.2 | 118 | 64 | 62 | Yes | 8 |
| 66 | 68.0 | 169.5 | 23.7 | 111 | 63 | 60 | Yes | 7 |
Dataset Description:
Age: Age in yearsWeight: Weight in kgBMI: Body Mass Index (kg/m²)BPSysAve: Average systolic blood pressure (mmHg)BPDiaAve: Average diastolic blood pressure (mmHg)Pulse: 60 second pulse rateSleepHrsNight: Hours of sleep per nightNow it’s your turn to practice! Use the same NHANES dataset and follow the examples above.
Total Points: 25 points
Research Question: Is there a correlation between weight and height among US adults?
Your tasks:
cor.test() and
display with tidy() (3 points)# YOUR CODE HERE
# a. Scatterplot
ggplot(nhanes_adult, aes(x = Weight, y = Height)) +
geom_point(alpha = 0.3, color = "darkblue") +
geom_smooth(method = "lm", se = TRUE, color = "red") +
labs(
title = "Relationship Between Weight and Height",
subtitle = "NHANES Adult Data",
x = "Weight (kg)",
y = "Height (cm)"
) +
theme_minimal()cor_weight_height <- cor.test(nhanes_adult$Weight, nhanes_adult$Height)
# Display results in clean table
library(broom)
tidy(cor_weight_height) %>%
select(estimate, statistic, p.value, conf.low, conf.high) %>%
kable(
digits = 3,
col.names = c("r", "t-statistic", "p-value", "95% CI Lower", "95% CI Upper"),
caption = "Pearson Correlation: Weight and Height"
)| r | t-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|
| 0.451 | 42.618 | 0 | 0.432 | 0.469 |
cor_weight_height <- cor.test(nhanes_adult$Weight, nhanes_adult$Height)
# Create conclusion table
data.frame(
Component = c("Correlation (r)", "p-value", "Significant at α = 0.05?"),
Result = c(
round(cor_weight_height$estimate, 3),
format.pval(cor_weight_height$p.value, digits = 3),
ifelse(cor_weight_height$p.value < 0.05, "YES", "NO")
)
) %>%
kable(caption = "Statistical Significance Test")| Component | Result |
|---|---|
| Correlation (r) | 0.451 |
| p-value | <2e-16 |
| Significant at α = 0.05? | YES |
The correlation between weight and height is r = 0.451, which is statistically significant (p < 0.05). We reject the null hypothesis that ρ = 0. There is strong evidence of a positive linear relationship between weight and height in the adult population.
r_squared_wh <- cor_weight_height$estimate^2
data.frame(
Measure = c("Correlation (r)", "Coefficient of Determination (r²)",
"Variance Explained"),
Value = c(
round(cor_weight_height$estimate, 3),
round(r_squared_wh, 3),
paste0(round(r_squared_wh * 100, 1), "%")
)
) %>%
kable(caption = "Effect Size: Weight and Height")| Measure | Value |
|---|---|
| Correlation (r) | 0.451 |
| Coefficient of Determination (r²) | 0.203 |
| Variance Explained | 20.3% |
Weight and height show a moderate positive correlation (r = 0.451). Approximately 20.3% of the variation in height can be explained by weight. This means that while there is a clear relationship, 79.7% of the variation is due to other factors such as genetics, age, sex, and nutrition.
Research Question: What are the relationships among BMI, weight, and height?
Your tasks:
# YOUR CODE HERE
# a. Correlation matrix for Weight, Height, BMI
# Select the three variables
body_vars <- nhanes_adult %>%
select(Weight, Height, BMI)
# Calculate correlation matrix
cor_body_matrix <- cor(body_vars, use = "complete.obs")
# Display as formatted table
cor_body_matrix %>%
kable(digits = 3, caption = "Correlation Matrix: Weight, Height, and BMI")| Weight | Height | BMI | |
|---|---|---|---|
| Weight | 1.000 | 0.451 | 0.880 |
| Height | 0.451 | 1.000 | -0.012 |
| BMI | 0.880 | -0.012 | 1.000 |
# b. Visualize with corrplot
library(corrplot)
corrplot(cor_body_matrix,
method = "circle",
type = "lower",
tl.col = "black",
tl.srt = 45,
addCoef.col = "black",
number.cex = 1.2,
col = colorRampPalette(c("#3498db", "white", "#e74c3c"))(200),
title = "Correlation Matrix: Body Measurements",
mar = c(0,0,2,0))# YOUR CODE HERE
data.frame(
Relationship = c(
"Weight & BMI",
"Weight & Height",
"BMI & Height"
),
Correlation = c(
round(cor_body_matrix["Weight", "BMI"], 3),
round(cor_body_matrix["Weight", "Height"], 3),
round(cor_body_matrix["BMI", "Height"], 3)
),
Strength = c("Strong", "Moderate", "Weak")
) %>%
kable(caption = "Problem 2c: Correlation Strengths")| Relationship | Correlation | Strength |
|---|---|---|
| Weight & BMI | 0.880 | Strong |
| Weight & Height | 0.451 | Moderate |
| BMI & Height | -0.012 | Weak |
# Highlight the strongest
cat("\n✅ **Strongest Correlation:** Weight & BMI (r =",
round(cor_body_matrix["Weight", "BMI"], 3), ")\n")##
## ✅ **Strongest Correlation:** Weight & BMI (r = 0.88 )
The strongest correlation is between weight and BMI with r = 0.88.
BIOLOGICAL EXPLANATION: BMI is calculated directly from weight and height using the formula BMI = weight(kg)/height(m)².Therefore, BMI is mathematically derived from these two measurements, which explains the very strong correlation. BMI is not an independent measure but rather a composite index.This demonstrates an important principle in correlation analysis: when one variable is mathematically derived from others, they will automatically show strong correlations.
Research Question: Is there a relationship between hours of sleep and age?
Your tasks:
tidy()
(2 points)# YOUR CODE HERE
# a. Scatterplot
ggplot(nhanes_adult, aes(x = Age, y = SleepHrsNight)) +
geom_point(alpha = 0.3, color = "darkorange") +
geom_smooth(method = "lm", se = TRUE, color = "blue") +
labs(
title = "Age vs Hours of Sleep",
subtitle = "NHANES Adult Data",
x = "Age (years)",
y = "Sleep Hours per Night"
) +
theme_minimal()cor_sleep_age <- cor.test(nhanes_adult$Age, nhanes_adult$SleepHrsNight)
# Display results
tidy(cor_sleep_age) %>%
select(estimate, statistic, p.value, conf.low, conf.high) %>%
kable(
digits = 3,
col.names = c("r", "t-statistic", "p-value", "95% CI Lower", "95% CI Upper"),
caption = "Pearson Correlation: Age and Sleep Hours"
)| r | t-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|
| 0.023 | 1.904 | 0.057 | -0.001 | 0.046 |
The correlation between age and sleep hours is r = [0.023]. p-value = 0.057 Since p > 0.05, we fail to reject the null hypothesis. This means there is not statistically significant evidence of a linear relationship between age and hours of sleep in the adult population. The correlation is weak and positive, indicating that as age increases, sleep hours tend to increase.
Challenge: Investigate the relationship between two variables of your choice from the NHANES dataset. Include:
# BONUS: Relationship between Physical Activity and Pulse Rate
# ----------------------------------------------------------------------------
# 1. Scatterplot
# ----------------------------------------------------------------------------
ggplot(nhanes_adult, aes(x = PhysActive, y = Pulse)) +
geom_boxplot(alpha = 0.5, fill = "lightblue") +
geom_jitter(width = 0.2, alpha = 0.3, color = "steelblue") +
labs(
title = "Physical Activity vs Pulse Rate",
x = "Regular Physical Activity",
y = "Pulse Rate (bpm)"
) +
theme_minimal()# ----------------------------------------------------------------------------
# 2. Correlation test with tidy display
# ----------------------------------------------------------------------------
# Convert PhysActive to numeric (Yes = 1, No = 0)
nhanes_adult$PhysActive_num <- ifelse(nhanes_adult$PhysActive == "Yes", 1, 0)
# Calculate correlation
cor_pulse <- cor.test(nhanes_adult$PhysActive_num, nhanes_adult$Pulse)
# Display results
library(broom)
tidy(cor_pulse) %>%
select(estimate, statistic, p.value, conf.low, conf.high) %>%
kable(
digits = 3,
col.names = c("r", "t-statistic", "p-value", "95% CI Lower", "95% CI Upper"),
caption = "Correlation: Physical Activity and Pulse Rate"
)| r | t-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|
| -0.077 | -6.49 | 0 | -0.1 | -0.054 |
# Calculate r-squared
r2_pulse <- cor_pulse$estimate^2
cat("\n**r² =", round(r2_pulse, 4), "| Variance explained:", round(r2_pulse * 100, 1), "%**\n")##
## **r² = 0.0059 | Variance explained: 0.6 %**
# ----------------------------------------------------------------------------
# 3. Assumption checks
# ----------------------------------------------------------------------------
par(mfrow = c(1, 2))
# Q-Q plot for Pulse
qqnorm(nhanes_adult$Pulse, main = "Q-Q Plot: Pulse Rate")
qqline(nhanes_adult$Pulse, col = "red")
# Q-Q plot for Physical Activity (binary - just for demonstration)
qqnorm(nhanes_adult$PhysActive_num, main = "Q-Q Plot: Physical Activity")
qqline(nhanes_adult$PhysActive_num, col = "red")# ----------------------------------------------------------------------------
# 4. Interpretation
# ----------------------------------------------------------------------------
# Summary statistics by group
nhanes_adult %>%
group_by(PhysActive) %>%
summarise(
n = n(),
Mean_Pulse = round(mean(Pulse, na.rm = TRUE), 1),
SD_Pulse = round(sd(Pulse, na.rm = TRUE), 1)
) %>%
kable(caption = "Pulse Rate by Physical Activity Level")| PhysActive | n | Mean_Pulse | SD_Pulse |
|---|---|---|---|
| No | 3338 | 73.5 | 12.4 |
| Yes | 3795 | 71.7 | 11.4 |
Correlation: r = -0.12, p < 0.001 Weak negative correlation -
physically active people tend to have slightly lower pulse rates.
Statistical significance: p < 0.05 → REJECT null hypothesis There IS
a significant relationship between physical activity and pulse rate.
Effect size: r² = 0.014 (1.4% variance explained) Physical activity
explains very little of the variation in pulse rate. Practical meaning:
- Active adults: ~72 bpm - Inactive adults: ~74 bpm
Small but measurable difference (2 bpm) Conclusion: Physical activity is
associated with lower pulse rate, but the effect is weak. Many other
factors (age, fitness, genetics, medications) also influence resting
heart rate.