# Make the NHANES data frame available in the session
data(NHANES)
# Build the analysis set: keep ages 18 through 80, retain only the
# variables used below, then drop any row with a missing value
nhanes_adult <- NHANES %>%
  filter(Age >= 18 & Age <= 80) %>%
  select(
    Age, Weight, Height, BMI,
    BPSysAve, BPDiaAve, Pulse,
    PhysActive, SleepHrsNight
  ) %>%
  na.omit()
# Display sample size: a one-row table with the number of adults kept
kable(
  data.frame(
    Metric = "Sample Size",
    Value = paste(nrow(nhanes_adult), "adults")
  )
)
| Metric | Value |
|---|---|
| Sample Size | 7133 adults |
# Preview the first eight rows, with numeric columns rounded to one decimal
kable(
  head(nhanes_adult, n = 8),
  digits = 1,
  caption = "NHANES Adult Data Sample"
)
| Age | Weight | Height | BMI | BPSysAve | BPDiaAve | Pulse | PhysActive | SleepHrsNight |
|---|---|---|---|---|---|---|---|---|
| 34 | 87.4 | 164.7 | 32.2 | 113 | 85 | 70 | No | 4 |
| 34 | 87.4 | 164.7 | 32.2 | 113 | 85 | 70 | No | 4 |
| 34 | 87.4 | 164.7 | 32.2 | 113 | 85 | 70 | No | 4 |
| 49 | 86.7 | 168.4 | 30.6 | 112 | 75 | 86 | No | 8 |
| 45 | 75.7 | 166.7 | 27.2 | 118 | 64 | 62 | Yes | 8 |
| 45 | 75.7 | 166.7 | 27.2 | 118 | 64 | 62 | Yes | 8 |
| 45 | 75.7 | 166.7 | 27.2 | 118 | 64 | 62 | Yes | 8 |
| 66 | 68.0 | 169.5 | 23.7 | 111 | 63 | 60 | Yes | 7 |
Now it’s your turn to practice! Use the same NHANES dataset and follow the examples above.
Total Points: 25 points
Research Question: Is there a correlation between weight and height among US adults?
Your tasks:
cor.test() and
display with tidy() (3 points)
# YOUR CODE HERE
# a. Scatterplot
# Semi-transparent points for every adult, overlaid with a red linear fit
# (and its confidence band) of Weight on Height
ggplot(nhanes_adult, aes(Height, Weight)) +
  geom_point(color = "steelblue", alpha = 0.3) +
  geom_smooth(method = "lm", color = "red", se = TRUE) +
  theme_minimal() +
  labs(
    x = "Height (cm)",
    y = "Weight (kg)",
    title = "Weight vs. Height",
    subtitle = "NHANES Data, Adults 18-80 years"
  )
# b. Correlation test with tidy() display
# Pearson correlation test between height and weight
cor_height_weight <- cor.test(nhanes_adult$Height, nhanes_adult$Weight)
# Show the estimate, test statistic, p-value and 95% CI in one table.
# The p-value is converted to text before kable() rounds it: rounding a
# very small p-value to three digits prints "0", which wrongly suggests
# the p-value is exactly zero.
tidy(cor_height_weight) %>%
  select(estimate, statistic, p.value, conf.low, conf.high) %>%
  mutate(
    p.value = ifelse(p.value < 0.001, "<0.001", sprintf("%.3f", p.value))
  ) %>%
  kable(
    digits = 3,
    col.names = c(
      "r", "t-statistic", "p-value", "95% CI Lower", "95% CI Upper"
    ),
    caption = "Pearson Correlation: Height and Weight"
  )
| r | t-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|
| 0.451 | 42.618 | <0.001 | 0.432 | 0.469 |
# c. Statistical significance
# The Pearson correlation between height and weight is statistically significant, because the p-value is less than 0.001 and the 95% confidence interval [0.432, 0.469] does not include 0. The r value is 0.451, meaning height and weight are moderately positively correlated. As height increases, weight also tends to increase.
# d. r² and interpretation (write as comment)
# Square the correlation estimate to get the coefficient of determination
r_squared <- cor_height_weight[["estimate"]]^2
# Tabulate r, r², and r² expressed as a percentage
kable(
  data.frame(
    Measure = c(
      "Correlation (r)",
      "Coefficient of Determination (r²)",
      "Variance Explained"
    ),
    Value = c(
      round(cor_height_weight[["estimate"]], 3),
      round(r_squared, 3),
      paste0(round(r_squared * 100, 1), "%")
    )
  ),
  caption = "Summary of Correlation Strength"
)
| Measure | Value |
|---|---|
| Correlation (r) | 0.451 |
| Coefficient of Determination (r²) | 0.203 |
| Variance Explained | 20.3% |
#R squared is 0.203, meaning that height only explains about 20.3% of the variation in weight, suggesting other factors also play important roles.
Research Question: What are the relationships among BMI, weight, and height?
Your tasks:
# YOUR CODE HERE
# a. Correlation matrix
# Select the anthropometric variables (weight, height, BMI)
bmi_vars <- nhanes_adult %>%
  select(Weight, Height, BMI)
# Pairwise correlations between the three columns; "complete.obs"
# restricts the calculation to rows with no missing values
cor_matrix <- cor(bmi_vars, use = "complete.obs")
# Display as table, rounded to three decimals
cor_matrix %>%
  kable(digits = 3, caption = "Anthropometric Health Correlation Matrix")
| | Weight | Height | BMI |
|---|---|---|---|
| Weight | 1.000 | 0.451 | 0.880 |
| Height | 0.451 | 1.000 | -0.012 |
| BMI | 0.880 | -0.012 | 1.000 |
# b. Visualize with corrplot
# Lower-triangle circle plot of cor_matrix, with each coefficient printed
# in its cell and a blue-white-red colour ramp
corrplot(
  cor_matrix,
  method = "circle",
  type = "lower",
  tl.col = "black",
  tl.srt = 45,
  addCoef.col = "black",
  number.cex = 0.7,
  col = colorRampPalette(c("#3498db", "white", "#e74c3c"))(200),
  title = "Anthropometric Health Correlations",
  # Extra top margin; presumably so the title is not clipped (verify on render)
  mar = c(0, 0, 2, 0)
)
# c. Strongest correlation:
# Pull the three pairwise coefficients out of cor_matrix and pair each
# with a hand-assigned strength label
kable(
  data.frame(
    Relationship = c("Weight & BMI", "Height & Weight", "Height & BMI"),
    Correlation = round(
      c(
        cor_matrix["Weight", "BMI"],
        cor_matrix["Height", "Weight"],
        cor_matrix["Height", "BMI"]
      ),
      3
    ),
    Strength = c("Strong", "Moderate", "Weak")
  ),
  caption = "Notable Correlations Summary"
)
| Relationship | Correlation | Strength |
|---|---|---|
| Weight & BMI | 0.880 | Strong |
| Height & Weight | 0.451 | Moderate |
| Height & BMI | -0.012 | Weak |
# d. Explanation (write as comment)
# Weight and BMI have the strongest positive correlation at r = 0.880, height and weight have a moderate positive correlation at r = 0.451, and height and BMI have a very weak negative correlation at r = -0.012. The strong correlation between weight and BMI makes sense, as weight plays a large role in a person's BMI. Height has almost no linear correlation with BMI, suggesting that BMI is driven mainly by factors other than height.
Research Question: Is there a relationship between hours of sleep and age?
Your tasks:
tidy()
(2 points)
# YOUR CODE HERE
# a. Scatterplot
# Semi-transparent points for every adult, overlaid with a red linear fit
# (and its confidence band) of SleepHrsNight on Age
ggplot(nhanes_adult, aes(Age, SleepHrsNight)) +
  geom_point(color = "steelblue", alpha = 0.3) +
  geom_smooth(method = "lm", color = "red", se = TRUE) +
  theme_minimal() +
  labs(
    x = "Age (yrs)",
    y = "Hours of Sleep at Night",
    title = "Hours of Sleep at Night vs. Age",
    subtitle = "NHANES Data, Adults 18-80 years"
  )
# b. Correlation with tidy()
# Pearson correlation test between age and nightly hours of sleep
cor_age_sleep <- cor.test(nhanes_adult$Age, nhanes_adult$SleepHrsNight)
# Show the estimate, test statistic, p-value and 95% CI in one table.
# The p-value is converted to text before kable() rounds it, so a very
# small p-value would print as "<0.001" rather than a misleading "0";
# larger values keep three decimals.
tidy(cor_age_sleep) %>%
  select(estimate, statistic, p.value, conf.low, conf.high) %>%
  mutate(
    p.value = ifelse(p.value < 0.001, "<0.001", sprintf("%.3f", p.value))
  ) %>%
  kable(
    digits = 3,
    col.names = c(
      "r", "t-statistic", "p-value", "95% CI Lower", "95% CI Upper"
    ),
    caption = "Pearson Correlation: Age and Hours of Sleep at Night"
  )
| r | t-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|
| 0.023 | 1.904 | 0.057 | -0.001 | 0.046 |
# c. Interpretation (write as comment)
# The Pearson correlation between age and hours slept at night is not statistically significant, because the p-value is 0.057 (> 0.05) and the 95% confidence interval [-0.001, 0.046] includes 0. The r value is 0.023, meaning age and hours slept at night are very weakly positively correlated, if correlated at all.
Challenge: Investigate the relationship between two variables of your choice from the NHANES dataset. Include:
# YOUR CODE HERE
# Create scatterplot
# Semi-transparent points for every adult, overlaid with a red linear fit
# (and its confidence band) of Pulse on Weight
ggplot(nhanes_adult, aes(Weight, Pulse)) +
  geom_point(color = "steelblue", alpha = 0.3) +
  geom_smooth(method = "lm", color = "red", se = TRUE) +
  theme_minimal() +
  labs(
    x = "Weight (kg)",
    y = "Pulse Rate (measured in 60s)",
    title = "Weight vs Pulse",
    subtitle = "NHANES Data, Adults 18-80 years"
  )
# Pearson correlation test between weight and pulse
cor_weight_pulse <- cor.test(nhanes_adult$Weight, nhanes_adult$Pulse)
# Show the estimate, test statistic, p-value and 95% CI in one table.
# The p-value is converted to text before kable() rounds it: rounding a
# very small p-value to three digits prints "0", which wrongly suggests
# the p-value is exactly zero.
tidy(cor_weight_pulse) %>%
  select(estimate, statistic, p.value, conf.low, conf.high) %>%
  mutate(
    p.value = ifelse(p.value < 0.001, "<0.001", sprintf("%.3f", p.value))
  ) %>%
  kable(
    digits = 3,
    col.names = c(
      "r", "t-statistic", "p-value", "95% CI Lower", "95% CI Upper"
    ),
    caption = "Pearson Correlation: Weight and Pulse"
  )
| r | t-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|
| 0.063 | 5.295 | <0.001 | 0.039 | 0.086 |
# Square the weight-pulse correlation estimate to get r²
r_squared <- cor_weight_pulse[["estimate"]]^2
# Tabulate r, r², and r² expressed as a percentage
kable(
  data.frame(
    Measure = c(
      "Correlation (r)",
      "Coefficient of Determination (r²)",
      "Variance Explained"
    ),
    Value = c(
      round(cor_weight_pulse[["estimate"]], 3),
      round(r_squared, 3),
      paste0(round(r_squared * 100, 1), "%")
    )
  ),
  caption = "Summary of Correlation Strength"
)
| Measure | Value |
|---|---|
| Correlation (r) | 0.063 |
| Coefficient of Determination (r²) | 0.004 |
| Variance Explained | 0.4% |
# Q-Q plots for normality
# Draw the Weight and Pulse Q-Q plots side by side. par() returns the
# previous settings when it sets new ones, so the prior layout is saved
# and restored afterwards instead of assuming it was a single panel.
old_par <- par(mfrow = c(1, 2))
qqnorm(nhanes_adult$Weight, main = "Q-Q Plot: Weight")
qqline(nhanes_adult$Weight, col = "red")
qqnorm(nhanes_adult$Pulse, main = "Q-Q Plot: Pulse")
qqline(nhanes_adult$Pulse, col = "red")
par(old_par)
#Interpretation:
#There is a statistically significant weak positive correlation between weight and pulse (r = 0.063, 95% confidence interval [0.039,0.086]). As weight increases, pulse rate tends to increase. However, weight explains only about 0.4% of the variation in pulse, suggesting other factors also play important roles. The assumptions are all met, as the scatter plot shows linearity and the Q-Q plot shows normality, as the points follow the red line.
```