knitr::opts_chunk$set(
  echo = TRUE,
  warning = FALSE,
  message = FALSE,
  fig.align = "center",
  fig.width = 8,
  fig.height = 6
)

# Load required packages
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'ggplot2' was built under R version 4.5.2
## Warning: package 'tibble' was built under R version 4.5.2
## Warning: package 'tidyr' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.2
## Warning: package 'purrr' was built under R version 4.5.2
## Warning: package 'dplyr' was built under R version 4.5.2
## Warning: package 'stringr' was built under R version 4.5.2
## Warning: package 'forcats' was built under R version 4.5.2
## Warning: package 'lubridate' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(knitr)
## Warning: package 'knitr' was built under R version 4.5.2
library(broom)      # For clean statistical output
## Warning: package 'broom' was built under R version 4.5.2
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.2
## corrplot 0.95 loaded
library(NHANES)
## Warning: package 'NHANES' was built under R version 4.5.2
# Set seed
set.seed(553)

PART A: GUIDED EXAMPLES

Setup: Load Packages and Data

# Load NHANES data
data(NHANES)

# Select adult participants with complete data
nhanes_adult <- NHANES %>%
  filter(Age >= 18, Age <= 80) %>%
  select(Age, Weight, Height, BMI, BPSysAve, BPDiaAve, 
         Pulse, PhysActive, SleepHrsNight) %>%
  na.omit()

# Display sample
# Display sample size
data.frame(
  Metric = "Sample Size",
  Value = paste(nrow(nhanes_adult), "adults")
) %>%
  kable()
Metric Value
Sample Size 7133 adults
head(nhanes_adult, 8) %>%
  kable(digits = 1, caption = "NHANES Adult Data Sample")
NHANES Adult Data Sample
Age Weight Height BMI BPSysAve BPDiaAve Pulse PhysActive SleepHrsNight
34 87.4 164.7 32.2 113 85 70 No 4
34 87.4 164.7 32.2 113 85 70 No 4
34 87.4 164.7 32.2 113 85 70 No 4
49 86.7 168.4 30.6 112 75 86 No 8
45 75.7 166.7 27.2 118 64 62 Yes 8
45 75.7 166.7 27.2 118 64 62 Yes 8
45 75.7 166.7 27.2 118 64 62 Yes 8
66 68.0 169.5 23.7 111 63 60 Yes 7

Dataset Description:

  • Age: Age in years
  • Weight: Weight in kg
  • BMI: Body Mass Index (kg/m²)
  • BPSysAve: Average systolic blood pressure (mmHg)
  • BPDiaAve: Average diastolic blood pressure (mmHg)
  • Pulse: 60 second pulse rate
  • SleepHrsNight: Hours of sleep per night

PART B: YOUR TURN - Practice Problems

Now it’s your turn to practice! Use the same NHANES dataset and follow the examples above.

Total Points: 25 points


Problem 1: Weight and Height (10 points)

Research Question: Is there a correlation between weight and height among US adults?

Your tasks:

  1. Create a scatterplot with a fitted line (2 points)
  2. Calculate Pearson correlation using cor.test() and display with tidy() (3 points)
  3. Test for statistical significance and state your conclusion (2 points)
  4. Calculate r² and interpret in 2-3 sentences (3 points)
# YOUR CODE HERE

# a. Scatterplot
ggplot(nhanes_adult, aes(x = Height, y = Weight)) +
  geom_point(alpha = 0.3, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "red") +
  labs(
    title = "Weight vs. Height",
    subtitle = "NHANES Data, Adults 18-80 years",
    y = "Weight (kg)",
    x = "Height (cm)"
  ) +
  theme_minimal()

# b. Correlation test with tidy() display
cor_wt_ht <- cor.test(nhanes_adult$Height, nhanes_adult$Weight)

tidy(cor_wt_ht) %>%
  select(estimate, statistic, p.value, conf.low, conf.high) %>%
  kable(
    digits = 3,
    col.names = c("r", "t-statistic", "p-value", "95% CI Lower", "95% CI Upper"),
    caption = "Pearson Correlation: Weight and Height"
  )
Pearson Correlation: Weight and Height
r t-statistic p-value 95% CI Lower 95% CI Upper
0.451 42.618 0 0.432 0.469
# c. Statistical significance

#Hypothesis Test

#H₀: ρ = 0 (no correlation between height and weight in population)
#H₁: ρ ≠ 0 (correlation exists)
#α = 0.05

#r = 0.451                  Moderate positive correlation
#p < 0.001                  Statistically significant (reject H₀)
#95% CI [0.432, 0.469]:     Does not contain zero (confirms significance)

#Conclusion: Reject H₀, there is a positive correlation between height and weight in the population.


# d. r² and interpretation (write as comment)

r_squared_wtht <- cor_wt_ht$estimate^2

data.frame(
  Measure = c("r²", "Variance Explained"),
  Value = c(
    round(r_squared_wtht, 4),
    paste0(round(r_squared_wtht * 100, 2), "%")
  )
) %>%
  kable(caption = "Effect Size")
Effect Size
Measure Value
cor 0.203
Variance Explained 20.3%
#r² = 0.203, meaning height explains 20.3% of the variance in weight. This indicates that height is an important contributor to weight, but there are likely other important factors that also contribute to weight. 

Problem 2: Correlation Matrix Analysis (10 points)

Research Question: What are the relationships among BMI, weight, and height?

Your tasks:

  1. Create a correlation matrix for: Weight, Height, BMI (3 points)
  2. Visualize the matrix using corrplot (3 points)
  3. Identify which pair has the strongest correlation (2 points)
  4. Explain why that correlation makes sense biologically/mathematically (2 points)
# YOUR CODE HERE

# a. Correlation matrix
body_vars <- nhanes_adult %>%
  select(Weight, Height, BMI)

# Calculate correlation matrix
cor_matrix <- cor(body_vars, use = "complete.obs")

# Display as table
cor_matrix %>%
  kable(digits = 3, caption = "Weight, Height, and BMI Correlation Matrix")
Weight, Height, and BMI Correlation Matrix
Weight Height BMI
Weight 1.000 0.451 0.880
Height 0.451 1.000 -0.012
BMI 0.880 -0.012 1.000
# b. Visualize with corrplot
corrplot(cor_matrix, 
         method = "circle",
         type = "lower",
         tl.col = "black",
         tl.srt = 45,
         addCoef.col = "black",
         number.cex = 0.7,
         col = colorRampPalette(c("#3498db", "white", "#e74c3c"))(200),
         title = "Weight, Height, BMI Correlations",
         mar = c(0,0,2,0))

# c. Strongest correlation:

#Weight and BMI have the strongest correlation with r = 0.880.

# d. Explanation (write as comment)

#This correlation makes sense because BMI is calculated as BMI = weight (kg) / height (m) ^2. Thus, as weight increases, BMI also increases. 
# 

Problem 3: Sleep and Age (5 points)

Research Question: Is there a relationship between hours of sleep and age?

Your tasks:

  1. Create a scatterplot (1 point)
  2. Calculate Pearson correlation and display with tidy() (2 points)
  3. Interpret whether the relationship is statistically significant (2 points)
# YOUR CODE HERE

# a. Scatterplot

ggplot(nhanes_adult, aes(x = Age, y = SleepHrsNight)) +
  geom_point(alpha = 0.3, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "red") +
  labs(
    title = "Age Vs. Hours of Sleep",
    subtitle = "NHANES Data, Adults 18-80 years",
    y = "Sleep per night (hours)",
    x = "Age (years)"
  ) +
  theme_minimal()

# b. Correlation with tidy()

cor_age_slp <- cor.test(nhanes_adult$Age, nhanes_adult$SleepHrsNight)

tidy(cor_age_slp) %>%
  select(estimate, statistic, p.value, conf.low, conf.high) %>%
  kable(
    digits = 3,
    col.names = c("r", "t-statistic", "p-value", "95% CI Lower", "95% CI Upper"),
    caption = "Pearson Correlation: Age and Hours of Sleep"
  )
Pearson Correlation: Age and Hours of Sleep
r t-statistic p-value 95% CI Lower 95% CI Upper
0.023 1.904 0.057 -0.001 0.046
# c. Interpretation (write as comment)

# This relationship is not statistically significant as p > 0.05. Thus, we do not have evidence to conclude that age and hours of sleep are correlated. 

Bonus (Optional, 5 extra points)

Challenge: Investigate the relationship between two variables of your choice from the NHANES dataset. Include:

  • Scatterplot
  • Correlation test with clean display
  • Assumption checks
  • Thoughtful interpretation
# YOUR CODE HERE

ggplot(nhanes_adult, aes(x = Age, y = Pulse)) +
  geom_point(alpha = 0.3, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "red") +
  labs(
    title = "Age Vs. Pulse (BPM)",
    subtitle = "NHANES Data, Adults 18-80 years",
    y = "Pulse (BPM)",
    x = "Age (years)"
  ) +
  theme_minimal()

# b. Correlation with tidy()

cor_age_pul <- cor.test(nhanes_adult$Age, nhanes_adult$Pulse)

tidy(cor_age_pul) %>%
  select(estimate, statistic, p.value, conf.low, conf.high) %>%
  kable(
    digits = 3,
    col.names = c("r", "t-statistic", "p-value", "95% CI Lower", "95% CI Upper"),
    caption = "Pearson Correlation: Age and Pulse"
  )
Pearson Correlation: Age and Pulse
r t-statistic p-value 95% CI Lower 95% CI Upper
-0.153 -13.116 0 -0.176 -0.131
#r-squared

r_squared_age_pul <- cor_age_pul$estimate^2

data.frame(
  Measure = c("r²", "Variance Explained"),
  Value = c(
    round(r_squared_age_pul, 4),
    paste0(round(r_squared_age_pul * 100, 2), "%")
  )
) %>%
  kable(caption = "Effect Size")
Effect Size
Measure Value
cor 0.0236
Variance Explained 2.36%
#Assumption tests

par(mfrow = c(1, 2))

qqnorm(nhanes_adult$Age, main = "Q-Q Plot: Age")
qqline(nhanes_adult$Age, col = "red")

qqnorm(nhanes_adult$Pulse, main = "Q-Q Plot: Pulse")
qqline(nhanes_adult$Pulse, col = "red")

par(mfrow = c(1, 1))

#Assumption 1: Linearity

#The assumption of linearity has been met, as indicated by the scatterplot. 

#Assumption 2: Normality

#Both Age and Pulse are approximately normally distributed; the points in the Q-Q plots tend to follow the red line. Although points on the tails deviate from the normality assumption, the sample size is large (n = 7133), so the test should still be valid.

#Assumption 3: No extreme outliers 

#There are a few points that could be considered extreme outliers in the scatterplot, but there are not many and given the large sample size, the effect of these should be minimal. 

#Conclusion

#Since p<0.05, we reject our null hypothesis and conclude there is a negative correlation between Age and Pulse. That is, as age increases, pulse decreases. This is confirmed by the 95% CI [-0.176, -0.131], which does not intersect with 0. r² = 0.0236, indicating only 2.36% of the variance in pulse is explained by age. This suggests that despite the statistically significant correlation, age is not a major contributor to pulse and that other variables are likely more important in affecting pulse.

Grading Rubric

Problem 1: Weight and Height (10 points)

    1. Scatterplot properly formatted with labels: 2 points
    1. Correct correlation with clean display: 3 points
    1. Significance test correctly interpreted: 2 points
    1. r² calculated and interpreted: 3 points

Problem 2: Correlation Matrix (10 points)

    1. Correct matrix calculated: 3 points
    1. Well-formatted correlation plot: 3 points
    1. Strongest correlation identified: 2 points
    1. Biological/mathematical explanation: 2 points

Problem 3: Sleep and Age (5 points)

    1. Scatterplot: 1 point
    1. Correlation calculated and displayed: 2 points
    1. Interpretation of significance: 2 points

Submission Instructions

  1. Save your work with your name: Correlation_Lab_YourName.Rmd

  2. Knit to HTML to create your report

  3. Publish to RPubs:

    • Click the Publish button (blue icon) in the HTML preview window
    • Choose RPubs from the options
    • Follow the prompts to publish (create account if needed)
    • Copy your RPubs URL
  4. Submit to Brightspace:

    • Upload your .Rmd file
    • Paste your RPubs link in the assignment comments or submission text box
  5. Due: End of class today

Grading: This lab is worth 15% of your in-class lab grade. The lowest 2 lab grades are dropped.


Additional Resources

R Functions Used Today

  • cor.test() - Calculate correlation and test significance
  • tidy() - Clean display of statistical test results
  • cor() - Calculate correlation matrix
  • corrplot() - Visualize correlation matrix
  • ggplot() + geom_point() - Scatterplots
  • geom_smooth(method="lm") - Add fitted regression line
  • qqnorm() / qqline() - Check normality

For More Help

  • Textbook: Chapter 6 - Correlation Analysis
  • Office Hours: See syllabus
  • TA Help: See syllabus
  • R Documentation: Type ?cor.test in console

Remember:

✓ Correlation measures LINEAR relationships only
✓ Always visualize your data first
✓ Correlation ≠ Causation
✓ Check your assumptions
✓ Consider confounding and alternative explanations


This lab activity was created for EPI 553: Principles of Statistical Inference II
University at Albany, College of Integrated Health Sciences
Spring 2026