# Load required packages
library(tidyverse)    # Data manipulation (dplyr, ggplot2, etc.)
## Warning: package 'ggplot2' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   4.0.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(NHANES)       # NHANES dataset
## Warning: package 'NHANES' was built under R version 4.4.3
library(knitr)        # For professional table output
## Warning: package 'knitr' was built under R version 4.4.3
library(kableExtra)   # Enhanced tables
## Warning: package 'kableExtra' was built under R version 4.4.3
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
# Build the analysis table: keep the identifier plus the demographic,
# clinical, and lifestyle columns used in this lab, then derive a
# hypertension flag from the first blood pressure reading.
nhanes_analysis <- NHANES %>%
  dplyr::select(
    ID,
    # Demographics: sex, age in years, race/ethnicity, education level
    Gender, Age, Race1, Education,
    # Measurements: body mass index, resting heart rate, and the first
    # systolic / diastolic blood pressure readings
    BMI, Pulse, BPSys1, BPDia1,
    # Physical activity, current smoking, diabetes diagnosis, and
    # general health rating
    PhysActive, SmokeNow, Diabetes, HealthGen
  ) %>%
  mutate(
    # "Yes" when systolic >= 140 or diastolic >= 90, otherwise "No";
    # the flag stays NA when missing readings leave the comparison undecided.
    Hypertension = factor(
      if_else(BPSys1 >= 140 | BPDia1 >= 90, "Yes", "No")
    )
  )

# Complete-case subset: keep only the rows that have no missing value in
# any column of nhanes_analysis.
# NOTE(review): every column is checked, so a variable that is recorded for
# only part of the sample (possibly SmokeNow) would remove many rows --
# confirm the retained row count before relying on this subset.
nhanes_analysis2 <- nhanes_analysis %>%
  drop_na()


# Preview the first 10 rows of the selected-variable table
nhanes_analysis %>%
  head(n = 10)
## # A tibble: 10 × 14
##       ID Gender   Age Race1 Education      BMI Pulse BPSys1 BPDia1 PhysActive
##    <int> <fct>  <int> <fct> <fct>        <dbl> <int>  <int>  <int> <fct>     
##  1 51624 male      34 White High School   32.2    70    114     88 No        
##  2 51624 male      34 White High School   32.2    70    114     88 No        
##  3 51624 male      34 White High School   32.2    70    114     88 No        
##  4 51625 male       4 Other <NA>          15.3    NA     NA     NA <NA>      
##  5 51630 female    49 White Some College  30.6    86    118     82 No        
##  6 51638 male       9 White <NA>          16.8    82     84     50 <NA>      
##  7 51646 male       8 White <NA>          20.6    72    114     46 <NA>      
##  8 51647 female    45 White College Grad  27.2    62    106     62 Yes       
##  9 51647 female    45 White College Grad  27.2    62    106     62 Yes       
## 10 51647 female    45 White College Grad  27.2    62    106     62 Yes       
## # ℹ 4 more variables: SmokeNow <fct>, Diabetes <fct>, HealthGen <fct>,
## #   Hypertension <fct>
# Report the table size as c(rows, columns)
c(nrow(nhanes_analysis), ncol(nhanes_analysis))
## [1] 10000    14

Your Turn: Guided Practice

🎯 Task 1: Explore Health Disparities by Education (15 minutes)

Using the nhanes_analysis data, explore:

“How does hypertension prevalence vary by education level?”

Write code to:

  1. Group by education level
  2. Calculate sample size, mean systolic BP, and percent with hypertension
  3. Print the results
# Summarise blood pressure and hypertension within each education level
health_by_education <- nhanes_analysis %>%
  group_by(Education) %>%
  summarise(
    # Number of rows in the group (including rows with missing readings)
    N = n(),
    # Average first systolic reading, ignoring missing values
    Mean_SysBP = mean(BPSys1, na.rm = TRUE) %>% round(2),
    # Share of "Yes" among rows whose hypertension status is known;
    # %in% counts NA as "not Yes", so the numerator needs no na.rm
    Pct_Hypertension = round(
      sum(Hypertension %in% "Yes") / sum(!is.na(Hypertension)) * 100,
      2
    )
  )

# Display the summary table
print(health_by_education)
## # A tibble: 6 × 4
##   Education          N Mean_SysBP Pct_Hypertension
##   <fct>          <int>      <dbl>            <dbl>
## 1 8th Grade        451       128.            28.3 
## 2 9 - 11th Grade   888       124.            17.3 
## 3 High School     1517       124.            18.9 
## 4 Some College    2267       122.            16.6 
## 5 College Grad    2098       119.            13.1 
## 6 <NA>            2779       106.             0.72

🎯 Task 2: Create a Visualization (10 minutes)

Create a bar chart showing hypertension by education level:

# Bar chart of hypertension prevalence for each known education level
health_by_education %>%
  # Drop the group whose education level is missing
  drop_na(Education) %>%
  # Pre-build the percentage text printed above each bar
  mutate(Pct_Label = paste0(Pct_Hypertension, "%")) %>%
  ggplot(aes(x = Education, y = Pct_Hypertension)) +
  geom_col(fill = "steelblue", alpha = 0.7) +
  geom_text(aes(label = Pct_Label), vjust = -0.5, size = 3) +
  # Fixed 0-50 axis leaves headroom for the labels above the bars
  scale_y_continuous(limits = c(0, 50)) +
  labs(
    title = "Hypertension Prevalence by Education Level",
    x = "Education Level",
    y = "Percent with Hypertension (%)",
    caption = "Source: NHANES"
  ) +
  theme_minimal() +
  # Tilt the category labels so long education names do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1))


🎯 Task 3: Write a Data Interpretation (5 minutes)

Write 2-3 sentences:

“What does this pattern tell us about health disparities and social determinants?” Hypertension prevalence generally falls as education rises: it is highest among participants with an 8th-grade education (28.3%) and lowest among college graduates (13.1%). People with more education may be more aware of their health and better able to manage hypertension, for example through greater health literacy and access to care. This matters for public health because specialists can use the pattern to design policies and programs that reach the groups at highest risk.

Consider:

  • Which education groups have highest/lowest hypertension?
  • What might explain these differences?
  • Why does this matter for public health?


Grading Rubric for Task 3

| Criteria | Excellent (Full Credit) | Adequate | Needs Work |
|---|---|---|---|
| Identifies pattern | Explicitly states which groups have highest/lowest rates | Mentions direction but lacks specificity | Vague or incorrect about pattern |
| Explains mechanism | References social determinants, access, or health literacy | Mentions inequality but lacks detail | No explanation provided |
| Public health relevance | Discusses implications for policy or programs | Notes importance but general | Missing public health connection |
| Writing quality | Clear, 2-3 well-written sentences | Adequate but could be clearer | Incomplete or unclear |

Assessment Rubric: Overall Lab Performance

Scoring Guide (100 points total)

Task 1: Code (25 points)

  • ✓ Correct group_by() (5 pts)
  • ✓ Calculates N correctly (5 pts)
  • ✓ Calculates mean systolic BP correctly (5 pts)
  • ✓ Calculates hypertension percentage correctly (10 pts)

Task 2: Visualization (25 points)

  • ✓ Filters missing values (5 pts)
  • ✓ Correct plot type and aesthetics (10 pts)
  • ✓ Proper labels and formatting (5 pts)
  • ✓ Readable axis labels (5 pts)

Task 3: Interpretation (25 points)

  • ✓ Identifies specific pattern in data (8 pts)
  • ✓ Explains mechanism/social determinants (8 pts)
  • ✓ Connects to public health implications (9 pts)

Overall Code Quality (25 points)

  • ✓ Comments explain code (5 pts)
  • ✓ Code runs without errors (10 pts)
  • ✓ Output is properly formatted (5 pts)
  • ✓ Submitted as HTML file (5 pts)

Exporting Your Work

Save and Knit

  1. Save: File → Save As → Lab01_NHANES_YourName.Rmd
  2. Knit: Click the blue Knit button
  3. Submit: Upload the RPubs link to Brightspace

Key Takeaways

Skills Practiced

✓ Loading data from R packages
✓ Data exploration with str(), summary(), head()
✓ Grouping and summarizing with group_by() and summarise()
✓ Creating derived variables with mutate()
✓ Calculating epidemiological statistics
✓ Stratification to reveal disparities
✓ Professional visualization with ggplot2
✓ Publication-ready tables


Troubleshooting

“object ‘NHANES’ not found”

→ Make sure you ran library(NHANES) first; if the object is still missing, run data(NHANES) after loading the package

Missing values (NA) showing

→ This is normal! Use na.rm = TRUE in summary functions such as mean() and sum(), and check how many values are missing

Bar chart looks wrong

→ Use filter(!is.na(Variable)) to remove missing groups


Resources


# Record the R version, platform, and attached packages for reproducibility
utils::sessionInfo()
## R version 4.4.2 (2024-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 10 x64 (build 19045)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] kableExtra_1.4.0 knitr_1.51       NHANES_2.1.0     lubridate_1.9.3 
##  [5] forcats_1.0.0    stringr_1.5.1    dplyr_1.1.4      purrr_1.0.2     
##  [9] readr_2.1.5      tidyr_1.3.1      tibble_3.2.1     ggplot2_4.0.2   
## [13] tidyverse_2.0.0 
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6       jsonlite_2.0.0     compiler_4.4.2     tidyselect_1.2.1  
##  [5] xml2_1.3.6         jquerylib_0.1.4    textshaping_0.4.0  systemfonts_1.3.1 
##  [9] scales_1.4.0       yaml_2.3.10        fastmap_1.2.0      R6_2.6.1          
## [13] labeling_0.4.3     generics_0.1.4     svglite_2.2.2      bslib_0.10.0      
## [17] pillar_1.11.1      RColorBrewer_1.1-3 tzdb_0.4.0         rlang_1.1.4       
## [21] utf8_1.2.4         stringi_1.8.4      cachem_1.1.0       xfun_0.56         
## [25] sass_0.4.10        S7_0.2.1           viridisLite_0.4.3  timechange_0.3.0  
## [29] cli_3.6.3          withr_3.0.2        magrittr_2.0.3     digest_0.6.37     
## [33] grid_4.4.2         rstudioapi_0.18.0  hms_1.1.4          lifecycle_1.0.5   
## [37] vctrs_0.6.5        evaluate_1.0.5     glue_1.8.0         farver_2.1.2      
## [41] rmarkdown_2.30     tools_4.4.2        pkgconfig_2.0.3    htmltools_0.5.8.1

Lab Activity 1 Complete!

Last updated: February 04, 2026