# Load required packages
library(tidyverse) # Data manipulation (dplyr, ggplot2, etc.)
## Warning: package 'ggplot2' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 4.0.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(NHANES) # NHANES dataset
## Warning: package 'NHANES' was built under R version 4.4.3
library(knitr) # For professional table output
## Warning: package 'knitr' was built under R version 4.4.3
library(kableExtra) # Enhanced tables
## Warning: package 'kableExtra' was built under R version 4.4.3
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
# Build the analysis dataset: keep the columns used in this lab, then derive
# a hypertension flag from the first blood pressure reading.
nhanes_analysis <- NHANES %>%
  dplyr::select(
    ID,
    # Demographics
    Gender,      # Sex (male/female)
    Age,         # Age in years
    Race1,       # Race/ethnicity
    Education,   # Education level
    # Clinical measurements
    BMI,         # Body Mass Index
    Pulse,       # Resting heart rate
    BPSys1,      # Systolic blood pressure (1st reading)
    BPDia1,      # Diastolic blood pressure (1st reading)
    # Behaviours and health status
    PhysActive,  # Physically active (Yes/No)
    SmokeNow,    # Current smoking status
    Diabetes,    # Diabetes diagnosis (Yes/No)
    HealthGen    # General health rating
  ) %>%
  mutate(
    # "Yes" when systolic >= 140 or diastolic >= 90, otherwise "No"; the flag
    # is NA when missing readings leave the comparison undecided.
    Hypertension = factor(
      if_else(BPSys1 >= 140 | BPDia1 >= 90, "Yes", "No")
    )
  )
# Complete cases only: keep the rows in which every selected column is observed.
# NOTE(review): the filter applies to all columns of nhanes_analysis, not just
# a subset of "key" variables - confirm that this is the intended restriction.
nhanes_analysis2 <- tidyr::drop_na(nhanes_analysis)
# Preview the first 10 rows of the processed dataset
nhanes_analysis %>%
  head(n = 10)
## # A tibble: 10 × 14
## ID Gender Age Race1 Education BMI Pulse BPSys1 BPDia1 PhysActive
## <int> <fct> <int> <fct> <fct> <dbl> <int> <int> <int> <fct>
## 1 51624 male 34 White High School 32.2 70 114 88 No
## 2 51624 male 34 White High School 32.2 70 114 88 No
## 3 51624 male 34 White High School 32.2 70 114 88 No
## 4 51625 male 4 Other <NA> 15.3 NA NA NA <NA>
## 5 51630 female 49 White Some College 30.6 86 118 82 No
## 6 51638 male 9 White <NA> 16.8 82 84 50 <NA>
## 7 51646 male 8 White <NA> 20.6 72 114 46 <NA>
## 8 51647 female 45 White College Grad 27.2 62 106 62 Yes
## 9 51647 female 45 White College Grad 27.2 62 106 62 Yes
## 10 51647 female 45 White College Grad 27.2 62 106 62 Yes
## # ℹ 4 more variables: SmokeNow <fct>, Diabetes <fct>, HealthGen <fct>,
## # Hypertension <fct>
# Report the dataset size as (number of rows, number of columns)
c(nrow(nhanes_analysis), ncol(nhanes_analysis))
## [1] 10000 14
Using the nhanes_analysis data, explore:
“How does hypertension prevalence vary by education level?”
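Write code to group the data by education level and calculate the sample size, mean systolic blood pressure, and percent with hypertension: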
# Summarise blood pressure outcomes within each education level.
health_by_education <- nhanes_analysis %>%
  group_by(Education) %>%
  summarise(
    # Rows in the group, including rows with missing blood pressure readings
    N = n(),
    Mean_SysBP = round(mean(BPSys1, na.rm = TRUE), digits = 2),
    # Percent hypertensive among rows with a known hypertension status.
    # %in% treats NA as "not Yes", so the numerator needs no na.rm.
    Pct_Hypertension = round(
      sum(Hypertension %in% "Yes") / sum(!is.na(Hypertension)) * 100,
      digits = 2
    )
  )

print(health_by_education)
## # A tibble: 6 × 4
## Education N Mean_SysBP Pct_Hypertension
## <fct> <int> <dbl> <dbl>
## 1 8th Grade 451 128. 28.3
## 2 9 - 11th Grade 888 124. 17.3
## 3 High School 1517 124. 18.9
## 4 Some College 2267 122. 16.6
## 5 College Grad 2098 119. 13.1
## 6 <NA> 2779 106. 0.72
Create a bar chart showing hypertension by education level:
# Bar chart of hypertension prevalence by education level; rows whose
# education level is missing are excluded before plotting.
health_by_education %>%
  filter(!is.na(Education)) %>%
  ggplot(mapping = aes(x = Education, y = Pct_Hypertension)) +
  geom_col(alpha = 0.7, fill = "steelblue") +
  # Print each bar's percentage just above the bar
  geom_text(
    mapping = aes(label = paste0(Pct_Hypertension, "%")),
    size = 3,
    vjust = -0.5
  ) +
  # Fixed 0-50 range leaves headroom for the labels above the bars
  scale_y_continuous(limits = c(0, 50)) +
  labs(
    x = "Education Level",
    y = "Percent with Hypertension (%)",
    title = "Hypertension Prevalence by Education Level",
    caption = "Source: NHANES"
  ) +
  theme_minimal() +
  # Angle the category labels so the longer education names do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Write 2-3 sentences:
“What does this pattern tell us about health disparities and social determinants?”
Hypertension prevalence generally falls as education level rises: it is highest among people with an 8th-grade education (28.3%) and lowest among college graduates (13.1%). People with more education may be more aware of their health and better able to manage hypertension. This matters for public health because specialists can use these differences to design policies and programs that reach the groups most in need.
Consider:
- Which education groups have the highest/lowest hypertension?
- What might explain these differences?
- Why does this matter for public health?
| Criteria | Excellent (Full Credit) | Adequate | Needs Work |
|---|---|---|---|
| Identifies pattern | Explicitly states which groups have highest/lowest rates | Mentions direction but lacks specificity | Vague or incorrect about pattern |
| Explains mechanism | References social determinants, access, or health literacy | Mentions inequality but lacks detail | No explanation provided |
| Public health relevance | Discusses implications for policy or programs | Notes importance but general | Missing public health connection |
| Writing quality | Clear, 2-3 well-written sentences | Adequate but could be clearer | Incomplete or unclear |
group_by() (5 pts)
Lab01_NHANES_YourName.Rmd
✓ Loading data from R packages
✓ Data exploration with str(), summary(), head()
✓ Grouping and summarizing with group_by() and summarise()
✓ Creating derived variables with mutate()
✓ Calculating epidemiological statistics
✓ Stratification to reveal disparities
✓ Professional visualization with ggplot2
✓ Publication-ready tables
→ Make sure you ran data(NHANES) after loading the package
→ This is normal! Always use na.rm = TRUE in calculations
→ Use filter(!is.na(Variable)) to remove missing groups
# Record the R version and attached package versions for reproducibility
utils::sessionInfo()
## R version 4.4.2 (2024-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 10 x64 (build 19045)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] kableExtra_1.4.0 knitr_1.51 NHANES_2.1.0 lubridate_1.9.3
## [5] forcats_1.0.0 stringr_1.5.1 dplyr_1.1.4 purrr_1.0.2
## [9] readr_2.1.5 tidyr_1.3.1 tibble_3.2.1 ggplot2_4.0.2
## [13] tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 jsonlite_2.0.0 compiler_4.4.2 tidyselect_1.2.1
## [5] xml2_1.3.6 jquerylib_0.1.4 textshaping_0.4.0 systemfonts_1.3.1
## [9] scales_1.4.0 yaml_2.3.10 fastmap_1.2.0 R6_2.6.1
## [13] labeling_0.4.3 generics_0.1.4 svglite_2.2.2 bslib_0.10.0
## [17] pillar_1.11.1 RColorBrewer_1.1-3 tzdb_0.4.0 rlang_1.1.4
## [21] utf8_1.2.4 stringi_1.8.4 cachem_1.1.0 xfun_0.56
## [25] sass_0.4.10 S7_0.2.1 viridisLite_0.4.3 timechange_0.3.0
## [29] cli_3.6.3 withr_3.0.2 magrittr_2.0.3 digest_0.6.37
## [33] grid_4.4.2 rstudioapi_0.18.0 hms_1.1.4 lifecycle_1.0.5
## [37] vctrs_0.6.5 evaluate_1.0.5 glue_1.8.0 farver_2.1.2
## [41] rmarkdown_2.30 tools_4.4.2 pkgconfig_2.0.3 htmltools_0.5.8.1
Lab Activity 1 Complete!
Last updated: February 04, 2026