# Setup ----
# Install NHANES only when it is not already available, so that re-running
# the script does not re-download the package on every execution.
if (!requireNamespace("NHANES", quietly = TRUE)) {
  install.packages("NHANES")
}
library(tidyverse) # Data manipulation (dplyr, ggplot2, etc.)
library(NHANES) # NHANES dataset
library(knitr) # For professional table output
library(kableExtra) # Enhanced tables
# Load the NHANES data set into the workspace and preview its first 10 rows.
data(NHANES)
head(NHANES, n = 10)
## # A tibble: 10 × 76
## ID SurveyYr Gender Age AgeDecade AgeMonths Race1 Race3 Education
## <int> <fct> <fct> <int> <fct> <int> <fct> <fct> <fct>
## 1 51624 2009_10 male 34 " 30-39" 409 White <NA> High School
## 2 51624 2009_10 male 34 " 30-39" 409 White <NA> High School
## 3 51624 2009_10 male 34 " 30-39" 409 White <NA> High School
## 4 51625 2009_10 male 4 " 0-9" 49 Other <NA> <NA>
## 5 51630 2009_10 female 49 " 40-49" 596 White <NA> Some College
## 6 51638 2009_10 male 9 " 0-9" 115 White <NA> <NA>
## 7 51646 2009_10 male 8 " 0-9" 101 White <NA> <NA>
## 8 51647 2009_10 female 45 " 40-49" 541 White <NA> College Grad
## 9 51647 2009_10 female 45 " 40-49" 541 White <NA> College Grad
## 10 51647 2009_10 female 45 " 40-49" 541 White <NA> College Grad
## # ℹ 67 more variables: MaritalStatus <fct>, HHIncome <fct>, HHIncomeMid <int>,
## # Poverty <dbl>, HomeRooms <int>, HomeOwn <fct>, Work <fct>, Weight <dbl>,
## # Length <dbl>, HeadCirc <dbl>, Height <dbl>, BMI <dbl>,
## # BMICatUnder20yrs <fct>, BMI_WHO <fct>, Pulse <int>, BPSysAve <int>,
## # BPDiaAve <int>, BPSys1 <int>, BPDia1 <int>, BPSys2 <int>, BPDia2 <int>,
## # BPSys3 <int>, BPDia3 <int>, Testosterone <dbl>, DirectChol <dbl>,
## # TotChol <dbl>, UrineVol1 <int>, UrineFlow1 <dbl>, UrineVol2 <int>, …
# Build the analysis data set ----
# Step 1: keep only the identifier and the columns used in this analysis.
nhanes_analysis <- NHANES %>%
  select(all_of(c(
    "ID",
    "Gender",     # Sex (Male/Female)
    "Age",        # Age in years
    "Race1",      # Race/ethnicity
    "Education",  # Education level
    "BMI",        # Body Mass Index
    "Pulse",      # Resting heart rate
    "BPSys1",     # Systolic blood pressure (1st reading)
    "BPDia1",     # Diastolic blood pressure (1st reading)
    "PhysActive", # Physically active (Yes/No)
    "SmokeNow",   # Current smoking status
    "Diabetes",   # Diabetes diagnosis (Yes/No)
    "HealthGen"   # General health rating
  )))
# Step 2: flag hypertension from the first blood-pressure reading.
# "Yes" when systolic >= 140 or diastolic >= 90, otherwise "No"; the flag is
# NA when a missing reading leaves the comparison undetermined.
nhanes_analysis <- nhanes_analysis %>%
  mutate(
    Hypertension = ifelse(BPDia1 >= 90 | BPSys1 >= 140, "Yes", "No")
  )
# Summarise blood pressure and hypertension by education level ----
# One row per Education level (rows with missing Education form their own
# group):
#   N                - number of rows in the group
#   Mean_SysBP       - mean of BPSys1, ignoring NA, rounded to 2 decimals
#   Pct_Hypertension - percent "Yes" among rows whose Hypertension flag is
#                      not missing, rounded to 2 decimals
health_by_education <- nhanes_analysis %>%
  group_by(Education) %>%
  summarise(
    N = n(),
    Mean_SysBP = round(mean(BPSys1, na.rm = TRUE), digits = 2),
    Pct_Hypertension = round(
      # %in% never yields NA, so missing flags simply do not count as "Yes"
      sum(Hypertension %in% "Yes") / sum(!is.na(Hypertension)) * 100,
      digits = 2
    )
  )
print(health_by_education)
## # A tibble: 6 × 4
## Education N Mean_SysBP Pct_Hypertension
## <fct> <int> <dbl> <dbl>
## 1 8th Grade 451 128. 28.3
## 2 9 - 11th Grade 888 124. 17.3
## 3 High School 1517 124. 18.9
## 4 Some College 2267 122. 16.6
## 5 College Grad 2098 119. 13.1
## 6 <NA> 2779 106. 0.72
# Bar chart of hypertension prevalence by education level ----
# Rows with missing Education are removed before plotting, and each bar is
# annotated with its percentage just above the bar.
health_by_education %>%
  filter(!is.na(Education)) %>%
  mutate(pct_label = paste0(Pct_Hypertension, "%")) %>%
  ggplot(aes(x = Education, y = Pct_Hypertension)) +
  geom_col(fill = "darkred", alpha = 0.7) +
  geom_text(aes(label = pct_label), vjust = -0.5, size = 3) +
  lims(y = c(0, 50)) + # fixed 0-50 range on the y-axis
  labs(
    caption = "Source: NHANES",
    y = "Percent with Hypertension (%)",
    x = "Education Level",
    title = "Hypertension Prevalence by Education Level"
  ) +
  theme_minimal() +
  # tilt the category labels so the education levels do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

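## The bar graph shows a broadly inverse association between education level
## and the prevalence of hypertension: college graduates have the lowest
## prevalence (≈13%) and those with only an eighth-grade education have the
## highest (≈28%), although the decline is not strictly monotonic (9-11th
## grade ≈17% vs. high school ≈19%). This pattern probably reflects social
## determinants of health that tend to improve with increasing educational
## attainment, such as income, work stress, health literacy, access to
## preventive treatment, and community resources. Note that these are
## unadjusted percentages based on the first blood-pressure reading, so
## differences in age between education groups may also contribute.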