# Load required packages
library(tidyverse)    # Data manipulation (dplyr, ggplot2, etc.)
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'tibble' was built under R version 4.5.2
## Warning: package 'tidyr' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.2
## Warning: package 'purrr' was built under R version 4.5.2
## Warning: package 'dplyr' was built under R version 4.5.2
## Warning: package 'stringr' was built under R version 4.5.2
## Warning: package 'forcats' was built under R version 4.5.2
## Warning: package 'lubridate' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(NHANES)       # NHANES dataset
## Warning: package 'NHANES' was built under R version 4.5.2
library(knitr)        # For professional table output
## Warning: package 'knitr' was built under R version 4.5.2
library(kableExtra)   # Enhanced tables
## Warning: package 'kableExtra' was built under R version 4.5.2
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
# Build the analysis table: keep only the columns used below and derive a
# hypertension flag from the first blood-pressure reading.
nhanes_analysis <- NHANES %>%
  dplyr::select(
    ID,
    # Demographics: sex, age in years, race/ethnicity, education level
    Gender, Age, Race1, Education,
    # Measurements: body mass index, resting heart rate, and the first
    # systolic / diastolic blood-pressure readings
    BMI, Pulse, BPSys1, BPDia1,
    # Self-reported: physically active, current smoker, diabetes diagnosis,
    # general health rating
    PhysActive, SmokeNow, Diabetes, HealthGen
  ) %>%
  mutate(
    # "Yes" when systolic >= 140 or diastolic >= 90. The result is NA when a
    # missing reading leaves the comparison undetermined.
    Hypertension = if_else(BPSys1 >= 140 | BPDia1 >= 90, "Yes", "No")
  )


# Complete-case version of the analysis table: keep only rows with no
# missing value in any column.
nhanes_analysis2 <- nhanes_analysis %>%
  drop_na()

🎯 Task 1: Explore Health Disparities by Education (15 minutes)

Using the nhanes_analysis data, explore:

“How does hypertension prevalence vary by education level?”

Write code to:

  1. Group by education level
  2. Calculate sample size, mean systolic BP, and percent with hypertension
  3. Print the results
# Your code here:

# One row per education level (including the NA level) with:
#   N                - number of rows in the group
#   Mean_SysBP       - mean first systolic reading, missing values ignored
#   Pct_Hypertension - percent "Yes" among rows with a non-missing flag
health_by_education <- nhanes_analysis %>%
  group_by(Education) %>%
  summarise(
    N = n(),
    Mean_SysBP = round(mean(BPSys1, na.rm = TRUE), 2),
    # %in% never yields NA, so this counts the "Yes" rows directly
    n_hypertensive = sum(Hypertension %in% "Yes"),
    n_flag_known = sum(!is.na(Hypertension))
  ) %>%
  mutate(
    Pct_Hypertension = round(n_hypertensive / n_flag_known * 100, 2)
  ) %>%
  # Drop the helper counts so only the reported columns remain
  dplyr::select(-n_hypertensive, -n_flag_known)

print(health_by_education)
## # A tibble: 6 × 4
##   Education          N Mean_SysBP Pct_Hypertension
##   <fct>          <int>      <dbl>            <dbl>
## 1 8th Grade        451       128.            28.3 
## 2 9 - 11th Grade   888       124.            17.3 
## 3 High School     1517       124.            18.9 
## 4 Some College    2267       122.            16.6 
## 5 College Grad    2098       119.            13.1 
## 6 <NA>            2779       106.             0.72

🎯 Task 2: Create a Visualization (10 minutes)

Create a bar chart showing hypertension by education level:

# Your visualization here:
# Bar chart of hypertension prevalence by education level. Rows with a
# missing education level are left out of the chart.
health_by_education %>%
  drop_na(Education) %>%
  ggplot(mapping = aes(x = Education, y = Pct_Hypertension)) +
  geom_col(alpha = 0.7, fill = "steelblue") +
  # Print each bar's percentage just above the bar
  geom_text(
    mapping = aes(label = paste0(Pct_Hypertension, "%")),
    size = 3,
    vjust = -0.5
  ) +
  # Fixed 0-50 axis range leaves headroom for the labels
  scale_y_continuous(limits = c(0, 50)) +
  labs(
    title = "Hypertension Prevalence by Education Level",
    x = "Education Level",
    y = "Percent with Hypertension (%)",
    caption = "Source: NHANES"
  ) +
  theme_minimal() +
  # Tilt the category labels so the longer level names do not overlap
  theme(axis.text.x = element_text(hjust = 1, angle = 45))


🎯 Task 3: Write a Data Interpretation (5 minutes)

Write 2-3 sentences:

“What does this pattern tell us about health disparities and social determinants?” Those whose education ended at 8th grade have the highest prevalence of hypertension (28.3%), and college graduates have the lowest (13.1%). In general, the prevalence of hypertension decreases as education level increases, although the trend is not perfectly monotonic: high school graduates (18.9%) have a slightly higher prevalence than those with a 9th–11th grade education (17.3%). Possible explanations for the higher prevalence at lower education levels include less knowledge of proper nutrition, greater stress from the lower-paying jobs associated with less education, reduced access to fresh and healthy foods because of lower income, and less time for physical activity because more time is spent working. This matters for public health because it shows where resources should be targeted: directing hypertension prevention programs, nutrition information, and access to healthy foods toward people with lower education levels could help reduce this disparity.