# Load required packages
library(tidyverse) # Data manipulation (dplyr, ggplot2, etc.)
library(NHANES) # NHANES dataset
library(knitr) # For professional table output
library(kableExtra) # Enhanced tables
# Load the NHANES data
data(NHANES)
# Examine the first few rows
head(NHANES, n = 10)
## # A tibble: 10 × 76
## ID SurveyYr Gender Age AgeDecade AgeMonths Race1 Race3 Education MaritalStatus HHIncome
## <int> <fct> <fct> <int> <fct> <int> <fct> <fct> <fct> <fct> <fct>
## 1 51624 2009_10 male 34 " 30-39" 409 White <NA> High School Married 25000-349…
## 2 51624 2009_10 male 34 " 30-39" 409 White <NA> High School Married 25000-349…
## 3 51624 2009_10 male 34 " 30-39" 409 White <NA> High School Married 25000-349…
## 4 51625 2009_10 male 4 " 0-9" 49 Other <NA> <NA> <NA> 20000-249…
## 5 51630 2009_10 female 49 " 40-49" 596 White <NA> Some College LivePartner 35000-449…
## 6 51638 2009_10 male 9 " 0-9" 115 White <NA> <NA> <NA> 75000-999…
## 7 51646 2009_10 male 8 " 0-9" 101 White <NA> <NA> <NA> 55000-649…
## 8 51647 2009_10 female 45 " 40-49" 541 White <NA> College Grad Married 75000-999…
## 9 51647 2009_10 female 45 " 40-49" 541 White <NA> College Grad Married 75000-999…
## 10 51647 2009_10 female 45 " 40-49" 541 White <NA> College Grad Married 75000-999…
## # ℹ 65 more variables: HHIncomeMid <int>, Poverty <dbl>, HomeRooms <int>, HomeOwn <fct>,
## # Work <fct>, Weight <dbl>, Length <dbl>, HeadCirc <dbl>, Height <dbl>, BMI <dbl>,
## # BMICatUnder20yrs <fct>, BMI_WHO <fct>, Pulse <int>, BPSysAve <int>, BPDiaAve <int>,
## # BPSys1 <int>, BPDia1 <int>, BPSys2 <int>, BPDia2 <int>, BPSys3 <int>, BPDia3 <int>,
## # Testosterone <dbl>, DirectChol <dbl>, TotChol <dbl>, UrineVol1 <int>, UrineFlow1 <dbl>,
## # UrineVol2 <int>, UrineFlow2 <dbl>, Diabetes <fct>, DiabetesAge <int>, HealthGen <fct>,
## # DaysPhysHlthBad <int>, DaysMentHlthBad <int>, LittleInterest <fct>, Depressed <fct>, …
# View data structure
str(NHANES)
## tibble [10,000 × 76] (S3: tbl_df/tbl/data.frame)
## $ ID : int [1:10000] 51624 51624 51624 51625 51630 51638 51646 51647 51647 51647 ...
## $ SurveyYr : Factor w/ 2 levels "2009_10","2011_12": 1 1 1 1 1 1 1 1 1 1 ...
## $ Gender : Factor w/ 2 levels "female","male": 2 2 2 2 1 2 2 1 1 1 ...
## $ Age : int [1:10000] 34 34 34 4 49 9 8 45 45 45 ...
## $ AgeDecade : Factor w/ 8 levels " 0-9"," 10-19",..: 4 4 4 1 5 1 1 5 5 5 ...
## $ AgeMonths : int [1:10000] 409 409 409 49 596 115 101 541 541 541 ...
## $ Race1 : Factor w/ 5 levels "Black","Hispanic",..: 4 4 4 5 4 4 4 4 4 4 ...
## $ Race3 : Factor w/ 6 levels "Asian","Black",..: NA NA NA NA NA NA NA NA NA NA ...
## $ Education : Factor w/ 5 levels "8th Grade","9 - 11th Grade",..: 3 3 3 NA 4 NA NA 5 5 5 ...
## $ MaritalStatus : Factor w/ 6 levels "Divorced","LivePartner",..: 3 3 3 NA 2 NA NA 3 3 3 ...
## $ HHIncome : Factor w/ 12 levels " 0-4999"," 5000-9999",..: 6 6 6 5 7 11 9 11 11 11 ...
## $ HHIncomeMid : int [1:10000] 30000 30000 30000 22500 40000 87500 60000 87500 87500 87500 ...
## $ Poverty : num [1:10000] 1.36 1.36 1.36 1.07 1.91 1.84 2.33 5 5 5 ...
## $ HomeRooms : int [1:10000] 6 6 6 9 5 6 7 6 6 6 ...
## $ HomeOwn : Factor w/ 3 levels "Own","Rent","Other": 1 1 1 1 2 2 1 1 1 1 ...
## $ Work : Factor w/ 3 levels "Looking","NotWorking",..: 2 2 2 NA 2 NA NA 3 3 3 ...
## $ Weight : num [1:10000] 87.4 87.4 87.4 17 86.7 29.8 35.2 75.7 75.7 75.7 ...
## $ Length : num [1:10000] NA NA NA NA NA NA NA NA NA NA ...
## $ HeadCirc : num [1:10000] NA NA NA NA NA NA NA NA NA NA ...
## $ Height : num [1:10000] 165 165 165 105 168 ...
## $ BMI : num [1:10000] 32.2 32.2 32.2 15.3 30.6 ...
## $ BMICatUnder20yrs: Factor w/ 4 levels "UnderWeight",..: NA NA NA NA NA NA NA NA NA NA ...
## $ BMI_WHO : Factor w/ 4 levels "12.0_18.5","18.5_to_24.9",..: 4 4 4 1 4 1 2 3 3 3 ...
## $ Pulse : int [1:10000] 70 70 70 NA 86 82 72 62 62 62 ...
## $ BPSysAve : int [1:10000] 113 113 113 NA 112 86 107 118 118 118 ...
## $ BPDiaAve : int [1:10000] 85 85 85 NA 75 47 37 64 64 64 ...
## $ BPSys1 : int [1:10000] 114 114 114 NA 118 84 114 106 106 106 ...
## $ BPDia1 : int [1:10000] 88 88 88 NA 82 50 46 62 62 62 ...
## $ BPSys2 : int [1:10000] 114 114 114 NA 108 84 108 118 118 118 ...
## $ BPDia2 : int [1:10000] 88 88 88 NA 74 50 36 68 68 68 ...
## $ BPSys3 : int [1:10000] 112 112 112 NA 116 88 106 118 118 118 ...
## $ BPDia3 : int [1:10000] 82 82 82 NA 76 44 38 60 60 60 ...
## $ Testosterone : num [1:10000] NA NA NA NA NA NA NA NA NA NA ...
## $ DirectChol : num [1:10000] 1.29 1.29 1.29 NA 1.16 1.34 1.55 2.12 2.12 2.12 ...
## $ TotChol : num [1:10000] 3.49 3.49 3.49 NA 6.7 4.86 4.09 5.82 5.82 5.82 ...
## $ UrineVol1 : int [1:10000] 352 352 352 NA 77 123 238 106 106 106 ...
## $ UrineFlow1 : num [1:10000] NA NA NA NA 0.094 ...
## $ UrineVol2 : int [1:10000] NA NA NA NA NA NA NA NA NA NA ...
## $ UrineFlow2 : num [1:10000] NA NA NA NA NA NA NA NA NA NA ...
## $ Diabetes : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ DiabetesAge : int [1:10000] NA NA NA NA NA NA NA NA NA NA ...
## $ HealthGen : Factor w/ 5 levels "Excellent","Vgood",..: 3 3 3 NA 3 NA NA 2 2 2 ...
## $ DaysPhysHlthBad : int [1:10000] 0 0 0 NA 0 NA NA 0 0 0 ...
## $ DaysMentHlthBad : int [1:10000] 15 15 15 NA 10 NA NA 3 3 3 ...
## $ LittleInterest : Factor w/ 3 levels "None","Several",..: 3 3 3 NA 2 NA NA 1 1 1 ...
## $ Depressed : Factor w/ 3 levels "None","Several",..: 2 2 2 NA 2 NA NA 1 1 1 ...
## $ nPregnancies : int [1:10000] NA NA NA NA 2 NA NA 1 1 1 ...
## $ nBabies : int [1:10000] NA NA NA NA 2 NA NA NA NA NA ...
## $ Age1stBaby : int [1:10000] NA NA NA NA 27 NA NA NA NA NA ...
## $ SleepHrsNight : int [1:10000] 4 4 4 NA 8 NA NA 8 8 8 ...
## $ SleepTrouble : Factor w/ 2 levels "No","Yes": 2 2 2 NA 2 NA NA 1 1 1 ...
## $ PhysActive : Factor w/ 2 levels "No","Yes": 1 1 1 NA 1 NA NA 2 2 2 ...
## $ PhysActiveDays : int [1:10000] NA NA NA NA NA NA NA 5 5 5 ...
## $ TVHrsDay : Factor w/ 7 levels "0_hrs","0_to_1_hr",..: NA NA NA NA NA NA NA NA NA NA ...
## $ CompHrsDay : Factor w/ 7 levels "0_hrs","0_to_1_hr",..: NA NA NA NA NA NA NA NA NA NA ...
## $ TVHrsDayChild : int [1:10000] NA NA NA 4 NA 5 1 NA NA NA ...
## $ CompHrsDayChild : int [1:10000] NA NA NA 1 NA 0 6 NA NA NA ...
## $ Alcohol12PlusYr : Factor w/ 2 levels "No","Yes": 2 2 2 NA 2 NA NA 2 2 2 ...
## $ AlcoholDay : int [1:10000] NA NA NA NA 2 NA NA 3 3 3 ...
## $ AlcoholYear : int [1:10000] 0 0 0 NA 20 NA NA 52 52 52 ...
## $ SmokeNow : Factor w/ 2 levels "No","Yes": 1 1 1 NA 2 NA NA NA NA NA ...
## $ Smoke100 : Factor w/ 2 levels "No","Yes": 2 2 2 NA 2 NA NA 1 1 1 ...
## $ Smoke100n : Factor w/ 2 levels "Non-Smoker","Smoker": 2 2 2 NA 2 NA NA 1 1 1 ...
## $ SmokeAge : int [1:10000] 18 18 18 NA 38 NA NA NA NA NA ...
## $ Marijuana : Factor w/ 2 levels "No","Yes": 2 2 2 NA 2 NA NA 2 2 2 ...
## $ AgeFirstMarij : int [1:10000] 17 17 17 NA 18 NA NA 13 13 13 ...
## $ RegularMarij : Factor w/ 2 levels "No","Yes": 1 1 1 NA 1 NA NA 1 1 1 ...
## $ AgeRegMarij : int [1:10000] NA NA NA NA NA NA NA NA NA NA ...
## $ HardDrugs : Factor w/ 2 levels "No","Yes": 2 2 2 NA 2 NA NA 1 1 1 ...
## $ SexEver : Factor w/ 2 levels "No","Yes": 2 2 2 NA 2 NA NA 2 2 2 ...
## $ SexAge : int [1:10000] 16 16 16 NA 12 NA NA 13 13 13 ...
## $ SexNumPartnLife : int [1:10000] 8 8 8 NA 10 NA NA 20 20 20 ...
## $ SexNumPartYear : int [1:10000] 1 1 1 NA 1 NA NA 0 0 0 ...
## $ SameSex : Factor w/ 2 levels "No","Yes": 1 1 1 NA 2 NA NA 2 2 2 ...
## $ SexOrientation : Factor w/ 3 levels "Bisexual","Heterosexual",..: 2 2 2 NA 2 NA NA 1 1 1 ...
## $ PregnantNow : Factor w/ 3 levels "Yes","No","Unknown": NA NA NA NA NA NA NA NA NA NA ...
# Dimensions: rows (observations) and columns (variables)
dim(NHANES)
## [1] 10000 76
# What variables do we have?
names(NHANES)
## [1] "ID" "SurveyYr" "Gender" "Age" "AgeDecade"
## [6] "AgeMonths" "Race1" "Race3" "Education" "MaritalStatus"
## [11] "HHIncome" "HHIncomeMid" "Poverty" "HomeRooms" "HomeOwn"
## [16] "Work" "Weight" "Length" "HeadCirc" "Height"
## [21] "BMI" "BMICatUnder20yrs" "BMI_WHO" "Pulse" "BPSysAve"
## [26] "BPDiaAve" "BPSys1" "BPDia1" "BPSys2" "BPDia2"
## [31] "BPSys3" "BPDia3" "Testosterone" "DirectChol" "TotChol"
## [36] "UrineVol1" "UrineFlow1" "UrineVol2" "UrineFlow2" "Diabetes"
## [41] "DiabetesAge" "HealthGen" "DaysPhysHlthBad" "DaysMentHlthBad" "LittleInterest"
## [46] "Depressed" "nPregnancies" "nBabies" "Age1stBaby" "SleepHrsNight"
## [51] "SleepTrouble" "PhysActive" "PhysActiveDays" "TVHrsDay" "CompHrsDay"
## [56] "TVHrsDayChild" "CompHrsDayChild" "Alcohol12PlusYr" "AlcoholDay" "AlcoholYear"
## [61] "SmokeNow" "Smoke100" "Smoke100n" "SmokeAge" "Marijuana"
## [66] "AgeFirstMarij" "RegularMarij" "AgeRegMarij" "HardDrugs" "SexEver"
## [71] "SexAge" "SexNumPartnLife" "SexNumPartYear" "SameSex" "SexOrientation"
## [76] "PregnantNow"
# Count missing values in each column
missing_summary <- data.frame(
Variable = names(NHANES),
Missing_Count = colSums(is.na(NHANES)),
Missing_Percent = round(colSums(is.na(NHANES)) / nrow(NHANES) * 100, 2)
)
# Show variables with the most missing data
print(missing_summary[order(-missing_summary$Missing_Count), ][1:15, ])
## Variable Missing_Count Missing_Percent
## HeadCirc HeadCirc 9912 99.12
## Length Length 9457 94.57
## DiabetesAge DiabetesAge 9371 93.71
## TVHrsDayChild TVHrsDayChild 9347 93.47
## CompHrsDayChild CompHrsDayChild 9347 93.47
## BMICatUnder20yrs BMICatUnder20yrs 8726 87.26
## AgeRegMarij AgeRegMarij 8634 86.34
## UrineFlow2 UrineFlow2 8524 85.24
## UrineVol2 UrineVol2 8522 85.22
## PregnantNow PregnantNow 8304 83.04
## Age1stBaby Age1stBaby 8116 81.16
## nBabies nBabies 7584 75.84
## nPregnancies nPregnancies 7396 73.96
## AgeFirstMarij AgeFirstMarij 7109 71.09
## SmokeAge SmokeAge 6920 69.20
# Select key variables for analysis
nhanes_analysis <- NHANES %>%
dplyr::select(
ID,
Gender, # Sex (Male/Female)
Age, # Age in years
Race1, # Race/ethnicity
Education, # Education level
BMI, # Body Mass Index
Pulse, # Resting heart rate
BPSys1, # Systolic blood pressure (1st reading)
BPDia1, # Diastolic blood pressure (1st reading)
PhysActive, # Physically active (Yes/No)
SmokeNow, # Current smoking status
Diabetes, # Diabetes diagnosis (Yes/No)
HealthGen # General health rating
) %>%
# Create a binary hypertension indicator (BPSys1 >= 140 OR BPDia1 >= 90)
mutate(
Hypertension = factor(ifelse(BPSys1 >= 140 | BPDia1 >= 90, "Yes", "No"))
)
# Remove rows with missing values for key variables
nhanes_analysis2 <- nhanes_analysis %>%
filter(complete.cases(.)) # Complete cases only
# View the processed dataset
head(nhanes_analysis, 10)
## # A tibble: 10 × 14
## ID Gender Age Race1 Education BMI Pulse BPSys1 BPDia1 PhysActive SmokeNow Diabetes
## <int> <fct> <int> <fct> <fct> <dbl> <int> <int> <int> <fct> <fct> <fct>
## 1 51624 male 34 White High School 32.2 70 114 88 No No No
## 2 51624 male 34 White High School 32.2 70 114 88 No No No
## 3 51624 male 34 White High School 32.2 70 114 88 No No No
## 4 51625 male 4 Other <NA> 15.3 NA NA NA <NA> <NA> No
## 5 51630 female 49 White Some College 30.6 86 118 82 No Yes No
## 6 51638 male 9 White <NA> 16.8 82 84 50 <NA> <NA> No
## 7 51646 male 8 White <NA> 20.6 72 114 46 <NA> <NA> No
## 8 51647 female 45 White College Grad 27.2 62 106 62 Yes <NA> No
## 9 51647 female 45 White College Grad 27.2 62 106 62 Yes <NA> No
## 10 51647 female 45 White College Grad 27.2 62 106 62 Yes <NA> No
## # ℹ 2 more variables: HealthGen <fct>, Hypertension <fct>
# Check dimensions
dim(nhanes_analysis)
## [1] 10000 14
Research Question: Do cardiovascular health indicators and physical activity differ by gender?
# Create summary statistics grouped by gender
health_by_gender <- nhanes_analysis %>%
group_by(Gender) %>%
summarise(
N = n(),
# Descriptive statistics
Mean_Age = round(mean(Age, na.rm = TRUE), 2),
Mean_BMI = round(mean(BMI, na.rm = TRUE), 2),
Mean_SysBP = round(mean(BPSys1, na.rm = TRUE), 2),
Mean_DiaBP = round(mean(BPDia1, na.rm = TRUE), 2),
# Prevalence estimates (percentages)
Pct_Physically_Active = round(
sum(PhysActive == "Yes", na.rm = TRUE) / sum(!is.na(PhysActive)) * 100, 2),
Pct_Current_Smoker = round(
sum(SmokeNow == "Yes", na.rm = TRUE) / sum(!is.na(SmokeNow)) * 100, 2),
Pct_Hypertension = round(
sum(Hypertension == "Yes", na.rm = TRUE) / sum(!is.na(Hypertension)) * 100, 2),
Pct_Diabetes = round(
sum(Diabetes == "Yes", na.rm = TRUE) / sum(!is.na(Diabetes)) * 100, 2)
)
print(health_by_gender)
## # A tibble: 2 × 10
## Gender N Mean_Age Mean_BMI Mean_SysBP Mean_DiaBP Pct_Physically_Active Pct_Current_Smoker
## <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 female 5020 37.6 26.8 117. 67.0 53.2 45.5
## 2 male 4980 35.8 26.6 121. 69.5 58.6 45.8
## # ℹ 2 more variables: Pct_Hypertension <dbl>, Pct_Diabetes <dbl>
Research Question: How do health outcomes and behaviors vary across the lifespan?
#What not to do:
# Summary by age
health_by_age <- nhanes_analysis %>%
group_by(Age) %>%
summarise(
N = n(),
Mean_BMI = round(mean(BMI, na.rm = TRUE), 2),
Mean_SysBP = round(mean(BPSys1, na.rm = TRUE), 2),
Pct_Hypertension = round(
sum(Hypertension == "Yes", na.rm = TRUE) / sum(!is.na(Hypertension)) * 100, 2),
Pct_Diabetes = round(
sum(Diabetes == "Yes", na.rm = TRUE) / sum(!is.na(Diabetes)) * 100, 2),
Pct_Physically_Active = round(
sum(PhysActive == "Yes", na.rm = TRUE) / sum(!is.na(PhysActive)) * 100, 2)
)
#Do this:
# Create age groups for analysis
nhanes_analysis <- nhanes_analysis %>%
mutate(
Age_Group = cut(Age,
breaks = c(0, 20, 35, 50, 65, 100),
labels = c("18-20", "21-35", "36-50", "51-65", "65+"))
)
# Summary by age group
health_by_age <- nhanes_analysis %>%
group_by(Age_Group) %>%
summarise(
N = n(),
Mean_BMI = round(mean(BMI, na.rm = TRUE), 2),
Mean_SysBP = round(mean(BPSys1, na.rm = TRUE), 2),
Pct_Hypertension = round(
sum(Hypertension == "Yes", na.rm = TRUE) / sum(!is.na(Hypertension)) * 100, 2),
Pct_Diabetes = round(
sum(Diabetes == "Yes", na.rm = TRUE) / sum(!is.na(Diabetes)) * 100, 2),
Pct_Physically_Active = round(
sum(PhysActive == "Yes", na.rm = TRUE) / sum(!is.na(PhysActive)) * 100, 2)
)
print(health_by_age)
## # A tibble: 6 × 7
## Age_Group N Mean_BMI Mean_SysBP Pct_Hypertension Pct_Diabetes Pct_Physically_Active
## <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 18-20 2769 20.8 107. 0.73 0.65 76.8
## 2 21-35 1984 28.2 114. 4.72 1.46 61.8
## 3 36-50 2138 29.1 118. 11.4 6.69 55.5
## 4 51-65 1784 29.4 127. 22.6 17.7 47.5
## 5 65+ 1188 28.6 135. 40.4 21.4 37.2
## 6 <NA> 137 NaN NaN NaN NaN NaN
Research Question: Are there racial/ethnic differences in hypertension that differ by gender?
# Stratified by gender and race/ethnicity
health_stratified <- nhanes_analysis %>%
group_by(Gender, Race1) %>%
summarise(
N = n(),
Mean_SysBP = round(mean(BPSys1, na.rm = TRUE), 2),
Mean_BMI = round(mean(BMI, na.rm = TRUE), 2),
Pct_Hypertension = round(
sum(Hypertension == "Yes", na.rm = TRUE) / sum(!is.na(Hypertension)) * 100, 2),
.groups = 'drop'
)
print(health_stratified)
## # A tibble: 10 × 6
## Gender Race1 N Mean_SysBP Mean_BMI Pct_Hypertension
## <fct> <fct> <int> <dbl> <dbl> <dbl>
## 1 female Black 614 120. 29.8 17.4
## 2 female Hispanic 320 114. 26.3 8.88
## 3 female Mexican 452 112. 26.4 7.08
## 4 female White 3221 118. 26.6 13.8
## 5 female Other 413 113. 24.0 10
## 6 male Black 583 123. 26.3 17.3
## 7 male Hispanic 290 119. 26.4 10.1
## 8 male Mexican 563 119. 26.6 12.4
## 9 male White 3151 121. 26.8 15.2
## 10 male Other 393 118. 24.8 12.8
# Create visualization
health_by_age_gender <- nhanes_analysis %>%
group_by(Age_Group, Gender) %>%
summarise(
Pct_Hypertension = round(
sum(Hypertension == "Yes", na.rm = TRUE) / sum(!is.na(Hypertension)) * 100, 2),
.groups = 'drop'
)
ggplot(health_by_age_gender, aes(x = Age_Group, y = Pct_Hypertension, fill = Gender)) +
geom_col(position = "dodge", alpha = 0.8) +
labs(
title = "Hypertension Prevalence by Age and Gender",
subtitle = "NHANES Data",
x = "Age Group",
y = "Prevalence (%)",
fill = "Gender",
caption = "Source: National Health and Nutrition Examination Survey"
) +
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"),
axis.text.x = element_text(angle = 45, hjust = 1)
)
## Physical Activity by Age Group
activity_by_age <- health_by_age %>%
filter(!is.na(Age_Group))
ggplot(activity_by_age, aes(x = Age_Group, y = Pct_Physically_Active)) +
geom_col(fill = "#2ecc71", alpha = 0.8) +
geom_text(aes(label = paste0(Pct_Physically_Active, "%")),
vjust = -0.5, size = 3) +
labs(
title = "Physical Activity Participation by Age Group",
subtitle = "NHANES Data",
x = "Age Group",
y = "Prevalence (%)",
caption = "Source: National Health and Nutrition Examination Survey"
) +
ylim(0, 70) +
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"),
axis.text.x = element_text(angle = 45, hjust = 1)
)
hypertension_by_race <- nhanes_analysis %>%
group_by(Race1) %>%
summarise(
Pct_Hypertension = round(
sum(Hypertension == "Yes", na.rm = TRUE) / sum(!is.na(Hypertension)) * 100, 2),
N = n(),
.groups = 'drop'
) %>%
filter(!is.na(Race1))
ggplot(hypertension_by_race, aes(x = Race1, y = Pct_Hypertension)) +
geom_col(fill = "#e74c3c", alpha = 0.8) +
geom_text(aes(label = paste0(Pct_Hypertension, "%")),
vjust = -0.5, size = 3) +
labs(
title = "Hypertension Prevalence by Race/Ethnicity",
subtitle = "NHANES Data",
x = "Race/Ethnicity",
y = "Prevalence (%)",
caption = "Source: National Health and Nutrition Examination Survey"
) +
ylim(0, 50) +
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"),
axis.text.x = element_text(angle = 45, hjust = 1)
)
# Create summary table by gender
summary_table <- health_by_gender %>%
select(
Gender,
N,
`Mean Age` = Mean_Age,
`Mean BMI` = Mean_BMI,
`Mean SysBP` = Mean_SysBP,
`% Hypertension` = Pct_Hypertension,
`% Physically Active` = Pct_Physically_Active
)
# Display with kableExtra
kable(summary_table,
caption = "Cardiovascular Health and Physical Activity by Gender",
format = "html") %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE
)
| Gender | N | Mean Age | Mean BMI | Mean SysBP | % Hypertension | % Physically Active |
|---|---|---|---|---|---|---|
| female | 5020 | 37.64 | 26.77 | 117.3 | 13.07 | 53.18 |
| male | 4980 | 35.83 | 26.55 | 120.9 | 14.68 | 58.56 |
“How does hypertension prevalence vary by education level?”
Code:
# Your code here:
health_by_education <- nhanes_analysis %>%
group_by(Education) %>%
summarise(
N = n(),
Mean_SysBP = round(mean(BPSys1, na.rm = TRUE), 2),
Pct_Hypertension = round(
sum(Hypertension == "Yes", na.rm = TRUE) / sum(!is.na(Hypertension)) * 100, 2)
)
print(health_by_education)
Answer: The lowest mean systolic BP is 106.2 and the extremely low hypertension prevalence is 0.72%. This indicates that individuals with less education have approximately 9mmHg higher systolic pressure compared to college graduates. Therefore, hypertension prevalence is highest among those with less than a high school education and steadily declines with increasing educational attainment, reaching its lowest among college students. This means that, more education often means better understanding of health risks and how to manage them.
Create a bar chart showing hypertension by education level:
# Your visualization here:
health_by_education %>%
filter(!is.na(Education)) %>%
ggplot(aes(x = Education, y = Pct_Hypertension)) +
geom_col(fill = "steelblue", alpha = 0.7) +
geom_text(aes(label = paste0(Pct_Hypertension, "%")),
vjust = -0.5, size = 3) +
labs(
title = "Hypertension Prevalence by Education Level",
x = "Education Level",
y = "Percent with Hypertension (%)",
caption = "Source: NHANES"
) +
ylim(0, 50) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Explain:
The bar chat clearly shows the trend. The higher the educational level, the lower the lower the prevalence of hypertension.
Write 2-3 sentences:
“What does this pattern tell us about health disparities and social determinants?”
Consider: - Which education groups have highest/lowest hypertension? - What might explain these differences? - Why does this matter for public health?
Answer: The pattern clearly confirms the fundamental truth about the social determinants of health and health disparities.
Key Identifiers: 1. Education is a “core cause” which greatly impacts the health of a population through multiple pathways.