library(SASxport)
library(naniar) # for missing data
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
#library(ggplot2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(ggcorrplot)
library(fs) # read multiple files
Data source: National Health and Nutrition Examination Survey (NHANES).
NHANES is an annual survey taken by the Centers for Disease Control and Prevention. The survey is a program that is designed to assess the health and nutritional status of adults and children in the United States. The program takes a nationwide sample of about five thousand persons each year. Data collected includes demographics, dietary and health related questions and laboratory tests results. Analysis from the survey can be used to determine the risk factors for diseases.
Our main objective is to analyse different behaviors that impact BMI index. We started the analysis on body measurements, demographics, insulin, blood glucose, cholesterol ration and blood pressure. We found a major problem that almost stopped the project. But later we found some interesting points. We demonstrated all the findings with bar graphs, proportion graphs and correlation heatmaps.
For this project, we used NHANES 2017 - March 2020 Pre-Pandemic Data. There are 82 data sets in the survey. Most of the data sets have about 15 thousand observations and dozens of features. We chose blood pressure, blood glucose, insulin and cholesterol data sets as known factors to examine our methods. We chose diet behavior data sets as unknown factors to analyse.
# read Body Measures dataset
BodyMeasures = read.xport("P_BMX.XPT")
# rename varialbes
BodyMeasures = BodyMeasures %>%
rename(Weight = BMXWT,
BMI = BMXBMI) %>%
select(SEQN,Weight,BMI,BMDBMIC)
# age <18, BMI
BMIC = BodyMeasures %>%
mutate(Class_BMI = BMDBMIC) %>%
select(-BMDBMIC) %>%
drop_na()
There’s children BMI category, but no adult BMI category in the original data
# age > 18, BMI
BMIA = BodyMeasures %>%
filter(is.na(BMDBMIC)) %>%
select(-BMDBMIC) %>%
drop_na()
BMIA$Class_BMI = cut(BMIA$BMI,
breaks = c(0,18.5,25,30,max(BMIA$BMI)),
include.lowest = TRUE,
labels = c(1,2,3,4))
# combine children BMI and adult BMI two dataframes
BMI = rbind(BMIA,BMIC)
# convert Class_BMI to factor
BMI$Class_BMI = as.factor(BMI$Class_BMI)
#[1]
levels(BMI$Class_BMI) = c("Under Weight",
"Healthy Weight",
"Overweight",
"Obese")
Demographics = read.xport("P_DEMO.XPT")
# rename variables
Demographics = Demographics %>%
dplyr::rename(Gender = RIAGENDR,
Age = RIDAGEYR,
Race = RIDRETH3,
Education = DMDEDUC2,
Marital = DMDMARTZ,
Country_of_Birth = DMDBORN4) %>%
dplyr::select(SEQN,Age,Gender,Race,Country_of_Birth,
Marital,Education)
# convert Gender,Race, Country_of_Birth and Marital to factor
Demographics[,c(3:7)][] = lapply(Demographics[,c(3:7)],as.factor)
# rename levels
levels(Demographics$Gender) = c("Male", "Female")
levels(Demographics$Race) = c("Mexican American",
"Hispanic",
"White",
"Black",
"Asian",
"Others including multi-racial")
levels(Demographics$Country_of_Birth) = c("Born in US",
"Others",
"Refused",
"don't know")
levels(Demographics$Marital) = c("Married/Living with Partner",
"Divorced/Separated",
"Never married",
"Refused",
"Don't know")
levels(Demographics$Education) = c("Less than 9th grade",
"9-11th grade",
"High school graduate",
"Some college or AA degree",
"College graduate or above",
"Refused",
"Don't know")
# create a new variable Age_Range based on Age variable
Demographics$Age_Range = cut(Demographics$Age,
breaks = c(0,18,40,60,max(Demographics$Age)),
include.lowest = TRUE,
labels = c("Minor (smaller than 18 years old)",
"Young Adult (18-39)",
"Middle Age (40-59)",
"Seniors(greater than 60)"))
# inner join BMI and Demographics dataframes
BMI_Demo = merge(Demographics,BMI, by = "SEQN")
ggplot(BMI_Demo) +
aes(x = Race, y = BMI, fill = Class_BMI) +
geom_col(position = "fill") +
scale_x_discrete(guide = guide_axis(n.dodge=3))+
# geom_text(aes(label = n), position = position_fill(0.5), vjust = 0, check_overlap = TRUE) +
labs(y = "Proportions",
title = "BMI Proportion vs. Race")
ggplot(BMI_Demo) +
aes(x = Age_Range, y = BMI, fill = Class_BMI) +
geom_col(position = "fill") +
labs(y = "Proportions",
title = "BMI Proportion vs. Age")
Insulin = read.xport("P_INS.XPT")
Insulin = Insulin %>%
rename(Insulin_μU_mL = LBXIN) %>%
select(SEQN,Insulin_μU_mL)
Blood_Glucose= read.xport("P_GLU.XPT")
Blood_Glucose = Blood_Glucose %>%
rename(Fasting_glucose = LBXGLU) %>%
select(SEQN,Fasting_glucose)
# inner join insulin and glucose
GI = merge(Insulin,Blood_Glucose)
BMI_Demo[,c(1:4,8:11)]
# inner join insulin, glucose and BMI
BIG = merge(GI,BMI_Demo[,c(1:4,8:11)])
BIG = BIG %>% drop_na()
BIG = BIG %>%
# [3] 0 = normal, 0.5 = prediabetes, 1 = diabetes
dplyr::mutate(Diabetes = ifelse(Fasting_glucose < 100, 0,
ifelse(Fasting_glucose >=100 & Fasting_glucose <126, 0.5,
ifelse(Fasting_glucose >= 126,1,NA))))
# show the relation ship of BMI, insulin and blood gloucose
ggcorrplot(cor(BIG[,c(2:4,8:9,11)],method = "spearman"),
type = "lower",
hc.order =TRUE,
colors = c("blue", "white", "#E46726"),
lab = TRUE) +
labs(title = "Correlation Heatmap of BMI, Insulin and Blood Glucose")
CholesterolTotal = read.xport("P_TCHOL.XPT")
CholesterolHDL = read.xport("P_HDL.XPT")
Cholesterol = merge(CholesterolTotal,CholesterolHDL, by = "SEQN")
Cholesterol = Cholesterol %>%
mutate(Ratio_Total_HDL = LBXTC/LBDHDD ) %>%
select(SEQN, Ratio_Total_HDL)%>%
drop_na()
# create new variable Class_Cholesterol
Cholesterol$Class_Cholesterol = cut(Cholesterol$Ratio_Total_HDL,
breaks = c(0,3.5,4,5,max(Cholesterol$Ratio_Total_HDL)), #[2]
labels = c("The lowest risk of heart attack",
"14% more likely to experience heart attack",
"46% more likely to experience heart attack",
"89% more likely to experience heart attack"))
BloodPressure = read.xport("P_BPXO.XPT")
BloodPressure = BloodPressure %>%
rename(Systolic1 = BPXOSY1,
Diastolic1 = BPXODI1,
Systolic2 = BPXOSY2,
Diastolic2 = BPXODI2,
Systolic3 = BPXOSY3,
Diastolic3 = BPXODI3,
Pules1 = BPXOPLS1,
Pules2 = BPXOPLS2,
Pules3 = BPXOPLS3) %>%
select(-BPAOARM, -BPAOCSZ)
# take the average values of Systolic and Diastolic
BloodPressure = BloodPressure %>%
mutate(Systolic = (Systolic1 + Systolic2 + Systolic3)/3,
Diastolic = (Diastolic1 + Diastolic2 + Diastolic3)/3,
Pules = (Pules1 + Pules2 + Pules3)/3) %>%
select(SEQN,Systolic,Diastolic)
BMI_age = BMI_Demo %>% select(SEQN,Age,BMI,Weight,Age_Range, Class_BMI)
BPC = merge(BloodPressure,Cholesterol)
BPC_BMI = merge(BMI_age,BPC)
BPC_BMI = BPC_BMI %>% drop_na()
ggcorrplot(cor(BPC_BMI[,c(2:3,7:9)],method = "spearman"),
type = "lower",
hc.order =TRUE,
colors = c("blue", "white", "#E46726"),
lab = TRUE) +
labs(title = "Correlation Heatmap of BMI, Blood Pressure and Cholesterol")
The correlation of BIM and Age here is 0.34, but in previous heatmap, it is 0.2. So we wanted to check the correlation in the biggest data set here.
Read file names
file_list = dir_ls("./DATA")
Read all the files
df_list = map(file_list, read.xport)
Select diet behavior columns
for(i in seq(9,12)){
df_list[[i]] = df_list[[i]] %>%
select(SEQN,DBQ197,DBQ700,DBD895,DBD900,DBD905,DBD910) %>%
mutate(Year = as.numeric(gsub("\\D","",names(df_list[i]))))
}
merge age with diet behavior into one dataframe
N2011 = merge(df_list[[9]],df_list[[1]][,c("SEQN","RIDAGEYR")])
N2013 = merge(df_list[[10]],df_list[[2]][,c("SEQN","RIDAGEYR")])
N2015 = merge(df_list[[11]],df_list[[3]][,c("SEQN","RIDAGEYR")])
N2019 = merge(df_list[[12]],df_list[[8]][,c("SEQN","RIDAGEYR")])
#combine the datasets from 2007 to 2019
Diet = do.call("rbind", list(N2011,N2013,N2015,N2019))
Rename
Diet = Diet %>%
drop_na() %>%
rename(Milk = DBQ197,
How_healthy_the_diet = DBQ700,
Meals_not_home_perpared = DBD895,
FastFood = DBD900,
Ready_to_eat_foods = DBD905,
FrozenMeals = DBD910,
Age = RIDAGEYR)
Diet = Diet %>%
#filter out refuse and don't know observations
filter(Milk != 9,
Milk != 7,
How_healthy_the_diet != 7,
How_healthy_the_diet != 9,
Meals_not_home_perpared != 7777,
Meals_not_home_perpared != 9999,
FastFood != 7777,
FastFood != 9999,
Ready_to_eat_foods != 7777,
Ready_to_eat_foods !=9999,
FrozenMeals != 7777,
FrozenMeals != 9999) %>%
mutate(How_healthy_the_diet1 = ifelse(How_healthy_the_diet == 5, 10, # 5 = poor
ifelse(How_healthy_the_diet == 4, 11, # 4 = fair
ifelse(How_healthy_the_diet == 3, 12, # 3 = good
ifelse(How_healthy_the_diet == 2, 13, # 2 = very good
ifelse(How_healthy_the_diet == 1, 14,NA))))), # 1 = Excellent
Meals_not_home_perpared = replace(Meals_not_home_perpared, Meals_not_home_perpared == 5555, 22),
FastFood = replace(FastFood, FastFood == 5555, 22),
Ready_to_eat_foods = replace(Ready_to_eat_foods, Ready_to_eat_foods == 6666, 91),
FrozenMeals = replace(FrozenMeals, FrozenMeals == 6666,91))
BMI_C = do.call("rbind", list(df_list[[4]][,c("SEQN","BMXBMI","BMDBMIC","BMXWT")],
df_list[[5]][,c("SEQN","BMXBMI","BMDBMIC","BMXWT")],
df_list[[6]][,c("SEQN","BMXBMI","BMDBMIC","BMXWT")],
df_list[[7]][,c("SEQN","BMXBMI","BMDBMIC","BMXWT")]))
BMI_C_Diet = merge(BMI_C, Diet)
Teenage BMI
BMIC = BMI_C_Diet%>%
rename(Weight = BMXWT) %>%
mutate(BMI = BMDBMIC) %>%
drop_na() %>%
select(c(-3,-6))
Adult BMI
BMIA = BMI_C_Diet %>%
filter(is.na(BMDBMIC)) %>%
mutate(BMI = ifelse( BMXBMI < 18.5,1,
ifelse( BMXBMI >= 18.5 & BMXBMI < 25,2,
ifelse( BMXBMI >= 25 & BMXBMI < 35, 3,
ifelse( BMXBMI >= 35,4,NA))))) %>%
rename(Weight = BMXWT) %>%
select(c(-3,-6))
BMI = rbind(BMIC,BMIA)
BMI = BMI %>% drop_na()
ggcorrplot(cor(BMI[,c(-1,-2)],method = "spearman"),
type = "lower",
hc.order =TRUE,
colors = c("blue", "white", "#E46726"),
lab = TRUE) +
labs(title = "Correlation Heatmap of BMI and Diet Behaviors")
[1] https://wwwn.cdc.gov/nchs/nhanes/continuousnhanes/default.aspx?cycle=2017-2020