library(SASxport)
library(naniar) # for missing data
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
#library(ggplot2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(ggcorrplot)
library(fs) # read multiple files

Data source: National Health and Nutrition Examination Survey (NHANES).

NHANES is an annual survey taken by the Centers for Disease Control and Prevention. The survey is a program that is designed to assess the health and nutritional status of adults and children in the United States. The program takes a nationwide sample of about five thousand persons each year. Data collected includes demographics, dietary and health related questions and laboratory tests results. Analysis from the survey can be used to determine the risk factors for diseases.

Our main objective is to analyse different behaviors that impact BMI index. We started the analysis on body measurements, demographics, insulin, blood glucose, cholesterol ration and blood pressure. We found a major problem that almost stopped the project. But later we found some interesting points. We demonstrated all the findings with bar graphs, proportion graphs and correlation heatmaps.

For this project, we used NHANES 2017 - March 2020 Pre-Pandemic Data. There are 82 data sets in the survey. Most of the data sets have about 15 thousand observations and dozens of features. We chose blood pressure, blood glucose, insulin and cholesterol data sets as known factors to examine our methods. We chose diet behavior data sets as unknown factors to analyse.

Dataset 1 Examination - Body Measures

# read Body Measures dataset
BodyMeasures = read.xport("P_BMX.XPT")
# rename varialbes
BodyMeasures = BodyMeasures %>%
  rename(Weight = BMXWT,
         BMI = BMXBMI) %>%
  select(SEQN,Weight,BMI,BMDBMIC)
# age <18, BMI
BMIC = BodyMeasures %>% 
  mutate(Class_BMI = BMDBMIC) %>% 
  select(-BMDBMIC) %>%
  drop_na()

There’s children BMI category, but no adult BMI category in the original data

# age > 18, BMI

BMIA = BodyMeasures %>% 
  filter(is.na(BMDBMIC)) %>%
  select(-BMDBMIC) %>%
  drop_na()

BMIA$Class_BMI = cut(BMIA$BMI, 
                     breaks = c(0,18.5,25,30,max(BMIA$BMI)),
                     include.lowest = TRUE,
                     labels = c(1,2,3,4))
# combine children BMI and adult BMI two dataframes
BMI = rbind(BMIA,BMIC)

# convert Class_BMI to factor
BMI$Class_BMI = as.factor(BMI$Class_BMI) 
#[1]
levels(BMI$Class_BMI) = c("Under Weight", 
                          "Healthy Weight",
                          "Overweight",
                          "Obese")

Dataset 2 Demographics

Demographics = read.xport("P_DEMO.XPT") 
# rename variables
Demographics = Demographics %>%
  dplyr::rename(Gender = RIAGENDR,
                Age = RIDAGEYR,
                Race = RIDRETH3,
                Education = DMDEDUC2,
                Marital = DMDMARTZ,
                Country_of_Birth = DMDBORN4) %>%
  dplyr::select(SEQN,Age,Gender,Race,Country_of_Birth,
                Marital,Education) 
# convert Gender,Race, Country_of_Birth and Marital to factor
Demographics[,c(3:7)][] = lapply(Demographics[,c(3:7)],as.factor)
# rename levels
levels(Demographics$Gender) = c("Male", "Female")
levels(Demographics$Race) = c("Mexican American",
                              "Hispanic",
                              "White",
                              "Black",
                              "Asian",
                              "Others including multi-racial")
levels(Demographics$Country_of_Birth) = c("Born in US",
                                          "Others",
                                          "Refused",
                                          "don't know")
levels(Demographics$Marital) = c("Married/Living with Partner",
                                 "Divorced/Separated",
                                 "Never married",
                                 "Refused",
                                 "Don't know")
levels(Demographics$Education) = c("Less than 9th grade",
                                   "9-11th grade",
                                   "High school graduate",
                                   "Some college or AA degree",
                                   "College graduate or above",
                                   "Refused",
                                   "Don't know")
# create a new variable Age_Range based on Age variable
Demographics$Age_Range = cut(Demographics$Age,
                             breaks = c(0,18,40,60,max(Demographics$Age)),
                             include.lowest = TRUE,
                             labels = c("Minor (smaller than 18 years old)",
                                        "Young Adult (18-39)",
                                        "Middle Age (40-59)",
                                        "Seniors(greater than 60)"))
# inner join BMI and Demographics dataframes
BMI_Demo = merge(Demographics,BMI, by = "SEQN")
ggplot(BMI_Demo) +
  aes(x = Race, y = BMI, fill = Class_BMI) +
  geom_col(position = "fill") +
  scale_x_discrete(guide = guide_axis(n.dodge=3))+
#  geom_text(aes(label = n),  position = position_fill(0.5), vjust = 0, check_overlap = TRUE) +
  labs(y = "Proportions",
       title = "BMI Proportion vs. Race")

ggplot(BMI_Demo) +
  aes(x = Age_Range, y = BMI, fill = Class_BMI) +
  geom_col(position = "fill") +
  labs(y = "Proportions",
     title = "BMI Proportion vs. Age")

Dataset 3 Blood Test - Insulin

Insulin = read.xport("P_INS.XPT")
Insulin = Insulin %>%
  rename(Insulin_μU_mL = LBXIN) %>%
  select(SEQN,Insulin_μU_mL)

Dataset 4 Blood Test - Blood Glucose

Blood_Glucose= read.xport("P_GLU.XPT")
Blood_Glucose = Blood_Glucose %>%
  rename(Fasting_glucose = LBXGLU) %>%
  select(SEQN,Fasting_glucose)
# inner join insulin and glucose
GI = merge(Insulin,Blood_Glucose)
BMI_Demo[,c(1:4,8:11)]
# inner join insulin, glucose and BMI
BIG = merge(GI,BMI_Demo[,c(1:4,8:11)])
BIG = BIG %>% drop_na()
BIG = BIG %>%
  # [3] 0 = normal, 0.5 = prediabetes, 1 = diabetes
  dplyr::mutate(Diabetes = ifelse(Fasting_glucose < 100, 0,
                    ifelse(Fasting_glucose >=100 & Fasting_glucose <126, 0.5,
                    ifelse(Fasting_glucose >= 126,1,NA)))) 
# show the relation ship of BMI, insulin and blood gloucose
ggcorrplot(cor(BIG[,c(2:4,8:9,11)],method = "spearman"),
           type = "lower",
           hc.order =TRUE,
           colors = c("blue", "white", "#E46726"),
           lab = TRUE) +
  labs(title = "Correlation Heatmap of BMI, Insulin and Blood Glucose")

Dataset 5 Blood Test - Cholesterol - Total

CholesterolTotal = read.xport("P_TCHOL.XPT")

Dataset 6 Blood Test - Cholesterol - High-Density Lipoprotein

CholesterolHDL = read.xport("P_HDL.XPT")
Cholesterol = merge(CholesterolTotal,CholesterolHDL, by = "SEQN")
Cholesterol = Cholesterol %>%
  mutate(Ratio_Total_HDL = LBXTC/LBDHDD ) %>%
  select(SEQN, Ratio_Total_HDL)%>%
  drop_na()
# create new variable Class_Cholesterol
Cholesterol$Class_Cholesterol = cut(Cholesterol$Ratio_Total_HDL, 
                                    breaks = c(0,3.5,4,5,max(Cholesterol$Ratio_Total_HDL)), #[2]
                                    labels = c("The lowest risk of heart attack",
                                             "14% more likely to experience heart attack",
                                             "46% more likely to experience heart attack",
                                             "89% more likely to experience heart attack"))

Dataset 7 Blood Pressure

BloodPressure = read.xport("P_BPXO.XPT")
BloodPressure = BloodPressure %>%
  rename(Systolic1 = BPXOSY1,
         Diastolic1 = BPXODI1,
         Systolic2 = BPXOSY2,
         Diastolic2 = BPXODI2,
         Systolic3 = BPXOSY3,
         Diastolic3 = BPXODI3,
         Pules1 = BPXOPLS1,
         Pules2 = BPXOPLS2,
         Pules3 = BPXOPLS3) %>%
  select(-BPAOARM, -BPAOCSZ)
# take the average values of Systolic and Diastolic
BloodPressure = BloodPressure %>%
  mutate(Systolic = (Systolic1 + Systolic2  + Systolic3)/3,
         Diastolic = (Diastolic1 + Diastolic2 + Diastolic3)/3,
         Pules = (Pules1 + Pules2 + Pules3)/3) %>%
  select(SEQN,Systolic,Diastolic)
BMI_age = BMI_Demo %>% select(SEQN,Age,BMI,Weight,Age_Range, Class_BMI)
BPC = merge(BloodPressure,Cholesterol)
BPC_BMI = merge(BMI_age,BPC)
BPC_BMI = BPC_BMI %>% drop_na()
ggcorrplot(cor(BPC_BMI[,c(2:3,7:9)],method = "spearman"),
           type = "lower",
           hc.order =TRUE,
           colors = c("blue", "white", "#E46726"),
           lab = TRUE) +
  labs(title = "Correlation Heatmap of BMI, Blood Pressure and Cholesterol")

The correlation of BIM and Age here is 0.34, but in previous heatmap, it is 0.2. So we wanted to check the correlation in the biggest data set here.

Diet Behaviors

Read file names

file_list = dir_ls("./DATA")

Read all the files

df_list = map(file_list, read.xport)

Select diet behavior columns

for(i in seq(9,12)){
  df_list[[i]] = df_list[[i]] %>%
    select(SEQN,DBQ197,DBQ700,DBD895,DBD900,DBD905,DBD910) %>%
    mutate(Year = as.numeric(gsub("\\D","",names(df_list[i]))))
}

merge age with diet behavior into one dataframe

N2011 = merge(df_list[[9]],df_list[[1]][,c("SEQN","RIDAGEYR")])
N2013 = merge(df_list[[10]],df_list[[2]][,c("SEQN","RIDAGEYR")])
N2015 = merge(df_list[[11]],df_list[[3]][,c("SEQN","RIDAGEYR")])
N2019 = merge(df_list[[12]],df_list[[8]][,c("SEQN","RIDAGEYR")])
#combine the datasets from 2007 to 2019
Diet = do.call("rbind", list(N2011,N2013,N2015,N2019))

Rename

Diet = Diet %>%
  drop_na() %>%
  rename(Milk = DBQ197,
         How_healthy_the_diet = DBQ700,
         Meals_not_home_perpared = DBD895,
         FastFood = DBD900,
         Ready_to_eat_foods = DBD905,
         FrozenMeals = DBD910,
         Age = RIDAGEYR)
Diet = Diet %>%
  #filter out refuse and don't know observations
  filter(Milk != 9,
         Milk != 7,
         How_healthy_the_diet != 7,
         How_healthy_the_diet != 9,
         Meals_not_home_perpared != 7777,
         Meals_not_home_perpared != 9999,
         FastFood != 7777,
         FastFood != 9999,
         Ready_to_eat_foods != 7777,
         Ready_to_eat_foods !=9999,
         FrozenMeals != 7777,
         FrozenMeals != 9999) %>%
  mutate(How_healthy_the_diet1 = ifelse(How_healthy_the_diet == 5, 10, # 5 = poor
                                 ifelse(How_healthy_the_diet == 4, 11, # 4 = fair
                                 ifelse(How_healthy_the_diet == 3, 12, # 3 = good
                                 ifelse(How_healthy_the_diet == 2, 13, # 2 = very good
                                 ifelse(How_healthy_the_diet == 1, 14,NA))))), # 1 = Excellent
         Meals_not_home_perpared = replace(Meals_not_home_perpared, Meals_not_home_perpared == 5555, 22),
         FastFood = replace(FastFood, FastFood == 5555, 22),
         Ready_to_eat_foods = replace(Ready_to_eat_foods, Ready_to_eat_foods == 6666, 91),
         FrozenMeals = replace(FrozenMeals, FrozenMeals == 6666,91)) 
BMI_C = do.call("rbind", list(df_list[[4]][,c("SEQN","BMXBMI","BMDBMIC","BMXWT")],
                      df_list[[5]][,c("SEQN","BMXBMI","BMDBMIC","BMXWT")],
                      df_list[[6]][,c("SEQN","BMXBMI","BMDBMIC","BMXWT")],
                      df_list[[7]][,c("SEQN","BMXBMI","BMDBMIC","BMXWT")]))
BMI_C_Diet = merge(BMI_C, Diet)

Teenage BMI

BMIC = BMI_C_Diet%>%
  rename(Weight = BMXWT) %>%
  mutate(BMI = BMDBMIC) %>%
  drop_na() %>%
  select(c(-3,-6))

Adult BMI

BMIA = BMI_C_Diet %>%
  filter(is.na(BMDBMIC)) %>%
  mutate(BMI = ifelse( BMXBMI < 18.5,1,
               ifelse( BMXBMI >= 18.5 & BMXBMI < 25,2,
               ifelse( BMXBMI >= 25 & BMXBMI < 35, 3,
               ifelse( BMXBMI >= 35,4,NA))))) %>%
  rename(Weight = BMXWT) %>%
  select(c(-3,-6)) 
BMI = rbind(BMIC,BMIA)
BMI = BMI %>% drop_na()
ggcorrplot(cor(BMI[,c(-1,-2)],method = "spearman"),
           type = "lower",
           hc.order =TRUE,
           colors = c("blue", "white", "#E46726"),
           lab = TRUE) +
  labs(title = "Correlation Heatmap of BMI and Diet Behaviors")

[1] https://wwwn.cdc.gov/nchs/nhanes/continuousnhanes/default.aspx?cycle=2017-2020