EDA_NHANES

Major Updates

  • Using updated target variable
  • Impute daily/yearly alcohol consumption based on key indicator (GGT, MCV in clinical data)
  • Taking medicine status (for HBP and High chol) has been updated based on diagnosis status (legitimate NA due to no such disease now become 0)
  • Update plot for factor variables

Basic Summaries

# Check missingness and overall summary
colSums(is.na(dt1))
                     SEQN         is_sleep_deprived      daily_sleep_duration 
                        0                         0                         0 
                   gender                       age                       edu 
                        0                         0                       973 
             alc_freq_12m               avg_alc_day            high_BP_diag_1 
                     1000                      1106                        12 
             take_med_HBP                 high_chol             high_chol_med 
                        0                         0                         0 
    min_outdoors_work_day min_outdoors_not_work_day                 diag_diab 
                     4890                      4371                         4 
             insulin_take         diab_pill_for_lbp               health_diet 
                        0                         0                         2 
            not_home_meal            fast_food_meal               frozen_meal 
                       20                      2108                       121 
          gen_health_cond   seen_mental_health_prof          family_pov_index 
                        9                         3                      2234 
                      GGT                       MCV 
                     1490                      1220 
summary(dt1)
      SEQN        is_sleep_deprived daily_sleep_duration    gender    
 Min.   :109266   0:7717            Min.   : 2.000       Male  :4905  
 1st Qu.:113195   1:2359            1st Qu.: 7.000       Female:5171  
 Median :117022                     Median : 7.940                    
 Mean   :117069                     Mean   : 7.848                    
 3rd Qu.:120975                     3rd Qu.: 8.730                    
 Max.   :124822                     Max.   :14.020                    
                                                                      
      age                      edu        alc_freq_12m     avg_alc_day    
 Min.   :16.00   <9th grade      : 702   Min.   : 0.000   Min.   : 1.000  
 1st Qu.:31.00   9–11th grade    :1026   1st Qu.: 3.000   1st Qu.: 2.000  
 Median :48.00   High school grad:2187   Median : 5.000   Median : 2.000  
 Mean   :47.88   Some college/AA :2945   Mean   : 4.931   Mean   : 2.486  
 3rd Qu.:64.00   College grad+   :2243   3rd Qu.: 7.000   3rd Qu.: 3.000  
 Max.   :80.00   NA's            : 973   Max.   :10.000   Max.   :15.000  
                                         NA's   :1000     NA's   :1106    
 high_BP_diag_1 take_med_HBP high_chol  high_chol_med min_outdoors_work_day
 No  :6520      No :7330     No :6824   No :8097      Min.   :  0.0        
 Yes :3544      Yes:2746     Yes:3252   Yes:1979      1st Qu.: 15.0        
 NA's:  12                                            Median : 60.0        
                                                      Mean   :118.5        
                                                      3rd Qu.:180.0        
                                                      Max.   :480.0        
                                                      NA's   :4890         
 min_outdoors_not_work_day      diag_diab    insulin_take diab_pill_for_lbp
 Min.   :  0.0             No        :8392   No :9665     No :8964         
 1st Qu.: 60.0             Yes       :1409   Yes: 411     Yes:1112         
 Median :120.0             Borderline: 271                                 
 Mean   :148.4             NA's      :   4                                 
 3rd Qu.:240.0                                                             
 Max.   :480.0                                                             
 NA's   :4371                                                              
    health_diet   not_home_meal    fast_food_meal    frozen_meal    
 Excellent: 815   Min.   : 0.000   Min.   : 0.000   Min.   : 0.000  
 Very good:2029   1st Qu.: 1.000   1st Qu.: 0.000   1st Qu.: 0.000  
 Good     :3929   Median : 2.000   Median : 1.000   Median : 0.000  
 Fair     :2606   Mean   : 3.308   Mean   : 2.162   Mean   : 1.663  
 Poor     : 695   3rd Qu.: 4.000   3rd Qu.: 3.000   3rd Qu.: 2.000  
 NA's     :   2   Max.   :21.000   Max.   :21.000   Max.   :21.000  
                  NA's   :20       NA's   :2108     NA's   :121     
  gen_health_cond seen_mental_health_prof family_pov_index      GGT         
 Excellent:1247   No  :9019               Min.   :0.000    Min.   :   2.00  
 Very good:2794   Yes :1054               1st Qu.:1.070    1st Qu.:  14.00  
 Good     :3662   NA's:   3               Median :1.935    Median :  20.00  
 Fair     :1970                           Mean   :2.365    Mean   :  30.89  
 Poor     : 394                           3rd Qu.:3.640    3rd Qu.:  31.00  
 NA's     :   9                           Max.   :5.000    Max.   :2394.00  
                                          NA's   :2234     NA's   :1490     
      MCV        
 Min.   : 35.40  
 1st Qu.: 85.40  
 Median : 88.80  
 Mean   : 88.27  
 3rd Qu.: 91.90  
 Max.   :114.60  
 NA's   :1220    

Correlation: Numeeric Vars vs. Numeric Target (Sleep Length)

  • Appears to have no meaningful correlation
# Create a correlation table for numeric predictors with daily_sleep_duration
cor_table <- dt1 %>%
  # Select all numeric columns
  select(where(is.numeric)) %>%
  # Remove target variables and predictors we want to exclude
  select(-c(SEQN, GGT, MCV)) %>%
  # Compute correlations with daily_sleep_duration using the complete cases
  summarise(across(everything(), ~ cor(dt1$daily_sleep_duration, .,
                                       use = "complete.obs"))) %>%
  # Convert from wide to long format
  pivot_longer(cols = everything(), names_to = "Predictor", values_to = "Correlation_with_daily_sleep") %>%
  mutate(Correlation_with_daily_sleep = round(Correlation_with_daily_sleep, 3))

# Plot the correlations as a horizontal bar plot, ordering predictors by correlation value
cor_table <- cor_table %>%
  arrange(Correlation_with_daily_sleep) %>%
  mutate(Predictor = factor(Predictor, levels = Predictor))

ggplot(cor_table, aes(x = Predictor, y = Correlation_with_daily_sleep)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Correlation of Numeric Predictors with Daily Sleep Duration",
    x = "Predictor",
    y = "Correlation"
  )

Correlation: Numeeric Vars vs. Binary Target (Sleep Deprivation)

  • average alcohol intake and frequency of taking fast food meal have marginal relationship with sleep deprivation
numeric_vars <- dt1 %>% 
  select(where(is.numeric)) %>% 
  names()

# Remove the ID and the target (and any other numeric predictors you want to exclude)
predictor_vars <- setdiff(numeric_vars, c("SEQN", "daily_sleep_duration", "GGT", "MCV", "is_sleep_deprived"))

# Run a bivariate logistic regression for each numeric predictor and extract coefficient and p-value
logistic_results <- lapply(predictor_vars, function(var) {
  # Build the formula
  form <- as.formula(paste("is_sleep_deprived ~", var))
  
  # Fit the logistic regression model
  model <- glm(form, data = dt1, family = binomial)
  
  # Get the summary and extract the coefficient (for the predictor only) and its p-value
  summ <- summary(model)
  coef_val <- summ$coefficients[2, "Estimate"]
  p_val <- summ$coefficients[2, "Pr(>|z|)"]
  
  # Return a data frame with the predictor name, coefficient, and p-value
  data.frame(Predictor = var,
             Coefficient = round(coef_val, 3),
             p_value = round(p_val, 3),
             stringsAsFactors = FALSE)
})

# Combine the results into a single data frame
logistic_results_df <- do.call(rbind, logistic_results)

# Print the summary table
print(logistic_results_df)
                  Predictor Coefficient p_value
1                       age       0.004   0.001
2              alc_freq_12m      -0.005   0.515
3               avg_alc_day       0.068   0.000
4     min_outdoors_work_day       0.001   0.008
5 min_outdoors_not_work_day       0.001   0.008
6             not_home_meal       0.017   0.006
7            fast_food_meal       0.047   0.000
8               frozen_meal       0.003   0.698
9          family_pov_index      -0.026   0.127

Distribution: Categorical Vars vs. Binary Target (Sleep Deprivation)

  • Potential significant difference in terms of gender, edu level, BP, insulin, healthy diet, health condition (no sig. test yet)
# Get the names of all factor variables in dt1
factor_vars <- dt1 %>% 
  select(where(is.factor)) %>%
  names()

# Remove the target variable from the list
factor_predictors <- setdiff(factor_vars, "is_sleep_deprived")

# Loop over each factor predictor
for (varname in factor_predictors) {
  
  # Filter out rows missing either the predictor or the target
  dt_no_na <- dt1 %>%
    filter(!is.na(.data[[varname]]), !is.na(is_sleep_deprived))
  
  # Create a table of counts by predictor level and sleep deprivation status
  counts <- dt_no_na %>%
    group_by(across(all_of(varname)), is_sleep_deprived) %>%
    summarise(n = n(), .groups = "drop") %>%
    group_by(across(all_of(varname))) %>%
    mutate(percent = n / sum(n))
  
  # Generate the grouped bar plot with percentage labels above each bar
  p <- ggplot(dt_no_na, aes(x = .data[[varname]], fill = is_sleep_deprived)) +
    geom_bar(position = position_dodge(width = 0.9)) +
    geom_text(data = counts,
              aes(x = .data[[varname]], y = n, label = percent(percent, accuracy = 0.1)),
              position = position_dodge(width = 0.9),
              vjust = -0.5, size = 3) +
    labs(
      title = paste("Grouped Bar Chart of", varname, "vs. Sleep Deprivation (binary)"),
      x = varname,
      y = "Count"
    ) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  
  print(p)
}