EDA_NHANES

Major Updates

Using updated target variable
Impute daily/yearly alcohol consumption based on key indicator (GGT, MCV in clinical data)
Taking medicine status (for HBP and High chol) has been updated based on diagnosis status (legitimate NA due to no such disease now become 0)
Update plot for factor variables

Basic Summaries

# Check missingness and overall summary
colSums(is.na(dt1))

                     SEQN         is_sleep_deprived      daily_sleep_duration 
                        0                         0                         0 
                   gender                       age                       edu 
                        0                         0                       973 
             alc_freq_12m               avg_alc_day            high_BP_diag_1 
                     1000                      1106                        12 
             take_med_HBP                 high_chol             high_chol_med 
                        0                         0                         0 
    min_outdoors_work_day min_outdoors_not_work_day                 diag_diab 
                     4890                      4371                         4 
             insulin_take         diab_pill_for_lbp               health_diet 
                        0                         0                         2 
            not_home_meal            fast_food_meal               frozen_meal 
                       20                      2108                       121 
          gen_health_cond   seen_mental_health_prof          family_pov_index 
                        9                         3                      2234 
                      GGT                       MCV 
                     1490                      1220

summary(dt1)

      SEQN        is_sleep_deprived daily_sleep_duration    gender    
 Min.   :109266   0:7717            Min.   : 2.000       Male  :4905  
 1st Qu.:113195   1:2359            1st Qu.: 7.000       Female:5171  
 Median :117022                     Median : 7.940                    
 Mean   :117069                     Mean   : 7.848                    
 3rd Qu.:120975                     3rd Qu.: 8.730                    
 Max.   :124822                     Max.   :14.020                    
                                                                      
      age                      edu        alc_freq_12m     avg_alc_day    
 Min.   :16.00   <9th grade      : 702   Min.   : 0.000   Min.   : 1.000  
 1st Qu.:31.00   9–11th grade    :1026   1st Qu.: 3.000   1st Qu.: 2.000  
 Median :48.00   High school grad:2187   Median : 5.000   Median : 2.000  
 Mean   :47.88   Some college/AA :2945   Mean   : 4.931   Mean   : 2.486  
 3rd Qu.:64.00   College grad+   :2243   3rd Qu.: 7.000   3rd Qu.: 3.000  
 Max.   :80.00   NA's            : 973   Max.   :10.000   Max.   :15.000  
                                         NA's   :1000     NA's   :1106    
 high_BP_diag_1 take_med_HBP high_chol  high_chol_med min_outdoors_work_day
 No  :6520      No :7330     No :6824   No :8097      Min.   :  0.0        
 Yes :3544      Yes:2746     Yes:3252   Yes:1979      1st Qu.: 15.0        
 NA's:  12                                            Median : 60.0        
                                                      Mean   :118.5        
                                                      3rd Qu.:180.0        
                                                      Max.   :480.0        
                                                      NA's   :4890         
 min_outdoors_not_work_day      diag_diab    insulin_take diab_pill_for_lbp
 Min.   :  0.0             No        :8392   No :9665     No :8964         
 1st Qu.: 60.0             Yes       :1409   Yes: 411     Yes:1112         
 Median :120.0             Borderline: 271                                 
 Mean   :148.4             NA's      :   4                                 
 3rd Qu.:240.0                                                             
 Max.   :480.0                                                             
 NA's   :4371                                                              
    health_diet   not_home_meal    fast_food_meal    frozen_meal    
 Excellent: 815   Min.   : 0.000   Min.   : 0.000   Min.   : 0.000  
 Very good:2029   1st Qu.: 1.000   1st Qu.: 0.000   1st Qu.: 0.000  
 Good     :3929   Median : 2.000   Median : 1.000   Median : 0.000  
 Fair     :2606   Mean   : 3.308   Mean   : 2.162   Mean   : 1.663  
 Poor     : 695   3rd Qu.: 4.000   3rd Qu.: 3.000   3rd Qu.: 2.000  
 NA's     :   2   Max.   :21.000   Max.   :21.000   Max.   :21.000  
                  NA's   :20       NA's   :2108     NA's   :121     
  gen_health_cond seen_mental_health_prof family_pov_index      GGT         
 Excellent:1247   No  :9019               Min.   :0.000    Min.   :   2.00  
 Very good:2794   Yes :1054               1st Qu.:1.070    1st Qu.:  14.00  
 Good     :3662   NA's:   3               Median :1.935    Median :  20.00  
 Fair     :1970                           Mean   :2.365    Mean   :  30.89  
 Poor     : 394                           3rd Qu.:3.640    3rd Qu.:  31.00  
 NA's     :   9                           Max.   :5.000    Max.   :2394.00  
                                          NA's   :2234     NA's   :1490     
      MCV        
 Min.   : 35.40  
 1st Qu.: 85.40  
 Median : 88.80  
 Mean   : 88.27  
 3rd Qu.: 91.90  
 Max.   :114.60  
 NA's   :1220

Correlation: Numeeric Vars vs. Numeric Target (Sleep Length)

Appears to have no meaningful correlation

# Create a correlation table for numeric predictors with daily_sleep_duration
cor_table <- dt1 %>%
  # Select all numeric columns
  select(where(is.numeric)) %>%
  # Remove target variables and predictors we want to exclude
  select(-c(SEQN, GGT, MCV)) %>%
  # Compute correlations with daily_sleep_duration using the complete cases
  summarise(across(everything(), ~ cor(dt1$daily_sleep_duration, .,
                                       use = "complete.obs"))) %>%
  # Convert from wide to long format
  pivot_longer(cols = everything(), names_to = "Predictor", values_to = "Correlation_with_daily_sleep") %>%
  mutate(Correlation_with_daily_sleep = round(Correlation_with_daily_sleep, 3))

# Plot the correlations as a horizontal bar plot, ordering predictors by correlation value
cor_table <- cor_table %>%
  arrange(Correlation_with_daily_sleep) %>%
  mutate(Predictor = factor(Predictor, levels = Predictor))

ggplot(cor_table, aes(x = Predictor, y = Correlation_with_daily_sleep)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Correlation of Numeric Predictors with Daily Sleep Duration",
    x = "Predictor",
    y = "Correlation"
  )

Correlation: Numeeric Vars vs. Binary Target (Sleep Deprivation)

average alcohol intake and frequency of taking fast food meal have marginal relationship with sleep deprivation

numeric_vars <- dt1 %>% 
  select(where(is.numeric)) %>% 
  names()

# Remove the ID and the target (and any other numeric predictors you want to exclude)
predictor_vars <- setdiff(numeric_vars, c("SEQN", "daily_sleep_duration", "GGT", "MCV", "is_sleep_deprived"))

# Run a bivariate logistic regression for each numeric predictor and extract coefficient and p-value
logistic_results <- lapply(predictor_vars, function(var) {
  # Build the formula
  form <- as.formula(paste("is_sleep_deprived ~", var))
  
  # Fit the logistic regression model
  model <- glm(form, data = dt1, family = binomial)
  
  # Get the summary and extract the coefficient (for the predictor only) and its p-value
  summ <- summary(model)
  coef_val <- summ$coefficients[2, "Estimate"]
  p_val <- summ$coefficients[2, "Pr(>|z|)"]
  
  # Return a data frame with the predictor name, coefficient, and p-value
  data.frame(Predictor = var,
             Coefficient = round(coef_val, 3),
             p_value = round(p_val, 3),
             stringsAsFactors = FALSE)
})

# Combine the results into a single data frame
logistic_results_df <- do.call(rbind, logistic_results)

# Print the summary table
print(logistic_results_df)

                  Predictor Coefficient p_value
1                       age       0.004   0.001
2              alc_freq_12m      -0.005   0.515
3               avg_alc_day       0.068   0.000
4     min_outdoors_work_day       0.001   0.008
5 min_outdoors_not_work_day       0.001   0.008
6             not_home_meal       0.017   0.006
7            fast_food_meal       0.047   0.000
8               frozen_meal       0.003   0.698
9          family_pov_index      -0.026   0.127

Distribution: Categorical Vars vs. Binary Target (Sleep Deprivation)

Potential significant difference in terms of gender, edu level, BP, insulin, healthy diet, health condition (no sig. test yet)

# Get the names of all factor variables in dt1
factor_vars <- dt1 %>% 
  select(where(is.factor)) %>%
  names()

# Remove the target variable from the list
factor_predictors <- setdiff(factor_vars, "is_sleep_deprived")

# Loop over each factor predictor
for (varname in factor_predictors) {
  
  # Filter out rows missing either the predictor or the target
  dt_no_na <- dt1 %>%
    filter(!is.na(.data[[varname]]), !is.na(is_sleep_deprived))
  
  # Create a table of counts by predictor level and sleep deprivation status
  counts <- dt_no_na %>%
    group_by(across(all_of(varname)), is_sleep_deprived) %>%
    summarise(n = n(), .groups = "drop") %>%
    group_by(across(all_of(varname))) %>%
    mutate(percent = n / sum(n))
  
  # Generate the grouped bar plot with percentage labels above each bar
  p <- ggplot(dt_no_na, aes(x = .data[[varname]], fill = is_sleep_deprived)) +
    geom_bar(position = position_dodge(width = 0.9)) +
    geom_text(data = counts,
              aes(x = .data[[varname]], y = n, label = percent(percent, accuracy = 0.1)),
              position = position_dodge(width = 0.9),
              vjust = -0.5, size = 3) +
    labs(
      title = paste("Grouped Bar Chart of", varname, "vs. Sleep Deprivation (binary)"),
      x = varname,
      y = "Count"
    ) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  
  print(p)
}