This report explores potential exposure variables related to dietary patterns in the NHANES dataset. All of the variables are categorical.
We begin by loading the necessary libraries:
library(tidyverse)
library(here)
library(janitor)
library(patchwork)
library(rlang)
library(gt)
library(forcats)
library(labelled)
Next, we read in the cleaned and merged dataset, which combines the variables chosen by Professor Fregni and the ones chosen by the students.
df <- readRDS(here("Output", "merged_dataset_fregni_plus_students.rds")) %>%
clean_names()
To make our dietary variables more readable and meaningful, we recode
them from numeric codes to labeled factors. This includes whether the
person is currently on a diet (drqsdiet) and several types
of diets such as weight loss, low fat, low salt, etc.
recode_diet_variables <- function(df) {
# Save labels
saved_labels <- list(
currently_on_diet = var_label(df$drqsdiet),
weight_loss_diet = var_label(df$drqsdt1),
low_fat_diet = var_label(df$drqsdt2),
low_salt_diet = var_label(df$drqsdt3),
low_sugar_diet = var_label(df$drqsdt4),
low_fiber_diet = var_label(df$drqsdt5),
high_fiber_diet = var_label(df$drqsdt6),
diabetic_diet = var_label(df$drqsdt7),
weight_gain_diet = var_label(df$drqsdt8),
low_carb_diet = var_label(df$drqsdt9),
high_protein_diet = var_label(df$drqsdt10),
gluten_free_diet = var_label(df$drqsdt11),
renal_kidney_diet = var_label(df$drqsdt12),
other_special_diet = var_label(df$drqsdt91)
)
# Rename variables
df <- df %>%
rename(
currently_on_diet = drqsdiet,
weight_loss_diet = drqsdt1,
low_fat_diet = drqsdt2,
low_salt_diet = drqsdt3,
low_sugar_diet = drqsdt4,
low_fiber_diet = drqsdt5,
high_fiber_diet = drqsdt6,
diabetic_diet = drqsdt7,
weight_gain_diet = drqsdt8,
low_carb_diet = drqsdt9,
high_protein_diet = drqsdt10,
gluten_free_diet = drqsdt11,
renal_kidney_diet = drqsdt12,
other_special_diet = drqsdt91
)
# Recode values
df <- df %>%
mutate(
currently_on_diet = recode_factor(currently_on_diet, `1` = "Yes", `2` = "No", `9` = "Don't know") %>%
fct_explicit_na(na_level = "Missing"),
weight_loss_diet = recode_factor(weight_loss_diet, `1` = "Weight loss/Low calorie diet"),
low_fat_diet = recode_factor(low_fat_diet, `2` = "Low fat/Low cholesterol diet"),
low_salt_diet = recode_factor(low_salt_diet, `3` = "Low salt/Low sodium diet"),
low_sugar_diet = recode_factor(low_sugar_diet, `4` = "Sugar free/Low sugar diet"),
low_fiber_diet = recode_factor(low_fiber_diet, `5` = "Low fiber diet"),
high_fiber_diet = recode_factor(high_fiber_diet, `6` = "High fiber diet"),
diabetic_diet = recode_factor(diabetic_diet, `7` = "Diabetic diet"),
weight_gain_diet = recode_factor(weight_gain_diet, `8` = "Weight gain/Muscle building diet"),
low_carb_diet = recode_factor(low_carb_diet, `9` = "Low carbohydrate diet"),
high_protein_diet = recode_factor(high_protein_diet, `10` = "High protein diet"),
gluten_free_diet = recode_factor(gluten_free_diet, `11` = "Gluten-free/Celiac diet"),
renal_kidney_diet = recode_factor(renal_kidney_diet, `12` = "Renal/Kidney diet"),
other_special_diet = recode_factor(other_special_diet, `91` = "Other special diet")
)
# Restore labels
for (var in names(saved_labels)) {
var_label(df[[var]]) <- saved_labels[[var]]
}
return(df)
}
df <- recode_diet_variables(df)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `currently_on_diet = `%>%`(...)`.
## Caused by warning:
## ! `fct_explicit_na()` was deprecated in forcats 1.0.0.
## ℹ Please use `fct_na_value_to_level()` instead.
To evaluate internal consistency, we created a derived variable
any_specific_diet_flag that indicates whether participants
reported following at least one specific diet. We then cross-tabulated
this with the general question on whether the participant was currently
on any diet. This allows us to identify potential discrepancies—such as
individuals who reported a specific dietary pattern but did not indicate
they were currently on a diet. The table below summarizes all
combinations of these two variables:
df <- df %>%
mutate(
# Create a new factor variable indicating if any specific diet is followed (excluding 'currently_on_diet')
any_specific_diet_flag = factor(
if_else(
# Count non-NA values across all *_diet variables, excluding 'currently_on_diet'
rowSums(across(
ends_with("_diet") & !matches("^currently_on_diet$"),
~ !is.na(.)
)) > 0,
"Yes", # If any non-NA diet is present
"No" # If all are NA
),
levels = c("Yes", "No") # Set factor levels
)
)
summary_tab <- df %>%
count(currently_on_diet, any_specific_diet_flag, name = "Count") %>%
mutate(
Percent = round(100 * Count / sum(Count), 1)
) %>%
complete(
currently_on_diet,
any_specific_diet_flag,
fill = list(Count = 0, Percent = 0)
) %>%
rename(
`Currently on diet` = currently_on_diet,
`Following any specific diet` = any_specific_diet_flag
) %>%
gt() %>%
tab_header(
title = md("**Consistency Between Diet Flags**"),
subtitle = "Are there any patients flagged to be on a specific diet that didn't show up as being currently on a diet?"
) %>%
cols_align(align = "center", columns = everything()) %>%
opt_row_striping()
summary_tab
| Consistency Between Diet Flags | |||
| Are there any patients flagged to be on a specific diet that didn't show up as being currently on a diet? | |||
| Currently on diet | Following any specific diet | Count | Percent |
|---|---|---|---|
| Yes | Yes | 905 | 9.8 |
| Yes | No | 0 | 0.0 |
| No | Yes | 0 | 0.0 |
| No | No | 6773 | 73.2 |
| Don't know | Yes | 0 | 0.0 |
| Don't know | No | 50 | 0.5 |
| Missing | Yes | 0 | 0.0 |
| Missing | No | 1526 | 16.5 |
To begin, we explore the general question about whether participants
are currently following a special diet. The variable
currently_on_diet captures this information using labeled
categorical responses. The plot and table below display the distribution
of responses, including those who responded “Yes”, “No”, “Don’t know”,
or left the question unanswered.
# Frequency table and plot for the general diet question
diet_freq_tbl <- df %>%
count(currently_on_diet, name = "n") %>%
mutate(
percent = round(100 * n / sum(n), 1)
) %>%
gt() %>%
cols_label(
currently_on_diet = "Response",
n = "Frequency",
percent = "Percent (%)"
) %>%
tab_header(
title = "How Participants Responded to 'Are You Currently on a Diet?'"
) %>%
fmt_number(columns = percent, decimals = 1) %>%
cols_align(align = "center", columns = everything())
diet_freq_tbl
| How Participants Responded to 'Are You Currently on a Diet?' | ||
| Response | Frequency | Percent (%) |
|---|---|---|
| Yes | 905 | 9.8 |
| No | 6773 | 73.2 |
| Don't know | 50 | 0.5 |
| Missing | 1526 | 16.5 |
# Bar Plot
barplot_gen_variable <- df %>%
count(currently_on_diet, name = "count") %>%
mutate(
percent = round(100 * count / sum(count), 2)
) %>%
ggplot(aes(x = fct_rev(fct_infreq(currently_on_diet)), y = count)) +
geom_bar(stat = "identity", fill = "skyblue", color = "black") +
labs(
title = "How Participants Responded to 'Are You Currently on a Diet?'",
x = NULL,
y = "Count"
) +
theme_minimal() +
coord_flip()
barplot_gen_variable
To explore the distribution of special diets among respondents, we first
calculate the total number of individuals currently following a diet,
followed by a breakdown of the specific types of diets they report. We
also summarize responses from those not currently on a diet, those who
answered “Don’t know,” and those with missing data. The resulting table
presents both absolute and relative frequencies, formatted for
publication using the
gt package. Specific diets are
displayed as indented, italicized subcategories beneath the “Yes” group
for clarity.
# 1) Totals
# Calculate the number of respondents currently on a special diet ("Yes")
n_yes <- df %>% filter(currently_on_diet == "Yes") %>% nrow()
# Calculate the total number of respondents
n_total <- nrow(df)
# 2) "Yes" summary row
# Create a row summarizing the total frequency and percentage of "Yes" responses
yes_row <- tibble(
label = "Yes",
Frequency = n_yes,
`Percent (%)` = round(100 * n_yes / n_total, 1)
)
# 3) Specific diets under "Yes"
# Extract and count specific diet types among those who answered "Yes"
yes_diets <- df %>%
filter(currently_on_diet == "Yes") %>%
pivot_longer(
cols = ends_with("_diet") & !matches("^currently_on_diet$"),
names_to = "var", values_to = "diet"
) %>%
filter(!is.na(diet)) %>%
count(diet, name = "Frequency") %>%
arrange(desc(Frequency)) %>%
mutate(
label = paste0("• ", diet), # Add bullet to distinguish subcategories
`Percent (%)` = round(100 * Frequency / n_total, 1) # Percent out of total
) %>%
select(label, Frequency, `Percent (%)`)
# 4) Other categories
# Count and format the "No", "Don't know", and "Missing" responses
others <- df %>%
filter(currently_on_diet != "Yes") %>%
count(currently_on_diet, name = "Frequency") %>%
mutate(
label = as.character(currently_on_diet),
`Percent (%)` = round(100 * Frequency / n_total, 1)
) %>%
select(label, Frequency, `Percent (%)`)
# 5) Combine
# Bind the summary row, specific diets, and other categories into one table
final_tbl <- bind_rows(yes_row, yes_diets, others)
# 6) Render with gt and style the bullets
# Create gt table with proper header, alignment, and styling
final_tbl %>%
gt(rowname_col = "label") %>%
tab_stubhead(label = "Following a diet?") %>%
tab_header(title = "Absolute and relative frequencies of specific diets") %>%
cols_align(align = "center", columns = c(Frequency, `Percent (%)`)) %>%
# Indent and italicize only the bullet-labeled diet types
tab_style(
style = cell_text(style = "italic", indent = px(15), size = px(12)),
locations = cells_stub(rows = startsWith(final_tbl$label, "• "))
) %>%
# Reduce font size in data cells of bullet-labeled rows
tab_style(
style = cell_text(size = px(12)),
locations = cells_body(
columns = c(Frequency, `Percent (%)`),
rows = startsWith(final_tbl$label, "• ")
)
)
| Absolute and relative frequencies of specific diets | ||
| Following a diet? | Frequency | Percent (%) |
|---|---|---|
| Yes | 905 | 9.8 |
| • Weight loss/Low calorie diet | 475 | 5.1 |
| • Diabetic diet | 139 | 1.5 |
| • Low salt/Low sodium diet | 109 | 1.2 |
| • Low fat/Low cholesterol diet | 95 | 1.0 |
| • Low carbohydrate diet | 85 | 0.9 |
| • Other special diet | 44 | 0.5 |
| • Sugar free/Low sugar diet | 40 | 0.4 |
| • High protein diet | 26 | 0.3 |
| • Weight gain/Muscle building diet | 23 | 0.2 |
| • Gluten-free/Celiac diet | 16 | 0.2 |
| • Renal/Kidney diet | 10 | 0.1 |
| • High fiber diet | 4 | 0.0 |
| • Low fiber diet | 2 | 0.0 |
| No | 6773 | 73.2 |
| Don't know | 50 | 0.5 |
| Missing | 1526 | 16.5 |
To visualize the overlap and combinations of different specific diets among participants who reported following a special diet, we used an UpSet plot. This type of plot offers a clear summary of how frequently participants reported one or more particular dietary patterns, as well as which combinations are most common.
library(ComplexUpset)
# 1. Prepare the data (as before)
upset_data <- df %>%
filter(currently_on_diet == "Yes") %>%
mutate(across(
ends_with("_diet") & !matches("^currently_on_diet$"),
~ !is.na(.)
))
specific_diets <- upset_data %>%
select(ends_with("_diet") & !matches("^currently_on_diet$")) %>%
names()
diet_labels <- c(
weight_loss_diet = "Weight loss / Low-calorie diet",
low_fat_diet = "Low-fat / Low-cholesterol diet",
low_salt_diet = "Low-salt / Low-sodium diet",
low_sugar_diet = "Sugar-free / Low-sugar diet",
low_fiber_diet = "Low-fiber diet",
high_fiber_diet = "High-fiber diet",
diabetic_diet = "Diabetic diet",
weight_gain_diet = "Weight-gain / Muscle-building diet",
low_carb_diet = "Low-carb diet",
high_protein_diet = "High-protein diet",
gluten_free_diet = "Gluten-free / Celiac diet",
renal_kidney_diet = "Renal / Kidney diet",
other_special_diet = "Other special diet"
)
# 2. Rename only the specific diet columns
upset_data_pub <- upset_data %>%
rename_with(~ diet_labels[.x], .cols = all_of(specific_diets))
# 3. Now select ONLY the renamed columns (in order) for plotting
specific_diets_pub <- unname(diet_labels[specific_diets]) # vector of publication-ready names
# 4. UpSet plot using *only* the correct columns as sets
upset(
upset_data_pub,
specific_diets_pub,
name = "Specific Diets",
min_size = 5,
width_ratio = 0.1
) +
theme(
axis.text.x = element_blank(),
plot.title = element_text(size = 14, face = "bold")
) +
ggtitle("Overlap of Specific Diet Types Among Dieting Participants")
This plot shows the most common individual diets on the left, and the set intersections (i.e. combinations of diets) along the bottom, with bar heights indicating the number of participants in each intersection.