I used logit model to model the consumer choice Perceived health is positively correlated with health Perceived taste is positively correlated with taste Health and taste are independent with each other

In addition, healthy product (health >3)’s perceived health correlated stronger to their health level

First simulate the product

# Set seed for reproducibility
set.seed(456)

# Number of products to simulate
n_products <- 200

# Simulate health (discrete values from 1 to 5)
health <- sample(1:5, n_products, replace = TRUE)

# Create indicator for healthy products (health > 3)
is_healthy <- health > 3

# Define different correlation coefficients based on health level
# Higher correlation for healthy products (health > 3)
rho_healthy <- 0.8    # Stronger correlation for healthy products
rho_unhealthy <- 0.3  # Weaker correlation for less healthy products

# Initialize perceived_health vector
perceived_health_continuous <- numeric(n_products)

# Process healthy products (health > 3)
if (sum(is_healthy) > 0) {
  healthy_indices <- which(is_healthy)
  n_healthy <- length(healthy_indices)
  
  # Convert health to z-score for correlation purposes
  z_health_healthy <- scale(health[healthy_indices])
  
  # Generate correlated normal variable for healthy products
  cov_matrix_healthy <- matrix(c(1, rho_healthy, rho_healthy, 1), nrow = 2)
  z_vals_healthy <- mvrnorm(n_healthy, mu = c(0, 0), Sigma = cov_matrix_healthy)
  
  # Store the correlated values
  perceived_health_continuous[healthy_indices] <- z_vals_healthy[, 2]
}

# Process less healthy products (health <= 3)
if (sum(!is_healthy) > 0) {
  unhealthy_indices <- which(!is_healthy)
  n_unhealthy <- length(unhealthy_indices)
  
  # Convert health to z-score for correlation purposes
  z_health_unhealthy <- scale(health[unhealthy_indices])
  
  # Generate correlated normal variable for less healthy products
  cov_matrix_unhealthy <- matrix(c(1, rho_unhealthy, rho_unhealthy, 1), nrow = 2)
  z_vals_unhealthy <- mvrnorm(n_unhealthy, mu = c(0, 0), Sigma = cov_matrix_unhealthy)
  
  # Store the correlated values
  perceived_health_continuous[unhealthy_indices] <- z_vals_unhealthy[, 2]
}

# Normalize the entire perceived_health vector
perceived_health_continuous <- scale(perceived_health_continuous)

# Convert to discrete values (1-5) using quantiles
perceived_health <- cut(perceived_health_continuous,
                       breaks = quantile(perceived_health_continuous, probs = seq(0, 1, length.out = 6)),
                       labels = 1:5, include.lowest = TRUE)
perceived_health <- as.integer(perceived_health)

# Simulate taste (independent from health, values from 1 to 5)
taste <- sample(1:5, n_products, replace = TRUE)

# Simulate perceived_taste with correlation 0.5 with taste
rho_taste <- 0.5
z_taste <- scale(taste)
cov_matrix_taste <- matrix(c(1, rho_taste, rho_taste, 1), nrow = 2)
z_vals_taste <- mvrnorm(n_products, mu = c(0, 0), Sigma = cov_matrix_taste)
z_perceived_taste <- z_vals_taste[, 2]
perceived_taste_continuous <- scale(z_perceived_taste)

# Convert to discrete values (1-5)
perceived_taste <- cut(perceived_taste_continuous,
                      breaks = quantile(perceived_taste_continuous, probs = seq(0, 1, length.out = 6)),
                      labels = 1:5, include.lowest = TRUE)
perceived_taste <- as.integer(perceived_taste)

# Combine into a data frame
products <- data.frame(
  product_id = 1:n_products,
  health = health,
  perceived_health = perceived_health,
  taste = taste,
  perceived_taste = perceived_taste
)

# Calculate actual correlations to verify
cor_healthy <- cor(health[is_healthy], perceived_health[is_healthy])
cor_unhealthy <- cor(health[!is_healthy], perceived_health[!is_healthy])
cor_overall <- cor(health, perceived_health)
cor_taste <- cor(taste, perceived_taste)

# Print the correlations
cat("Correlation between health and perceived health (overall):", round(cor_overall, 3), "\n")
## Correlation between health and perceived health (overall): -0.087
cat("Correlation between health and perceived health (for healthy products):", round(cor_healthy, 3), "\n")
## Correlation between health and perceived health (for healthy products): 0.048
cat("Correlation between health and perceived health (for less healthy products):", round(cor_unhealthy, 3), "\n")
## Correlation between health and perceived health (for less healthy products): -0.086
cat("Correlation between taste and perceived taste (overall):", round(cor_taste, 3), "\n")
## Correlation between taste and perceived taste (overall): -0.005
# Verify independence between health and taste
health_taste_cor <- cor(health, taste)
cat("Correlation between health and taste:", round(health_taste_cor, 3), "\n")
## Correlation between health and taste: 0.028
# Remove duplicates
# Create a new dataframe with unique product characteristics (excluding product_id)
product_chars <- products[, c("health", "perceived_health", "taste", "perceived_taste")]

# Find unique combinations
unique_chars <- unique(product_chars)

# Add new product IDs
unique_products <- cbind(unique_chars, product_id = 1:nrow(unique_chars))

# Check the result
cat("Number of unique product combinations:", nrow(unique_products), "\n")
## Number of unique product combinations: 173
print(head(unique_products))
##   health perceived_health taste perceived_taste product_id
## 1      5                1     3               4          1
## 2      5                1     5               4          2
## 3      3                2     4               5          3
## 4      5                2     1               5          4
## 5      4                2     5               5          5
## 6      3                5     5               3          6

Then simulate the user

# Assuming you already have:
# - unique_products: data.frame with product_id, health, perceived_health, taste, perceived_taste
# - n_individuals = 1000

set.seed(123)

# Simulate individual-level parameters
n_individuals <- 4000
alpha <- rnorm(n_individuals, mean = 0.5, sd = 0.2)
beta_1 <- rnorm(n_individuals, mean = 1.2, sd = 0.3)  # weight on perceived health
beta_2 <- rnorm(n_individuals, mean = 0.8, sd = 0.3)  # weight on perceived taste

individuals <- data.frame(
  id = 1:n_individuals,
  alpha = alpha,
  beta_1 = beta_1,
  beta_2 = beta_2
)

# Add product ID
unique_products$product_id <- 1:nrow(unique_products)

# All person-product combinations
full_data <- expand.grid(id = individuals$id, product_id = unique_products$product_id)

# Merge individual parameters
full_data <- full_data %>%
  left_join(individuals, by = "id") %>%
  left_join(unique_products, by = "product_id")

# Simulate epsilon for each choice situation
set.seed(456)
full_data$epsilon <- rnorm(nrow(full_data), 0, 1)

# Compute perceived utility (what drives choice)
full_data$perceived_utility <- with(full_data,
  alpha + beta_1 * perceived_health + beta_2 * perceived_taste + epsilon
)

full_data$utility <- with(full_data,
  alpha + beta_1 * health + beta_2 * taste + epsilon
)
# Define logit choice function
simulate_logit_choice <- function(df, utility_column) {
  df %>%
    group_by(id) %>%
    mutate(
      exp_utility = exp(!!sym(utility_column)),
      prob = exp_utility / sum(exp_utility)
    ) %>%
    slice_sample(n = 1, weight_by = prob, replace = TRUE) %>%
    ungroup()
}
# Create outside option for each individual
outside_option <- individuals %>%
  mutate(
    product_id = 0,
    health = NA,
    perceived_health = NA,
    taste = NA,
    perceived_taste = NA,
    epsilon = rnorm(n_individuals, 0, 1),
    perceived_utility = 0,  # No beta terms
    utility = alpha + epsilon,
    utility_wlabel = alpha + epsilon,
    utility_hlabel = alpha + epsilon,
    utility_nulabel = alpha + epsilon
  )

Warning label

We apply a warning label intervention that adjusts how individuals perceive a product’s healthiness. The updated perceived health value is stored as perceived_health_with_label.

The rules are as follows:

  1. If health ∈ {1, 2}, and perceived_health ∈ {3, 4, 5 }, perceived_health_with_label is set to the 2.
  2. If perceived_health ∈ {1, 2} but health ≥ 4, then perceived_health_with_label is updated to 3.

Now, people do not make inference no health and taste

# Apply the health warning logic
full_data <- full_data %>%
  mutate(
    perceived_health_with_wlabel = case_when(
      health %in% c(1, 2) & perceived_health %in% c(3, 4, 5) ~ 2,
      perceived_health %in% c(1, 2) & health >= 3 ~ 3,
      TRUE ~ perceived_health
    )
  )

full_data$utility_wlabel <- with(full_data,
  alpha + beta_1 * perceived_health_with_wlabel + beta_2 * perceived_taste + epsilon
)

Health Label

We apply a health label intervention that modifies how individuals perceive a product’s healthiness. The adjusted perception is stored in the variable perceived_health_with_hlabel.

The rules are as follows:

  1. If health ∈ {4, 5}, but perceived_health ∈ {1, 2,3}, then then perceived_health_with_hlabel is 4
  2. If perceived_health ∈ {4, 5} but health < 4, then perceived_health_with_hlabel is updated to 3.

Now, individuals do not make inferences about the healthiness or taste beyond what is given.

# Apply the health label logic
full_data <- full_data %>%
  mutate(
    perceived_health_with_hlabel = case_when(
      health %in% c(4, 5) & perceived_health %in% c(1, 2, 3) ~ 4,
      perceived_health %in% c(4, 5) & health < 4 ~ 3,
      TRUE ~ perceived_health
    )
  )

# Calculate utility based on perceived health after applying health label
full_data$utility_hlabel <- with(full_data,
  alpha + beta_1 * perceived_health_with_hlabel + beta_2 * perceived_taste + epsilon
)

Nutrition score label

# Calculate utility based on perceived health after applying health label
full_data$utility_nulabel <- with(full_data,
  alpha + beta_1 * health + beta_2 * perceived_taste + epsilon
)
# Bind outside option into full_data
full_data_with_outside <- bind_rows(full_data, outside_option)

Simulate choices

chosen_true <- simulate_logit_choice(full_data_with_outside, "utility")
chosen <- simulate_logit_choice(full_data_with_outside, "perceived_utility")
chosen_wlabel <- simulate_logit_choice(full_data_with_outside, "utility_wlabel")
chosen_hlabel <- simulate_logit_choice(full_data_with_outside, "utility_hlabel")
chosen_nulabel <- simulate_logit_choice(full_data_with_outside, "utility_nulabel")

Comparison

library(ggplot2)

# Sum of utilities across individuals in each scenario
consumer_utility_summary <- data.frame(
  scenario = c("True Utility", "No Label", "Warning Label", "Health Label", 'Nutrition-Score Label'),
  total_utility = c(
    sum(chosen_true$utility),
    sum(chosen$utility),
    sum(chosen_wlabel$utility),
    sum(chosen_hlabel$utility),
    sum(chosen_nulabel$utility)
  )
)
# Calculate increase relative to 'No Label' condition
baseline_utility <- consumer_utility_summary$total_utility[consumer_utility_summary$scenario == "No Label"]
consumer_utility_summary$utility_increase <- consumer_utility_summary$total_utility - baseline_utility
consumer_utility_summary$percent_increase <- 100 * (consumer_utility_summary$utility_increase / baseline_utility)
library(ggplot2)

ggplot(consumer_utility_summary, aes(x = scenario, y = total_utility)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  geom_text(aes(label = ifelse(scenario == "No Label", "", 
                               paste0("+", round(percent_increase, 1), "%"))),
            vjust = -0.5, size = 5) +
  labs(
    title = "Total Consumer Utility Under Different Labeling Scenarios",
    x = "Scenario",
    y = "Total Utility"
  ) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.15)))+
  theme_minimal() +
  theme(
    text = element_text(size = 14),
    axis.text.x = element_text(angle = 15, hjust = 1)
  )

library(dplyr)

# Calculate how many products were affected by the warning label intervention
warning_label_changes <- full_data %>%
  dplyr::select(product_id, health, perceived_health, perceived_health_with_wlabel) %>%
  distinct() %>%
  filter(perceived_health != perceived_health_with_wlabel) %>%
  nrow()

# Calculate how many products were affected by the health label intervention
health_label_changes <- full_data %>%
  dplyr::select(product_id, health, perceived_health, perceived_health_with_hlabel) %>%
  distinct() %>%
  filter(perceived_health != perceived_health_with_hlabel) %>%
  nrow()

# For nutrition score label, we don't modify perceived_health directly
# Instead, we use the true health value in the utility calculation
# But we can count how many products would have a different perception if we did this
nutrition_score_changes <- full_data %>%
  dplyr::select(product_id, health, perceived_health) %>%
  distinct() %>%
  filter(health != perceived_health) %>%
  nrow()

# Print the results
cat("Number of products with updated perceived health in Warning Label scenario:", warning_label_changes, "\n")
## Number of products with updated perceived health in Warning Label scenario: 94
cat("Number of products with updated perceived health in Health Label scenario:", health_label_changes, "\n")
## Number of products with updated perceived health in Health Label scenario: 84
cat("Number of products with different true vs perceived health (Nutrition Score scenario):", nutrition_score_changes, "\n")
## Number of products with different true vs perceived health (Nutrition Score scenario): 141
# We can also calculate the percentage of products affected in each scenario
total_products <- length(unique(full_data$product_id))
cat("\nPercentage of products affected:\n")
## 
## Percentage of products affected:
cat("Warning Label scenario:", round(warning_label_changes/total_products*100, 2), "%\n")
## Warning Label scenario: 54.34 %
cat("Health Label scenario:", round(health_label_changes/total_products*100, 2), "%\n")
## Health Label scenario: 48.55 %
cat("Nutrition Score scenario:", round(nutrition_score_changes/total_products*100, 2), "%\n")
## Nutrition Score scenario: 81.5 %
# We can also examine the specific changes in perceived health for each scenario
# Warning Label changes
warning_label_details <- full_data %>%
  dplyr::select(product_id, health, perceived_health, perceived_health_with_wlabel) %>%
  distinct() %>%
  filter(perceived_health != perceived_health_with_wlabel) %>%
  mutate(change_type = case_when(
    health %in% c(1, 2) & perceived_health %in% c(3, 4, 5) ~ "Unhealthy product perceived as healthy -> corrected to 2",
    perceived_health %in% c(1, 2) & health >= 3 ~ "Healthy product perceived as unhealthy -> updated to 3",
    TRUE ~ "Other"
  ))

# Count by change type for warning label
warning_label_counts <- warning_label_details %>%
  count(change_type) %>%
  arrange(desc(n))

# Health Label changes
health_label_details <- full_data %>%
  dplyr::select(product_id, health, perceived_health, perceived_health_with_hlabel) %>%
  distinct() %>%
  filter(perceived_health != perceived_health_with_hlabel) %>%
  mutate(change_type = case_when(
    health %in% c(4, 5) & perceived_health %in% c(1, 2, 3) ~ "Healthy product perceived as less healthy -> updated to 4",
    perceived_health %in% c(4, 5) & health < 4 ~ "Less healthy product perceived as healthy -> corrected to 3",
    TRUE ~ "Other"
  ))

# Count by change type for health label
health_label_counts <- health_label_details %>%
  count(change_type) %>%
  arrange(desc(n))

# Print detailed results
cat("\nWarning Label changes by type:\n")
## 
## Warning Label changes by type:
print(warning_label_counts)
##                                                change_type  n
## 1   Healthy product perceived as unhealthy -> updated to 3 47
## 2 Unhealthy product perceived as healthy -> corrected to 2 47
cat("\nHealth Label changes by type:\n")
## 
## Health Label changes by type:
print(health_label_counts)
##                                                   change_type  n
## 1 Less healthy product perceived as healthy -> corrected to 3 44
## 2   Healthy product perceived as less healthy -> updated to 4 40

Run the choice simulation multiple times

library(dplyr)
library(ggplot2)
library(data.table) # For faster data manipulation
## Warning: package 'data.table' was built under R version 4.3.3
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(future.apply) # For parallel processing
## Warning: package 'future.apply' was built under R version 4.3.3
## Loading required package: future
## Warning: package 'future' was built under R version 4.3.3
# Install packages if needed
if (!requireNamespace("data.table", quietly = TRUE)) install.packages("data.table")
if (!requireNamespace("future.apply", quietly = TRUE)) install.packages("future.apply")

# Set up parallel processing
plan(multisession, workers = parallel::detectCores() - 1)

# More efficient logit choice function using data.table
simulate_logit_choice_fast <- function(dt, utility_column) {
  # Convert to data.table if not already
  DT <- as.data.table(dt)
  
  # Perform grouping and calculations efficiently
  result <- DT[, {
    # Calculate exponential of utility and probabilities
    exp_u <- exp(get(utility_column))
    probs <- exp_u / sum(exp_u)
    
    # Sample one row based on probabilities
    i <- sample(1:.N, 1, prob = probs)
    .SD[i]
  }, by = id]
  
  return(result)
}

# Function to run a single simulation and return results
run_single_simulation <- function(full_data_with_outside) {
  # Create data.table copy to avoid modifying original
  dt <- as.data.table(full_data_with_outside)
  
  # Simulate choices for each condition
  chosen_true <- simulate_logit_choice_fast(dt, "utility")
  chosen <- simulate_logit_choice_fast(dt, "perceived_utility")
  chosen_wlabel <- simulate_logit_choice_fast(dt, "utility_wlabel")
  chosen_hlabel <- simulate_logit_choice_fast(dt, "utility_hlabel")
  chosen_nulabel <- simulate_logit_choice_fast(dt, "utility_nulabel")
  
  # Calculate total utility for each condition (faster with data.table)
  result <- data.table(
    scenario = c("True Utility", "No Label", "Warning Label", "Health Label", "Nutrition-Score Label"),
    total_utility = c(
      sum(chosen_true$utility),
      sum(chosen$utility),
      sum(chosen_wlabel$utility),
      sum(chosen_hlabel$utility),
      sum(chosen_nulabel$utility)
    )
  )
  
  return(result)
}

# Run N simulations in parallel
run_multiple_simulations <- function(full_data_with_outside, N = 1000) {
  # Run N simulations in parallel without progress reporting
  # (removing progress reporting to avoid the error)
  all_results <- future_lapply(1:N, function(i) {
    result <- run_single_simulation(full_data_with_outside)
    result$simulation <- i
    return(result)
  }, future.seed = TRUE)
  
  # Combine all results
  combined_results <- rbindlist(all_results)
  
  # Calculate summary statistics by scenario (using data.table for speed)
  summary_stats <- combined_results[, .(
    mean_utility = mean(total_utility),
    sd_utility = sd(total_utility),
    n = .N,
    se = sd(total_utility) / sqrt(.N),
    lower_ci = mean(total_utility) - 1.96 * sd(total_utility) / sqrt(.N),
    upper_ci = mean(total_utility) + 1.96 * sd(total_utility) / sqrt(.N)
  ), by = scenario]
  
  # Calculate increases relative to "No Label"
  baseline_utility <- summary_stats[scenario == "No Label", mean_utility]
  summary_stats[, utility_increase := mean_utility - baseline_utility]
  summary_stats[, percent_increase := 100 * (utility_increase / baseline_utility)]
  
  return(summary_stats)
}

# Assuming full_data_with_outside is already loaded
# Run simulations with optimized code
results <- run_multiple_simulations(full_data_with_outside, N = 1000)

# Create the plot with error bars for confidence intervals
ggplot(results, aes(x = scenario, y = mean_utility)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  geom_errorbar(aes(ymin = lower_ci, ymax = upper_ci), width = 0.2) +
  geom_text(aes(label = ifelse(scenario == "No Label", "", 
                             paste0("+", round(percent_increase, 1), "%"))),
          vjust = -0.5, size = 5) +
  labs(
    title = "Total Consumer Utility Under Different Labeling Scenarios",
    subtitle = "Mean values with 95% confidence intervals (N = 1000 simulations)",
    x = "Scenario",
    y = "Mean Total Utility"
  ) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.15))) +
  theme_minimal() +
  theme(
    text = element_text(size = 14),
    axis.text.x = element_text(angle = 15, hjust = 1)
  )

print(results)
## Index: <scenario>
##                 scenario mean_utility sd_utility     n       se lower_ci
##                   <char>        <num>      <num> <int>    <num>    <num>
## 1:          True Utility     41793.31   85.91026  1000 2.716721 41787.98
## 2:              No Label     27172.67  146.91532  1000 4.645870 27163.56
## 3:         Warning Label     33267.55  131.31577  1000 4.152569 33259.41
## 4:          Health Label     34121.06  139.81367  1000 4.421296 34112.39
## 5: Nutrition-Score Label     37492.58  113.99588  1000 3.604866 37485.51
##    upper_ci utility_increase percent_increase
##       <num>            <num>            <num>
## 1: 41798.63        14620.637         53.80641
## 2: 27181.77            0.000          0.00000
## 3: 33275.69         6094.882         22.43019
## 4: 34129.72         6948.390         25.57125
## 5: 37499.64        10319.907         37.97900