Simulation

I used logit model to model the consumer choice Perceived health is positively correlated with health Perceived taste is positively correlated with taste Health and taste are independent with each other

First simulate the product

library(MASS)  # for mvrnorm

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

# Set seed
set.seed(456)

# Number of products to simulate
n_products <- 200

# Simulate health (discrete values from 1 to 5)
health <- sample(1:5, n_products, replace = TRUE)

# Simulate perceived_health with correlation 0.5 with health
# Step 1: Convert health to z-score for correlation purposes
z_health <- scale(health)

# Step 2: Generate correlated normal variable
rho <- 0.5
cov_matrix <- matrix(c(1, rho, rho, 1), nrow = 2)
z_vals <- mvrnorm(n_products, mu = c(0, 0), Sigma = cov_matrix)

# Step 3: Use second correlated variable to generate perceived_health
z_perceived_health <- z_vals[, 2]
perceived_health_continuous <- scale(z_perceived_health)  # normalize
# Rank and match to discrete values
perceived_health <- cut(perceived_health_continuous,
                        breaks = quantile(perceived_health_continuous, probs = seq(0, 1, length.out = 6)),
                        labels = 1:5, include.lowest = TRUE)
perceived_health <- as.integer(perceived_health)

# Simulate taste (1 or 2)
taste <- sample(1:2, n_products, replace = TRUE)

# Correlated perceived_taste
z_taste <- scale(taste)
z_vals_taste <- mvrnorm(n_products, mu = c(0, 0), Sigma = cov_matrix)
z_perceived_taste <- z_vals_taste[, 2]
perceived_taste <- cut(z_perceived_taste,
                       breaks = quantile(z_perceived_taste, probs = seq(0, 1, length.out = 3)),
                       labels = 1:2, include.lowest = TRUE)
perceived_taste <- as.integer(perceived_taste)

# Combine into data frame
products <- data.frame(
  health = health,
  perceived_health = perceived_health,
  taste = taste,
  perceived_taste = perceived_taste
)

# Remove duplicates
unique_products <- unique(products)

# View number of unique products and a sample
cat("Number of unique products:", nrow(unique_products), "\n")

## Number of unique products: 89

head(unique_products)

##   health perceived_health taste perceived_taste
## 1      5                2     1               1
## 2      5                2     1               2
## 3      3                4     1               2
## 5      4                5     1               1
## 6      3                4     2               2
## 7      1                2     1               1

Then simulate the user

# Assuming you already have:
# - unique_products: data.frame with product_id, health, perceived_health, taste, perceived_taste
# - n_individuals = 1000

set.seed(123)

# Simulate individual-level parameters
n_individuals <- 4000
alpha <- rnorm(n_individuals, mean = 0.5, sd = 0.2)
beta_1 <- rnorm(n_individuals, mean = 1.2, sd = 0.3)  # weight on perceived health
beta_2 <- rnorm(n_individuals, mean = 0.8, sd = 0.3)  # weight on perceived taste

individuals <- data.frame(
  id = 1:n_individuals,
  alpha = alpha,
  beta_1 = beta_1,
  beta_2 = beta_2
)

# Add product ID
unique_products$product_id <- 1:nrow(unique_products)

# All person-product combinations
full_data <- expand.grid(id = individuals$id, product_id = unique_products$product_id)

# Merge individual parameters
full_data <- full_data %>%
  left_join(individuals, by = "id") %>%
  left_join(unique_products, by = "product_id")

# Simulate epsilon for each choice situation
set.seed(456)
full_data$epsilon <- rnorm(nrow(full_data), 0, 1)

# Compute perceived utility (what drives choice)
full_data$perceived_utility <- with(full_data,
  alpha + beta_1 * perceived_health + beta_2 * perceived_taste + epsilon
)

full_data$utility <- with(full_data,
  alpha + beta_1 * health + beta_2 * taste + epsilon
)

# Define logit choice function
simulate_logit_choice <- function(df, utility_column) {
  df %>%
    group_by(id) %>%
    mutate(
      exp_utility = exp(!!sym(utility_column)),
      prob = exp_utility / sum(exp_utility)
    ) %>%
    slice_sample(n = 1, weight_by = prob, replace = TRUE) %>%
    ungroup()
}

# Create outside option for each individual
outside_option <- individuals %>%
  mutate(
    product_id = 0,
    health = NA,
    perceived_health = NA,
    taste = NA,
    perceived_taste = NA,
    epsilon = rnorm(n_individuals, 0, 1),
    perceived_utility = 0,  # No beta terms
    utility = alpha + epsilon,
    utility_wlabel = alpha + epsilon,
    utility_hlabel = alpha + epsilon,
    utility_nulabel = alpha + epsilon
  )

Warning label

We apply a warning label intervention that adjusts how individuals perceive a product’s healthiness. The updated perceived health value is stored as perceived_health_with_label.

The rules are as follows:

If health ∈ {1, 2}, and perceived_health ∈ {3, 4, 5 }, perceived_health_with_label is set to the 2.
If perceived_health ∈ {1, 2} but health ≥ 4, then perceived_health_with_label is updated to 3.

Now, people do not make inference no health and taste

# Apply the health warning logic
full_data <- full_data %>%
  mutate(
    perceived_health_with_wlabel = case_when(
      health %in% c(1, 2) & perceived_health %in% c(3, 4, 5) ~ 2,
      perceived_health %in% c(1, 2) & health >= 3 ~ 3,
      TRUE ~ perceived_health
    )
  )

full_data$utility_wlabel <- with(full_data,
  alpha + beta_1 * perceived_health_with_wlabel + beta_2 * perceived_taste + epsilon
)

Health Label

We apply a health label intervention that modifies how individuals perceive a product’s healthiness. The adjusted perception is stored in the variable perceived_health_with_hlabel.

The rules are as follows:

If health ∈ {4, 5}, but perceived_health ∈ {1, 2,3}, then then perceived_health_with_hlabel is 4
If perceived_health ∈ {4, 5} but health < 4, then perceived_health_with_hlabel is updated to 3.

Now, individuals do not make inferences about the healthiness or taste beyond what is given.

# Apply the health label logic
full_data <- full_data %>%
  mutate(
    perceived_health_with_hlabel = case_when(
      health %in% c(4, 5) & perceived_health %in% c(1, 2, 3) ~ 4,
      perceived_health %in% c(4, 5) & health < 4 ~ 3,
      TRUE ~ perceived_health
    )
  )

# Calculate utility based on perceived health after applying health label
full_data$utility_hlabel <- with(full_data,
  alpha + beta_1 * perceived_health_with_hlabel + beta_2 * perceived_taste + epsilon
)

Nutrition score label

# Calculate utility based on perceived health after applying health label
full_data$utility_nulabel <- with(full_data,
  alpha + beta_1 * health + beta_2 * perceived_taste + epsilon
)

# Bind outside option into full_data
full_data_with_outside <- bind_rows(full_data, outside_option)

Simulate choices

chosen_true <- simulate_logit_choice(full_data_with_outside, "utility")
chosen <- simulate_logit_choice(full_data_with_outside, "perceived_utility")
chosen_wlabel <- simulate_logit_choice(full_data_with_outside, "utility_wlabel")
chosen_hlabel <- simulate_logit_choice(full_data_with_outside, "utility_hlabel")
chosen_nulabel <- simulate_logit_choice(full_data_with_outside, "utility_nulabel")

Comparison

library(ggplot2)

# Sum of utilities across individuals in each scenario
consumer_utility_summary <- data.frame(
  scenario = c("True Utility", "No Label", "Warning Label", "Health Label", 'Nutrition-Score Label'),
  total_utility = c(
    sum(chosen_true$utility),
    sum(chosen$utility),
    sum(chosen_wlabel$utility),
    sum(chosen_hlabel$utility),
    sum(chosen_nulabel$utility)
  )
)

# Calculate increase relative to 'No Label' condition
baseline_utility <- consumer_utility_summary$total_utility[consumer_utility_summary$scenario == "No Label"]
consumer_utility_summary$utility_increase <- consumer_utility_summary$total_utility - baseline_utility
consumer_utility_summary$percent_increase <- 100 * (consumer_utility_summary$utility_increase / baseline_utility)

library(ggplot2)

ggplot(consumer_utility_summary, aes(x = scenario, y = total_utility)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  geom_text(aes(label = ifelse(scenario == "No Label", "", 
                               paste0("+", round(percent_increase, 1), "%"))),
            vjust = -0.5, size = 5) +
  labs(
    title = "Total Consumer Utility Under Different Labeling Scenarios",
    x = "Scenario",
    y = "Total Utility"
  ) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.15)))+
  theme_minimal() +
  theme(
    text = element_text(size = 14),
    axis.text.x = element_text(angle = 15, hjust = 1)
  )

library(dplyr)

# Calculate how many products were affected by the warning label intervention
warning_label_changes <- full_data %>%
  dplyr::select(product_id, health, perceived_health, perceived_health_with_wlabel) %>%
  distinct() %>%
  filter(perceived_health != perceived_health_with_wlabel) %>%
  nrow()

# Calculate how many products were affected by the health label intervention
health_label_changes <- full_data %>%
  dplyr::select(product_id, health, perceived_health, perceived_health_with_hlabel) %>%
  distinct() %>%
  filter(perceived_health != perceived_health_with_hlabel) %>%
  nrow()

# For nutrition score label, we don't modify perceived_health directly
# Instead, we use the true health value in the utility calculation
# But we can count how many products would have a different perception if we did this
nutrition_score_changes <- full_data %>%
  dplyr::select(product_id, health, perceived_health) %>%
  distinct() %>%
  filter(health != perceived_health) %>%
  nrow()

# Print the results
cat("Number of products with updated perceived health in Warning Label scenario:", warning_label_changes, "\n")

## Number of products with updated perceived health in Warning Label scenario: 40

cat("Number of products with updated perceived health in Health Label scenario:", health_label_changes, "\n")

## Number of products with updated perceived health in Health Label scenario: 42

cat("Number of products with different true vs perceived health (Nutrition Score scenario):", nutrition_score_changes, "\n")

## Number of products with different true vs perceived health (Nutrition Score scenario): 69

# We can also calculate the percentage of products affected in each scenario
total_products <- length(unique(full_data$product_id))
cat("\nPercentage of products affected:\n")

## 
## Percentage of products affected:

cat("Warning Label scenario:", round(warning_label_changes/total_products*100, 2), "%\n")

## Warning Label scenario: 44.94 %

cat("Health Label scenario:", round(health_label_changes/total_products*100, 2), "%\n")

## Health Label scenario: 47.19 %

cat("Nutrition Score scenario:", round(nutrition_score_changes/total_products*100, 2), "%\n")

## Nutrition Score scenario: 77.53 %

# We can also examine the specific changes in perceived health for each scenario
# Warning Label changes
warning_label_details <- full_data %>%
  dplyr::select(product_id, health, perceived_health, perceived_health_with_wlabel) %>%
  distinct() %>%
  filter(perceived_health != perceived_health_with_wlabel) %>%
  mutate(change_type = case_when(
    health %in% c(1, 2) & perceived_health %in% c(3, 4, 5) ~ "Unhealthy product perceived as healthy -> corrected to 2",
    perceived_health %in% c(1, 2) & health >= 3 ~ "Healthy product perceived as unhealthy -> updated to 3",
    TRUE ~ "Other"
  ))

# Count by change type for warning label
warning_label_counts <- warning_label_details %>%
  count(change_type) %>%
  arrange(desc(n))

# Health Label changes
health_label_details <- full_data %>%
  dplyr::select(product_id, health, perceived_health, perceived_health_with_hlabel) %>%
  distinct() %>%
  filter(perceived_health != perceived_health_with_hlabel) %>%
  mutate(change_type = case_when(
    health %in% c(4, 5) & perceived_health %in% c(1, 2, 3) ~ "Healthy product perceived as less healthy -> updated to 4",
    perceived_health %in% c(4, 5) & health < 4 ~ "Less healthy product perceived as healthy -> corrected to 3",
    TRUE ~ "Other"
  ))

# Count by change type for health label
health_label_counts <- health_label_details %>%
  count(change_type) %>%
  arrange(desc(n))

# Print detailed results
cat("\nWarning Label changes by type:\n")

## 
## Warning Label changes by type:

print(warning_label_counts)

##                                                change_type  n
## 1   Healthy product perceived as unhealthy -> updated to 3 20
## 2 Unhealthy product perceived as healthy -> corrected to 2 20

cat("\nHealth Label changes by type:\n")

## 
## Health Label changes by type:

print(health_label_counts)

##                                                   change_type  n
## 1 Less healthy product perceived as healthy -> corrected to 3 22
## 2   Healthy product perceived as less healthy -> updated to 4 20

Run the choice simulation multiple times

library(dplyr)
library(ggplot2)
library(data.table) # For faster data manipulation

## Warning: package 'data.table' was built under R version 4.3.3

## 
## Attaching package: 'data.table'

## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

library(future.apply) # For parallel processing

## Warning: package 'future.apply' was built under R version 4.3.3

## Loading required package: future

## Warning: package 'future' was built under R version 4.3.3

# Install packages if needed
if (!requireNamespace("data.table", quietly = TRUE)) install.packages("data.table")
if (!requireNamespace("future.apply", quietly = TRUE)) install.packages("future.apply")

# Set up parallel processing
plan(multisession, workers = parallel::detectCores() - 1)

# More efficient logit choice function using data.table
simulate_logit_choice_fast <- function(dt, utility_column) {
  # Convert to data.table if not already
  DT <- as.data.table(dt)
  
  # Perform grouping and calculations efficiently
  result <- DT[, {
    # Calculate exponential of utility and probabilities
    exp_u <- exp(get(utility_column))
    probs <- exp_u / sum(exp_u)
    
    # Sample one row based on probabilities
    i <- sample(1:.N, 1, prob = probs)
    .SD[i]
  }, by = id]
  
  return(result)
}

# Function to run a single simulation and return results
run_single_simulation <- function(full_data_with_outside) {
  # Create data.table copy to avoid modifying original
  dt <- as.data.table(full_data_with_outside)
  
  # Simulate choices for each condition
  chosen_true <- simulate_logit_choice_fast(dt, "utility")
  chosen <- simulate_logit_choice_fast(dt, "perceived_utility")
  chosen_wlabel <- simulate_logit_choice_fast(dt, "utility_wlabel")
  chosen_hlabel <- simulate_logit_choice_fast(dt, "utility_hlabel")
  chosen_nulabel <- simulate_logit_choice_fast(dt, "utility_nulabel")
  
  # Calculate total utility for each condition (faster with data.table)
  result <- data.table(
    scenario = c("True Utility", "No Label", "Warning Label", "Health Label", "Nutrition-Score Label"),
    total_utility = c(
      sum(chosen_true$utility),
      sum(chosen$utility),
      sum(chosen_wlabel$utility),
      sum(chosen_hlabel$utility),
      sum(chosen_nulabel$utility)
    )
  )
  
  return(result)
}

# Run N simulations in parallel
run_multiple_simulations <- function(full_data_with_outside, N = 1000) {
  # Run N simulations in parallel without progress reporting
  # (removing progress reporting to avoid the error)
  all_results <- future_lapply(1:N, function(i) {
    result <- run_single_simulation(full_data_with_outside)
    result$simulation <- i
    return(result)
  }, future.seed = TRUE)
  
  # Combine all results
  combined_results <- rbindlist(all_results)
  
  # Calculate summary statistics by scenario (using data.table for speed)
  summary_stats <- combined_results[, .(
    mean_utility = mean(total_utility),
    sd_utility = sd(total_utility),
    n = .N,
    se = sd(total_utility) / sqrt(.N),
    lower_ci = mean(total_utility) - 1.96 * sd(total_utility) / sqrt(.N),
    upper_ci = mean(total_utility) + 1.96 * sd(total_utility) / sqrt(.N)
  ), by = scenario]
  
  # Calculate increases relative to "No Label"
  baseline_utility <- summary_stats[scenario == "No Label", mean_utility]
  summary_stats[, utility_increase := mean_utility - baseline_utility]
  summary_stats[, percent_increase := 100 * (utility_increase / baseline_utility)]
  
  return(summary_stats)
}

# Assuming full_data_with_outside is already loaded
# Run simulations with optimized code
results <- run_multiple_simulations(full_data_with_outside, N = 1000)

# Create the plot with error bars for confidence intervals
ggplot(results, aes(x = scenario, y = mean_utility)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  geom_errorbar(aes(ymin = lower_ci, ymax = upper_ci), width = 0.2) +
  geom_text(aes(label = ifelse(scenario == "No Label", "", 
                             paste0("+", round(percent_increase, 1), "%"))),
          vjust = -0.5, size = 5) +
  labs(
    title = "Total Consumer Utility Under Different Labeling Scenarios",
    subtitle = "Mean values with 95% confidence intervals (N = 1000 simulations)",
    x = "Scenario",
    y = "Mean Total Utility"
  ) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.15))) +
  theme_minimal() +
  theme(
    text = element_text(size = 14),
    axis.text.x = element_text(angle = 15, hjust = 1)
  )

print(results)

## Index: <scenario>
##                 scenario mean_utility sd_utility     n       se lower_ci
##                   <char>        <num>      <num> <int>    <num>    <num>
## 1:          True Utility     33115.26   83.87833  1000 2.652466 33110.06
## 2:              No Label     25403.26  121.81277  1000 3.852058 25395.71
## 3:         Warning Label     29107.68   96.39871  1000 3.048395 29101.70
## 4:          Health Label     30207.59  101.81057  1000 3.219533 30201.28
## 5: Nutrition-Score Label     32543.91   82.33296  1000 2.603597 32538.81
##    upper_ci utility_increase percent_increase
##       <num>            <num>            <num>
## 1: 33120.46         7712.002         30.35832
## 2: 25410.81            0.000          0.00000
## 3: 29113.65         3704.421         14.58247
## 4: 30213.90         4804.333         18.91227
## 5: 32549.01         7140.654         28.10921

Simulation_02

Rui

2025-04-28