Simulation_04: Healthy products’s perceived health is closer to health

I used logit model to model the consumer choice Perceived health is positively correlated with health Perceived taste is positively correlated with taste Health and taste are independent with each other

In addition (1) healthy product (health >3)’s perceived health correlated stronger to their health level (2) perceived taste is positively correlated with taste and negatively correlated with perceived health

First simulate the product

# Set seed for reproducibility
set.seed(456)

# Number of products to simulate
n_products <- 200

# Simulate health (discrete values from 1 to 5)
health <- sample(1:5, n_products, replace = TRUE)

# Generate taste values (independent from health, values from 1 to 5)
taste <- sample(1:2, n_products, replace = TRUE)

# Create indicator for healthy products (health > 3)
is_healthy <- health > 3

# Define different correlation coefficients based on health level
# Higher correlation for healthy products (health > 3)
rho_healthy <- 0.8    # Stronger correlation for healthy products
rho_unhealthy <- 0.3  # Weaker correlation for less healthy products

# Generate base random noise
noise <- rnorm(n_products)

# Create perceived health based on actual health with added noise
# Using a different approach to ensure the correlation is maintained
perceived_health_raw <- numeric(n_products)

# For healthy products: stronger relationship with health
healthy_indices <- which(is_healthy)
perceived_health_raw[healthy_indices] <- health[healthy_indices] + 
  (1/rho_healthy - 1) * rnorm(length(healthy_indices))

# For less healthy products: weaker relationship with health
unhealthy_indices <- which(!is_healthy)
perceived_health_raw[unhealthy_indices] <- health[unhealthy_indices] + 
  (1/rho_unhealthy - 1) * rnorm(length(unhealthy_indices))

# Convert to discrete values using a more direct approach that preserves correlation
# Rank and then assign 1-5 values based on quintiles
perceived_health <- rank(perceived_health_raw)
perceived_health <- ceiling(5 * perceived_health / n_products)

# Define correlation parameters
rho_taste_actual <- 0.5    # Correlation between actual taste and perceived taste
rho_healthy_perceived <- 0.8    # Correlation between health and perceived health for healthy products
rho_unhealthy_perceived <- 0.3  # Correlation between health and perceived health for less healthy products
rho_health_taste <- -0.3   # Negative correlation between perceived health and perceived taste

# First, generate perceived health values
perceived_health_raw <- numeric(n_products)

# For healthy products: stronger relationship with health
healthy_indices <- which(is_healthy)
perceived_health_raw[healthy_indices] <- health[healthy_indices] + 
  (1/rho_healthy_perceived - 1) * rnorm(length(healthy_indices))

# For less healthy products: weaker relationship with health
unhealthy_indices <- which(!is_healthy)
perceived_health_raw[unhealthy_indices] <- health[unhealthy_indices] + 
  (1/rho_unhealthy_perceived - 1) * rnorm(length(unhealthy_indices))

# Convert to discrete values using ranks
perceived_health <- rank(perceived_health_raw)
perceived_health <- ceiling(5 * perceived_health / n_products)

# Now generate perceived taste values that are:
# 1. Positively correlated with actual taste
# 2. Negatively correlated with perceived health
# We'll use a weighted combination of factors

# Generate the component correlated with actual taste
taste_component <- taste + (1/rho_taste_actual - 1) * rnorm(n_products)

# Generate the component negatively correlated with perceived health
# To create a negative correlation, we use -perceived_health
health_component <- -perceived_health + rnorm(n_products)

# Combine the components with appropriate weights
# We'll use weights that maintain the specified correlations
weight_taste <- 0.7  # Weight for taste correlation
weight_health <- 0.3 # Weight for health correlation (negative)

perceived_taste_raw <- weight_taste * scale(taste_component) + 
                      weight_health * scale(health_component)

# Convert to discrete values using ranks
perceived_taste <- rank(perceived_taste_raw)
perceived_taste <- ceiling(5 * perceived_taste / n_products)

# Make sure dplyr is loaded
library(dplyr)

# Combine into a data frame
products <- data.frame(
  product_id = 1:n_products,
  health = health,
  perceived_health = perceived_health,
  taste = taste,
  perceived_taste = perceived_taste
)

# Create a new dataframe with unique product characteristics (excluding product_id)
# Using explicit package reference to avoid function conflicts
unique_products <- products %>%
  # Select only the characteristic columns (exclude product_id)
  dplyr::select(health, perceived_health, taste, perceived_taste) %>%
  # Remove duplicate rows based on these characteristics
  dplyr::distinct() %>%
  # Add a new product_id column with sequential numbers
  dplyr::mutate(product_id = row_number())

# View the new unique products dataframe
cat("Number of unique product combinations:", nrow(unique_products), "\n")

## Number of unique product combinations: 100

head(unique_products)

##   health perceived_health taste perceived_taste product_id
## 1      5                5     1               1          1
## 2      5                5     1               3          2
## 3      3                3     1               3          3
## 4      4                4     2               1          4
## 5      3                3     1               2          5
## 6      1                2     2               2          6

# Now we can use either the original products dataframe or the unique_products dataframe
# for the logit choice model. If using unique_products, we ensure no duplicate characteristics.

# Calculate all correlations to verify
cor_health_perceived_healthy <- cor(health[is_healthy], perceived_health[is_healthy])
cor_health_perceived_unhealthy <- cor(health[!is_healthy], perceived_health[!is_healthy])
cor_health_perceived_overall <- cor(health, perceived_health)
cor_taste_perceived <- cor(taste, perceived_taste)
cor_perceived_health_taste <- cor(perceived_health, perceived_taste)

# Print the correlations
cat("TARGET CORRELATIONS:\n")

## TARGET CORRELATIONS:

cat("Health → Perceived Health (healthy products): ", rho_healthy_perceived, "\n")

## Health → Perceived Health (healthy products):  0.8

cat("Health → Perceived Health (less healthy products): ", rho_unhealthy_perceived, "\n")

## Health → Perceived Health (less healthy products):  0.3

cat("Actual Taste → Perceived Taste: ", rho_taste_actual, "\n")

## Actual Taste → Perceived Taste:  0.5

cat("Perceived Health → Perceived Taste: ", rho_health_taste, "\n\n")

## Perceived Health → Perceived Taste:  -0.3

cat("ACTUAL CORRELATIONS:\n")

## ACTUAL CORRELATIONS:

cat("Health → Perceived Health (overall): ", round(cor_health_perceived_overall, 3), "\n")

## Health → Perceived Health (overall):  0.643

cat("Health → Perceived Health (healthy products): ", round(cor_health_perceived_healthy, 3), "\n")

## Health → Perceived Health (healthy products):  0.776

cat("Health → Perceived Health (less healthy products): ", round(cor_health_perceived_unhealthy, 3), "\n")

## Health → Perceived Health (less healthy products):  0.36

cat("Actual Taste → Perceived Taste: ", round(cor_taste_perceived, 3), "\n")

## Actual Taste → Perceived Taste:  0.447

cat("Perceived Health → Perceived Taste: ", round(cor_perceived_health_taste, 3), "\n")

## Perceived Health → Perceived Taste:  -0.348

# Verify independence between actual health and actual taste
health_taste_cor <- cor(health, taste)
cat("Health → Taste (should be close to 0): ", round(health_taste_cor, 3), "\n\n")

## Health → Taste (should be close to 0):  -0.024

Then simulate the user

# Assuming you already have:
# - unique_products: data.frame with product_id, health, perceived_health, taste, perceived_taste
# - n_individuals = 1000

set.seed(123)

# Simulate individual-level parameters
n_individuals <- 4000
alpha <- rnorm(n_individuals, mean = 0.5, sd = 0.2)
beta_1 <- rnorm(n_individuals, mean = 1.2, sd = 0.3)  # weight on perceived health
beta_2 <- rnorm(n_individuals, mean = 0.8, sd = 0.3)  # weight on perceived taste

individuals <- data.frame(
  id = 1:n_individuals,
  alpha = alpha,
  beta_1 = beta_1,
  beta_2 = beta_2
)

# Add product ID
unique_products$product_id <- 1:nrow(unique_products)

# All person-product combinations
full_data <- expand.grid(id = individuals$id, product_id = unique_products$product_id)

# Merge individual parameters
full_data <- full_data %>%
  left_join(individuals, by = "id") %>%
  left_join(unique_products, by = "product_id")

# Simulate epsilon for each choice situation
set.seed(456)
full_data$epsilon <- rnorm(nrow(full_data), 0, 1)

# Compute perceived utility (what drives choice)
full_data$perceived_utility <- with(full_data,
  alpha + beta_1 * perceived_health + beta_2 * perceived_taste + epsilon
)

full_data$utility <- with(full_data,
  alpha + beta_1 * health + beta_2 * taste + epsilon
)

# Define logit choice function
simulate_logit_choice <- function(df, utility_column) {
  df %>%
    group_by(id) %>%
    mutate(
      exp_utility = exp(!!sym(utility_column)),
      prob = exp_utility / sum(exp_utility)
    ) %>%
    slice_sample(n = 1, weight_by = prob, replace = TRUE) %>%
    ungroup()
}

# Create outside option for each individual
outside_option <- individuals %>%
  mutate(
    product_id = 0,
    health = NA,
    perceived_health = NA,
    taste = NA,
    perceived_taste = NA,
    epsilon = rnorm(n_individuals, 0, 1),
    perceived_utility = 0,  # No beta terms
    utility = alpha + epsilon,
    utility_wlabel = alpha + epsilon,
    utility_hlabel = alpha + epsilon,
    utility_nulabel = alpha + epsilon
  )

Warning label

We apply a warning label intervention that adjusts how individuals perceive a product’s healthiness. The updated perceived health value is stored as perceived_health_with_label.

The rules are as follows:

If health ∈ {1, 2}, and perceived_health ∈ {3, 4, 5 }, perceived_health_with_label is set to the 2.
If perceived_health ∈ {1, 2} but health ≥ 4, then perceived_health_with_label is updated to 3.

Now, people do not make inference no health and taste

# Apply the health warning logic
full_data <- full_data %>%
  mutate(
    perceived_health_with_wlabel = case_when(
      health %in% c(1, 2) & perceived_health %in% c(3, 4, 5) ~ 2,
      perceived_health %in% c(1, 2) & health >= 3 ~ 3,
      TRUE ~ perceived_health
    )
  )

full_data$utility_wlabel <- with(full_data,
  alpha + beta_1 * perceived_health_with_wlabel + beta_2 * perceived_taste + epsilon
)

Health Label

We apply a health label intervention that modifies how individuals perceive a product’s healthiness. The adjusted perception is stored in the variable perceived_health_with_hlabel.

The rules are as follows:

If health ∈ {4, 5}, but perceived_health ∈ {1, 2,3}, then then perceived_health_with_hlabel is 4
If perceived_health ∈ {4, 5} but health < 4, then perceived_health_with_hlabel is updated to 3.

Now, individuals do not make inferences about the healthiness or taste beyond what is given.

# Apply the health label logic
full_data <- full_data %>%
  mutate(
    perceived_health_with_hlabel = case_when(
      health %in% c(4, 5) & perceived_health %in% c(1, 2, 3) ~ 4,
      perceived_health %in% c(4, 5) & health < 4 ~ 3,
      TRUE ~ perceived_health
    )
  )

# Calculate utility based on perceived health after applying health label
full_data$utility_hlabel <- with(full_data,
  alpha + beta_1 * perceived_health_with_hlabel + beta_2 * perceived_taste + epsilon
)

Nutrition score label

# Calculate utility based on perceived health after applying health label
full_data$utility_nulabel <- with(full_data,
  alpha + beta_1 * health + beta_2 * perceived_taste + epsilon
)

# Bind outside option into full_data
full_data_with_outside <- bind_rows(full_data, outside_option)

Simulate choices

chosen_true <- simulate_logit_choice(full_data_with_outside, "utility")
chosen <- simulate_logit_choice(full_data_with_outside, "perceived_utility")
chosen_wlabel <- simulate_logit_choice(full_data_with_outside, "utility_wlabel")
chosen_hlabel <- simulate_logit_choice(full_data_with_outside, "utility_hlabel")
chosen_nulabel <- simulate_logit_choice(full_data_with_outside, "utility_nulabel")

Comparison

library(ggplot2)

# Sum of utilities across individuals in each scenario
consumer_utility_summary <- data.frame(
  scenario = c("True Utility", "No Label", "Warning Label", "Health Label", 'Nutrition-Score Label'),
  total_utility = c(
    sum(chosen_true$utility),
    sum(chosen$utility),
    sum(chosen_wlabel$utility),
    sum(chosen_hlabel$utility),
    sum(chosen_nulabel$utility)
  )
)

# Calculate increase relative to 'No Label' condition
baseline_utility <- consumer_utility_summary$total_utility[consumer_utility_summary$scenario == "No Label"]
consumer_utility_summary$utility_increase <- consumer_utility_summary$total_utility - baseline_utility
consumer_utility_summary$percent_increase <- 100 * (consumer_utility_summary$utility_increase / baseline_utility)

library(ggplot2)

ggplot(consumer_utility_summary, aes(x = scenario, y = total_utility)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  geom_text(aes(label = ifelse(scenario == "No Label", "", 
                               paste0("+", round(percent_increase, 1), "%"))),
            vjust = -0.5, size = 5) +
  labs(
    title = "Total Consumer Utility Under Different Labeling Scenarios",
    x = "Scenario",
    y = "Total Utility"
  ) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.15)))+
  theme_minimal() +
  theme(
    text = element_text(size = 14),
    axis.text.x = element_text(angle = 15, hjust = 1)
  )

library(dplyr)

# Calculate how many products were affected by the warning label intervention
warning_label_changes <- full_data %>%
  dplyr::select(product_id, health, perceived_health, perceived_health_with_wlabel) %>%
  distinct() %>%
  filter(perceived_health != perceived_health_with_wlabel) %>%
  nrow()

# Calculate how many products were affected by the health label intervention
health_label_changes <- full_data %>%
  dplyr::select(product_id, health, perceived_health, perceived_health_with_hlabel) %>%
  distinct() %>%
  filter(perceived_health != perceived_health_with_hlabel) %>%
  nrow()

# For nutrition score label, we don't modify perceived_health directly
# Instead, we use the true health value in the utility calculation
# But we can count how many products would have a different perception if we did this
nutrition_score_changes <- full_data %>%
  dplyr::select(product_id, health, perceived_health) %>%
  distinct() %>%
  filter(health != perceived_health) %>%
  nrow()

# Print the results
cat("Number of products with updated perceived health in Warning Label scenario:", warning_label_changes, "\n")

## Number of products with updated perceived health in Warning Label scenario: 30

cat("Number of products with updated perceived health in Health Label scenario:", health_label_changes, "\n")

## Number of products with updated perceived health in Health Label scenario: 27

cat("Number of products with different true vs perceived health (Nutrition Score scenario):", nutrition_score_changes, "\n")

## Number of products with different true vs perceived health (Nutrition Score scenario): 70

# We can also calculate the percentage of products affected in each scenario
total_products <- length(unique(full_data$product_id))
cat("\nPercentage of products affected:\n")

## 
## Percentage of products affected:

cat("Warning Label scenario:", round(warning_label_changes/total_products*100, 2), "%\n")

## Warning Label scenario: 30 %

cat("Health Label scenario:", round(health_label_changes/total_products*100, 2), "%\n")

## Health Label scenario: 27 %

cat("Nutrition Score scenario:", round(nutrition_score_changes/total_products*100, 2), "%\n")

## Nutrition Score scenario: 70 %

# We can also examine the specific changes in perceived health for each scenario
# Warning Label changes
warning_label_details <- full_data %>%
  dplyr::select(product_id, health, perceived_health, perceived_health_with_wlabel) %>%
  distinct() %>%
  filter(perceived_health != perceived_health_with_wlabel) %>%
  mutate(change_type = case_when(
    health %in% c(1, 2) & perceived_health %in% c(3, 4, 5) ~ "Unhealthy product perceived as healthy -> corrected to 2",
    perceived_health %in% c(1, 2) & health >= 3 ~ "Healthy product perceived as unhealthy -> updated to 3",
    TRUE ~ "Other"
  ))

# Count by change type for warning label
warning_label_counts <- warning_label_details %>%
  count(change_type) %>%
  arrange(desc(n))

# Health Label changes
health_label_details <- full_data %>%
  dplyr::select(product_id, health, perceived_health, perceived_health_with_hlabel) %>%
  distinct() %>%
  filter(perceived_health != perceived_health_with_hlabel) %>%
  mutate(change_type = case_when(
    health %in% c(4, 5) & perceived_health %in% c(1, 2, 3) ~ "Healthy product perceived as less healthy -> updated to 4",
    perceived_health %in% c(4, 5) & health < 4 ~ "Less healthy product perceived as healthy -> corrected to 3",
    TRUE ~ "Other"
  ))

# Count by change type for health label
health_label_counts <- health_label_details %>%
  count(change_type) %>%
  arrange(desc(n))

# Print detailed results
cat("\nWarning Label changes by type:\n")

## 
## Warning Label changes by type:

print(warning_label_counts)

##                                                change_type  n
## 1 Unhealthy product perceived as healthy -> corrected to 2 18
## 2   Healthy product perceived as unhealthy -> updated to 3 12

cat("\nHealth Label changes by type:\n")

## 
## Health Label changes by type:

print(health_label_counts)

##                                                   change_type  n
## 1 Less healthy product perceived as healthy -> corrected to 3 19
## 2   Healthy product perceived as less healthy -> updated to 4  8

Run the choice simulation multiple times

library(dplyr)
library(ggplot2)
library(data.table) # For faster data manipulation

## Warning: package 'data.table' was built under R version 4.3.3

## 
## Attaching package: 'data.table'

## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

library(future.apply) # For parallel processing

## Warning: package 'future.apply' was built under R version 4.3.3

## Loading required package: future

## Warning: package 'future' was built under R version 4.3.3

# Install packages if needed
if (!requireNamespace("data.table", quietly = TRUE)) install.packages("data.table")
if (!requireNamespace("future.apply", quietly = TRUE)) install.packages("future.apply")

# Set up parallel processing
plan(multisession, workers = parallel::detectCores() - 1)

# More efficient logit choice function using data.table
simulate_logit_choice_fast <- function(dt, utility_column) {
  # Convert to data.table if not already
  DT <- as.data.table(dt)
  
  # Perform grouping and calculations efficiently
  result <- DT[, {
    # Calculate exponential of utility and probabilities
    exp_u <- exp(get(utility_column))
    probs <- exp_u / sum(exp_u)
    
    # Sample one row based on probabilities
    i <- sample(1:.N, 1, prob = probs)
    .SD[i]
  }, by = id]
  
  return(result)
}

# Function to run a single simulation and return results
run_single_simulation <- function(full_data_with_outside) {
  # Create data.table copy to avoid modifying original
  dt <- as.data.table(full_data_with_outside)
  
  # Simulate choices for each condition
  chosen_true <- simulate_logit_choice_fast(dt, "utility")
  chosen <- simulate_logit_choice_fast(dt, "perceived_utility")
  chosen_wlabel <- simulate_logit_choice_fast(dt, "utility_wlabel")
  chosen_hlabel <- simulate_logit_choice_fast(dt, "utility_hlabel")
  chosen_nulabel <- simulate_logit_choice_fast(dt, "utility_nulabel")
  
  # Calculate total utility for each condition (faster with data.table)
  result <- data.table(
    scenario = c("True Utility", "No Label", "Warning Label", "Health Label", "Nutrition-Score Label"),
    total_utility = c(
      sum(chosen_true$utility),
      sum(chosen$utility),
      sum(chosen_wlabel$utility),
      sum(chosen_hlabel$utility),
      sum(chosen_nulabel$utility)
    )
  )
  
  return(result)
}

# Run N simulations in parallel
run_multiple_simulations <- function(full_data_with_outside, N = 1000) {
  # Run N simulations in parallel without progress reporting
  # (removing progress reporting to avoid the error)
  all_results <- future_lapply(1:N, function(i) {
    result <- run_single_simulation(full_data_with_outside)
    result$simulation <- i
    return(result)
  }, future.seed = TRUE)
  
  # Combine all results
  combined_results <- rbindlist(all_results)
  
  # Calculate summary statistics by scenario (using data.table for speed)
  summary_stats <- combined_results[, .(
    mean_utility = mean(total_utility),
    sd_utility = sd(total_utility),
    n = .N,
    se = sd(total_utility) / sqrt(.N),
    lower_ci = mean(total_utility) - 1.96 * sd(total_utility) / sqrt(.N),
    upper_ci = mean(total_utility) + 1.96 * sd(total_utility) / sqrt(.N)
  ), by = scenario]
  
  # Calculate increases relative to "No Label"
  baseline_utility <- summary_stats[scenario == "No Label", mean_utility]
  summary_stats[, utility_increase := mean_utility - baseline_utility]
  summary_stats[, percent_increase := 100 * (utility_increase / baseline_utility)]
  
  return(summary_stats)
}

# Assuming full_data_with_outside is already loaded
# Run simulations with optimized code
results <- run_multiple_simulations(full_data_with_outside, N = 1000)

# Create the plot with error bars for confidence intervals
ggplot(results, aes(x = scenario, y = mean_utility)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  geom_errorbar(aes(ymin = lower_ci, ymax = upper_ci), width = 0.2) +
  geom_text(aes(label = ifelse(scenario == "No Label", "", 
                             paste0("+", round(percent_increase, 1), "%"))),
          vjust = -0.5, size = 5) +
  labs(
    title = "Total Consumer Utility Under Different Labeling Scenarios",
    subtitle = "Mean values with 95% confidence intervals (N = 1000 simulations)",
    x = "Scenario",
    y = "Mean Total Utility"
  ) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.15))) +
  theme_minimal() +
  theme(
    text = element_text(size = 14),
    axis.text.x = element_text(angle = 15, hjust = 1)
  )

print(results)

## Index: <scenario>
##                 scenario mean_utility sd_utility     n       se lower_ci
##                   <char>        <num>      <num> <int>    <num>    <num>
## 1:          True Utility     32911.89   83.44955  1000 2.638907 32906.71
## 2:              No Label     28170.89  116.26504  1000 3.676623 28163.69
## 3:         Warning Label     30001.68  100.49229  1000 3.177845 29995.45
## 4:          Health Label     31191.52  106.79153  1000 3.377045 31184.90
## 5: Nutrition-Score Label     32627.84   81.38229  1000 2.573534 32622.79
##    upper_ci utility_increase percent_increase
##       <num>            <num>            <num>
## 1: 32917.06         4740.993         16.82940
## 2: 28178.10            0.000          0.00000
## 3: 30007.91         1830.790          6.49887
## 4: 31198.14         3020.630         10.72252
## 5: 32632.88         4456.945         15.82110