I used logit model to model the consumer choice Perceived health is positively correlated with health Perceived taste is positively correlated with taste Health and taste are independent with each other
In addition (1) healthy product (health >3)’s perceived health correlated stronger to their health level (2) perceived taste is positively correlated with taste and negatively correlated with perceived health
# Set seed for reproducibility
set.seed(456)
# Number of products to simulate
n_products <- 200
# Simulate health (discrete values from 1 to 5)
health <- sample(1:5, n_products, replace = TRUE)
# Generate taste values (independent from health, values from 1 to 5)
taste <- sample(1:2, n_products, replace = TRUE)
# Create indicator for healthy products (health > 3)
is_healthy <- health > 3
# Define different correlation coefficients based on health level
# Higher correlation for healthy products (health > 3)
rho_healthy <- 0.8 # Stronger correlation for healthy products
rho_unhealthy <- 0.3 # Weaker correlation for less healthy products
# Generate base random noise
noise <- rnorm(n_products)
# Create perceived health based on actual health with added noise
# Using a different approach to ensure the correlation is maintained
perceived_health_raw <- numeric(n_products)
# For healthy products: stronger relationship with health
healthy_indices <- which(is_healthy)
perceived_health_raw[healthy_indices] <- health[healthy_indices] +
(1/rho_healthy - 1) * rnorm(length(healthy_indices))
# For less healthy products: weaker relationship with health
unhealthy_indices <- which(!is_healthy)
perceived_health_raw[unhealthy_indices] <- health[unhealthy_indices] +
(1/rho_unhealthy - 1) * rnorm(length(unhealthy_indices))
# Convert to discrete values using a more direct approach that preserves correlation
# Rank and then assign 1-5 values based on quintiles
perceived_health <- rank(perceived_health_raw)
perceived_health <- ceiling(5 * perceived_health / n_products)
# Define correlation parameters
rho_taste_actual <- 0.5 # Correlation between actual taste and perceived taste
rho_healthy_perceived <- 0.8 # Correlation between health and perceived health for healthy products
rho_unhealthy_perceived <- 0.3 # Correlation between health and perceived health for less healthy products
rho_health_taste <- -0.3 # Negative correlation between perceived health and perceived taste
# First, generate perceived health values
perceived_health_raw <- numeric(n_products)
# For healthy products: stronger relationship with health
healthy_indices <- which(is_healthy)
perceived_health_raw[healthy_indices] <- health[healthy_indices] +
(1/rho_healthy_perceived - 1) * rnorm(length(healthy_indices))
# For less healthy products: weaker relationship with health
unhealthy_indices <- which(!is_healthy)
perceived_health_raw[unhealthy_indices] <- health[unhealthy_indices] +
(1/rho_unhealthy_perceived - 1) * rnorm(length(unhealthy_indices))
# Convert to discrete values using ranks
perceived_health <- rank(perceived_health_raw)
perceived_health <- ceiling(5 * perceived_health / n_products)
# Now generate perceived taste values that are:
# 1. Positively correlated with actual taste
# 2. Negatively correlated with perceived health
# We'll use a weighted combination of factors
# Generate the component correlated with actual taste
taste_component <- taste + (1/rho_taste_actual - 1) * rnorm(n_products)
# Generate the component negatively correlated with perceived health
# To create a negative correlation, we use -perceived_health
health_component <- -perceived_health + rnorm(n_products)
# Combine the components with appropriate weights
# We'll use weights that maintain the specified correlations
weight_taste <- 0.7 # Weight for taste correlation
weight_health <- 0.3 # Weight for health correlation (negative)
perceived_taste_raw <- weight_taste * scale(taste_component) +
weight_health * scale(health_component)
# Convert to discrete values using ranks
perceived_taste <- rank(perceived_taste_raw)
perceived_taste <- ceiling(5 * perceived_taste / n_products)
# Make sure dplyr is loaded
library(dplyr)
# Combine into a data frame
products <- data.frame(
product_id = 1:n_products,
health = health,
perceived_health = perceived_health,
taste = taste,
perceived_taste = perceived_taste
)
# Create a new dataframe with unique product characteristics (excluding product_id)
# Using explicit package reference to avoid function conflicts
unique_products <- products %>%
# Select only the characteristic columns (exclude product_id)
dplyr::select(health, perceived_health, taste, perceived_taste) %>%
# Remove duplicate rows based on these characteristics
dplyr::distinct() %>%
# Add a new product_id column with sequential numbers
dplyr::mutate(product_id = row_number())
# View the new unique products dataframe
cat("Number of unique product combinations:", nrow(unique_products), "\n")
## Number of unique product combinations: 100
head(unique_products)
## health perceived_health taste perceived_taste product_id
## 1 5 5 1 1 1
## 2 5 5 1 3 2
## 3 3 3 1 3 3
## 4 4 4 2 1 4
## 5 3 3 1 2 5
## 6 1 2 2 2 6
# Now we can use either the original products dataframe or the unique_products dataframe
# for the logit choice model. If using unique_products, we ensure no duplicate characteristics.
# Calculate all correlations to verify
cor_health_perceived_healthy <- cor(health[is_healthy], perceived_health[is_healthy])
cor_health_perceived_unhealthy <- cor(health[!is_healthy], perceived_health[!is_healthy])
cor_health_perceived_overall <- cor(health, perceived_health)
cor_taste_perceived <- cor(taste, perceived_taste)
cor_perceived_health_taste <- cor(perceived_health, perceived_taste)
# Print the correlations
cat("TARGET CORRELATIONS:\n")
## TARGET CORRELATIONS:
cat("Health → Perceived Health (healthy products): ", rho_healthy_perceived, "\n")
## Health → Perceived Health (healthy products): 0.8
cat("Health → Perceived Health (less healthy products): ", rho_unhealthy_perceived, "\n")
## Health → Perceived Health (less healthy products): 0.3
cat("Actual Taste → Perceived Taste: ", rho_taste_actual, "\n")
## Actual Taste → Perceived Taste: 0.5
cat("Perceived Health → Perceived Taste: ", rho_health_taste, "\n\n")
## Perceived Health → Perceived Taste: -0.3
cat("ACTUAL CORRELATIONS:\n")
## ACTUAL CORRELATIONS:
cat("Health → Perceived Health (overall): ", round(cor_health_perceived_overall, 3), "\n")
## Health → Perceived Health (overall): 0.643
cat("Health → Perceived Health (healthy products): ", round(cor_health_perceived_healthy, 3), "\n")
## Health → Perceived Health (healthy products): 0.776
cat("Health → Perceived Health (less healthy products): ", round(cor_health_perceived_unhealthy, 3), "\n")
## Health → Perceived Health (less healthy products): 0.36
cat("Actual Taste → Perceived Taste: ", round(cor_taste_perceived, 3), "\n")
## Actual Taste → Perceived Taste: 0.447
cat("Perceived Health → Perceived Taste: ", round(cor_perceived_health_taste, 3), "\n")
## Perceived Health → Perceived Taste: -0.348
# Verify independence between actual health and actual taste
health_taste_cor <- cor(health, taste)
cat("Health → Taste (should be close to 0): ", round(health_taste_cor, 3), "\n\n")
## Health → Taste (should be close to 0): -0.024
# Assuming you already have:
# - unique_products: data.frame with product_id, health, perceived_health, taste, perceived_taste
# - n_individuals = 1000
set.seed(123)
# Simulate individual-level parameters
n_individuals <- 4000
alpha <- rnorm(n_individuals, mean = 0.5, sd = 0.2)
beta_1 <- rnorm(n_individuals, mean = 1.2, sd = 0.3) # weight on perceived health
beta_2 <- rnorm(n_individuals, mean = 0.8, sd = 0.3) # weight on perceived taste
individuals <- data.frame(
id = 1:n_individuals,
alpha = alpha,
beta_1 = beta_1,
beta_2 = beta_2
)
# Add product ID
unique_products$product_id <- 1:nrow(unique_products)
# All person-product combinations
full_data <- expand.grid(id = individuals$id, product_id = unique_products$product_id)
# Merge individual parameters
full_data <- full_data %>%
left_join(individuals, by = "id") %>%
left_join(unique_products, by = "product_id")
# Simulate epsilon for each choice situation
set.seed(456)
full_data$epsilon <- rnorm(nrow(full_data), 0, 1)
# Compute perceived utility (what drives choice)
full_data$perceived_utility <- with(full_data,
alpha + beta_1 * perceived_health + beta_2 * perceived_taste + epsilon
)
full_data$utility <- with(full_data,
alpha + beta_1 * health + beta_2 * taste + epsilon
)
# Define logit choice function
simulate_logit_choice <- function(df, utility_column) {
df %>%
group_by(id) %>%
mutate(
exp_utility = exp(!!sym(utility_column)),
prob = exp_utility / sum(exp_utility)
) %>%
slice_sample(n = 1, weight_by = prob, replace = TRUE) %>%
ungroup()
}
# Create outside option for each individual
outside_option <- individuals %>%
mutate(
product_id = 0,
health = NA,
perceived_health = NA,
taste = NA,
perceived_taste = NA,
epsilon = rnorm(n_individuals, 0, 1),
perceived_utility = 0, # No beta terms
utility = alpha + epsilon,
utility_wlabel = alpha + epsilon,
utility_hlabel = alpha + epsilon,
utility_nulabel = alpha + epsilon
)
We apply a warning label intervention that adjusts how individuals
perceive a product’s healthiness. The updated perceived health value is
stored as perceived_health_with_label
.
The rules are as follows:
health
∈ {1, 2}, and perceived_health
∈
{3, 4, 5 }, perceived_health_with_label
is set to
the 2.perceived_health
∈ {1, 2} but
health
≥ 4, then perceived_health_with_label
is updated to 3.Now, people do not make inference no health and taste
# Apply the health warning logic
full_data <- full_data %>%
mutate(
perceived_health_with_wlabel = case_when(
health %in% c(1, 2) & perceived_health %in% c(3, 4, 5) ~ 2,
perceived_health %in% c(1, 2) & health >= 3 ~ 3,
TRUE ~ perceived_health
)
)
full_data$utility_wlabel <- with(full_data,
alpha + beta_1 * perceived_health_with_wlabel + beta_2 * perceived_taste + epsilon
)
We apply a health label intervention that modifies
how individuals perceive a product’s healthiness. The adjusted
perception is stored in the variable
perceived_health_with_hlabel
.
The rules are as follows:
health
∈ {4, 5}, but perceived_health
∈
{1, 2,3}, then then perceived_health_with_hlabel
is 4perceived_health
∈ {4, 5} but
health
< 4, then
perceived_health_with_hlabel
is updated to
3.Now, individuals do not make inferences about the healthiness or taste beyond what is given.
# Apply the health label logic
full_data <- full_data %>%
mutate(
perceived_health_with_hlabel = case_when(
health %in% c(4, 5) & perceived_health %in% c(1, 2, 3) ~ 4,
perceived_health %in% c(4, 5) & health < 4 ~ 3,
TRUE ~ perceived_health
)
)
# Calculate utility based on perceived health after applying health label
full_data$utility_hlabel <- with(full_data,
alpha + beta_1 * perceived_health_with_hlabel + beta_2 * perceived_taste + epsilon
)
# Calculate utility based on perceived health after applying health label
full_data$utility_nulabel <- with(full_data,
alpha + beta_1 * health + beta_2 * perceived_taste + epsilon
)
# Bind outside option into full_data
full_data_with_outside <- bind_rows(full_data, outside_option)
chosen_true <- simulate_logit_choice(full_data_with_outside, "utility")
chosen <- simulate_logit_choice(full_data_with_outside, "perceived_utility")
chosen_wlabel <- simulate_logit_choice(full_data_with_outside, "utility_wlabel")
chosen_hlabel <- simulate_logit_choice(full_data_with_outside, "utility_hlabel")
chosen_nulabel <- simulate_logit_choice(full_data_with_outside, "utility_nulabel")
library(ggplot2)
# Sum of utilities across individuals in each scenario
consumer_utility_summary <- data.frame(
scenario = c("True Utility", "No Label", "Warning Label", "Health Label", 'Nutrition-Score Label'),
total_utility = c(
sum(chosen_true$utility),
sum(chosen$utility),
sum(chosen_wlabel$utility),
sum(chosen_hlabel$utility),
sum(chosen_nulabel$utility)
)
)
# Calculate increase relative to 'No Label' condition
baseline_utility <- consumer_utility_summary$total_utility[consumer_utility_summary$scenario == "No Label"]
consumer_utility_summary$utility_increase <- consumer_utility_summary$total_utility - baseline_utility
consumer_utility_summary$percent_increase <- 100 * (consumer_utility_summary$utility_increase / baseline_utility)
library(ggplot2)
ggplot(consumer_utility_summary, aes(x = scenario, y = total_utility)) +
geom_bar(stat = "identity", fill = "steelblue") +
geom_text(aes(label = ifelse(scenario == "No Label", "",
paste0("+", round(percent_increase, 1), "%"))),
vjust = -0.5, size = 5) +
labs(
title = "Total Consumer Utility Under Different Labeling Scenarios",
x = "Scenario",
y = "Total Utility"
) +
scale_y_continuous(expand = expansion(mult = c(0, 0.15)))+
theme_minimal() +
theme(
text = element_text(size = 14),
axis.text.x = element_text(angle = 15, hjust = 1)
)
library(dplyr)
# Calculate how many products were affected by the warning label intervention
warning_label_changes <- full_data %>%
dplyr::select(product_id, health, perceived_health, perceived_health_with_wlabel) %>%
distinct() %>%
filter(perceived_health != perceived_health_with_wlabel) %>%
nrow()
# Calculate how many products were affected by the health label intervention
health_label_changes <- full_data %>%
dplyr::select(product_id, health, perceived_health, perceived_health_with_hlabel) %>%
distinct() %>%
filter(perceived_health != perceived_health_with_hlabel) %>%
nrow()
# For nutrition score label, we don't modify perceived_health directly
# Instead, we use the true health value in the utility calculation
# But we can count how many products would have a different perception if we did this
nutrition_score_changes <- full_data %>%
dplyr::select(product_id, health, perceived_health) %>%
distinct() %>%
filter(health != perceived_health) %>%
nrow()
# Print the results
cat("Number of products with updated perceived health in Warning Label scenario:", warning_label_changes, "\n")
## Number of products with updated perceived health in Warning Label scenario: 30
cat("Number of products with updated perceived health in Health Label scenario:", health_label_changes, "\n")
## Number of products with updated perceived health in Health Label scenario: 27
cat("Number of products with different true vs perceived health (Nutrition Score scenario):", nutrition_score_changes, "\n")
## Number of products with different true vs perceived health (Nutrition Score scenario): 70
# We can also calculate the percentage of products affected in each scenario
total_products <- length(unique(full_data$product_id))
cat("\nPercentage of products affected:\n")
##
## Percentage of products affected:
cat("Warning Label scenario:", round(warning_label_changes/total_products*100, 2), "%\n")
## Warning Label scenario: 30 %
cat("Health Label scenario:", round(health_label_changes/total_products*100, 2), "%\n")
## Health Label scenario: 27 %
cat("Nutrition Score scenario:", round(nutrition_score_changes/total_products*100, 2), "%\n")
## Nutrition Score scenario: 70 %
# We can also examine the specific changes in perceived health for each scenario
# Warning Label changes
warning_label_details <- full_data %>%
dplyr::select(product_id, health, perceived_health, perceived_health_with_wlabel) %>%
distinct() %>%
filter(perceived_health != perceived_health_with_wlabel) %>%
mutate(change_type = case_when(
health %in% c(1, 2) & perceived_health %in% c(3, 4, 5) ~ "Unhealthy product perceived as healthy -> corrected to 2",
perceived_health %in% c(1, 2) & health >= 3 ~ "Healthy product perceived as unhealthy -> updated to 3",
TRUE ~ "Other"
))
# Count by change type for warning label
warning_label_counts <- warning_label_details %>%
count(change_type) %>%
arrange(desc(n))
# Health Label changes
health_label_details <- full_data %>%
dplyr::select(product_id, health, perceived_health, perceived_health_with_hlabel) %>%
distinct() %>%
filter(perceived_health != perceived_health_with_hlabel) %>%
mutate(change_type = case_when(
health %in% c(4, 5) & perceived_health %in% c(1, 2, 3) ~ "Healthy product perceived as less healthy -> updated to 4",
perceived_health %in% c(4, 5) & health < 4 ~ "Less healthy product perceived as healthy -> corrected to 3",
TRUE ~ "Other"
))
# Count by change type for health label
health_label_counts <- health_label_details %>%
count(change_type) %>%
arrange(desc(n))
# Print detailed results
cat("\nWarning Label changes by type:\n")
##
## Warning Label changes by type:
print(warning_label_counts)
## change_type n
## 1 Unhealthy product perceived as healthy -> corrected to 2 18
## 2 Healthy product perceived as unhealthy -> updated to 3 12
cat("\nHealth Label changes by type:\n")
##
## Health Label changes by type:
print(health_label_counts)
## change_type n
## 1 Less healthy product perceived as healthy -> corrected to 3 19
## 2 Healthy product perceived as less healthy -> updated to 4 8
library(dplyr)
library(ggplot2)
library(data.table) # For faster data manipulation
## Warning: package 'data.table' was built under R version 4.3.3
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(future.apply) # For parallel processing
## Warning: package 'future.apply' was built under R version 4.3.3
## Loading required package: future
## Warning: package 'future' was built under R version 4.3.3
# Install packages if needed
if (!requireNamespace("data.table", quietly = TRUE)) install.packages("data.table")
if (!requireNamespace("future.apply", quietly = TRUE)) install.packages("future.apply")
# Set up parallel processing
plan(multisession, workers = parallel::detectCores() - 1)
# More efficient logit choice function using data.table
simulate_logit_choice_fast <- function(dt, utility_column) {
# Convert to data.table if not already
DT <- as.data.table(dt)
# Perform grouping and calculations efficiently
result <- DT[, {
# Calculate exponential of utility and probabilities
exp_u <- exp(get(utility_column))
probs <- exp_u / sum(exp_u)
# Sample one row based on probabilities
i <- sample(1:.N, 1, prob = probs)
.SD[i]
}, by = id]
return(result)
}
# Function to run a single simulation and return results
run_single_simulation <- function(full_data_with_outside) {
# Create data.table copy to avoid modifying original
dt <- as.data.table(full_data_with_outside)
# Simulate choices for each condition
chosen_true <- simulate_logit_choice_fast(dt, "utility")
chosen <- simulate_logit_choice_fast(dt, "perceived_utility")
chosen_wlabel <- simulate_logit_choice_fast(dt, "utility_wlabel")
chosen_hlabel <- simulate_logit_choice_fast(dt, "utility_hlabel")
chosen_nulabel <- simulate_logit_choice_fast(dt, "utility_nulabel")
# Calculate total utility for each condition (faster with data.table)
result <- data.table(
scenario = c("True Utility", "No Label", "Warning Label", "Health Label", "Nutrition-Score Label"),
total_utility = c(
sum(chosen_true$utility),
sum(chosen$utility),
sum(chosen_wlabel$utility),
sum(chosen_hlabel$utility),
sum(chosen_nulabel$utility)
)
)
return(result)
}
# Run N simulations in parallel
run_multiple_simulations <- function(full_data_with_outside, N = 1000) {
# Run N simulations in parallel without progress reporting
# (removing progress reporting to avoid the error)
all_results <- future_lapply(1:N, function(i) {
result <- run_single_simulation(full_data_with_outside)
result$simulation <- i
return(result)
}, future.seed = TRUE)
# Combine all results
combined_results <- rbindlist(all_results)
# Calculate summary statistics by scenario (using data.table for speed)
summary_stats <- combined_results[, .(
mean_utility = mean(total_utility),
sd_utility = sd(total_utility),
n = .N,
se = sd(total_utility) / sqrt(.N),
lower_ci = mean(total_utility) - 1.96 * sd(total_utility) / sqrt(.N),
upper_ci = mean(total_utility) + 1.96 * sd(total_utility) / sqrt(.N)
), by = scenario]
# Calculate increases relative to "No Label"
baseline_utility <- summary_stats[scenario == "No Label", mean_utility]
summary_stats[, utility_increase := mean_utility - baseline_utility]
summary_stats[, percent_increase := 100 * (utility_increase / baseline_utility)]
return(summary_stats)
}
# Assuming full_data_with_outside is already loaded
# Run simulations with optimized code
results <- run_multiple_simulations(full_data_with_outside, N = 1000)
# Create the plot with error bars for confidence intervals
ggplot(results, aes(x = scenario, y = mean_utility)) +
geom_bar(stat = "identity", fill = "steelblue") +
geom_errorbar(aes(ymin = lower_ci, ymax = upper_ci), width = 0.2) +
geom_text(aes(label = ifelse(scenario == "No Label", "",
paste0("+", round(percent_increase, 1), "%"))),
vjust = -0.5, size = 5) +
labs(
title = "Total Consumer Utility Under Different Labeling Scenarios",
subtitle = "Mean values with 95% confidence intervals (N = 1000 simulations)",
x = "Scenario",
y = "Mean Total Utility"
) +
scale_y_continuous(expand = expansion(mult = c(0, 0.15))) +
theme_minimal() +
theme(
text = element_text(size = 14),
axis.text.x = element_text(angle = 15, hjust = 1)
)
print(results)
## Index: <scenario>
## scenario mean_utility sd_utility n se lower_ci
## <char> <num> <num> <int> <num> <num>
## 1: True Utility 32911.89 83.44955 1000 2.638907 32906.71
## 2: No Label 28170.89 116.26504 1000 3.676623 28163.69
## 3: Warning Label 30001.68 100.49229 1000 3.177845 29995.45
## 4: Health Label 31191.52 106.79153 1000 3.377045 31184.90
## 5: Nutrition-Score Label 32627.84 81.38229 1000 2.573534 32622.79
## upper_ci utility_increase percent_increase
## <num> <num> <num>
## 1: 32917.06 4740.993 16.82940
## 2: 28178.10 0.000 0.00000
## 3: 30007.91 1830.790 6.49887
## 4: 31198.14 3020.630 10.72252
## 5: 32632.88 4456.945 15.82110