Load Data

# Return the path of the most recently modified file in `directory` whose
# name matches the regular expression `pattern`.
#   directory: folder to search (not recursive).
#   pattern:   regular expression passed to list.files().
# Returns a length-one character vector (full path), or character(0) when
# nothing matches.
find_most_recent <- function(directory, pattern) {
  candidates <- list.files(directory, pattern = pattern, full.names = TRUE)
  modified_at <- file.info(candidates)$mtime
  newest <- which.max(modified_at)
  candidates[newest]
}

# Pick the most recently modified export whose file name matches "Consumer"
# and read it as the main survey data.
df_file <- find_most_recent(
  directory = "qualtrics_data/consumer_demand/main_data_v2",
  pattern = "Consumer"
)
print(paste0("Loading data from: ", df_file))
## [1] "Loading data from: qualtrics_data/consumer_demand/main_data_v2/USTM+-+Consumer+Demand_May+9,+2025_09.07.csv"
df_final <- read.csv(file = df_file, stringsAsFactors = FALSE)

# Comment database used later to validate which reviews were shown.
review_database <- read.csv(
  file = "sampled_comments_per_video_mode_balanced.csv",
  stringsAsFactors = FALSE
)

Data Pre-processing

# Keep only rows that satisfy all four conditions: StartDate (compared as a
# string) on or after 2025-05-08, Status "IP Address", Finished "True" and
# Consent "YES". Rows where any condition is NA are dropped as well.
df_final <- df_final %>%
  filter(
    StartDate >= "2025-05-08",
    Status == "IP Address",
    Finished == "True",
    Consent == "YES"
  )
# Choice1..Choice4 hold references of the form "${e://Field/<name>}".
# For every respondent, strip the wrapper to get <name> and replace the cell
# with the value stored in column <name> of that same row.
for (choice_col in paste0("Choice", 1:4)) {
  df_final[[choice_col]] <- mapply(
    function(row, col) row[[col]],
    split(df_final, seq(nrow(df_final))),
    gsub("^\\$\\{e://Field/([^}]+)\\}$", "\\1", df_final[[choice_col]])
  )
}
# Convert age to numeric and turn the categorical survey answers into factors
# with an explicit level order. Answers that are not among the listed levels
# become NA.
df_final <- df_final %>%
  mutate(
    age = as.numeric(age),
    social_media_use = factor(
      social_media_use,
      levels = c("1 hour or less", "1-3 hours", "3-5 hours", "5+ hours")
    ),
    website_use = factor(
      website_use,
      levels = c("1 hour or less", "1-3 hours", "3-5 hours", "5+ hours")
    ),
    edu = factor(
      edu,
      levels = c(
        "Did not graduate from high school",
        "High school graduate (high school diploma or equivalent including GED)",
        "Some college, but no degree",
        "2-year college degree",
        "4-year college degree",
        "Postgraduate degree (MA, MBA, JD, PhD, etc.)"
      )
    ),
    polparty = factor(
      polparty,
      levels = c("Democrat", "Republican", "Independent", "Other Party")
    ),
    libcons = factor(
      libcons,
      levels = c(
        "Strong Conservative", "Moderate Conservative", "Moderate",
        "Moderate Liberal", "Strong Liberal"
      )
    ),
    income = factor(
      income,
      levels = c(
        "Prefer not to say", "Less than $10,000", "$10,000-$49,999",
        "$50,000-$99,999", "$100,000-$149,999", "$150,000 or more"
      )
    ),
    social_media_reply = factor(
      social_media_reply,
      levels = c(
        "Never", "Few times a year", "Few times a month",
        "Few times a week", "1-2 times per day", "More than 4 times per day"
      )
    ),
    review_freq = factor(
      review_freq,
      levels = c(
        "Never",
        "Rarely (1 - 20% of the time)",
        "Occasionally (21 - 40% of the time)",
        "Sometimes (41 - 60% of the time)",
        "Often (61 - 80% of the time)",
        "Very often (81 - 100% of the time)"
      )
    ),
    read_review_freq = factor(
      read_review_freq,
      levels = c(
        "Never",
        "Rarely (1 - 20% of the time)",
        "Occasionally (21 - 40% of the time)",
        "Sometimes (41 - 60% of the time)",
        "Often (61 - 80% of the time)",
        "Very often (81 - 100% of the time)"
      )
    ),
    review_essential = factor(
      review_essential,
      levels = c(
        "Not important at all", "Somewhat not important", "Neutral",
        "Somewhat important", "Very important"
      )
    ),
    AIreview_use = factor(
      AIreview_use,
      levels = c(
        "Much less frequently", "Less frequently", "About the same",
        "More frequently", "Much more frequently"
      )
    )
  )
# social media
# One 0/1 indicator per platform, set to 1 when the platform's label occurs
# anywhere in the free-form `social_media` answer string.
# "X (formerly Twitter)" has to be matched literally: read as a regular
# expression the parentheses form a group, so the pattern would look for
# "X formerly Twitter" and never match the actual answer text.
df_final$social_media_X <- ifelse(str_detect(df_final$social_media, stringr::fixed("X (formerly Twitter)")), 1, 0)
df_final$social_media_FB <- ifelse(str_detect(df_final$social_media, "Facebook"), 1, 0)
df_final$social_media_IG <- ifelse(str_detect(df_final$social_media, "Instagram"), 1, 0)
df_final$social_media_LI <- ifelse(str_detect(df_final$social_media, "LinkedIn"), 1, 0)
df_final$social_media_SN <- ifelse(str_detect(df_final$social_media, "Snapchat"), 1, 0)
df_final$social_media_TK <- ifelse(str_detect(df_final$social_media, "TikTok"), 1, 0)
df_final$social_media_YT <- ifelse(str_detect(df_final$social_media, "YouTube"), 1, 0)
df_final$social_media_nonUser <- ifelse(str_detect(df_final$social_media, "not active"), 1, 0)
# 1 when at least one of X, Facebook, Instagram, LinkedIn, Snapchat or TikTok
# is selected. YouTube is not part of this indicator.
# NOTE(review): the YouTube exclusion looks intentional (it has its own
# covariate) -- confirm.
df_final$social_media_user <- ifelse((df_final$social_media_X == 1) | (df_final$social_media_FB == 1) | (df_final$social_media_IG == 1) | (df_final$social_media_LI == 1) | (df_final$social_media_SN == 1) | (df_final$social_media_TK == 1) , 1, 0)

# social media use time
# Ordinal 1-4 score plus one 0/1 dummy per answer category; NA answers stay NA.
df_final <- df_final %>%
  mutate(
    social_media_use_numeric = case_when(
      social_media_use == "1 hour or less" ~ 1,
      social_media_use == "1-3 hours" ~ 2,
      social_media_use == "3-5 hours" ~ 3,
      social_media_use == "5+ hours" ~ 4
    ),
    social_media_use_1 = ifelse(social_media_use == "1 hour or less", 1, 0),
    social_media_use_13 = ifelse(social_media_use == "1-3 hours", 1, 0),
    social_media_use_35 = ifelse(social_media_use == "3-5 hours", 1, 0),
    social_media_use_5 = ifelse(social_media_use == "5+ hours", 1, 0)
  )

# website use
# Ordinal 1-4 score plus one 0/1 dummy per answer category; NA answers stay NA.
df_final <- df_final %>%
  mutate(
    website_use_numeric = case_when(
      website_use == "1 hour or less" ~ 1,
      website_use == "1-3 hours" ~ 2,
      website_use == "3-5 hours" ~ 3,
      website_use == "5+ hours" ~ 4
    ),
    website_use_1 = ifelse(website_use == "1 hour or less", 1, 0),
    website_use_13 = ifelse(website_use == "1-3 hours", 1, 0),
    website_use_35 = ifelse(website_use == "3-5 hours", 1, 0),
    website_use_5 = ifelse(website_use == "5+ hours", 1, 0)
  )

# gender and race
# 0/1 dummies. raceOther is 1 for every value that is not one of the four
# named categories, which includes missing answers.
df_final <- df_final %>%
  mutate(
    # gender
    genderFemale = ifelse(gender == "Female", 1, 0),
    # race
    raceAsian = ifelse(race == "Asian/Pacific Islander", 1, 0),
    raceBlack = ifelse(race == "Black or African American", 1, 0),
    raceHispanic = ifelse(race == "Latino or Hispanic", 1, 0),
    raceWhite = ifelse(race == "Caucasian/White", 1, 0),
    raceOther = ifelse(
      race %in% c(
        "Asian/Pacific Islander", "Black or African American",
        "Latino or Hispanic", "Caucasian/White"
      ),
      0, 1
    )
  )

# education and political party
# Education is collapsed into four 0/1 groups (2-year and 4-year degrees share
# eduBachelor). polpartyOther is 1 for everything that is neither Democrat nor
# Republican, so it also covers Independent and missing answers.
df_final <- df_final %>%
  mutate(
    # education
    eduHighSchoolOrLess = ifelse(
      edu %in% c(
        "Did not graduate from high school",
        "High school graduate (high school diploma or equivalent including GED)"
      ),
      1, 0
    ),
    eduSomeCollege = ifelse(edu == "Some college, but no degree", 1, 0),
    eduBachelor = ifelse(
      edu %in% c("2-year college degree", "4-year college degree"),
      1, 0
    ),
    eduPostGrad = ifelse(edu == "Postgraduate degree (MA, MBA, JD, PhD, etc.)", 1, 0),
    # political party
    polpartyDem = ifelse(polparty == "Democrat", 1, 0),
    polpartyRep = ifelse(polparty == "Republican", 1, 0),
    polpartyInd = ifelse(polparty == "Independent", 1, 0),
    polpartyOther = ifelse(polparty %in% c("Democrat", "Republican"), 0, 1)
  )

# political ideology and income
df_final <- df_final %>%
  mutate(
    # political ideology: 1 = Strong Conservative ... 5 = Strong Liberal
    libcons_numeric = case_when(
      libcons == "Strong Liberal" ~ 5,
      libcons == "Moderate Liberal" ~ 4,
      libcons == "Moderate" ~ 3,
      libcons == "Moderate Conservative" ~ 2,
      libcons == "Strong Conservative" ~ 1
    ),
    # income: 1-5 for the five brackets; any other value ("Prefer not to say",
    # missing) is coded 0 and marked by income_flag
    income_numeric = case_when(
      income == "Less than $10,000" ~ 1,
      income == "$10,000-$49,999" ~ 2,
      income == "$50,000-$99,999" ~ 3,
      income == "$100,000-$149,999" ~ 4,
      income == "$150,000 or more" ~ 5,
      TRUE ~ 0
    ),
    income_flag = ifelse(income_numeric == 0, 1, 0)
  )

# social media reply
# Ordinal 1-6 score plus one 0/1 dummy per answer category; NA answers stay NA.
df_final <- df_final %>%
  mutate(
    social_media_reply_numeric = case_when(
      social_media_reply == "Never" ~ 1,
      social_media_reply == "Few times a year" ~ 2,
      social_media_reply == "Few times a month" ~ 3,
      social_media_reply == "Few times a week" ~ 4,
      social_media_reply == "1-2 times per day" ~ 5,
      social_media_reply == "More than 4 times per day" ~ 6
    ),
    social_media_reply_never = ifelse(social_media_reply == "Never", 1, 0),
    social_media_reply_fewayear = ifelse(social_media_reply == "Few times a year", 1, 0),
    social_media_reply_fewamonth = ifelse(social_media_reply == "Few times a month", 1, 0),
    social_media_reply_fewaweek = ifelse(social_media_reply == "Few times a week", 1, 0),
    social_media_reply_12times = ifelse(social_media_reply == "1-2 times per day", 1, 0),
    social_media_reply_4times = ifelse(social_media_reply == "More than 4 times per day", 1, 0)
  )

# review frequency
# Ordinal 1-6 score plus one 0/1 dummy per answer category; NA answers stay NA.
df_final <- df_final %>%
  mutate(
    review_freq_numeric = case_when(
      review_freq == "Never" ~ 1,
      review_freq == "Rarely (1 - 20% of the time)" ~ 2,
      review_freq == "Occasionally (21 - 40% of the time)" ~ 3,
      review_freq == "Sometimes (41 - 60% of the time)" ~ 4,
      review_freq == "Often (61 - 80% of the time)" ~ 5,
      review_freq == "Very often (81 - 100% of the time)" ~ 6
    ),
    review_freq_never = ifelse(review_freq == "Never", 1, 0),
    review_freq_rarely = ifelse(review_freq == "Rarely (1 - 20% of the time)", 1, 0),
    review_freq_occasionally = ifelse(review_freq == "Occasionally (21 - 40% of the time)", 1, 0),
    review_freq_sometimes = ifelse(review_freq == "Sometimes (41 - 60% of the time)", 1, 0),
    review_freq_often = ifelse(review_freq == "Often (61 - 80% of the time)", 1, 0),
    review_freq_veryoften = ifelse(review_freq == "Very often (81 - 100% of the time)", 1, 0)
  )

# read review frequency
# Ordinal 1-6 score plus one 0/1 dummy per answer category; NA answers stay NA.
df_final <- df_final %>%
  mutate(
    read_review_freq_numeric = case_when(
      read_review_freq == "Never" ~ 1,
      read_review_freq == "Rarely (1 - 20% of the time)" ~ 2,
      read_review_freq == "Occasionally (21 - 40% of the time)" ~ 3,
      read_review_freq == "Sometimes (41 - 60% of the time)" ~ 4,
      read_review_freq == "Often (61 - 80% of the time)" ~ 5,
      read_review_freq == "Very often (81 - 100% of the time)" ~ 6
    ),
    read_review_freq_never = ifelse(read_review_freq == "Never", 1, 0),
    read_review_freq_rarely = ifelse(read_review_freq == "Rarely (1 - 20% of the time)", 1, 0),
    read_review_freq_occasionally = ifelse(read_review_freq == "Occasionally (21 - 40% of the time)", 1, 0),
    read_review_freq_sometimes = ifelse(read_review_freq == "Sometimes (41 - 60% of the time)", 1, 0),
    read_review_freq_often = ifelse(read_review_freq == "Often (61 - 80% of the time)", 1, 0),
    read_review_freq_veryoften = ifelse(read_review_freq == "Very often (81 - 100% of the time)", 1, 0)
  )

# review essential
# Ordinal 1-5 score plus one 0/1 dummy per answer category; NA answers stay NA.
df_final <- df_final %>%
  mutate(
    review_essential_numeric = case_when(
      review_essential == "Not important at all" ~ 1,
      review_essential == "Somewhat not important" ~ 2,
      review_essential == "Neutral" ~ 3,
      review_essential == "Somewhat important" ~ 4,
      review_essential == "Very important" ~ 5
    ),
    review_essential_notimportant = ifelse(review_essential == "Not important at all", 1, 0),
    review_essential_somewhatnotimportant = ifelse(review_essential == "Somewhat not important", 1, 0),
    review_essential_neutral = ifelse(review_essential == "Neutral", 1, 0),
    review_essential_somewhatimportant = ifelse(review_essential == "Somewhat important", 1, 0),
    review_essential_veryimportant = ifelse(review_essential == "Very important", 1, 0)
  )

# AIreview use
# Ordinal 1-5 score plus one 0/1 dummy per answer category; NA answers stay NA.
df_final <- df_final %>%
  mutate(
    AIreview_use_numeric = case_when(
      AIreview_use == "Much less frequently" ~ 1,
      AIreview_use == "Less frequently" ~ 2,
      AIreview_use == "About the same" ~ 3,
      AIreview_use == "More frequently" ~ 4,
      AIreview_use == "Much more frequently" ~ 5
    ),
    AIreview_use_muchlessfrequently = ifelse(AIreview_use == "Much less frequently", 1, 0),
    AIreview_use_lessfrequently = ifelse(AIreview_use == "Less frequently", 1, 0),
    AIreview_use_aboutthesame = ifelse(AIreview_use == "About the same", 1, 0),
    AIreview_use_morefrequently = ifelse(AIreview_use == "More frequently", 1, 0),
    AIreview_use_muchmorefrequently = ifelse(AIreview_use == "Much more frequently", 1, 0)
  )



# Names of the raw and derived covariate columns.
# NOTE(review): "race", "social_media_user", "read_review_freq",
# "review_essential" and "AIreview_use" are not listed here -- confirm whether
# that is intended.
covariates_all <- c(
  # raw survey answers
  "social_media_use", "website_use", "gender", "age", "edu", "polparty",
  "libcons", "income", "social_media_reply", "review_freq",
  # social media platforms
  "social_media_X", "social_media_FB", "social_media_IG", "social_media_LI",
  "social_media_SN", "social_media_TK", "social_media_YT",
  "social_media_nonUser",
  # social media / website usage
  "social_media_use_numeric", "social_media_use_1", "social_media_use_13",
  "social_media_use_35", "social_media_use_5",
  "website_use_numeric", "website_use_1", "website_use_13", "website_use_35",
  "website_use_5",
  # demographics
  "genderFemale",
  "raceAsian", "raceBlack", "raceHispanic", "raceWhite", "raceOther",
  "eduHighSchoolOrLess", "eduSomeCollege", "eduBachelor", "eduPostGrad",
  "polpartyDem", "polpartyRep", "polpartyInd", "polpartyOther",
  "libcons_numeric", "income_numeric", "income_flag",
  # social media reply frequency
  "social_media_reply_numeric", "social_media_reply_never",
  "social_media_reply_fewayear", "social_media_reply_fewamonth",
  "social_media_reply_fewaweek", "social_media_reply_12times",
  "social_media_reply_4times",
  # review writing frequency
  "review_freq_numeric", "review_freq_never", "review_freq_rarely",
  "review_freq_occasionally", "review_freq_sometimes", "review_freq_often",
  "review_freq_veryoften",
  # review reading frequency
  "read_review_freq_numeric", "read_review_freq_never",
  "read_review_freq_rarely", "read_review_freq_occasionally",
  "read_review_freq_sometimes", "read_review_freq_often",
  "read_review_freq_veryoften",
  # review importance
  "review_essential_numeric", "review_essential_notimportant",
  "review_essential_somewhatnotimportant", "review_essential_neutral",
  "review_essential_somewhatimportant", "review_essential_veryimportant",
  # hypothetical AI review usage
  "AIreview_use_numeric", "AIreview_use_muchlessfrequently",
  "AIreview_use_lessfrequently", "AIreview_use_aboutthesame",
  "AIreview_use_morefrequently", "AIreview_use_muchmorefrequently"
)

# Reduced covariate set (column names).
covariates_simple <- c(
  "age",
  "social_media_YT", "social_media_nonUser", "social_media_user",
  "social_media_use_numeric", "website_use_numeric",
  "genderFemale",
  "raceAsian", "raceBlack", "raceHispanic", "raceWhite", "raceOther",
  "eduHighSchoolOrLess", "eduSomeCollege", "eduBachelor", "eduPostGrad",
  "polpartyDem", "polpartyRep", "polpartyOther",
  "libcons_numeric", "income_numeric",
  "social_media_reply_numeric",
  "review_freq_numeric", "read_review_freq_numeric",
  "review_essential_numeric", "AIreview_use_numeric"
)

# Display labels for covariates_simple, in the same order (one per entry).
covariates_simple_fancy <- c(
  "Age",
  "YouTube User", "Social Media: Non-User", "Social Media: User",
  "Social Media Usage (1 - 4 Scale)", "Online Usage (1 - 4 Scale)",
  "Female",
  "Race: Asian", "Race: Black", "Race: Hispanic", "Race: White",
  "Race: Other",
  "Education: High School or Less", "Education: Some College",
  "Education: Bachelor", "Education: Postgraduate",
  "Political Party: Democrat", "Political Party: Republican",
  "Political Party: Other",
  "Political Ideology (1 - 5 Scale; 5 Strong Liberal)",
  "Income (1 - 5 Scale)",
  "Social Media Reply Frequency (1 - 6 Scale)",
  "Review Frequency (1 - 6 Scale)", "Read Review Frequency (1 - 6 Scale)",
  "Review Importance (1 - 5 Scale)",
  "AI Review Hypothetical Usage (1 - 5 Scale)"
)
# Add a standardized copy (mean 0, sd 1) of every covariate in
# covariates_simple, named "<covariate>_cov".
# scale() returns a one-column matrix carrying "scaled:center" and
# "scaled:scale" attributes; as.numeric() turns it back into a plain numeric
# vector so the new columns are ordinary data-frame columns, not matrix columns.
df_final <- df_final %>%
  mutate(across(.cols = all_of(covariates_simple),
                .fns = ~ as.numeric(scale(.x, center = TRUE, scale = TRUE)),
                .names = "{.col}_cov"))

# Names of the standardized covariate columns ("<covariate>_cov").
covariates_simple_demeaned <- paste(covariates_simple, "cov", sep = "_")
# Formula terms for the covariate-by-treatment interaction.
covariates_simple_demeaned_treatment <- paste(
  covariates_simple_demeaned, "treatment",
  sep = " * "
)
# Display labels for those interactions: two per covariate, the
# "Mixed Reviews" label first and the "All AI Reviews" label second.
covariates_simple_demeaned_treatment_fancy <- paste0(
  c("Treatment Mixed Reviews * ", "Treatment All AI Reviews * "),
  rep(covariates_simple_fancy, each = 2)
)
# Lookup table from grouping id (stored as the vector's names) to video title.
grouping_map <- setNames(
  c(
    "Coin Operated", "Crook$", "Forever Sleep", "Soft Rain",
    "One-Minute Time Machine", "Alternative Math", "Radical Honesty",
    "Different", "The Cook", "Skipped", "Boom", "French Roast"
  ),
  c("17", "22", "20", "16", "23", "14", "11", "21", "18", "19", "13", "15")
)

# Mapping: grouping → title
# Looks up each grouping id in grouping_map and returns the matching titles
# as an unnamed character vector; ids without an entry give NA.
map_grouping_to_title <- function(grouping_vector) {
  ids <- as.character(grouping_vector)
  titles <- grouping_map[ids]
  names(titles) <- NULL
  titles
}

# Reverse mapping: title → grouping
# Inverts grouping_map (titles become the names, ids the values) and returns
# the integer grouping id for each title; unknown titles give NA.
map_title_to_grouping <- function(title_vector) {
  ids_by_title <- names(grouping_map)
  names(ids_by_title) <- grouping_map
  as.integer(ids_by_title[title_vector])
}
# Replace each mech_* answer with the first run of digits found in its text,
# converted to numeric (NA when the text contains no digit).
df_final <- df_final %>%
  mutate(across(
    .cols = c(mech_breadth, mech_length, mech_detail, mech_tone, mech_valence),
    .fns = ~ as.numeric(str_extract(.x, "\\d+"))
  ))
# makeCodebook(df_final[, covariates_all], replace = TRUE,
#              reportTitle = 'Covariates Summary', # change this with final version
#              file = 'processed_final_data/codebook_covariates_consumer_demand.Rmd') # change this with final version

Check for Potential Errors

If there are any errors in the data, they will be printed out. Otherwise, nothing will be printed.

# Accumulator for the ResponseId of every respondent that fails a check below.
error_responseId <- NULL

Check Video Randomization

TODO: ALSO NEED TO CHECK WHETHER EACH VIDEO ON AVERAGE GETS RANDOMIZED INTO DIFFERENT POSITIONS

# Check that one respondent was shown 12 distinct video titles.
#   row: row index into the global df_final.
# Collects the values of all columns named video...Title for that row and
# returns TRUE when they contain exactly 12 unique values.
check_video_randomization <- function(row){
  titles_in_row <- df_final %>%
    dplyr::slice(row) %>%
    dplyr::select(starts_with("video")) %>%
    dplyr::select(ends_with("Title"))

  titles <- as.character(unlist(titles_in_row))
  n_distinct_titles <- length(unique(titles))

  n_distinct_titles == 12
}

# Record the ResponseId of every row that fails the video-title check.
# seq_len() yields an empty sequence for an empty data frame, whereas
# 1:nrow() would iterate over rows 1 and 0.
for (i in seq_len(nrow(df_final))) {
  if (!check_video_randomization(i)) {
    print(paste("Video randomization error in row", i))
    error_responseId <- c(error_responseId, df_final$ResponseId[i])
  }
}
## [1] "Video randomization error in row 358"
## [1] "Video randomization error in row 607"

Check Condition Randomization

TODO: ALSO NEED TO CHECK WHETHER EACH CONDITION ON AVERAGE GETS RANDOMIZED INTO DIFFERENT POSITIONS

# Check that, within each of the four choice sets (videos 1-3, 4-6, 7-9 and
# 10-12), the three video...Cond values of one respondent are all different.
#   row: row index into the global df_final.
# Returns TRUE only when every choice set has exactly three distinct values.
# The values themselves are not compared against allAi / allNonAi / half.
check_condition_randomization <- function(row){
  conds_in_row <- df_final %>%
    dplyr::slice(row) %>%
    dplyr::select(starts_with("video")) %>%
    dplyr::select(ends_with("Cond"))

  # TRUE when the three videos at positions `idx` carry three distinct values
  has_three_distinct <- function(idx) {
    set_cols <- paste0("video", idx, "Cond")
    length(unique(unlist(conds_in_row[, set_cols]))) == 3
  }

  set_ok <- vapply(
    list(1:3, 4:6, 7:9, 10:12),
    has_three_distinct,
    logical(1)
  )

  all(set_ok)
}

# Record the ResponseId of every row that fails the condition check.
# seq_len() yields an empty sequence for an empty data frame, whereas
# 1:nrow() would iterate over rows 1 and 0.
for (i in seq_len(nrow(df_final))) {
  if (!check_condition_randomization(i)) {
    print(paste("Condition randomization error in row", i))
    error_responseId <- c(error_responseId, df_final$ResponseId[i])
  }
}
## [1] "Condition randomization error in row 358"
## [1] "Condition randomization error in row 607"

Check Review Randomization

# Check that the comments shown for every video of one respondent agree with
# that video's assigned condition.
#   row: row index into the global df_final.
# For each video...CommentsShown column the "||||"-separated comments are
# looked up in review_database (same Video.Id, Content among the comments).
# A video passes when exactly 4 database rows match and their mode_label
# values fit the condition:
#   allAi    - none of the labels is 1 or 2
#   allNonAi - none of the labels is 3 or 4
#   half     - two labels in {1, 2} and two in {3, 4}
# Any other or missing condition value counts as a failure.
# Returns TRUE only when every video passes. A message is printed for each
# video whose comments do not match exactly 4 database rows.
check_review_randomization <- function(row){
  video_cols <- df_final %>%
    dplyr::slice(row) %>%
    dplyr::select(starts_with("video"))

  review <- video_cols %>% dplyr::select(ends_with("CommentsShown"))
  condition <- video_cols %>% dplyr::select(ends_with("Cond"))
  video_title <- video_cols %>% dplyr::select(ends_with("Title"))

  # One flag per video; a video stays FALSE unless it is shown to be valid.
  review_ok <- logical(length(review))

  for (j in seq_along(review)) {
    # separate by ||||
    re <- strsplit(review[[j]], "\\|\\|\\|\\|")[[1]]
    video <- video_title[[j]]
    cond <- condition[[j]]

    videoID <- map_title_to_grouping(video)

    review_database_j <- review_database %>%
      filter(Video.Id == videoID) %>%
      filter(Content %in% re)

    if (nrow(review_database_j) != 4) {
      print(paste("Review randomization error in row", row, "for video", j))
      next
    }

    n_label_12 <- sum(review_database_j$mode_label %in% c(1, 2))
    n_label_34 <- sum(review_database_j$mode_label %in% c(3, 4))

    # isTRUE() keeps a missing condition from aborting the whole check.
    if (isTRUE(cond == "allAi")) {
      review_ok[j] <- n_label_12 == 0
    } else if (isTRUE(cond == "allNonAi")) {
      review_ok[j] <- n_label_34 == 0
    } else if (isTRUE(cond == "half")) {
      review_ok[j] <- n_label_12 == 2 && n_label_34 == 2
    }
  }

  all(review_ok)
}

# Record the ResponseId of every row that fails the review check.
# seq_len() yields an empty sequence for an empty data frame, whereas
# 1:nrow() would iterate over rows 1 and 0.
for (i in seq_len(nrow(df_final))) {
  if (!check_review_randomization(i)) {
    print(paste("Review randomization error in row", i))
    error_responseId <- c(error_responseId, df_final$ResponseId[i])
  }
}
## [1] "Review randomization error in row 168 for video 1"
## [1] "Review randomization error in row 168 for video 2"
## [1] "Review randomization error in row 168 for video 3"
## [1] "Review randomization error in row 168 for video 4"
## [1] "Review randomization error in row 168 for video 5"
## [1] "Review randomization error in row 168 for video 6"
## [1] "Review randomization error in row 168 for video 7"
## [1] "Review randomization error in row 168 for video 8"
## [1] "Review randomization error in row 168 for video 9"
## [1] "Review randomization error in row 168 for video 10"
## [1] "Review randomization error in row 168 for video 11"
## [1] "Review randomization error in row 168 for video 12"
## [1] "Review randomization error in row 168"
## [1] "Review randomization error in row 194 for video 1"
## [1] "Review randomization error in row 194 for video 2"
## [1] "Review randomization error in row 194 for video 3"
## [1] "Review randomization error in row 194 for video 4"
## [1] "Review randomization error in row 194 for video 5"
## [1] "Review randomization error in row 194 for video 6"
## [1] "Review randomization error in row 194 for video 7"
## [1] "Review randomization error in row 194 for video 8"
## [1] "Review randomization error in row 194 for video 9"
## [1] "Review randomization error in row 194 for video 10"
## [1] "Review randomization error in row 194 for video 11"
## [1] "Review randomization error in row 194 for video 12"
## [1] "Review randomization error in row 194"
## [1] "Review randomization error in row 358 for video 1"
## [1] "Review randomization error in row 358 for video 2"
## [1] "Review randomization error in row 358 for video 3"
## [1] "Review randomization error in row 358 for video 4"
## [1] "Review randomization error in row 358 for video 5"
## [1] "Review randomization error in row 358 for video 6"
## [1] "Review randomization error in row 358 for video 7"
## [1] "Review randomization error in row 358 for video 8"
## [1] "Review randomization error in row 358 for video 9"
## [1] "Review randomization error in row 358 for video 10"
## [1] "Review randomization error in row 358 for video 11"
## [1] "Review randomization error in row 358 for video 12"
## [1] "Review randomization error in row 358"
## [1] "Review randomization error in row 428 for video 1"
## [1] "Review randomization error in row 428 for video 2"
## [1] "Review randomization error in row 428 for video 3"
## [1] "Review randomization error in row 428 for video 4"
## [1] "Review randomization error in row 428 for video 5"
## [1] "Review randomization error in row 428 for video 6"
## [1] "Review randomization error in row 428 for video 7"
## [1] "Review randomization error in row 428 for video 8"
## [1] "Review randomization error in row 428 for video 9"
## [1] "Review randomization error in row 428 for video 10"
## [1] "Review randomization error in row 428 for video 11"
## [1] "Review randomization error in row 428 for video 12"
## [1] "Review randomization error in row 428"
## [1] "Review randomization error in row 488 for video 1"
## [1] "Review randomization error in row 488 for video 2"
## [1] "Review randomization error in row 488 for video 3"
## [1] "Review randomization error in row 488 for video 4"
## [1] "Review randomization error in row 488 for video 5"
## [1] "Review randomization error in row 488 for video 6"
## [1] "Review randomization error in row 488 for video 7"
## [1] "Review randomization error in row 488 for video 8"
## [1] "Review randomization error in row 488 for video 9"
## [1] "Review randomization error in row 488 for video 10"
## [1] "Review randomization error in row 488 for video 11"
## [1] "Review randomization error in row 488 for video 12"
## [1] "Review randomization error in row 488"
## [1] "Review randomization error in row 572 for video 1"
## [1] "Review randomization error in row 572 for video 2"
## [1] "Review randomization error in row 572 for video 3"
## [1] "Review randomization error in row 572 for video 4"
## [1] "Review randomization error in row 572 for video 5"
## [1] "Review randomization error in row 572 for video 6"
## [1] "Review randomization error in row 572 for video 7"
## [1] "Review randomization error in row 572 for video 8"
## [1] "Review randomization error in row 572 for video 9"
## [1] "Review randomization error in row 572 for video 10"
## [1] "Review randomization error in row 572 for video 11"
## [1] "Review randomization error in row 572 for video 12"
## [1] "Review randomization error in row 572"
## [1] "Review randomization error in row 607 for video 1"
## [1] "Review randomization error in row 607 for video 2"
## [1] "Review randomization error in row 607 for video 3"
## [1] "Review randomization error in row 607 for video 4"
## [1] "Review randomization error in row 607 for video 5"
## [1] "Review randomization error in row 607 for video 6"
## [1] "Review randomization error in row 607 for video 7"
## [1] "Review randomization error in row 607 for video 8"
## [1] "Review randomization error in row 607 for video 9"
## [1] "Review randomization error in row 607 for video 10"
## [1] "Review randomization error in row 607 for video 11"
## [1] "Review randomization error in row 607 for video 12"
## [1] "Review randomization error in row 607"
## [1] "Review randomization error in row 706 for video 4"
## [1] "Review randomization error in row 706 for video 5"
## [1] "Review randomization error in row 706 for video 6"
## [1] "Review randomization error in row 706 for video 7"
## [1] "Review randomization error in row 706 for video 8"
## [1] "Review randomization error in row 706 for video 9"
## [1] "Review randomization error in row 706 for video 10"
## [1] "Review randomization error in row 706 for video 11"
## [1] "Review randomization error in row 706 for video 12"
## [1] "Review randomization error in row 706"
## [1] "Review randomization error in row 758 for video 4"
## [1] "Review randomization error in row 758 for video 5"
## [1] "Review randomization error in row 758 for video 6"
## [1] "Review randomization error in row 758 for video 7"
## [1] "Review randomization error in row 758 for video 8"
## [1] "Review randomization error in row 758 for video 9"
## [1] "Review randomization error in row 758 for video 10"
## [1] "Review randomization error in row 758 for video 11"
## [1] "Review randomization error in row 758 for video 12"
## [1] "Review randomization error in row 758"

Check Multiple Choice Randomization

# Check that each of a respondent's four choices is one of the three video
# titles shown in the corresponding choice set.
#   row: row index into the global df_final.
# Choice columns are those whose name ends in "Choice" plus one digit; the
# k-th of them is compared with the titles at positions 3k-2 .. 3k.
# Returns TRUE only when all four choices are found in their own set.
check_mc_randomization <- function(row){
  choice_names <- grep("Choice[0-9]$", names(df_final), value = TRUE)

  mc <- df_final %>%
    dplyr::slice(row) %>%
    dplyr::select(all_of(choice_names))

  video_title <- df_final %>%
    dplyr::slice(row) %>%
    dplyr::select(starts_with("video")) %>%
    dplyr::select(ends_with("Title"))

  choice_in_own_set <- vapply(
    1:4,
    function(k) {
      shown <- video_title[(3 * k - 2):(3 * k)]
      mc[k] %in% shown
    },
    logical(1)
  )

  all(choice_in_own_set)
}

# Record the ResponseId of every row that fails the multiple-choice check.
# seq_len() yields an empty sequence for an empty data frame, whereas
# 1:nrow() would iterate over rows 1 and 0.
for (i in seq_len(nrow(df_final))) {
  if (!check_mc_randomization(i)) {
    print(paste("Multiple choice randomization error in row", i))
    error_responseId <- c(error_responseId, df_final$ResponseId[i])
  }
}
# Drop every respondent flagged by any of the checks above.
df_final <- df_final %>% filter(!ResponseId %in% error_responseId)

Check for Quick Responses

# Respondents who submitted at least one timed page within 10 seconds.
# All columns ending in "time_Page.Submit" except
# AIreview_use_time_Page.Submit are converted to numeric, and
# quick_response_count is the number of them that are <= 10.
# A row with a missing timing gets an NA count and is dropped by the filter.
quick_responses <- df_final %>%
  dplyr::select(
    ProlificID, ResponseId,
    ends_with("time_Page.Submit"),
    -AIreview_use_time_Page.Submit
  ) %>%
  dplyr::mutate(across(ends_with("time_Page.Submit"), as.numeric)) %>%
  # the first two columns are the ids; everything after them is a timing
  dplyr::mutate(quick_response_count = rowSums(.[, -c(1:2)] <= 10)) %>%
  dplyr::filter(quick_response_count > 0) %>%
  arrange(desc(quick_response_count)) %>%
  dplyr::select(
    ProlificID, quick_response_count,
    ends_with("time_Page.Submit"), ResponseId
  )

quick_responses
# Every flagged respondent.
quick_response_ids <- quick_responses$ResponseId
# Flagged respondents with at most two quick pages.
# NOTE(review): confirm how the "loose" list is meant to be used downstream.
quick_response_ids_loose <- quick_responses %>%
  filter(quick_response_count <= 2) %>%
  dplyr::pull(ResponseId)

Reshape to Long Format

# Reshape respondent-level data to one row per respondent x choice set
# (4 rows per respondent).
#
# Each output row carries the respondent's covariates (every column whose
# name does not start with "Choice" or "video"), the choice-set number, the
# chosen title, the three titles / shown comments / conditions of that set
# (renamed to video1..video3), and `treatment`: the condition of the video
# whose title equals the recorded choice.
#
# The data are assembled one choice set at a time instead of one cell at a
# time, and then put back in respondent-major order. A choice that is NA or
# matches none of the three titles yields `treatment = NA` rather than an
# error; if several titles match, the first one is used.
#
# Args:
#   df: respondent-level data frame with columns Choice1..Choice4 and
#       video<k>Title, video<k>CommentsShown, video<k>Cond for k = 1..12.
#
# Returns:
#   A data frame with 4 * nrow(df) rows, ordered by respondent and then by
#   choice set.
reshape_video_choice_data <- function(df) {
  n_sets <- 4L
  set_size <- 3L
  n_resp <- nrow(df)

  # Covariate columns (shared across all rows for a respondent)
  covariate_cols <- setdiff(
    names(df),
    grep("^Choice|^video", names(df), value = TRUE)
  )
  covariates <- df[, covariate_cols, drop = FALSE]

  per_set <- lapply(seq_len(n_sets), function(choice_num) {
    # Indices for the three videos in this choice set
    video_indices <- ((choice_num - 1L) * set_size + 1L):(choice_num * set_size)
    titles <- df[, paste0("video", video_indices, "Title"), drop = FALSE]
    comments <- df[, paste0("video", video_indices, "CommentsShown"),
                   drop = FALSE]
    conds <- df[, paste0("video", video_indices, "Cond"), drop = FALSE]
    choice_val <- df[[paste0("Choice", choice_num)]]

    # Position (1-3) of the chosen video within the set; NA when nothing
    # matches. Iterating in reverse lets the first matching title win.
    chosen_pos <- rep(NA_integer_, n_resp)
    for (j in rev(seq_len(set_size))) {
      is_hit <- !is.na(choice_val) & !is.na(titles[[j]]) &
        titles[[j]] == choice_val
      chosen_pos[is_hit] <- j
    }

    # Condition of the chosen video, starting from a typed NA vector
    treatment <- conds[[1L]][rep(NA_integer_, n_resp)]
    for (j in seq_len(set_size)) {
      sel <- which(chosen_pos == j)
      treatment[sel] <- conds[[j]][sel]
    }

    cbind(
      covariates,
      data.frame(
        choice_set = rep(choice_num, n_resp),
        choice = choice_val,
        video1Title = titles[[1L]],
        video2Title = titles[[2L]],
        video3Title = titles[[3L]],
        video1CommentsShown = comments[[1L]],
        video2CommentsShown = comments[[2L]],
        video3CommentsShown = comments[[3L]],
        video1Cond = conds[[1L]],
        video2Cond = conds[[2L]],
        video3Cond = conds[[3L]],
        treatment = treatment
      )
    )
  })

  # Stack the four blocks and restore respondent-major row order
  stacked <- do.call(rbind, per_set)
  resp_index <- rep(seq_len(n_resp), times = n_sets)
  reshaped_df <- stacked[order(resp_index, stacked$choice_set), , drop = FALSE]
  rownames(reshaped_df) <- NULL
  reshaped_df
}

# note: should clear out the people who finished too quickly first.
# One row per respondent x choice set.
df_long_4 <- reshape_video_choice_data(df = df_final)
# `choice` (the chosen title) is replaced by the chosen video's condition,
# stored as a factor with levels in the order allNonAi, half, allAi.
df_long_4[["choice"]] <- factor(
  x = df_long_4[["treatment"]],
  levels = c("allNonAi", "half", "allAi")
)
# Reshape respondent-level data to one row per respondent x choice set x
# video (12 rows per respondent).
#
# Each output row carries the respondent's covariates (every column whose
# name does not start with "Choice" or "video"), the choice-set number, the
# video's position within the set, its id (from map_title_to_grouping()),
# title, shown comments and condition, `chosen` (1 if the title equals the
# recorded choice for that set, 0 otherwise, NA if either is missing) and the
# page-submit time of that choice set (NA when the timing column is absent).
#
# The data are assembled one video slot at a time instead of one cell at a
# time, and then put back in the order respondent > choice set > video.
#
# Args:
#   df: respondent-level data frame with columns Choice1..Choice4 and
#       video<k>Title, video<k>CommentsShown, video<k>Cond for k = 1..12.
#
# Returns:
#   A data frame with 12 * nrow(df) rows.
reshape_video_choice_data_12rows <- function(df) {
  n_sets <- 4L
  set_size <- 3L
  n_resp <- nrow(df)

  covariate_cols <- setdiff(
    names(df),
    grep("^Choice|^video", names(df), value = TRUE)
  )
  covariates <- df[, covariate_cols, drop = FALSE]

  # One entry per (choice set, position in set) pair
  slots <- expand.grid(
    video_in_set = seq_len(set_size),
    choice_set = seq_len(n_sets)
  )

  per_slot <- lapply(seq_len(nrow(slots)), function(s) {
    choice_num <- slots$choice_set[s]
    j <- slots$video_in_set[s]
    video_index <- (choice_num - 1L) * set_size + j

    choice_val <- df[[paste0("Choice", choice_num)]]
    # Extract timing column for this choice set
    choice_time_col <- paste0("Choice", choice_num, "_time_Page.Submit")
    choice_time_val <- if (choice_time_col %in% names(df)) {
      df[[choice_time_col]]
    } else {
      rep(NA, n_resp)
    }

    video_title <- df[[paste0("video", video_index, "Title")]]
    # Called once per title because map_title_to_grouping() is not known
    # to be vectorised; assumes it returns a single value per title.
    video_id <- unlist(
      lapply(video_title, map_title_to_grouping),
      use.names = FALSE
    )

    cbind(
      covariates,
      data.frame(
        choice_set = rep(choice_num, n_resp),
        video_in_set = rep(j, n_resp),
        videoId = video_id,
        videoTitle = video_title,
        videoCommentsShown = df[[paste0("video", video_index, "CommentsShown")]],
        videoCond = df[[paste0("video", video_index, "Cond")]],
        chosen = as.integer(video_title == choice_val),
        choice_time_PageSubmit = as.numeric(choice_time_val)
      )
    )
  })

  # Stack the twelve blocks and restore respondent-major row order
  stacked <- do.call(rbind, per_slot)
  resp_index <- rep(seq_len(n_resp), times = nrow(slots))
  row_order <- order(resp_index, stacked$choice_set, stacked$video_in_set)
  reshaped_df <- stacked[row_order, , drop = FALSE]
  rownames(reshaped_df) <- NULL
  reshaped_df
}

# One row per respondent x choice set x video.
df_long <- reshape_video_choice_data_12rows(df = df_final)
# further processing
# `treatment` is the condition of the video in this row, stored as a factor
# with levels in the order allNonAi, half, allAi.
df_long[["treatment"]] <- factor(
  x = df_long[["videoCond"]],
  levels = c("allNonAi", "half", "allAi")
)

# also create four columns of commentsShown
# Split one "||||"-delimited string of comments into a character vector
# with one element per comment. Only the first element of `comments` is used.
separate_comments <- function(comments) {
  pieces <- strsplit(comments, "\\|\\|\\|\\|")
  pieces[[1]]
}

# Split every row's comment string once, then store the first four comments
# in videoComment1..videoComment4. Rows with fewer than four comments get NA
# in the remaining columns. Works unchanged when df_long has zero rows.
df_long[paste0("videoComment", 1:4)] <- lapply(1:4, function(position) {
  vapply(
    lapply(df_long$videoCommentsShown, separate_comments),
    function(parts) parts[position],
    character(1)
  )
})

Analysis (Panel)

# Baseline logit of `chosen` on the treatment condition, with fixed effects
# for choice set, video title and position within the set; standard errors
# are clustered by ResponseId.
panel_lm <- feglm(
  chosen ~ treatment | choice_set + videoTitle + video_in_set,
  data = df_long,
  family = binomial("logit"),
  cluster = "ResponseId"
)

summary(panel_lm)
## GLM estimation, family = binomial, Dep. Var.: chosen
## Observations: 9,480
## Fixed-effects: choice_set: 4,  videoTitle: 12,  video_in_set: 3
## Standard-errors: Clustered (ResponseId) 
##                Estimate Std. Error  z value         Pr(>|z|)    
## treatmenthalf  0.456476   0.071064  6.42341 0.00000000013325 ***
## treatmentallAi 1.055624   0.078600 13.43041        < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood: -5,795.0   Adj. Pseudo R2: 0.036657
##            BIC: 11,763.9     Squared Cor.: 0.050582

Choice Share (Should Add Up to 1)

# Fitted choice probabilities from the baseline model, one per row
df_long[["pred"]] <- predict(panel_lm, type = "response")
# Model‑based choice share by condition: mean fitted probability per level
choice_share <- aggregate(pred ~ treatment, data = df_long, FUN = mean)
choice_share
# Same specification as the baseline model, plus every covariate listed in
# `covariates_simple_demeaned_treatment` as an additive term.
cov_form <- as.formula(
  paste(
    "chosen ~ treatment + ",
    paste(covariates_simple_demeaned_treatment, collapse = " + "),
    "| choice_set + videoTitle + video_in_set"
  )
)
panel_lm_with_cov <- feglm(
  cov_form,
  family = binomial("logit"),
  data = df_long,
  cluster = "ResponseId"
)

# Coefficients shown in the regression tables, mapped to display labels
keep <- list(
  "treatmenthalf" = "Treatment: Mixed Reviews",
  "treatmentallAi" = "Treatment: All AI Reviews"
)

# Side-by-side table: without vs. with covariates
model_list <- list(panel_lm, panel_lm_with_cov)
texreg::screenreg(
  model_list,
  stars = c(0.05, 0.01, 0.001),
  caption = "Panel Regression Results",
  label = "tab:panel_regression",
  digits = 4,
  custom.coef.map = keep,
  custom.note = "Standard errors are clustered at the user level.",
  custom.model.names = c("Without Covariates", "With Covariates"),
  custom.coef.names = c("Treatment: Mixed Reviews", "Treatment: All AI Reviews")
)
## 
## ==============================================================
##                            Without Covariates  With Covariates
## --------------------------------------------------------------
## Treatment: Mixed Reviews       0.4565 ***          0.4692 *** 
##                               (0.0711)            (0.0708)    
## Treatment: All AI Reviews      1.0556 ***          1.0783 *** 
##                               (0.0786)            (0.0786)    
## --------------------------------------------------------------
## Num. obs.                   9480                9468          
## Num. groups: choice_set        4                   4          
## Num. groups: videoTitle       12                  12          
## Num. groups: video_in_set      3                   3          
## Deviance                   11589.9160          11468.3730     
## Log Likelihood             -5794.9580          -5734.1865     
## Pseudo R^2                     0.0367              0.0341     
## ==============================================================
## Standard errors are clustered at the user level.

Constrain to First Choice

# Baseline specification estimated on the first choice set only
panel_lm_1st <- feglm(
  chosen ~ treatment | choice_set + videoTitle + video_in_set,
  data = df_long %>% filter(choice_set == 1),
  family = binomial("logit"),
  cluster = "ResponseId"
)

summary(panel_lm_1st)
## GLM estimation, family = binomial, Dep. Var.: chosen
## Observations: 2,370
## Fixed-effects: choice_set: 1,  videoTitle: 12,  video_in_set: 3
## Standard-errors: Clustered (ResponseId) 
##                Estimate Std. Error z value              Pr(>|z|)    
## treatmenthalf  0.387169   0.135257 2.86246 0.0042036877231000922 ** 
## treatmentallAi 1.097414   0.137171 8.00033 0.0000000000000012409 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood: -1,430.6   Adj. Pseudo R2: 0.04171 
##            BIC:  2,985.6     Squared Cor.: 0.065872
# Covariate-adjusted specification estimated on the first choice set only
panel_lm_1st_with_cov <- feglm(
  cov_form,
  family = binomial("logit"),
  data = df_long %>% filter(choice_set == 1),
  cluster = "ResponseId"
)
# Side-by-side table for the first choice set: without vs. with covariates
model_list <- list(panel_lm_1st, panel_lm_1st_with_cov)
texreg::screenreg(
  model_list,
  stars = c(0.05, 0.01, 0.001),
  caption = "Panel Regression Results",
  label = "tab:panel_regression",
  digits = 4,
  custom.coef.map = keep,
  custom.note = "Standard errors are clustered at the user level.",
  custom.model.names = c("Without Covariates", "With Covariates"),
  custom.coef.names = c("Treatment: Mixed Reviews", "Treatment: All AI Reviews")
)
## 
## ==============================================================
##                            Without Covariates  With Covariates
## --------------------------------------------------------------
## Treatment: Mixed Reviews       0.3872 **           0.4387 **  
##                               (0.1353)            (0.1406)    
## Treatment: All AI Reviews      1.0974 ***          1.1739 *** 
##                               (0.1372)            (0.1421)    
## --------------------------------------------------------------
## Num. obs.                   2370                2367          
## Num. groups: choice_set        1                   1          
## Num. groups: videoTitle       12                  12          
## Num. groups: video_in_set      3                   3          
## Deviance                    2861.2353           2778.3991     
## Log Likelihood             -1430.6176          -1389.1996     
## Pseudo R^2                     0.0417              0.0222     
## ==============================================================
## Standard errors are clustered at the user level.

Subgroup Analyses

We have customized subgroup analyses for Length, Sentiment, and Choice Time (i.e. how much time a person spends on a choice question).

Preprocessing of Subgroup Variables

# Comment Length
# Number of characters in each of the four comments shown for the video
df_long$videoComment1Length <- nchar(df_long$videoComment1)
df_long$videoComment2Length <- nchar(df_long$videoComment2)
df_long$videoComment3Length <- nchar(df_long$videoComment3)
df_long$videoComment4Length <- nchar(df_long$videoComment4)

# Total characters across the four comments
df_long$videoCommentOverallLength <- df_long$videoComment1Length + df_long$videoComment2Length + df_long$videoComment3Length + df_long$videoComment4Length

# # winsorize
# df_long$videoCommentOverallLengthWinsorize <- Winsorize(df_long$videoCommentOverallLength, val = quantile(df_long$videoCommentOverallLength, probs = c(0, 0.99), na.rm = T))


# 1 when the total length is strictly above the sample median, else 0
# (TRUE is spelled out because `T` is an ordinary, reassignable variable)
df_long$videoCommentOverallLengthAboveMedian <- ifelse(df_long$videoCommentOverallLength > median(df_long$videoCommentOverallLength, na.rm = TRUE), 1, 0)

# plot distribution of comment overall length
# ggplot(df_long, aes(x = videoCommentOverallLength)) +
#   geom_histogram(binwidth = 10, fill = "blue", alpha = 0.7) +
#   theme_minimal() +
#   labs(title = "Distribution of Overall Comment Length", x = "Overall Length", y = "Frequency")
# Comment Sentiment
# Look up each shown comment's sentiment in `review_database` by exact match
# on `Content`. match() returns the first matching review and NA when the
# comment is not in the database, so an unmatched or duplicated comment no
# longer aborts or warns the way a per-row filter + assignment does.
# The result is stored as a factor with levels negative < neutral < positive.
df_long$videoComment1Sentiment <- factor(
  review_database$sentiment[match(df_long$videoComment1, review_database$Content)],
  levels = c("negative", "neutral", "positive")
)
df_long$videoComment2Sentiment <- factor(
  review_database$sentiment[match(df_long$videoComment2, review_database$Content)],
  levels = c("negative", "neutral", "positive")
)
df_long$videoComment3Sentiment <- factor(
  review_database$sentiment[match(df_long$videoComment3, review_database$Content)],
  levels = c("negative", "neutral", "positive")
)
df_long$videoComment4Sentiment <- factor(
  review_database$sentiment[match(df_long$videoComment4, review_database$Content)],
  levels = c("negative", "neutral", "positive")
)


# row-wise how many comments have positive sentiments
# Computed column-wise: each comparison yields a logical vector and their sum
# is the per-row count. A missing sentiment makes the row's count NA.
df_long$videoCommentNumPosSentiment <-
  (df_long$videoComment1Sentiment == "positive") +
  (df_long$videoComment2Sentiment == "positive") +
  (df_long$videoComment3Sentiment == "positive") +
  (df_long$videoComment4Sentiment == "positive")

# row-wise how many comments have negative sentiments
df_long$videoCommentNumNegSentiment <-
  (df_long$videoComment1Sentiment == "negative") +
  (df_long$videoComment2Sentiment == "negative") +
  (df_long$videoComment3Sentiment == "negative") +
  (df_long$videoComment4Sentiment == "negative")
# 1 when the choice page's submit time is strictly above the sample median
# (TRUE is spelled out because `T` is an ordinary, reassignable variable)
df_long$choice_time_PageSubmitAboveMedian <- ifelse(df_long$choice_time_PageSubmit > median(df_long$choice_time_PageSubmit, na.rm = TRUE), 1, 0)
# 1 when the choice page's submit time is strictly above 10
df_long$choice_time_PageSubmitAbove10 <- ifelse(df_long$choice_time_PageSubmit > 10, 1, 0)
# Add a two-level factor `<var>_median` to `df`.
#
# Values strictly above the median of `df[[var]]` (NAs ignored) are labelled
# "Above Median"; all other non-missing values, including those equal to the
# median, are labelled "Below Median". Missing values stay NA.
#
# Args:
#   df:  data frame.
#   var: name of a numeric column of `df`, as a string.
#
# Returns:
#   `df` with the extra factor column (levels: Below Median, Above Median).
median_split <- function(df, var) {
  split_col <- paste0(var, "_median")
  median_val <- median(df[[var]], na.rm = TRUE)
  side <- ifelse(df[[var]] > median_val, "Above Median", "Below Median")
  df[[split_col]] <- factor(side, levels = c("Below Median", "Above Median"))
  df
}
# Add a four-level factor `<var>_quartile` (Q1-Q4) to `df`.
#
# The cut points are the 25th, 50th and 75th percentiles of `df[[var]]`
# (NAs ignored); intervals are closed on the right, so a value equal to a
# cut point falls in the lower quartile. Missing values stay NA.
#
# Args:
#   df:  data frame.
#   var: name of a numeric column of `df`, as a string.
#
# Returns:
#   `df` with the extra factor column.
quartile_split <- function(df, var) {
  quartile_col <- paste0(var, "_quartile")
  cut_points <- quantile(df[[var]], probs = c(0.25, 0.5, 0.75), na.rm = TRUE)
  df[[quartile_col]] <- cut(
    df[[var]],
    breaks = c(-Inf, cut_points, Inf),
    labels = c("Q1", "Q2", "Q3", "Q4")
  )
  df
}
# Add a five-level factor `<var>_quintile` (Q1-Q5) to `df`.
#
# The breaks are the 0th, 20th, ..., 100th percentiles of `df[[var]]` (NAs
# ignored). Intervals are closed on the right; `include.lowest = TRUE` also
# closes the first interval on the left so that the minimum value is placed
# in Q1. Without it the minimum falls outside every interval and becomes NA.
# Missing values stay NA.
#
# Args:
#   df:  data frame.
#   var: name of a numeric column of `df`, as a string.
#
# Returns:
#   `df` with the extra factor column.
quintile_split <- function(df, var) {
  quintile_col <- paste0(var, "_quintile")
  quintiles <- quantile(df[[var]], probs = seq(0, 1, by = 0.2), na.rm = TRUE)
  df[[quintile_col]] <- cut(
    df[[var]],
    breaks = quintiles,
    labels = c("Q1", "Q2", "Q3", "Q4", "Q5"),
    include.lowest = TRUE
  )
  df
}
# Add `<var>_recenter` to `df`: the variable minus the middle one of its
# distinct observed values.
#
# The centre is the middle element of the sorted distinct non-missing values
# of `df[[var]]`, so it is only defined when their number is odd. Note that
# it is taken from the observed values, not from the nominal scale of the
# variable.
#
# Args:
#   df:  data frame.
#   var: name of a numeric column of `df`, as a string.
#
# Returns:
#   `df` with the extra numeric column.
#
# Raises:
#   An error naming the variable when the number of distinct non-missing
#   values is even, since there is then no middle value.
recenter <- function(df, var) {
  values <- sort(unique(df[[var]]))
  if (length(values) %% 2 != 1) {
    stop(
      sprintf(
        "Cannot recenter '%s': it has %d distinct non-missing values, but an odd number is required to pick a middle value.",
        var, length(values)
      ),
      call. = FALSE
    )
  }
  centre <- values[(length(values) + 1) / 2]
  df[[paste0(var, "_recenter")]] <- df[[var]] - centre
  df
}
# Add a `<var>_median` Below/Above factor for each respondent-level variable.
# Each variable is listed once; a repeated call on the same variable only
# recomputes an identical column.
df_long <- df_long %>%
  median_split("social_media_use_numeric") %>%
  median_split("website_use_numeric") %>%
  median_split("social_media_reply_numeric") %>%
  median_split("review_freq_numeric") %>%
  median_split("age") %>%
  median_split("income_numeric") %>%
  median_split("libcons_numeric") %>%
  median_split("read_review_freq_numeric") %>%
  median_split("AIreview_use_numeric") %>%
  median_split("mech_breadth") %>%
  median_split("mech_length") %>%
  median_split("mech_detail") %>%
  median_split("mech_tone") %>%
  median_split("mech_valence")


# Add `<var>_recenter` columns, applying recenter() to each variable in turn
df_long <- Reduce(
  recenter,
  c(
    "libcons_numeric",
    "review_essential_numeric",
    "AIreview_use_numeric",
    "mech_breadth",
    "mech_length",
    "mech_detail",
    "mech_tone",
    "mech_valence"
  ),
  df_long
)

# Add `<var>_quartile` factors for overall comment length and choice time
df_long <- Reduce(
  quartile_split,
  c("videoCommentOverallLength", "choice_time_PageSubmit"),
  df_long
)

# Add `<var>_quintile` factors for the same two variables
df_long <- Reduce(
  quintile_split,
  c("videoCommentOverallLength", "choice_time_PageSubmit"),
  df_long
)



# Collapsed subgroup factors used in the subgroup analyses.
df_long <- df_long %>%
  dplyr::mutate(
    # top rating (5) versus anything lower; missing stays NA
    review_essential_numeric_5 = factor(
      ifelse(review_essential_numeric == 5, "Very Important (= 5)", "< 5"),
      levels = c("< 5", "Very Important (= 5)")
    ),
    # education in four bands; unmatched or missing values stay NA
    # NOTE(review): "2-year college degree" is grouped under
    # "Bachelor's Degree" -- confirm this grouping is intended.
    edu_combined = factor(
      case_when(
        edu %in% c(
          "Did not graduate from high school",
          "High school graduate (high school diploma or equivalent including GED)"
        ) ~ "High School or Less",
        edu %in% "Some college, but no degree" ~ "Some College",
        edu %in% c("2-year college degree", "4-year college degree") ~ "Bachelor's Degree",
        edu %in% "Postgraduate degree (MA, MBA, JD, PhD, etc.)" ~ "Graduate Degree"
      ),
      levels = c("High School or Less", "Some College", "Bachelor's Degree", "Graduate Degree")
    ),
    # race in five groups; anything else, including missing, becomes "Other"
    race_combined = factor(
      case_when(
        race %in% "Asian/Pacific Islander" ~ "Asian",
        race %in% "Black or African American" ~ "Black",
        race %in% "Latino or Hispanic" ~ "Hispanic",
        race %in% "Caucasian/White" ~ "White",
        TRUE ~ "Other"
      ),
      levels = c("White", "Black", "Hispanic", "Asian", "Other")
    ),
    # party in three groups; anything else, including missing, becomes "Other"
    polparty_combined = factor(
      case_when(
        polparty %in% "Democrat" ~ "Democrat",
        polparty %in% "Republican" ~ "Republican",
        TRUE ~ "Other"
      ),
      levels = c("Democrat", "Republican", "Other")
    )
  )

Length

Median Split of Overall Length

# Treatment interacted with the above-median overall comment length flag
panel_lm_length_median <- feglm(
  chosen ~ treatment * videoCommentOverallLengthAboveMedian | choice_set + videoTitle + video_in_set,
  data = df_long,
  family = binomial("logit"),
  cluster = "ResponseId"
)

summary(panel_lm_length_median)
## GLM estimation, family = binomial, Dep. Var.: chosen
## Observations: 9,480
## Fixed-effects: choice_set: 4,  videoTitle: 12,  video_in_set: 3
## Standard-errors: Clustered (ResponseId) 
##                                                      Estimate Std. Error
## treatmenthalf                                        0.352550   0.081989
## treatmentallAi                                       0.754321   0.182306
## videoCommentOverallLengthAboveMedian                 0.420075   0.308453
## treatmenthalf:videoCommentOverallLengthAboveMedian  -0.210705   0.317135
## treatmentallAi:videoCommentOverallLengthAboveMedian -0.095896   0.349537
##                                                       z value    Pr(>|z|)    
## treatmenthalf                                        4.299965 0.000017083 ***
## treatmentallAi                                       4.137655 0.000035087 ***
## videoCommentOverallLengthAboveMedian                 1.361875 0.173237211    
## treatmenthalf:videoCommentOverallLengthAboveMedian  -0.664401 0.506433661    
## treatmentallAi:videoCommentOverallLengthAboveMedian -0.274352 0.783814277    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood: -5,789.0   Adj. Pseudo R2: 0.037141
##            BIC: 11,779.5     Squared Cor.: 0.051821

Quintile Split of Overall Length

# Treatment interacted with the quintile of overall comment length
panel_lm_length_quintile <- feglm(
  chosen ~ treatment * videoCommentOverallLength_quintile | choice_set + videoTitle + video_in_set,
  data = df_long,
  family = binomial("logit"),
  cluster = "ResponseId"
)
## NOTE: 1 observation removed because of NA values (RHS: 1).
## The variable 'treatmentallAi:videoCommentOverallLength_quintileQ5' has been removed because of collinearity (see $collin.var).
summary(panel_lm_length_quintile)
## GLM estimation, family = binomial, Dep. Var.: chosen
## Observations: 9,479
## Fixed-effects: choice_set: 4,  videoTitle: 12,  video_in_set: 3
## Standard-errors: Clustered (ResponseId) 
##                                                       Estimate Std. Error
## treatmenthalf                                        -0.871769   0.590928
## treatmentallAi                                       11.710761   0.105380
## videoCommentOverallLength_quintileQ2                  0.124140   0.090192
## videoCommentOverallLength_quintileQ3                 -0.055309   0.245567
## videoCommentOverallLength_quintileQ4                  0.523799   0.546973
## videoCommentOverallLength_quintileQ5                -10.493879   0.106384
## treatmenthalf:videoCommentOverallLength_quintileQ2    1.045270   0.600449
## treatmentallAi:videoCommentOverallLength_quintileQ2 -11.065947   0.309124
## treatmenthalf:videoCommentOverallLength_quintileQ3    1.484043   0.628001
## treatmentallAi:videoCommentOverallLength_quintileQ3 -10.904640   0.302454
## treatmenthalf:videoCommentOverallLength_quintileQ4    0.937523   0.808613
## treatmentallAi:videoCommentOverallLength_quintileQ4 -11.253822   0.554951
## treatmenthalf:videoCommentOverallLength_quintileQ5   12.117335   0.674641
##                                                        z value  Pr(>|z|)    
## treatmenthalf                                        -1.475254  0.140144    
## treatmentallAi                                      111.129240 < 2.2e-16 ***
## videoCommentOverallLength_quintileQ2                  1.376393  0.168700    
## videoCommentOverallLength_quintileQ3                 -0.225228  0.821802    
## videoCommentOverallLength_quintileQ4                  0.957633  0.338248    
## videoCommentOverallLength_quintileQ5                -98.641629 < 2.2e-16 ***
## treatmenthalf:videoCommentOverallLength_quintileQ2    1.740814  0.081716 .  
## treatmentallAi:videoCommentOverallLength_quintileQ2 -35.797769 < 2.2e-16 ***
## treatmenthalf:videoCommentOverallLength_quintileQ3    2.363123  0.018122 *  
## treatmentallAi:videoCommentOverallLength_quintileQ3 -36.053899 < 2.2e-16 ***
## treatmenthalf:videoCommentOverallLength_quintileQ4    1.159421  0.246285    
## treatmentallAi:videoCommentOverallLength_quintileQ4 -20.278945 < 2.2e-16 ***
## treatmenthalf:videoCommentOverallLength_quintileQ5   17.961168 < 2.2e-16 ***
## ... 1 variable was removed because of collinearity (treatmentallAi:videoCommentOverallLength_quintileQ5)
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood: -5,778.8   Adj. Pseudo R2: 0.037454
##            BIC: 11,832.2     Squared Cor.: 0.053867

Sentiment

Given we have four comments in each panel observation (i.e. four comments for each video), we create the following two variables

  • videoCommentNumPosSentiment: number of positive comments for the video
  • videoCommentNumNegSentiment: number of negative comments for the video
# Treatment interacted with the number of positive comments shown
panel_lm_sentiment_pos <- feglm(
  chosen ~ treatment * videoCommentNumPosSentiment | choice_set + videoTitle + video_in_set,
  data = df_long,
  family = binomial("logit"),
  cluster = "ResponseId"
)

summary(panel_lm_sentiment_pos)
## GLM estimation, family = binomial, Dep. Var.: chosen
## Observations: 9,480
## Fixed-effects: choice_set: 4,  videoTitle: 12,  video_in_set: 3
## Standard-errors: Clustered (ResponseId) 
##                                             Estimate Std. Error   z value
## treatmenthalf                               0.153934   0.213964  0.719437
## treatmentallAi                              1.066570   0.206163  5.173438
## videoCommentNumPosSentiment                 0.185498   0.048654  3.812623
## treatmenthalf:videoCommentNumPosSentiment   0.086987   0.065510  1.327845
## treatmentallAi:videoCommentNumPosSentiment -0.022844   0.061805 -0.369615
##                                                 Pr(>|z|)    
## treatmenthalf                              0.47187147000    
## treatmentallAi                             0.00000022983 ***
## videoCommentNumPosSentiment                0.00013750002 ***
## treatmenthalf:videoCommentNumPosSentiment  0.18422941474    
## treatmentallAi:videoCommentNumPosSentiment 0.71166939072    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood: -5,768.1   Adj. Pseudo R2: 0.040608
##            BIC: 11,737.7     Squared Cor.: 0.056164
# Treatment interacted with the number of negative comments shown
panel_lm_sentiment_neg <- feglm(
  chosen ~ treatment * videoCommentNumNegSentiment | choice_set + videoTitle + video_in_set,
  data = df_long,
  family = binomial("logit"),
  cluster = "ResponseId"
)

summary(panel_lm_sentiment_neg)
## GLM estimation, family = binomial, Dep. Var.: chosen
## Observations: 9,480
## Fixed-effects: choice_set: 4,  videoTitle: 12,  video_in_set: 3
## Standard-errors: Clustered (ResponseId) 
##                                             Estimate Std. Error  z value
## treatmenthalf                               0.523598   0.083317  6.28440
## treatmentallAi                              1.076036   0.090306 11.91538
## videoCommentNumNegSentiment                -0.176993   0.060997 -2.90169
## treatmenthalf:videoCommentNumNegSentiment  -0.172703   0.083174 -2.07641
## treatmentallAi:videoCommentNumNegSentiment -0.145880   0.085190 -1.71241
##                                                    Pr(>|z|)    
## treatmenthalf                              0.00000000032913 ***
## treatmentallAi                                    < 2.2e-16 ***
## videoCommentNumNegSentiment                0.00371155880552 ** 
## treatmenthalf:videoCommentNumNegSentiment  0.03785641074412 *  
## treatmentallAi:videoCommentNumNegSentiment 0.08682187961760 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood: -5,761.9   Adj. Pseudo R2: 0.041635
##            BIC: 11,725.3     Squared Cor.: 0.057455

Choice Time

We do three estimations:

  • whether the time used to make a decision was above or below the median choice time
  • quintile split of the time used to make a decision
  • whether the time used to make a decision is above or below 10 seconds
# Treatment interacted with the above-median choice time flag
panel_lm_time <- feglm(
  chosen ~ treatment * choice_time_PageSubmitAboveMedian | choice_set + videoTitle + video_in_set,
  data = df_long,
  family = binomial("logit"),
  cluster = "ResponseId"
)
summary(panel_lm_time)
## GLM estimation, family = binomial, Dep. Var.: chosen
## Observations: 9,480
## Fixed-effects: choice_set: 4,  videoTitle: 12,  video_in_set: 3
## Standard-errors: Clustered (ResponseId) 
##                                                   Estimate Std. Error  z value
## treatmenthalf                                     0.310931   0.100319  3.09943
## treatmentallAi                                    0.871230   0.105227  8.27953
## choice_time_PageSubmitAboveMedian                -0.238508   0.091393 -2.60970
## treatmenthalf:choice_time_PageSubmitAboveMedian   0.298679   0.139662  2.13858
## treatmentallAi:choice_time_PageSubmitAboveMedian  0.376730   0.145741  2.58493
##                                                   Pr(>|z|)    
## treatmenthalf                                    0.0019389 ** 
## treatmentallAi                                   < 2.2e-16 ***
## choice_time_PageSubmitAboveMedian                0.0090621 ** 
## treatmenthalf:choice_time_PageSubmitAboveMedian  0.0324695 *  
## treatmentallAi:choice_time_PageSubmitAboveMedian 0.0097400 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood: -5,788.9   Adj. Pseudo R2: 0.037162
##            BIC: 11,779.3     Squared Cor.: 0.05161
# Treatment interacted with the quintile of choice time
panel_lm_time_quintile <- feglm(
  chosen ~ treatment * choice_time_PageSubmit_quintile | choice_set + videoTitle + video_in_set,
  data = df_long,
  family = binomial("logit"),
  cluster = "ResponseId"
)
## NOTE: 3 observations removed because of NA values (RHS: 3).
summary(panel_lm_time_quintile)
## GLM estimation, family = binomial, Dep. Var.: chosen
## Observations: 9,477
## Fixed-effects: choice_set: 4,  videoTitle: 12,  video_in_set: 3
## Standard-errors: Clustered (ResponseId) 
##                                                   Estimate Std. Error  z value
## treatmenthalf                                    -0.156051   0.155370 -1.00438
## treatmentallAi                                    0.517267   0.168686  3.06644
## choice_time_PageSubmit_quintileQ2                -0.395064   0.135838 -2.90835
## choice_time_PageSubmit_quintileQ3                -0.480123   0.138573 -3.46477
## choice_time_PageSubmit_quintileQ4                -0.515611   0.139812 -3.68789
## choice_time_PageSubmit_quintileQ5                -0.648808   0.141834 -4.57441
## treatmenthalf:choice_time_PageSubmit_quintileQ2   0.637406   0.215430  2.95877
## treatmentallAi:choice_time_PageSubmit_quintileQ2  0.510398   0.225751  2.26089
## treatmenthalf:choice_time_PageSubmit_quintileQ3   0.750283   0.219138  3.42379
## treatmentallAi:choice_time_PageSubmit_quintileQ3  0.632886   0.223327  2.83390
## treatmenthalf:choice_time_PageSubmit_quintileQ4   0.857444   0.218542  3.92347
## treatmentallAi:choice_time_PageSubmit_quintileQ4  0.629506   0.228474  2.75526
## treatmenthalf:choice_time_PageSubmit_quintileQ5   0.870719   0.216064  4.02991
## treatmentallAi:choice_time_PageSubmit_quintileQ5  0.978068   0.233891  4.18172
##                                                      Pr(>|z|)    
## treatmenthalf                                    0.3151936826    
## treatmentallAi                                   0.0021662164 ** 
## choice_time_PageSubmit_quintileQ2                0.0036334565 ** 
## choice_time_PageSubmit_quintileQ3                0.0005306931 ***
## choice_time_PageSubmit_quintileQ4                0.0002261258 ***
## choice_time_PageSubmit_quintileQ5                0.0000047756 ***
## treatmenthalf:choice_time_PageSubmit_quintileQ2  0.0030887355 ** 
## treatmentallAi:choice_time_PageSubmit_quintileQ2 0.0237659298 *  
## treatmenthalf:choice_time_PageSubmit_quintileQ3  0.0006175335 ***
## treatmentallAi:choice_time_PageSubmit_quintileQ3 0.0045983889 ** 
## treatmenthalf:choice_time_PageSubmit_quintileQ4  0.0000872818 ***
## treatmentallAi:choice_time_PageSubmit_quintileQ4 0.0058645425 ** 
## treatmenthalf:choice_time_PageSubmit_quintileQ5  0.0000557985 ***
## treatmentallAi:choice_time_PageSubmit_quintileQ5 0.0000289315 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood: -5,771.0   Adj. Pseudo R2: 0.038334
##            BIC: 11,825.9     Squared Cor.: 0.054491
# Treatment interacted with the flag for a choice time above 10
panel_lm_time_10 <- feglm(
  chosen ~ treatment * choice_time_PageSubmitAbove10 | choice_set + videoTitle + video_in_set,
  data = df_long,
  family = binomial("logit"),
  cluster = "ResponseId"
)
summary(panel_lm_time_10)
## GLM estimation, family = binomial, Dep. Var.: chosen
## Observations: 9,480
## Fixed-effects: choice_set: 4,  videoTitle: 12,  video_in_set: 3
## Standard-errors: Clustered (ResponseId) 
##                                               Estimate Std. Error   z value
## treatmenthalf                                -0.477802   0.222686 -2.145626
## treatmentallAi                                0.145578   0.232593  0.625891
## choice_time_PageSubmitAbove10                -0.713656   0.137802 -5.178861
## treatmenthalf:choice_time_PageSubmitAbove10   1.050914   0.233881  4.493377
## treatmentallAi:choice_time_PageSubmitAbove10  1.025878   0.242195  4.235754
##                                                   Pr(>|z|)    
## treatmenthalf                                0.03190282812 *  
## treatmentallAi                               0.53138647221    
## choice_time_PageSubmitAbove10                0.00000022324 ***
## treatmenthalf:choice_time_PageSubmitAbove10  0.00000701026 ***
## treatmentallAi:choice_time_PageSubmitAbove10 0.00002277860 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood: -5,773.3   Adj. Pseudo R2: 0.039753
##            BIC: 11,748.0     Squared Cor.: 0.054627

Subgroup Analyses Median Split (Demographics / Mech)

The following demographic/behavioral variables are split at the median:

  • social_media_use_numeric
  • website_use_numeric
  • social_media_reply_numeric
  • review_freq_numeric
  • age
  • income_numeric
  • libcons_numeric
  • review_freq_numeric
  • read_review_freq_numeric
  • review_essential_numeric
  • AIreview_use_numeric

We also use a median split for the following “mechanism” variables (each originally asked on a 1–5 scale):

  • mech_breadth: one topic versus many topics
  • mech_length: short versus long reviews
  • mech_detail: general versus detailed
  • mech_tone: neutral, factual versus emotional, expressive
  • mech_valence: critical versus praising

For review_essential_numeric_5, we split on whether the response equals 5 or is below 5 (on a 1–5 scale). Note: this could suggest experimenter demand bias.

# Moderator columns of `df_long` used in the subgroup analyses below. Each one
# is interacted with `treatment` in its own model.
# "review_freq_numeric_median" used to be listed twice, which fitted the same
# model twice and printed its coefficient table twice; it now appears once.
subgroups_columns <- c(
  # usage / demographic splits
  "social_media_use_numeric_median",
  "website_use_numeric_median",
  "social_media_reply_numeric_median",
  "review_freq_numeric_median",
  "age_median",
  "income_numeric_median",
  "libcons_numeric_median",
  # categorical groupings (not median splits)
  "edu_combined",
  "race_combined",
  "polparty_combined",
  # review-related behavior
  "read_review_freq_numeric_median",
  "review_essential_numeric_5",
  "AIreview_use_numeric_median",
  # "mechanism" questions
  "mech_breadth_median",
  "mech_length_median",
  "mech_detail_median",
  "mech_tone_median",
  "mech_valence_median"
)

# Fit one fixed-effects logit per moderator in `subgroups_columns`:
#   chosen ~ treatment * <moderator> | choice_set + videoTitle + video_in_set
# with standard errors clustered on ResponseId. Each fitted model is stored in
# the calling environment as `subgroup_lm_<moderator>` so the reporting loop
# further down can retrieve it by name with get().
for (subgroup in subgroups_columns) {
  # progress marker: echo the moderator currently being fitted
  print(subgroup)

  subgroup_form <- as.formula(
    paste0(
      "chosen ~ treatment * ", subgroup,
      " | choice_set + videoTitle + video_in_set"
    )
  )
  subgroup_lm <- feglm(
    subgroup_form,
    family = binomial("logit"),
    data = df_long,
    cluster = "ResponseId"
  )
  assign(paste0("subgroup_lm_", subgroup), subgroup_lm)
}
## [1] "social_media_use_numeric_median"
## [1] "website_use_numeric_median"
## [1] "social_media_reply_numeric_median"
## [1] "review_freq_numeric_median"
## [1] "age_median"
## [1] "income_numeric_median"
## [1] "libcons_numeric_median"
## [1] "edu_combined"
## [1] "race_combined"
## [1] "polparty_combined"
## [1] "review_freq_numeric_median"
## [1] "read_review_freq_numeric_median"
## [1] "review_essential_numeric_5"
## [1] "AIreview_use_numeric_median"
## [1] "mech_breadth_median"
## [1] "mech_length_median"
## [1] "mech_detail_median"
## [1] "mech_tone_median"
## [1] "mech_valence_median"
# Report each subgroup model: a "####" heading, the distinct values the
# moderator takes in `df_long`, and a markdown table of the coefficient rows
# (any term containing "video" and the intercept are left out).
for (subgroup in subgroups_columns) {
  # models were stored under `subgroup_lm_<moderator>` by the fitting loop
  subgroup_lm <- get(paste0("subgroup_lm_", subgroup))
  coef_table <- summary(subgroup_lm)$coeftable

  cat("#### ", subgroup, "\n")

  subgroup_values <- unique(df_long[[subgroup]])
  print(paste0("Values for this variable: ", paste(subgroup_values, collapse = ", ")))

  coef_terms <- rownames(coef_table)
  is_reported_term <- !str_detect(coef_terms, "video") & (coef_terms != "(Intercept)")
  rows_to_extract <- coef_terms[is_reported_term]

  subgroup_coef <- coef_table[rows_to_extract, ]
  print(kable(subgroup_coef, format = "markdown"))
  cat("\n")
}

social_media_use_numeric_median

[1] “Values for this variable: Below Median, Above Median”

Estimate Std. Error z value Pr(>|z|)
treatmenthalf 0.4892536 0.0865068 5.6556650 0.0000000
treatmentallAi 1.0849814 0.0938676 11.5586416 0.0000000
social_media_use_numeric_medianAbove Median 0.0658607 0.1035899 0.6357837 0.5249174
treatmenthalf:social_media_use_numeric_medianAbove Median -0.0981152 0.1516221 -0.6471033 0.5175651
treatmentallAi:social_media_use_numeric_medianAbove Median -0.0876608 0.1700785 -0.5154141 0.6062637

website_use_numeric_median

[1] “Values for this variable: Below Median, Above Median”

Estimate Std. Error z value Pr(>|z|)
treatmenthalf 0.4990683 0.0881494 5.6616182 0.0000000
treatmentallAi 1.1249981 0.0949216 11.8518626 0.0000000
website_use_numeric_medianAbove Median 0.1215868 0.1020944 1.1909256 0.2336828
treatmenthalf:website_use_numeric_medianAbove Median -0.1262011 0.1484070 -0.8503714 0.3951186
treatmentallAi:website_use_numeric_medianAbove Median -0.2071179 0.1685576 -1.2287666 0.2191593

social_media_reply_numeric_median

[1] “Values for this variable: Below Median, Above Median”

Estimate Std. Error z value Pr(>|z|)
treatmenthalf 0.5461071 0.0790471 6.9086293 0.0000000
treatmentallAi 1.0503838 0.0870308 12.0691101 0.0000000
social_media_reply_numeric_medianAbove Median 0.1227883 0.1209431 1.0152568 0.3099834
treatmenthalf:social_media_reply_numeric_medianAbove Median -0.4034569 0.1775621 -2.2722013 0.0230744
treatmentallAi:social_media_reply_numeric_medianAbove Median 0.0258078 0.1990263 0.1296704 0.8968272

review_freq_numeric_median

[1] “Values for this variable: Below Median, Above Median”

Estimate Std. Error z value Pr(>|z|)
treatmenthalf 0.5879337 0.0926056 6.348791 0.0000000
treatmentallAi 1.1044790 0.1005708 10.982099 0.0000000
review_freq_numeric_medianAbove Median 0.1449329 0.0977059 1.483360 0.1379789
treatmenthalf:review_freq_numeric_medianAbove Median -0.3051722 0.1432150 -2.130868 0.0331000
treatmentallAi:review_freq_numeric_medianAbove Median -0.1104911 0.1599153 -0.690935 0.4896064

age_median

[1] “Values for this variable: Below Median, Above Median”

Estimate Std. Error z value Pr(>|z|)
treatmenthalf 0.4819323 0.1013851 4.7534850 0.0000020
treatmentallAi 1.1087732 0.1086471 10.2052750 0.0000000
age_medianAbove Median 0.0612048 0.0965396 0.6339861 0.5260899
treatmenthalf:age_medianAbove Median -0.0530461 0.1415220 -0.3748256 0.7077902
treatmentallAi:age_medianAbove Median -0.1114428 0.1571823 -0.7090037 0.4783222

income_numeric_median

[1] “Values for this variable: Above Median, Below Median”

Estimate Std. Error z value Pr(>|z|)
treatmenthalf 0.4848226 0.0860773 5.6324115 0.0000000
treatmentallAi 0.9995116 0.0969034 10.3145184 0.0000000
income_numeric_medianAbove Median -0.0353410 0.1027440 -0.3439713 0.7308679
treatmenthalf:income_numeric_medianAbove Median -0.0849464 0.1519029 -0.5592156 0.5760146
treatmentallAi:income_numeric_medianAbove Median 0.1643352 0.1653160 0.9940669 0.3201903

libcons_numeric_median

[1] “Values for this variable: Below Median, Above Median, NA”

Estimate Std. Error z value Pr(>|z|)
treatmenthalf 0.3101322 0.0897028 3.457330 0.0005456
treatmentallAi 0.8868747 0.1004017 8.833262 0.0000000
libcons_numeric_medianAbove Median -0.3083354 0.1010519 -3.051257 0.0022789
treatmenthalf:libcons_numeric_medianAbove Median 0.3894625 0.1465184 2.658114 0.0078579
treatmentallAi:libcons_numeric_medianAbove Median 0.4522346 0.1611412 2.806449 0.0050091

edu_combined

[1] “Values for this variable: Some College, Graduate Degree, Bachelor’s Degree, High School or Less”

Estimate Std. Error z value Pr(>|z|)
treatmenthalf 0.7128830 0.2128516 3.3492025 0.0008104
treatmentallAi 1.0376760 0.2550112 4.0691389 0.0000472
edu_combinedSome College 0.1297374 0.2091577 0.6202851 0.5350701
edu_combinedBachelor’s Degree 0.0781043 0.1714407 0.4555762 0.6486948
edu_combinedGraduate Degree 0.0945997 0.1777041 0.5323437 0.5944880
treatmenthalf:edu_combinedSome College -0.3256780 0.3023406 -1.0771891 0.2813958
treatmentallAi:edu_combinedSome College -0.0470976 0.3411125 -0.1380706 0.8901846
treatmenthalf:edu_combinedBachelor’s Degree -0.2991327 0.2377563 -1.2581486 0.2083380
treatmentallAi:edu_combinedBachelor’s Degree 0.0666522 0.2793904 0.2385629 0.8114445
treatmenthalf:edu_combinedGraduate Degree -0.2439795 0.2454246 -0.9941117 0.3201685
treatmentallAi:edu_combinedGraduate Degree -0.0265592 0.2911638 -0.0912173 0.9273199

race_combined

[1] “Values for this variable: Black, White, Asian, Hispanic, Other”

Estimate Std. Error z value Pr(>|z|)
treatmenthalf 0.4747966 0.0872051 5.4445949 0.0000001
treatmentallAi 1.0745900 0.0964819 11.1377350 0.0000000
race_combinedBlack 0.0508756 0.1168926 0.4352338 0.6633927
race_combinedHispanic 0.4005985 0.2906019 1.3785131 0.1680449
race_combinedAsian -0.0932219 0.2694355 -0.3459897 0.7293504
race_combinedOther -0.1285229 0.2236537 -0.5746511 0.5655273
treatmenthalf:race_combinedBlack -0.0989006 0.1754475 -0.5637049 0.5729550
treatmentallAi:race_combinedBlack -0.0466665 0.1887560 -0.2472320 0.8047287
treatmenthalf:race_combinedHispanic -0.5151286 0.3862377 -1.3337088 0.1822993
treatmentallAi:race_combinedHispanic -0.6181278 0.5442721 -1.1356963 0.2560837
treatmenthalf:race_combinedAsian -0.0277326 0.3850182 -0.0720293 0.9425786
treatmentallAi:race_combinedAsian 0.2632241 0.4204166 0.6261029 0.5312474
treatmenthalf:race_combinedOther 0.3672037 0.3287793 1.1168701 0.2640499
treatmentallAi:race_combinedOther 0.0058082 0.3500947 0.0165904 0.9867634

polparty_combined

[1] “Values for this variable: Other, Republican, Democrat”

Estimate Std. Error z value Pr(>|z|)
treatmenthalf 0.5919063 0.1227454 4.8222264 0.0000014
treatmentallAi 1.1652151 0.1326936 8.7812440 0.0000000
polparty_combinedRepublican 0.2139708 0.1103565 1.9389047 0.0525129
polparty_combinedOther -0.0369587 0.1352409 -0.2732804 0.7846377
treatmenthalf:polparty_combinedRepublican -0.3206172 0.1628641 -1.9686183 0.0489969
treatmentallAi:polparty_combinedRepublican -0.2762617 0.1777736 -1.5540082 0.1201825
treatmenthalf:polparty_combinedOther 0.0327091 0.1929206 0.1695469 0.8653665
treatmentallAi:polparty_combinedOther 0.0652730 0.2165151 0.3014709 0.7630555

review_freq_numeric_median

[1] “Values for this variable: Below Median, Above Median”

Estimate Std. Error z value Pr(>|z|)
treatmenthalf 0.5879337 0.0926056 6.348791 0.0000000
treatmentallAi 1.1044790 0.1005708 10.982099 0.0000000
review_freq_numeric_medianAbove Median 0.1449329 0.0977059 1.483360 0.1379789
treatmenthalf:review_freq_numeric_medianAbove Median -0.3051722 0.1432150 -2.130868 0.0331000
treatmentallAi:review_freq_numeric_medianAbove Median -0.1104911 0.1599153 -0.690935 0.4896064

read_review_freq_numeric_median

[1] “Values for this variable: Below Median, Above Median”

Estimate Std. Error z value Pr(>|z|)
treatmenthalf 0.3984441 0.0909073 4.382969 0.0000117
treatmentallAi 0.9895600 0.0997271 9.922677 0.0000000
read_review_freq_numeric_medianAbove Median -0.1201067 0.0995197 -1.206864 0.2274846
treatmenthalf:read_review_freq_numeric_medianAbove Median 0.1541406 0.1452886 1.060927 0.2887232
treatmentallAi:read_review_freq_numeric_medianAbove Median 0.1752897 0.1616532 1.084357 0.2782066

review_essential_numeric_5

[1] “Values for this variable: < 5, Very Important (= 5)”

Estimate Std. Error z value Pr(>|z|)
treatmenthalf 0.4964622 0.0964216 5.1488683 0.0000003
treatmentallAi 0.9420077 0.1068953 8.8124332 0.0000000
review_essential_numeric_5Very Important (= 5) -0.0587219 0.0966374 -0.6076516 0.5434186
treatmenthalf:review_essential_numeric_5Very Important (= 5) -0.0793063 0.1419812 -0.5585687 0.5764561
treatmentallAi:review_essential_numeric_5Very Important (= 5) 0.2222678 0.1569010 1.4166115 0.1565965

AIreview_use_numeric_median

[1] “Values for this variable: Below Median, Above Median”

Estimate Std. Error z value Pr(>|z|)
treatmenthalf 0.5441398 0.0787601 6.908824 0.0000000
treatmentallAi 1.1122229 0.0864707 12.862419 0.0000000
AIreview_use_numeric_medianAbove Median 0.2296949 0.1211540 1.895892 0.0579744
treatmenthalf:AIreview_use_numeric_medianAbove Median -0.4004725 0.1784720 -2.243895 0.0248391
treatmentallAi:AIreview_use_numeric_medianAbove Median -0.2517120 0.2003107 -1.256608 0.2088956

mech_breadth_median

[1] “Values for this variable: Below Median, Above Median”

Estimate Std. Error z value Pr(>|z|)
treatmenthalf 0.3539948 0.0779350 4.542180 0.0000056
treatmentallAi 0.8983430 0.0852270 10.540592 0.0000000
mech_breadth_medianAbove Median -0.5188299 0.1381022 -3.756855 0.0001721
treatmenthalf:mech_breadth_medianAbove Median 0.5567548 0.1883784 2.955513 0.0031215
treatmentallAi:mech_breadth_medianAbove Median 0.8222358 0.2133020 3.854797 0.0001158

mech_length_median

[1] “Values for this variable: Below Median, Above Median”

Estimate Std. Error z value Pr(>|z|)
treatmenthalf 0.3344945 0.0948121 3.527971 0.0004188
treatmentallAi 0.7634959 0.0997655 7.652904 0.0000000
mech_length_medianAbove Median -0.3580863 0.0992673 -3.607293 0.0003094
treatmenthalf:mech_length_medianAbove Median 0.2905379 0.1435918 2.023360 0.0430360
treatmentallAi:mech_length_medianAbove Median 0.6671729 0.1599667 4.170698 0.0000304

mech_detail_median

[1] “Values for this variable: Below Median, Above Median”

Estimate Std. Error z value Pr(>|z|)
treatmenthalf 0.4096091 0.0898912 4.5567201 0.0000052
treatmentallAi 0.9353735 0.0987703 9.4701890 0.0000000
mech_detail_medianAbove Median -0.1671498 0.1007009 -1.6598647 0.0969417
treatmenthalf:mech_detail_medianAbove Median 0.1269494 0.1463721 0.8673057 0.3857745
treatmentallAi:mech_detail_medianAbove Median 0.3175879 0.1625854 1.9533607 0.0507769

mech_tone_median

[1] “Values for this variable: Below Median, Above Median”

Estimate Std. Error z value Pr(>|z|)
treatmenthalf 0.5266179 0.0901068 5.8443770 0.0000000
treatmentallAi 0.9936711 0.0988085 10.0565374 0.0000000
mech_tone_medianAbove Median 0.0006784 0.0996431 0.0068087 0.9945675
treatmenthalf:mech_tone_medianAbove Median -0.2025102 0.1457845 -1.3891069 0.1648002
treatmentallAi:mech_tone_medianAbove Median 0.1741006 0.1630418 1.0678281 0.2855981

mech_valence_median

[1] “Values for this variable: Below Median, Above Median”

Estimate Std. Error z value Pr(>|z|)
treatmenthalf 0.5523281 0.0901576 6.126251 0.0000000
treatmentallAi 1.1213640 0.0972037 11.536229 0.0000000
mech_valence_medianAbove Median 0.1546823 0.0996084 1.552904 0.1204461
treatmenthalf:mech_valence_medianAbove Median -0.2590101 0.1461239 -1.772537 0.0763054
treatmentallAi:mech_valence_medianAbove Median -0.1753980 0.1642716 -1.067731 0.2856417

Subgroup Analyses Recentered (Demographics / Mech)

These demographic/behavioral variables are recentered: each is originally measured on a 1-to-5 scale, and we shift it so that the midpoint 3 becomes 0, giving a scale of -2 to 2.

  • libcons_numeric
  • review_essential_numeric
  • AIreview_use_numeric
  • mech_breadth
  • mech_length
  • mech_detail
  • mech_tone
  • mech_valence
# Recentered moderator columns of `df_long`, each interacted with `treatment`
# in its own model below.
subgroups_columns_recentered <- c(
  "libcons_numeric_recenter",
  "review_essential_numeric_recenter",
  "AIreview_use_numeric_recenter",
  "mech_breadth_recenter",
  "mech_length_recenter",
  "mech_detail_recenter",
  "mech_tone_recenter",
  "mech_valence_recenter"
)
# Fit one fixed-effects logit per recentered moderator:
#   chosen ~ treatment * <moderator> | choice_set + videoTitle + video_in_set
# with standard errors clustered on ResponseId. Each fit is stored in the
# calling environment as `subgroup_lm_<moderator>` for the reporting loop.
for (subgroup in subgroups_columns_recentered) {
  # progress marker: echo the moderator currently being fitted
  print(subgroup)

  subgroup_form <- as.formula(
    paste0(
      "chosen ~ treatment * ", subgroup,
      " | choice_set + videoTitle + video_in_set"
    )
  )
  subgroup_lm <- feglm(
    subgroup_form,
    family = binomial("logit"),
    data = df_long,
    cluster = "ResponseId"
  )
  assign(paste0("subgroup_lm_", subgroup), subgroup_lm)
}
## [1] "libcons_numeric_recenter"
## [1] "review_essential_numeric_recenter"
## [1] "AIreview_use_numeric_recenter"
## [1] "mech_breadth_recenter"
## [1] "mech_length_recenter"
## [1] "mech_detail_recenter"
## [1] "mech_tone_recenter"
## [1] "mech_valence_recenter"
# Report each recentered-moderator model: a "####" heading, the sorted distinct
# values of the moderator in `df_long`, and a markdown table of the coefficient
# rows (any term containing "video" and the intercept are left out).
for (subgroup in subgroups_columns_recentered) {
  # models were stored under `subgroup_lm_<moderator>` by the fitting loop
  subgroup_lm <- get(paste0("subgroup_lm_", subgroup))
  coef_table <- summary(subgroup_lm)$coeftable

  cat("#### ", subgroup, "\n")

  subgroup_values <- sort(unique(df_long[[subgroup]]))
  print(paste0("Values for this variable: ", paste(subgroup_values, collapse = ", ")))

  coef_terms <- rownames(coef_table)
  is_reported_term <- !str_detect(coef_terms, "video") & (coef_terms != "(Intercept)")
  rows_to_extract <- coef_terms[is_reported_term]

  subgroup_coef <- coef_table[rows_to_extract, ]
  print(kable(subgroup_coef, format = "markdown"))
  cat("\n")
}

libcons_numeric_recenter

[1] “Values for this variable: -2, -1, 0, 1, 2”

Estimate Std. Error z value Pr(>|z|)
treatmenthalf 0.4725172 0.0711762 6.638693 0.0000000
treatmentallAi 1.0740710 0.0787182 13.644509 0.0000000
libcons_numeric_recenter -0.1354861 0.0383634 -3.531646 0.0004130
treatmenthalf:libcons_numeric_recenter 0.1884398 0.0556516 3.386065 0.0007090
treatmentallAi:libcons_numeric_recenter 0.1851000 0.0620990 2.980724 0.0028757

review_essential_numeric_recenter

[1] “Values for this variable: -2, -1, 0, 1, 2”

Estimate Std. Error z value Pr(>|z|)
treatmenthalf 0.4821556 0.1432312 3.3662746 0.0007619
treatmentallAi 0.8432789 0.1537001 5.4865209 0.0000000
review_essential_numeric_recenter -0.0516215 0.0615178 -0.8391311 0.4013958
treatmenthalf:review_essential_numeric_recenter -0.0183014 0.0932474 -0.1962672 0.8444010
treatmentallAi:review_essential_numeric_recenter 0.1516935 0.1002854 1.5126176 0.1303768

AIreview_use_numeric_recenter

[1] “Values for this variable: -2, -1, 0, 1, 2”

Estimate Std. Error z value Pr(>|z|)
treatmenthalf 0.4070101 0.0767385 5.3038603 0.0000001
treatmentallAi 1.0299342 0.0856392 12.0264368 0.0000000
AIreview_use_numeric_recenter 0.0722234 0.0486713 1.4839016 0.1378350
treatmenthalf:AIreview_use_numeric_recenter -0.1323680 0.0694953 -1.9047039 0.0568186
treatmentallAi:AIreview_use_numeric_recenter -0.0717417 0.0786214 -0.9124965 0.3615074

mech_breadth_recenter

[1] “Values for this variable: -2, -1, 0, 1, 2”

Estimate Std. Error z value Pr(>|z|)
treatmenthalf 0.4060543 0.0745182 5.449063 0.0000001
treatmentallAi 0.9677337 0.0804968 12.022013 0.0000000
mech_breadth_recenter -0.1364632 0.0382069 -3.571686 0.0003547
treatmenthalf:mech_breadth_recenter 0.1416777 0.0577603 2.452854 0.0141728
treatmentallAi:mech_breadth_recenter 0.2312048 0.0624813 3.700384 0.0002153

mech_length_recenter

[1] “Values for this variable: -2, -1, 0, 1, 2”

Estimate Std. Error z value Pr(>|z|)
treatmenthalf 0.4270154 0.0730822 5.842945 0.0000000
treatmentallAi 0.9743349 0.0782162 12.456952 0.0000000
mech_length_recenter -0.1684713 0.0428342 -3.933101 0.0000839
treatmenthalf:mech_length_recenter 0.1386359 0.0635666 2.180956 0.0291867
treatmentallAi:mech_length_recenter 0.3167628 0.0701937 4.512696 0.0000064

mech_detail_recenter

[1] “Values for this variable: -2, -1, 0, 1, 2”

Estimate Std. Error z value Pr(>|z|)
treatmenthalf 0.4010078 0.0952771 4.2088601 0.0000257
treatmentallAi 0.9052173 0.1026175 8.8212769 0.0000000
mech_detail_recenter -0.0829868 0.0437096 -1.8985937 0.0576179
treatmenthalf:mech_detail_recenter 0.0615489 0.0656003 0.9382406 0.3481208
treatmentallAi:mech_detail_recenter 0.1623200 0.0708168 2.2921115 0.0218992

mech_tone_recenter

[1] “Values for this variable: -2, -1, 0, 1, 2”

Estimate Std. Error z value Pr(>|z|)
treatmenthalf 0.4536040 0.0710567 6.3836879 0.0000000
treatmentallAi 1.0598293 0.0785393 13.4942565 0.0000000
mech_tone_recenter -0.0285579 0.0385432 -0.7409313 0.4587351
treatmenthalf:mech_tone_recenter -0.0445169 0.0564565 -0.7885168 0.4303945
treatmentallAi:mech_tone_recenter 0.1132020 0.0621725 1.8207738 0.0686413

mech_valence_recenter

[1] “Values for this variable: -2, -1, 0, 1, 2”

Estimate Std. Error z value Pr(>|z|)
treatmenthalf 0.4831300 0.0770257 6.2723244 0.0000000
treatmentallAi 1.0496732 0.0813997 12.8952979 0.0000000
mech_valence_recenter 0.0196624 0.0490485 0.4008776 0.6885102
treatmenthalf:mech_valence_recenter -0.0768310 0.0725293 -1.0593102 0.2894585
treatmentallAi:mech_valence_recenter 0.0169408 0.0775877 0.2183444 0.8271608

Mediation Analysis

We are not sure this is done correctly, given that the main analysis is a panel regression while the mechanism questions are measured at the participant level. The subgroup analysis (above) probably makes more sense.

# Mechanism ("mediator") columns and their display labels, in matching order.
mediator_columns <- c(
  "mech_length",
  "mech_breadth",
  "mech_detail",
  "mech_tone",
  "mech_valence"
)
mech_fancy_names <- c(
  "Length",
  "Topic Diversity",
  "Level of Detail",
  "Language Tone",
  "Valence"
)
# Named lookup: display label -> column name.
mech_mapping <- setNames(mediator_columns, mech_fancy_names)
# Pairwise correlations between the mechanism columns of `df_final`
# (each pair uses the rows where both values are present).
mediator_corr <- cor(df_final[mediator_columns], use = "pairwise.complete.obs")

# Label both dimensions with the display names instead of the column names.
dimnames(mediator_corr) <- list(mech_fancy_names, mech_fancy_names)

# Draw the correlation matrix with the coefficients printed as numbers.
corrplot(mediator_corr, method = "number")

Step 1: Mediator Treatment Effect

# Step 1 of the mediation analysis: regress each mechanism variable on
# treatment (gaussian family; choice_set, videoTitle and video_in_set fixed
# effects; standard errors clustered on ResponseId) and collect the treatment
# coefficients into one data frame with columns
#   Estimate, Std. Error, Mediator (factor), Treatment (factor).
#
# Per-model results are stored in a preallocated list and bound once at the
# end instead of growing a data frame with rbind() inside the loop. The numeric
# columns, the mediator label and the treatment label are built explicitly per
# model, so nothing depends on cbind() coercion or on the row names rbind()
# generates when it de-duplicates them.
mediator_coef_list <- setNames(
  vector("list", length(mediator_columns)),
  mediator_columns
)
for (m in mediator_columns){
  cov_form_lm <- paste0(m, " ~ treatment | choice_set + videoTitle + video_in_set")
  mediator_lm <- feglm(as.formula(cov_form_lm), family = "gaussian", data = df_long, cluster = "ResponseId")
  mediator_lm_coef <- summary(mediator_lm)$coeftable[, c("Estimate", "Std. Error"), drop = FALSE]
  coef_terms <- rownames(mediator_lm_coef)
  mediator_coef_list[[m]] <- data.frame(
    "Estimate" = as.numeric(mediator_lm_coef[, "Estimate"]),
    "Std. Error" = as.numeric(mediator_lm_coef[, "Std. Error"]),
    # look the display label up by column name rather than by vector position
    "Mediator" = rep(names(mech_mapping)[match(m, mech_mapping)], length(coef_terms)),
    # the coefficient name containing "all" is the all-AI arm; the remaining
    # one is the mixed ("half") arm
    "Treatment" = ifelse(str_detect(coef_terms, "all"), "All AI", "Mixed Reviews"),
    check.names = FALSE,
    stringsAsFactors = FALSE
  )
}
mediator_coef_df <- do.call(rbind, mediator_coef_list)
rownames(mediator_coef_df) <- NULL
# Fix the factor level order (this controls the order used in the plot).
mediator_coef_df$Mediator <- factor(mediator_coef_df$Mediator, levels = c("Length", "Topic Diversity", "Level of Detail", "Language Tone", "Valence") )
mediator_coef_df$Treatment <- factor(mediator_coef_df$Treatment, levels = c("Mixed Reviews", "All AI"))
# Dodged bar plot of the treatment coefficients from the mediator regressions,
# one group of bars per mediator, with +/- 1.96 * SE error bars.
#
# The estimates and their intervals are multiplied by `estimate_scale` before
# plotting. The axis was previously labelled plain "Estimate" although the
# values are scaled; the label now states the multiplier.
# NOTE(review): why the estimates need a 1e9 multiplier is not visible here --
# confirm the units of the mech_* columns in df_long.
estimate_scale <- 1000000000
ggplot(mediator_coef_df, aes(x = Mediator, y = Estimate * estimate_scale, fill = Treatment)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_errorbar(
    aes(
      ymin = (Estimate - 1.96 * `Std. Error`) * estimate_scale,
      ymax = (Estimate + 1.96 * `Std. Error`) * estimate_scale
    ),
    width = 0.2,
    position = position_dodge(0.9)
  ) +
  labs(
    title = "Mediator Analysis",
    x = "Mediator",
    y = sprintf("Estimate (x %g)", estimate_scale)
  ) +
  scale_fill_manual(name = "vs All Human",
                    values=c("Mixed Reviews" = "turquoise2","All AI"="pink2")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        # bold the title text elements; this does not center the plot title
        # (the earlier comment said "center title", but no hjust is set)
        title = element_text(face = "bold"))

# Logit of `chosen` on treatment plus the mech_tone mechanism variable, with
# choice_set, videoTitle and video_in_set fixed effects and standard errors
# clustered on ResponseId.
cov_form_lm_tone <- "chosen ~ treatment + mech_tone | choice_set + videoTitle + video_in_set"
mediator_lm_tone <- feglm(
  as.formula(cov_form_lm_tone),
  family = "binomial",
  data = df_long,
  cluster = "ResponseId"
)

# Text-mode regression table. The coefficient labels are supplied both through
# `custom.coef.map` and through `custom.coef.names`, with the same wording.
texreg::screenreg(
  mediator_lm_tone,
  stars = c(0.05, 0.01, 0.001),
  caption = "Mediation Analysis",
  label = "tab:mediator_regression",
  digits = 4,
  custom.coef.map = list(
    "treatmenthalf" = "Treatment: Mixed Reviews",
    "treatmentallAi" = "Treatment: All AI Reviews",
    "mech_tone" = "Mechanism: Language Tone"
  ),
  custom.note = "Standard errors are clustered at the user level.",
  custom.model.names = c("Without Covariates"),
  custom.coef.names = c(
    "Treatment: Mixed Reviews",
    "Treatment: All AI Reviews",
    "Mechanism: Language Tone"
  )
)
## 
## =============================================
##                            Without Covariates
## ---------------------------------------------
## Treatment: Mixed Reviews       0.4565 ***    
##                               (0.0711)       
## Treatment: All AI Reviews      1.0556 ***    
##                               (0.0786)       
## Mechanism: Language Tone       0.0001        
##                               (0.0003)       
## ---------------------------------------------
## Num. obs.                   9480             
## Num. groups: choice_set        4             
## Num. groups: videoTitle       12             
## Num. groups: video_in_set      3             
## Deviance                   11589.9160        
## Log Likelihood             -5794.9580        
## Pseudo R^2                     0.0365        
## =============================================
## Standard errors are clustered at the user level.