# Find the most recently modified file in a directory.
#
# Args:
#   directory: path of the directory to search (not recursive).
#   pattern:   regular expression handed to list.files() to select file names.
# Returns:
#   Full path of the matching file with the latest modification time.
# Raises:
#   An error when no file in `directory` matches `pattern`, instead of
#   silently returning character(0) and failing later in the reader.
find_most_recent <- function(directory, pattern) {
  files <- list.files(directory, pattern = pattern, full.names = TRUE)
  if (length(files) == 0) {
    stop(
      "No files matching '", pattern, "' found in '", directory, "'",
      call. = FALSE
    )
  }
  files[which.max(file.info(files)$mtime)]
}
# Locate the newest survey export and load it together with the review database.
df_file <- find_most_recent(
  directory = "qualtrics_data/consumer_demand/main_data_v2",
  pattern = "Consumer"
)
print(paste0("Loading data from: ", df_file))
## [1] "Loading data from: qualtrics_data/consumer_demand/main_data_v2/USTM+-+Consumer+Demand_May+9,+2025_09.07.csv"
df_final <- read.csv(df_file, stringsAsFactors = FALSE)
review_database <- read.csv(
  "sampled_comments_per_video_mode_balanced.csv",
  stringsAsFactors = FALSE
)
# Keep rows that satisfy all four conditions at once.
# NOTE(review): StartDate is compared against a string literal; this presumably
# relies on an ISO-like "YYYY-MM-DD ..." text format -- confirm.
df_final <- df_final %>%
  filter(
    StartDate >= "2025-05-08",
    Status == "IP Address",
    Finished == "True",
    Consent == "YES"
  )
# Resolve piped-text placeholders stored in a Choice column.
#
# A cell of the form "${e://Field/<name>}" is replaced by the value found in
# column <name> of the same row (text that does not match the placeholder
# pattern is used as the column name unchanged, as before).
#
# Args:
#   df:         data frame holding both the Choice column and the referenced
#               columns.
#   choice_col: name of the column whose cells should be resolved.
# Returns:
#   Character vector with one resolved value per row of `df`; NA when the cell
#   is NA or refers to a column that does not exist in `df`.
resolve_choice_field <- function(df, choice_col) {
  field_names <- gsub("^\\$\\{e://Field/([^}]+)\\}$", "\\1", df[[choice_col]])
  vapply(
    seq_len(nrow(df)),
    function(i) {
      field <- field_names[i]
      # An unknown field would otherwise yield NULL and turn the column
      # into a list.
      if (is.na(field) || !field %in% names(df)) {
        return(NA_character_)
      }
      as.character(df[[field]][i])
    },
    character(1)
  )
}
for (choice_col in paste0("Choice", 1:4)) {
  df_final[[choice_col]] <- resolve_choice_field(df_final, choice_col)
}
# Age arrives as text; convert it to a number.
df_final$age <- as.numeric(df_final$age)
# Level order for every categorical survey answer, keyed by column name.
factor_levels <- list(
  social_media_use = c("1 hour or less", "1-3 hours", "3-5 hours", "5+ hours"),
  website_use = c("1 hour or less", "1-3 hours", "3-5 hours", "5+ hours"),
  edu = c(
    "Did not graduate from high school",
    "High school graduate (high school diploma or equivalent including GED)",
    "Some college, but no degree",
    "2-year college degree",
    "4-year college degree",
    "Postgraduate degree (MA, MBA, JD, PhD, etc.)"
  ),
  polparty = c("Democrat", "Republican", "Independent", "Other Party"),
  libcons = c(
    "Strong Conservative", "Moderate Conservative", "Moderate",
    "Moderate Liberal", "Strong Liberal"
  ),
  income = c(
    "Prefer not to say", "Less than $10,000", "$10,000-$49,999",
    "$50,000-$99,999", "$100,000-$149,999", "$150,000 or more"
  ),
  social_media_reply = c(
    "Never", "Few times a year", "Few times a month", "Few times a week",
    "1-2 times per day", "More than 4 times per day"
  ),
  review_freq = c(
    "Never", "Rarely (1 - 20% of the time)",
    "Occasionally (21 - 40% of the time)", "Sometimes (41 - 60% of the time)",
    "Often (61 - 80% of the time)", "Very often (81 - 100% of the time)"
  ),
  read_review_freq = c(
    "Never", "Rarely (1 - 20% of the time)",
    "Occasionally (21 - 40% of the time)", "Sometimes (41 - 60% of the time)",
    "Often (61 - 80% of the time)", "Very often (81 - 100% of the time)"
  ),
  review_essential = c(
    "Not important at all", "Somewhat not important", "Neutral",
    "Somewhat important", "Very important"
  ),
  AIreview_use = c(
    "Much less frequently", "Less frequently", "About the same",
    "More frequently", "Much more frequently"
  )
)
# Values outside the listed levels become NA, as factor() does by default.
for (factor_col in names(factor_levels)) {
  df_final[[factor_col]] <- factor(
    df_final[[factor_col]],
    levels = factor_levels[[factor_col]]
  )
}
# social media
# One 0/1 indicator per platform, set when the platform label occurs in the
# social_media answer text.
# The "X (formerly Twitter)" label must be matched literally: as a regular
# expression the parentheses form a group and the pattern would look for
# "X formerly Twitter", which never matches the label itself.
df_final$social_media_X <- ifelse(str_detect(df_final$social_media, stringr::fixed("X (formerly Twitter)")), 1, 0)
df_final$social_media_FB <- ifelse(str_detect(df_final$social_media, "Facebook"), 1, 0)
df_final$social_media_IG <- ifelse(str_detect(df_final$social_media, "Instagram"), 1, 0)
df_final$social_media_LI <- ifelse(str_detect(df_final$social_media, "LinkedIn"), 1, 0)
df_final$social_media_SN <- ifelse(str_detect(df_final$social_media, "Snapchat"), 1, 0)
df_final$social_media_TK <- ifelse(str_detect(df_final$social_media, "TikTok"), 1, 0)
df_final$social_media_YT <- ifelse(str_detect(df_final$social_media, "YouTube"), 1, 0)
df_final$social_media_nonUser <- ifelse(str_detect(df_final$social_media, "not active"), 1, 0)
# 1 when at least one of X, Facebook, Instagram, LinkedIn, Snapchat or TikTok
# is selected.
# NOTE(review): YouTube is not part of this indicator -- confirm this is intended.
df_final$social_media_user <- ifelse(
  (df_final$social_media_X == 1) | (df_final$social_media_FB == 1) |
    (df_final$social_media_IG == 1) | (df_final$social_media_LI == 1) |
    (df_final$social_media_SN == 1) | (df_final$social_media_TK == 1),
  1, 0
)
# social media use time and website use
# Answer labels in ascending order; the names are the dummy-column suffixes.
usage_dummy_labels <- c(
  "1" = "1 hour or less",
  "13" = "1-3 hours",
  "35" = "3-5 hours",
  "5" = "5+ hours"
)
for (usage_col in c("social_media_use", "website_use")) {
  # Position of the answer in the ordered labels (1-4); NA when unmatched.
  df_final[[paste0(usage_col, "_numeric")]] <- as.numeric(
    match(as.character(df_final[[usage_col]]), usage_dummy_labels)
  )
  # One 0/1 indicator per answer category.
  for (usage_suffix in names(usage_dummy_labels)) {
    df_final[[paste0(usage_col, "_", usage_suffix)]] <- ifelse(
      df_final[[usage_col]] == usage_dummy_labels[[usage_suffix]], 1, 0
    )
  }
}
# gender
df_final[["genderFemale"]] <- ifelse(df_final[["gender"]] == "Female", 1, 0)
# race
# Names are the suffixes of the indicator columns, values the answer labels.
race_dummy_labels <- c(
  Asian = "Asian/Pacific Islander",
  Black = "Black or African American",
  Hispanic = "Latino or Hispanic",
  White = "Caucasian/White"
)
for (race_suffix in names(race_dummy_labels)) {
  df_final[[paste0("race", race_suffix)]] <- ifelse(
    df_final$race == race_dummy_labels[[race_suffix]], 1, 0
  )
}
# Everything that is not one of the four labels above (missing included) is "Other".
df_final$raceOther <- ifelse(df_final$race %in% race_dummy_labels, 0, 1)
# education
edu_high_school_or_less <- c(
  "Did not graduate from high school",
  "High school graduate (high school diploma or equivalent including GED)"
)
edu_college_degree <- c("2-year college degree", "4-year college degree")
df_final$eduHighSchoolOrLess <- ifelse(df_final$edu %in% edu_high_school_or_less, 1, 0)
df_final$eduSomeCollege <- ifelse(df_final$edu == "Some college, but no degree", 1, 0)
df_final$eduBachelor <- ifelse(df_final$edu %in% edu_college_degree, 1, 0)
df_final$eduPostGrad <- ifelse(df_final$edu == "Postgraduate degree (MA, MBA, JD, PhD, etc.)", 1, 0)
# political party
party_dummy_labels <- c(Dem = "Democrat", Rep = "Republican", Ind = "Independent")
for (party_suffix in names(party_dummy_labels)) {
  df_final[[paste0("polparty", party_suffix)]] <- ifelse(
    df_final$polparty == party_dummy_labels[[party_suffix]], 1, 0
  )
}
# "Other" here means anything except Democrat or Republican.
df_final$polpartyOther <- ifelse(df_final$polparty %in% c("Democrat", "Republican"), 0, 1)
# political ideology
# Lookup from answer label to score (5 = Strong Liberal, 1 = Strong Conservative).
libcons_scale <- c(
  "Strong Liberal" = 5,
  "Moderate Liberal" = 4,
  "Moderate" = 3,
  "Moderate Conservative" = 2,
  "Strong Conservative" = 1
)
df_final$libcons_numeric <- unname(libcons_scale[as.character(df_final$libcons)])
# income
income_scale <- c(
  "Less than $10,000" = 1,
  "$10,000-$49,999" = 2,
  "$50,000-$99,999" = 3,
  "$100,000-$149,999" = 4,
  "$150,000 or more" = 5
)
income_score <- unname(income_scale[as.character(df_final$income)])
# Any answer without a score (e.g. "Prefer not to say" or missing) is coded 0 ...
income_score[is.na(income_score)] <- 0
df_final$income_numeric <- income_score
# ... and flagged so that the 0 can be told apart from a real score.
df_final$income_flag <- ifelse(df_final$income_numeric == 0, 1, 0)
# social media reply
# Answer labels in ascending order; the names are the dummy-column suffixes.
reply_dummy_labels <- c(
  never = "Never",
  fewayear = "Few times a year",
  fewamonth = "Few times a month",
  fewaweek = "Few times a week",
  "12times" = "1-2 times per day",
  "4times" = "More than 4 times per day"
)
# Position of the answer in the ordered labels (1-6); NA when unmatched.
df_final$social_media_reply_numeric <- as.numeric(
  match(as.character(df_final$social_media_reply), reply_dummy_labels)
)
for (reply_suffix in names(reply_dummy_labels)) {
  df_final[[paste0("social_media_reply_", reply_suffix)]] <- ifelse(
    df_final$social_media_reply == reply_dummy_labels[[reply_suffix]], 1, 0
  )
}
# review frequency and read review frequency
# Both questions share the same answer labels; names are the dummy suffixes.
frequency_dummy_labels <- c(
  never = "Never",
  rarely = "Rarely (1 - 20% of the time)",
  occasionally = "Occasionally (21 - 40% of the time)",
  sometimes = "Sometimes (41 - 60% of the time)",
  often = "Often (61 - 80% of the time)",
  veryoften = "Very often (81 - 100% of the time)"
)
for (freq_col in c("review_freq", "read_review_freq")) {
  # Position of the answer in the ordered labels (1-6); NA when unmatched.
  df_final[[paste0(freq_col, "_numeric")]] <- as.numeric(
    match(as.character(df_final[[freq_col]]), frequency_dummy_labels)
  )
  # One 0/1 indicator per answer category.
  for (freq_suffix in names(frequency_dummy_labels)) {
    df_final[[paste0(freq_col, "_", freq_suffix)]] <- ifelse(
      df_final[[freq_col]] == frequency_dummy_labels[[freq_suffix]], 1, 0
    )
  }
}
# review essential
# Answer labels in ascending order; the names are the dummy-column suffixes.
essential_dummy_labels <- c(
  notimportant = "Not important at all",
  somewhatnotimportant = "Somewhat not important",
  neutral = "Neutral",
  somewhatimportant = "Somewhat important",
  veryimportant = "Very important"
)
# Position of the answer in the ordered labels (1-5); NA when unmatched.
df_final$review_essential_numeric <- as.numeric(
  match(as.character(df_final$review_essential), essential_dummy_labels)
)
for (essential_suffix in names(essential_dummy_labels)) {
  df_final[[paste0("review_essential_", essential_suffix)]] <- ifelse(
    df_final$review_essential == essential_dummy_labels[[essential_suffix]], 1, 0
  )
}
# AIreview use
ai_use_dummy_labels <- c(
  muchlessfrequently = "Much less frequently",
  lessfrequently = "Less frequently",
  aboutthesame = "About the same",
  morefrequently = "More frequently",
  muchmorefrequently = "Much more frequently"
)
df_final$AIreview_use_numeric <- as.numeric(
  match(as.character(df_final$AIreview_use), ai_use_dummy_labels)
)
for (ai_use_suffix in names(ai_use_dummy_labels)) {
  df_final[[paste0("AIreview_use_", ai_use_suffix)]] <- ifelse(
    df_final$AIreview_use == ai_use_dummy_labels[[ai_use_suffix]], 1, 0
  )
}
# Every raw and derived covariate column, grouped by the question it comes from.
covariates_all <- c(
  "social_media_use", "website_use", "gender", "age", "edu", "polparty",
  "libcons", "income", "social_media_reply", "review_freq",
  paste0("social_media_", c("X", "FB", "IG", "LI", "SN", "TK", "YT", "nonUser")),
  paste0("social_media_use_", c("numeric", "1", "13", "35", "5")),
  paste0("website_use_", c("numeric", "1", "13", "35", "5")),
  "genderFemale",
  paste0("race", c("Asian", "Black", "Hispanic", "White", "Other")),
  paste0("edu", c("HighSchoolOrLess", "SomeCollege", "Bachelor", "PostGrad")),
  paste0("polparty", c("Dem", "Rep", "Ind", "Other")),
  "libcons_numeric", "income_numeric", "income_flag",
  paste0(
    "social_media_reply_",
    c("numeric", "never", "fewayear", "fewamonth", "fewaweek", "12times", "4times")
  ),
  paste0(
    "review_freq_",
    c("numeric", "never", "rarely", "occasionally", "sometimes", "often", "veryoften")
  ),
  paste0(
    "read_review_freq_",
    c("numeric", "never", "rarely", "occasionally", "sometimes", "often", "veryoften")
  ),
  paste0(
    "review_essential_",
    c(
      "numeric", "notimportant", "somewhatnotimportant", "neutral",
      "somewhatimportant", "veryimportant"
    )
  ),
  paste0(
    "AIreview_use_",
    c(
      "numeric", "muchlessfrequently", "lessfrequently", "aboutthesame",
      "morefrequently", "muchmorefrequently"
    )
  )
)
# Reduced covariate set; the order must stay aligned with
# covariates_simple_fancy below (same length, same positions).
covariates_simple <- c(
  "age",
  "social_media_YT",
  "social_media_nonUser",
  "social_media_user",
  "social_media_use_numeric",
  "website_use_numeric",
  "genderFemale",
  "raceAsian",
  "raceBlack",
  "raceHispanic",
  "raceWhite",
  "raceOther",
  "eduHighSchoolOrLess",
  "eduSomeCollege",
  "eduBachelor",
  "eduPostGrad",
  "polpartyDem",
  "polpartyRep",
  "polpartyOther",
  "libcons_numeric",
  "income_numeric",
  "social_media_reply_numeric",
  "review_freq_numeric",
  "read_review_freq_numeric",
  "review_essential_numeric",
  "AIreview_use_numeric"
)
# Display labels, one per entry of covariates_simple.
covariates_simple_fancy <- c(
  "Age",
  "YouTube User",
  "Social Media: Non-User",
  "Social Media: User",
  "Social Media Usage (1 - 4 Scale)",
  "Online Usage (1 - 4 Scale)",
  "Female",
  "Race: Asian",
  "Race: Black",
  "Race: Hispanic",
  "Race: White",
  "Race: Other",
  "Education: High School or Less",
  "Education: Some College",
  "Education: Bachelor",
  "Education: Postgraduate",
  "Political Party: Democrat",
  "Political Party: Republican",
  "Political Party: Other",
  "Political Ideology (1 - 5 Scale; 5 Strong Liberal)",
  "Income (1 - 5 Scale)",
  "Social Media Reply Frequency (1 - 6 Scale)",
  "Review Frequency (1 - 6 Scale)",
  "Read Review Frequency (1 - 6 Scale)",
  "Review Importance (1 - 5 Scale)",
  "AI Review Hypothetical Usage (1 - 5 Scale)"
)
# Add a centered and scaled copy of every simple covariate as "<name>_cov".
# NOTE(review): scale() returns a one-column matrix, so the new columns are
# matrix columns rather than plain numeric vectors -- confirm that downstream
# code is fine with that.
df_final <- mutate(
  df_final,
  across(
    .cols = all_of(covariates_simple),
    .fns = function(x) scale(x, center = TRUE, scale = TRUE),
    .names = "{.col}_cov"
  )
)
# Names of the standardized columns and of their interaction terms with treatment.
covariates_simple_demeaned <- paste(covariates_simple, "cov", sep = "_")
covariates_simple_demeaned_treatment <- paste(
  covariates_simple_demeaned, "treatment",
  sep = " * "
)
# Two display labels per covariate (mixed reviews first, all-AI reviews second).
covariates_simple_demeaned_treatment_fancy <- paste0(
  c("Treatment Mixed Reviews * ", "Treatment All AI Reviews * "),
  rep(covariates_simple_fancy, each = 2)
)
# Lookup table: grouping id (as a string) -> video title.
grouping_map <- c(
  "17" = "Coin Operated",
  "22" = "Crook$",
  "20" = "Forever Sleep",
  "16" = "Soft Rain",
  "23" = "One-Minute Time Machine",
  "14" = "Alternative Math",
  "11" = "Radical Honesty",
  "21" = "Different",
  "18" = "The Cook",
  "19" = "Skipped",
  "13" = "Boom",
  "15" = "French Roast"
)
# Mapping: grouping -> title
#
# Args:
#   grouping_vector: vector of grouping ids (numeric or character).
# Returns:
#   Unnamed character vector of titles; NA for ids missing from grouping_map.
map_grouping_to_title <- function(grouping_vector) {
  unname(grouping_map[as.character(grouping_vector)])
}
# Reverse mapping: title -> grouping
#
# Args:
#   title_vector: vector of video titles (character or factor).
# Returns:
#   Integer vector of grouping ids; NA for titles missing from grouping_map.
map_title_to_grouping <- function(title_vector) {
  name_lookup <- setNames(names(grouping_map), grouping_map)
  # as.character() keeps the lookup by name: indexing with a factor would use
  # its integer codes and return the wrong id.
  as.integer(name_lookup[as.character(title_vector)])
}
# Replace each mech_* answer by the first run of digits it contains, as a
# number (NA when the text holds no digits).
df_final <- df_final %>%
  mutate(
    across(
      all_of(c("mech_breadth", "mech_length", "mech_detail", "mech_tone", "mech_valence")),
      function(answer) as.numeric(str_extract(answer, "\\d+"))
    )
  )
# makeCodebook(df_final[, covariates_all], replace = TRUE,
# reportTitle = 'Covariates Summary', # change this with final version
# file = 'processed_final_data/codebook_covariates_consumer_demand.Rmd') # change this with final version
If there are any errors in the data, they will be printed out. Otherwise, nothing will be printed.
# Collects the ResponseId of every row that fails one of the checks below.
error_responseId <- NULL
TODO: ALSO NEED TO CHECK WHETHER EACH VIDEO ON AVERAGE GETS RANDOMIZED INTO DIFFERENT POSITIONS
# Check that one respondent was shown 12 distinct video titles.
#
# Args:
#   row: row index into the global df_final.
# Returns:
#   TRUE when the values in the row's video*Title columns contain exactly
#   12 distinct entries, FALSE otherwise.
check_video_randomization <- function(row) {
  video_title <- df_final %>%
    dplyr::select(starts_with("video")) %>%
    dplyr::select(ends_with("Title")) %>%
    dplyr::slice(row) %>%
    unlist() %>%
    as.character()
  length(unique(video_title)) == 12
}
# seq_len() keeps the loop from running on rows 1 and 0 when df_final is empty.
for (i in seq_len(nrow(df_final))) {
  if (!check_video_randomization(i)) {
    print(paste("Video randomization error in row", i))
    error_responseId <- c(error_responseId, df_final$ResponseId[i])
  }
}
## [1] "Video randomization error in row 358"
## [1] "Video randomization error in row 607"
TODO: ALSO NEED TO CHECK WHETHER EACH CONDITION ON AVERAGE GETS RANDOMIZED INTO DIFFERENT POSITIONS
# Check that every choice set of three videos shows three different conditions.
#
# The 12 video*Cond columns are read in four consecutive triplets
# (videos 1-3, 4-6, 7-9, 10-12). Only the number of distinct values per
# triplet is checked, not which values they are.
#
# Args:
#   row: row index into the global df_final.
# Returns:
#   TRUE when all four triplets hold three distinct values, FALSE otherwise.
check_condition_randomization <- function(row) {
  condition <- df_final %>%
    dplyr::select(starts_with("video")) %>%
    dplyr::select(ends_with("Cond")) %>%
    dplyr::slice(row)
  triplet_is_random <- vapply(
    1:4,
    function(choice_set) {
      cond_cols <- paste0("video", (choice_set - 1) * 3 + 1:3, "Cond")
      length(unique(unlist(condition[, cond_cols]))) == 3
    },
    logical(1)
  )
  all(triplet_is_random)
}
# seq_len() keeps the loop from running on rows 1 and 0 when df_final is empty.
for (i in seq_len(nrow(df_final))) {
  if (!check_condition_randomization(i)) {
    print(paste("Condition randomization error in row", i))
    error_responseId <- c(error_responseId, df_final$ResponseId[i])
  }
}
## [1] "Condition randomization error in row 358"
## [1] "Condition randomization error in row 607"
# Check that the comments shown for each video agree with its condition.
#
# For every video*CommentsShown column of the row, the "||||"-separated
# comments are looked up in the global review_database (rows with the video's
# Video.Id whose Content is one of the shown comments). Exactly four database
# rows must be found, and their mode_label values must fit the condition:
#   allAi    - no label in c(1, 2)
#   allNonAi - no label in c(3, 4)
#   half     - two labels in c(1, 2) and two in c(3, 4)
# Any other condition value counts as a failure.
#
# Args:
#   row: row index into the global df_final.
# Returns:
#   TRUE when every video of the row passes, FALSE otherwise. A message is
#   printed for each video whose comments cannot be matched to four rows.
check_review_randomization <- function(row) {
  review <- df_final %>%
    dplyr::select(starts_with("video")) %>%
    dplyr::select(ends_with("CommentsShown")) %>%
    dplyr::slice(row)
  condition <- df_final %>%
    dplyr::select(starts_with("video")) %>%
    dplyr::select(ends_with("Cond")) %>%
    dplyr::slice(row)
  video_title <- df_final %>%
    dplyr::select(starts_with("video")) %>%
    dplyr::select(ends_with("Title")) %>%
    dplyr::slice(row)
  # One result per video, instead of dynamically named variables via assign().
  review_ok <- logical(length(review))
  for (j in seq_along(review)) {
    # separate by ||||
    re <- strsplit(as.character(review[[j]]), "\\|\\|\\|\\|")[[1]]
    video <- as.character(video_title[[j]])
    cond <- as.character(condition[[j]])
    videoID <- map_title_to_grouping(video)
    review_database_j <- review_database %>%
      filter(Video.Id == videoID) %>%
      filter(Content %in% re)
    if (nrow(review_database_j) != 4) {
      print(paste("Review randomization error in row", row, "for video", j))
      review_ok[j] <- FALSE
      next
    }
    n_mode_12 <- sum(review_database_j$mode_label %in% c(1, 2))
    n_mode_34 <- sum(review_database_j$mode_label %in% c(3, 4))
    review_ok[j] <- if (is.na(cond)) {
      FALSE
    } else if (cond == "allAi") {
      n_mode_12 == 0
    } else if (cond == "allNonAi") {
      n_mode_34 == 0
    } else if (cond == "half") {
      n_mode_12 == 2 && n_mode_34 == 2
    } else {
      # Unknown condition: fail explicitly rather than leave the result undefined.
      FALSE
    }
  }
  all(review_ok)
}
# seq_len() keeps the loop from running on rows 1 and 0 when df_final is empty.
for (i in seq_len(nrow(df_final))) {
  if (!check_review_randomization(i)) {
    print(paste("Review randomization error in row", i))
    error_responseId <- c(error_responseId, df_final$ResponseId[i])
  }
}
## [1] "Review randomization error in row 168 for video 1"
## [1] "Review randomization error in row 168 for video 2"
## [1] "Review randomization error in row 168 for video 3"
## [1] "Review randomization error in row 168 for video 4"
## [1] "Review randomization error in row 168 for video 5"
## [1] "Review randomization error in row 168 for video 6"
## [1] "Review randomization error in row 168 for video 7"
## [1] "Review randomization error in row 168 for video 8"
## [1] "Review randomization error in row 168 for video 9"
## [1] "Review randomization error in row 168 for video 10"
## [1] "Review randomization error in row 168 for video 11"
## [1] "Review randomization error in row 168 for video 12"
## [1] "Review randomization error in row 168"
## [1] "Review randomization error in row 194 for video 1"
## [1] "Review randomization error in row 194 for video 2"
## [1] "Review randomization error in row 194 for video 3"
## [1] "Review randomization error in row 194 for video 4"
## [1] "Review randomization error in row 194 for video 5"
## [1] "Review randomization error in row 194 for video 6"
## [1] "Review randomization error in row 194 for video 7"
## [1] "Review randomization error in row 194 for video 8"
## [1] "Review randomization error in row 194 for video 9"
## [1] "Review randomization error in row 194 for video 10"
## [1] "Review randomization error in row 194 for video 11"
## [1] "Review randomization error in row 194 for video 12"
## [1] "Review randomization error in row 194"
## [1] "Review randomization error in row 358 for video 1"
## [1] "Review randomization error in row 358 for video 2"
## [1] "Review randomization error in row 358 for video 3"
## [1] "Review randomization error in row 358 for video 4"
## [1] "Review randomization error in row 358 for video 5"
## [1] "Review randomization error in row 358 for video 6"
## [1] "Review randomization error in row 358 for video 7"
## [1] "Review randomization error in row 358 for video 8"
## [1] "Review randomization error in row 358 for video 9"
## [1] "Review randomization error in row 358 for video 10"
## [1] "Review randomization error in row 358 for video 11"
## [1] "Review randomization error in row 358 for video 12"
## [1] "Review randomization error in row 358"
## [1] "Review randomization error in row 428 for video 1"
## [1] "Review randomization error in row 428 for video 2"
## [1] "Review randomization error in row 428 for video 3"
## [1] "Review randomization error in row 428 for video 4"
## [1] "Review randomization error in row 428 for video 5"
## [1] "Review randomization error in row 428 for video 6"
## [1] "Review randomization error in row 428 for video 7"
## [1] "Review randomization error in row 428 for video 8"
## [1] "Review randomization error in row 428 for video 9"
## [1] "Review randomization error in row 428 for video 10"
## [1] "Review randomization error in row 428 for video 11"
## [1] "Review randomization error in row 428 for video 12"
## [1] "Review randomization error in row 428"
## [1] "Review randomization error in row 488 for video 1"
## [1] "Review randomization error in row 488 for video 2"
## [1] "Review randomization error in row 488 for video 3"
## [1] "Review randomization error in row 488 for video 4"
## [1] "Review randomization error in row 488 for video 5"
## [1] "Review randomization error in row 488 for video 6"
## [1] "Review randomization error in row 488 for video 7"
## [1] "Review randomization error in row 488 for video 8"
## [1] "Review randomization error in row 488 for video 9"
## [1] "Review randomization error in row 488 for video 10"
## [1] "Review randomization error in row 488 for video 11"
## [1] "Review randomization error in row 488 for video 12"
## [1] "Review randomization error in row 488"
## [1] "Review randomization error in row 572 for video 1"
## [1] "Review randomization error in row 572 for video 2"
## [1] "Review randomization error in row 572 for video 3"
## [1] "Review randomization error in row 572 for video 4"
## [1] "Review randomization error in row 572 for video 5"
## [1] "Review randomization error in row 572 for video 6"
## [1] "Review randomization error in row 572 for video 7"
## [1] "Review randomization error in row 572 for video 8"
## [1] "Review randomization error in row 572 for video 9"
## [1] "Review randomization error in row 572 for video 10"
## [1] "Review randomization error in row 572 for video 11"
## [1] "Review randomization error in row 572 for video 12"
## [1] "Review randomization error in row 572"
## [1] "Review randomization error in row 607 for video 1"
## [1] "Review randomization error in row 607 for video 2"
## [1] "Review randomization error in row 607 for video 3"
## [1] "Review randomization error in row 607 for video 4"
## [1] "Review randomization error in row 607 for video 5"
## [1] "Review randomization error in row 607 for video 6"
## [1] "Review randomization error in row 607 for video 7"
## [1] "Review randomization error in row 607 for video 8"
## [1] "Review randomization error in row 607 for video 9"
## [1] "Review randomization error in row 607 for video 10"
## [1] "Review randomization error in row 607 for video 11"
## [1] "Review randomization error in row 607 for video 12"
## [1] "Review randomization error in row 607"
## [1] "Review randomization error in row 706 for video 4"
## [1] "Review randomization error in row 706 for video 5"
## [1] "Review randomization error in row 706 for video 6"
## [1] "Review randomization error in row 706 for video 7"
## [1] "Review randomization error in row 706 for video 8"
## [1] "Review randomization error in row 706 for video 9"
## [1] "Review randomization error in row 706 for video 10"
## [1] "Review randomization error in row 706 for video 11"
## [1] "Review randomization error in row 706 for video 12"
## [1] "Review randomization error in row 706"
## [1] "Review randomization error in row 758 for video 4"
## [1] "Review randomization error in row 758 for video 5"
## [1] "Review randomization error in row 758 for video 6"
## [1] "Review randomization error in row 758 for video 7"
## [1] "Review randomization error in row 758 for video 8"
## [1] "Review randomization error in row 758 for video 9"
## [1] "Review randomization error in row 758 for video 10"
## [1] "Review randomization error in row 758 for video 11"
## [1] "Review randomization error in row 758 for video 12"
## [1] "Review randomization error in row 758"
# Check that each recorded choice is one of the three titles in its choice set.
#
# The k-th column whose name ends in "Choice<digit>" is compared with the
# titles of videos 3k-2 to 3k (choice 1 with videos 1-3, choice 2 with 4-6, ...).
#
# Args:
#   row: row index into the global df_final.
# Returns:
#   TRUE when all four choices appear among their set's titles, FALSE otherwise.
check_mc_randomization <- function(row) {
  mc_columns <- names(df_final)[grepl("Choice[0-9]$", names(df_final))]
  mc <- df_final %>%
    dplyr::select(all_of(mc_columns)) %>%
    dplyr::slice(row) %>%
    unlist() %>%
    as.character()
  video_title <- df_final %>%
    dplyr::select(starts_with("video")) %>%
    dplyr::select(ends_with("Title")) %>%
    dplyr::slice(row) %>%
    unlist() %>%
    as.character()
  # Compare plain character vectors rather than one-row data frames.
  choice_in_set <- vapply(
    1:4,
    function(choice_set) {
      mc[choice_set] %in% video_title[(choice_set - 1) * 3 + 1:3]
    },
    logical(1)
  )
  all(choice_in_set)
}
# seq_len() keeps the loop from running on rows 1 and 0 when df_final is empty.
for (i in seq_len(nrow(df_final))) {
  if (!check_mc_randomization(i)) {
    print(paste("Multiple choice randomization error in row", i))
    error_responseId <- c(error_responseId, df_final$ResponseId[i])
  }
}
# Remove every respondent flagged by one of the randomization checks.
df_final <- filter(df_final, !ResponseId %in% error_responseId)
# Respondents with at least one page submitted within 10 (time units as
# recorded in the *time_Page.Submit columns), most such pages first.
# The AIreview_use page is left out of the count.
quick_responses <- df_final %>%
  dplyr::select(
    ProlificID, ResponseId,
    ends_with("time_Page.Submit"), -AIreview_use_time_Page.Submit
  ) %>%
  # timing columns may be stored as text
  dplyr::mutate(across(ends_with("time_Page.Submit"), as.numeric)) %>%
  # count the timing columns (everything except the two id columns) at or below 10
  dplyr::mutate(
    quick_response_count = rowSums(dplyr::select(., -ProlificID, -ResponseId) <= 10)
  ) %>%
  dplyr::filter(quick_response_count > 0) %>%
  arrange(desc(quick_response_count)) %>%
  dplyr::select(
    ProlificID, quick_response_count,
    ends_with("time_Page.Submit"), ResponseId
  )
quick_responses
# All respondents with a quick page, and the subset with at most two of them.
quick_response_ids <- quick_responses$ResponseId
quick_response_ids_loose <- quick_responses %>%
  filter(quick_response_count <= 2) %>%
  dplyr::pull(ResponseId)
# Reshape wide survey data to one row per respondent and choice set.
#
# Each input row holds four choices (Choice1-Choice4) over twelve videos
# (video1*-video12*); choice k refers to videos 3k-2 to 3k. The output has
# four rows per input row, each with the respondent's non-Choice/non-video
# columns plus the three videos of that set renamed to video1-video3.
#
# Args:
#   df: data frame with columns ChoiceK (K = 1..4) and videoJTitle,
#       videoJCommentsShown, videoJCond (J = 1..12).
# Returns:
#   Data frame with columns: the covariates, choice_set, choice,
#   video1-3Title, video1-3CommentsShown, video1-3Cond and treatment (the
#   condition of the chosen video; NA when the choice matches no title in
#   the set). NULL when `df` has no rows.
reshape_video_choice_data <- function(df) {
  # Covariate columns (shared across all rows for a respondent)
  covariate_cols <- setdiff(names(df), grep("^Choice|^video", names(df), value = TRUE))
  n_choice_sets <- 4
  # Preallocate one slot per (respondent, choice set) instead of growing a list.
  reshaped_list <- vector("list", nrow(df) * n_choice_sets)
  slot <- 0L
  for (i in seq_len(nrow(df))) {
    covariates <- df[i, covariate_cols, drop = FALSE]
    for (choice_num in seq_len(n_choice_sets)) {
      # Indices for the three videos in this choice set
      video_indices <- ((choice_num - 1) * 3 + 1):(choice_num * 3)
      video_titles <- lapply(
        paste0("video", video_indices, "Title"),
        function(col) df[[col]][i]
      )
      video_comments <- lapply(
        paste0("video", video_indices, "CommentsShown"),
        function(col) df[[col]][i]
      )
      video_conds <- lapply(
        paste0("video", video_indices, "Cond"),
        function(col) df[[col]][i]
      )
      choice_val <- df[[paste0("Choice", choice_num)]][i]
      # Position(s) of the chosen title within the set
      choice_cond <- which(vapply(
        video_titles,
        function(title) isTRUE(title == choice_val),
        logical(1)
      ))
      # Without a match the treatment would be NULL, which silently drops the
      # column and breaks the final rbind(); record it as missing instead.
      treatment <- if (length(choice_cond) == 0) {
        NA_character_
      } else {
        unname(unlist(video_conds[choice_cond]))
      }
      slot <- slot + 1L
      reshaped_list[[slot]] <- cbind(
        covariates,
        data.frame(
          choice_set = choice_num,
          choice = choice_val,
          video1Title = video_titles[[1]],
          video2Title = video_titles[[2]],
          video3Title = video_titles[[3]],
          video1CommentsShown = video_comments[[1]],
          video2CommentsShown = video_comments[[2]],
          video3CommentsShown = video_comments[[3]],
          video1Cond = video_conds[[1]],
          video2Cond = video_conds[[2]],
          video3Cond = video_conds[[3]],
          treatment = treatment
        )
      )
    }
  }
  if (length(reshaped_list) == 0) {
    return(NULL)
  }
  # Combine all rows into a data.frame
  reshaped_df <- do.call(rbind, reshaped_list)
  rownames(reshaped_df) <- NULL
  reshaped_df
}
# note: should clear out the people who finished too quickly first.
df_long_4 <- reshape_video_choice_data(df_final)
# NOTE(review): this replaces `choice` (the chosen title returned by the
# reshape) with the condition of the chosen video as a factor -- confirm the
# overwrite is intended.
df_long_4[["choice"]] <- factor(
  df_long_4[["treatment"]],
  levels = c("allNonAi", "half", "allAi")
)
# Reshape wide survey data to one row per respondent, choice set and video.
#
# Each input row yields 12 output rows (4 choice sets x 3 videos). Every output
# row carries the respondent's non-Choice/non-video columns plus: choice_set,
# video_in_set, videoId (from map_title_to_grouping), videoTitle,
# videoCommentsShown, videoCond, chosen (1 when the title equals the set's
# choice, else 0) and choice_time_PageSubmit (numeric value of
# Choice<k>_time_Page.Submit, NA when that column is absent).
reshape_video_choice_data_12rows <- function(df) {
  shared_cols <- setdiff(names(df), grep("^Choice|^video", names(df), value = TRUE))
  long_rows <- list()
  for (resp in seq_len(nrow(df))) {
    for (set_id in 1:4) {
      shared <- df[resp, shared_cols, drop = FALSE]
      picked_title <- df[[paste0("Choice", set_id)]][resp]
      # Timing of this choice page, when the export contains it
      timing_col <- paste0("Choice", set_id, "_time_Page.Submit")
      timing <- NA
      if (timing_col %in% names(df)) {
        timing <- df[[timing_col]][resp]
      }
      for (pos in 1:3) {
        # Position of this video among the twelve video* column groups
        video_no <- (set_id - 1) * 3 + pos
        title <- df[[paste0("video", video_no, "Title")]][resp]
        long_rows[[length(long_rows) + 1]] <- cbind(
          shared,
          data.frame(
            choice_set = set_id,
            video_in_set = pos,
            videoId = map_title_to_grouping(title),
            videoTitle = title,
            videoCommentsShown = df[[paste0("video", video_no, "CommentsShown")]][resp],
            videoCond = df[[paste0("video", video_no, "Cond")]][resp],
            chosen = as.integer(title == picked_title),
            choice_time_PageSubmit = as.numeric(timing)
          )
        )
      }
    }
  }
  stacked <- do.call(rbind, long_rows)
  rownames(stacked) <- NULL
  stacked
}
# Long table: one row per respondent x choice set x video shown.
df_long <- reshape_video_choice_data_12rows(df_final)
# further processing
# Treatment is the video's condition, with allNonAi as the reference level.
df_long[["treatment"]] <- factor(
  df_long[["videoCond"]],
  levels = c("allNonAi", "half", "allAi")
)
# also create four columns of commentsShown
# Split a "||||"-delimited comment string into its individual comments.
# Only the first element of `comments` is split and returned.
separate_comments <- function(comments) {
  pieces <- strsplit(comments, "\\|\\|\\|\\|")
  pieces[[1]]
}
# Split every row's comment string once, then fill videoComment1..4 with
# whole-column assignments. This avoids `1:nrow()` (which misbehaves on an
# empty table) and the per-element `df$col[i] <-` writes that copy the data
# frame on every iteration. Rows with fewer than four comments get NA in the
# remaining columns.
comment_parts <- lapply(df_long$videoCommentsShown, separate_comments)
for (k in seq_len(4)) {
  df_long[[paste0("videoComment", k)]] <- vapply(
    comment_parts,
    function(parts) parts[k],
    character(1)
  )
}
# Baseline logit of `chosen` on treatment with choice_set, videoTitle and
# video_in_set fixed effects; standard errors clustered on ResponseId.
panel_lm <- feglm(
  fml = chosen ~ treatment | choice_set + videoTitle + video_in_set,
  data = df_long,
  family = binomial("logit"),
  cluster = "ResponseId"
)
summary(panel_lm)
## GLM estimation, family = binomial, Dep. Var.: chosen
## Observations: 9,480
## Fixed-effects: choice_set: 4, videoTitle: 12, video_in_set: 3
## Standard-errors: Clustered (ResponseId)
## Estimate Std. Error z value Pr(>|z|)
## treatmenthalf 0.456476 0.071064 6.42341 0.00000000013325 ***
## treatmentallAi 1.055624 0.078600 13.43041 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood: -5,795.0 Adj. Pseudo R2: 0.036657
## BIC: 11,763.9 Squared Cor.: 0.050582
Choice Share (Should Add Up to 1)
# Fitted choice probabilities from the baseline model, one per row.
df_long[["pred"]] <- predict(panel_lm, type = "response")
# Model-based choice share by condition: mean fitted probability per treatment
choice_share <- aggregate(pred ~ treatment, data = df_long, FUN = mean)
choice_share
# Baseline specification plus the terms listed in
# `covariates_simple_demeaned_treatment`, with the same fixed effects.
cov_form <- as.formula(
  paste(
    "chosen ~ treatment + ",
    paste(covariates_simple_demeaned_treatment, collapse = " + "),
    "| choice_set + videoTitle + video_in_set"
  )
)
panel_lm_with_cov <- feglm(
  fml = cov_form,
  data = df_long,
  family = binomial("logit"),
  cluster = "ResponseId"
)
# Coefficient map for the regression tables: only the two treatment terms are
# displayed, under readable labels.
keep <- list(
  treatmenthalf = "Treatment: Mixed Reviews",
  treatmentallAi = "Treatment: All AI Reviews"
)
model_list <- list(panel_lm, panel_lm_with_cov)
# Side-by-side text table of the models without and with covariates.
texreg::screenreg(
  model_list,
  custom.model.names = c("Without Covariates", "With Covariates"),
  custom.coef.names = c("Treatment: Mixed Reviews", "Treatment: All AI Reviews"),
  custom.coef.map = keep,
  custom.note = "Standard errors are clustered at the user level.",
  stars = c(0.05, 0.01, 0.001),
  digits = 4,
  caption = "Panel Regression Results",
  label = "tab:panel_regression"
)
##
## ==============================================================
## Without Covariates With Covariates
## --------------------------------------------------------------
## Treatment: Mixed Reviews 0.4565 *** 0.4692 ***
## (0.0711) (0.0708)
## Treatment: All AI Reviews 1.0556 *** 1.0783 ***
## (0.0786) (0.0786)
## --------------------------------------------------------------
## Num. obs. 9480 9468
## Num. groups: choice_set 4 4
## Num. groups: videoTitle 12 12
## Num. groups: video_in_set 3 3
## Deviance 11589.9160 11468.3730
## Log Likelihood -5794.9580 -5734.1865
## Pseudo R^2 0.0367 0.0341
## ==============================================================
## Standard errors are clustered at the user level.
Constrain to First Choice
# Baseline specification estimated on rows from the first choice set only.
panel_lm_1st <- feglm(
  fml = chosen ~ treatment | choice_set + videoTitle + video_in_set,
  data = filter(df_long, choice_set == 1),
  family = binomial("logit"),
  cluster = "ResponseId"
)
summary(panel_lm_1st)
## GLM estimation, family = binomial, Dep. Var.: chosen
## Observations: 2,370
## Fixed-effects: choice_set: 1, videoTitle: 12, video_in_set: 3
## Standard-errors: Clustered (ResponseId)
## Estimate Std. Error z value Pr(>|z|)
## treatmenthalf 0.387169 0.135257 2.86246 0.0042036877231000922 **
## treatmentallAi 1.097414 0.137171 8.00033 0.0000000000000012409 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood: -1,430.6 Adj. Pseudo R2: 0.04171
## BIC: 2,985.6 Squared Cor.: 0.065872
# Covariate-adjusted specification on rows from the first choice set only.
panel_lm_1st_with_cov <- feglm(
  fml = cov_form,
  data = filter(df_long, choice_set == 1),
  family = binomial("logit"),
  cluster = "ResponseId"
)
model_list <- list(panel_lm_1st, panel_lm_1st_with_cov)
# Side-by-side text table of the first-choice-set models.
texreg::screenreg(
  model_list,
  custom.model.names = c("Without Covariates", "With Covariates"),
  custom.coef.names = c("Treatment: Mixed Reviews", "Treatment: All AI Reviews"),
  custom.coef.map = keep,
  custom.note = "Standard errors are clustered at the user level.",
  stars = c(0.05, 0.01, 0.001),
  digits = 4,
  caption = "Panel Regression Results",
  label = "tab:panel_regression"
)
##
## ==============================================================
## Without Covariates With Covariates
## --------------------------------------------------------------
## Treatment: Mixed Reviews 0.3872 ** 0.4387 **
## (0.1353) (0.1406)
## Treatment: All AI Reviews 1.0974 *** 1.1739 ***
## (0.1372) (0.1421)
## --------------------------------------------------------------
## Num. obs. 2370 2367
## Num. groups: choice_set 1 1
## Num. groups: videoTitle 12 12
## Num. groups: video_in_set 3 3
## Deviance 2861.2353 2778.3991
## Log Likelihood -1430.6176 -1389.1996
## Pseudo R^2 0.0417 0.0222
## ==============================================================
## Standard errors are clustered at the user level.
We have customized subgroup analyses for Length, Sentiment, and Choice Time (i.e. how much time a person spends on a choice question).
Preprocessing of Subgroup Variables
# Comment Length
# Character count of each of the four displayed comments.
df_long$videoComment1Length <- nchar(df_long$videoComment1)
df_long$videoComment2Length <- nchar(df_long$videoComment2)
df_long$videoComment3Length <- nchar(df_long$videoComment3)
df_long$videoComment4Length <- nchar(df_long$videoComment4)
# Total length across the four comments shown with a video.
df_long$videoCommentOverallLength <- df_long$videoComment1Length +
  df_long$videoComment2Length +
  df_long$videoComment3Length +
  df_long$videoComment4Length
# # winsorize
# df_long$videoCommentOverallLengthWinsorize <- Winsorize(df_long$videoCommentOverallLength, val = quantile(df_long$videoCommentOverallLength, probs = c(0, 0.99), na.rm = T))
# 1 when the total length is strictly above the sample median, else 0.
# `TRUE` is spelled out because `T` is an ordinary, reassignable variable.
df_long$videoCommentOverallLengthAboveMedian <- ifelse(
  df_long$videoCommentOverallLength >
    median(df_long$videoCommentOverallLength, na.rm = TRUE),
  1,
  0
)
# plot distribution of comment overall length
# ggplot(df_long, aes(x = videoCommentOverallLength)) +
# geom_histogram(binwidth = 10, fill = "blue", alpha = 0.7) +
# theme_minimal() +
# labs(title = "Distribution of Overall Comment Length", x = "Overall Length", y = "Frequency")
# Comment Sentiment
# Look up each displayed comment's sentiment label in `review_database` by
# exact match on `Content`. match() takes the first matching row and yields NA
# when a comment is not found (missing comments never match), so an unmatched
# comment produces NA instead of aborting with a zero-length replacement, and
# each column is filled in one vectorized pass rather than four filter() scans
# per row.
for (k in seq_len(4)) {
  comment_row <- match(
    df_long[[paste0("videoComment", k)]],
    review_database$Content,
    incomparables = NA
  )
  df_long[[paste0("videoComment", k, "Sentiment")]] <- review_database$sentiment[comment_row]
}
# Convert the four sentiment columns to factors with a fixed level order.
df_long[paste0("videoComment", 1:4, "Sentiment")] <- lapply(
  df_long[paste0("videoComment", 1:4, "Sentiment")],
  factor,
  levels = c("negative", "neutral", "positive")
)
# row-wise how many comments have positive / negative sentiments
# The labels are compared as character strings in an n x 4 matrix. The
# previous per-row `c()` on factors only kept the labels on R >= 4.1 (older
# versions return integer codes, so the comparison was always FALSE), and
# sapply() changes its return type on empty input. A row with any missing
# sentiment still yields NA, as before.
sentiment_labels <- do.call(
  cbind,
  lapply(df_long[paste0("videoComment", 1:4, "Sentiment")], as.character)
)
df_long$videoCommentNumPosSentiment <- as.integer(rowSums(sentiment_labels == "positive"))
df_long$videoCommentNumNegSentiment <- as.integer(rowSums(sentiment_labels == "negative"))
# Binary flags on the page-submit time of the choice question: 1 when strictly
# above the sample median / above 10, else 0. `TRUE` is spelled out because
# `T` is an ordinary, reassignable variable.
df_long$choice_time_PageSubmitAboveMedian <- ifelse(
  df_long$choice_time_PageSubmit >
    median(df_long$choice_time_PageSubmit, na.rm = TRUE),
  1,
  0
)
df_long$choice_time_PageSubmitAbove10 <- ifelse(df_long$choice_time_PageSubmit > 10, 1, 0)
# Add a two-level factor column `<var>_median` to `df`.
#
# Args:
#   df:  data.frame containing the column named by `var`.
#   var: name (string) of a numeric column.
#
# Returns:
#   `df` with `<var>_median` added: "Above Median" when the value is strictly
#   greater than the column median (NAs ignored when computing it), otherwise
#   "Below Median" (which therefore includes values equal to the median); NA
#   stays NA. Levels are ordered Below Median, Above Median.
median_split <- function(df, var) {
  # `TRUE` rather than `T`: `T` is an ordinary variable that can be reassigned.
  median_val <- median(df[[var]], na.rm = TRUE)
  split_col <- paste0(var, "_median")
  split_labels <- ifelse(df[[var]] > median_val, "Above Median", "Below Median")
  df[[split_col]] <- factor(split_labels, levels = c("Below Median", "Above Median"))
  df
}
# Add a four-level factor column `<var>_quartile` (Q1..Q4) to `df`.
#
# Args:
#   df:  data.frame containing the column named by `var`.
#   var: name (string) of a numeric column.
#
# Returns:
#   `df` with `<var>_quartile` added. Bins are right-closed intervals bounded
#   by -Inf, the 25/50/75% sample quantiles (NAs ignored) and Inf, so every
#   non-missing value falls in a bin.
quartile_split <- function(df, var) {
  # `TRUE` rather than `T`: `T` is an ordinary variable that can be reassigned.
  quartiles <- quantile(df[[var]], probs = c(0.25, 0.5, 0.75), na.rm = TRUE)
  df[[paste0(var, "_quartile")]] <- cut(
    df[[var]],
    breaks = c(-Inf, quartiles, Inf),
    labels = c("Q1", "Q2", "Q3", "Q4")
  )
  df
}
# Add a five-level factor column `<var>_quintile` (Q1..Q5) to `df`.
#
# Args:
#   df:  data.frame containing the column named by `var`.
#   var: name (string) of a numeric column.
#
# Returns:
#   `df` with `<var>_quintile` added. Bins are bounded by the 0/20/40/60/80/100%
#   sample quantiles (NAs ignored).
quintile_split <- function(df, var) {
  quintiles <- quantile(df[[var]], probs = seq(0, 1, by = 0.2), na.rm = TRUE)
  # cut() builds right-closed intervals, so without include.lowest = TRUE the
  # sample minimum (the lowest break itself) falls outside every bin and
  # becomes NA, silently dropping those rows from models using this column.
  df[[paste0(var, "_quintile")]] <- cut(
    df[[var]],
    breaks = quintiles,
    labels = c("Q1", "Q2", "Q3", "Q4", "Q5"),
    include.lowest = TRUE
  )
  df
}
# Add `<var>_recenter`: the column shifted so the middle distinct value is 0.
#
# The centre is the middle element of the sorted distinct non-missing values
# of `df[[var]]` (not the sample median), so it is only defined when the
# number of distinct values is odd.
#
# Args:
#   df:  data.frame containing the column named by `var`.
#   var: name (string) of a numeric column.
#
# Returns:
#   `df` with `<var>_recenter` = `df[[var]]` minus the middle distinct value.
#
# Raises:
#   An error when the number of distinct non-missing values is not odd.
recenter <- function(df, var) {
  # Plain nested calls keep the helper usable without magrittr attached.
  distinct_values <- sort(unique(df[[var]]))
  n_distinct <- length(distinct_values)
  if (n_distinct %% 2 != 1) {
    stop(
      sprintf(
        "Cannot recenter '%s': it has %d distinct non-missing values, but an odd number is required to pick a middle value.",
        var, n_distinct
      )
    )
  }
  median_val <- distinct_values[(n_distinct + 1) / 2]
  df[[paste0(var, "_recenter")]] <- df[[var]] - median_val
  df
}
# Add `<var>_median` factor columns for the demographic, behavioural and
# mechanism variables. `review_freq_numeric` was previously split twice; the
# second call only recomputed the same column, so it is listed once here.
df_long <- df_long %>%
  median_split("social_media_use_numeric") %>%
  median_split("website_use_numeric") %>%
  median_split("social_media_reply_numeric") %>%
  median_split("review_freq_numeric") %>%
  median_split("age") %>%
  median_split("income_numeric") %>%
  median_split("libcons_numeric") %>%
  median_split("read_review_freq_numeric") %>%
  median_split("AIreview_use_numeric") %>%
  median_split("mech_breadth") %>%
  median_split("mech_length") %>%
  median_split("mech_detail") %>%
  median_split("mech_tone") %>%
  median_split("mech_valence")
# Add `<var>_recenter` columns by applying recenter() to each listed variable
# in turn, threading the growing data frame through the calls.
df_long <- Reduce(
  recenter,
  c(
    "libcons_numeric",
    "review_essential_numeric",
    "AIreview_use_numeric",
    "mech_breadth",
    "mech_length",
    "mech_detail",
    "mech_tone",
    "mech_valence"
  ),
  df_long
)
# Quartile and quintile bins for total comment length and choice time.
df_long <- quartile_split(df_long, "videoCommentOverallLength")
df_long <- quartile_split(df_long, "choice_time_PageSubmit")
df_long <- quintile_split(df_long, "videoCommentOverallLength")
df_long <- quintile_split(df_long, "choice_time_PageSubmit")
# Two-level factor: response equal to 5 versus anything else ("< 5" is the
# reference level).
df_long$review_essential_numeric_5 <- factor(
  ifelse(df_long$review_essential_numeric == 5, "Very Important (= 5)", "< 5"),
  levels = c("< 5", "Very Important (= 5)")
)
# Collapse the six education categories into four; unmatched or missing values
# stay NA.
# NOTE(review): "2-year college degree" is grouped under "Bachelor's Degree" --
# confirm this grouping is intended.
df_long$edu_combined <- factor(
  case_when(
    df_long$edu %in% c(
      "Did not graduate from high school",
      "High school graduate (high school diploma or equivalent including GED)"
    ) ~ "High School or Less",
    df_long$edu %in% "Some college, but no degree" ~ "Some College",
    df_long$edu %in% c("2-year college degree", "4-year college degree") ~ "Bachelor's Degree",
    df_long$edu %in% "Postgraduate degree (MA, MBA, JD, PhD, etc.)" ~ "Graduate Degree"
  ),
  levels = c("High School or Less", "Some College", "Bachelor's Degree", "Graduate Degree")
)
# Short race labels; every value not listed (including missing) becomes
# "Other". "White" is the reference level.
df_long$race_combined <- factor(
  with(
    df_long,
    case_when(
      race == "Asian/Pacific Islander" ~ "Asian",
      race == "Black or African American" ~ "Black",
      race == "Latino or Hispanic" ~ "Hispanic",
      race == "Caucasian/White" ~ "White",
      TRUE ~ "Other"
    )
  ),
  levels = c("White", "Black", "Hispanic", "Asian", "Other")
)
# Keep Democrat and Republican as they are; every other value (including
# missing) becomes "Other". "Democrat" is the reference level.
df_long$polparty_combined <- factor(
  case_when(
    df_long$polparty %in% c("Democrat", "Republican") ~ as.character(df_long$polparty),
    TRUE ~ "Other"
  ),
  levels = c("Democrat", "Republican", "Other")
)
Median Split of Overall Length
# Treatment interacted with the above-median total-comment-length flag.
panel_lm_length_median <- feglm(
  fml = chosen ~ treatment * videoCommentOverallLengthAboveMedian |
    choice_set + videoTitle + video_in_set,
  data = df_long,
  family = binomial("logit"),
  cluster = "ResponseId"
)
summary(panel_lm_length_median)
## GLM estimation, family = binomial, Dep. Var.: chosen
## Observations: 9,480
## Fixed-effects: choice_set: 4, videoTitle: 12, video_in_set: 3
## Standard-errors: Clustered (ResponseId)
## Estimate Std. Error
## treatmenthalf 0.352550 0.081989
## treatmentallAi 0.754321 0.182306
## videoCommentOverallLengthAboveMedian 0.420075 0.308453
## treatmenthalf:videoCommentOverallLengthAboveMedian -0.210705 0.317135
## treatmentallAi:videoCommentOverallLengthAboveMedian -0.095896 0.349537
## z value Pr(>|z|)
## treatmenthalf 4.299965 0.000017083 ***
## treatmentallAi 4.137655 0.000035087 ***
## videoCommentOverallLengthAboveMedian 1.361875 0.173237211
## treatmenthalf:videoCommentOverallLengthAboveMedian -0.664401 0.506433661
## treatmentallAi:videoCommentOverallLengthAboveMedian -0.274352 0.783814277
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood: -5,789.0 Adj. Pseudo R2: 0.037141
## BIC: 11,779.5 Squared Cor.: 0.051821
Quintile Split of Overall Length
# Treatment interacted with quintiles of total comment length.
panel_lm_length_quintile <- feglm(
  fml = chosen ~ treatment * videoCommentOverallLength_quintile |
    choice_set + videoTitle + video_in_set,
  data = df_long,
  family = binomial("logit"),
  cluster = "ResponseId"
)
## NOTE: 1 observation removed because of NA values (RHS: 1).
## The variable 'treatmentallAi:videoCommentOverallLength_quintileQ5' has been removed because of collinearity (see $collin.var).
summary(panel_lm_length_quintile)
## GLM estimation, family = binomial, Dep. Var.: chosen
## Observations: 9,479
## Fixed-effects: choice_set: 4, videoTitle: 12, video_in_set: 3
## Standard-errors: Clustered (ResponseId)
## Estimate Std. Error
## treatmenthalf -0.871769 0.590928
## treatmentallAi 11.710761 0.105380
## videoCommentOverallLength_quintileQ2 0.124140 0.090192
## videoCommentOverallLength_quintileQ3 -0.055309 0.245567
## videoCommentOverallLength_quintileQ4 0.523799 0.546973
## videoCommentOverallLength_quintileQ5 -10.493879 0.106384
## treatmenthalf:videoCommentOverallLength_quintileQ2 1.045270 0.600449
## treatmentallAi:videoCommentOverallLength_quintileQ2 -11.065947 0.309124
## treatmenthalf:videoCommentOverallLength_quintileQ3 1.484043 0.628001
## treatmentallAi:videoCommentOverallLength_quintileQ3 -10.904640 0.302454
## treatmenthalf:videoCommentOverallLength_quintileQ4 0.937523 0.808613
## treatmentallAi:videoCommentOverallLength_quintileQ4 -11.253822 0.554951
## treatmenthalf:videoCommentOverallLength_quintileQ5 12.117335 0.674641
## z value Pr(>|z|)
## treatmenthalf -1.475254 0.140144
## treatmentallAi 111.129240 < 2.2e-16 ***
## videoCommentOverallLength_quintileQ2 1.376393 0.168700
## videoCommentOverallLength_quintileQ3 -0.225228 0.821802
## videoCommentOverallLength_quintileQ4 0.957633 0.338248
## videoCommentOverallLength_quintileQ5 -98.641629 < 2.2e-16 ***
## treatmenthalf:videoCommentOverallLength_quintileQ2 1.740814 0.081716 .
## treatmentallAi:videoCommentOverallLength_quintileQ2 -35.797769 < 2.2e-16 ***
## treatmenthalf:videoCommentOverallLength_quintileQ3 2.363123 0.018122 *
## treatmentallAi:videoCommentOverallLength_quintileQ3 -36.053899 < 2.2e-16 ***
## treatmenthalf:videoCommentOverallLength_quintileQ4 1.159421 0.246285
## treatmentallAi:videoCommentOverallLength_quintileQ4 -20.278945 < 2.2e-16 ***
## treatmenthalf:videoCommentOverallLength_quintileQ5 17.961168 < 2.2e-16 ***
## ... 1 variable was removed because of collinearity (treatmentallAi:videoCommentOverallLength_quintileQ5)
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood: -5,778.8 Adj. Pseudo R2: 0.037454
## BIC: 11,832.2 Squared Cor.: 0.053867
Given that each panel observation has four comments (i.e., four comments per video), we create the following two variables:
- videoCommentNumPosSentiment: number of positive comments for the video
- videoCommentNumNegSentiment: number of negative comments for the video
panel_lm_sentiment_pos <- feglm(chosen ~ treatment * videoCommentNumPosSentiment | choice_set + videoTitle + video_in_set, family = binomial("logit"),
data = df_long , cluster = "ResponseId")
panel_lm_sentiment_pos %>% summary()
## GLM estimation, family = binomial, Dep. Var.: chosen
## Observations: 9,480
## Fixed-effects: choice_set: 4, videoTitle: 12, video_in_set: 3
## Standard-errors: Clustered (ResponseId)
## Estimate Std. Error z value
## treatmenthalf 0.153934 0.213964 0.719437
## treatmentallAi 1.066570 0.206163 5.173438
## videoCommentNumPosSentiment 0.185498 0.048654 3.812623
## treatmenthalf:videoCommentNumPosSentiment 0.086987 0.065510 1.327845
## treatmentallAi:videoCommentNumPosSentiment -0.022844 0.061805 -0.369615
## Pr(>|z|)
## treatmenthalf 0.47187147000
## treatmentallAi 0.00000022983 ***
## videoCommentNumPosSentiment 0.00013750002 ***
## treatmenthalf:videoCommentNumPosSentiment 0.18422941474
## treatmentallAi:videoCommentNumPosSentiment 0.71166939072
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood: -5,768.1 Adj. Pseudo R2: 0.040608
## BIC: 11,737.7 Squared Cor.: 0.056164
# Treatment interacted with the number of negative comments shown.
panel_lm_sentiment_neg <- feglm(
  fml = chosen ~ treatment * videoCommentNumNegSentiment |
    choice_set + videoTitle + video_in_set,
  data = df_long,
  family = binomial("logit"),
  cluster = "ResponseId"
)
summary(panel_lm_sentiment_neg)
## GLM estimation, family = binomial, Dep. Var.: chosen
## Observations: 9,480
## Fixed-effects: choice_set: 4, videoTitle: 12, video_in_set: 3
## Standard-errors: Clustered (ResponseId)
## Estimate Std. Error z value
## treatmenthalf 0.523598 0.083317 6.28440
## treatmentallAi 1.076036 0.090306 11.91538
## videoCommentNumNegSentiment -0.176993 0.060997 -2.90169
## treatmenthalf:videoCommentNumNegSentiment -0.172703 0.083174 -2.07641
## treatmentallAi:videoCommentNumNegSentiment -0.145880 0.085190 -1.71241
## Pr(>|z|)
## treatmenthalf 0.00000000032913 ***
## treatmentallAi < 2.2e-16 ***
## videoCommentNumNegSentiment 0.00371155880552 **
## treatmenthalf:videoCommentNumNegSentiment 0.03785641074412 *
## treatmentallAi:videoCommentNumNegSentiment 0.08682187961760 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood: -5,761.9 Adj. Pseudo R2: 0.041635
## BIC: 11,725.3 Squared Cor.: 0.057455
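We run three estimations of the choice-time interaction: a median split, a quintile split, and an indicator for a choice time above 10.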
# Treatment interacted with the above-median choice-time flag.
panel_lm_time <- feglm(
  fml = chosen ~ treatment * choice_time_PageSubmitAboveMedian |
    choice_set + videoTitle + video_in_set,
  data = df_long,
  family = binomial("logit"),
  cluster = "ResponseId"
)
summary(panel_lm_time)
## GLM estimation, family = binomial, Dep. Var.: chosen
## Observations: 9,480
## Fixed-effects: choice_set: 4, videoTitle: 12, video_in_set: 3
## Standard-errors: Clustered (ResponseId)
## Estimate Std. Error z value
## treatmenthalf 0.310931 0.100319 3.09943
## treatmentallAi 0.871230 0.105227 8.27953
## choice_time_PageSubmitAboveMedian -0.238508 0.091393 -2.60970
## treatmenthalf:choice_time_PageSubmitAboveMedian 0.298679 0.139662 2.13858
## treatmentallAi:choice_time_PageSubmitAboveMedian 0.376730 0.145741 2.58493
## Pr(>|z|)
## treatmenthalf 0.0019389 **
## treatmentallAi < 2.2e-16 ***
## choice_time_PageSubmitAboveMedian 0.0090621 **
## treatmenthalf:choice_time_PageSubmitAboveMedian 0.0324695 *
## treatmentallAi:choice_time_PageSubmitAboveMedian 0.0097400 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood: -5,788.9 Adj. Pseudo R2: 0.037162
## BIC: 11,779.3 Squared Cor.: 0.05161
# Treatment interacted with quintiles of choice time.
panel_lm_time_quintile <- feglm(
  fml = chosen ~ treatment * choice_time_PageSubmit_quintile |
    choice_set + videoTitle + video_in_set,
  data = df_long,
  family = binomial("logit"),
  cluster = "ResponseId"
)
## NOTE: 3 observations removed because of NA values (RHS: 3).
summary(panel_lm_time_quintile)
## GLM estimation, family = binomial, Dep. Var.: chosen
## Observations: 9,477
## Fixed-effects: choice_set: 4, videoTitle: 12, video_in_set: 3
## Standard-errors: Clustered (ResponseId)
## Estimate Std. Error z value
## treatmenthalf -0.156051 0.155370 -1.00438
## treatmentallAi 0.517267 0.168686 3.06644
## choice_time_PageSubmit_quintileQ2 -0.395064 0.135838 -2.90835
## choice_time_PageSubmit_quintileQ3 -0.480123 0.138573 -3.46477
## choice_time_PageSubmit_quintileQ4 -0.515611 0.139812 -3.68789
## choice_time_PageSubmit_quintileQ5 -0.648808 0.141834 -4.57441
## treatmenthalf:choice_time_PageSubmit_quintileQ2 0.637406 0.215430 2.95877
## treatmentallAi:choice_time_PageSubmit_quintileQ2 0.510398 0.225751 2.26089
## treatmenthalf:choice_time_PageSubmit_quintileQ3 0.750283 0.219138 3.42379
## treatmentallAi:choice_time_PageSubmit_quintileQ3 0.632886 0.223327 2.83390
## treatmenthalf:choice_time_PageSubmit_quintileQ4 0.857444 0.218542 3.92347
## treatmentallAi:choice_time_PageSubmit_quintileQ4 0.629506 0.228474 2.75526
## treatmenthalf:choice_time_PageSubmit_quintileQ5 0.870719 0.216064 4.02991
## treatmentallAi:choice_time_PageSubmit_quintileQ5 0.978068 0.233891 4.18172
## Pr(>|z|)
## treatmenthalf 0.3151936826
## treatmentallAi 0.0021662164 **
## choice_time_PageSubmit_quintileQ2 0.0036334565 **
## choice_time_PageSubmit_quintileQ3 0.0005306931 ***
## choice_time_PageSubmit_quintileQ4 0.0002261258 ***
## choice_time_PageSubmit_quintileQ5 0.0000047756 ***
## treatmenthalf:choice_time_PageSubmit_quintileQ2 0.0030887355 **
## treatmentallAi:choice_time_PageSubmit_quintileQ2 0.0237659298 *
## treatmenthalf:choice_time_PageSubmit_quintileQ3 0.0006175335 ***
## treatmentallAi:choice_time_PageSubmit_quintileQ3 0.0045983889 **
## treatmenthalf:choice_time_PageSubmit_quintileQ4 0.0000872818 ***
## treatmentallAi:choice_time_PageSubmit_quintileQ4 0.0058645425 **
## treatmenthalf:choice_time_PageSubmit_quintileQ5 0.0000557985 ***
## treatmentallAi:choice_time_PageSubmit_quintileQ5 0.0000289315 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood: -5,771.0 Adj. Pseudo R2: 0.038334
## BIC: 11,825.9 Squared Cor.: 0.054491
# Treatment interacted with the choice-time-above-10 flag.
panel_lm_time_10 <- feglm(
  fml = chosen ~ treatment * choice_time_PageSubmitAbove10 |
    choice_set + videoTitle + video_in_set,
  data = df_long,
  family = binomial("logit"),
  cluster = "ResponseId"
)
summary(panel_lm_time_10)
## GLM estimation, family = binomial, Dep. Var.: chosen
## Observations: 9,480
## Fixed-effects: choice_set: 4, videoTitle: 12, video_in_set: 3
## Standard-errors: Clustered (ResponseId)
## Estimate Std. Error z value
## treatmenthalf -0.477802 0.222686 -2.145626
## treatmentallAi 0.145578 0.232593 0.625891
## choice_time_PageSubmitAbove10 -0.713656 0.137802 -5.178861
## treatmenthalf:choice_time_PageSubmitAbove10 1.050914 0.233881 4.493377
## treatmentallAi:choice_time_PageSubmitAbove10 1.025878 0.242195 4.235754
## Pr(>|z|)
## treatmenthalf 0.03190282812 *
## treatmentallAi 0.53138647221
## choice_time_PageSubmitAbove10 0.00000022324 ***
## treatmenthalf:choice_time_PageSubmitAbove10 0.00000701026 ***
## treatmentallAi:choice_time_PageSubmitAbove10 0.00002277860 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood: -5,773.3 Adj. Pseudo R2: 0.039753
## BIC: 11,748.0 Squared Cor.: 0.054627
These demographic/behavioral variables use a median split:
- social_media_use_numeric
- website_use_numeric
- social_media_reply_numeric
- review_freq_numeric
- age
- income_numeric
- libcons_numeric
- read_review_freq_numeric
- review_essential_numeric (analyzed below as = 5 versus < 5 rather than as a median split)
- AIreview_use_numeric
We also use a median split for the following "mechanism" variables (original questions on a 1 - 5 scale):
- mech_breadth: one topic versus many topics
- mech_length: short versus long reviews
- mech_detail: general versus detailed
- mech_tone: neutral, factual versus emotional, expressive
- mech_valence: critical versus praising
For review_essential_numeric_5, we use whether the response is = 5 or < 5 (on a 1 - 5 scale). Note: this could suggest experimenter demand bias.
# Subgroup variables to interact with treatment, one model per variable.
# "review_freq_numeric_median" was listed twice, which refitted and reprinted
# the same model; each variable now appears once.
subgroups_columns <- c(
  "social_media_use_numeric_median",
  "website_use_numeric_median",
  "social_media_reply_numeric_median",
  "review_freq_numeric_median",
  "age_median",
  "income_numeric_median",
  "libcons_numeric_median",
  "edu_combined",
  "race_combined",
  "polparty_combined",
  "read_review_freq_numeric_median",
  "review_essential_numeric_5",
  "AIreview_use_numeric_median",
  "mech_breadth_median",
  "mech_length_median",
  "mech_detail_median",
  "mech_tone_median",
  "mech_valence_median"
)
# Fit treatment x subgroup with the usual fixed effects and store each model
# under the name subgroup_lm_<variable> so the reporting loop can get() it.
for (subgroup in subgroups_columns) {
  print(subgroup)
  subgroup_form <- as.formula(
    paste0("chosen ~ treatment * ", subgroup, " | choice_set + videoTitle + video_in_set")
  )
  subgroup_lm <- feglm(
    subgroup_form,
    family = binomial("logit"),
    data = df_long,
    cluster = "ResponseId"
  )
  assign(paste0("subgroup_lm_", subgroup), subgroup_lm)
}
## [1] "social_media_use_numeric_median"
## [1] "website_use_numeric_median"
## [1] "social_media_reply_numeric_median"
## [1] "review_freq_numeric_median"
## [1] "age_median"
## [1] "income_numeric_median"
## [1] "libcons_numeric_median"
## [1] "edu_combined"
## [1] "race_combined"
## [1] "polparty_combined"
## [1] "review_freq_numeric_median"
## [1] "read_review_freq_numeric_median"
## [1] "review_essential_numeric_5"
## [1] "AIreview_use_numeric_median"
## [1] "mech_breadth_median"
## [1] "mech_length_median"
## [1] "mech_detail_median"
## [1] "mech_tone_median"
## [1] "mech_valence_median"
# Print one markdown coefficient table per subgroup model.
for (subgroup in subgroups_columns){
# Fetch the model stored as subgroup_lm_<variable>.
subgroup_lm <- get(paste0("subgroup_lm_", subgroup))
coef_table <- summary(subgroup_lm)$coeftable
# Markdown heading for this subgroup's section.
cat("#### ", subgroup, "\n")
print(paste0("Values for this variable: ", paste(unique(df_long[[subgroup]]), collapse = ", ")))
# Keep every coefficient row except those whose name contains "video" and any intercept row.
rows_to_extract <- rownames(coef_table)[!str_detect(rownames(coef_table), "video") & (rownames(coef_table) != "(Intercept)")]
subgroup_coef <- coef_table[rows_to_extract, ]
print(kable(subgroup_coef, format = "markdown"))
cat("\n")
}
[1] “Values for this variable: Below Median, Above Median”
| Estimate | Std. Error | z value | Pr(>|z|) | |
|---|---|---|---|---|
| treatmenthalf | 0.4990683 | 0.0881494 | 5.6616182 | 0.0000000 |
| treatmentallAi | 1.1249981 | 0.0949216 | 11.8518626 | 0.0000000 |
| website_use_numeric_medianAbove Median | 0.1215868 | 0.1020944 | 1.1909256 | 0.2336828 |
| treatmenthalf:website_use_numeric_medianAbove Median | -0.1262011 | 0.1484070 | -0.8503714 | 0.3951186 |
| treatmentallAi:website_use_numeric_medianAbove Median | -0.2071179 | 0.1685576 | -1.2287666 | 0.2191593 |
[1] “Values for this variable: Below Median, Above Median”
| Estimate | Std. Error | z value | Pr(>|z|) | |
|---|---|---|---|---|
| treatmenthalf | 0.5879337 | 0.0926056 | 6.348791 | 0.0000000 |
| treatmentallAi | 1.1044790 | 0.1005708 | 10.982099 | 0.0000000 |
| review_freq_numeric_medianAbove Median | 0.1449329 | 0.0977059 | 1.483360 | 0.1379789 |
| treatmenthalf:review_freq_numeric_medianAbove Median | -0.3051722 | 0.1432150 | -2.130868 | 0.0331000 |
| treatmentallAi:review_freq_numeric_medianAbove Median | -0.1104911 | 0.1599153 | -0.690935 | 0.4896064 |
[1] “Values for this variable: Below Median, Above Median”
| Estimate | Std. Error | z value | Pr(>|z|) | |
|---|---|---|---|---|
| treatmenthalf | 0.4819323 | 0.1013851 | 4.7534850 | 0.0000020 |
| treatmentallAi | 1.1087732 | 0.1086471 | 10.2052750 | 0.0000000 |
| age_medianAbove Median | 0.0612048 | 0.0965396 | 0.6339861 | 0.5260899 |
| treatmenthalf:age_medianAbove Median | -0.0530461 | 0.1415220 | -0.3748256 | 0.7077902 |
| treatmentallAi:age_medianAbove Median | -0.1114428 | 0.1571823 | -0.7090037 | 0.4783222 |
[1] “Values for this variable: Above Median, Below Median”
| Estimate | Std. Error | z value | Pr(>|z|) | |
|---|---|---|---|---|
| treatmenthalf | 0.4848226 | 0.0860773 | 5.6324115 | 0.0000000 |
| treatmentallAi | 0.9995116 | 0.0969034 | 10.3145184 | 0.0000000 |
| income_numeric_medianAbove Median | -0.0353410 | 0.1027440 | -0.3439713 | 0.7308679 |
| treatmenthalf:income_numeric_medianAbove Median | -0.0849464 | 0.1519029 | -0.5592156 | 0.5760146 |
| treatmentallAi:income_numeric_medianAbove Median | 0.1643352 | 0.1653160 | 0.9940669 | 0.3201903 |
[1] “Values for this variable: Below Median, Above Median, NA”
| Estimate | Std. Error | z value | Pr(>|z|) | |
|---|---|---|---|---|
| treatmenthalf | 0.3101322 | 0.0897028 | 3.457330 | 0.0005456 |
| treatmentallAi | 0.8868747 | 0.1004017 | 8.833262 | 0.0000000 |
| libcons_numeric_medianAbove Median | -0.3083354 | 0.1010519 | -3.051257 | 0.0022789 |
| treatmenthalf:libcons_numeric_medianAbove Median | 0.3894625 | 0.1465184 | 2.658114 | 0.0078579 |
| treatmentallAi:libcons_numeric_medianAbove Median | 0.4522346 | 0.1611412 | 2.806449 | 0.0050091 |
[1] “Values for this variable: Some College, Graduate Degree, Bachelor’s Degree, High School or Less”
| Estimate | Std. Error | z value | Pr(>|z|) | |
|---|---|---|---|---|
| treatmenthalf | 0.7128830 | 0.2128516 | 3.3492025 | 0.0008104 |
| treatmentallAi | 1.0376760 | 0.2550112 | 4.0691389 | 0.0000472 |
| edu_combinedSome College | 0.1297374 | 0.2091577 | 0.6202851 | 0.5350701 |
| edu_combinedBachelor’s Degree | 0.0781043 | 0.1714407 | 0.4555762 | 0.6486948 |
| edu_combinedGraduate Degree | 0.0945997 | 0.1777041 | 0.5323437 | 0.5944880 |
| treatmenthalf:edu_combinedSome College | -0.3256780 | 0.3023406 | -1.0771891 | 0.2813958 |
| treatmentallAi:edu_combinedSome College | -0.0470976 | 0.3411125 | -0.1380706 | 0.8901846 |
| treatmenthalf:edu_combinedBachelor’s Degree | -0.2991327 | 0.2377563 | -1.2581486 | 0.2083380 |
| treatmentallAi:edu_combinedBachelor’s Degree | 0.0666522 | 0.2793904 | 0.2385629 | 0.8114445 |
| treatmenthalf:edu_combinedGraduate Degree | -0.2439795 | 0.2454246 | -0.9941117 | 0.3201685 |
| treatmentallAi:edu_combinedGraduate Degree | -0.0265592 | 0.2911638 | -0.0912173 | 0.9273199 |
[1] “Values for this variable: Black, White, Asian, Hispanic, Other”
| Estimate | Std. Error | z value | Pr(>|z|) | |
|---|---|---|---|---|
| treatmenthalf | 0.4747966 | 0.0872051 | 5.4445949 | 0.0000001 |
| treatmentallAi | 1.0745900 | 0.0964819 | 11.1377350 | 0.0000000 |
| race_combinedBlack | 0.0508756 | 0.1168926 | 0.4352338 | 0.6633927 |
| race_combinedHispanic | 0.4005985 | 0.2906019 | 1.3785131 | 0.1680449 |
| race_combinedAsian | -0.0932219 | 0.2694355 | -0.3459897 | 0.7293504 |
| race_combinedOther | -0.1285229 | 0.2236537 | -0.5746511 | 0.5655273 |
| treatmenthalf:race_combinedBlack | -0.0989006 | 0.1754475 | -0.5637049 | 0.5729550 |
| treatmentallAi:race_combinedBlack | -0.0466665 | 0.1887560 | -0.2472320 | 0.8047287 |
| treatmenthalf:race_combinedHispanic | -0.5151286 | 0.3862377 | -1.3337088 | 0.1822993 |
| treatmentallAi:race_combinedHispanic | -0.6181278 | 0.5442721 | -1.1356963 | 0.2560837 |
| treatmenthalf:race_combinedAsian | -0.0277326 | 0.3850182 | -0.0720293 | 0.9425786 |
| treatmentallAi:race_combinedAsian | 0.2632241 | 0.4204166 | 0.6261029 | 0.5312474 |
| treatmenthalf:race_combinedOther | 0.3672037 | 0.3287793 | 1.1168701 | 0.2640499 |
| treatmentallAi:race_combinedOther | 0.0058082 | 0.3500947 | 0.0165904 | 0.9867634 |
[1] “Values for this variable: Other, Republican, Democrat”
| Estimate | Std. Error | z value | Pr(>|z|) | |
|---|---|---|---|---|
| treatmenthalf | 0.5919063 | 0.1227454 | 4.8222264 | 0.0000014 |
| treatmentallAi | 1.1652151 | 0.1326936 | 8.7812440 | 0.0000000 |
| polparty_combinedRepublican | 0.2139708 | 0.1103565 | 1.9389047 | 0.0525129 |
| polparty_combinedOther | -0.0369587 | 0.1352409 | -0.2732804 | 0.7846377 |
| treatmenthalf:polparty_combinedRepublican | -0.3206172 | 0.1628641 | -1.9686183 | 0.0489969 |
| treatmentallAi:polparty_combinedRepublican | -0.2762617 | 0.1777736 | -1.5540082 | 0.1201825 |
| treatmenthalf:polparty_combinedOther | 0.0327091 | 0.1929206 | 0.1695469 | 0.8653665 |
| treatmentallAi:polparty_combinedOther | 0.0652730 | 0.2165151 | 0.3014709 | 0.7630555 |
[1] “Values for this variable: Below Median, Above Median”
| Estimate | Std. Error | z value | Pr(>|z|) | |
|---|---|---|---|---|
| treatmenthalf | 0.5879337 | 0.0926056 | 6.348791 | 0.0000000 |
| treatmentallAi | 1.1044790 | 0.1005708 | 10.982099 | 0.0000000 |
| review_freq_numeric_medianAbove Median | 0.1449329 | 0.0977059 | 1.483360 | 0.1379789 |
| treatmenthalf:review_freq_numeric_medianAbove Median | -0.3051722 | 0.1432150 | -2.130868 | 0.0331000 |
| treatmentallAi:review_freq_numeric_medianAbove Median | -0.1104911 | 0.1599153 | -0.690935 | 0.4896064 |
[1] “Values for this variable: Below Median, Above Median”
| Estimate | Std. Error | z value | Pr(>|z|) | |
|---|---|---|---|---|
| treatmenthalf | 0.3984441 | 0.0909073 | 4.382969 | 0.0000117 |
| treatmentallAi | 0.9895600 | 0.0997271 | 9.922677 | 0.0000000 |
| read_review_freq_numeric_medianAbove Median | -0.1201067 | 0.0995197 | -1.206864 | 0.2274846 |
| treatmenthalf:read_review_freq_numeric_medianAbove Median | 0.1541406 | 0.1452886 | 1.060927 | 0.2887232 |
| treatmentallAi:read_review_freq_numeric_medianAbove Median | 0.1752897 | 0.1616532 | 1.084357 | 0.2782066 |
[1] “Values for this variable: < 5, Very Important (= 5)”
| Estimate | Std. Error | z value | Pr(>|z|) | |
|---|---|---|---|---|
| treatmenthalf | 0.4964622 | 0.0964216 | 5.1488683 | 0.0000003 |
| treatmentallAi | 0.9420077 | 0.1068953 | 8.8124332 | 0.0000000 |
| review_essential_numeric_5Very Important (= 5) | -0.0587219 | 0.0966374 | -0.6076516 | 0.5434186 |
| treatmenthalf:review_essential_numeric_5Very Important (= 5) | -0.0793063 | 0.1419812 | -0.5585687 | 0.5764561 |
| treatmentallAi:review_essential_numeric_5Very Important (= 5) | 0.2222678 | 0.1569010 | 1.4166115 | 0.1565965 |
[1] “Values for this variable: Below Median, Above Median”
| Estimate | Std. Error | z value | Pr(>|z|) | |
|---|---|---|---|---|
| treatmenthalf | 0.5441398 | 0.0787601 | 6.908824 | 0.0000000 |
| treatmentallAi | 1.1122229 | 0.0864707 | 12.862419 | 0.0000000 |
| AIreview_use_numeric_medianAbove Median | 0.2296949 | 0.1211540 | 1.895892 | 0.0579744 |
| treatmenthalf:AIreview_use_numeric_medianAbove Median | -0.4004725 | 0.1784720 | -2.243895 | 0.0248391 |
| treatmentallAi:AIreview_use_numeric_medianAbove Median | -0.2517120 | 0.2003107 | -1.256608 | 0.2088956 |
[1] “Values for this variable: Below Median, Above Median”
| Estimate | Std. Error | z value | Pr(>|z|) | |
|---|---|---|---|---|
| treatmenthalf | 0.3539948 | 0.0779350 | 4.542180 | 0.0000056 |
| treatmentallAi | 0.8983430 | 0.0852270 | 10.540592 | 0.0000000 |
| mech_breadth_medianAbove Median | -0.5188299 | 0.1381022 | -3.756855 | 0.0001721 |
| treatmenthalf:mech_breadth_medianAbove Median | 0.5567548 | 0.1883784 | 2.955513 | 0.0031215 |
| treatmentallAi:mech_breadth_medianAbove Median | 0.8222358 | 0.2133020 | 3.854797 | 0.0001158 |
[1] “Values for this variable: Below Median, Above Median”
| Estimate | Std. Error | z value | Pr(>|z|) | |
|---|---|---|---|---|
| treatmenthalf | 0.3344945 | 0.0948121 | 3.527971 | 0.0004188 |
| treatmentallAi | 0.7634959 | 0.0997655 | 7.652904 | 0.0000000 |
| mech_length_medianAbove Median | -0.3580863 | 0.0992673 | -3.607293 | 0.0003094 |
| treatmenthalf:mech_length_medianAbove Median | 0.2905379 | 0.1435918 | 2.023360 | 0.0430360 |
| treatmentallAi:mech_length_medianAbove Median | 0.6671729 | 0.1599667 | 4.170698 | 0.0000304 |
[1] “Values for this variable: Below Median, Above Median”
| Estimate | Std. Error | z value | Pr(>|z|) | |
|---|---|---|---|---|
| treatmenthalf | 0.4096091 | 0.0898912 | 4.5567201 | 0.0000052 |
| treatmentallAi | 0.9353735 | 0.0987703 | 9.4701890 | 0.0000000 |
| mech_detail_medianAbove Median | -0.1671498 | 0.1007009 | -1.6598647 | 0.0969417 |
| treatmenthalf:mech_detail_medianAbove Median | 0.1269494 | 0.1463721 | 0.8673057 | 0.3857745 |
| treatmentallAi:mech_detail_medianAbove Median | 0.3175879 | 0.1625854 | 1.9533607 | 0.0507769 |
[1] “Values for this variable: Below Median, Above Median”
| Estimate | Std. Error | z value | Pr(>|z|) | |
|---|---|---|---|---|
| treatmenthalf | 0.5266179 | 0.0901068 | 5.8443770 | 0.0000000 |
| treatmentallAi | 0.9936711 | 0.0988085 | 10.0565374 | 0.0000000 |
| mech_tone_medianAbove Median | 0.0006784 | 0.0996431 | 0.0068087 | 0.9945675 |
| treatmenthalf:mech_tone_medianAbove Median | -0.2025102 | 0.1457845 | -1.3891069 | 0.1648002 |
| treatmentallAi:mech_tone_medianAbove Median | 0.1741006 | 0.1630418 | 1.0678281 | 0.2855981 |
[1] “Values for this variable: Below Median, Above Median”
| Estimate | Std. Error | z value | Pr(>|z|) | |
|---|---|---|---|---|
| treatmenthalf | 0.5523281 | 0.0901576 | 6.126251 | 0.0000000 |
| treatmentallAi | 1.1213640 | 0.0972037 | 11.536229 | 0.0000000 |
| mech_valence_medianAbove Median | 0.1546823 | 0.0996084 | 1.552904 | 0.1204461 |
| treatmenthalf:mech_valence_medianAbove Median | -0.2590101 | 0.1461239 | -1.772537 | 0.0763054 |
| treatmentallAi:mech_valence_medianAbove Median | -0.1753980 | 0.1642716 | -1.067731 | 0.2856417 |
These demographic/behavioral variables are recentered: each is originally on a 1-to-5 scale, and we subtract 3 so that the midpoint becomes 0, giving a scale of -2 to 2.
# Recentered moderators (each "<name>_recenter" column is the -2..2 version of):
#   libcons_numeric, review_essential_numeric, AIreview_use_numeric,
#   mech_breadth, mech_length, mech_detail, mech_tone, mech_valence
subgroups_columns_recentered <- c(
  "libcons_numeric_recenter",
  "review_essential_numeric_recenter",
  "AIreview_use_numeric_recenter",
  "mech_breadth_recenter",
  "mech_length_recenter",
  "mech_detail_recenter",
  "mech_tone_recenter",
  "mech_valence_recenter"
)

# Fit one logit model per moderator: chosen ~ treatment * moderator, with
# choice_set, videoTitle and video_in_set fixed effects and standard errors
# clustered on ResponseId.
for (subgroup in subgroups_columns_recentered) {
  print(subgroup)
  subgroup_form <- as.formula(
    paste0("chosen ~ treatment * ", subgroup, " | choice_set + videoTitle + video_in_set")
  )
  subgroup_lm <- feglm(
    subgroup_form,
    family = binomial("logit"),
    data = df_long,
    cluster = "ResponseId"
  )
  # Stored as subgroup_lm_<moderator>; the reporting loop retrieves the fit
  # by this name with get().
  assign(paste0("subgroup_lm_", subgroup), subgroup_lm)
}
## [1] "libcons_numeric_recenter"
## [1] "review_essential_numeric_recenter"
## [1] "AIreview_use_numeric_recenter"
## [1] "mech_breadth_recenter"
## [1] "mech_length_recenter"
## [1] "mech_detail_recenter"
## [1] "mech_tone_recenter"
## [1] "mech_valence_recenter"
# For each recentered moderator, print a markdown heading, the values the
# moderator takes in df_long, and the model coefficients excluding the
# intercept and any term whose name contains "video".
for (subgroup in subgroups_columns_recentered) {
  subgroup_lm <- get(paste0("subgroup_lm_", subgroup))
  coef_table <- summary(subgroup_lm)$coeftable

  cat("#### ", subgroup, "\n")
  observed_values <- sort(unique(df_long[[subgroup]]))
  print(paste0("Values for this variable: ", paste(observed_values, collapse = ", ")))

  term_names <- rownames(coef_table)
  is_dropped <- str_detect(term_names, "video") | term_names == "(Intercept)"
  rows_to_extract <- term_names[!is_dropped]
  subgroup_coef <- coef_table[rows_to_extract, ]

  print(kable(subgroup_coef, format = "markdown"))
  cat("\n")
}
[1] “Values for this variable: -2, -1, 0, 1, 2”
| Estimate | Std. Error | z value | Pr(>|z|) | |
|---|---|---|---|---|
| treatmenthalf | 0.4725172 | 0.0711762 | 6.638693 | 0.0000000 |
| treatmentallAi | 1.0740710 | 0.0787182 | 13.644509 | 0.0000000 |
| libcons_numeric_recenter | -0.1354861 | 0.0383634 | -3.531646 | 0.0004130 |
| treatmenthalf:libcons_numeric_recenter | 0.1884398 | 0.0556516 | 3.386065 | 0.0007090 |
| treatmentallAi:libcons_numeric_recenter | 0.1851000 | 0.0620990 | 2.980724 | 0.0028757 |
[1] “Values for this variable: -2, -1, 0, 1, 2”
| Estimate | Std. Error | z value | Pr(>|z|) | |
|---|---|---|---|---|
| treatmenthalf | 0.4821556 | 0.1432312 | 3.3662746 | 0.0007619 |
| treatmentallAi | 0.8432789 | 0.1537001 | 5.4865209 | 0.0000000 |
| review_essential_numeric_recenter | -0.0516215 | 0.0615178 | -0.8391311 | 0.4013958 |
| treatmenthalf:review_essential_numeric_recenter | -0.0183014 | 0.0932474 | -0.1962672 | 0.8444010 |
| treatmentallAi:review_essential_numeric_recenter | 0.1516935 | 0.1002854 | 1.5126176 | 0.1303768 |
[1] “Values for this variable: -2, -1, 0, 1, 2”
| Estimate | Std. Error | z value | Pr(>|z|) | |
|---|---|---|---|---|
| treatmenthalf | 0.4070101 | 0.0767385 | 5.3038603 | 0.0000001 |
| treatmentallAi | 1.0299342 | 0.0856392 | 12.0264368 | 0.0000000 |
| AIreview_use_numeric_recenter | 0.0722234 | 0.0486713 | 1.4839016 | 0.1378350 |
| treatmenthalf:AIreview_use_numeric_recenter | -0.1323680 | 0.0694953 | -1.9047039 | 0.0568186 |
| treatmentallAi:AIreview_use_numeric_recenter | -0.0717417 | 0.0786214 | -0.9124965 | 0.3615074 |
[1] “Values for this variable: -2, -1, 0, 1, 2”
| Estimate | Std. Error | z value | Pr(>|z|) | |
|---|---|---|---|---|
| treatmenthalf | 0.4060543 | 0.0745182 | 5.449063 | 0.0000001 |
| treatmentallAi | 0.9677337 | 0.0804968 | 12.022013 | 0.0000000 |
| mech_breadth_recenter | -0.1364632 | 0.0382069 | -3.571686 | 0.0003547 |
| treatmenthalf:mech_breadth_recenter | 0.1416777 | 0.0577603 | 2.452854 | 0.0141728 |
| treatmentallAi:mech_breadth_recenter | 0.2312048 | 0.0624813 | 3.700384 | 0.0002153 |
[1] “Values for this variable: -2, -1, 0, 1, 2”
| Estimate | Std. Error | z value | Pr(>|z|) | |
|---|---|---|---|---|
| treatmenthalf | 0.4270154 | 0.0730822 | 5.842945 | 0.0000000 |
| treatmentallAi | 0.9743349 | 0.0782162 | 12.456952 | 0.0000000 |
| mech_length_recenter | -0.1684713 | 0.0428342 | -3.933101 | 0.0000839 |
| treatmenthalf:mech_length_recenter | 0.1386359 | 0.0635666 | 2.180956 | 0.0291867 |
| treatmentallAi:mech_length_recenter | 0.3167628 | 0.0701937 | 4.512696 | 0.0000064 |
[1] “Values for this variable: -2, -1, 0, 1, 2”
| Estimate | Std. Error | z value | Pr(>|z|) | |
|---|---|---|---|---|
| treatmenthalf | 0.4010078 | 0.0952771 | 4.2088601 | 0.0000257 |
| treatmentallAi | 0.9052173 | 0.1026175 | 8.8212769 | 0.0000000 |
| mech_detail_recenter | -0.0829868 | 0.0437096 | -1.8985937 | 0.0576179 |
| treatmenthalf:mech_detail_recenter | 0.0615489 | 0.0656003 | 0.9382406 | 0.3481208 |
| treatmentallAi:mech_detail_recenter | 0.1623200 | 0.0708168 | 2.2921115 | 0.0218992 |
[1] “Values for this variable: -2, -1, 0, 1, 2”
| Estimate | Std. Error | z value | Pr(>|z|) | |
|---|---|---|---|---|
| treatmenthalf | 0.4536040 | 0.0710567 | 6.3836879 | 0.0000000 |
| treatmentallAi | 1.0598293 | 0.0785393 | 13.4942565 | 0.0000000 |
| mech_tone_recenter | -0.0285579 | 0.0385432 | -0.7409313 | 0.4587351 |
| treatmenthalf:mech_tone_recenter | -0.0445169 | 0.0564565 | -0.7885168 | 0.4303945 |
| treatmentallAi:mech_tone_recenter | 0.1132020 | 0.0621725 | 1.8207738 | 0.0686413 |
[1] “Values for this variable: -2, -1, 0, 1, 2”
| Estimate | Std. Error | z value | Pr(>|z|) | |
|---|---|---|---|---|
| treatmenthalf | 0.4831300 | 0.0770257 | 6.2723244 | 0.0000000 |
| treatmentallAi | 1.0496732 | 0.0813997 | 12.8952979 | 0.0000000 |
| mech_valence_recenter | 0.0196624 | 0.0490485 | 0.4008776 | 0.6885102 |
| treatmenthalf:mech_valence_recenter | -0.0768310 | 0.0725293 | -1.0593102 | 0.2894585 |
| treatmentallAi:mech_valence_recenter | 0.0169408 | 0.0775877 | 0.2183444 | 0.8271608 |
Not sure this is done correctly, given that the main analysis is a panel regression while the mechanism questions are measured at the participant level. The subgroup analysis (above) probably makes more sense.
# Display label -> df_final column for each mechanism question. The label
# vector and the column vector below are both derived from this mapping so
# the three stay in the same order.
mech_mapping <- c(
  "Length" = "mech_length",
  "Topic Diversity" = "mech_breadth",
  "Level of Detail" = "mech_detail",
  "Language Tone" = "mech_tone",
  "Valence" = "mech_valence"
)
mech_fancy_names <- names(mech_mapping)
mediator_columns <- unname(mech_mapping)

# Pairwise correlations between the mechanism ratings, using every pair of
# observations that is complete for the two columns involved.
mediator_corr <- cor(df_final[, mediator_columns], use = "pairwise.complete.obs")

# Label rows and columns with the display names.
dimnames(mediator_corr) <- list(mech_fancy_names, mech_fancy_names)

# Draw the correlation matrix with the coefficients printed in each cell.
corrplot(mediator_corr, method = "number")
Step 1: Mediator Treatment Effect
# Step 1 of the mediation analysis: regress each mechanism rating on
# treatment (choice_set, videoTitle and video_in_set fixed effects, standard
# errors clustered on ResponseId), collect the treatment coefficients, and
# plot them with 95% confidence intervals.

# Multiplier applied to estimates and interval bounds before plotting.
# NOTE(review): the raw estimates are rescaled by 1e9 for display; confirm
# this scale is intended. The y-axis label states the multiplier so the plot
# is not read as raw coefficients.
estimate_scale <- 1e9

# Collect one small data frame per mediator and bind once at the end. Binding
# a numeric coefficient table to a character label with cbind() would coerce
# the estimates to character, so the columns are built explicitly instead.
mediator_coef_list <- vector("list", length(mediator_columns))
names(mediator_coef_list) <- mediator_columns
for (m in mediator_columns) {
  cov_form_lm <- paste0(m, " ~ treatment | choice_set + videoTitle + video_in_set")
  mediator_lm <- feglm(as.formula(cov_form_lm), family = "gaussian", data = df_long, cluster = "ResponseId")
  mediator_lm_coef <- summary(mediator_lm)$coeftable[, c("Estimate", "Std. Error")]
  mediator_coef_list[[m]] <- data.frame(
    Estimate = mediator_lm_coef[, "Estimate"],
    `Std. Error` = mediator_lm_coef[, "Std. Error"],
    Mediator = mech_fancy_names[which(mech_mapping == m)],
    # Coefficient name (e.g. "treatmenthalf"); mapped to a display label below.
    Treatment = rownames(mediator_lm_coef),
    check.names = FALSE,
    stringsAsFactors = FALSE
  )
}
mediator_coef_df <- do.call(rbind, mediator_coef_list)

mediator_coef_df$Mediator <- factor(
  mediator_coef_df$Mediator,
  levels = c("Length", "Topic Diversity", "Level of Detail", "Language Tone", "Valence")
)
# Coefficient names containing "all" are the all-AI arm; the rest are the
# mixed-reviews arm.
mediator_coef_df$Treatment <- ifelse(str_detect(mediator_coef_df$Treatment, "all"), "All AI", "Mixed Reviews")
mediator_coef_df$Treatment <- factor(mediator_coef_df$Treatment, levels = c("Mixed Reviews", "All AI"))

# Dodged bar plot of the (rescaled) treatment coefficients per mediator.
ggplot(mediator_coef_df, aes(x = Mediator, y = Estimate * estimate_scale, fill = Treatment)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_errorbar(
    aes(
      ymin = (Estimate - 1.96 * `Std. Error`) * estimate_scale,
      ymax = (Estimate + 1.96 * `Std. Error`) * estimate_scale
    ),
    width = 0.2,
    position = position_dodge(0.9)
  ) +
  labs(
    title = "Mediator Analysis",
    x = "Mediator",
    y = paste0("Estimate (x ", format(estimate_scale, scientific = TRUE), ")")
  ) +
  scale_fill_manual(
    name = "vs All Human",
    values = c("Mixed Reviews" = "turquoise2", "All AI" = "pink2")
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    # bold all title text (plot, axis and legend titles)
    title = element_text(face = "bold")
  )
# Outcome model with the Language Tone rating added alongside treatment:
# binomial feglm with choice_set, videoTitle and video_in_set fixed effects
# and standard errors clustered on ResponseId.
cov_form_lm_tone <- "chosen ~ treatment + mech_tone | choice_set + videoTitle + video_in_set"
mediator_lm_tone <- feglm(
  as.formula(cov_form_lm_tone),
  family = "binomial",
  data = df_long,
  cluster = "ResponseId"
)

# Text table of the treatment and tone coefficients.
texreg::screenreg(
  mediator_lm_tone,
  stars = c(0.05, 0.01, 0.001),
  caption = "Mediation Analysis",
  label = "tab:mediator_regression",
  digits = 4,
  custom.coef.map = list(
    "treatmenthalf" = "Treatment: Mixed Reviews",
    "treatmentallAi" = "Treatment: All AI Reviews",
    "mech_tone" = "Mechanism: Language Tone"
  ),
  custom.note = "Standard errors are clustered at the user level.",
  custom.model.names = c("Without Covariates"),
  custom.coef.names = c(
    "Treatment: Mixed Reviews",
    "Treatment: All AI Reviews",
    "Mechanism: Language Tone"
  )
)
##
## =============================================
## Without Covariates
## ---------------------------------------------
## Treatment: Mixed Reviews 0.4565 ***
## (0.0711)
## Treatment: All AI Reviews 1.0556 ***
## (0.0786)
## Mechanism: Language Tone 0.0001
## (0.0003)
## ---------------------------------------------
## Num. obs. 9480
## Num. groups: choice_set 4
## Num. groups: videoTitle 12
## Num. groups: video_in_set 3
## Deviance 11589.9160
## Log Likelihood -5794.9580
## Pseudo R^2 0.0365
## =============================================
## Standard errors are clustered at the user level.
social_media_use_numeric_median
[1] “Values for this variable: Below Median, Above Median”