Figure1: Engagement Across Toxicity Levels (All Posts)

Total number of engagements across platforms by toxicity level (binned by rounding to the nearest 0.25).

ALL_Platforms$dummy0 <- round(ALL_Platforms$toxicity/25,2)*25 ==0
ALL_Platforms$dummy0.25 <- round(ALL_Platforms$toxicity/25,2)*25 ==.25
ALL_Platforms$dummy0.5 <- round(ALL_Platforms$toxicity/25,2)*25 ==.5
ALL_Platforms$dummy0.75 <- round(ALL_Platforms$toxicity/25,2)*25 ==.75
ALL_Platforms$dummy1 <- round(ALL_Platforms$toxicity/25,2)*25 ==1

f1<-"dummy0.25+dummy0.5+dummy0.75+dummy1"
f2<-"dummy0+dummy0.5+dummy0.75+dummy1"
f3<-"dummy0+dummy0.25+dummy0.75+dummy1"
f4<-"dummy0+dummy0.25+dummy0.5+dummy1"
f5<-"dummy0+dummy0.25+dummy0.5+dummy0.75"
fs<-c(f1,f2,f3,f4,f5)
pcs<- c(0,.25,.5,.75,1)


main_results <- data.frame(
  platform = character(),
  measure = character(),
  var = character(),
  value = numeric(),
  stringsAsFactors = FALSE)

platform <- "All"  # Use all platform combined

for (i in 1:5) {
  r <- feglm(as.formula(paste('log10(engagement + 1) ~', fs[i], '| username')),cluster = "username",data = ALL_Platforms)
  summary(r)
  
  main_results <- rbind(main_results, data.frame(
    platform = platform,
    measure = paste('dummy_pc_', pcs[i], sep = ''),
    var = 'mean',
    value = mean(r$sumFE)))
  
  ci <- t.test(r$sumFE)$conf.int
  
  main_results <- rbind(main_results, data.frame(
    platform = platform,
    measure = paste('dummy_pc_', pcs[i], sep = ''),
    var = 'ci.lower',
    value = ci[1]))
  
  main_results <- rbind(main_results, data.frame(
    platform = platform,
    measure = paste('dummy_pc_', pcs[i], sep = ''),
    var = 'ci.upper',
    value = ci[2]))
}
## NOTE: 13,702 observations removed because of NA values (LHS: 13,702).
## NOTE: 13,702 observations removed because of NA values (LHS: 13,702).
## NOTE: 13,702 observations removed because of NA values (LHS: 13,702).
## NOTE: 13,702 observations removed because of NA values (LHS: 13,702).
## NOTE: 13,702 observations removed because of NA values (LHS: 13,702).
plot_data <- main_results %>%
  filter(platform == "All") %>%
  pivot_wider(names_from = var, values_from = value) %>%
  mutate(
    toxicity = as.numeric(str_extract(measure, "[0-9.]+"))) %>%
  arrange(toxicity)

# Plot
ggplot(plot_data, aes(x = toxicity, y = mean)) +
  geom_line(color = "black", size = 1) +
  geom_ribbon(aes(ymin = ci.lower, ymax = ci.upper), fill = "gray70", alpha = 0.5) +
  labs(
    x = "Toxicity (percentile)",
    y = "Engagement (log10-transformed)",
    title = "Engagement Across Toxicity Levels") +theme_minimal(base_size = 14)

Figure2: Engagement Across Toxicity Levels (Top 10% Engagement Posts)

Proportion of posts that are among the 10% highest engagement posts on each platform by toxicity level (binned by rounding to the nearest 0.25).

# Top 10% of posts by engagement
threshold <- quantile(ALL_Platforms$engagement, 0.9, na.rm = TRUE)

ALL_Platforms_top10 <- ALL_Platforms %>% filter(engagement >= threshold)


ALL_Platforms_top10$dummy0 <- round(ALL_Platforms_top10$toxicity/25,2)*25 ==0
ALL_Platforms_top10$dummy0.25 <- round(ALL_Platforms_top10$toxicity/25,2)*25 ==.25
ALL_Platforms_top10$dummy0.5 <- round(ALL_Platforms_top10$toxicity/25,2)*25 ==.5
ALL_Platforms_top10$dummy0.75 <- round(ALL_Platforms_top10$toxicity/25,2)*25 ==.75
ALL_Platforms_top10$dummy1 <- round(ALL_Platforms_top10$toxicity/25,2)*25 ==1

f1<-"dummy0.25+dummy0.5+dummy0.75+dummy1"
f2<-"dummy0+dummy0.5+dummy0.75+dummy1"
f3<-"dummy0+dummy0.25+dummy0.75+dummy1"
f4<-"dummy0+dummy0.25+dummy0.5+dummy1"
f5<-"dummy0+dummy0.25+dummy0.5+dummy0.75"
fs<-c(f1,f2,f3,f4,f5)
pcs<- c(0,.25,.5,.75,1)

main_results <- data.frame(
  platform = character(),
  measure = character(),
  var = character(),
  value = numeric(),
  stringsAsFactors = FALSE)

platform <- "All"  

for (i in 1:5) {
  r <- feglm(as.formula(paste('log10(engagement + 1) ~', fs[i], '| username')),cluster = "username",data = ALL_Platforms_top10)
  
  summary(r)
  
  main_results <- rbind(main_results, data.frame(
    platform = platform,
    measure = paste('dummy_pc_', pcs[i], sep = ''),
    var = 'mean',
    value = mean(r$sumFE)))
  
  ci <- t.test(r$sumFE)$conf.int
  
  main_results <- rbind(main_results, data.frame(
    platform = platform,
    measure = paste('dummy_pc_', pcs[i], sep = ''),
    var = 'ci.lower',
    value = ci[1]))
  
  main_results <- rbind(main_results, data.frame(
    platform = platform,
    measure = paste('dummy_pc_', pcs[i], sep = ''),
    var = 'ci.upper',
    value = ci[2]))
}


plot_data <- main_results %>%
  filter(platform == "All") %>%
  pivot_wider(names_from = var, values_from = value) %>%
  mutate(
    toxicity = as.numeric(str_extract(measure, "[0-9.]+"))) %>%arrange(toxicity)

# Plot
ggplot(plot_data, aes(x = toxicity, y = mean)) +
  geom_line(color = "black", size = 1) +
  geom_ribbon(aes(ymin = ci.lower, ymax = ci.upper), fill = "gray70", alpha = 0.5) +
  labs(
    x = "Toxicity (percentile)",
    y = "Engagement (log10-transformed)",
    title = "Engagement Across Toxicity Levels (Top 10% Engagement Posts)") +theme_minimal(base_size = 14)

Figure3: Engagement Across Confidence Levels (All Posts)

Total number of engagements across platforms by confidence level.

data_posts_<-ALL_Platforms[!is.na(ALL_Platforms$certainty_avg),]

data_posts_$certainty_R <-  round(data_posts_$certainty_avg/200, 2)*200
data_posts_$certainty_R <- factor(data_posts_$certainty_R)
dummies <- model.matrix(~ certainty_R - 1, data = data_posts_) 

df_combined <- cbind(data_posts_[,c('engagement','username','platform')], dummies)

dummy_names <- colnames(dummies)

formulas <- sapply(seq_along(dummy_names), function(i) {
  paste(dummy_names[-i], collapse = "+")})

names(formulas) <- paste0("f", seq_along(formulas))

main_results<-data.frame()
platform='All'


for (i in 1:length(dummy_names)) {
  print(i)
  
  if (platform == 'All') {
    d <- df_combined
  } else {
    d <- df_combined[df_combined$platform == platform, ]
  }
  
  r <- feglm(
    as.formula(paste('(log10(engagement+1)) ~', formulas[i], '| username')),
    cluster = "username",
    data = d
  )
  
  # Extract the certainty level being left out
  left_out_bin <- gsub("certainty_R", "", dummy_names[i])  # e.g., "certainty_R400" -> "400"
  left_out_bin <- as.numeric(left_out_bin)
  
  # Save the model summary stats
  main_results <- rbind(
    main_results,
    data.frame(platform = platform, certainty_avg = left_out_bin, var = 'mean', value = mean(r$sumFE)),
    data.frame(platform = platform, certainty_avg = left_out_bin, var = 'ci.lower', value = t.test(r$sumFE)$conf.int[1]),
    data.frame(platform = platform, certainty_avg = left_out_bin, var = 'ci.upper', value = t.test(r$sumFE)$conf.int[2])
  )
}
## [1] 1
## NOTE: 4,909 observations removed because of NA values (LHS: 4,909).
## [1] 2
## NOTE: 4,909 observations removed because of NA values (LHS: 4,909).
## [1] 3
## NOTE: 4,909 observations removed because of NA values (LHS: 4,909).
## [1] 4
## NOTE: 4,909 observations removed because of NA values (LHS: 4,909).
## [1] 5
## NOTE: 4,909 observations removed because of NA values (LHS: 4,909).
plot_data <- pivot_wider(main_results, names_from = var, values_from = value)


ggplot(plot_data, aes(x = certainty_avg, y = mean)) +
  geom_line(color = "black", size = 1) +
  geom_ribbon(aes(ymin = ci.lower, ymax = ci.upper), fill = "gray70", alpha = 0.5) +
  labs(
    x = "Certainty",
    y = "Engagement (log10-transformed)",
    title = "Engagement Across Certainty Levels (All Posts)"
  ) +
  theme_minimal(base_size = 14)

Figure4: Engagement Across Confidence Levels (Top 10% Engagement Posts)

Proportion of posts that are among the 10% highest engagement posts on each platform by confidence level.

data_posts_<-ALL_Platforms_top10[!is.na(ALL_Platforms_top10$certainty_avg),]

data_posts_$certainty_R <-  round(data_posts_$certainty_avg/200, 2)*200
data_posts_$certainty_R <- factor(data_posts_$certainty_R)
dummies <- model.matrix(~ certainty_R - 1, data = data_posts_) 

df_combined <- cbind(data_posts_[,c('engagement','username','platform')], dummies)

dummy_names <- colnames(dummies)

formulas <- sapply(seq_along(dummy_names), function(i) {
  paste(dummy_names[-i], collapse = "+")})

names(formulas) <- paste0("f", seq_along(formulas))

main_results<-data.frame()
platform='All'


for (i in 1:length(dummy_names)) {
  print(i)
  
  if (platform == 'All') {
    d <- df_combined
  } else {
    d <- df_combined[df_combined$platform == platform, ]
  }
  
  r <- feglm(
    as.formula(paste('(log10(engagement+1)) ~', formulas[i], '| username')),
    cluster = "username",
    data = d
  )
  
  # Extract the certainty level being left out
  left_out_bin <- gsub("certainty_R", "", dummy_names[i])  # e.g., "certainty_R400" -> "400"
  left_out_bin <- as.numeric(left_out_bin)
  
  # Save the model summary stats
  main_results <- rbind(
    main_results,
    data.frame(platform = platform, certainty_avg = left_out_bin, var = 'mean', value = mean(r$sumFE)),
    data.frame(platform = platform, certainty_avg = left_out_bin, var = 'ci.lower', value = t.test(r$sumFE)$conf.int[1]),
    data.frame(platform = platform, certainty_avg = left_out_bin, var = 'ci.upper', value = t.test(r$sumFE)$conf.int[2])
  )
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
plot_data <- pivot_wider(main_results, names_from = var, values_from = value)


ggplot(plot_data, aes(x = certainty_avg, y = mean)) +
  geom_line(color = "black", size = 1) +
  geom_ribbon(aes(ymin = ci.lower, ymax = ci.upper), fill = "gray70", alpha = 0.5) +
  labs(
    x = "Certainty",
    y = "Engagement (log10-transformed)",
    title = "Engagement Across Confidence Levels (Top 10% Engagement Posts)"
  ) +
  theme_minimal(base_size = 14)