Total number of engagements across platforms by toxicity level (binned by rounding to the nearest 0.25).
ALL_Platforms$dummy0 <- round(ALL_Platforms$toxicity/25,2)*25 ==0
ALL_Platforms$dummy0.25 <- round(ALL_Platforms$toxicity/25,2)*25 ==.25
ALL_Platforms$dummy0.5 <- round(ALL_Platforms$toxicity/25,2)*25 ==.5
ALL_Platforms$dummy0.75 <- round(ALL_Platforms$toxicity/25,2)*25 ==.75
ALL_Platforms$dummy1 <- round(ALL_Platforms$toxicity/25,2)*25 ==1
f1<-"dummy0.25+dummy0.5+dummy0.75+dummy1"
f2<-"dummy0+dummy0.5+dummy0.75+dummy1"
f3<-"dummy0+dummy0.25+dummy0.75+dummy1"
f4<-"dummy0+dummy0.25+dummy0.5+dummy1"
f5<-"dummy0+dummy0.25+dummy0.5+dummy0.75"
fs<-c(f1,f2,f3,f4,f5)
pcs<- c(0,.25,.5,.75,1)
main_results <- data.frame(
platform = character(),
measure = character(),
var = character(),
value = numeric(),
stringsAsFactors = FALSE)
platform <- "All" # Use all platform combined
for (i in 1:5) {
r <- feglm(as.formula(paste('log10(engagement + 1) ~', fs[i], '| username')),cluster = "username",data = ALL_Platforms)
summary(r)
main_results <- rbind(main_results, data.frame(
platform = platform,
measure = paste('dummy_pc_', pcs[i], sep = ''),
var = 'mean',
value = mean(r$sumFE)))
ci <- t.test(r$sumFE)$conf.int
main_results <- rbind(main_results, data.frame(
platform = platform,
measure = paste('dummy_pc_', pcs[i], sep = ''),
var = 'ci.lower',
value = ci[1]))
main_results <- rbind(main_results, data.frame(
platform = platform,
measure = paste('dummy_pc_', pcs[i], sep = ''),
var = 'ci.upper',
value = ci[2]))
}
## NOTE: 13,702 observations removed because of NA values (LHS: 13,702).
## NOTE: 13,702 observations removed because of NA values (LHS: 13,702).
## NOTE: 13,702 observations removed because of NA values (LHS: 13,702).
## NOTE: 13,702 observations removed because of NA values (LHS: 13,702).
## NOTE: 13,702 observations removed because of NA values (LHS: 13,702).
plot_data <- main_results %>%
filter(platform == "All") %>%
pivot_wider(names_from = var, values_from = value) %>%
mutate(
toxicity = as.numeric(str_extract(measure, "[0-9.]+"))) %>%
arrange(toxicity)
# Plot
ggplot(plot_data, aes(x = toxicity, y = mean)) +
geom_line(color = "black", size = 1) +
geom_ribbon(aes(ymin = ci.lower, ymax = ci.upper), fill = "gray70", alpha = 0.5) +
labs(
x = "Toxicity (percentile)",
y = "Engagement (log10-transformed)",
title = "Engagement Across Toxicity Levels") +theme_minimal(base_size = 14)
Proportion of posts that are among the 10% highest engagement posts on each platform by toxicity level (binned by rounding to the nearest 0.25).
# Top 10% of posts by engagement
threshold <- quantile(ALL_Platforms$engagement, 0.9, na.rm = TRUE)
ALL_Platforms_top10 <- ALL_Platforms %>% filter(engagement >= threshold)
ALL_Platforms_top10$dummy0 <- round(ALL_Platforms_top10$toxicity/25,2)*25 ==0
ALL_Platforms_top10$dummy0.25 <- round(ALL_Platforms_top10$toxicity/25,2)*25 ==.25
ALL_Platforms_top10$dummy0.5 <- round(ALL_Platforms_top10$toxicity/25,2)*25 ==.5
ALL_Platforms_top10$dummy0.75 <- round(ALL_Platforms_top10$toxicity/25,2)*25 ==.75
ALL_Platforms_top10$dummy1 <- round(ALL_Platforms_top10$toxicity/25,2)*25 ==1
f1<-"dummy0.25+dummy0.5+dummy0.75+dummy1"
f2<-"dummy0+dummy0.5+dummy0.75+dummy1"
f3<-"dummy0+dummy0.25+dummy0.75+dummy1"
f4<-"dummy0+dummy0.25+dummy0.5+dummy1"
f5<-"dummy0+dummy0.25+dummy0.5+dummy0.75"
fs<-c(f1,f2,f3,f4,f5)
pcs<- c(0,.25,.5,.75,1)
main_results <- data.frame(
platform = character(),
measure = character(),
var = character(),
value = numeric(),
stringsAsFactors = FALSE)
platform <- "All"
for (i in 1:5) {
r <- feglm(as.formula(paste('log10(engagement + 1) ~', fs[i], '| username')),cluster = "username",data = ALL_Platforms_top10)
summary(r)
main_results <- rbind(main_results, data.frame(
platform = platform,
measure = paste('dummy_pc_', pcs[i], sep = ''),
var = 'mean',
value = mean(r$sumFE)))
ci <- t.test(r$sumFE)$conf.int
main_results <- rbind(main_results, data.frame(
platform = platform,
measure = paste('dummy_pc_', pcs[i], sep = ''),
var = 'ci.lower',
value = ci[1]))
main_results <- rbind(main_results, data.frame(
platform = platform,
measure = paste('dummy_pc_', pcs[i], sep = ''),
var = 'ci.upper',
value = ci[2]))
}
plot_data <- main_results %>%
filter(platform == "All") %>%
pivot_wider(names_from = var, values_from = value) %>%
mutate(
toxicity = as.numeric(str_extract(measure, "[0-9.]+"))) %>%arrange(toxicity)
# Plot
ggplot(plot_data, aes(x = toxicity, y = mean)) +
geom_line(color = "black", size = 1) +
geom_ribbon(aes(ymin = ci.lower, ymax = ci.upper), fill = "gray70", alpha = 0.5) +
labs(
x = "Toxicity (percentile)",
y = "Engagement (log10-transformed)",
title = "Engagement Across Toxicity Levels (Top 10% Engagement Posts)") +theme_minimal(base_size = 14)
Total number of engagements across platforms by confidence level.
data_posts_<-ALL_Platforms[!is.na(ALL_Platforms$certainty_avg),]
data_posts_$certainty_R <- round(data_posts_$certainty_avg/200, 2)*200
data_posts_$certainty_R <- factor(data_posts_$certainty_R)
dummies <- model.matrix(~ certainty_R - 1, data = data_posts_)
df_combined <- cbind(data_posts_[,c('engagement','username','platform')], dummies)
dummy_names <- colnames(dummies)
formulas <- sapply(seq_along(dummy_names), function(i) {
paste(dummy_names[-i], collapse = "+")})
names(formulas) <- paste0("f", seq_along(formulas))
main_results<-data.frame()
platform='All'
for (i in 1:length(dummy_names)) {
print(i)
if (platform == 'All') {
d <- df_combined
} else {
d <- df_combined[df_combined$platform == platform, ]
}
r <- feglm(
as.formula(paste('(log10(engagement+1)) ~', formulas[i], '| username')),
cluster = "username",
data = d
)
# Extract the certainty level being left out
left_out_bin <- gsub("certainty_R", "", dummy_names[i]) # e.g., "certainty_R400" -> "400"
left_out_bin <- as.numeric(left_out_bin)
# Save the model summary stats
main_results <- rbind(
main_results,
data.frame(platform = platform, certainty_avg = left_out_bin, var = 'mean', value = mean(r$sumFE)),
data.frame(platform = platform, certainty_avg = left_out_bin, var = 'ci.lower', value = t.test(r$sumFE)$conf.int[1]),
data.frame(platform = platform, certainty_avg = left_out_bin, var = 'ci.upper', value = t.test(r$sumFE)$conf.int[2])
)
}
## [1] 1
## NOTE: 4,909 observations removed because of NA values (LHS: 4,909).
## [1] 2
## NOTE: 4,909 observations removed because of NA values (LHS: 4,909).
## [1] 3
## NOTE: 4,909 observations removed because of NA values (LHS: 4,909).
## [1] 4
## NOTE: 4,909 observations removed because of NA values (LHS: 4,909).
## [1] 5
## NOTE: 4,909 observations removed because of NA values (LHS: 4,909).
plot_data <- pivot_wider(main_results, names_from = var, values_from = value)
ggplot(plot_data, aes(x = certainty_avg, y = mean)) +
geom_line(color = "black", size = 1) +
geom_ribbon(aes(ymin = ci.lower, ymax = ci.upper), fill = "gray70", alpha = 0.5) +
labs(
x = "Certainty",
y = "Engagement (log10-transformed)",
title = "Engagement Across Certainty Levels (All Posts)"
) +
theme_minimal(base_size = 14)
Proportion of posts that are among the 10% highest engagement posts on each platform by confidence level.
data_posts_<-ALL_Platforms_top10[!is.na(ALL_Platforms_top10$certainty_avg),]
data_posts_$certainty_R <- round(data_posts_$certainty_avg/200, 2)*200
data_posts_$certainty_R <- factor(data_posts_$certainty_R)
dummies <- model.matrix(~ certainty_R - 1, data = data_posts_)
df_combined <- cbind(data_posts_[,c('engagement','username','platform')], dummies)
dummy_names <- colnames(dummies)
formulas <- sapply(seq_along(dummy_names), function(i) {
paste(dummy_names[-i], collapse = "+")})
names(formulas) <- paste0("f", seq_along(formulas))
main_results<-data.frame()
platform='All'
for (i in 1:length(dummy_names)) {
print(i)
if (platform == 'All') {
d <- df_combined
} else {
d <- df_combined[df_combined$platform == platform, ]
}
r <- feglm(
as.formula(paste('(log10(engagement+1)) ~', formulas[i], '| username')),
cluster = "username",
data = d
)
# Extract the certainty level being left out
left_out_bin <- gsub("certainty_R", "", dummy_names[i]) # e.g., "certainty_R400" -> "400"
left_out_bin <- as.numeric(left_out_bin)
# Save the model summary stats
main_results <- rbind(
main_results,
data.frame(platform = platform, certainty_avg = left_out_bin, var = 'mean', value = mean(r$sumFE)),
data.frame(platform = platform, certainty_avg = left_out_bin, var = 'ci.lower', value = t.test(r$sumFE)$conf.int[1]),
data.frame(platform = platform, certainty_avg = left_out_bin, var = 'ci.upper', value = t.test(r$sumFE)$conf.int[2])
)
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
plot_data <- pivot_wider(main_results, names_from = var, values_from = value)
ggplot(plot_data, aes(x = certainty_avg, y = mean)) +
geom_line(color = "black", size = 1) +
geom_ribbon(aes(ymin = ci.lower, ymax = ci.upper), fill = "gray70", alpha = 0.5) +
labs(
x = "Certainty",
y = "Engagement (log10-transformed)",
title = "Engagement Across Confidence Levels (Top 10% Engagement Posts)"
) +
theme_minimal(base_size = 14)