NOTE:

- This paper studies certainty and engagement using controlled experiments in a branding context: https://journals.sagepub.com/doi/full/10.1016/j.intmar.2020.06.005

- This paper studies engagement and different language features: https://journals.sagepub.com/doi/full/10.1177/00222429231152880

1. Reading data

2. Preprocess data

2.1 filtering

2.2 PCA

3. Descriptives

3.1 histograms

# Histograms of the post-level variables (raw and log10(1 + x) versions).
# Every panel uses 20 bins and the minimal theme; rows with non-finite values
# are dropped by stat_bin(), which is what the warnings printed below report.
p1 <- data_posts |>
  ggplot(aes(x = lean1)) +
  geom_histogram(bins = 20, fill = "skyblue") +
  theme_minimal()
p2 <- data_posts |>
  ggplot(aes(x = certainty_avg)) +
  geom_histogram(bins = 20, fill = "salmon") +
  theme_minimal()
p3 <- data_posts |>
  ggplot(aes(x = toxicity)) +
  geom_histogram(bins = 20, fill = "lightgreen") +
  theme_minimal()
p4 <- data_posts |>
  ggplot(aes(x = pc1)) +
  geom_histogram(bins = 20, fill = "blue") +
  theme_minimal()
p5 <- data_posts |>
  ggplot(aes(x = log10(1+toxicity))) +
  geom_histogram(bins = 20, fill = "blue") +
  theme_minimal()
p6 <- data_posts |>
  ggplot(aes(x = log10(1+engage))) +
  geom_histogram(bins = 20, fill = "blue") +
  theme_minimal()
p7 <- data_posts |>
  ggplot(aes(x = engage)) +
  geom_histogram(bins = 20, fill = "blue") +
  theme_minimal()

# Lay the seven panels out on a grid with four rows.
grid.arrange(grobs = list(p1, p2, p3, p4, p5, p6, p7), nrow = 4)

## Warning: Removed 2692899 rows containing non-finite values (`stat_bin()`).

## Warning: Removed 17994760 rows containing non-finite values (`stat_bin()`).

## Warning: Removed 2684478 rows containing non-finite values (`stat_bin()`).

## Warning: Removed 582384 rows containing non-finite values (`stat_bin()`).
## Removed 582384 rows containing non-finite values (`stat_bin()`).

3.2 associations

# Platform-level means of political lean (lean1), certainty, quality (pc1)
# and toxicity; missing values are ignored when averaging.
d_ <- data_posts |>
  group_by(platform) |>
  summarise(
    lean = mean(lean1, na.rm = TRUE),
    certainty_avg = mean(certainty_avg, na.rm = TRUE),
    pc1 = mean(pc1, na.rm = TRUE),
    toxicity = mean(toxicity, na.rm = TRUE)
  )

# Layers shared by all platform-level scatter plots: one red point per
# platform, its name printed just above the point, and clipping disabled so
# labels near the panel border are not cut off.
platform_scatter_layers <- list(
  geom_point(color = "red"),
  geom_text(aes(label = platform), vjust = -0.5, color = "blue"),
  coord_cartesian(clip = "off"),
  theme_minimal()
)

p1 <- ggplot(d_, aes(x = pc1, y = toxicity)) + platform_scatter_layers
p2 <- ggplot(d_, aes(x = pc1, y = certainty_avg)) + platform_scatter_layers
p3 <- ggplot(d_, aes(x = lean, y = toxicity)) + platform_scatter_layers
p4 <- ggplot(d_, aes(x = lean, y = certainty_avg)) + platform_scatter_layers
p5 <- ggplot(d_, aes(x = toxicity, y = certainty_avg)) + platform_scatter_layers

# grid.arrange() has no `width`/`height`/`units` arguments: the values passed
# previously were swallowed by `...` and had no effect, so they are dropped.
# NOTE(review): set the figure size through the rendering device or chunk
# options (e.g. fig.width / fig.height) if a 15 x 8 inch figure is wanted.
grid.arrange(grobs = list(p1, p2, p3, p4, p5), nrow = 3)

4. Analysis - posts

4.1 toxicity vs. quality

# List the distinct platforms present in the post-level data.
data_posts$platform |>
  unique() |>
  print()

## [1] "Bluesky"     "Gettr"       "Truthsocial" "Gab"         "Mastodon"   
## [6] "LinkedIn"    "Telegram"    "X"

# All platforms: standardized certainty on standardized quality (pc1), with
# standard errors clustered by username.
# NOTE(review): this section is titled "toxicity vs. quality" but the outcome
# here is certainty_avg, and the same call is repeated in section 4.2 —
# confirm whether it belongs here.
feglm(
  scale(certainty_avg) ~ scale(pc1),
  data = data_posts,
  cluster = "username"
)

## NOTE: 18,524,268 observations removed because of NA values (LHS: 17,994,760, RHS: 2,684,478).

## GLM estimation, family = gaussian, Dep. Var.: scale(certainty_avg)
## Observations: 4,771,256 
## Standard-errors: Clustered (username) 
##              Estimate Std. Error    t value  Pr(>|t|)    
## (Intercept)  0.004749   0.005308   0.894709   0.37094    
## scale(pc1)  -0.050577   0.002505 -20.188669 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood: -6,741,374.8   Adj. Pseudo R2: 9.749e-4
##            BIC: 13,482,780.4     Squared Cor.: 0.002754

# Same certainty ~ quality model, adding political lean (lean1) as a control.
# NOTE(review): certainty outcome inside the toxicity section; the identical
# call is repeated in section 4.2.
feglm(
  scale(certainty_avg) ~ scale(pc1) + scale(lean1),
  data = data_posts,
  cluster = "username"
)

## NOTE: 18,526,578 observations removed because of NA values (LHS: 17,994,760, RHS: 2,692,899).

## GLM estimation, family = gaussian, Dep. Var.: scale(certainty_avg)
## Observations: 4,768,946 
## Standard-errors: Clustered (username) 
##               Estimate Std. Error   t value  Pr(>|t|)    
## (Intercept)   0.005769   0.005284   1.09187   0.27489    
## scale(pc1)   -0.034188   0.002322 -14.72135 < 2.2e-16 ***
## scale(lean1)  0.032164   0.002360  13.62781 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood: -6,736,320.9   Adj. Pseudo R2: 0.001288
##            BIC: 13,472,687.9     Squared Cor.: 0.003638

# Certainty ~ quality + lean with username fixed effects, so the estimates
# use variation within each user only.
# NOTE(review): certainty outcome inside the toxicity section; the identical
# call is repeated in section 4.2.
feglm(
  scale(certainty_avg) ~ scale(pc1) + scale(lean1) | username,
  data = data_posts,
  cluster = "username"
)

## NOTE: 18,526,578 observations removed because of NA values (LHS: 17,994,760, RHS: 2,692,899).

## GLM estimation, family = gaussian, Dep. Var.: scale(certainty_avg)
## Observations: 4,768,946 
## Fixed-effects: username: 662,123
## Standard-errors: Clustered (username) 
##               Estimate Std. Error  t value   Pr(>|t|)    
## scale(pc1)   -0.011607   0.001979 -5.86498 4.4939e-09 ***
## scale(lean1)  0.005990   0.001362  4.39829 1.0913e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood: -6,071,559.8   Adj. Pseudo R2: 0.00168 
##            BIC: 22,325,036.8     Squared Cor.: 0.246052

# Per-platform models of toxicity on quality (pc1), without and with the
# political-lean control. Each fit contributes the rows returned by
# get_param() to `results`; the index 2 presumably selects the scale(pc1)
# coefficient (position 2 after the intercept) — get_param() is defined
# elsewhere, TODO confirm.
results <- data.frame()

# Telegram is excluded from this analysis.
# NOTE(review): the reason is not documented in this file — confirm.
platforms <- setdiff(unique(data_posts$platform), "Telegram")

for (platform in platforms) {
  print(platform)

  d <- data_posts[data_posts$platform == platform, ]

  r <- feglm(scale(toxicity_ntile) ~ scale(pc1), data = d, cluster = "username")
  results <- rbind(results, get_param("toxic_pc1", 2, r, platform))

  r <- feglm(
    scale(toxicity_ntile) ~ scale(pc1) + scale(lean1),
    data = d,
    cluster = "username"
  )
  results <- rbind(results, get_param("toxic_pc1_control", 2, r, platform))
}

## [1] "Bluesky"

## NOTE: 16 observations removed because of NA values (RHS: 16).

## [1] "Gettr"

## NOTE: 26 observations removed because of NA values (RHS: 26).

## [1] "Truthsocial"

## NOTE: 312 observations removed because of NA values (RHS: 312).

## [1] "Gab"

## NOTE: 5,612 observations removed because of NA values (RHS: 5,612).

## [1] "Mastodon"

## NOTE: 2,684,361 observations removed because of NA values (RHS: 2,684,361).

## NOTE: 2,684,369 observations removed because of NA values (RHS: 2,684,369).

## [1] "LinkedIn"

## NOTE: 117 observations removed because of NA values (RHS: 117).

## NOTE: 519 observations removed because of NA values (RHS: 519).

## [1] "X"

## NOTE: 1,312 observations removed because of NA values (RHS: 1,312).

# Reshape the per-platform results to one row per platform x model with
# `estimate` and `se` columns, then tag each row by whether the model
# controlled for political lean.
d_ <- results |>
  filter(var %in% c("estimate", "se")) |>
  filter(measure %in% c("toxic_pc1", "toxic_pc1_control")) |>
  pivot_wider(names_from = var, values_from = value) |>
  arrange(estimate) |>
  mutate(
    group = ifelse(
      grepl("control", measure, fixed = TRUE),
      "With Political Lean Control",
      "No Control"
    )
  )

# Colour of each group, shared by the per-platform rows and the summary
# polygons so the two can never disagree.
group_colors <- c("With Political Lean Control" = "black", "No Control" = "gray")

# Meta-analytic aggregate of the platform estimates, one per group.
with_control <- d_$group == "With Political Lean Control"
res_A <- rma(yi = d_$estimate[with_control], sei = d_$se[with_control])
res_B <- rma(yi = d_$estimate[!with_control], sei = d_$se[!with_control])

# Platforms are ordered by their estimate in the lean-controlled model.
group_A_effects <- d_[with_control, ]
ordered_studies <- group_A_effects$platform[order(group_A_effects$estimate)]

# Within each platform the lean-controlled row comes first, then "No Control".
d_$group <- factor(d_$group, levels = names(group_colors))
d_ <- d_[order(match(d_$platform, ordered_studies), d_$group), ]

# Print the platform name only on the first row of each platform.
slab_labels <- ifelse(duplicated(d_$platform), "", d_$platform)

#png("../../figs/fig3a.png",width =25, height = 25,units = "cm", res = 300)  # Adjust size and resolution
# Forest plot; ylim leaves room below the platform rows for the aggregates.
# Row colours are looked up from each row's group rather than assumed to
# alternate, so a platform missing one of the two models cannot shift colours.
forest(
  x = d_$estimate,
  sei = d_$se,
  slab = slab_labels,
  col = unname(group_colors[as.character(d_$group)]),
  xlab = "Effect Size",
  main = "Association toxicity and quality",
  ylim = c(-5, nrow(d_) + 3),
  psize = .8,
  digits = 3
)

# Separator between platform rows and aggregates, plus the zero-effect line.
abline(h = 0.5, col = "gray")
abline(v = 0, col = "black")
# Aggregate effects drawn as polygons in the rows below the separator.
addpoly(res_A, atransf = FALSE, row = -1, mlab = "With Political Lean Control",
        col = group_colors[["With Political Lean Control"]], cex = 1.2)
addpoly(res_B, atransf = FALSE, row = -2, mlab = "No Control",
        col = group_colors[["No Control"]], cex = 1.2)

#dev.off()

# All platforms: standardized toxicity_ntile on standardized quality (pc1),
# with standard errors clustered by username.
feglm(
  scale(toxicity_ntile) ~ scale(pc1),
  data = data_posts,
  cluster = "username"
)

## NOTE: 2,684,478 observations removed because of NA values (RHS: 2,684,478).

## GLM estimation, family = gaussian, Dep. Var.: scale(toxicity_ntile)
## Observations: 20,611,046 
## Standard-errors: Clustered (username) 
##              Estimate Std. Error  t value  Pr(>|t|)    
## (Intercept)  0.079589   0.004716  16.8764 < 2.2e-16 ***
## scale(pc1)  -0.066464   0.003253 -20.4328 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood: -28,773,361.0   Adj. Pseudo R2: 0.00165 
##            BIC:  57,546,755.7     Squared Cor.: 0.004603

# Same toxicity ~ quality model, adding political lean (lean1) as a control.
feglm(
  scale(toxicity_ntile) ~ scale(pc1) + scale(lean1),
  data = data_posts,
  cluster = "username"
)

## NOTE: 2,692,899 observations removed because of NA values (RHS: 2,692,899).

## GLM estimation, family = gaussian, Dep. Var.: scale(toxicity_ntile)
## Observations: 20,602,625 
## Standard-errors: Clustered (username) 
##               Estimate Std. Error  t value  Pr(>|t|)    
## (Intercept)   0.079610   0.004704  16.9230 < 2.2e-16 ***
## scale(pc1)   -0.087116   0.003788 -22.9961 < 2.2e-16 ***
## scale(lean1) -0.044158   0.003667 -12.0404 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood: -28,743,817.6   Adj. Pseudo R2: 0.002214
##            BIC:  57,487,685.7     Squared Cor.: 0.006171

# Toxicity ~ quality + lean with username fixed effects (within-user
# variation only), standard errors clustered by username.
feglm(
  scale(toxicity_ntile) ~ scale(pc1) + scale(lean1) | username,
  data = data_posts,
  cluster = "username"
)

## NOTE: 2,692,899 observations removed because of NA values (RHS: 2,692,899).

## GLM estimation, family = gaussian, Dep. Var.: scale(toxicity_ntile)
## Observations: 20,602,625 
## Fixed-effects: username: 1,862,255
## Standard-errors: Clustered (username) 
##               Estimate Std. Error   t value  Pr(>|t|)    
## scale(pc1)   -0.019835   0.001482 -13.38314 < 2.2e-16 ***
## scale(lean1) -0.005808   0.001818  -3.19496 0.0013985 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood: -22,127,757.9   Adj. Pseudo R2: 0.167233
##            BIC:  75,617,653.8     Squared Cor.: 0.477142

4.2 confidence vs. quality

# Per-platform models of certainty on quality (pc1), first alone and then
# with the political-lean control. The rows returned by get_param() for each
# fit are appended to `results`.
results <- data.frame()

for (platform in unique(data_posts$platform)) {
  print(platform)

  platform_posts <- data_posts[data_posts$platform == platform, ]

  fit <- feglm(
    scale(certainty_avg) ~ scale(pc1),
    data = platform_posts,
    cluster = "username"
  )
  results <- rbind(results, get_param("certainty_pc1", 2, fit, platform))

  fit <- feglm(
    scale(certainty_avg) ~ scale(pc1) + scale(lean1),
    data = platform_posts,
    cluster = "username"
  )
  results <- rbind(results, get_param("certainty_pc1_control", 2, fit, platform))
}

## [1] "Bluesky"

## NOTE: 1,182,479 observations removed because of NA values (LHS: 1,182,479).

## NOTE: 1,182,481 observations removed because of NA values (LHS: 1,182,479, RHS: 16).

## [1] "Gettr"

## NOTE: 14,226 observations removed because of NA values (LHS: 14,226).

## NOTE: 14,230 observations removed because of NA values (LHS: 14,226, RHS: 26).

## [1] "Truthsocial"

## NOTE: 1,305,002 observations removed because of NA values (LHS: 1,305,002).

## NOTE: 1,305,024 observations removed because of NA values (LHS: 1,305,002, RHS: 312).

## [1] "Gab"

## NOTE: 379,792 observations removed because of NA values (LHS: 379,792).

## NOTE: 381,707 observations removed because of NA values (LHS: 379,792, RHS: 5,612).

## [1] "Mastodon"

## NOTE: 3,344,319 observations removed because of NA values (LHS: 2,814,848, RHS: 2,684,361).

## NOTE: 3,344,320 observations removed because of NA values (LHS: 2,814,848, RHS: 2,684,369).

## [1] "LinkedIn"

## NOTE: 16,024 observations removed because of NA values (LHS: 15,987, RHS: 117).

## NOTE: 16,288 observations removed because of NA values (LHS: 15,987, RHS: 519).

## [1] "Telegram"

## NOTE: 615,783 observations removed because of NA values (LHS: 615,783).

## NOTE: 615,794 observations removed because of NA values (LHS: 615,783, RHS: 733).

## [1] "X"

## NOTE: 11,666,643 observations removed because of NA values (LHS: 11,666,643).

## NOTE: 11,666,734 observations removed because of NA values (LHS: 11,666,643, RHS: 1,312).

# Reshape the per-platform results to one row per platform x model with
# `estimate` and `se` columns, then tag each row by whether the model
# controlled for political lean.
d_ <- results |>
  filter(var %in% c("estimate", "se")) |>
  filter(measure %in% c("certainty_pc1", "certainty_pc1_control")) |>
  pivot_wider(names_from = var, values_from = value) |>
  arrange(estimate) |>
  mutate(
    group = ifelse(
      grepl("control", measure, fixed = TRUE),
      "With Political Lean Control",
      "No Control"
    )
  )

# Colour of each group, shared by the per-platform rows and the summary
# polygons so the two can never disagree.
group_colors <- c("With Political Lean Control" = "black", "No Control" = "gray")

# Meta-analytic aggregate of the platform estimates, one per group.
with_control <- d_$group == "With Political Lean Control"
res_A <- rma(yi = d_$estimate[with_control], sei = d_$se[with_control])
res_B <- rma(yi = d_$estimate[!with_control], sei = d_$se[!with_control])

# Platforms are ordered by their estimate in the lean-controlled model.
group_A_effects <- d_[with_control, ]
ordered_studies <- group_A_effects$platform[order(group_A_effects$estimate)]

# Within each platform the lean-controlled row comes first, then "No Control".
d_$group <- factor(d_$group, levels = names(group_colors))
d_ <- d_[order(match(d_$platform, ordered_studies), d_$group), ]

# Print the platform name only on the first row of each platform.
slab_labels <- ifelse(duplicated(d_$platform), "", d_$platform)

# NOTE(review): this disabled export uses the same fig3a.png path as the
# toxicity/quality forest plot; enabling both would overwrite the first file.
#png("../../figs/fig3a.png",width =25, height = 25,units = "cm", res = 300)  # Adjust size and resolution
# Forest plot; ylim leaves room below the platform rows for the aggregates.
# Row colours are looked up from each row's group rather than assumed to
# alternate, so a platform missing one of the two models cannot shift colours.
forest(
  x = d_$estimate,
  sei = d_$se,
  slab = slab_labels,
  col = unname(group_colors[as.character(d_$group)]),
  xlab = "Effect Size",
  main = "association confidence and quality",
  ylim = c(-5, nrow(d_) + 3),
  psize = .8,
  digits = 3
)

# Separator between platform rows and aggregates, plus the zero-effect line.
abline(h = 0.5, col = "gray")
abline(v = 0, col = "black")
# Aggregate effects drawn as polygons in the rows below the separator.
addpoly(res_A, atransf = FALSE, row = -1, mlab = "With Political Lean Control",
        col = group_colors[["With Political Lean Control"]], cex = 1.2)
addpoly(res_B, atransf = FALSE, row = -2, mlab = "No Control",
        col = group_colors[["No Control"]], cex = 1.2)

#dev.off()

# All platforms: standardized certainty on standardized quality (pc1), with
# standard errors clustered by username.
feglm(
  scale(certainty_avg) ~ scale(pc1),
  data = data_posts,
  cluster = "username"
)

## NOTE: 18,524,268 observations removed because of NA values (LHS: 17,994,760, RHS: 2,684,478).

## GLM estimation, family = gaussian, Dep. Var.: scale(certainty_avg)
## Observations: 4,771,256 
## Standard-errors: Clustered (username) 
##              Estimate Std. Error    t value  Pr(>|t|)    
## (Intercept)  0.004749   0.005308   0.894709   0.37094    
## scale(pc1)  -0.050577   0.002505 -20.188669 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood: -6,741,374.8   Adj. Pseudo R2: 9.749e-4
##            BIC: 13,482,780.4     Squared Cor.: 0.002754

# Same certainty ~ quality model, adding political lean (lean1) as a control.
feglm(
  scale(certainty_avg) ~ scale(pc1) + scale(lean1),
  data = data_posts,
  cluster = "username"
)

## NOTE: 18,526,578 observations removed because of NA values (LHS: 17,994,760, RHS: 2,692,899).

## GLM estimation, family = gaussian, Dep. Var.: scale(certainty_avg)
## Observations: 4,768,946 
## Standard-errors: Clustered (username) 
##               Estimate Std. Error   t value  Pr(>|t|)    
## (Intercept)   0.005769   0.005284   1.09187   0.27489    
## scale(pc1)   -0.034188   0.002322 -14.72135 < 2.2e-16 ***
## scale(lean1)  0.032164   0.002360  13.62781 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood: -6,736,320.9   Adj. Pseudo R2: 0.001288
##            BIC: 13,472,687.9     Squared Cor.: 0.003638

# Certainty ~ quality + lean with username fixed effects (within-user
# variation only), standard errors clustered by username.
feglm(
  scale(certainty_avg) ~ scale(pc1) + scale(lean1) | username,
  data = data_posts,
  cluster = "username"
)

## NOTE: 18,526,578 observations removed because of NA values (LHS: 17,994,760, RHS: 2,692,899).

## GLM estimation, family = gaussian, Dep. Var.: scale(certainty_avg)
## Observations: 4,768,946 
## Fixed-effects: username: 662,123
## Standard-errors: Clustered (username) 
##               Estimate Std. Error  t value   Pr(>|t|)    
## scale(pc1)   -0.011607   0.001979 -5.86498 4.4939e-09 ***
## scale(lean1)  0.005990   0.001362  4.39829 1.0913e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood: -6,071,559.8   Adj. Pseudo R2: 0.00168 
##            BIC: 22,325,036.8     Squared Cor.: 0.246052

4.3 engagement vs. toxicity, confidence (in the same model)

  # Posts from platform X only: standardized log10(1 + engage) on certainty
  # and toxicity_ntile, controlling for lean, with username fixed effects and
  # standard errors clustered by username.
  r <- feglm(
    scale(log10(1 + engage)) ~
      scale(certainty_avg) + scale(toxicity_ntile) + scale(lean1) | username,
    data = data_posts[data_posts$platform == "X", ],
    cluster = "username"
  )

## NOTE: 11,666,734 observations removed because of NA values (RHS: 11,666,734).

## GLM estimation, family = gaussian, Dep. Var.: scale(log10(1 + engage))
## Observations: 3,291,774 
## Fixed-effects: username: 498,674
## Standard-errors: Clustered (username) 
##                        Estimate Std. Error  t value  Pr(>|t|)    
## scale(certainty_avg)   0.008283   0.002737  3.02669 0.0024726 ** 
## scale(toxicity_ntile)  0.010315   0.002703  3.81572 0.0001358 ***
## scale(lean1)          -0.008901   0.003906 -2.27867 0.0226873 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood: -1,944,357.5   Adj. Pseudo R2: 0.524344
##            BIC: 11,372,329.3     Squared Cor.: 0.856187

  # All platforms: engagement on certainty, toxicity_ntile and quality (pc1),
  # controlling for lean, with username fixed effects and standard errors
  # clustered by username.
  r <- feglm(
    scale(log10(1 + engage)) ~
      scale(certainty_avg) + scale(toxicity_ntile) + scale(pc1) +
      scale(lean1) | username,
    data = data_posts,
    cluster = "username"
  )

## NOTE: 18,668,219 observations removed because of NA values (LHS: 582,384, RHS: 18,526,578).

## GLM estimation, family = gaussian, Dep. Var.: scale(log10(1 + engage))
## Observations: 4,627,305 
## Fixed-effects: username: 658,283
## Standard-errors: Clustered (username) 
##                        Estimate Std. Error   t value   Pr(>|t|)    
## scale(certainty_avg)   0.009484   0.002156   4.39808 1.0923e-05 ***
## scale(toxicity_ntile)  0.015517   0.002090   7.42559 1.1242e-13 ***
## scale(pc1)            -0.014081   0.001301 -10.82548  < 2.2e-16 ***
## scale(lean1)          -0.011040   0.003167  -3.48554 4.9117e-04 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood: -3,109,735.0   Adj. Pseudo R2: 0.483355
##            BIC: 16,322,520.0     Squared Cor.: 0.836049

  # Same engagement model with the full three-way interaction between
  # certainty, toxicity_ntile and quality (pc1); lean enters additively.
  r <- feglm(
    scale(log10(1 + engage)) ~
      scale(certainty_avg) * scale(toxicity_ntile) * scale(pc1) +
      scale(lean1) | username,
    data = data_posts,
    cluster = "username"
  )

## NOTE: 18,668,219 observations removed because of NA values (LHS: 582,384, RHS: 18,526,578).

## GLM estimation, family = gaussian, Dep. Var.: scale(log10(1 + engage))
## Observations: 4,627,305 
## Fixed-effects: username: 658,283
## Standard-errors: Clustered (username) 
##                                                        Estimate Std. Error
## scale(certainty_avg)                                   0.011602   0.002034
## scale(toxicity_ntile)                                  0.015750   0.002103
## scale(pc1)                                            -0.014507   0.001292
## scale(lean1)                                          -0.010948   0.003177
## scale(certainty_avg):scale(toxicity_ntile)            -0.005228   0.001330
## scale(certainty_avg):scale(pc1)                        0.000621   0.000479
## scale(toxicity_ntile):scale(pc1)                       0.000995   0.000597
## scale(certainty_avg):scale(toxicity_ntile):scale(pc1)  0.000158   0.000405
##                                                          t value   Pr(>|t|)    
## scale(certainty_avg)                                    5.703620 1.1734e-08 ***
## scale(toxicity_ntile)                                   7.490527 6.8682e-14 ***
## scale(pc1)                                            -11.225761  < 2.2e-16 ***
## scale(lean1)                                           -3.446076 5.6883e-04 ***
## scale(certainty_avg):scale(toxicity_ntile)             -3.930737 8.4695e-05 ***
## scale(certainty_avg):scale(pc1)                         1.296267 1.9488e-01    
## scale(toxicity_ntile):scale(pc1)                        1.666427 9.5629e-02 .  
## scale(certainty_avg):scale(toxicity_ntile):scale(pc1)   0.389646 6.9680e-01    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood: -3,109,541.3   Adj. Pseudo R2: 0.483381
##            BIC: 16,322,194.0     Squared Cor.: 0.836062

# Per-platform engagement model with certainty, toxicity_ntile and the lean
# control, using username fixed effects. Two get_param() extractions are
# appended per platform; with fixed effects there is no intercept, so
# indices 1 and 2 presumably pick the certainty and toxicity coefficients —
# get_param() is defined elsewhere, TODO confirm.
results <- data.frame()

for (platform in unique(data_posts$platform)) {
  print(platform)

  platform_posts <- data_posts[data_posts$platform == platform, ]
  fit <- feglm(
    scale(log10(engage + 1)) ~
      scale(certainty_avg) + scale(toxicity_ntile) + scale(lean1) | username,
    data = platform_posts,
    cluster = "username"
  )

  results <- rbind(results, get_param("engage_certainty", 1, fit, platform))
  results <- rbind(results, get_param("engage_toxicity", 2, fit, platform))
}

## [1] "Bluesky"

## NOTE: 1,182,481 observations removed because of NA values (RHS: 1,182,481).

## [1] "Gettr"

## NOTE: 20,259 observations removed because of NA values (LHS: 16,998, RHS: 14,230).

## [1] "Truthsocial"

## NOTE: 1,305,024 observations removed because of NA values (RHS: 1,305,024).

## [1] "Gab"

## NOTE: 381,707 observations removed because of NA values (RHS: 381,707).

## [1] "Mastodon"

## NOTE: 3,344,320 observations removed because of NA values (RHS: 3,344,320).

## [1] "LinkedIn"

## NOTE: 16,289 observations removed because of NA values (LHS: 4, RHS: 16,288).

## [1] "Telegram"

## NOTE: 751,405 observations removed because of NA values (LHS: 565,382, RHS: 615,794).

## [1] "X"

## NOTE: 11,666,734 observations removed because of NA values (RHS: 11,666,734).

# Reshape the per-platform results to one row per platform x predictor with
# `estimate` and `se` columns, then label each row by the predictor it
# belongs to.
d_ <- results |>
  filter(var %in% c("estimate", "se")) |>
  filter(measure %in% c("engage_certainty", "engage_toxicity")) |>
  pivot_wider(names_from = var, values_from = value) |>
  arrange(estimate) |>
  mutate(
    group = ifelse(
      grepl("engage_certainty", measure, fixed = TRUE),
      "Certainty",
      "Toxicity"
    )
  )

# Colour of each predictor, shared by the per-platform rows and the summary
# polygons so the two can never disagree.
group_colors <- c("Toxicity" = "black", "Certainty" = "gray")

# Meta-analytic aggregate of the platform estimates, one per predictor.
is_toxicity <- d_$group == "Toxicity"
res_A <- rma(yi = d_$estimate[is_toxicity], sei = d_$se[is_toxicity])
res_B <- rma(yi = d_$estimate[!is_toxicity], sei = d_$se[!is_toxicity])

# Platforms are ordered by their toxicity estimate.
group_A_effects <- d_[is_toxicity, ]
ordered_studies <- group_A_effects$platform[order(group_A_effects$estimate)]

# Within each platform the toxicity row comes first, then certainty.
d_$group <- factor(d_$group, levels = names(group_colors))
d_ <- d_[order(match(d_$platform, ordered_studies), d_$group), ]

# Print the platform name only on the first row of each platform.
slab_labels <- ifelse(duplicated(d_$platform), "", d_$platform)

# Forest plot; ylim leaves room below the platform rows for the aggregates.
# Row colours are looked up from each row's group rather than assumed to
# alternate, so a platform missing one predictor cannot shift colours.
forest(
  x = d_$estimate,
  sei = d_$se,
  slab = slab_labels,
  col = unname(group_colors[as.character(d_$group)]),
  xlab = "Effect Size",
  main = "association engagment, confidence and toxcity (w/ lean control)",
  ylim = c(-5, nrow(d_) + 3),
  psize = .8,
  digits = 3
)

# Separator between platform rows and aggregates, plus the zero-effect line.
abline(h = 0.5, col = "gray")
abline(v = 0, col = "black")
# Aggregate effects drawn as polygons in the rows below the separator.
addpoly(res_A, atransf = FALSE, row = -1, mlab = "Toxicity",
        col = group_colors[["Toxicity"]], cex = 1.2)
addpoly(res_B, atransf = FALSE, row = -2, mlab = "Certainty",
        col = group_colors[["Certainty"]], cex = 1.2)

#dev.off()

4.4 engagement vs. toxicity, certainty, pc1

# Per-platform engagement model with certainty, toxicity_ntile, quality (pc1)
# and the lean control, using username fixed effects. Three get_param()
# extractions are appended per platform; indices 1-3 presumably follow the
# order of the predictors in the formula — get_param() is defined elsewhere,
# TODO confirm.
results <- data.frame()

for (platform in unique(data_posts$platform)) {
  print(platform)

  platform_posts <- data_posts[data_posts$platform == platform, ]
  fit <- feglm(
    scale(log10(engage + 1)) ~
      scale(certainty_avg) + scale(toxicity_ntile) + scale(pc1) +
      scale(lean1) | username,
    data = platform_posts,
    cluster = "username"
  )

  results <- rbind(results, get_param("Certainty", 1, fit, platform))
  results <- rbind(results, get_param("Toxicity", 2, fit, platform))
  results <- rbind(results, get_param("Quality", 3, fit, platform))
}

## [1] "Bluesky"

## NOTE: 1,182,481 observations removed because of NA values (RHS: 1,182,481).

## [1] "Gettr"

## NOTE: 20,259 observations removed because of NA values (LHS: 16,998, RHS: 14,230).

## [1] "Truthsocial"

## NOTE: 1,305,024 observations removed because of NA values (RHS: 1,305,024).

## [1] "Gab"

## NOTE: 381,707 observations removed because of NA values (RHS: 381,707).

## [1] "Mastodon"

## NOTE: 3,344,320 observations removed because of NA values (RHS: 3,344,320).

## [1] "LinkedIn"

## NOTE: 16,289 observations removed because of NA values (LHS: 4, RHS: 16,288).

## [1] "Telegram"

## NOTE: 751,405 observations removed because of NA values (LHS: 565,382, RHS: 615,794).

## [1] "X"

## NOTE: 11,666,734 observations removed because of NA values (RHS: 11,666,734).

# Reshape the per-platform results to one row per platform x predictor with
# `estimate` and `se` columns.
d_ <- results |>
  filter(var %in% c("estimate", "se")) |>
  pivot_wider(names_from = var, values_from = value) |>
  arrange(estimate)

# Colour of each predictor, shared by the per-platform rows and the summary
# polygons so the two can never disagree.
group_colors <- c("Toxicity" = "blue", "Certainty" = "green", "Quality" = "red")

# The measure label doubles as the group; one meta-analytic aggregate each.
d_$group <- d_$measure
res_A <- rma(yi = d_$estimate[d_$group == "Toxicity"],
             sei = d_$se[d_$group == "Toxicity"])
res_B <- rma(yi = d_$estimate[d_$group == "Certainty"],
             sei = d_$se[d_$group == "Certainty"])
res_C <- rma(yi = d_$estimate[d_$group == "Quality"],
             sei = d_$se[d_$group == "Quality"])

# Platforms are ordered by their toxicity estimate.
group_A_effects <- d_[d_$group == "Toxicity", ]
ordered_studies <- group_A_effects$platform[order(group_A_effects$estimate)]

# Within each platform the rows run toxicity, certainty, quality.
d_$group <- factor(d_$group, levels = names(group_colors))
d_ <- d_[order(match(d_$platform, ordered_studies), d_$group), ]

# Print the platform name only on the first row of each platform.
slab_labels <- ifelse(duplicated(d_$platform), "", d_$platform)

# Forest plot; ylim leaves room below the platform rows for the aggregates.
# Row colours are looked up from each row's group rather than assumed to
# cycle in threes, so a platform missing one predictor cannot shift colours.
forest(
  x = d_$estimate,
  sei = d_$se,
  slab = slab_labels,
  col = unname(group_colors[as.character(d_$group)]),
  xlab = "Effect Size",
  main = "association engagment, confidence, toxcity, and quality (w/ lean control)",
  ylim = c(-5, nrow(d_) + 3),
  psize = .8,
  digits = 3
)

# Separator between platform rows and aggregates, plus the zero-effect line.
abline(h = 0.5, col = "gray")
abline(v = 0, col = "black")
# Aggregate effects drawn as polygons in the rows below the separator.
addpoly(res_A, atransf = FALSE, row = -1, mlab = "Toxicity",
        col = group_colors[["Toxicity"]], cex = 1.2)
addpoly(res_B, atransf = FALSE, row = -2, mlab = "Certainty",
        col = group_colors[["Certainty"]], cex = 1.2)
addpoly(res_C, atransf = FALSE, row = -3, mlab = "Quality",
        col = group_colors[["Quality"]], cex = 1.2)

5. analysis - headline

# Headline-level model: standardized certainty on standardized quality (pc1).
# No clustering is requested here (the cluster = 'username' argument used for
# posts was left disabled), so standard errors are reported as IID.
r <- feglm(
  scale(certainty_avg) ~ scale(pc1),
  data = data_headlines
)

## NOTE: 746,866 observations removed because of NA values (LHS: 678,258, RHS: 559,391).

## GLM estimation, family = gaussian, Dep. Var.: scale(certainty_avg)
## Observations: 54,014 
## Standard-errors: IID 
##              Estimate Std. Error   t value  Pr(>|t|)    
## (Intercept)  0.010071   0.004183   2.40753  0.016064 *  
## scale(pc1)  -0.101662   0.004042 -25.15059 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood: -74,725.0   Adj. Pseudo R2: 0.004177
##            BIC: 149,471.7     Squared Cor.: 0.011576

# Headline-level model: standardized raw toxicity on standardized quality
# (pc1). As above, the cluster = 'username' argument is left disabled, so
# standard errors are reported as IID.
r <- feglm(
  scale(toxicity) ~ scale(pc1),
  data = data_headlines
)

## NOTE: 559,391 observations removed because of NA values (RHS: 559,391).

## GLM estimation, family = gaussian, Dep. Var.: scale(toxicity)
## Observations: 241,489 
## Standard-errors: IID 
##              Estimate Std. Error  t value  Pr(>|t|)    
## (Intercept)  0.123659   0.002611  47.3622 < 2.2e-16 ***
## scale(pc1)  -0.123782   0.002611 -47.4094 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood: -402,844.8   Adj. Pseudo R2: 0.002767
##            BIC:  805,714.4     Squared Cor.: 0.009222

6. robustness check

6.1 engagement UGC vs. headline # headline extraction is randomly stratified on pc1

  # Baseline on all posts: engagement on certainty and toxicity_ntile with
  # the lean control, username fixed effects, user-clustered standard errors.
  r <- feglm(
    scale(log10(engage + 1)) ~
      scale(certainty_avg) + scale(toxicity_ntile) + scale(lean1) | username,
    data = data_posts,
    cluster = "username"
  )

## NOTE: 18,668,219 observations removed because of NA values (LHS: 582,384, RHS: 18,526,578).

## GLM estimation, family = gaussian, Dep. Var.: scale(log10(engage + 1))
## Observations: 4,627,305 
## Fixed-effects: username: 658,283
## Standard-errors: Clustered (username) 
##                        Estimate Std. Error  t value   Pr(>|t|)    
## scale(certainty_avg)   0.009554   0.002157  4.43014 9.4189e-06 ***
## scale(toxicity_ntile)  0.015852   0.002092  7.57670 3.5492e-14 ***
## scale(lean1)          -0.006831   0.003064 -2.22938 2.5789e-02 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood: -3,110,528.2   Adj. Pseudo R2: 0.483246
##            BIC: 16,324,090.9     Squared Cor.: 0.835992

  # Same engagement model, restricted to posts whose headline_toxic value is
  # not missing.
  r <- feglm(
    scale(log10(engage + 1)) ~
      scale(certainty_avg) + scale(toxicity_ntile) + scale(lean1) | username,
    data = data_posts[!is.na(data_posts$headline_toxic), ],
    cluster = "username"
  )

## NOTE: 1,833,345 observations removed because of NA values (LHS: 43,041, RHS: 1,820,750).

## GLM estimation, family = gaussian, Dep. Var.: scale(log10(engage + 1))
## Observations: 576,603 
## Fixed-effects: username: 168,491
## Standard-errors: Clustered (username) 
##                       Estimate Std. Error  t value   Pr(>|t|)    
## scale(certainty_avg)  0.007829   0.001037  7.54781 4.4486e-14 ***
## scale(toxicity_ntile) 0.022431   0.001518 14.78066  < 2.2e-16 ***
## scale(lean1)          0.003570   0.002257  1.58195 1.1366e-01    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood:  -319,516.3   Adj. Pseudo R2: 0.442941
##            BIC: 2,874,090.1     Squared Cor.: 0.854906

  # NOTE(review): this call is identical to the preceding model (same
  # formula, same headline_toxic subset) and yields the same estimates —
  # confirm whether a different specification was intended here.
  r <- feglm(
    scale(log10(engage + 1)) ~
      scale(certainty_avg) + scale(toxicity_ntile) + scale(lean1) | username,
    data = data_posts[!is.na(data_posts$headline_toxic), ],
    cluster = "username"
  )

## NOTE: 1,833,345 observations removed because of NA values (LHS: 43,041, RHS: 1,820,750).

## GLM estimation, family = gaussian, Dep. Var.: scale(log10(engage + 1))
## Observations: 576,603 
## Fixed-effects: username: 168,491
## Standard-errors: Clustered (username) 
##                       Estimate Std. Error  t value   Pr(>|t|)    
## scale(certainty_avg)  0.007829   0.001037  7.54781 4.4486e-14 ***
## scale(toxicity_ntile) 0.022431   0.001518 14.78066  < 2.2e-16 ***
## scale(lean1)          0.003570   0.002257  1.58195 1.1366e-01    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood:  -319,516.3   Adj. Pseudo R2: 0.442941
##            BIC: 2,874,090.1     Squared Cor.: 0.854906

    # Headline subset again, now adding the headline's own certainty
    # (headline_certainty_avg) next to the post's certainty.
    r <- feglm(
      scale(log10(engage + 1)) ~
        scale(certainty_avg) + scale(headline_certainty_avg) +
        scale(toxicity_ntile) + scale(lean1) | username,
      data = data_posts[!is.na(data_posts$headline_toxic), ],
      cluster = "username"
    )

## NOTE: 2,147,177 observations removed because of NA values (LHS: 43,041, RHS: 2,141,825).

## GLM estimation, family = gaussian, Dep. Var.: scale(log10(engage + 1))
## Observations: 262,771 
## Fixed-effects: username: 81,643
## Standard-errors: Clustered (username) 
##                               Estimate Std. Error  t value   Pr(>|t|)    
## scale(certainty_avg)          0.001359   0.002538 0.535733 5.9214e-01    
## scale(headline_certainty_avg) 0.004979   0.002567 1.939943 5.2390e-02 .  
## scale(toxicity_ntile)         0.016472   0.002298 7.166434 7.7625e-13 ***
## scale(lean1)                  0.002893   0.003158 0.916246 3.5954e-01    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood:  -118,355.3   Adj. Pseudo R2: 0.481684
##            BIC: 1,255,586.6     Squared Cor.: 0.869462

6.2 pc1/certainty UGC vs. headline

  # Standardized pc1 regressed on standardized certainty and lean over all
  # of data_posts. No fixed effects; standard errors clustered by username.
  r <- feglm(
    fml = scale(pc1) ~ scale(certainty_avg) + scale(lean1),
    data = data_posts,
    family = "gaussian",
    cluster = "username"
  )

## NOTE: 18,526,578 observations removed because of NA values (LHS: 2,684,478, RHS: 18,526,578).

## GLM estimation, family = gaussian, Dep. Var.: scale(pc1)
## Observations: 4,768,946 
## Standard-errors: Clustered (username) 
##                       Estimate Std. Error   t value   Pr(>|t|)    
## (Intercept)          -0.033244   0.006005  -5.53578 3.0996e-08 ***
## scale(certainty_avg) -0.027898   0.001937 -14.40392  < 2.2e-16 ***
## scale(lean1)         -0.477683   0.007974 -59.90871  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood: -6,251,503.9   Adj. Pseudo R2: 0.096426
##            BIC: 12,503,053.8     Squared Cor.: 0.244052

  # Same pc1 ~ certainty + lean model, re-estimated only on the rows whose
  # headline_toxic is not NA. Standard errors clustered by username.
  r <- feglm(
    fml = scale(pc1) ~ scale(certainty_avg) + scale(lean1),
    data = data_posts[!is.na(data_posts$headline_toxic), ],
    family = "gaussian",
    cluster = "username"
  )

## NOTE: 1,820,750 observations removed because of NA values (LHS: 5,097, RHS: 1,820,750).

## GLM estimation, family = gaussian, Dep. Var.: scale(pc1)
## Observations: 589,198 
## Standard-errors: Clustered (username) 
##                       Estimate Std. Error   t value   Pr(>|t|)    
## (Intercept)          -0.034685   0.006229  -5.56820 2.5777e-08 ***
## scale(certainty_avg) -0.025217   0.002771  -9.09928  < 2.2e-16 ***
## scale(lean1)         -0.585104   0.006360 -91.99306  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood:  -704,860.1   Adj. Pseudo R2: 0.161102
##            BIC: 1,409,760.1     Squared Cor.: 0.368391

      # Standardized pc1 regressed on the standardized certainty of the post
      # and of its headline (no lean control), on the rows whose
      # headline_toxic is not NA. Standard errors clustered by username.
      r <- feglm(
        fml = scale(pc1) ~
          scale(certainty_avg) + scale(headline_certainty_avg),
        data = data_posts[!is.na(data_posts$headline_toxic), ],
        family = "gaussian",
        cluster = "username"
      )

## NOTE: 2,141,817 observations removed because of NA values (LHS: 5,097, RHS: 2,141,502).

## GLM estimation, family = gaussian, Dep. Var.: scale(pc1)
## Observations: 268,131 
## Standard-errors: Clustered (username) 
##                                Estimate Std. Error   t value   Pr(>|t|)    
## (Intercept)                   -0.126977   0.010276 -12.35659  < 2.2e-16 ***
## scale(certainty_avg)          -0.031447   0.005487  -5.73125 1.0004e-08 ***
## scale(headline_certainty_avg) -0.052913   0.005937  -8.91244  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood: -385,495.1   Adj. Pseudo R2: 0.002167
##            BIC:  771,027.8     Squared Cor.: 0.00624

    # Standardized pc1 regressed on post certainty, headline certainty and
    # lean (all standardized), on the rows selected by headline_avail.
    # Standard errors clustered by username.
    r <- feglm(
      fml = scale(pc1) ~
        scale(certainty_avg) +
        scale(headline_certainty_avg) +
        scale(lean1),
      data = data_posts[data_posts$headline_avail, ],
      family = "gaussian",
      cluster = "username"
    )

## NOTE: 2,139,666 observations removed because of NA values (LHS: 5,084, RHS: 2,139,666).

## GLM estimation, family = gaussian, Dep. Var.: scale(pc1)
## Observations: 268,123 
## Standard-errors: Clustered (username) 
##                                Estimate Std. Error   t value   Pr(>|t|)    
## (Intercept)                   -0.071351   0.007336  -9.72591  < 2.2e-16 ***
## scale(certainty_avg)          -0.008731   0.004065  -2.14795 3.1721e-02 *  
## scale(headline_certainty_avg) -0.029257   0.004581  -6.38610 1.7106e-10 ***
## scale(lean1)                  -0.574338   0.008365 -68.65833  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood: -325,244.9   Adj. Pseudo R2: 0.158158
##            BIC:  650,539.7     Squared Cor.: 0.366071

      # pc1 ~ certainty + lean model restricted to rows flagged by
      # headline_avail whose text_headline_similarity is below 0.5.
      # Standard errors clustered by username.
      #
      # The row mask is wrapped in which(): the `<` comparison is NA wherever
      # text_headline_similarity is NA, and an NA in a logical row index of a
      # data.frame/tibble yields an all-NA row rather than dropping the row.
      # which() keeps only the rows where the mask is TRUE. Those phantom
      # rows carry no data and were discarded by feglm's NA handling anyway,
      # so the coefficients should be unchanged; only the "observations
      # removed because of NA values" count may shrink.
      r <- feglm(
        fml = scale(pc1) ~ scale(certainty_avg) + scale(lean1),
        data = data_posts[
          which(
            data_posts$headline_avail &
              data_posts$text_headline_similarity < .5
          ),
        ],
        family = "gaussian",
        cluster = "username"
      )

## NOTE: 1,136,997 observations removed because of NA values (LHS: 4,108, RHS: 1,136,997).

## GLM estimation, family = gaussian, Dep. Var.: scale(pc1)
## Observations: 403,449 
## Standard-errors: Clustered (username) 
##                       Estimate Std. Error   t value   Pr(>|t|)    
## (Intercept)          -0.042191   0.007493  -5.63065 1.7987e-08 ***
## scale(certainty_avg) -0.024388   0.003745  -6.51301 7.3905e-11 ***
## scale(lean1)         -0.569686   0.007382 -77.16790  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood: -491,625.4   Adj. Pseudo R2: 0.147961
##            BIC:  983,289.6     Squared Cor.: 0.345072

6.3 certainty availability

  # Models whether certainty_avg is missing: the outcome is the logical
  # is.na(certainty_avg), regressed with a gaussian family on standardized
  # toxicity ntile, lean and log10(1 + engage) over all of data_posts.
  # Standard errors clustered by username.
  r <- feglm(
    fml = is.na(certainty_avg) ~
      scale(toxicity_ntile) + scale(lean1) + scale(log10(1 + engage)),
    data = data_posts,
    family = "gaussian",
    cluster = "username"
  )

## NOTE: 3,274,625 observations removed because of NA values (RHS: 3,274,625).

## GLM estimation, family = gaussian, Dep. Var.: is.na(certainty_avg)
## Observations: 20,020,899 
## Standard-errors: Clustered (username) 
##                           Estimate Std. Error   t value   Pr(>|t|)    
## (Intercept)               0.776474   0.001356 572.47117  < 2.2e-16 ***
## scale(toxicity_ntile)    -0.065841   0.001013 -64.99559  < 2.2e-16 ***
## scale(lean1)              0.005318   0.000918   5.79385 6.8802e-09 ***
## scale(log10(1 + engage)) -0.038115   0.002209 -17.25483  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood: -10,796,082.7   Adj. Pseudo R2: 0.028614
##            BIC:  21,592,232.6     Squared Cor.: 0.031269

Confidence

2024-12-13

1. Reading data

2. Preprocess data

2.1 filtering

2.2 PCA

3. Descriptives

3.1 histograms

3.2 associations

4. Analysis - posts

4.1 toxicity vs. quality

4.2 confidence vs. quality

4.3 engagement vs. toxicity, confidence (in the same model)

4.4 engagement, toxicity, certainty, pc1

5. analysis - headline

6. robustness check

6.1 engagement UGC vs. headline # headline extraction is randomly stratified on pc1

6.2 pc1/certainty UGC vs. headline

6.3 certainty availability