Loading, setting up

library(tidyverse)
library(lme4)
library(readxl)

# source("prepare-data.R")
to_model <- read_csv("data-to-model.csv") 
data_to_add <- read_excel("TwitterTeachersData.xlsx") %>% 
  mutate(State = tolower(State),
         State = tools::toTitleCase(State)) %>% 
  janitor::clean_names() %>% 
  rename(State = state)

to_model <- to_model %>% 
  left_join(data_to_add)
to_model <- to_model %>% 
  rename(pct_urban_in_state = `Urban Percentage of State Population in 2015`,
         three_year_poverty_rates = `Three Year Average of State Poverty Rates 2015-2017`, 
         n_students_enrolled = `Enrollment in public ele/sec. education  Digest of Ed Stats. Fall 2015 (projected)`,
         n_students_in_free_reduced_lunch_pgm = `National School Lunch Program FY 2016 (FNS.USDA.GOV)`,
         pres_election_r_support_16 = `Politics: State's support of Trump in 2016`,
         region = `Region (By Census)`,
         teacher_salaries = `Teacher Salaries NEA 2016`,
         state_spending = `State Spending on Public Elementary/Secondary Spending FY 2016 (Census) (in thousands)`,
         n_teachers = Teachers)

to_model <- to_model %>% janitor::clean_names()

to_model <- to_model %>% 
  mutate(pct_students_frpl = n_students_in_free_reduced_lunch_pgm / n_students_enrolled)

This calculates the number of tweets sent to each hashtag

state_n <- to_model %>% 
  group_by(hashtag) %>% 
  summarize(sum_n = sum(n_tweets)) %>% 
  left_join(distinct(to_model, hashtag, .keep_all = TRUE))
## Joining, by = "hashtag"

Correlations at the state level

state_n %>% 
  mutate(years_on_twitter = 2020 - lubridate::year(account_created_at)) %>% 
  select(n_tweets_for_state = sum_n, n_tweets, favorite_count, years_on_twitter, n_teachers, voices,pct_urban_in_state, three_year_poverty_rates, pct_students_frpl, teacher_salaries, pres_election_r_support_16, unemployment, governor_1_gop_0_dem_2_ind, c_ideology, g_ideology, pct_urban_in_state, three_year_poverty_rates, pct_students_frpl, teacher_salaries, pres_election_r_support_16, state_spending) %>% 
  corrr::correlate() %>% 
  corrr::fashion() %>% 
  knitr::kable()
## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
rowname n_tweets_for_state n_tweets favorite_count years_on_twitter n_teachers voices pct_urban_in_state three_year_poverty_rates pct_students_frpl teacher_salaries pres_election_r_support_16 unemployment governor_1_gop_0_dem_2_ind c_ideology g_ideology state_spending
n_tweets_for_state .26 -.15 -.10 .35 .69 .09 .10 -.02 -.05 .20 .10 .18 -.24 -.21 .28
n_tweets .26 .19 -.17 .50 .27 .10 .02 -.09 -.01 .07 .09 .01 -.11 -.08 .35
favorite_count -.15 .19 .09 -.13 -.21 .15 -.22 -.23 -.09 -.29 -.03 -.30 .13 .36 -.08
years_on_twitter -.10 -.17 .09 -.14 -.03 -.03 -.14 -.43 -.15 -.11 -.35 -.08 .07 -.00 -.16
n_teachers .35 .50 -.13 -.14 .50 .43 .19 -.12 .35 -.22 .34 -.02 .14 .06 .94
voices .69 .27 -.21 -.03 .50 .09 .01 .02 .18 .04 .11 .07 -.00 -.04 .46
pct_urban_in_state .09 .10 .15 -.03 .43 .09 -.22 -.42 .52 -.52 .26 -.14 .31 .27 .46
three_year_poverty_rates .10 .02 -.22 -.14 .19 .01 -.22 .47 -.35 .44 .60 .42 -.28 -.45 .11
pct_students_frpl -.02 -.09 -.23 -.43 -.12 .02 -.42 .47 -.38 .59 -.06 .39 -.44 -.43 -.18
teacher_salaries -.05 -.01 -.09 -.15 .35 .18 .52 -.35 -.38 -.66 .19 -.34 .70 .67 .55
pres_election_r_support_16 .20 .07 -.29 -.11 -.22 .04 -.52 .44 .59 -.66 .05 .52 -.84 -.77 -.35
unemployment .10 .09 -.03 -.35 .34 .11 .26 .60 -.06 .19 .05 .14 .07 -.09 .33
governor_1_gop_0_dem_2_ind .18 .01 -.30 -.08 -.02 .07 -.14 .42 .39 -.34 .52 .14 -.52 -.81 -.17
c_ideology -.24 -.11 .13 .07 .14 -.00 .31 -.28 -.44 .70 -.84 .07 -.52 .79 .24
g_ideology -.21 -.08 .36 -.00 .06 -.04 .27 -.45 -.43 .67 -.77 -.09 -.81 .79 .25
state_spending .28 .35 -.08 -.16 .94 .46 .46 .11 -.18 .55 -.35 .33 -.17 .24 .25

What stands out here for relating to sum_n?

  • Voices (unique users) matters a great deal
  • N teachers in state, too
  • both ideology measures (congress and governor) seem to matter, sligtly more than R pres support
  • poverty, urbanicity, unemployment matter a bit

Analysis: State level (so n = ~50)

Some linear models

m0 <- glm(sum_n ~ n_teachers, data = state_n, family = "poisson")
sjPlot::tab_model(m0, show.std = TRUE)
  sum n
Predictors Incidence Rate Ratios std. Beta CI standardized CI p
(Intercept) 5688.26 9.00 5663.27 – 5713.33 9.00 – 9.00 <0.001
n_teachers 1.00 0.36 1.00 – 1.00 0.36 – 0.36 <0.001
Observations 47
R2 Nagelkerke 1.000
m00 <- glm(sum_n ~ n_teachers + state_spending, data = state_n, family = "poisson")
sjPlot::tab_model(m00, show.std = TRUE)
  sum n
Predictors Incidence Rate Ratios std. Beta CI standardized CI p
(Intercept) 5675.12 9.00 5650.45 – 5699.88 8.99 – 9.00 <0.001
n_teachers 1.00 0.56 1.00 – 1.00 0.56 – 0.57 <0.001
state_spending 1.00 -0.23 1.00 – 1.00 -0.24 – -0.23 <0.001
Observations 47
R2 Nagelkerke 1.000
m000 <- glm(sum_n ~ n_teachers + state_spending + c_ideology + pct_urban_in_state + three_year_poverty_rates, data = state_n, family = "poisson")
sjPlot::tab_model(m000, show.std = TRUE)
  sum n
Predictors Incidence Rate Ratios std. Beta CI standardized CI p
(Intercept) 27229.41 8.90 26324.18 – 28166.13 8.89 – 8.90 <0.001
n_teachers 1.00 0.23 1.00 – 1.00 0.23 – 0.24 <0.001
state_spending 1.00 0.20 1.00 – 1.00 0.19 – 0.21 <0.001
c_ideology 0.96 -0.66 0.96 – 0.96 -0.67 – -0.66 <0.001
pct_urban_in_state 1.01 0.11 1.01 – 1.01 0.10 – 0.11 <0.001
three_year_poverty_rates 0.99 -0.04 0.99 – 0.99 -0.04 – -0.03 <0.001
Observations 45
R2 Nagelkerke 1.000
m0000 <- glm(sum_n ~ n_teachers + state_spending + c_ideology + pct_urban_in_state + three_year_poverty_rates + voices, data = state_n, family = "poisson")
sjPlot::tab_model(m0000, show.std = TRUE) 
  sum n
Predictors Incidence Rate Ratios std. Beta CI standardized CI p
(Intercept) 803.91 8.65 770.04 – 839.26 8.65 – 8.66 <0.001
n_teachers 1.00 -0.15 1.00 – 1.00 -0.16 – -0.14 <0.001
state_spending 1.00 0.10 1.00 – 1.00 0.09 – 0.11 <0.001
c_ideology 0.97 -0.53 0.97 – 0.97 -0.53 – -0.52 <0.001
pct_urban_in_state 1.03 0.45 1.03 – 1.03 0.44 – 0.45 <0.001
three_year_poverty_rates 1.06 0.17 1.06 – 1.06 0.16 – 0.17 <0.001
voices 1.00 0.72 1.00 – 1.00 0.72 – 0.73 <0.001
Observations 45
R2 Nagelkerke 1.000

What can we take away fom these?

  • Voices matter a great deal, unsurprisingly, positively
  • urbancity and ideology both matter, with less urban, more right-leaning states with more activity
  • n teachers matters little - correlated with voices

Analysis - at the individual level

to_model <- to_model %>% 
    mutate(years_on_twitter = 2020 - lubridate::year(to_model$account_created_at),
         n_tweets_by_user = n)

Model 1 - only the grouping structure (individuals nested in hashtags)

m1 <- glmer(n_tweets ~ 1 + 
              (1|hashtag), 
            data = to_model,family = 'poisson')

performance::icc(m1)
## Warning: mu of 5.2 is too close to zero, estimate of random effect variances may be unreliable.
## # Intraclass Correlation Coefficient
## 
##      Adjusted ICC: 0.545
##   Conditional ICC: 0.545
sjPlot::tab_model(m1, show.std = TRUE)
## No variables could be standardized.
  n tweets
Predictors Incidence Rate Ratios std. Beta CI standardized CI p
(Intercept) 5.19 1.65 4.67 – 5.77 1.54 – 1.75 <0.001
Random Effects
σ2 0.18
τ00 hashtag 0.21
ICC 0.54
N hashtag 53
Observations 72916
Marginal R2 / Conditional R2 0.000 / 0.545

Model 2 - with some user-level independent variables added

m2 <- glmer(n_tweets ~ 1 + 
              
              # individual
              scale(favorite_count) +
              scale(time_of_account) + 
              scale(voices) +
              
              (1|hashtag), 
            data = to_model, 
            family = "poisson")
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue
##  - Rescale variables?
performance::icc(m2)
## Warning: mu of 5.2 is too close to zero, estimate of random effect variances may be unreliable.
## # Intraclass Correlation Coefficient
## 
##      Adjusted ICC: 0.651
##   Conditional ICC: 0.608
sjPlot::tab_model(m2, show.std = TRUE)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue
##  - Rescale variables?
  n tweets
Predictors Incidence Rate Ratios std. Beta CI standardized CI p
(Intercept) 4.47 1.49 3.50 – 5.70 1.24 – 1.75 <0.001
favorite_count 1.18 0.09 1.17 – 1.18 0.09 – 0.09 <0.001
time_of_account 1.18 0.16 1.17 – 1.18 0.16 – 0.17 <0.001
voices 0.98 -0.02 0.81 – 1.20 -0.21 – 0.18 0.866
Random Effects
σ2 0.18
τ00 hashtag 0.33
ICC 0.65
N hashtag 47
Observations 42774
Marginal R2 / Conditional R2 0.066 / 0.674

What do we find? - people who have been on Twitter longer tweet more

Model 3 - state-level variables added

m3 <- glmer(n_tweets ~ 1 + 
              
              # state-level
              c_ideology + 
              pct_urban_in_state + 
              three_year_poverty_rates + 
              
              (1|hashtag), 
            data = to_model, 
            family = "poisson")
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue
##  - Rescale variables?
performance::icc(m3)
## Warning: mu of 5.2 is too close to zero, estimate of random effect variances may be unreliable.
## # Intraclass Correlation Coefficient
## 
##      Adjusted ICC: 0.450
##   Conditional ICC: 0.416
sjPlot::tab_model(m3, show.std = TRUE)
  n tweets
Predictors Incidence Rate Ratios std. Beta CI standardized CI p
(Intercept) 11.53 1.67 8.82 – 15.08 1.56 – 1.78 <0.001
c_ideology 0.99 -0.19 0.98 – 0.99 -0.29 – -0.08 <0.001
pct_urban_in_state 1.01 0.06 1.00 – 1.01 -0.03 – 0.16 0.113
three_year_poverty_rates 0.96 -0.08 0.93 – 0.98 -0.16 – -0.00 0.003
Random Effects
σ2 0.18
τ00 hashtag 0.14
ICC 0.45
N hashtag 45
Observations 63796
Marginal R2 / Conditional R2 0.076 / 0.492
  • ideology matters a great deal, poverty a little

Model 4 - both sets of variables added

m4 <- glmer(n_tweets ~ 1 + 
              
              # individual
              #scale(favorite_count) +
              time_of_account + 
              
              # state-level
              c_ideology + 
              pct_urban_in_state + 
              three_year_poverty_rates + 
              
              (1|hashtag), 
            data = to_model, 
            family = "poisson")
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue
##  - Rescale variables?
performance::icc(m4)
## Warning: mu of 5.2 is too close to zero, estimate of random effect variances may be unreliable.
## # Intraclass Correlation Coefficient
## 
##      Adjusted ICC: 0.564
##   Conditional ICC: 0.480
sjPlot::tab_model(m4, show.std = TRUE)
  n tweets
Predictors Incidence Rate Ratios std. Beta CI standardized CI p
(Intercept) 10.99 1.60 3.69 – 32.76 1.44 – 1.75 <0.001
time_of_account 1.07 0.17 1.07 – 1.08 0.17 – 0.17 <0.001
c_ideology 0.98 -0.24 0.98 – 0.99 -0.38 – -0.10 0.001
pct_urban_in_state 1.01 0.07 1.00 – 1.02 -0.05 – 0.19 0.274
three_year_poverty_rates 0.94 -0.10 0.90 – 1.00 -0.19 – -0.01 0.032
Random Effects
σ2 0.18
τ00 hashtag 0.23
ICC 0.56
N hashtag 45
Observations 42567
Marginal R2 / Conditional R2 0.149 / 0.629

Takeaways

For predicting more users at the state level: - Voices matter a great deal positively - Less urban, more right-leaning states with more users

For predicting how active users are - How long someone has been on Twitter and how many Tweets they’ve posted seems to be associated with greater activity, though I think we need to talk and think through the logic of these - Poorer, more right-leaning states are associated with more activity