Loading, setting up

library(tidyverse)
library(lme4)
library(readxl)

# source("prepare-data.R")
to_model <- read_csv("data-to-model.csv") 
data_to_add <- read_excel("TwitterTeachersData.xlsx") %>% 
  mutate(State = tolower(State),
         State = tools::toTitleCase(State)) %>% 
  janitor::clean_names() %>% 
  rename(State = state)

to_model <- to_model %>% 
  left_join(data_to_add)

to_model <- to_model %>% 
  rename(pct_urban_in_state = `Urban Percentage of State Population in 2015`,
         three_year_poverty_rates = `Three Year Average of State Poverty Rates 2015-2017`, 
         n_students_enrolled = `Enrollment in public ele/sec. education  Digest of Ed Stats. Fall 2015 (projected)`,
         n_students_in_free_reduced_lunch_pgm = `National School Lunch Program FY 2016 (FNS.USDA.GOV)`,
         pres_election_r_support_16 = `Politics: State's support of Trump in 2016`,
         region = `Region (By Census)`,
         teacher_salaries = `Teacher Salaries NEA 2016`,
         state_spending = `State Spending on Public Elementary/Secondary Spending FY 2016 (Census) (in thousands)`,
         n_teachers = Teachers)

to_model <- to_model %>% janitor::clean_names()

to_model <- to_model %>% 
  mutate(pct_students_frpl = n_students_in_free_reduced_lunch_pgm / n_students_enrolled)

This calculates the number of tweets sent to each hashtag

state_n <- to_model %>% 
  group_by(hashtag) %>% 
  summarize(sum_n = sum(n_tweets)) %>% 
  left_join(distinct(to_model, hashtag, .keep_all = TRUE))

## Joining, by = "hashtag"

Correlations at the state level

state_n %>% 
  mutate(years_on_twitter = 2020 - lubridate::year(account_created_at)) %>% 
  select(n_tweets_for_state = sum_n, n_tweets, favorite_count, years_on_twitter, n_teachers, voices,pct_urban_in_state, three_year_poverty_rates, pct_students_frpl, teacher_salaries, pres_election_r_support_16, unemployment, governor_1_gop_0_dem_2_ind, c_ideology, g_ideology, pct_urban_in_state, three_year_poverty_rates, pct_students_frpl, teacher_salaries, pres_election_r_support_16, state_spending) %>% 
  corrr::correlate() %>% 
  corrr::fashion() %>% 
  knitr::kable()

## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'

rowname	n_tweets_for_state	n_tweets	favorite_count	years_on_twitter	n_teachers	voices	pct_urban_in_state	three_year_poverty_rates	pct_students_frpl	teacher_salaries	pres_election_r_support_16	unemployment	governor_1_gop_0_dem_2_ind	c_ideology	g_ideology	state_spending
n_tweets_for_state		.26	-.15	-.10	.35	.69	.09	.10	-.02	-.05	.20	.10	.18	-.24	-.21	.28
n_tweets	.26		.19	-.17	.50	.27	.10	.02	-.09	-.01	.07	.09	.01	-.11	-.08	.35
favorite_count	-.15	.19		.09	-.13	-.21	.15	-.22	-.23	-.09	-.29	-.03	-.30	.13	.36	-.08
years_on_twitter	-.10	-.17	.09		-.14	-.03	-.03	-.14	-.43	-.15	-.11	-.35	-.08	.07	-.00	-.16
n_teachers	.35	.50	-.13	-.14		.50	.43	.19	-.12	.35	-.22	.34	-.02	.14	.06	.94
voices	.69	.27	-.21	-.03	.50		.09	.01	.02	.18	.04	.11	.07	-.00	-.04	.46
pct_urban_in_state	.09	.10	.15	-.03	.43	.09		-.22	-.42	.52	-.52	.26	-.14	.31	.27	.46
three_year_poverty_rates	.10	.02	-.22	-.14	.19	.01	-.22		.47	-.35	.44	.60	.42	-.28	-.45	.11
pct_students_frpl	-.02	-.09	-.23	-.43	-.12	.02	-.42	.47		-.38	.59	-.06	.39	-.44	-.43	-.18
teacher_salaries	-.05	-.01	-.09	-.15	.35	.18	.52	-.35	-.38		-.66	.19	-.34	.70	.67	.55
pres_election_r_support_16	.20	.07	-.29	-.11	-.22	.04	-.52	.44	.59	-.66		.05	.52	-.84	-.77	-.35
unemployment	.10	.09	-.03	-.35	.34	.11	.26	.60	-.06	.19	.05		.14	.07	-.09	.33
governor_1_gop_0_dem_2_ind	.18	.01	-.30	-.08	-.02	.07	-.14	.42	.39	-.34	.52	.14		-.52	-.81	-.17
c_ideology	-.24	-.11	.13	.07	.14	-.00	.31	-.28	-.44	.70	-.84	.07	-.52		.79	.24
g_ideology	-.21	-.08	.36	-.00	.06	-.04	.27	-.45	-.43	.67	-.77	-.09	-.81	.79		.25
state_spending	.28	.35	-.08	-.16	.94	.46	.46	.11	-.18	.55	-.35	.33	-.17	.24	.25

What stands out here for relating to sum_n?

Voices (unique users) matters a great deal
N teachers in state, too
both ideology measures (congress and governor) seem to matter, sligtly more than R pres support
poverty, urbanicity, unemployment matter a bit

Analysis: State level (so n = ~50)

Some linear models

m0 <- glm(sum_n ~ n_teachers, data = state_n, family = "poisson")
sjPlot::tab_model(m0, show.std = TRUE)

	sum n
Predictors	Incidence Rate Ratios	std. Beta	CI	standardized CI	p
(Intercept)	5688.26	9.00	5663.27 – 5713.33	9.00 – 9.00	<0.001
n_teachers	1.00	0.36	1.00 – 1.00	0.36 – 0.36	<0.001
Observations	47
R² Nagelkerke	1.000

m00 <- glm(sum_n ~ n_teachers + state_spending, data = state_n, family = "poisson")
sjPlot::tab_model(m00, show.std = TRUE)

	sum n
Predictors	Incidence Rate Ratios	std. Beta	CI	standardized CI	p
(Intercept)	5675.12	9.00	5650.45 – 5699.88	8.99 – 9.00	<0.001
n_teachers	1.00	0.56	1.00 – 1.00	0.56 – 0.57	<0.001
state_spending	1.00	-0.23	1.00 – 1.00	-0.24 – -0.23	<0.001
Observations	47
R² Nagelkerke	1.000

m000 <- glm(sum_n ~ n_teachers + state_spending + c_ideology + pct_urban_in_state + three_year_poverty_rates, data = state_n, family = "poisson")
sjPlot::tab_model(m000, show.std = TRUE)

	sum n
Predictors	Incidence Rate Ratios	std. Beta	CI	standardized CI	p
(Intercept)	27229.41	8.90	26324.18 – 28166.13	8.89 – 8.90	<0.001
n_teachers	1.00	0.23	1.00 – 1.00	0.23 – 0.24	<0.001
state_spending	1.00	0.20	1.00 – 1.00	0.19 – 0.21	<0.001
c_ideology	0.96	-0.66	0.96 – 0.96	-0.67 – -0.66	<0.001
pct_urban_in_state	1.01	0.11	1.01 – 1.01	0.10 – 0.11	<0.001
three_year_poverty_rates	0.99	-0.04	0.99 – 0.99	-0.04 – -0.03	<0.001
Observations	45
R² Nagelkerke	1.000

m0000 <- glm(sum_n ~ n_teachers + state_spending + c_ideology + pct_urban_in_state + three_year_poverty_rates + voices, data = state_n, family = "poisson")
sjPlot::tab_model(m0000, show.std = TRUE)

	sum n
Predictors	Incidence Rate Ratios	std. Beta	CI	standardized CI	p
(Intercept)	803.91	8.65	770.04 – 839.26	8.65 – 8.66	<0.001
n_teachers	1.00	-0.15	1.00 – 1.00	-0.16 – -0.14	<0.001
state_spending	1.00	0.10	1.00 – 1.00	0.09 – 0.11	<0.001
c_ideology	0.97	-0.53	0.97 – 0.97	-0.53 – -0.52	<0.001
pct_urban_in_state	1.03	0.45	1.03 – 1.03	0.44 – 0.45	<0.001
three_year_poverty_rates	1.06	0.17	1.06 – 1.06	0.16 – 0.17	<0.001
voices	1.00	0.72	1.00 – 1.00	0.72 – 0.73	<0.001
Observations	45
R² Nagelkerke	1.000

What can we take away fom these?

Voices matter a great deal, unsurprisingly, positively
urbancity and ideology both matter, with less urban, more right-leaning states with more activity
n teachers matters little - correlated with voices

Analysis - at the individual level

to_model <- to_model %>% 
    mutate(years_on_twitter = 2020 - lubridate::year(to_model$account_created_at),
         n_tweets_by_user = n)

Model 1 - only the grouping structure (individuals nested in hashtags)

m1 <- glmer(n_tweets ~ 1 + 
              (1|hashtag), 
            data = to_model,family = 'poisson')

performance::icc(m1)

## Warning: mu of 5.2 is too close to zero, estimate of random effect variances may be unreliable.

## # Intraclass Correlation Coefficient
## 
##      Adjusted ICC: 0.545
##   Conditional ICC: 0.545

sjPlot::tab_model(m1, show.std = TRUE)

## No variables could be standardized.

	n tweets
Predictors	Incidence Rate Ratios	std. Beta	CI	standardized CI	p
(Intercept)	5.19	1.65	4.67 – 5.77	1.54 – 1.75	<0.001
Random Effects
σ²	0.18
τ₀₀ _hashtag	0.21
ICC	0.54
N _hashtag	53
Observations	72916
Marginal R² / Conditional R²	0.000 / 0.545

Model 2 - with some user-level independent variables added

m2 <- glmer(n_tweets ~ 1 + 
              
              # individual
              scale(favorite_count) +
              scale(time_of_account) + 
              scale(voices) +
              
              (1|hashtag), 
            data = to_model, 
            family = "poisson")

## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue
##  - Rescale variables?

performance::icc(m2)

## Warning: mu of 5.2 is too close to zero, estimate of random effect variances may be unreliable.

## # Intraclass Correlation Coefficient
## 
##      Adjusted ICC: 0.651
##   Conditional ICC: 0.608

sjPlot::tab_model(m2, show.std = TRUE)

## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue
##  - Rescale variables?

	n tweets
Predictors	Incidence Rate Ratios	std. Beta	CI	standardized CI	p
(Intercept)	4.47	1.49	3.50 – 5.70	1.24 – 1.75	<0.001
favorite_count	1.18	0.09	1.17 – 1.18	0.09 – 0.09	<0.001
time_of_account	1.18	0.16	1.17 – 1.18	0.16 – 0.17	<0.001
voices	0.98	-0.02	0.81 – 1.20	-0.21 – 0.18	0.866
Random Effects
σ²	0.18
τ₀₀ _hashtag	0.33
ICC	0.65
N _hashtag	47
Observations	42774
Marginal R² / Conditional R²	0.066 / 0.674

What do we find? - people who have been on Twitter longer tweet more

Model 3 - state-level variables added

m3 <- glmer(n_tweets ~ 1 + 
              
              # state-level
              c_ideology + 
              pct_urban_in_state + 
              three_year_poverty_rates + 
              
              (1|hashtag), 
            data = to_model, 
            family = "poisson")

## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue
##  - Rescale variables?

performance::icc(m3)

## Warning: mu of 5.2 is too close to zero, estimate of random effect variances may be unreliable.

## # Intraclass Correlation Coefficient
## 
##      Adjusted ICC: 0.450
##   Conditional ICC: 0.416

sjPlot::tab_model(m3, show.std = TRUE)

	n tweets
Predictors	Incidence Rate Ratios	std. Beta	CI	standardized CI	p
(Intercept)	11.53	1.67	8.82 – 15.08	1.56 – 1.78	<0.001
c_ideology	0.99	-0.19	0.98 – 0.99	-0.29 – -0.08	<0.001
pct_urban_in_state	1.01	0.06	1.00 – 1.01	-0.03 – 0.16	0.113
three_year_poverty_rates	0.96	-0.08	0.93 – 0.98	-0.16 – -0.00	0.003
Random Effects
σ²	0.18
τ₀₀ _hashtag	0.14
ICC	0.45
N _hashtag	45
Observations	63796
Marginal R² / Conditional R²	0.076 / 0.492

ideology matters a great deal, poverty a little

Model 4 - both sets of variables added

m4 <- glmer(n_tweets ~ 1 + 
              
              # individual
              #scale(favorite_count) +
              time_of_account + 
              
              # state-level
              c_ideology + 
              pct_urban_in_state + 
              three_year_poverty_rates + 
              
              (1|hashtag), 
            data = to_model, 
            family = "poisson")

## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue
##  - Rescale variables?

performance::icc(m4)

## Warning: mu of 5.2 is too close to zero, estimate of random effect variances may be unreliable.

## # Intraclass Correlation Coefficient
## 
##      Adjusted ICC: 0.564
##   Conditional ICC: 0.480

sjPlot::tab_model(m4, show.std = TRUE)

	n tweets
Predictors	Incidence Rate Ratios	std. Beta	CI	standardized CI	p
(Intercept)	10.99	1.60	3.69 – 32.76	1.44 – 1.75	<0.001
time_of_account	1.07	0.17	1.07 – 1.08	0.17 – 0.17	<0.001
c_ideology	0.98	-0.24	0.98 – 0.99	-0.38 – -0.10	0.001
pct_urban_in_state	1.01	0.07	1.00 – 1.02	-0.05 – 0.19	0.274
three_year_poverty_rates	0.94	-0.10	0.90 – 1.00	-0.19 – -0.01	0.032
Random Effects
σ²	0.18
τ₀₀ _hashtag	0.23
ICC	0.56
N _hashtag	45
Observations	42567
Marginal R² / Conditional R²	0.149 / 0.629

Takeaways

For predicting more users at the state level: - Voices matter a great deal positively - Less urban, more right-leaning states with more users

For predicting how active users are - How long someone has been on Twitter and how many Tweets they’ve posted seems to be associated with greater activity, though I think we need to talk and think through the logic of these - Poorer, more right-leaning states are associated with more activity