Loading, setting up
library(tidyverse)
library(lme4)
library(readxl)
# source("prepare-data.R")
to_model <- read_csv("data-to-model.csv")
data_to_add <- read_excel("TwitterTeachersData.xlsx") %>%
  mutate(State = tolower(State),
         State = tools::toTitleCase(State)) %>%
  janitor::clean_names() %>%
  rename(State = state)

to_model <- to_model %>%
  left_join(data_to_add)

to_model <- to_model %>%
  rename(pct_urban_in_state = `Urban Percentage of State Population in 2015`,
         three_year_poverty_rates = `Three Year Average of State Poverty Rates 2015-2017`,
         n_students_enrolled = `Enrollment in public ele/sec. education Digest of Ed Stats. Fall 2015 (projected)`,
         n_students_in_free_reduced_lunch_pgm = `National School Lunch Program FY 2016 (FNS.USDA.GOV)`,
         pres_election_r_support_16 = `Politics: State's support of Trump in 2016`,
         region = `Region (By Census)`,
         teacher_salaries = `Teacher Salaries NEA 2016`,
         state_spending = `State Spending on Public Elementary/Secondary Spending FY 2016 (Census) (in thousands)`,
         n_teachers = Teachers)

to_model <- to_model %>% janitor::clean_names()

to_model <- to_model %>%
  mutate(pct_students_frpl = n_students_in_free_reduced_lunch_pgm / n_students_enrolled)
This calculates the total number of tweets sent to each (state) hashtag:
state_n <- to_model %>%
  group_by(hashtag) %>%
  summarize(sum_n = sum(n_tweets)) %>%
  left_join(distinct(to_model, hashtag, .keep_all = TRUE))
## Joining, by = "hashtag"
Correlations at the state level
state_n %>%
  mutate(years_on_twitter = 2020 - lubridate::year(account_created_at)) %>%
  select(n_tweets_for_state = sum_n, n_tweets, favorite_count, years_on_twitter,
         n_teachers, voices, pct_urban_in_state, three_year_poverty_rates,
         pct_students_frpl, teacher_salaries, pres_election_r_support_16,
         unemployment, governor_1_gop_0_dem_2_ind, c_ideology, g_ideology,
         state_spending) %>%
  corrr::correlate() %>%
  corrr::fashion() %>%
  knitr::kable()
##
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
| term | n_tweets_for_state | n_tweets | favorite_count | years_on_twitter | n_teachers | voices | pct_urban_in_state | three_year_poverty_rates | pct_students_frpl | teacher_salaries | pres_election_r_support_16 | unemployment | governor_1_gop_0_dem_2_ind | c_ideology | g_ideology | state_spending |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| n_tweets_for_state | | .26 | -.15 | -.10 | .35 | .69 | .09 | .10 | -.02 | -.05 | .20 | .10 | .18 | -.24 | -.21 | .28 |
| n_tweets | .26 | | .19 | -.17 | .50 | .27 | .10 | .02 | -.09 | -.01 | .07 | .09 | .01 | -.11 | -.08 | .35 |
| favorite_count | -.15 | .19 | | .09 | -.13 | -.21 | .15 | -.22 | -.23 | -.09 | -.29 | -.03 | -.30 | .13 | .36 | -.08 |
| years_on_twitter | -.10 | -.17 | .09 | | -.14 | -.03 | -.03 | -.14 | -.43 | -.15 | -.11 | -.35 | -.08 | .07 | -.00 | -.16 |
| n_teachers | .35 | .50 | -.13 | -.14 | | .50 | .43 | .19 | -.12 | .35 | -.22 | .34 | -.02 | .14 | .06 | .94 |
| voices | .69 | .27 | -.21 | -.03 | .50 | | .09 | .01 | .02 | .18 | .04 | .11 | .07 | -.00 | -.04 | .46 |
| pct_urban_in_state | .09 | .10 | .15 | -.03 | .43 | .09 | | -.22 | -.42 | .52 | -.52 | .26 | -.14 | .31 | .27 | .46 |
| three_year_poverty_rates | .10 | .02 | -.22 | -.14 | .19 | .01 | -.22 | | .47 | -.35 | .44 | .60 | .42 | -.28 | -.45 | .11 |
| pct_students_frpl | -.02 | -.09 | -.23 | -.43 | -.12 | .02 | -.42 | .47 | | -.38 | .59 | -.06 | .39 | -.44 | -.43 | -.18 |
| teacher_salaries | -.05 | -.01 | -.09 | -.15 | .35 | .18 | .52 | -.35 | -.38 | | -.66 | .19 | -.34 | .70 | .67 | .55 |
| pres_election_r_support_16 | .20 | .07 | -.29 | -.11 | -.22 | .04 | -.52 | .44 | .59 | -.66 | | .05 | .52 | -.84 | -.77 | -.35 |
| unemployment | .10 | .09 | -.03 | -.35 | .34 | .11 | .26 | .60 | -.06 | .19 | .05 | | .14 | .07 | -.09 | .33 |
| governor_1_gop_0_dem_2_ind | .18 | .01 | -.30 | -.08 | -.02 | .07 | -.14 | .42 | .39 | -.34 | .52 | .14 | | -.52 | -.81 | -.17 |
| c_ideology | -.24 | -.11 | .13 | .07 | .14 | -.00 | .31 | -.28 | -.44 | .70 | -.84 | .07 | -.52 | | .79 | .24 |
| g_ideology | -.21 | -.08 | .36 | -.00 | .06 | -.04 | .27 | -.45 | -.43 | .67 | -.77 | -.09 | -.81 | .79 | | .25 |
| state_spending | .28 | .35 | -.08 | -.16 | .94 | .46 | .46 | .11 | -.18 | .55 | -.35 | .33 | -.17 | .24 | .25 | |
What stands out here as related to sum_n? (A focused view of just the sum_n column follows the list below.)
- Voices (unique users) matters a great deal
- N teachers in state, too
- both ideology measures (congress and governor) seem to matter, slightly more than R pres support
- poverty, urbanicity, unemployment matter a bit
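Since the full matrix is a lot to scan, a focused view of just the sum_n correlations can help; a minimal sketch using corrr::focus(), reusing the same column set as above (not run as part of the analysis):
# Correlations of each state-level variable with total tweets per hashtag only,
# sorted by absolute size; same columns as the full matrix above.
state_n %>%
  mutate(years_on_twitter = 2020 - lubridate::year(account_created_at)) %>%
  select(n_tweets_for_state = sum_n, n_tweets, favorite_count, years_on_twitter,
         n_teachers, voices, pct_urban_in_state, three_year_poverty_rates,
         pct_students_frpl, teacher_salaries, pres_election_r_support_16,
         unemployment, governor_1_gop_0_dem_2_ind, c_ideology, g_ideology,
         state_spending) %>%
  corrr::correlate() %>%
  corrr::focus(n_tweets_for_state) %>%
  arrange(desc(abs(n_tweets_for_state))) %>%
  knitr::kable()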
Analysis: State level (so n = ~50)
Some generalized linear (Poisson) models
m0 <- glm(sum_n ~ n_teachers, data = state_n, family = "poisson")
sjPlot::tab_model(m0, show.std = TRUE)
Outcome: sum n

| Predictors | Incidence Rate Ratios | std. Beta | CI | standardized CI | p |
|---|---|---|---|---|---|
| (Intercept) | 5688.26 | 9.00 | 5663.27 – 5713.33 | 9.00 – 9.00 | <0.001 |
| n_teachers | 1.00 | 0.36 | 1.00 – 1.00 | 0.36 – 0.36 | <0.001 |

Observations: 47. R2 Nagelkerke: 1.000.
m00 <- glm(sum_n ~ n_teachers + state_spending, data = state_n, family = "poisson")
sjPlot::tab_model(m00, show.std = TRUE)
Outcome: sum n

| Predictors | Incidence Rate Ratios | std. Beta | CI | standardized CI | p |
|---|---|---|---|---|---|
| (Intercept) | 5675.12 | 9.00 | 5650.45 – 5699.88 | 8.99 – 9.00 | <0.001 |
| n_teachers | 1.00 | 0.56 | 1.00 – 1.00 | 0.56 – 0.57 | <0.001 |
| state_spending | 1.00 | -0.23 | 1.00 – 1.00 | -0.24 – -0.23 | <0.001 |

Observations: 47. R2 Nagelkerke: 1.000.
m000 <- glm(sum_n ~ n_teachers + state_spending + c_ideology + pct_urban_in_state + three_year_poverty_rates,
            data = state_n, family = "poisson")
sjPlot::tab_model(m000, show.std = TRUE)
Outcome: sum n

| Predictors | Incidence Rate Ratios | std. Beta | CI | standardized CI | p |
|---|---|---|---|---|---|
| (Intercept) | 27229.41 | 8.90 | 26324.18 – 28166.13 | 8.89 – 8.90 | <0.001 |
| n_teachers | 1.00 | 0.23 | 1.00 – 1.00 | 0.23 – 0.24 | <0.001 |
| state_spending | 1.00 | 0.20 | 1.00 – 1.00 | 0.19 – 0.21 | <0.001 |
| c_ideology | 0.96 | -0.66 | 0.96 – 0.96 | -0.67 – -0.66 | <0.001 |
| pct_urban_in_state | 1.01 | 0.11 | 1.01 – 1.01 | 0.10 – 0.11 | <0.001 |
| three_year_poverty_rates | 0.99 | -0.04 | 0.99 – 0.99 | -0.04 – -0.03 | <0.001 |

Observations: 45. R2 Nagelkerke: 1.000.
m0000 <- glm(sum_n ~ n_teachers + state_spending + c_ideology + pct_urban_in_state + three_year_poverty_rates + voices,
             data = state_n, family = "poisson")
sjPlot::tab_model(m0000, show.std = TRUE)
Outcome: sum n

| Predictors | Incidence Rate Ratios | std. Beta | CI | standardized CI | p |
|---|---|---|---|---|---|
| (Intercept) | 803.91 | 8.65 | 770.04 – 839.26 | 8.65 – 8.66 | <0.001 |
| n_teachers | 1.00 | -0.15 | 1.00 – 1.00 | -0.16 – -0.14 | <0.001 |
| state_spending | 1.00 | 0.10 | 1.00 – 1.00 | 0.09 – 0.11 | <0.001 |
| c_ideology | 0.97 | -0.53 | 0.97 – 0.97 | -0.53 – -0.52 | <0.001 |
| pct_urban_in_state | 1.03 | 0.45 | 1.03 – 1.03 | 0.44 – 0.45 | <0.001 |
| three_year_poverty_rates | 1.06 | 0.17 | 1.06 – 1.06 | 0.16 – 0.17 | <0.001 |
| voices | 1.00 | 0.72 | 1.00 – 1.00 | 0.72 – 0.73 | <0.001 |

Observations: 45. R2 Nagelkerke: 1.000.
What can we take away from these?
- Voices matter a great deal, unsurprisingly, and positively
- Urbanicity and ideology both matter, with less urban, more right-leaning states showing more activity
- N teachers matters little once voices is in the model - the two are correlated (see the check below)
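One way to check the reading that n_teachers is largely absorbed by voices is a collinearity check on the fullest state-level model; a quick sketch, assuming the m0000 and state_n objects from above (not run here):
# Variance inflation factors for the fullest state-level model; high values
# for n_teachers and voices would support the "overlapping information" reading.
performance::check_collinearity(m0000)

# The raw state-level correlation between the two predictors (about .50 in
# the matrix above).
cor(state_n$n_teachers, state_n$voices, use = "pairwise.complete.obs")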
Analysis - at the individual level
to_model <- to_model %>%
  mutate(years_on_twitter = 2020 - lubridate::year(account_created_at),
         n_tweets_by_user = n)
Model 2 - with some user-level independent variables added
m2 <- glmer(n_tweets ~ 1 +
              # individual
              scale(favorite_count) +
              scale(time_of_account) +
              scale(voices) +
              (1|hashtag),
            data = to_model,
            family = "poisson")
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue
## - Rescale variables?
performance::icc(m2)
## Warning: mu of 5.2 is too close to zero, estimate of random effect variances may be unreliable.
## # Intraclass Correlation Coefficient
##
## Adjusted ICC: 0.651
## Conditional ICC: 0.608
sjPlot::tab_model(m2, show.std = TRUE)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue
## - Rescale variables?
Outcome: n tweets

| Predictors | Incidence Rate Ratios | std. Beta | CI | standardized CI | p |
|---|---|---|---|---|---|
| (Intercept) | 4.47 | 1.49 | 3.50 – 5.70 | 1.24 – 1.75 | <0.001 |
| favorite_count | 1.18 | 0.09 | 1.17 – 1.18 | 0.09 – 0.09 | <0.001 |
| time_of_account | 1.18 | 0.16 | 1.17 – 1.18 | 0.16 – 0.17 | <0.001 |
| voices | 0.98 | -0.02 | 0.81 – 1.20 | -0.21 – 0.18 | 0.866 |

Random effects: σ2 = 0.18; τ00 hashtag = 0.33; ICC = 0.65; N hashtag = 47. Observations: 42774. Marginal R2 / Conditional R2: 0.066 / 0.674.
What do we find?
- People who have been on Twitter longer tweet more (a coefficient-plot sketch follows below)
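A coefficient (incidence-rate-ratio) plot makes the same point at a glance; a minimal sketch using sjPlot::plot_model() on the m2 object above (plot not shown here):
# Forest plot of IRRs from m2: favorite_count and time_of_account sit above 1,
# while the voices interval straddles 1.
sjPlot::plot_model(m2, show.values = TRUE, value.offset = 0.3) +
  ggplot2::ggtitle("m2: user-level predictors of n_tweets")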
Model 3 - state-level variables added
m3 <- glmer(n_tweets ~ 1 +
              # state-level
              c_ideology +
              pct_urban_in_state +
              three_year_poverty_rates +
              (1|hashtag),
            data = to_model,
            family = "poisson")
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue
## - Rescale variables?
performance::icc(m3)
## Warning: mu of 5.2 is too close to zero, estimate of random effect variances may be unreliable.
## # Intraclass Correlation Coefficient
##
## Adjusted ICC: 0.450
## Conditional ICC: 0.416
sjPlot::tab_model(m3, show.std = TRUE)
Outcome: n tweets

| Predictors | Incidence Rate Ratios | std. Beta | CI | standardized CI | p |
|---|---|---|---|---|---|
| (Intercept) | 11.53 | 1.67 | 8.82 – 15.08 | 1.56 – 1.78 | <0.001 |
| c_ideology | 0.99 | -0.19 | 0.98 – 0.99 | -0.29 – -0.08 | <0.001 |
| pct_urban_in_state | 1.01 | 0.06 | 1.00 – 1.01 | -0.03 – 0.16 | 0.113 |
| three_year_poverty_rates | 0.96 | -0.08 | 0.93 – 0.98 | -0.16 – -0.00 | 0.003 |

Random effects: σ2 = 0.18; τ00 hashtag = 0.14; ICC = 0.45; N hashtag = 45. Observations: 63796. Marginal R2 / Conditional R2: 0.076 / 0.492.
- ideology matters a great deal, poverty a little
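The lme4 warning above ("very large eigenvalue - Rescale variables?") reflects the state-level predictors being on very different scales; a hedged sketch of a rescaled refit (m3_scaled is a name introduced here, not an object used elsewhere):
# Refit m3 with standardized state-level predictors to address the
# "Rescale variables?" convergence warning; IRRs are then per 1 SD of each predictor.
m3_scaled <- glmer(n_tweets ~ 1 +
                     scale(c_ideology) +
                     scale(pct_urban_in_state) +
                     scale(three_year_poverty_rates) +
                     (1|hashtag),
                   data = to_model,
                   family = "poisson")
sjPlot::tab_model(m3_scaled, show.std = TRUE)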
Model 4 - both sets of variables added
m4 <- glmer(n_tweets ~ 1 +
              # individual
              # scale(favorite_count) +
              time_of_account +
              # state-level
              c_ideology +
              pct_urban_in_state +
              three_year_poverty_rates +
              (1|hashtag),
            data = to_model,
            family = "poisson")
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue
## - Rescale variables?
performance::icc(m4)
## Warning: mu of 5.2 is too close to zero, estimate of random effect variances may be unreliable.
## # Intraclass Correlation Coefficient
##
## Adjusted ICC: 0.564
## Conditional ICC: 0.480
sjPlot::tab_model(m4, show.std = TRUE)
Outcome: n tweets

| Predictors | Incidence Rate Ratios | std. Beta | CI | standardized CI | p |
|---|---|---|---|---|---|
| (Intercept) | 10.99 | 1.60 | 3.69 – 32.76 | 1.44 – 1.75 | <0.001 |
| time_of_account | 1.07 | 0.17 | 1.07 – 1.08 | 0.17 – 0.17 | <0.001 |
| c_ideology | 0.98 | -0.24 | 0.98 – 0.99 | -0.38 – -0.10 | 0.001 |
| pct_urban_in_state | 1.01 | 0.07 | 1.00 – 1.02 | -0.05 – 0.19 | 0.274 |
| three_year_poverty_rates | 0.94 | -0.10 | 0.90 – 1.00 | -0.19 – -0.01 | 0.032 |

Random effects: σ2 = 0.18; τ00 hashtag = 0.23; ICC = 0.56; N hashtag = 45. Observations: 42567. Marginal R2 / Conditional R2: 0.149 / 0.629.
Takeaways
For predicting more users at the state level:
- Voices matter a great deal, positively (see the sketch below)
- Less urban, more right-leaning states have more users

For predicting how active users are:
- How long someone has been on Twitter and how many tweets they've posted seem to be associated with greater activity, though I think we need to talk and think through the logic of these
- Poorer, more right-leaning states are associated with more activity
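As a quick visual check on the state-level takeaway, a scatterplot of total tweets against unique users, shaded by congressional ideology; an exploratory sketch only, not part of the models above:
# States with more unique participants ("voices") accumulate more tweets;
# color encodes c_ideology. Log scales help spread out the count variables.
state_n %>%
  ggplot(aes(x = voices, y = sum_n, color = c_ideology)) +
  geom_point() +
  scale_x_log10() +
  scale_y_log10() +
  labs(x = "Unique users (voices)",
       y = "Total tweets to state hashtag (sum_n)",
       color = "c_ideology")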