Todo:

1 Loading, setting up

# Packages used by the analysis below
library(tidyverse)
library(igraph)
library(lme4)
library(sjstats)
library(rtweet)
library(rTAGS) # install with devtools::install_github('bretsw/rTAGS')
library(brms)
# source('prepare-data.R') # this creates the two files with 'to-analyze' in their name, as read below

1.1 Getting data from Open Science Framework

For notes on this one-time setup, see this walkthrough: http://centerforopenscience.github.io/osfr/articles/auth.html

First, you must generate an API token from an OSF account that has been added to the data repository. Read how to do this here: https://developer.osf.io/#tag/General-Usage

Then, you need to add the OSF API token to the .Renviron file, which is created using the following command. Here, the file is created at the user level, although this could also be set to the project level.

# Open the user-scope R environment file for editing
usethis::edit_r_environ(scope = "user")

Open the .Renviron file and add a single line, using this exact text but replacing <token> with your OSF API token:
OSF_PAT="<token>"

Save the file, quit R, and restart in a new session. Continue running the R script from here.

1.2 Loading the data

Now, install the osfr package and load the library:

library(osfr)

Upon loading the osfr package, you should see this message:
Automatically registered OSF personal access token.

Now you are able to retrieve and download the relevant datasets with this code:

# Fetch each dataset from OSF into the working directory, overwriting any
# existing local copy.

# all-ngsschat-tweets (saved locally as an .rds file)
osf_download(
  osf_retrieve_file("https://osf.io/k2w6t/"),
  path = "ngsschat-tweets-14-15.rds",
  overwrite = TRUE
)

# orig-pre
osf_download(
  osf_retrieve_file("https://osf.io/nj8yk/"),
  path = "orig-pre-14.csv",
  overwrite = TRUE
)

# orig-post
osf_download(
  osf_retrieve_file("https://osf.io/ngwpt/"),
  path = "orig-post-15.csv",
  overwrite = TRUE
)

# users-to-analyze
osf_download(
  osf_retrieve_file("https://osf.io/jz7p6/"),
  path = "users-to-analyze.csv",
  overwrite = TRUE
)

# edgelist-to-analyze
osf_download(
  osf_retrieve_file("https://osf.io/sbyn9/"),
  path = "edgelist-to-analyze.csv",
  overwrite = TRUE
)

# Read the downloaded files into memory
orig <- read_rds("ngsschat-tweets-14-15.rds") # original tweet data
# "pre" file: presumably the year before the 2014-15 window (TODO confirm)
orig_pre <- read_csv("orig-pre-14.csv")
# "post" file: presumably the year after the 2014-15 window (TODO confirm)
orig_post <- read_csv("orig-post-15.csv")
users <- read_csv("users-to-analyze.csv") # processed user data
edge <- read_csv("edgelist-to-analyze.csv") # processed edgelist data

2 Time series plot

# Tweet volume over time, with the 2014-08-01 to 2015-07-31 window marked by
# two vertical lines and a shaded band.
ts_plot(orig) +
  geom_vline(xintercept = as.POSIXct(as.Date("2014-08-01"))) +
  geom_vline(xintercept = as.POSIXct(as.Date("2015-07-31"))) +
  theme_bw() +
  xlab("Day") +
  ylab("Number of Tweets including #NGSSchat per day") +
  xlim(c(as.POSIXct(as.Date("2012-01-01")), as.POSIXct(as.Date("2017-12-31")))) +
  # annotate() draws the band exactly once. A geom_rect() layer with constant
  # aesthetics inherits the plot data and draws one rectangle per data row,
  # so the stacked rectangles end up far more opaque than the alpha suggests.
  annotate("rect",
           xmin = as.POSIXct(as.Date("2014-08-01")),
           xmax = as.POSIXct(as.Date("2015-07-31")),
           ymin = -Inf,
           ymax = Inf,
           fill = "cyan3", alpha = 0.2)

3 Location plot

# loc <- osm_geocode(users$location, key = MQ_API_KEY)

# l <- as.list(users$location) %>% 
#   map(osm_geocode, key = MQ_API_KEY)
# 
# write_rds(l, "geocoded-locations.rds")
# Load the cached geocoding results (one list element per users$location)
l <- read_rds("geocoded-locations.rds")

# From every geocoding result take a single field; results that do not have
# the field (NULL) are replaced by NA so each list keeps one entry per user.
lats <- modify_if(purrr::map(l, function(res) res$lat),
                  is.null,
                  function(x) NA)

lons <- modify_if(purrr::map(l, function(res) res$lon),
                  is.null,
                  function(x) NA)

display_name <- modify_if(purrr::map(l, function(res) res$display_name),
                          is.null,
                          function(x) NA)

# Flatten the lists and attach them to the user table
users[["lat"]] <- unlist(lats)
users[["lon"]] <- unlist(lons)
users[["display_name"]] <- unlist(display_name)

# State outlines used as the base layer of the map
states <- map_data("state")

# Map of users whose geocoded display name contains "United States";
# point colour shows the user's group and point size their n_tweets.
ggplot(data = states) +
  geom_polygon(aes(x = long, y = lat, group = group),
               color = "black", fill = "white") +
  geom_point(data = filter(users, str_detect(display_name, "United States")),
             aes(x = lon, y = lat, color = group, size = n_tweets)) +
  # "none" replaces the deprecated `FALSE`; this suppresses any fill legend
  guides(fill = "none") +
  # a plot has a single coordinate system: the former coord_fixed(1.3) was
  # replaced by this call anyway, so only coord_map() is kept
  coord_map("stereographic") +
  ggthemes::theme_map() +
  scale_color_viridis_d("Group") +
  scale_size_continuous("Number of Original Tweets ('14-'15)")

4 Descriptive stats

4.1 Table of descriptive stats

# Drop account_lang and user_id (original note: "this var seems to be all NA")
users <- users %>% select(-account_lang, -user_id)

# overall: original (non-retweet) tweets per user, keeping only users that
# appear in `users` with n_tweets of at least 2
orig %>%
  filter(!is_retweet) %>%
  mutate(screen_name = tolower(screen_name)) %>%
  left_join(users) %>%
  semi_join(users) %>%
  filter(n_tweets >= 2) %>%
  count(screen_name, name = "n_tweets")
## # A tibble: 191 x 2
##    screen_name     n_tweets
##    <chr>              <int>
##  1 21stscied             13
##  2 2footgiraffe          58
##  3 achavez_science        3
##  4 adchempages           28
##  5 aeolani                8
##  6 ajollygal              2
##  7 aliciajohal            5
##  8 all4ed                 2
##  9 alynnmeyer            34
## 10 amycoyote              9
## # … with 181 more rows
# Median, mean and SD of the per-user original-tweet counts
orig %>%
  filter(!is_retweet) %>%
  mutate(screen_name = tolower(screen_name)) %>%
  left_join(users) %>%
  semi_join(users) %>%
  filter(n_tweets >= 2) %>%
  count(screen_name) %>%
  summarise(
    median_n = median(n),
    mean_n = mean(n),
    sd_n = sd(n)
  )
## # A tibble: 1 x 3
##   median_n mean_n  sd_n
##      <int>  <dbl> <dbl>
## 1       12   33.3  90.7
# by group

# Original tweets from users in `users` with n_tweets >= 2 and a known group.
# screen_name is lower-cased once, before the joins.
du <- orig %>%
  filter(!is_retweet) %>%
  mutate(screen_name = tolower(screen_name)) %>%
  left_join(users) %>%
  semi_join(users) %>%
  filter(n_tweets >= 2, !is.na(group))

# Number of distinct tweeters in each group and each group's share of all
# tweeters, largest group first
n_tweeters_by_group <- du %>%
  filter(!is_retweet) %>%
  distinct(group, screen_name) %>%
  count(group) %>%
  rename(n_tweeters = n) %>%
  mutate(n_prop = n_tweeters / sum(n_tweeters)) %>%
  arrange(desc(n_tweeters))

# Total number of original tweets in each group
n_tweets_by_group <- du %>%
  filter(!is_retweet) %>%
  count(group) %>%
  rename(sum_n_tweets = n)

# this is individual engagement by group - probably what we want to report:
# mean and SD of tweets per user within each group, next to the group totals
fin_df <- du %>%
  filter(!is_retweet) %>%
  count(group, screen_name) %>%
  group_by(group) %>%
  summarise(
    mean_n_tweets = mean(n),
    sd_n_tweets = sd(n)
  ) %>%
  right_join(n_tweeters_by_group) %>%
  right_join(n_tweets_by_group) %>%
  select(group, sum_n_tweets, n_tweeters, mean_n_tweets, sd_n_tweets) %>%
  arrange(desc(sum_n_tweets))

fin_df
## # A tibble: 6 x 5
##   group         sum_n_tweets n_tweeters mean_n_tweets sd_n_tweets
##   <chr>                <int>      <int>         <dbl>       <dbl>
## 1 Teacher               2653         78         34.0        90.9 
## 2 Administrator         2421         46         52.6       135.  
## 3 Researcher             600         14         42.9        54.9 
## 4 Other                  441         22         20.0        29.2 
## 5 Unclear                184         23          8           7.53
## 6 Organization            67          8          8.38        4.50

4.2 Testing the diffs

# Chi-squared test of the group tweet totals against equal probabilities:
# there are sig diffs across sum_n_tweets
chisq.test(x = fin_df$sum_n_tweets)
## 
##  Chi-squared test for given probabilities
## 
## data:  fin_df$sum_n_tweets
## X-squared = 6350.8, df = 5, p-value < 2.2e-16
# Keep the test object to inspect the standardized residual of each group
cst <- chisq.test(x = fin_df$sum_n_tweets)
cst[["stdres"]]
## [1]  53.53971  45.73744 -15.50365 -20.85089 -29.49392 -33.42869
# Compare the Teacher and Administrator tweet totals only. The two rows are
# picked by group label (in that order) rather than by position, so the test
# does not silently switch groups if fin_df is sorted differently.
cst <- chisq.test(
  fin_df$sum_n_tweets[match(c("Teacher", "Administrator"), fin_df$group)]
)
cst$stdres # there does seem to be a diff in n between teachers and admin
## [1]  3.256962 -3.256962

5 Interactions

5.1 Conversing

# Edges of type "conversing"
dc <- filter(edge, interaction_type == "conversing")

# Adjacency matrix of the conversing network: sender is row, receiver is column
g <- graph_from_data_frame(dc)
m <- as_adjacency_matrix(g, sparse = FALSE)

# Long format: one row per ordered sender-receiver pair, val = matrix cell
t <- as.data.frame(m) %>%
  rownames_to_column("sender") %>%
  gather(receiver, val, -sender) %>%
  as_tibble()

# Attach user attributes, give users without a code the code 11, and map the
# numeric codes onto group labels for sender and receiver
tt <- add_users_data(t, users) %>%
  mutate(
    code_sender = ifelse(is.na(code_sender), 11, code_sender),
    code_receiver = ifelse(is.na(code_receiver), 11, code_receiver)
  ) %>%
  # filter(code_sender != 11 & code_receiver !=11) %>%
  mutate(
    group_sender = recode(code_sender,
                          `1` = "Teacher",
                          `2` = "Administrator",
                          `3` = "Administrator",
                          `4` = "Researcher",
                          `5` = "Other",
                          `6` = "Organization",
                          `7` = "Organization",
                          `8` = "Other",
                          `9` = "Other",
                          `10` = "Other",
                          `11` = "Other"),
    group_receiver = recode(code_receiver,
                            `1` = "Teacher",
                            `2` = "Administrator",
                            `3` = "Administrator",
                            `4` = "Researcher",
                            `5` = "Other",
                            `6` = "Organization",
                            `7` = "Organization",
                            `8` = "Other",
                            `9` = "Other",
                            `10` = "Other",
                            `11` = "Other")
  )

# "Other" becomes the reference level of both group factors
tt$group_sender <- fct_relevel(as.factor(tt$group_sender), "Other")
tt$group_receiver <- fct_relevel(as.factor(tt$group_receiver), "Other")
# dic: 1 when the pair has any tie; same: 1 when both users share a group
tt$dic <- as.numeric(tt$val > 0)
tt$same <- as.numeric(tt$group_sender == tt$group_receiver)
# Intercept-only (null) Poisson model of conversing ties, with varying
# intercepts for sender and receiver.
# Stored as mc0: the model fitted next in this script is also assigned to
# mc1, which would otherwise overwrite this null model and make it
# unavailable for later comparison.
# NOTE(review): the rendered summary of this fit shows Rhat up to 1.06 and
# effective sample sizes below 100; consider more iterations and a fixed seed.
mc0 <- brm(val ~ 1 +
             (1|sender) + (1|receiver),
           iter = 800, chains = 3, cores = 3,
           control = list(adapt_delta = 0.99,
                          max_treedepth = 10),
           family = 'poisson',
           data = tt)

summary(mc0)
##  Family: poisson 
##   Links: mu = log 
## Formula: val ~ 1 + (1 | sender) + (1 | receiver) 
##    Data: tt (Number of observations: 117649) 
## Samples: 3 chains, each with iter = 800; warmup = 400; thin = 1;
##          total post-warmup samples = 1200
## 
## Group-Level Effects: 
## ~receiver (Number of levels: 343) 
##               Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## sd(Intercept)     2.42      0.15     2.15     2.71         71 1.02
## 
## ~sender (Number of levels: 242) 
##               Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## sd(Intercept)     1.86      0.11     1.67     2.11         36 1.06
## 
## Population-Level Effects: 
##           Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## Intercept    -6.12      0.18    -6.48    -5.79         70 1.03
## 
## Samples were drawn using sampling(NUTS). For each parameter, Eff.Sample 
## is a crude measure of effective sample size, and Rhat is the potential 
## scale reduction factor on split chains (at convergence, Rhat = 1).
# Conversing model with sender group and receiver group as population-level
# effects, plus varying intercepts for sender and receiver.
# NOTE(review): the rendered summary of this fit reports Rhat values above
# 1.1 for several coefficients, so the chains may not have converged.
mc1 <- brm(
  formula = val ~ 1 + group_sender + group_receiver + (1|sender) + (1|receiver),
  data = tt,
  family = "poisson",
  chains = 3, cores = 3, iter = 800,
  control = list(adapt_delta = 0.99, max_treedepth = 10)
)

summary(mc1)
##  Family: poisson 
##   Links: mu = log 
## Formula: val ~ 1 + group_sender + group_receiver + (1 | sender) + (1 | receiver) 
##    Data: tt (Number of observations: 117649) 
## Samples: 3 chains, each with iter = 800; warmup = 400; thin = 1;
##          total post-warmup samples = 1200
## 
## Group-Level Effects: 
## ~receiver (Number of levels: 343) 
##               Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## sd(Intercept)     2.37      0.14     2.09     2.61         48 1.05
## 
## ~sender (Number of levels: 242) 
##               Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## sd(Intercept)     1.85      0.11     1.66     2.07         54 1.04
## 
## Population-Level Effects: 
##                             Estimate Est.Error l-95% CI u-95% CI
## Intercept                      -7.16      0.38    -7.90    -6.37
## group_senderAdministrator       0.71      0.37    -0.03     1.39
## group_senderOrganization        0.03      0.54    -1.01     1.11
## group_senderResearcher          0.83      0.46     0.04     1.76
## group_senderTeacher             0.63      0.32    -0.09     1.23
## group_receiverAdministrator     0.80      0.44    -0.03     1.61
## group_receiverOrganization     -0.13      0.70    -1.40     1.37
## group_receiverResearcher        1.10      0.57    -0.03     2.18
## group_receiverTeacher           0.91      0.36     0.07     1.60
##                             Eff.Sample Rhat
## Intercept                           20 1.11
## group_senderAdministrator           56 1.07
## group_senderOrganization           105 1.01
## group_senderResearcher              88 1.06
## group_senderTeacher                 15 1.15
## group_receiverAdministrator         34 1.08
## group_receiverOrganization          43 1.04
## group_receiverResearcher            57 1.08
## group_receiverTeacher                9 1.27
## 
## Samples were drawn using sampling(NUTS). For each parameter, Eff.Sample 
## is a crude measure of effective sample size, and Rhat is the potential 
## scale reduction factor on split chains (at convergence, Rhat = 1).
# Adds `same` (sender and receiver belong to the same group) to the
# group-effects conversing model.
mc2 <- brm(
  formula = val ~ 1 + group_sender + group_receiver + same +
    (1|sender) + (1|receiver),
  data = tt,
  family = "poisson",
  chains = 3, cores = 3, iter = 800,
  control = list(adapt_delta = 0.99, max_treedepth = 10)
)

summary(mc2)
##  Family: poisson 
##   Links: mu = log 
## Formula: val ~ 1 + group_sender + group_receiver + same + (1 | sender) + (1 | receiver) 
##    Data: tt (Number of observations: 117649) 
## Samples: 3 chains, each with iter = 800; warmup = 400; thin = 1;
##          total post-warmup samples = 1200
## 
## Group-Level Effects: 
## ~receiver (Number of levels: 343) 
##               Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## sd(Intercept)     2.37      0.13     2.14     2.64        113 1.02
## 
## ~sender (Number of levels: 242) 
##               Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## sd(Intercept)     1.87      0.11     1.68     2.14         73 1.04
## 
## Population-Level Effects: 
##                             Estimate Est.Error l-95% CI u-95% CI
## Intercept                      -7.13      0.41    -7.90    -6.26
## group_senderAdministrator       0.75      0.40    -0.07     1.58
## group_senderOrganization       -0.07      0.62    -1.39     1.16
## group_senderResearcher          0.79      0.57    -0.39     1.79
## group_senderTeacher             0.63      0.33    -0.07     1.27
## group_receiverAdministrator     0.74      0.39    -0.09     1.44
## group_receiverOrganization     -0.14      0.64    -1.41     1.10
## group_receiverResearcher        1.18      0.56     0.12     2.23
## group_receiverTeacher           0.91      0.35     0.24     1.61
## same                            0.09      0.02     0.05     0.14
##                             Eff.Sample Rhat
## Intercept                           15 1.23
## group_senderAdministrator           47 1.07
## group_senderOrganization            92 1.03
## group_senderResearcher              72 1.01
## group_senderTeacher                 45 1.02
## group_receiverAdministrator         37 1.14
## group_receiverOrganization          51 1.07
## group_receiverResearcher            84 1.02
## group_receiverTeacher               22 1.17
## same                              1229 1.00
## 
## Samples were drawn using sampling(NUTS). For each parameter, Eff.Sample 
## is a crude measure of effective sample size, and Rhat is the potential 
## scale reduction factor on split chains (at convergence, Rhat = 1).
# Full conversing model: standardized years on Twitter and tweet counts for
# both sender and receiver, their groups, and the same-group indicator.
mc3 <- brm(
  formula = val ~ 1 +
    scale(years_on_twitter_sender) + scale(n_tweets_sender) + group_sender +
    scale(years_on_twitter_receiver) + scale(n_tweets_receiver) + group_receiver +
    same +
    (1|sender) + (1|receiver),
  data = tt,
  family = "poisson",
  chains = 3, cores = 3, iter = 1800,
  control = list(adapt_delta = 0.99, max_treedepth = 10)
)

summary(mc3)
##  Family: poisson 
##   Links: mu = log 
## Formula: val ~ 1 + scale(years_on_twitter_sender) + scale(n_tweets_sender) + group_sender + scale(years_on_twitter_receiver) + scale(n_tweets_receiver) + group_receiver + same + (1 | sender) + (1 | receiver) 
##    Data: tt (Number of observations: 117649) 
## Samples: 3 chains, each with iter = 1800; warmup = 900; thin = 1;
##          total post-warmup samples = 2700
## 
## Group-Level Effects: 
## ~receiver (Number of levels: 343) 
##               Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## sd(Intercept)     2.29      0.13     2.06     2.57        293 1.00
## 
## ~sender (Number of levels: 242) 
##               Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## sd(Intercept)     1.53      0.09     1.36     1.72        208 1.03
## 
## Population-Level Effects: 
##                                Estimate Est.Error l-95% CI u-95% CI
## Intercept                         -6.61      0.33    -7.29    -5.96
## scaleyears_on_twitter_sender       0.24      0.09     0.05     0.43
## scalen_tweets_sender               1.03      0.12     0.78     1.27
## group_senderAdministrator          0.28      0.30    -0.27     0.88
## group_senderOrganization          -0.32      0.48    -1.22     0.62
## group_senderResearcher             0.35      0.41    -0.44     1.13
## group_senderTeacher                0.23      0.26    -0.24     0.74
## scaleyears_on_twitter_receiver     0.28      0.14    -0.01     0.55
## scalen_tweets_receiver             0.38      0.13     0.12     0.62
## group_receiverAdministrator        0.54      0.38    -0.20     1.25
## group_receiverOrganization        -0.28      0.58    -1.36     0.92
## group_receiverResearcher           0.88      0.57    -0.19     2.03
## group_receiverTeacher              0.75      0.34     0.07     1.38
## same                               0.09      0.02     0.05     0.14
##                                Eff.Sample Rhat
## Intercept                             104 1.02
## scaleyears_on_twitter_sender          243 1.01
## scalen_tweets_sender                  291 1.01
## group_senderAdministrator             186 1.00
## group_senderOrganization              270 1.01
## group_senderResearcher                308 1.02
## group_senderTeacher                   188 1.01
## scaleyears_on_twitter_receiver        133 1.01
## scalen_tweets_receiver                488 1.01
## group_receiverAdministrator           127 1.01
## group_receiverOrganization            196 1.02
## group_receiverResearcher              136 1.01
## group_receiverTeacher                 100 1.02
## same                                 3669 1.00
## 
## Samples were drawn using sampling(NUTS). For each parameter, Eff.Sample 
## is a crude measure of effective sample size, and Rhat is the potential 
## scale reduction factor on split chains (at convergence, Rhat = 1).

5.2 Endorsing

# Edges of type "endorsing"
dc <- filter(edge, interaction_type == "endorsing")

# Adjacency matrix of the endorsing network: sender is row, receiver is column
g <- graph_from_data_frame(dc)
m <- as_adjacency_matrix(g, sparse = FALSE)

# Long format: one row per ordered sender-receiver pair, val = matrix cell
t <- as.data.frame(m) %>%
  rownames_to_column("sender") %>%
  gather(receiver, val, -sender) %>%
  as_tibble()

# Attach user attributes, give users without a code the code 11, and map the
# numeric codes onto group labels for sender and receiver
tt <- add_users_data(t, users) %>%
  mutate(
    code_sender = ifelse(is.na(code_sender), 11, code_sender),
    code_receiver = ifelse(is.na(code_receiver), 11, code_receiver)
  ) %>%
  # filter(code_sender != 11 & code_receiver !=11) %>%
  mutate(
    group_sender = recode(code_sender,
                          `1` = "Teacher",
                          `2` = "Administrator",
                          `3` = "Administrator",
                          `4` = "Researcher",
                          `5` = "Other",
                          `6` = "Organization",
                          `7` = "Organization",
                          `8` = "Other",
                          `9` = "Other",
                          `10` = "Other",
                          `11` = "Other"),
    group_receiver = recode(code_receiver,
                            `1` = "Teacher",
                            `2` = "Administrator",
                            `3` = "Administrator",
                            `4` = "Researcher",
                            `5` = "Other",
                            `6` = "Organization",
                            `7` = "Organization",
                            `8` = "Other",
                            `9` = "Other",
                            `10` = "Other",
                            `11` = "Other")
  )

# "Other" becomes the reference level of both group factors
tt$group_sender <- fct_relevel(as.factor(tt$group_sender), "Other")
tt$group_receiver <- fct_relevel(as.factor(tt$group_receiver), "Other")
# same: 1 when sender and receiver share a group
tt$same <- as.numeric(tt$group_sender == tt$group_receiver)
# Intercept-only (null) Poisson model of endorsing ties, with varying
# intercepts for sender and receiver.
# Stored as me0: the model fitted next in this script is also assigned to
# me1, which would otherwise overwrite this null model and make it
# unavailable for later comparison.
me0 <- brm(val ~ 1 +
             (1|sender) + (1|receiver),
           iter = 600, chains = 3, cores = 3,
           family = 'poisson',
           data = tt)

summary(me0)

# Endorsing models, built up in the same steps as the conversing models.
# NOTE(review): these fits use iter = 600 and the default sampler control,
# whereas the conversing fits set adapt_delta = 0.99 - confirm this
# difference is intended.

# Sender and receiver group effects
me1 <- brm(
  formula = val ~ 1 + group_sender + group_receiver + (1|sender) + (1|receiver),
  data = tt,
  family = "poisson",
  chains = 3, cores = 3, iter = 600
)

summary(me1)

# ... plus the same-group indicator
me2 <- brm(
  formula = val ~ 1 + group_sender + group_receiver + same +
    (1|sender) + (1|receiver),
  data = tt,
  family = "poisson",
  chains = 3, cores = 3, iter = 600
)

summary(me2)

# ... plus standardized years on Twitter and tweet counts for both users
me3 <- brm(
  formula = val ~ 1 +
    scale(years_on_twitter_sender) + scale(n_tweets_sender) + group_sender +
    scale(years_on_twitter_receiver) + scale(n_tweets_receiver) + group_receiver +
    same +
    (1|sender) + (1|receiver),
  data = tt,
  family = "poisson",
  chains = 3, cores = 3, iter = 600
)

summary(me3)

6 Influence

# Pre period: number of distinct days on which each user posted an original
# (non-retweet) tweet
n_days <- orig_pre %>%
  mutate(screen_name = tolower(screen_name)) %>%
  filter(!is_retweet) %>%
  count(screen_name, day) %>%
  count(screen_name) %>%
  select(screen_name, pre_n_days = n)

# Pre period: original tweets per user, joined with the day counts.
# Note: orig_pre is overwritten by this per-user summary, so this section
# cannot be re-run without reloading the raw file.
orig_pre <- orig_pre %>%
  mutate(screen_name = tolower(screen_name)) %>%
  filter(!is_retweet) %>%
  count(screen_name) %>%
  select(screen_name, pre_n = n) %>%
  left_join(n_days)

# Post period: same two summaries (n_days is reused for the post-period counts)
n_days <- orig_post %>%
  mutate(screen_name = tolower(screen_name)) %>%
  filter(!is_retweet) %>%
  count(screen_name, day) %>%
  count(screen_name) %>%
  select(screen_name, post_n_days = n)

orig_post <- orig_post %>%
  mutate(screen_name = tolower(screen_name)) %>%
  filter(!is_retweet) %>%
  count(screen_name) %>%
  select(screen_name, post_n = n) %>%
  left_join(n_days)

# One row per user with n_tweets > 1, carrying pre/post activity counts.
# Users without pre- or post-period tweets get 0 instead of NA. Only the
# numeric count columns are filled: applying a numeric replacement to every
# column (including the character screen_name) is an error in current tidyr.
d_for_influence <- users %>%
  left_join(orig_pre) %>%
  left_join(orig_post) %>%
  distinct(screen_name, .keep_all = TRUE) %>%
  select(screen_name, pre_n, pre_n_days, n_tweets, post_n, post_n_days) %>%
  filter(n_tweets > 1) %>%
  replace_na(list(pre_n = 0,
                  pre_n_days = 0,
                  n_tweets = 0,
                  post_n = 0,
                  post_n_days = 0))
# Key the pre-period counts by `sender` so they can be joined onto the edgelist
orig_pre <- rename(orig_pre, sender = screen_name)

# Exposure through "endorsing" ties: for each sender-receiver pair, the number
# of edgelist rows (n) times the sender's pre_n, summed per receiver.
# Senders with no pre_n after the join contribute nothing (na.rm = TRUE).
# The right_join keeps every user in d_for_influence; users who received no
# such ties get an exposure of 0. Joins have no `by`, so they use all columns
# the two tables share.
influence_endorsing <- edge %>% 
  filter(interaction_type == "endorsing") %>% 
  count(sender, receiver) %>% 
  left_join(orig_pre) %>% 
  mutate(exposure = n * pre_n) %>% 
  group_by(receiver) %>% 
  summarize(exposure_sum_end = sum(exposure, na.rm = TRUE)) %>% 
  rename(screen_name = receiver) %>% 
  right_join(d_for_influence) %>% 
  mutate(exposure_sum_end = replace_na(exposure_sum_end, 0)) %>% 
  left_join(users) %>% 
  # collapse "Unclear" and "Uncoded" into "Other"
  mutate(group= ifelse(group %in% c("Other", "Unclear", "Uncoded"), "Other", group))

# Same computation for "conversing" ties
influence_conversing <- edge %>% 
  filter(interaction_type == "conversing") %>% 
  count(sender, receiver) %>% 
  left_join(orig_pre) %>% 
  mutate(exposure = n * pre_n) %>% 
  group_by(receiver) %>% 
  summarize(exposure_sum_conv = sum(exposure, na.rm = TRUE)) %>% 
  rename(screen_name = receiver) %>% 
  right_join(d_for_influence) %>% 
  mutate(exposure_sum_conv = replace_na(exposure_sum_conv, 0)) %>% 
  left_join(users) %>% 
  mutate(group= ifelse(group %in% c("Other", "Unclear", "Uncoded"), "Other", group))

# Combine both exposure measures into one table.
# NOTE(review): `users` was already joined into both inputs and `group` was
# already collapsed, so the extra left_join(users) and recode here look
# redundant - confirm before removing.
influence <- influence_endorsing %>% 
  left_join(influence_conversing) %>% 
  left_join(users) %>% 
  mutate(group = ifelse(group %in% c("Other", "Unclear", "Uncoded"), "Other", group))

# "Other" becomes the reference level of group
influence$group <- fct_relevel(as.factor(influence$group), "Other")
# Pairwise correlations among the activity, exposure and tenure variables,
# shown as a formatted lower triangle
select(influence,
       pre_n, pre_n_days,
       post_n, post_n_days,
       n_tweets, n_days,
       exposure_sum_end, exposure_sum_conv,
       years_on_twitter) %>%
  corrr::correlate() %>%
  corrr::shave() %>%
  corrr::fashion()
##             rowname pre_n pre_n_days post_n post_n_days n_tweets n_days
## 1             pre_n                                                    
## 2        pre_n_days   .72                                              
## 3            post_n   .86        .56                                   
## 4       post_n_days   .40        .45    .70                            
## 5          n_tweets   .92        .61    .94         .56                
## 6            n_days   .51        .63    .67         .79      .69       
## 7  exposure_sum_end  -.01        .01    .07         .17      .11    .22
## 8 exposure_sum_conv   .04        .20    .17         .42      .21    .49
## 9  years_on_twitter   .03        .08    .01         .05      .06    .12
##   exposure_sum_end exposure_sum_conv years_on_twitter
## 1                                                    
## 2                                                    
## 3                                                    
## 4                                                    
## 5                                                    
## 6                                                    
## 7                                                    
## 8              .57                                   
## 9              .00               .18
# Give users without a code the code 11, map codes to categories (codes 10
# and 11 become "Unclear" here), and make "Unclear" the reference level
influence <- influence %>%
  mutate(
    code = ifelse(is.na(code), 11, code),
    code_category = recode(code,
                           `1` = "Teacher",
                           `2` = "Administrator",
                           `3` = "Administrator",
                           `4` = "Researcher",
                           `5` = "Other",
                           `6` = "Organization",
                           `7` = "Organization",
                           `8` = "Other",
                           `9` = "Other",
                           `10` = "Unclear",
                           `11` = "Unclear"),
    code_category = fct_relevel(as.factor(code_category), "Unclear")
  )
# Intercept-only Poisson baseline for post-period tweet counts
m0 <- glm(
  formula = post_n ~ 1,
  family = "poisson",
  data = influence
)

summary(m0)
## 
## Call:
## glm(formula = post_n ~ 1, family = "poisson", data = influence)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -6.982  -6.982  -6.982  -2.469  74.711  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  3.19339    0.01286   248.3   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 23423  on 247  degrees of freedom
## Residual deviance: 23423  on 247  degrees of freedom
## AIC: 23934
## 
## Number of Fisher Scoring iterations: 7
# Adds the user category ("Unclear" is the reference level)
m1 <- glm(
  formula = post_n ~ 1 + code_category,
  family = "poisson",
  data = influence
)

summary(m1)
## 
## Call:
## glm(formula = post_n ~ 1 + code_category, family = "poisson", 
##     data = influence)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -9.530  -7.134  -4.979  -2.482  66.041  
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 1.15145    0.08575   13.43   <2e-16 ***
## code_categoryAdministrator  2.66439    0.08814   30.23   <2e-16 ***
## code_categoryOrganization   1.96035    0.10381   18.88   <2e-16 ***
## code_categoryOther          1.36567    0.10117   13.50   <2e-16 ***
## code_categoryResearcher     2.16806    0.09737   22.27   <2e-16 ***
## code_categoryTeacher        2.08514    0.08815   23.65   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 23423  on 247  degrees of freedom
## Residual deviance: 21174  on 242  degrees of freedom
## AIC: 21695
## 
## Number of Fisher Scoring iterations: 7
# Adds standardized tweet count, years on Twitter and active days.
# NOTE(review): the name m2 is assigned again by the next model in this
# script, so this fit is not available afterwards.
m2 <- glm(
  formula = post_n ~ 1 +
    scale(n_tweets) +
    scale(years_on_twitter) +
    scale(n_days) +
    code_category,
  family = "poisson",
  data = influence
)

summary(m2)
## 
## Call:
## glm(formula = post_n ~ 1 + scale(n_tweets) + scale(years_on_twitter) + 
##     scale(n_days) + code_category, family = "poisson", data = influence)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -9.5813  -3.9681  -3.0296   0.0528  22.4589  
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 1.439307   0.085995  16.737  < 2e-16 ***
## scale(n_tweets)             0.121054   0.004479  27.030  < 2e-16 ***
## scale(years_on_twitter)    -0.070628   0.016677  -4.235 2.28e-05 ***
## scale(n_days)               0.748193   0.010513  71.169  < 2e-16 ***
## code_categoryAdministrator  1.090255   0.091630  11.898  < 2e-16 ***
## code_categoryOrganization   0.792991   0.106839   7.422 1.15e-13 ***
## code_categoryOther          1.195105   0.101332  11.794  < 2e-16 ***
## code_categoryResearcher     1.099022   0.103011  10.669  < 2e-16 ***
## code_categoryTeacher        0.871716   0.090943   9.585  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 23423.1  on 247  degrees of freedom
## Residual deviance:  5733.8  on 239  degrees of freedom
## AIC: 6260.6
## 
## Number of Fisher Scoring iterations: 7
# Same covariates plus the standardized pre-period tweet count
m2 <- glm(
  formula = post_n ~ 1 +
    scale(years_on_twitter) +
    scale(n_tweets) +
    scale(n_days) +
    scale(pre_n) +
    code_category,
  family = "poisson",
  data = influence
)

summary(m2)
## 
## Call:
## glm(formula = post_n ~ 1 + scale(years_on_twitter) + scale(n_tweets) + 
##     scale(n_days) + scale(pre_n) + code_category, family = "poisson", 
##     data = influence)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -10.665   -3.872   -2.704   -0.306   21.250  
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 1.44143    0.08600  16.761  < 2e-16 ***
## scale(years_on_twitter)    -0.05316    0.01673  -3.178  0.00148 ** 
## scale(n_tweets)             0.49234    0.03568  13.798  < 2e-16 ***
## scale(n_days)               0.64336    0.01448  44.436  < 2e-16 ***
## scale(pre_n)               -0.31536    0.02993 -10.537  < 2e-16 ***
## code_categoryAdministrator  1.03717    0.09199  11.275  < 2e-16 ***
## code_categoryOrganization   0.69980    0.10749   6.510  7.5e-11 ***
## code_categoryOther          1.17279    0.10139  11.567  < 2e-16 ***
## code_categoryResearcher     0.99286    0.10351   9.592  < 2e-16 ***
## code_categoryTeacher        0.92756    0.09095  10.199  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 23423.1  on 247  degrees of freedom
## Residual deviance:  5619.2  on 238  degrees of freedom
## AIC: 6148.1
## 
## Number of Fisher Scoring iterations: 7
# Adds exposure through conversing ties (entered unscaled, unlike the other
# numeric covariates).
# NOTE(review): the rendered summary shows a residual deviance far above its
# degrees of freedom, which suggests overdispersion - the Poisson standard
# errors may be too small.
m3 <- glm(
  formula = post_n ~ 1 +
    scale(years_on_twitter) +
    scale(n_tweets) +
    scale(n_days) +
    scale(pre_n) +
    code_category +
    exposure_sum_conv,
  family = "poisson",
  data = influence
)

summary(m3)
## 
## Call:
## glm(formula = post_n ~ 1 + scale(years_on_twitter) + scale(n_tweets) + 
##     scale(n_days) + scale(pre_n) + code_category + exposure_sum_conv, 
##     family = "poisson", data = influence)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -8.7126  -3.8042  -2.7984  -0.4321  21.9750  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 1.405e+00  8.608e-02  16.320  < 2e-16 ***
## scale(years_on_twitter)    -9.346e-02  1.692e-02  -5.524 3.31e-08 ***
## scale(n_tweets)             5.067e-01  3.503e-02  14.466  < 2e-16 ***
## scale(n_days)               6.092e-01  1.463e-02  41.632  < 2e-16 ***
## scale(pre_n)               -3.070e-01  2.946e-02 -10.421  < 2e-16 ***
## code_categoryAdministrator  9.683e-01  9.248e-02  10.470  < 2e-16 ***
## code_categoryOrganization   7.917e-01  1.079e-01   7.339 2.15e-13 ***
## code_categoryOther          1.190e+00  1.014e-01  11.734  < 2e-16 ***
## code_categoryResearcher     7.916e-01  1.054e-01   7.513 5.77e-14 ***
## code_categoryTeacher        9.053e-01  9.108e-02   9.940  < 2e-16 ***
## exposure_sum_conv           1.643e-05  1.394e-06  11.785  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 23423.1  on 247  degrees of freedom
## Residual deviance:  5487.4  on 237  degrees of freedom
## AIC: 6018.3
## 
## Number of Fisher Scoring iterations: 7
# Model 4: same specification as m2 with exposure_sum_end added.
# NOTE(review): exposure_sum_end enters unscaled, unlike the other numeric
# predictors, so its coefficient is per raw unit -- confirm this is intended.
m4 <- glm(
  formula = post_n ~ 1 +
    scale(years_on_twitter) + scale(n_tweets) + scale(n_days) + scale(pre_n) +
    code_category +
    exposure_sum_end,
  family = "poisson",
  data = influence
)

summary(m4)
## 
## Call:
## glm(formula = post_n ~ 1 + scale(years_on_twitter) + scale(n_tweets) + 
##     scale(n_days) + scale(pre_n) + code_category + exposure_sum_end, 
##     family = "poisson", data = influence)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -9.2262  -3.7847  -2.7177  -0.1993  21.9250  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 1.447e+00  8.598e-02  16.826  < 2e-16 ***
## scale(years_on_twitter)    -3.872e-02  1.679e-02  -2.306   0.0211 *  
## scale(n_tweets)             4.984e-01  3.566e-02  13.976  < 2e-16 ***
## scale(n_days)               6.411e-01  1.439e-02  44.552  < 2e-16 ***
## scale(pre_n)               -3.160e-01  2.998e-02 -10.540  < 2e-16 ***
## code_categoryAdministrator  9.686e-01  9.266e-02  10.454  < 2e-16 ***
## code_categoryOrganization   6.892e-01  1.076e-01   6.403 1.53e-10 ***
## code_categoryOther          1.166e+00  1.014e-01  11.497  < 2e-16 ***
## code_categoryResearcher     8.324e-01  1.060e-01   7.851 4.14e-15 ***
## code_categoryTeacher        8.983e-01  9.112e-02   9.859  < 2e-16 ***
## exposure_sum_end            5.807e-04  7.432e-05   7.814 5.54e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 23423.1  on 247  degrees of freedom
## Residual deviance:  5562.8  on 237  degrees of freedom
## AIC: 6093.6
## 
## Number of Fisher Scoring iterations: 7
# Model 5: same specification as m2 with both exposure terms
# (exposure_sum_conv and exposure_sum_end) entered together.
# NOTE(review): in the summary rendered below, exposure_sum_end is no longer
# significant once exposure_sum_conv is included (p = 0.724); presumably the
# two exposure measures overlap -- verify their correlation before
# interpreting either coefficient on its own.
m5 <- glm(
  formula = post_n ~ 1 +
    scale(years_on_twitter) + scale(n_tweets) + scale(n_days) + scale(pre_n) +
    code_category +
    exposure_sum_conv + exposure_sum_end,
  family = "poisson",
  data = influence
)

summary(m5)
## 
## Call:
## glm(formula = post_n ~ 1 + scale(years_on_twitter) + scale(n_tweets) + 
##     scale(n_days) + scale(pre_n) + code_category + exposure_sum_conv + 
##     exposure_sum_end, family = "poisson", data = influence)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -8.7508  -3.7990  -2.7934  -0.4234  21.9990  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 1.406e+00  8.616e-02  16.320  < 2e-16 ***
## scale(years_on_twitter)    -9.139e-02  1.790e-02  -5.105 3.30e-07 ***
## scale(n_tweets)             5.071e-01  3.507e-02  14.460  < 2e-16 ***
## scale(n_days)               6.097e-01  1.471e-02  41.451  < 2e-16 ***
## scale(pre_n)               -3.077e-01  2.953e-02 -10.418  < 2e-16 ***
## code_categoryAdministrator  9.656e-01  9.279e-02  10.406  < 2e-16 ***
## code_categoryOrganization   7.888e-01  1.082e-01   7.289 3.12e-13 ***
## code_categoryOther          1.189e+00  1.014e-01  11.722  < 2e-16 ***
## code_categoryResearcher     7.870e-01  1.062e-01   7.411 1.26e-13 ***
## code_categoryTeacher        9.043e-01  9.112e-02   9.924  < 2e-16 ***
## exposure_sum_conv           1.601e-05  1.828e-06   8.760  < 2e-16 ***
## exposure_sum_end            3.540e-05  1.003e-04   0.353    0.724    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 23423.1  on 247  degrees of freedom
## Residual deviance:  5487.2  on 236  degrees of freedom
## AIC: 6020.1
## 
## Number of Fisher Scoring iterations: 7

7 Version/dependencies

# Print the R version, platform, and attached/loaded package versions for
# this session so the analysis environment is on record.
utils::sessionInfo()
## R version 3.5.3 (2019-03-11)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.2
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] brms_2.9.0         Rcpp_1.0.1         rTAGS_0.1.0       
##  [4] googlesheets_0.3.0 rtweet_0.6.9       sjstats_0.17.5    
##  [7] lme4_1.1-21        Matrix_1.2-17      igraph_1.2.4.1    
## [10] forcats_0.4.0      stringr_1.4.0      dplyr_0.8.1       
## [13] purrr_0.3.2        readr_1.3.1        tidyr_0.8.3       
## [16] tibble_2.1.3       ggplot2_3.1.1      tidyverse_1.2.1   
## 
## loaded via a namespace (and not attached):
##   [1] TH.data_1.0-10       minqa_1.2.4          colorspace_1.4-1    
##   [4] ggridges_0.5.1       rsconnect_0.8.13     sjlabelled_1.1.0    
##   [7] estimability_1.3     markdown_1.0         base64enc_0.1-3     
##  [10] rstudioapi_0.10      rstan_2.18.2         DT_0.6              
##  [13] fansi_0.4.0          mvtnorm_1.0-10       lubridate_1.7.4     
##  [16] xml2_1.2.0           bridgesampling_0.6-0 codetools_0.2-16    
##  [19] splines_3.5.3        knitr_1.23           shinythemes_1.1.2   
##  [22] sjmisc_2.7.9         zeallot_0.1.0        bayesplot_1.7.0     
##  [25] jsonlite_1.6         nloptr_1.2.1         broom_0.5.2         
##  [28] shiny_1.3.2          compiler_3.5.3       httr_1.4.0          
##  [31] emmeans_1.3.4        backports_1.1.4      assertthat_0.2.1    
##  [34] lazyeval_0.2.2       cli_1.1.0            later_0.8.0         
##  [37] prettyunits_1.0.2    htmltools_0.3.6      tools_3.5.3         
##  [40] coda_0.19-2          gtable_0.3.0         glue_1.3.1          
##  [43] corrr_0.3.2          reshape2_1.4.3       cellranger_1.1.0    
##  [46] vctrs_0.1.0          nlme_3.1-140         crosstalk_1.0.0     
##  [49] insight_0.3.0        xfun_0.7             ps_1.3.0            
##  [52] rvest_0.3.4          mime_0.6             miniUI_0.1.1.1      
##  [55] gtools_3.8.1         MASS_7.3-51.4        zoo_1.8-6           
##  [58] scales_1.0.0         colourpicker_1.0     hms_0.4.2           
##  [61] promises_1.0.1       Brobdingnag_1.2-6    parallel_3.5.3      
##  [64] sandwich_2.5-1       inline_0.3.15        shinystan_2.5.0     
##  [67] yaml_2.2.0           gridExtra_2.3        StanHeaders_2.18.1  
##  [70] loo_2.1.0            stringi_1.4.3        bayestestR_0.2.0    
##  [73] dygraphs_1.1.1.6     pkgbuild_1.0.3       boot_1.3-22         
##  [76] rlang_0.3.4          pkgconfig_2.0.2      matrixStats_0.54.0  
##  [79] evaluate_0.14        lattice_0.20-38      labeling_0.3        
##  [82] rstantools_1.5.1     htmlwidgets_1.3      processx_3.3.1      
##  [85] tidyselect_0.2.5     plyr_1.8.4           magrittr_1.5        
##  [88] R6_2.4.0             generics_0.0.2       multcomp_1.4-10     
##  [91] pillar_1.4.1         haven_2.1.0          withr_2.1.2         
##  [94] xts_0.11-2           survival_2.44-1.1    abind_1.4-5         
##  [97] performance_0.2.0    modelr_0.1.4         crayon_1.3.4        
## [100] utf8_1.1.4           rmarkdown_1.13       grid_3.5.3          
## [103] readxl_1.3.1         callr_3.2.0          threejs_0.3.1       
## [106] digest_0.6.19        xtable_1.8-4         httpuv_1.5.1        
## [109] stats4_3.5.3         munsell_0.5.0        shinyjs_1.0