The data

PGH_DATA_PATH <- here("exploratory_analyses/01_reddit_pilot/data/pittsburgh2years_tidy.csv")
tidy_pgh <- read_csv(PGH_DATA_PATH, guess_max = 1000000)  %>%
  select(-body)

RP_DATA_PATH <- here("exploratory_analyses/01_reddit_pilot/data/redpill2years_tidy.csv")
tidy_rp <- read_csv(RP_DATA_PATH, guess_max = 1000000) %>%
  select(-body)

Using big query, and the fh-bigquerry:reddit_posts/fh-bigquerry:reddit_comments tables, I downloaded all comments and posts from 1/1/2017 to present for two subrredits: theredpill and pittsburgh. There are 802795 comments and posts for theredpill and 521381 for pittsburgh.

After some munging, here’s what the data look like:

tidy_pgh %>%
  slice(1:10)%>%
  DT::datatable()

Reconstructing a post

Let’s see if we can reconstruct a single post: https://www.reddit.com/r/pittsburgh/comments/a41bez/meet_the_activists_and_riders_behind_pittsburghs. This is what the data look like in our dataset.

target <- tidy_pgh %>%
  filter(post_id == "t3_a41bez") %>%
  select(1:5, 7:9, 20) 

DT::datatable(target)

Here’s what it looks like as a graph:

graph_thread <- target %>%
  select(parent_id, comment_id) %>%
  rename(from = parent_id, 
         to = comment_id) %>%
  as_tbl_graph(directed = TRUE)  %>%
  activate(nodes) %>%
  left_join(distinct(target, comment_id, author, body, text_type),
            by = c("name" = "comment_id")) %>%
  filter(!is.na(author))

graph_thread %>%
  ggraph(layout = "kk") +
  geom_node_point(alpha = .2, aes(size = text_type)) +
  geom_node_text(aes(label = author,
                     color = author), size = 3) +
  geom_edge_fan(alpha = .2, arrow = arrow(length = unit(3, 'mm')))  +
  theme(legend.position = "none")

Descriptive statistics

How many posts (threads) are there in each subreddit?

tidy_reddit_f <- tidy_pgh %>%
  bind_rows(tidy_rp) %>%
  filter(author != "[deleted]") 

tidy_reddit_f %>%
  filter(text_type == "post") %>%
  count(subreddit) %>%
  kable()
subreddit n
pittsburgh 24372
TheRedPill 22822

How many comments are there in each thread?

tidy_reddit_f <- tidy_pgh %>%
  bind_rows(tidy_rp) %>%
  filter(author != "[deleted]") 

tidy_reddit_f %>%
  count(subreddit, post_id) %>%
  ggplot(aes(x = n)) +
  geom_histogram(bins = 100) +
  scale_x_log10() +
  xlab("N posts per thread") +
  facet_wrap(~subreddit) +
  theme_classic()

How many comments/posts does each author make?

tidy_reddit_f %>%
  count(subreddit, author) %>%
  count(subreddit, n) %>%
  ggplot(aes(x = nn)) +
  geom_histogram(bins = 100) +
  scale_x_log10() +
  xlab("N posts per author") +
  facet_wrap(~subreddit) +
  theme_classic()

reddit_author_means <- tidy_reddit_f %>%
  count(subreddit, author) %>%
  group_by(subreddit) %>%
  tidyboot::tidyboot_mean(col = n)

ggplot(reddit_author_means, aes(x = subreddit)) +
  geom_pointrange(aes(y = mean, 
                      ymin = ci_lower, 
                      ymax = ci_upper)) +
  theme_classic()

Population over time

birth_death <- tidy_reddit_f %>%
  group_by(subreddit, author) %>%
  summarize(birth = min(created_utc),
            death = max(created_utc),
            n = n())

birth_death_bin <- birth_death %>%
  #filter(n > 1) %>%
  mutate(birth_bin = lubridate::round_date(birth, "month"),
         death_bin = lubridate::round_date(death, "month")) %>%
  select(subreddit, birth_bin, death_bin) %>%
  gather("event", "value", -subreddit) %>%
  count(subreddit, event, value) %>%
  ungroup() 

total_births <- birth_death_bin %>%
  filter(event == "birth_bin") %>%
  group_by(subreddit)  %>%
  mutate(n = cumsum(n),
         event = "cum_births") %>%
   select(subreddit, value, n, event)  

pop_size <- birth_death_bin %>%
  left_join(total_births %>% rename(total_births = n) %>% select(-event), by = c("subreddit", "value")) %>%
  filter(event == "death_bin") %>%
  rename(deaths = n) %>%
  mutate(n = total_births - deaths,
         event = "pop_size") %>%
  select(subreddit, event, value, n) 

pop_data <- birth_death_bin %>%
  bind_rows(pop_size) %>%
  bind_rows(total_births)


ggplot(pop_data, aes(x = value, y = n, 
           color = event, group = event)) +
  geom_line() +
  facet_wrap(~subreddit) +
  theme_classic()

Social network

This is the network for the pittsburgh subreddit, for folks who had at least 1000 comments. They are linked if they responded to each other.

targ_authors <- tidy_reddit_f %>%
  count(subreddit, author) %>% 
  filter(n >= 1000)

social_data <- tidy_reddit_f %>%
  select(subreddit, parent_id, comment_id, author) %>%
  left_join(tidy_reddit_f %>% distinct(comment_id, author) %>%
              rename(parent_author = author), 
            by = c("parent_id" = "comment_id")) %>%
  # mutate_if(is.character, as.factor) %>%
  filter(!is.na(parent_id), 
         !is.na(parent_author), 
         parent_author != "[deleted]",
         subreddit == "pittsburgh")

social_graph_data  <- social_data %>%
  count(author, parent_author) %>%
  filter(n > 5) %>%
  mutate_if(is.character, as.factor) 

social_graph <- social_graph_data %>%
  filter(author %in% unique(social_graph_data$parent_author),
         parent_author %in% unique(social_graph_data$author)) %>%
  rename(from = parent_author,
         to = author) %>%
  as_tbl_graph(directed = F) %>%
  #activate(edges) %>% #  remove loops
  #filter(!edge_is_loop()) %>%
  activate(nodes) %>%  # Remove isolated nodes
  mutate(centrality = centrality_degree(mode = 'in')) %>%
  filter(centrality > 2) %>%
  filter(!node_is_isolated(),
         !node_is_simplical())

ggraph(social_graph) + 
    geom_edge_fan(aes(alpha = n), show.legend = FALSE) + 
    geom_node_point(aes(size = log(centrality)), alpha = .5, color = 'red',) + 
  theme_void() +
  theme(legend.position = "none")

Text analysis

At thread level

  • A thread with more comments has longer comments
  • As the community gets older there are fewer comments per thread
  • As a community gets older, comments get shorter
  • As poluation size increases, comment length increases, but num comments decreases.
reddit_text <- tidy_reddit_f %>%
  select(1:8, 20) %>%
  filter(text_type == "comment",
         body_clean != "removed") %>%
  mutate(text_length = nchar(body_clean)) 

targ_authors <- reddit_text %>%
  count(subreddit, author) %>% 
  filter(n >= 50)

pop_data_wide <- pop_data %>%
  spread(event, n) %>%
  select(-cum_births)
  
length_over_time <- reddit_text %>%
  group_by(subreddit, post_id) %>%
  summarize(log_n_comments = log(n()),
            log_mean_length = log(mean(text_length)),
            first_comment = min(created_utc),
            first_comment_time_num = as.numeric(first_comment)) %>%
  mutate(first_comment_bin = lubridate::round_date(first_comment, "month")) %>%
  left_join(pop_data_wide, by = c("subreddit", "first_comment_bin" = "value"))

make_corr_plot(length_over_time[,c(3,4,6,8:10)])

ggplot(length_over_time, aes(first_comment, log_n_comments)) +
  geom_point(alpha = .1) +
  geom_smooth(method = "lm") +
  facet_wrap(~subreddit) +
  theme_classic()

At individual level

Weak effect of time in community - longer in community, write less.

nested_authors_text <- reddit_text %>% 
  filter(author %in% targ_authors$author) %>%
  group_by(subreddit, author) %>% 
  mutate(birth = min(created_utc)) %>%
  arrange(author, created_utc) %>%
  mutate(comment_num = 1:n(),
         time_since_birth = as.numeric(created_utc - birth)) %>%
  ungroup() %>% 
  select(subreddit, author, comment_num, time_since_birth, created_utc, text_length) %>%
  group_by(subreddit, author) %>%
  nest()

score_corrs <- nested_authors_text %>%
  mutate(corr_pearson_tsb = map(data, ~cor(.$text_length , .$time_since_birth, method = "pearson")),
         corr_spearman_comment_num = map(data, ~cor(.$text_length , .$comment_num, method = "spearman"))) %>%
  select(-data) %>%
  unnest() %>%
  gather("measure", "value", -subreddit, -author)

mean_values <- score_corrs %>%
  group_by(subreddit, measure) %>%
  tidyboot::tidyboot_mean(col = value, na.rm = T)

ggplot(score_corrs, aes(x = value)) +
  geom_histogram() +
  geom_vline(data = mean_values, 
             aes(xintercept = mean), 
             color = "red", linetype = 2) +
  facet_grid(measure ~ subreddit) +
  theme_classic()

kable(mean_values)
subreddit measure n empirical_stat ci_lower mean ci_upper
pittsburgh corr_pearson_tsb 1403 -0.0121343 -0.0190764 -0.0122545 -0.0055661
pittsburgh corr_spearman_comment_num 1403 -0.0209361 -0.0284733 -0.0208258 -0.0136477
TheRedPill corr_pearson_tsb 2260 -0.0149966 -0.0208979 -0.0148251 -0.0088747
TheRedPill corr_spearman_comment_num 2260 -0.0250679 -0.0311259 -0.0251069 -0.0189966
## Do scores get higher the longer you're in the community?

#Weak evidence if you look at folks who have at least 50 posts.
targ_authors <- tidy_reddit_f %>%
  count(subreddit, author) %>% 
  filter(n >= 50)

nested_authors <- tidy_reddit_f %>% 
  right_join(targ_authors %>% select(-n)) %>%
  filter(text_type == "comment") %>%
  group_by(subreddit, author) %>% 
  mutate(birth = min(created_utc)) %>%
  arrange(author, created_utc) %>%
  mutate(comment_num = 1:n(),
         time_since_birth = as.numeric(created_utc - birth)) %>%
  ungroup() %>% 
  select(subreddit, author, comment_num, time_since_birth, created_utc, score) %>%
  group_by(subreddit, author) %>%
  nest()

score_corrs <- nested_authors %>%
  mutate(corr_pearson_tsb = map(data, ~cor(.$score , .$time_since_birth, method = "pearson")),
         corr_spearman_comment_num = map(data, ~cor(.$score , .$comment_num, method = "spearman"))) %>%
  select(-data) %>%
  unnest() %>%
  gather("measure", "value", -subreddit, -author)

mean_values <- score_corrs %>%
  group_by(subreddit, measure) %>%
   tidyboot::tidyboot_mean(col = value, na.rm = T)

ggplot(score_corrs, aes(x = value)) +
  geom_histogram() +
  geom_vline(data = mean_values, 
             aes(xintercept = mean), 
             color = "red", linetype = 2) +
  facet_grid(measure ~ subreddit) +
  theme_classic()

kable(mean_values)
#HYPOTHESES:

#* As population size increases, complexity of norms decreases (moderated by population structure, prop new, etc)
#* Social variables influence alignment: people who are responded to more introduce more norms
#* Norms of larger communities change more quickly and are less coherent across individuals
#* Topic entropy as a mediating factor