PGH_DATA_PATH <- here("exploratory_analyses/01_reddit_pilot/data/pittsburgh2years_tidy.csv")
tidy_pgh <- read_csv(PGH_DATA_PATH, guess_max = 1000000) %>%
select(-body)
RP_DATA_PATH <- here("exploratory_analyses/01_reddit_pilot/data/redpill2years_tidy.csv")
tidy_rp <- read_csv(RP_DATA_PATH, guess_max = 1000000) %>%
select(-body)
Using big query, and the fh-bigquerry:reddit_posts/fh-bigquerry:reddit_comments tables, I downloaded all comments and posts from 1/1/2017 to present for two subrredits: theredpill and pittsburgh. There are 802795 comments and posts for theredpill and 521381 for pittsburgh.
After some munging, here’s what the data look like:
tidy_pgh %>%
slice(1:10)%>%
DT::datatable()
Let’s see if we can reconstruct a single post: https://www.reddit.com/r/pittsburgh/comments/a41bez/meet_the_activists_and_riders_behind_pittsburghs. This is what the data look like in our dataset.
target <- tidy_pgh %>%
filter(post_id == "t3_a41bez") %>%
select(1:5, 7:9, 20)
DT::datatable(target)
Here’s what it looks like as a graph:
graph_thread <- target %>%
select(parent_id, comment_id) %>%
rename(from = parent_id,
to = comment_id) %>%
as_tbl_graph(directed = TRUE) %>%
activate(nodes) %>%
left_join(distinct(target, comment_id, author, body, text_type),
by = c("name" = "comment_id")) %>%
filter(!is.na(author))
graph_thread %>%
ggraph(layout = "kk") +
geom_node_point(alpha = .2, aes(size = text_type)) +
geom_node_text(aes(label = author,
color = author), size = 3) +
geom_edge_fan(alpha = .2, arrow = arrow(length = unit(3, 'mm'))) +
theme(legend.position = "none")
tidy_reddit_f <- tidy_pgh %>%
bind_rows(tidy_rp) %>%
filter(author != "[deleted]")
tidy_reddit_f %>%
filter(text_type == "post") %>%
count(subreddit) %>%
kable()
| subreddit | n |
|---|---|
| pittsburgh | 24372 |
| TheRedPill | 22822 |
tidy_reddit_f <- tidy_pgh %>%
bind_rows(tidy_rp) %>%
filter(author != "[deleted]")
tidy_reddit_f %>%
count(subreddit, post_id) %>%
ggplot(aes(x = n)) +
geom_histogram(bins = 100) +
scale_x_log10() +
xlab("N posts per thread") +
facet_wrap(~subreddit) +
theme_classic()
birth_death <- tidy_reddit_f %>%
group_by(subreddit, author) %>%
summarize(birth = min(created_utc),
death = max(created_utc),
n = n())
birth_death_bin <- birth_death %>%
#filter(n > 1) %>%
mutate(birth_bin = lubridate::round_date(birth, "month"),
death_bin = lubridate::round_date(death, "month")) %>%
select(subreddit, birth_bin, death_bin) %>%
gather("event", "value", -subreddit) %>%
count(subreddit, event, value) %>%
ungroup()
total_births <- birth_death_bin %>%
filter(event == "birth_bin") %>%
group_by(subreddit) %>%
mutate(n = cumsum(n),
event = "cum_births") %>%
select(subreddit, value, n, event)
pop_size <- birth_death_bin %>%
left_join(total_births %>% rename(total_births = n) %>% select(-event), by = c("subreddit", "value")) %>%
filter(event == "death_bin") %>%
rename(deaths = n) %>%
mutate(n = total_births - deaths,
event = "pop_size") %>%
select(subreddit, event, value, n)
pop_data <- birth_death_bin %>%
bind_rows(pop_size) %>%
bind_rows(total_births)
ggplot(pop_data, aes(x = value, y = n,
color = event, group = event)) +
geom_line() +
facet_wrap(~subreddit) +
theme_classic()
reddit_text <- tidy_reddit_f %>%
select(1:8, 20) %>%
filter(text_type == "comment",
body_clean != "removed") %>%
mutate(text_length = nchar(body_clean))
targ_authors <- reddit_text %>%
count(subreddit, author) %>%
filter(n >= 50)
pop_data_wide <- pop_data %>%
spread(event, n) %>%
select(-cum_births)
length_over_time <- reddit_text %>%
group_by(subreddit, post_id) %>%
summarize(log_n_comments = log(n()),
log_mean_length = log(mean(text_length)),
first_comment = min(created_utc),
first_comment_time_num = as.numeric(first_comment)) %>%
mutate(first_comment_bin = lubridate::round_date(first_comment, "month")) %>%
left_join(pop_data_wide, by = c("subreddit", "first_comment_bin" = "value"))
make_corr_plot(length_over_time[,c(3,4,6,8:10)])
ggplot(length_over_time, aes(first_comment, log_n_comments)) +
geom_point(alpha = .1) +
geom_smooth(method = "lm") +
facet_wrap(~subreddit) +
theme_classic()
Weak effect of time in community - longer in community, write less.
nested_authors_text <- reddit_text %>%
filter(author %in% targ_authors$author) %>%
group_by(subreddit, author) %>%
mutate(birth = min(created_utc)) %>%
arrange(author, created_utc) %>%
mutate(comment_num = 1:n(),
time_since_birth = as.numeric(created_utc - birth)) %>%
ungroup() %>%
select(subreddit, author, comment_num, time_since_birth, created_utc, text_length) %>%
group_by(subreddit, author) %>%
nest()
score_corrs <- nested_authors_text %>%
mutate(corr_pearson_tsb = map(data, ~cor(.$text_length , .$time_since_birth, method = "pearson")),
corr_spearman_comment_num = map(data, ~cor(.$text_length , .$comment_num, method = "spearman"))) %>%
select(-data) %>%
unnest() %>%
gather("measure", "value", -subreddit, -author)
mean_values <- score_corrs %>%
group_by(subreddit, measure) %>%
tidyboot::tidyboot_mean(col = value, na.rm = T)
ggplot(score_corrs, aes(x = value)) +
geom_histogram() +
geom_vline(data = mean_values,
aes(xintercept = mean),
color = "red", linetype = 2) +
facet_grid(measure ~ subreddit) +
theme_classic()
kable(mean_values)
| subreddit | measure | n | empirical_stat | ci_lower | mean | ci_upper |
|---|---|---|---|---|---|---|
| pittsburgh | corr_pearson_tsb | 1403 | -0.0121343 | -0.0190764 | -0.0122545 | -0.0055661 |
| pittsburgh | corr_spearman_comment_num | 1403 | -0.0209361 | -0.0284733 | -0.0208258 | -0.0136477 |
| TheRedPill | corr_pearson_tsb | 2260 | -0.0149966 | -0.0208979 | -0.0148251 | -0.0088747 |
| TheRedPill | corr_spearman_comment_num | 2260 | -0.0250679 | -0.0311259 | -0.0251069 | -0.0189966 |
## Do scores get higher the longer you're in the community?
#Weak evidence if you look at folks who have at least 50 posts.
targ_authors <- tidy_reddit_f %>%
count(subreddit, author) %>%
filter(n >= 50)
nested_authors <- tidy_reddit_f %>%
right_join(targ_authors %>% select(-n)) %>%
filter(text_type == "comment") %>%
group_by(subreddit, author) %>%
mutate(birth = min(created_utc)) %>%
arrange(author, created_utc) %>%
mutate(comment_num = 1:n(),
time_since_birth = as.numeric(created_utc - birth)) %>%
ungroup() %>%
select(subreddit, author, comment_num, time_since_birth, created_utc, score) %>%
group_by(subreddit, author) %>%
nest()
score_corrs <- nested_authors %>%
mutate(corr_pearson_tsb = map(data, ~cor(.$score , .$time_since_birth, method = "pearson")),
corr_spearman_comment_num = map(data, ~cor(.$score , .$comment_num, method = "spearman"))) %>%
select(-data) %>%
unnest() %>%
gather("measure", "value", -subreddit, -author)
mean_values <- score_corrs %>%
group_by(subreddit, measure) %>%
tidyboot::tidyboot_mean(col = value, na.rm = T)
ggplot(score_corrs, aes(x = value)) +
geom_histogram() +
geom_vline(data = mean_values,
aes(xintercept = mean),
color = "red", linetype = 2) +
facet_grid(measure ~ subreddit) +
theme_classic()
kable(mean_values)
#HYPOTHESES:
#* As population size increases, complexity of norms decreases (moderated by population structure, prop new, etc)
#* Social variables influence alignment: people who are responded to more introduce more norms
#* Norms of larger communities change more quickly and are less coherent across individuals
#* Topic entropy as a mediating factor
Social network
This is the network for the pittsburgh subreddit, for folks who had at least 1000 comments. They are linked if they responded to each other.