AUTHOR_COUNTS <- here("exploratory_analyses/03_systematic_sample/data/subreddit_counts_scores.csv")

# Subreddit-level counts/scores; keep subreddits with > 100 authors and rank by community size.
author_count_measures <- read_csv(AUTHOR_COUNTS,
                                  col_names = c("subreddit", "author_n", "word_H", "word_mean_n",
                                                "word_sd", "word_total", "score_mean", "score_sd",
                                                "score_H", "comments_n_long", "comments_n_all",
                                                "posts_n_all", "comments_posts_ratio")) %>%
  filter(author_n > 100) %>%
  arrange(desc(author_n)) %>%
  mutate(author_rank = 1:n())

author_counts <- author_count_measures %>%
  select(subreddit, author_n, author_rank)
Summary of findings:

Implications for language change:

* Larger communities: more homogeneous input because of inequality across speakers.
* More consensus -> faster word gain.

More people, more words. (There's no relationship between mean/SD post length and author_n; a quick check is sketched below.)
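The post-length claim can be checked directly from `author_count_measures` (word_mean_n and word_sd are presumably the mean and SD of post length). This is only an illustrative sketch, not the original analysis:

# Sketch: is community size related to mean/SD post length? (Expected: no.)
library(dplyr)

author_count_measures %>%
  summarize(cor_mean_length = cor(log(author_n), word_mean_n, use = "complete.obs"),
            cor_sd_length   = cor(log(author_n), word_sd, use = "complete.obs"))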
r2 <- lm(log(word_total) ~ log(author_n), data = author_count_measures) %>%
  summary() %>%
  pluck("r.squared") %>%
  round(2)
Quantity of language as a function of community size follows a power law with an exponent of 1.03, i.e., approximately Zipf's law (exponent ~1), and an \(R^2\) of 0.91!
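The chunk above extracts only \(R^2\); the exponent is the slope of the log-log regression. A small helper along these lines (hypothetical, not part of the original pipeline) recovers the exponents quoted in this section:

# Sketch: power-law exponent = slope of the log-log regression of y on x.
power_law_exp <- function(df, y, x) {
  fit <- lm(log(df[[y]]) ~ log(df[[x]]))
  round(unname(coef(fit)[2]), 2)
}

power_law_exp(author_count_measures, "word_total", "author_n")      # ~1.03 per the text (words)
power_law_exp(author_count_measures, "comments_n_long", "author_n") # ~1.13 per the text (comments)
power_law_exp(author_count_measures, "posts_n_all", "author_n")     # ~0.62 per the text (posts)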
ggplot(author_count_measures, aes(x = author_n, y = word_total)) +
geom_point() +
geom_smooth(method = "lm")+
scale_y_log10(name = "N total words (log)") +
scale_x_log10(name = "N authors (log)")+
theme_classic()
N comments as a function of N authors: exponent = 1.13.
ggplot(author_count_measures, aes(x = author_n, y = comments_n_long)) +
geom_point() +
geom_smooth(method = "lm")+
scale_y_log10(name = "N comments (log)") +
scale_x_log10(name = "N authors (log)")+
theme_classic()
N posts as a function of N authors: exponent = 0.62. A low exponent suggests more inequality: in large communities, posts grow sublinearly with the number of authors (a Gini-based sketch follows the plot below).
ggplot(author_count_measures, aes(x = author_n, y = posts_n_all)) +
geom_point() +
geom_smooth(method = "lm")+
scale_y_log10(name = "N posts (log)") +
scale_x_log10(name = "N authors (log)")+
theme_classic()
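The inequality idea comes back at the end of this section via `comment_gini_coeff` (from `author_ineq_long`, computed elsewhere). For reference, a minimal sketch of a comment Gini coefficient, assuming a hypothetical per-author table `author_comment_counts` with columns `subreddit`, `author`, and `n_comments`:

# Sketch (assumed data): Gini coefficient of comments per author within each subreddit.
library(dplyr)

gini <- function(x) {
  x <- sort(x)
  n <- length(x)
  # Standard formula on sorted values: G = 2 * sum(i * x_i) / (n * sum(x)) - (n + 1) / n
  (2 * sum(seq_len(n) * x)) / (n * sum(x)) - (n + 1) / n
}

author_comment_counts %>%
  group_by(subreddit) %>%
  summarize(comment_gini_coeff = gini(n_comments))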
Within the same thread, how long does it take for a different person to respond?

For each subreddit and each thread, we calculated the lag in seconds between comments from different people, then averaged across threads within the same week. The data below are for comments of any length.
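The lag file is precomputed upstream; a minimal sketch of that computation, assuming a hypothetical raw `comments` table with columns `subreddit`, `thread_id`, `author`, and `created_utc` (Unix seconds):

# Sketch (assumed raw data): within each thread, lag between a comment and the previous
# comment when the authors differ; then average across threads within each subreddit-week.
library(dplyr)
library(lubridate)

thread_lag_sketch <- comments %>%
  arrange(subreddit, thread_id, created_utc) %>%
  group_by(subreddit, thread_id) %>%
  mutate(lag_sec = created_utc - lag(created_utc),
         other_author = author != lag(author)) %>%
  ungroup() %>%
  filter(other_author) %>%
  mutate(created_bin = floor_date(as_datetime(created_utc), "week")) %>%
  group_by(subreddit, created_bin, thread_id) %>%
  summarize(thread_lag = mean(lag_sec), .groups = "drop") %>%
  group_by(subreddit, created_bin) %>%
  summarize(lag_sec = mean(thread_lag), n = n(), .groups = "drop")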
LAG_PATH <- here("exploratory_analyses/03_systematic_sample/data/thread_lag_overtime.csv")
lag <- read_csv(LAG_PATH, col_names = c("subreddit", "created_bin", "lag_sec", "n", "comment_length_type")) %>%
filter(comment_length_type == "all") %>%
left_join(author_counts) %>%
mutate(subreddit = fct_reorder(subreddit, author_n)) %>%
filter(author_n > 100)
lag_mean <- lag %>%
group_by(subreddit) %>%
summarize(mean_lag = mean(lag_sec))
# ggplot(lag, aes(x = created_bin, y = lag_sec)) +
# geom_hline(data = lag_mean, aes(yintercept = mean_lag), linetype = 2, color = "red") +
# geom_smooth() +
# ylab("Lag (log seconds)") +
# scale_y_log10() +
# facet_wrap(~subreddit) +
# theme_classic() +
# theme(axis.text.x = element_text(angle = 90))
ggplot(lag, aes(x = created_bin, y = lag_sec, group = subreddit, color = log(author_n))) +
geom_smooth(se = F) +
ylab("Lag (log seconds)") +
scale_y_log10() +
scale_color_viridis(alpha = .8) +
theme_classic() +
theme(axis.text.x = element_text(angle = 90))
For each subreddit, for each week, we calculated in/(in + out), where "in" = the number of authors posting to the community for the first time and "out" = the number posting for the last time. The data below use all comments (no restriction on length). Each panel shows a subreddit, with the red dashed line indicating the mean in-churn over time; red lines greater than .5 indicate that a community is growing. Note that we somewhat underestimate overall growth here by including the last time period (where every author necessarily makes their final post), but this bias is the same across communities.
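The churn file is also precomputed; a minimal sketch of the in/(in + out) measure, using the same hypothetical `comments` table as in the lag sketch above:

# Sketch (assumed raw data): weekly in-churn = first-timers / (first-timers + last-timers).
library(dplyr)
library(lubridate)

author_spans <- comments %>%
  mutate(created_bin = floor_date(as_datetime(created_utc), "week")) %>%
  group_by(subreddit, author) %>%
  summarize(first_bin = min(created_bin),
            last_bin  = max(created_bin),
            .groups = "drop")

churn_sketch <- author_spans %>%
  group_by(subreddit, created_bin = first_bin) %>%
  summarize(n_in = n(), .groups = "drop") %>%
  full_join(author_spans %>%
              group_by(subreddit, created_bin = last_bin) %>%
              summarize(n_out = n(), .groups = "drop"),
            by = c("subreddit", "created_bin")) %>%
  mutate(across(c(n_in, n_out), ~ coalesce(.x, 0L)),
         in_churn = n_in / (n_in + n_out),
         inout_sum = n_in + n_out)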
CHURN_PATH <- here("exploratory_analyses/03_systematic_sample/data/churn_overtime.csv")
churn <- read_csv(CHURN_PATH, col_names = c("subreddit", "created_bin", "in_churn",
"inout_sum", "comment_length_type")) %>%
filter(comment_length_type == "all") %>%
left_join(author_counts) %>%
mutate(subreddit = fct_reorder(subreddit, author_n)) %>%
filter(author_n > 100)
in_churn <- churn %>%
group_by(subreddit) %>%
summarize(mean_churn = mean(in_churn))
ggplot(churn, aes(x = created_bin, y = in_churn, group = subreddit)) +
geom_hline(data = in_churn, aes(yintercept = mean_churn), linetype = 2, color = "red") +
geom_hline(aes(yintercept = .5)) +
geom_smooth() +
ylim(0,1) +
xlab("week") +
ylab("Churn-in") +
facet_wrap(~subreddit) +
theme_classic() +
theme(axis.text.x = element_text(angle = 90))
churn %>%
ggplot(aes(x = created_bin, y = in_churn, color = log(author_n),
group = subreddit)) +
scale_color_viridis(alpha = .5) +
geom_smooth(se = F) +
xlab("week") +
ylab("Churn-in") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90))
in_churn %>%
ggplot(aes(x = mean_churn)) +
xlab("Churn-in") +
geom_histogram() +
theme_classic()
This is the mean score across comments in 1-week bins (all comments). (This should scale with community size, right?)
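The score file is likewise precomputed; the binning is presumably something like the following (same hypothetical `comments` table, here also assuming a `score` column):

# Sketch (assumed raw data): mean comment score per subreddit per week.
library(dplyr)
library(lubridate)

score_overtime_sketch <- comments %>%
  mutate(created_bin = floor_date(as_datetime(created_utc), "week")) %>%
  group_by(subreddit, created_bin) %>%
  summarize(mean_score = mean(score), .groups = "drop")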
SCORE_PATH <- here("exploratory_analyses/03_systematic_sample/data/score_overtime.csv")
scores<- read_csv(SCORE_PATH, col_names = c( "created_bin", "mean_score",
"comment_length_type", "subreddit")) %>%
filter(comment_length_type == "all") %>%
left_join(author_counts) %>%
mutate(subreddit = fct_reorder(subreddit, author_n),
mean_score = as.numeric(mean_score),
created_bin = lubridate::round_date(as.POSIXct(created_bin), "week")) %>%
filter(author_n > 100) %>%
mutate(mean_score_normalized = mean_score/author_n)
score_mean <- scores %>%
group_by(subreddit) %>%
summarize(mean_score = mean(mean_score),
mean_score_normalized = mean(mean_score_normalized))
ggplot(scores, aes(x = created_bin, y = mean_score, group = subreddit,
color = log(author_n))) +
geom_smooth(se = F) +
ylab("Score (log)") +
scale_y_log10() +
scale_color_viridis(alpha = .8) +
theme_classic() +
theme(axis.text.x = element_text(angle = 90))
ggplot(scores, aes(x = created_bin, y = mean_score_normalized, group = subreddit,
color = log(author_n))) +
geom_smooth(se = F) +
ylab("Score/author_n (log)") +
scale_y_log10() +
scale_color_viridis(alpha = .8) +
theme_classic() +
theme(axis.text.x = element_text(angle = 90))
Document entropy over time, based on topic models (using the longest post for each author per week).
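The entropy file comes from topic models fit elsewhere; a minimal sketch of "document entropy", assuming a hypothetical document-by-topic probability matrix `doc_topics` (rows sum to 1):

# Sketch (assumed input): Shannon entropy of each document's topic distribution;
# these values would then be averaged per subreddit-week upstream.
shannon_entropy <- function(p) {
  p <- p[p > 0]            # drop zero-probability topics (0 * log(0) treated as 0)
  -sum(p * log(p))
}

document_entropy <- apply(doc_topics, 1, shannon_entropy)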
ENTROPY_PATH <- here("exploratory_analyses/03_systematic_sample/data/subreddit_post_entropy_overtime.csv")
comment_entropy<- read_csv(ENTROPY_PATH, col_names = c( "subreddit", "created_bin",
"mean_document_entropy", "n")) %>%
left_join(author_counts) %>%
mutate(subreddit = fct_reorder(subreddit, author_n),
created_bin = lubridate::round_date(as.POSIXct(created_bin), "week")) %>%
filter(author_n > 100)
entropy_mean <- comment_entropy %>%
group_by(subreddit) %>%
summarize(mean_document_entropy = mean(mean_document_entropy))
ggplot(comment_entropy, aes(x = created_bin, y = mean_document_entropy, group = subreddit,
color = log(author_n))) +
geom_smooth(se = F) +
ylab("Score/author_n (log)") +
scale_color_viridis(alpha = .8) +
theme_classic() +
theme(axis.text.x = element_text(angle = 90))
author_count_measures %>%
select(subreddit, word_total, author_n, comments_n_long, posts_n_all, comments_posts_ratio) %>%
left_join(score_mean %>% select(subreddit, mean_score_normalized), by = "subreddit") %>%
left_join(lag_mean, by = "subreddit") %>%
left_join(author_time %>% select(subreddit, author_longevity_mean, author_lag_mean),
by = "subreddit") %>%
left_join(in_churn, by= "subreddit") %>%
left_join(entropy_mean, by = "subreddit") %>%
left_join(mean_pairwise_JSD %>% select(subreddit, mean_JSD), by = "subreddit") %>%
left_join(author_ineq_long %>% select(subreddit, comment_gini_coeff), by = "subreddit") %>%
mutate_at(vars(word_total, author_n, comments_n_long, posts_n_all, comments_posts_ratio,
mean_score_normalized), log) %>%
select(-subreddit) %>%
make_corr_plot()
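`make_corr_plot()` is defined elsewhere in the repo; a minimal stand-in, assuming it just visualizes the pairwise correlation matrix (here with the `corrplot` package), would be:

# Sketch: a stand-in for make_corr_plot(), assuming it plots pairwise correlations.
library(corrplot)

make_corr_plot <- function(df) {
  m <- cor(df, use = "pairwise.complete.obs")
  corrplot(m, method = "color", type = "upper", tl.col = "black", addCoef.col = "black")
}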
Comments-to-posts ratio as a function of N authors: exponent = 0.17.