AUTHOR_COUNTS <- here("exploratory_analyses/03_systematic_sample/data/subreddit_counts_scores.csv")

author_count_measures <- read_csv(AUTHOR_COUNTS,
                                  col_names = c("subreddit", "author_n", "word_H", "word_mean_n",
                                                "word_sd", "word_total", "score_mean", "score_sd",
                                                "score_H", "comments_n_long", "comments_n_all",
                                                "posts_n_all", "comments_posts_ratio")) %>%
  filter(author_n > 100) %>%
  arrange(-author_n) %>%
  mutate(author_rank = 1:n())

author_counts <- author_count_measures %>%
  select(subreddit, author_n, author_rank)

Summary of findings:

  • There’s a robust Zipf’s-law relationship between the number of words and community size. I don’t think this has been shown before!
  • Larger communities produce more comments relative to posts (power law).
  • Larger communities have members with longer lifespans.
  • Larger communities have shorter lags between comments on a thread (power law).
  • Larger communities have less churn.
  • Larger communities have more inequality across authors in terms of posting: relatively fewer people do more of the posting.
  • Larger communities have lower (normalized) scores. This is a Zipf-like distribution.
  • Larger communities have lower comment entropy (though the effect is weak).
  • Larger communities have authors that are less coherent with respect to themselves, i.e., they change more over time.
Implications for language change:

  • Larger communities: more homogeneous input, because of inequality across speakers.
  • More consensus -> faster word gain.

Language counts

More people, more words. (There’s no relationship between mean/SD post length and author_n.)

Total words

r2 <- lm(log(author_n) ~ log(word_total), data = author_count_measures) %>%
  summary() %>%
  pluck("r.squared") %>%
  round(2)

Quantity of language as a function of community size follows a power law with an exponent of 1.03, i.e., approximately Zipf’s law (exponent ~1), and an \(R^2\) of 0.91!
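
The exponents quoted here and in the sections below are presumably the slopes of log-log fits like the one above. A minimal sketch of that computation; the helper name power_law_exponent is mine (hypothetical), not from the analysis scripts:

power_law_exponent <- function(df, x, y) {
  # Slope of log(y) ~ log(x), i.e., the exponent b in y ~ x^b.
  coef(lm(log(df[[y]]) ~ log(df[[x]])))[[2]]
}

round(power_law_exponent(author_count_measures, "author_n", "word_total"), 2)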

ggplot(author_count_measures, aes(x = author_n, y = word_total)) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_y_log10(name = "N total words (log)") +
  scale_x_log10(name = "N authors (log)") +
  theme_classic()

N comments (at least 100 words)

Exp: 1.13.

ggplot(author_count_measures, aes(x = author_n, y = comments_n_long)) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_y_log10(name = "N comments (log)") +
  scale_x_log10(name = "N authors (log)") +
  theme_classic()

N posts

Exp: 0.62.

A low exponent means posts grow sublinearly with community size: a community 10× larger has only about 10^0.62 ≈ 4× as many posts, so large communities have relatively few posts per author (more inequality).

ggplot(author_count_measures, aes(x = author_n, y = posts_n_all)) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_y_log10(name = "N posts (log)") +
  scale_x_log10(name = "N authors (log)") +
  theme_classic()

Comments-to-posts ratio

Exp: 0.17.

ggplot(author_count_measures, aes(x = author_n, y = comments_posts_ratio)) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_y_log10(name = "comments/posts (log)") +
  scale_x_log10(name = "N authors (log)") +
  theme_classic()

cor.test(log(author_count_measures$author_n), log(author_count_measures$comments_posts_ratio)) %>%
  tidy() %>%
  kable()
| estimate | statistic | p.value | parameter | conf.low | conf.high | method | alternative |
|---------:|----------:|--------:|----------:|---------:|----------:|:-------|:------------|
| 0.3305478 | 2.779902 | 0.0071602 | 63 | 0.0942469 | 0.53159 | Pearson’s product-moment correlation | two.sided |

Author time

Variables looking at time with respect to a single author.

AUTHOR_TIME <- here("exploratory_analyses/03_systematic_sample/data/subreddit_author_time_data.csv")
author_time <- read_csv(AUTHOR_TIME,
                        col_names = c("subreddit", "author_longevity_mean", "author_sd_mean",
                                      "author_longevity_H", "author_lag_sd", "author_lag_H",
                                      "author_lag_mean")) %>%
  left_join(author_counts) %>%
  filter(author_n > 100)

Author longevity

Longevity is the time from an author’s first to last post; bigger communities have members with longer lifespans.
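
A minimal sketch of how this measure could be computed, assuming a comments table with subreddit, author, and created_utc (timestamp in seconds) columns; the original computation script isn't shown here:

author_longevity_sketch <- comments %>%
  group_by(subreddit, author) %>%
  # Longevity = time between an author's first and last comment.
  summarize(longevity = max(created_utc) - min(created_utc), .groups = "drop") %>%
  group_by(subreddit) %>%
  summarize(author_longevity_mean = mean(longevity))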

ggplot(author_time, aes(x = author_n, y = author_longevity_mean)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ylab("Author longevity") +
  scale_x_log10(name = "N authors (log)") +
  theme_classic()

cor.test(log(author_time$author_n), author_time$author_longevity_mean) %>%
  tidy() %>%
  kable()
| estimate | statistic | p.value | parameter | conf.low | conf.high | method | alternative |
|---------:|----------:|--------:|----------:|---------:|----------:|:-------|:------------|
| 0.4073817 | 3.540612 | 0.000757 | 63 | 0.1815183 | 0.5924182 | Pearson’s product-moment correlation | two.sided |

Author lag

Mean time between posts by the same author.

ggplot(author_time, aes(x = author_n, y = author_lag_mean)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ylab("Author lag") +
  scale_x_log10(name = "N authors (log)") +
  theme_classic()

Thread Lag

Within the same thread, how long does it take for a different person to respond?

For each subreddit, for each thread, I calculated the lag in seconds between comments from different people, then averaged across threads within the same week. The data below are for comments of any length.
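
A minimal sketch of that computation, assuming a comments table with subreddit, thread_id, author, and created_utc (seconds) columns; for brevity it averages over comment pairs within each week rather than over per-thread means, and the names are mine, not from the original scripts:

comment_lags <- comments %>%
  group_by(subreddit, thread_id) %>%
  arrange(created_utc, .by_group = TRUE) %>%
  # Lag between successive comments; keep only pairs from different people.
  mutate(lag_sec = created_utc - lag(created_utc),
         different_author = author != lag(author)) %>%
  ungroup() %>%
  filter(different_author)

lag_sketch <- comment_lags %>%
  mutate(created_bin = lubridate::round_date(
    as.POSIXct(created_utc, origin = "1970-01-01"), "week")) %>%
  group_by(subreddit, created_bin) %>%
  summarize(lag_sec = mean(lag_sec), n = n(), .groups = "drop")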

LAG_PATH <- here("exploratory_analyses/03_systematic_sample/data/thread_lag_overtime.csv")

lag <- read_csv(LAG_PATH, col_names = c("subreddit", "created_bin", "lag_sec", "n", "comment_length_type")) %>%
  filter(comment_length_type == "all") %>%
  left_join(author_counts) %>%
  mutate(subreddit = fct_reorder(subreddit, author_n))  %>%
  filter(author_n > 100) 

lag_mean <- lag %>%
  group_by(subreddit) %>%
  summarize(mean_lag = mean(lag_sec))

# ggplot(lag, aes(x = created_bin, y = lag_sec)) +
#   geom_hline(data = lag_mean, aes(yintercept = mean_lag), linetype = 2, color = "red") +
#   geom_smooth() +
#   ylab("Lag (log seconds)") +
#   scale_y_log10() +
#   facet_wrap(~subreddit) +
#   theme_classic() +
#   theme(axis.text.x = element_text(angle = 90))

Over time

ggplot(lag, aes(x = created_bin, y = lag_sec, group = subreddit, color = log(author_n))) +
  geom_smooth(se = F) +
  ylab("Lag (log seconds)") +
  scale_y_log10() +
  scale_color_viridis(alpha = .8) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90))

Mean lag and author n

lag_mean_with_n <- lag_mean %>%
  left_join(author_counts) 

Exp: -0.42.

ggplot(lag_mean_with_n, aes(x = author_n, y = mean_lag)) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_y_log10(name = "Lag (log seconds)") +
  scale_x_log10(name = "N authors (log)") +
  theme_classic()

Churn

For each subreddit, for each week, I calculated in/(in + out), where “in” = the number of authors posting in the community for the first time and “out” = the number posting for the last time. The data below use all comments (no restriction on length). Each panel shows a subreddit, with the red dashed line indicating the mean in-churn over time. Red lines greater than .5 indicate that a community is growing. Note that we somewhat underestimate overall growth by including the last time period (where everyone “dies”), but this is the same across communities.
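
A minimal sketch of the measure as described, assuming a comments table with subreddit, author, and created_bin (week) columns; the helper names are mine, not from the original scripts:

author_spans <- comments %>%
  group_by(subreddit, author) %>%
  summarize(first_bin = min(created_bin), last_bin = max(created_bin),
            .groups = "drop")

churn_sketch <- full_join(
  author_spans %>% count(subreddit, created_bin = first_bin, name = "n_in"),
  author_spans %>% count(subreddit, created_bin = last_bin, name = "n_out"),
  by = c("subreddit", "created_bin")) %>%
  mutate(across(c(n_in, n_out), ~ replace_na(.x, 0)),
         # in/(in + out): > .5 means more first-timers than leavers that week.
         in_churn = n_in / (n_in + n_out))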

Over time

CHURN_PATH <- here("exploratory_analyses/03_systematic_sample/data/churn_overtime.csv")

churn <- read_csv(CHURN_PATH, col_names = c("subreddit", "created_bin", "in_churn",
                                            "inout_sum", "comment_length_type")) %>%
  filter(comment_length_type == "all") %>%
  left_join(author_counts) %>%
  mutate(subreddit = fct_reorder(subreddit, author_n))  %>%
  filter(author_n > 100) 

in_churn <- churn %>%
  group_by(subreddit) %>%
  summarize(mean_churn = mean(in_churn))

ggplot(churn, aes(x = created_bin, y = in_churn, group = subreddit)) +
  geom_hline(data = in_churn, aes(yintercept = mean_churn), linetype = 2, color = "red") +
  geom_hline(aes(yintercept = .5)) +
  geom_smooth() +
  ylim(0,1) +
  xlab("week") +
  ylab("Churn-in") +
  facet_wrap(~subreddit) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90))

Over time (by community size)

churn %>%
  ggplot(aes(x = created_bin, y = in_churn, color = log(author_n), 
              group = subreddit)) +
  scale_color_viridis(alpha = .5) +
  geom_smooth(se = F) +
  xlab("week") +
  ylab("Churn-in") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90))

Mean churn rate

in_churn %>%
  ggplot(aes(x = mean_churn)) +
  xlab("Churn-in") +
  geom_histogram() +
  theme_classic()

Mean churn and author n

Bigger communities: less churn.

churn_with_author <- in_churn %>%
  left_join(author_counts) 

ggplot(churn_with_author, aes(x = author_n, y = mean_churn)) +
  geom_point() +
  ylab("Churn-in") +
  scale_x_log10(name = "N authors (log)") +
  geom_smooth(method = "lm") +
  theme_classic()

cor.test(log(churn_with_author$author_n), churn_with_author$mean_churn) %>%
  tidy() %>%
  kable()
| estimate | statistic | p.value | parameter | conf.low | conf.high | method | alternative |
|---------:|----------:|--------:|----------:|---------:|----------:|:-------|:------------|
| -0.2971083 | -2.469748 | 0.016242 | 63 | -0.5044525 | -0.0573662 | Pearson’s product-moment correlation | two.sided |

Author Distribution

How are comments distributed over authors? I use the Gini coefficient (higher values = more inequality). Bigger communities show more inequality over comments (some people contribute much more than others).

Also looked at entropy(number_of_posts_per_author)/log(n_authors).
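
A minimal sketch of both measures over a vector of per-author comment (or post) counts; the original computation isn't shown here, so treat this as an assumption about how those columns were built:

gini <- function(counts) {
  # Standard Gini formula over sorted values: 0 = equality, ~1 = one author dominates.
  x <- sort(counts)
  n <- length(x)
  sum((2 * seq_len(n) - n - 1) * x) / (n * sum(x))
}

normalized_author_H <- function(counts) {
  # Shannon entropy of the author distribution, normalized by its maximum, log(n_authors).
  p <- counts / sum(counts)
  -sum(p * log(p)) / log(length(counts))
}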

All comments

AUTHOR_INEQ <- here("exploratory_analyses/03_systematic_sample/data/subreddit_author_inequality_all.csv")

author_ineq <- read_csv(AUTHOR_INEQ,
                        col_names = c("subreddit", "comment_author_H", "comment_gini_coeff",
                                      "comment_normalized_author_H", "post_author_H",
                                      "post_gini_coeff", "post_normalized_author_H")) %>%
  left_join(author_counts) %>%
  mutate(subreddit = fct_reorder(subreddit, author_n)) %>%
  filter(author_n > 100)


ggplot(author_ineq, aes(x = author_n, y = comment_gini_coeff)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ylab("Comment Gini Coefficient") +
  scale_x_log10(name = "N authors (log)") +
  theme_classic()

cor.test(log(author_ineq$author_n), author_ineq$comment_gini_coeff) %>%
  tidy() %>%
  kable()
| estimate | statistic | p.value | parameter | conf.low | conf.high | method | alternative |
|---------:|----------:|--------:|----------:|---------:|----------:|:-------|:------------|
| 0.2545245 | 2.089025 | 0.0407509 | 63 | 0.0113287 | 0.4692908 | Pearson’s product-moment correlation | two.sided |

Long comments only

AUTHOR_INEQ_LONG <- here("exploratory_analyses/03_systematic_sample/data/subreddit_author_inequality_long.csv")

author_ineq_long <- read_csv(AUTHOR_INEQ_LONG,
                             col_names = c("subreddit", "comment_author_H",
                                           "comment_gini_coeff",
                                           "comment_normalized_author_H")) %>%
  left_join(author_counts) %>%
  mutate(subreddit = fct_reorder(subreddit, author_n)) %>%
  filter(author_n > 100)

ggplot(author_ineq_long, aes(x = author_n, y = comment_gini_coeff)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ylab("Comment Gini Coefficient") +
  scale_x_log10(name = "N authors (log)") +
  theme_classic()

cor.test(log(author_ineq_long$author_n), author_ineq_long$comment_gini_coeff) %>%
  tidy() %>%
  kable()
| estimate | statistic | p.value | parameter | conf.low | conf.high | method | alternative |
|---------:|----------:|--------:|----------:|---------:|----------:|:-------|:------------|
| 0.4488341 | 3.986627 | 0.0001768 | 63 | 0.230127 | 0.624382 | Pearson’s product-moment correlation | two.sided |

Score

This is the mean score across comments in 1-week bins (all comments). (This should scale with community size, right?)

Over time

SCORE_PATH <- here("exploratory_analyses/03_systematic_sample/data/score_overtime.csv")

scores <- read_csv(SCORE_PATH,
                   col_names = c("created_bin", "mean_score",
                                 "comment_length_type", "subreddit")) %>%
  filter(comment_length_type == "all") %>%
  left_join(author_counts) %>%
  mutate(subreddit = fct_reorder(subreddit, author_n),
         mean_score = as.numeric(mean_score),
         created_bin = lubridate::round_date(as.POSIXct(created_bin), "week")) %>%
  filter(author_n > 100) %>%
  mutate(mean_score_normalized = mean_score / author_n)

score_mean <- scores %>%
  group_by(subreddit) %>%
  summarize(mean_score = mean(mean_score),
            mean_score_normalized = mean(mean_score_normalized))

ggplot(scores, aes(x = created_bin, y = mean_score, group = subreddit, 
                   color = log(author_n))) +
  geom_smooth(se = F) +
  ylab("Score (log)") +
  scale_y_log10() +
  scale_color_viridis(alpha = .8) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90))

ggplot(scores, aes(x = created_bin, y = mean_score_normalized, group = subreddit, 
                   color = log(author_n))) +
  geom_smooth(se = F) +
  ylab("Score/author_n (log)") +
  scale_y_log10() +
  scale_color_viridis(alpha = .8) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90))

Mean score and author n

score_with_author <- score_mean %>%
  left_join(author_counts) 

Normalized score exp: -0.9.

ggplot(score_with_author, aes(x = author_n, y = mean_score)) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_y_log10(name = "Score (log)") +
  scale_x_log10(name = "N authors (log)") +
  theme_classic()

ggplot(score_with_author, aes(x = author_n, y = mean_score_normalized)) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_y_log10(name = "Score/author_n (log)") +
  scale_x_log10(name = "N authors (log)") +
  theme_classic()

Comment Entropy

Based on topic models (using the longest post for each author per week).
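
A minimal sketch of the document-level quantity, assuming theta is a document's topic-probability vector from the topic model (the entropy scripts aren't shown here); these values are then averaged within subreddit-week bins:

doc_entropy <- function(theta) {
  # Shannon entropy of one document's topic distribution.
  -sum(ifelse(theta > 0, theta * log(theta), 0))
}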

Over time

ENTROPY_PATH <- here("exploratory_analyses/03_systematic_sample/data/subreddit_post_entropy_overtime.csv")

comment_entropy<- read_csv(ENTROPY_PATH, col_names = c( "subreddit", "created_bin", 
                                             "mean_document_entropy", "n")) %>%
  left_join(author_counts) %>%
  mutate(subreddit = fct_reorder(subreddit, author_n),
         created_bin = lubridate::round_date(as.POSIXct(created_bin), "week")) %>%
  filter(author_n > 100) 

entropy_mean <- comment_entropy %>%
  group_by(subreddit) %>%
  summarize(mean_document_entropy = mean(mean_document_entropy))

ggplot(comment_entropy, aes(x = created_bin, y = mean_document_entropy, group = subreddit,
                            color = log(author_n))) +
  geom_smooth(se = F) +
  ylab("Mean document entropy") +
  scale_color_viridis(alpha = .8) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90))

Mean entropy and author n

entropy_counts <- entropy_mean %>%
  left_join(author_counts) 

ggplot(entropy_counts, aes(x = author_n, y = mean_document_entropy)) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_x_log10(name = "N authors (log)") +
  theme_classic()

cor.test(log(entropy_counts$author_n), entropy_counts$mean_document_entropy) %>%
  tidy() %>%
  kable()
| estimate | statistic | p.value | parameter | conf.low | conf.high | method | alternative |
|---------:|----------:|--------:|----------:|---------:|----------:|:-------|:------------|
| -0.2421378 | -1.980856 | 0.051977 | 63 | -0.4589336 | 0.0018718 | Pearson’s product-moment correlation | two.sided |

Author Pairwise JSD

How similar is an author’s nth post to their (n + 1)th post?
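
A minimal sketch of Jensen-Shannon divergence between two posts’ topic distributions, assuming p and q are probability vectors over the same topics (the original JSD script isn't shown here):

jsd <- function(p, q) {
  # Symmetrized, smoothed KL divergence; with log base 2 it is bounded in [0, 1].
  m <- (p + q) / 2
  kl <- function(a, b) sum(ifelse(a > 0, a * log2(a / b), 0))
  (kl(p, m) + kl(q, m)) / 2
}

jsd(c(.7, .2, .1), c(.1, .2, .7))  # same topics, reversed emphasis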

Over time

## Nth similarity
PAIRWISE_TOPIC_JSD <- "/Volumes/wilbur_the_great/LANGSCALES_subreddit_sample/jsd_nth_post/"

nth_post_data <- map_df(list.files(PAIRWISE_TOPIC_JSD, full.names = T),
                        ~{read_csv(.x) %>% mutate(subreddit = .x)}) %>%
  mutate(subreddit = str_replace(subreddit, paste0(PAIRWISE_TOPIC_JSD, "/"), ""),
         subreddit = str_replace(subreddit, "_previous_jsd_nth_post.csv", ""))

over_individual_time <- nth_post_data  %>%
  select(author, nth_post, previous_author_JSD, subreddit) %>%
  gather("measure", "value", -nth_post, -subreddit, -author) %>%
  select(-measure) %>%
  filter(!is.na(value))

over_individual_time_ms <- over_individual_time %>%
  group_by(subreddit, nth_post) %>%
  summarize(mean_JSD = mean(value, na.rm = T),
            n = n()) %>%
  ungroup() %>%
  filter(nth_post <= 50) %>%
  left_join(author_counts) %>%
  mutate(subreddit = fct_reorder(subreddit, author_n))


over_individual_time_ms %>%
  filter(!is.na(author_n)) %>%
  filter(author_n > 100) %>%
  ggplot(aes(x = nth_post, y = mean_JSD, group = subreddit, 
                   color = log(author_n))) +
  geom_smooth(se = F) +
  ylab("mean author pairwise comment JSD") +
  scale_color_viridis(alpha = .8) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90))

Mean JSD and author N

mean_pairwise_JSD <- over_individual_time_ms %>%
  group_by(subreddit) %>%
  summarize(mean_JSD = mean(mean_JSD)) %>%
  left_join(author_counts)

ggplot(mean_pairwise_JSD, aes(x = author_n, y = mean_JSD)) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_x_log10(name = "N authors (log)") +
  theme_classic()

cor.test(log(mean_pairwise_JSD$author_n), mean_pairwise_JSD$mean_JSD) %>%
  tidy() %>%
  kable()
| estimate | statistic | p.value | parameter | conf.low | conf.high | method | alternative |
|---------:|----------:|--------:|----------:|---------:|----------:|:-------|:------------|
| 0.3151385 | 2.635631 | 0.0105597 | 63 | 0.0771709 | 0.5191358 | Pearson’s product-moment correlation | two.sided |

Pairwise correlations

author_count_measures %>%
  select(subreddit, word_total, author_n, comments_n_long, posts_n_all, comments_posts_ratio) %>%
  left_join(score_mean %>% select(subreddit, mean_score_normalized), by = "subreddit") %>%
  left_join(lag_mean, by = "subreddit") %>%
  left_join(author_time %>% select(subreddit, author_longevity_mean, author_lag_mean),
            by = "subreddit") %>%
  left_join(in_churn, by = "subreddit") %>%
  left_join(entropy_mean, by = "subreddit") %>%
  left_join(mean_pairwise_JSD %>% select(subreddit, mean_JSD), by = "subreddit") %>%
  left_join(author_ineq_long %>% select(subreddit, comment_gini_coeff), by = "subreddit") %>%
  mutate_at(vars(word_total, author_n, comments_n_long, posts_n_all, comments_posts_ratio,
                 mean_score_normalized), log) %>%
  select(-subreddit) %>%
  make_corr_plot()