AUTHOR_COUNTS <- here("exploratory_analyses/04_systematic_sample_tidy/data/subreddit_meta.csv")

author_count_measures <- read_csv(AUTHOR_COUNTS,
                                  col_names = c("subreddit", "author_n", "word_H",
                                                "word_mean_n", "word_sd", "word_total",
                                                "score_mean", "score_sd", "score_H",
                                                "comments_n_all", "posts_n_all",
                                                "comments_posts_ratio")) %>%
  filter(author_n > 100) %>%
  arrange(desc(author_n)) %>%
  filter(subreddit != "newsokur")

COUNTS <- "/Volumes/wilbur_the_great/LANGSCALES_subreddit_sample/misc/all_word_counts2.csv"
corpus_counts <- read_csv(COUNTS)

corpus_counts_with_freq <- corpus_counts %>%
  group_by(subreddit) %>%
  arrange(desc(corpus_2_counts)) %>%
  mutate(corpus_2_freq_rank = row_number()) %>%
  select(subreddit, total_counts, corpus_1_counts, corpus_2_freq_rank) %>%
  inner_join(author_count_measures %>% select(subreddit, author_n),
             by = "subreddit") %>%
  filter(author_n > 100) %>%
  ungroup() %>%
  mutate(subreddit = fct_reorder(subreddit, author_n))

This dataset includes all comments of at least 50 words in length (after cleaning). URLs have been removed, as have usernames (though some usernames remain). For each community, rank frequency and total word counts are computed from two separate corpora: each word's total count is split in two by a binomial sample (with p = .5), and 1 is added to each half (to deal with zeros). Rank is then computed from one corpus and frequency from the other, so that estimation noise in the two variables is independent. Here's what the data look like:
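As a minimal sketch (assuming a hypothetical raw table `raw_counts` with columns subreddit, word, and total_counts; the actual preprocessing script is not shown here), the split-half corpora could be generated like this:

set.seed(1)
# Split each word's total count in two with a binomial draw, then add 1
# to each half so no word ends up with a zero count. Note that the two
# halves therefore sum to total_counts + 2, consistent with the table below.
counts_split <- raw_counts %>%
  mutate(corpus_1_counts = rbinom(n(), size = total_counts, prob = .5) + 1,
         corpus_2_counts = (total_counts - (corpus_1_counts - 1)) + 1)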

kable(corpus_counts %>% slice(1:10))
| subreddit      | word | total_counts | corpus_1_counts | corpus_2_counts |
|:---------------|:-----|-------------:|----------------:|----------------:|
| AdrianaChechik | the  |          513 |             257 |             258 |
| AdrianaChechik | to   |          421 |             218 |             205 |
| AdrianaChechik | and  |          414 |             202 |             214 |
| AdrianaChechik | this |          317 |             151 |             168 |
| AdrianaChechik | i    |          262 |             140 |             124 |
| AdrianaChechik | a    |          246 |             120 |             128 |
| AdrianaChechik | of   |          230 |             109 |             123 |
| AdrianaChechik | you  |          209 |              98 |             113 |
| AdrianaChechik | was  |          199 |             112 |              89 |
| AdrianaChechik | not  |          195 |             106 |              91 |

All words

zipf_params <- corpus_counts_with_freq %>%
  nest(data = -subreddit) %>%
  mutate(temp = map(data,
                    ~get_power_law_exponent(.x, "corpus_2_freq_rank", "corpus_1_counts"))) %>%
  select(-data) %>%
  unnest(temp) %>%
  mutate(x = 10000,
         y = 10000,
         reference_slope = -1)
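get_power_law_exponent() is a project helper whose definition is not shown in this document. A plausible minimal sketch, assuming it fits the exponent by OLS on log10-log10 scales (consistent with how the fitted lines are drawn in the log-log plots below):

# Assumed helper (definition not shown here): fit y ~ x^slope by least
# squares on log10 scales and return the coefficients plus a plot label.
get_power_law_exponent <- function(df, x_var, y_var) {
  fit <- lm(log10(df[[y_var]]) ~ log10(df[[x_var]]))
  tibble::tibble(
    intercept_value = unname(coef(fit)[1]),
    slope_value     = unname(coef(fit)[2]),
    slope_print     = paste0("slope = ", round(coef(fit)[2], 2))
  )
}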

Hex plot

ggplot(corpus_counts_with_freq, aes(x = corpus_2_freq_rank, y = corpus_1_counts)) +
  geom_hex(binwidth = .08) +
  facet_wrap(~subreddit) +
  scale_fill_viridis_c(trans = "log", breaks = c(10, 100, 1000, 10000), direction = -1) +
  scale_y_log10(name = "N total words (log)",
                labels = scales::trans_format("log10",
                                              scales::math_format(10^.x))) +
  scale_x_log10(name = "Rank Frequency (log)",
                labels = scales::trans_format("log10",
                                              scales::math_format(10^.x))) +
  geom_text(data = zipf_params, aes(label = slope_print, x = x, y = y),
            color = "black", size = 2) +
  geom_abline(data = zipf_params,
              aes(intercept = intercept_value, slope = slope_value), size = .5, color = "red") +
  geom_abline(data = zipf_params, linetype = 2,
              aes(intercept = intercept_value, slope = reference_slope)) +
  theme_classic()

Line plot

corpus_counts_with_freq %>%
  ggplot(aes(x = corpus_2_freq_rank, y = corpus_1_counts)) +
  facet_wrap(~subreddit) +
  geom_line(color = "grey") +
  scale_y_log10(name = "N total words (log)",
                labels = scales::trans_format("log10",
                                              scales::math_format(10^.x))) +
  scale_x_log10(name = "Rank Frequency (log)",
                labels = scales::trans_format("log10",
                                              scales::math_format(10^.x))) +
  geom_abline(data = zipf_params,
              aes(intercept = intercept_value, slope = slope_value), size = .5, color = "red") +
  geom_abline(data = zipf_params, linetype = 2,
              aes(intercept = intercept_value, slope = reference_slope)) +
  geom_text(data = zipf_params, aes(label = slope_print, x = x, y = y),
            color = "black", size = 2) +
  theme_classic()

Zipf and community size

community_zipf <- zipf_params %>%
  left_join(author_count_measures, by = "subreddit") %>%
  select(where(is.numeric)) %>%
  select(-x, -y, -reference_slope)

ggplot(community_zipf, aes(x = author_n, y = slope_value)) +
  geom_point(size = 4, alpha = .5) +
  scale_x_log10(name = "N Authors (log)",
                labels = scales::trans_format("log10",
                                              scales::math_format(10^.x))) +
  geom_hline(aes(yintercept = -1), linetype = 2) +
  geom_smooth(method = "lm") +
  ylab("Zipf Parameter") +
  theme_classic()

cor.test(community_zipf$slope_value, log(community_zipf$author_n))
## 
##  Pearson's product-moment correlation
## 
## data:  community_zipf$slope_value and log(community_zipf$author_n)
## t = -11.484, df = 63, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.8883065 -0.7240031
## sample estimates:
##        cor 
## -0.8226378

Types

word_tokens_type <- corpus_counts_with_freq %>%
  group_by(subreddit) %>%
  summarize(n_tokens = sum(total_counts),
            n_types = n()) %>%
  inner_join(author_count_measures, by = "subreddit")

exp_value <- get_power_law_exponent(word_tokens_type, "author_n","n_types")

word_tokens_type %>%
  ggplot(aes(x = author_n, y = n_types)) +
  geom_point(size = 4, alpha = .4) +
  geom_abline(slope = exp_value$slope_value, intercept = exp_value$intercept_value, color = "red") +
  geom_abline(slope = 1, linetype = 2, intercept = exp_value$intercept_value) +
  ggtitle("Number of Word Types vs. Community Size") +
  scale_y_log10(name = "N word types (log)",
                labels = scales::trans_format("log10", scales::math_format(10^.x))) +
  scale_x_log10(name = "N authors (log)",
                labels = scales::trans_format("log10", scales::math_format(10^.x))) +
  annotation_logticks() +
  annotate("text", label = exp_value$slope_print,
           x = 50000, y = 20000, color = "red", size = 5) +
  theme_classic(base_size = 12)

Tokens

exp_value <- get_power_law_exponent(word_tokens_type, "author_n","n_tokens")

word_tokens_type %>%
  ggplot(aes(x = author_n, y = n_tokens)) +
  geom_point(size = 4, alpha = .4) +
  geom_abline(slope = exp_value$slope_value, intercept = exp_value$intercept_value, color = "red") +
  geom_abline(slope = 1, linetype = 2, intercept = exp_value$intercept_value) +
  ggtitle("Number of Word Tokens vs. Community Size") +
  scale_y_log10(name = "N tokens (log)",
                labels = scales::trans_format("log10", scales::math_format(10^.x))) +
  scale_x_log10(name = "N authors (log)",
                labels = scales::trans_format("log10", scales::math_format(10^.x))) +
  annotation_logticks() +
  annotate("text", label = exp_value$slope_print,
           x = 50000, y = 50000, color = "red", size = 5) +
  theme_classic(base_size = 12)

Pairwise correlations

word_tokens_type %>%
  left_join(zipf_params %>% select(subreddit, slope_value), by = "subreddit") %>%
  select(slope_value, author_n, n_tokens, n_types) %>%
  mutate(across(everything(), ~ log(abs(.x)))) %>%
  make_corr_plot()
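make_corr_plot() is another project helper not defined in this document. A minimal sketch of what it might do, here assumed to be a pairwise Pearson correlation matrix drawn with the corrplot package (an assumption, not the project's actual implementation):

# Assumed helper: compute pairwise Pearson correlations and draw them
# as a lower-triangle matrix with the coefficients printed in each cell.
make_corr_plot <- function(df) {
  corrplot::corrplot(cor(df, use = "pairwise.complete.obs"),
                     method = "number", type = "lower")
}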

x_min = 200

The fits above are dominated by the highest-frequency words, where empirical rank-frequency curves typically bend away from a pure power law. The analyses are repeated below using only words with rank 200 or greater.

zipf_params_200 <- corpus_counts_with_freq %>%
  filter(corpus_2_freq_rank >= 200) %>%
  nest(data = -subreddit) %>%
  mutate(temp = map(data,
                    ~get_power_law_exponent(.x, "corpus_2_freq_rank", "corpus_1_counts"))) %>%
  select(-data) %>%
  unnest(temp) %>%
  mutate(x = 10000,
         y = 10000,
         reference_slope = -1)

Hex plot

corpus_counts_with_freq %>%
  filter(corpus_2_freq_rank >= 200) %>%
  ggplot(aes(x = corpus_2_freq_rank, y = corpus_1_counts)) +
  geom_hex(binwidth = .08) +
  facet_wrap(~subreddit) +
  scale_fill_viridis_c(trans = "log", breaks = c(10, 100, 1000, 10000), direction = -1) +
  scale_y_log10(name = "N total words (log)",
                labels = scales::trans_format("log10",
                                              scales::math_format(10^.x))) +
  scale_x_log10(name = "Rank Frequency (log)",
                labels = scales::trans_format("log10",
                                              scales::math_format(10^.x))) +
  geom_text(data = zipf_params_200, aes(label = slope_print, x = x, y = y),
            color = "black", size = 2) +
  geom_abline(data = zipf_params_200,
              aes(intercept = intercept_value, slope = slope_value), size = .5, color = "red") +
  geom_abline(data = zipf_params_200, linetype = 2,
              aes(intercept = intercept_value, slope = reference_slope)) +
  theme_classic()

Line plot

corpus_counts_with_freq %>%
  filter(corpus_2_freq_rank >= 200) %>%  # match the cutoff used for the fits
  ggplot(aes(x = corpus_2_freq_rank, y = corpus_1_counts)) +
  facet_wrap(~subreddit) +
  geom_line(color = "grey") +
  scale_y_log10(name = "N total words (log)",
                labels = scales::trans_format("log10",
                                              scales::math_format(10^.x))) +
  scale_x_log10(name = "Rank Frequency (log)",
                labels = scales::trans_format("log10",
                                              scales::math_format(10^.x))) +
  geom_abline(data = zipf_params_200,
              aes(intercept = intercept_value, slope = slope_value), size = .5, color = "red") +
  geom_abline(data = zipf_params_200, linetype = 2,
              aes(intercept = intercept_value, slope = reference_slope)) +
  geom_text(data = zipf_params_200, aes(label = slope_print, x = x, y = y),
            color = "black", size = 2) +
  theme_classic()

Zipf and community size

community_zipf_200 <- zipf_params_200 %>%
  left_join(author_count_measures, by = "subreddit") %>%
  select(where(is.numeric)) %>%
  select(-x, -y, -reference_slope)

ggplot(community_zipf_200, aes(x = author_n, y = slope_value)) +
  geom_point(size = 4, alpha = .5) +
  scale_x_log10(name = "N Authors (log)",
                labels = scales::trans_format("log10",
                                              scales::math_format(10^.x))) +
  geom_hline(aes(yintercept = -1), linetype = 2) +
  geom_smooth(method = "lm") +
  ylab("Zipf Parameter") +
  theme_classic()

cor.test(community_zipf_200$slope_value, 
         log(community_zipf_200$author_n))
## 
##  Pearson's product-moment correlation
## 
## data:  community_zipf_200$slope_value and log(community_zipf_200$author_n)
## t = -14.99, df = 63, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.9276947 -0.8156762
## sample estimates:
##        cor 
## -0.8837581

Types

word_tokens_type_200 <- corpus_counts_with_freq %>%
  filter(corpus_2_freq_rank >= 200) %>%  # match the cutoff used for the fits
  group_by(subreddit) %>%
  summarize(n_tokens = sum(total_counts),
            n_types = n()) %>%
  inner_join(author_count_measures, by = "subreddit")

exp_value <- get_power_law_exponent(word_tokens_type_200, "author_n","n_types")

word_tokens_type_200 %>%
  ggplot(aes(x = author_n, y = n_types)) +
  geom_point(size = 4, alpha = .4) +
  geom_abline(slope = exp_value$slope_value, intercept = exp_value$intercept_value, color = "red") +
  geom_abline(slope = 1, linetype = 2, intercept = exp_value$intercept_value) +
  ggtitle("Number of Word Types vs. Community Size") +
  scale_y_log10(name = "N word types (log)",
                labels = scales::trans_format("log10", scales::math_format(10^.x))) +
  scale_x_log10(name = "N authors (log)",
                labels = scales::trans_format("log10", scales::math_format(10^.x))) +
  annotation_logticks() +
  annotate("text", label = exp_value$slope_print,
           x = 50000, y = 20000, color = "red", size = 5) +
  theme_classic(base_size = 12)

Tokens

exp_value <- get_power_law_exponent(word_tokens_type_200, "author_n","n_tokens")

word_tokens_type_200 %>%
  ggplot(aes(x = author_n, y = n_tokens)) +
  geom_point(size = 4, alpha = .4) +
  geom_abline(slope = exp_value$slope_value, intercept = exp_value$intercept_value, color = "red") +
  geom_abline(slope = 1, linetype = 2, intercept = exp_value$intercept_value) +
  ggtitle("Number of Word Tokens vs. Community Size") +
  scale_y_log10(name = "N tokens (log)",
                labels = scales::trans_format("log10", scales::math_format(10^.x))) +
  scale_x_log10(name = "N authors (log)",
                labels = scales::trans_format("log10", scales::math_format(10^.x))) +
  annotation_logticks() +
  annotate("text", label = exp_value$slope_print,
           x = 50000, y = 50000, color = "red", size = 5) +
  theme_classic(base_size = 12)

Pairwise correlations

word_tokens_type_200 %>%
  left_join(zipf_params_200 %>% select(subreddit, slope_value), by = "subreddit") %>%
  select(slope_value, author_n, n_tokens, n_types) %>%
  mutate(across(everything(), ~ log(abs(.x)))) %>%
  make_corr_plot()

Dictionary words only

The analyses are repeated once more using only words in the GradyAugmented word list (an English dictionary from the qdapDictionaries package), which drops most usernames, misspellings, and other non-words.

# GradyAugmented is an English word list from the qdapDictionaries package
grady_words <- data.frame(word = GradyAugmented)

corpus_counts_with_freq_grady <- corpus_counts %>%
  inner_join(grady_words, by = "word") %>%  # keep only corpus words found in the dictionary
  group_by(subreddit) %>%
  arrange(desc(corpus_2_counts)) %>%
  mutate(corpus_2_freq_rank = row_number()) %>%
  select(subreddit, total_counts, corpus_1_counts, corpus_2_freq_rank) %>%
  inner_join(author_count_measures %>% select(subreddit, author_n),
             by = "subreddit") %>%
  ungroup() %>%
  mutate(subreddit = fct_reorder(subreddit, author_n))

zipf_params_grady <- corpus_counts_with_freq_grady %>%
  nest(data = -subreddit) %>%
  mutate(temp = map(data,
                    ~get_power_law_exponent(.x, "corpus_2_freq_rank", "corpus_1_counts"))) %>%
  select(-data) %>%
  unnest(temp) %>%
  mutate(x = 10000,
         y = 10000,
         reference_slope = -1)

Hex plot

corpus_counts_with_freq_grady %>%
  ggplot(aes(x = corpus_2_freq_rank, y = corpus_1_counts)) +
  geom_hex(binwidth = .08) +
  facet_wrap(~subreddit) +
  scale_fill_viridis_c(trans = "log", breaks = c(10, 100, 1000, 10000), direction = -1) +
  scale_y_log10(name = "N total words (log)",
                labels = scales::trans_format("log10",
                                              scales::math_format(10^.x))) +
  scale_x_log10(name = "Rank Frequency (log)",
                labels = scales::trans_format("log10",
                                              scales::math_format(10^.x))) +
  geom_text(data = zipf_params_grady, aes(label = slope_print, x = x, y = y),
            color = "black", size = 2) +
  geom_abline(data = zipf_params_grady,
              aes(intercept = intercept_value, slope = slope_value), size = .5, color = "red") +
  geom_abline(data = zipf_params_grady, linetype = 2,
              aes(intercept = intercept_value, slope = reference_slope)) +
  theme_classic()

Line plot

corpus_counts_with_freq_grady %>%
  ggplot(aes(x = corpus_2_freq_rank, y = corpus_1_counts)) +
  facet_wrap(~subreddit) +
  geom_line(color = "grey") +
  scale_y_log10(name = "N total words (log)",
                labels = scales::trans_format("log10",
                                              scales::math_format(10^.x))) +
  scale_x_log10(name = "Rank Frequency (log)",
                labels = scales::trans_format("log10",
                                              scales::math_format(10^.x))) +
  geom_abline(data = zipf_params_grady,
              aes(intercept = intercept_value, slope = slope_value), size = .5, color = "red") +
  geom_abline(data = zipf_params_grady, linetype = 2,
              aes(intercept = intercept_value, slope = reference_slope)) +
  geom_text(data = zipf_params_grady, aes(label = slope_print, x = x, y = y),
            color = "black", size = 2) +
  theme_classic()

Zipf and community size

community_zipf_grady <- zipf_params_grady %>%
  left_join(author_count_measures, by = "subreddit") %>%
  select(where(is.numeric)) %>%
  select(-x, -y, -reference_slope)

ggplot(community_zipf_grady, aes(x = author_n, y = slope_value)) +
  geom_point(size = 4, alpha = .5) +
  scale_x_log10(name = "N Authors (log)",
                labels = scales::trans_format("log10",
                                              scales::math_format(10^.x))) +
  geom_hline(aes(yintercept = -1), linetype = 2) +
  geom_smooth(method = "lm") +
  ylab("Zipf Parameter") +
  theme_classic()

cor.test(community_zipf_grady$slope_value, 
         log(community_zipf_grady$author_n))
## 
##  Pearson's product-moment correlation
## 
## data:  community_zipf_grady$slope_value and log(community_zipf_grady$author_n)
## t = -18.088, df = 63, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.9479052 -0.8650050
## sample estimates:
##        cor 
## -0.9157126

Types

word_tokens_type_grady <- corpus_counts_with_freq_grady %>%
  group_by(subreddit) %>%
  summarize(n_tokens = sum(total_counts),
            n_types = n()) %>%
  inner_join(author_count_measures, by = "subreddit")

exp_value <- get_power_law_exponent(word_tokens_type_grady, "author_n","n_types")

ggplot(word_tokens_type_grady, aes(x = author_n, y = n_types)) +
  geom_point(size = 4, alpha = .4) +
  geom_abline(slope = exp_value$slope_value, intercept = exp_value$intercept_value, color = "red") +
  geom_abline(slope = 1, linetype = 2, intercept = exp_value$intercept_value) +
  ggtitle("Number of Word Types vs. Community Size") +
  scale_y_log10(name = "N word types (log)",
                labels = scales::trans_format("log10", scales::math_format(10^.x))) +
  scale_x_log10(name = "N authors (log)",
                labels = scales::trans_format("log10", scales::math_format(10^.x))) +
  annotation_logticks() +
  annotate("text", label = exp_value$slope_print,
           x = 50000, y = 20000, color = "red", size = 5) +
  theme_classic(base_size = 12)


Tokens

exp_value <- get_power_law_exponent(word_tokens_type_grady, "author_n","n_tokens")

ggplot(word_tokens_type_grady, aes(x = author_n, y = n_tokens)) +
  geom_point(size = 4, alpha = .4) +
  geom_abline(slope = exp_value$slope_value, intercept = exp_value$intercept_value, color = "red") +
  geom_abline(slope = 1, linetype = 2, intercept = exp_value$intercept_value) +
  ggtitle("Number of Word Tokens vs. Community Size") +
  scale_y_log10(name = "N tokens (log)",
                labels = scales::trans_format("log10", scales::math_format(10^.x))) +
  scale_x_log10(name = "N authors (log)",
                labels = scales::trans_format("log10", scales::math_format(10^.x))) +
  annotation_logticks() +
  annotate("text", label = exp_value$slope_print,
           x = 50000, y = 50000, color = "red", size = 5) +
  theme_classic(base_size = 12)

Pairwise correlations

word_tokens_type_grady %>%
  left_join(zipf_params_grady %>% select(subreddit, slope_value), by = "subreddit") %>%
  select(slope_value, author_n, n_tokens, n_types) %>%
  mutate(across(everything(), ~ log(abs(.x)))) %>%
  make_corr_plot()