Reddit Communities

# Packages used in this section (if not already loaded in a setup chunk)
library(here)       # project-relative paths
library(tidyverse)  # read_csv, dplyr verbs, ggplot2
library(ggrepel)    # geom_text_repel
library(tidyboot)   # tidyboot_mean

TYPE_PATH <- here("data/word_type_counts.csv")
type_counts <- read_csv(TYPE_PATH)

AUTHOR_COUNTS <- here("exploratory_analyses/03_systematic_sample/data/subreddit_counts_scores.csv")
author_count_measures <- read_csv(AUTHOR_COUNTS,
                                  col_names = c("subreddit", "author_n", "word_H",
                                                "word_mean_n", "word_sd", "word_total",
                                                "score_mean", "score_sd", "score_H",
                                                "comments_n_long", "comments_n_all",
                                                "posts_n_all", "comments_posts_ratio")) %>%
  filter(author_n > 100) %>%              # keep communities with more than 100 authors
  arrange(desc(author_n)) %>%
  mutate(author_rank = 1:n()) %>%         # rank communities by size
  filter(subreddit != "newsokur") %>%     # drop newsokur (Japanese-language subreddit)
  left_join(type_counts)


author_counts <- author_count_measures %>%
  select(subreddit, author_n, author_rank)

Tokens

exp_value <- get_power_law_exponent(author_count_measures, "author_n","word_total")
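
For reference, a minimal sketch of what get_power_law_exponent presumably does (the real definition lives elsewhere in the repo): fit log10(y) ~ log10(x) by OLS, so the slope is the power-law exponent. The plotting code below relies only on element 3 being a formatted label and element 4 being the log10 intercept; everything else here is an assumption.

# Hypothetical sketch -- not the project's actual implementation
get_power_law_exponent_sketch <- function(df, x, y) {
  fit <- lm(log10(df[[y]]) ~ log10(df[[x]]))         # power laws are linear on log-log axes
  list(coef(fit)[[2]],                               # [[1]] slope = exponent (assumed position)
       summary(fit)$r.squared,                       # [[2]] fit quality (assumed)
       paste0("slope = ", round(coef(fit)[[2]], 2)), # [[3]] label used in annotate()
       coef(fit)[[1]])                               # [[4]] log10 intercept used in geom_abline()
}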


label_data <- author_count_measures %>%
  filter(subreddit %in% c("DisneyTravel", "movies", "neoliberal", "JustCause", "FinancialCareers"))

#pdf(here("/exploratory_analyses/03_systematic_sample/plots/nwords.pdf"),width = 6, height = 6)

ggplot(author_count_measures, aes(x = author_n, y = word_total)) +
  geom_point(size = 4, alpha = .4) +
  geom_smooth(method = "lm") +
  ggtitle("Number of Words vs. Community Size") +
  geom_text_repel(data = label_data, aes(label = subreddit), min.segment.length = .2) +
  scale_y_log10(name = "N total words (log)",
                labels = scales::trans_format("log10", scales::math_format(10^.x))) +
  scale_x_log10(name = "N authors (log)",
                labels = scales::trans_format("log10", scales::math_format(10^.x))) +
  annotation_logticks() +
  geom_abline(slope = 1, linetype = 2, intercept = exp_value[[4]]) +  # dashed reference line with slope 1
  annotate("text", label = exp_value[[3]],
           x = 100000, y = 100000, color = "red", size = 5) +
  theme_classic()

#dev.off()

Types

exp_value2 <- get_power_law_exponent(author_count_measures, "author_n","n_word_types")


#pdf(here("/exploratory_analyses/03_systematic_sample/plots/nword_types.pdf"), 
#    width = 6, height = 6)

ggplot(author_count_measures, aes(x = author_n, y = n_word_types)) +
  geom_point(size = 4, alpha = .4) +
  geom_smooth(method = "lm") +
  ggtitle("Number of Word Types vs. Community Size") +
  geom_text_repel(data = label_data, aes(label = subreddit), min.segment.length = .2) +
  scale_y_log10(name = "N word types (log)",
                labels = scales::trans_format("log10", scales::math_format(10^.x)),
                breaks = c(10000, 100000, 1000000)) +
  scale_x_log10(name = "N authors (log)",
                labels = scales::trans_format("log10", scales::math_format(10^.x))) +
  annotation_logticks() +
  geom_abline(slope = 1, linetype = 2, intercept = exp_value2[[4]]) +
  annotate("text", label = exp_value2[[3]], x = 100000, y = 10000, color = "red", size = 5) +
  theme_classic()

#dev.off()

Token to type ratio

Because log(token:type) = log(tokens) - log(types), the log-log slope fitted here should equal the tokens exponent minus the types exponent from the two plots above; a quick check follows below.

author_count_measures_with_token_type <- author_count_measures %>%
  mutate(token_type_ratio = word_total/n_word_types)

exp_value3 <- get_power_law_exponent(author_count_measures_with_token_type, "author_n","token_type_ratio")
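
That identity can be checked directly with plain lm() fits (a sketch; exact agreement is expected because OLS slopes are linear in the response):

# Sketch: slope(token:type) should equal slope(tokens) - slope(types) on log-log axes
log_slope <- function(df, y) coef(lm(log10(df[[y]]) ~ log10(df[["author_n"]])))[[2]]
log_slope(author_count_measures_with_token_type, "word_total") -
  log_slope(author_count_measures_with_token_type, "n_word_types")
log_slope(author_count_measures_with_token_type, "token_type_ratio")  # should match the difference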

ggplot(author_count_measures_with_token_type, aes(x = author_n, y = token_type_ratio)) +
  geom_point(size = 4, alpha = .4) +
  ggtitle("Token:Type Ratio vs. Community Size") +
  scale_y_log10(name = "Token:Type (log)") +
  geom_abline(slope = 1, linetype = 2, intercept = exp_value3[[4]]) +
  geom_smooth(method = "lm") +
  scale_x_log10(name = "N authors (log)",
                labels = scales::trans_format("log10", scales::math_format(10^.x))) +
  annotate("text", label = exp_value3[[3]], x = 100000, y = 30, color = "red", size = 5) +
  annotation_logticks() +
  theme_classic()

Jane Austen

Number of types as the number of sampled tokens varies. Each point is the mean of 100 samples, with bootstrapped CIs (too small to be visible).
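
The samples are precomputed in jane_austen_tt_samples.csv (read below); a hypothetical sketch of how they could be generated, assuming tokens is a character vector of every token in the corpus (the name and sampling scheme are assumptions, not the project's actual pipeline):

# Hypothetical: draw n_tokens tokens at random, count distinct types, repeat 100 times
sample_type_counts <- function(tokens, n_tokens, n_samples = 100) {
  map_df(seq_len(n_samples), function(i) {
    tibble(sample_id = i,
           n_tokens  = n_tokens,
           n_types   = n_distinct(sample(tokens, n_tokens)))
  })
}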

JANEAUSTEN <- here("exploratory_analyses/03_systematic_sample/data/jane_austen_tt_samples.csv")

jane_austen_sample <- read_csv(JANEAUSTEN)

mean_types_by_token_length <- jane_austen_sample %>%
  group_by(n_tokens) %>%
  tidyboot_mean(column = n_types) %>%  # returns mean, ci_lower, ci_upper per group
  rename(n_types = mean)

exp_value_jane <- get_power_law_exponent(mean_types_by_token_length, "n_tokens","n_types")

ggplot(mean_types_by_token_length, aes(x = n_tokens, y = n_types, ymin = ci_lower, ymax = ci_upper)) +
  geom_pointrange() +
  ggtitle("Number of types in randomly sampled tokens \nfrom Jane Austen Corpus") +
  scale_x_log10(name = "N Tokens (log)") +
  scale_y_log10(name = "N Types (log)") +
  geom_smooth(method = "lm") +
  annotate("text", label = exp_value_jane[[3]], x = 100000, y = 500, color= "red", size = 5) +
  geom_abline(slope = 1, linetype = 2, intercept = exp_value_jane[[4]]) +
  annotation_logticks() +
  theme_classic()