# ---- Load per-subreddit measures ----

# Word-type counts; joined onto the author measures below.
TYPE_PATH <- here("data/word_type_counts.csv")
type_counts <- read_csv(TYPE_PATH)

# Per-subreddit author / word / score / comment measures.
# Column names are supplied explicitly, so read_csv() treats the first row of
# the file as data rather than as a header.
AUTHOR_COUNTS <- here("exploratory_analyses/03_systematic_sample/data/subreddit_counts_scores.csv")
author_count_measures <- read_csv(
  AUTHOR_COUNTS,
  col_names = c(
    "subreddit", "author_n", "word_H", "word_mean_n", "word_sd",
    "word_total", "score_mean", "score_sd", "score_H",
    "comments_n_long", "comments_n_all", "posts_n_all",
    "comments_posts_ratio"
  )
) %>%
  # Keep only communities with more than 100 authors.
  filter(author_n > 100) %>%
  # Largest communities first, so rank 1 is the biggest subreddit.
  arrange(desc(author_n)) %>%
  # row_number() instead of 1:n(): 1:n() evaluates to c(1, 0) when no rows
  # survive the filter, which makes mutate() fail on a size mismatch.
  mutate(author_rank = row_number()) %>%
  # Ranks are assigned before this exclusion, so the remaining ranks keep the
  # gap left by the dropped subreddit.
  filter(subreddit != "newsokur") %>%
  # Natural join on every column name the two tables share (presumably just
  # `subreddit` -- TODO confirm and pass `by =` explicitly).
  left_join(type_counts)

# Slim lookup of community size and size rank per subreddit.
author_counts <- author_count_measures %>%
  select(subreddit, author_n, author_rank)
# ---- Total words vs. community size ----

# Subreddits that get a text label in the scatter plots.
label_data <- author_count_measures %>%
  filter(
    subreddit %in% c(
      "DisneyTravel", "movies", "neoliberal", "JustCause", "FinancialCareers"
    )
  )

# Power-law fit of total words on number of authors. Elements 3 and 4 of the
# result are used below as the annotation label and the reference-line
# intercept, respectively.
exp_value <- get_power_law_exponent(
  author_count_measures,
  "author_n",
  "word_total"
)

# PDF export is currently disabled; re-enable together with dev.off() below.
#pdf(here("/exploratory_analyses/03_systematic_sample/plots/nwords.pdf"),width = 6, height = 6)
author_count_measures %>%
  ggplot(aes(author_n, word_total)) +
  labs(title = "Number of Words vs. Community Size") +
  theme_classic() +
  # Both axes are log10 with tick labels rendered as powers of ten.
  scale_x_log10(
    name = "N authors (log)",
    labels = scales::trans_format("log10", scales::math_format(10^.x))
  ) +
  scale_y_log10(
    name = "N total words (log)",
    labels = scales::trans_format("log10", scales::math_format(10^.x))
  ) +
  # Layers below are kept in their original order so draw order is unchanged.
  geom_point(size = 4, alpha = 0.4) +
  geom_smooth(method = "lm") +
  geom_text_repel(
    aes(label = subreddit),
    data = label_data,
    min.segment.length = 0.2
  ) +
  annotation_logticks() +
  # Dashed slope-1 reference line (drawn in log-log space).
  geom_abline(intercept = exp_value[[4]], slope = 1, linetype = "dashed") +
  annotate(
    "text",
    x = 1e5, y = 1e5,
    label = exp_value[[3]],
    color = "red", size = 5
  )
#dev.off()
# ---- Word types vs. community size ----

# Power-law fit of the number of word types on number of authors. Elements 3
# and 4 of the result supply the annotation label and the reference-line
# intercept used in the plot.
exp_value2 <- get_power_law_exponent(
  author_count_measures,
  "author_n",
  "n_word_types"
)

# PDF export is currently disabled; re-enable together with dev.off() below.
#pdf(here("/exploratory_analyses/03_systematic_sample/plots/nword_types.pdf"),
# width = 6, height = 6)
author_count_measures %>%
  ggplot(aes(author_n, n_word_types)) +
  labs(title = "Number of Word Types vs. Community Size") +
  theme_classic() +
  # Log10 axes with power-of-ten tick labels; y breaks are fixed explicitly.
  scale_x_log10(
    name = "N authors (log)",
    labels = scales::trans_format("log10", scales::math_format(10^.x))
  ) +
  scale_y_log10(
    name = "N word types (log)",
    breaks = c(1e4, 1e5, 1e6),
    labels = scales::trans_format("log10", scales::math_format(10^.x))
  ) +
  # Layers below are kept in their original order so draw order is unchanged.
  geom_point(size = 4, alpha = 0.4) +
  geom_smooth(method = "lm") +
  geom_text_repel(
    aes(label = subreddit),
    data = label_data,
    min.segment.length = 0.2
  ) +
  # Dashed slope-1 reference line (drawn in log-log space).
  geom_abline(intercept = exp_value2[[4]], slope = 1, linetype = "dashed") +
  annotation_logticks() +
  annotate(
    "text",
    x = 1e5, y = 1e4,
    label = exp_value2[[3]],
    color = "red", size = 5
  )
#dev.off()
# ---- Token:type ratio vs. community size ----

# Tokens per type: total word count divided by the number of distinct word
# types in each subreddit.
author_count_measures_with_token_type <- author_count_measures %>%
  mutate(token_type_ratio = word_total / n_word_types)

# Power-law fit of the token:type ratio on number of authors. Elements 3 and 4
# of the result supply the annotation label and the reference-line intercept.
exp_value3 <- get_power_law_exponent(
  author_count_measures_with_token_type,
  "author_n",
  "token_type_ratio"
)

ggplot(
  author_count_measures_with_token_type,
  aes(x = author_n, y = token_type_ratio)
) +
  geom_point(size = 4, alpha = .4) +
  # Title fixed: the plotted quantity is tokens / types (matching the y-axis
  # label), not the type/token ratio the old title implied.
  ggtitle("Token-Type Ratio vs. Community Size") +
  scale_y_log10(name = "Token:Type (log)") +
  # Dashed slope-1 reference line (drawn in log-log space).
  geom_abline(slope = 1, linetype = 2, intercept = exp_value3[[4]]) +
  geom_smooth(method = "lm") +
  scale_x_log10(
    name = "N authors (log)",
    labels = scales::trans_format("log10", scales::math_format(10^.x))
  ) +
  annotate(
    "text",
    label = exp_value3[[3]],
    x = 100000, y = 30,
    color = "red", size = 5
  ) +
  annotation_logticks() +
  theme_classic()
Number of types as a function of the number of tokens. Each point is the mean of 100 samples, with bootstrapped CIs (not visible).
# ---- Types vs. tokens in the Jane Austen samples ----

JANEAUSTEN <- here("exploratory_analyses/03_systematic_sample/data/jane_austen_tt_samples.csv")
jane_austen_sample <- read_csv(JANEAUSTEN)

# Summarise n_types within each n_tokens group via tidyboot_mean(); its `mean`
# column is renamed to n_types, and its ci_lower / ci_upper columns feed the
# point ranges in the plot below.
mean_types_by_token_length <- jane_austen_sample %>%
  group_by(n_tokens) %>%
  tidyboot_mean(column = n_types) %>%
  rename(n_types = mean)

# Power-law fit of types on tokens. Elements 3 and 4 of the result supply the
# annotation label and the reference-line intercept.
exp_value_jane <- get_power_law_exponent(
  mean_types_by_token_length,
  "n_tokens",
  "n_types"
)

mean_types_by_token_length %>%
  ggplot(aes(n_tokens, n_types, ymin = ci_lower, ymax = ci_upper)) +
  labs(
    title = "Number of types in randomly sampled tokens \nfrom Jane Austen Corpus"
  ) +
  theme_classic() +
  scale_x_log10(name = "N Tokens (log)") +
  scale_y_log10(name = "N Types (log)") +
  # Layers below are kept in their original order so draw order is unchanged.
  geom_pointrange() +
  geom_smooth(method = "lm") +
  annotate(
    "text",
    x = 1e5, y = 500,
    label = exp_value_jane[[3]],
    color = "red", size = 5
  ) +
  # Dashed slope-1 reference line (drawn in log-log space).
  geom_abline(intercept = exp_value_jane[[4]], slope = 1, linetype = "dashed") +
  annotation_logticks()