AUTHOR_COUNTS <- here("exploratory_analyses/03_systematic_sample/data/subreddit_counts_scores.csv")
author_count_measures <- read_csv(AUTHOR_COUNTS, col_names = c("subreddit","author_n","word_H","word_mean_n","word_sd","word_total","score_mean", "score_sd","score_H","comments_n_long","comments_n_all", "posts_n_all","comments_posts_ratio")) %>%
filter(author_n > 100) %>%
filter(!(subreddit == "newsokur"))
COUNTS <- "/Volumes/wilbur_the_great/LANGSCALES_subreddit_sample/misc/all_word_counts.csv"
corpus_counts <- read_csv(COUNTS)
corpus_counts_with_freq <- corpus_counts %>%
group_by(subreddit) %>%
arrange(-corpus_2_counts) %>%
mutate(corpus_2_freq_rank = 1:n()) %>%
select(subreddit, corpus_1_counts, corpus_2_freq_rank) %>%
inner_join(author_count_measures %>% select(subreddit, author_n)) %>%
ungroup() %>%
mutate(subreddit = fct_reorder(subreddit, author_n))
zipf_params <- corpus_counts_with_freq %>%
nest(-subreddit) %>%
mutate(temp = map(data,
~get_power_law_exponent(.x, "corpus_2_freq_rank", "corpus_1_counts"))) %>%
select(-data) %>%
unnest() %>%
mutate(x = 10000,
y = 10000,
reference_slope = -1)
p <- ggplot(corpus_counts_with_freq , aes(x = corpus_2_freq_rank, y = corpus_1_counts)) +
geom_hex(binwidth = .08) +
facet_wrap(~subreddit) +
geom_smooth(method = "lm", size = .8, color = "red") +
scale_fill_viridis_c(trans = "log", breaks = c(10,100,1000, 10000), direction = -1) +
scale_y_log10(name = "N total words (log)",
labels = scales::trans_format("log10",
scales::math_format(10^.x))) +
scale_x_log10(name = "Rank Frequency (log)",
labels = scales::trans_format("log10",
scales::math_format(10^.x)))+
# annotation_logticks(short = unit(0.01, "cm"),
# mid = unit(0.02, "cm"),
# long = unit(0.03,"cm")) +
geom_abline(data = zipf_params, linetype = 2,
aes(intercept = intercept_value, slope= reference_slope)) +
geom_text(data = zipf_params, aes(label = slope_print, x= x, y = y),
color= "black", size = 2) +
theme_classic()
#pdf("/Users/mollylewis/Documents/research/Projects/1_in_progress/LANGSCALES/exploratory_analyses/03_systematic_sample/plots/zipfs_by_community.pdf", width = 12.5, height = 12)
p

#dev.off()
community_zipf <- zipf_params %>%
left_join(author_count_measures) %>%
select_if(is.numeric) %>%
select(-x, -y, -reference_slope)
ggplot(community_zipf, aes(x =author_n, y = slope_value)) +
geom_point(size = 4, alpha = .5) +
scale_x_log10(name = "N Authors (log)",
labels = scales::trans_format("log10",
scales::math_format(10^.x)))+
geom_hline(aes(yintercept = -1), linetype = 2) +
geom_smooth(method = "lm") +
ylab("Zipf Parameter") +
theme_classic()

cor.test(community_zipf$slope_value, log(community_zipf$author_n))
##
## Pearson's product-moment correlation
##
## data: community_zipf$slope_value and log(community_zipf$author_n)
## t = -8.7147, df = 62, p-value = 2.299e-12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.835426 -0.606863
## sample estimates:
## cor
## -0.741988