# Per-subreddit author/word/score count measures.
# Column names are supplied explicitly rather than read from the file.
AUTHOR_COUNTS <- here(
  "exploratory_analyses/03_systematic_sample/data/subreddit_counts_scores.csv"
)
author_count_measures <- read_csv(
  AUTHOR_COUNTS,
  col_names = c(
    "subreddit", "author_n",
    "word_H", "word_mean_n", "word_sd", "word_total",
    "score_mean", "score_sd", "score_H",
    "comments_n_long", "comments_n_all",
    "posts_n_all", "comments_posts_ratio"
  )
) %>%
  # Disabled size threshold, kept for reference: filter(author_n > 100)
  # Exclude the "newsokur" subreddit from all downstream steps.
  filter(subreddit != "newsokur")
# Estimated Zipf parameters are stored across two CSV files with the same
# four columns; read each one with explicit column names and stack them.
PARAMS1 <- here(
  "exploratory_analyses/03_systematic_sample/data/estimated_zipf_params.csv"
)
PARAMS2 <- here(
  "exploratory_analyses/03_systematic_sample/data/estimated_zipf_params2.csv"
)
params <- c(PARAMS1, PARAMS2) %>%
  lapply(read_csv, col_names = c("subreddit", "param", "xmin", "p_val")) %>%
  bind_rows() %>%
  # Exclude the "newsokur" subreddit, matching the author-count table.
  filter(subreddit != "newsokur")
# Disabled de-duplication, kept for reference:
# distinct(subreddit, .keep_all = T)

# Interactive table of the combined parameter estimates.
DT::datatable(params)
# Attach each subreddit's author count to its estimated Zipf parameters.
# The join key is named explicitly so the join does not depend on whichever
# columns the two tables happen to share, and no "Joining, by = ..." message
# is emitted.
params_with_author <- params %>%
  left_join(
    author_count_measures %>% select(subreddit, author_n),
    by = "subreddit"
  ) %>%
  # Keep only the first row for each subreddit, retaining all columns.
  # TRUE is spelled out because `T` is an ordinary, reassignable variable.
  distinct(subreddit, .keep_all = TRUE)
# Zipf parameter against number of authors (log10 x-axis); points are
# coloured by p_val on a grey-to-red gradient, with a linear fit overlaid.
ggplot(params_with_author, aes(x = author_n, y = param, color = p_val)) +
  geom_point(size = 4) + # alpha = .5 currently disabled
  scale_x_log10(
    name = "N Authors (log)",
    labels = scales::trans_format("log10", scales::math_format(10^.x))
  ) +
  scale_color_gradient(low = "grey", high = "red") +
  # Dashed reference line at param = 1. The intercept is a constant, so it is
  # passed as a parameter rather than through aes(); mapping it via aes()
  # makes ggplot2 draw one identical line for every row of the data.
  geom_hline(yintercept = 1, linetype = 2) +
  geom_smooth(method = "lm") +
  ylab("Zipf Parameter") +
  theme_classic()

# Scatter of xmin against p_val with a linear fit overlaid.
ggplot(
  data = params_with_author,
  mapping = aes(x = p_val, y = xmin)
) +
  geom_point(size = 4, alpha = 0.5) + # semi-transparent to show overlap
  geom_smooth(method = "lm") +
  theme_classic()
