Estimating Zipf with the Clauset et al. method

# Location of the per-subreddit count/score summary file.
AUTHOR_COUNTS <- here(
  "exploratory_analyses/03_systematic_sample/data/subreddit_counts_scores.csv"
)

# Column names are supplied explicitly, so the first row of the file is read
# as data. The "newsokur" subreddit is excluded; the minimum-author-count
# filter is left disabled.
author_count_measures <- AUTHOR_COUNTS %>%
  read_csv(
    col_names = c(
      "subreddit",
      "author_n",
      "word_H",
      "word_mean_n",
      "word_sd",
      "word_total",
      "score_mean",
      "score_sd",
      "score_H",
      "comments_n_long",
      "comments_n_all",
      "posts_n_all",
      "comments_posts_ratio"
    )
  ) %>%
  # filter(author_n > 100) %>%
  filter(subreddit != "newsokur")

# Locations of the two CSV files holding the estimated Zipf parameters.
PARAMS1 <- here(
  "exploratory_analyses/03_systematic_sample/data/estimated_zipf_params.csv"
)
PARAMS2 <- here(
  "exploratory_analyses/03_systematic_sample/data/estimated_zipf_params2.csv"
)

# Read both files with the same explicit column names, stack them in order
# (PARAMS1 rows first), and exclude the "newsokur" subreddit.
params <- list(PARAMS1, PARAMS2) %>%
  lapply(read_csv, col_names = c("subreddit", "param", "xmin", "p_val")) %>%
  bind_rows() %>%
  filter(subreddit != "newsokur")
# distinct(subreddit, .keep_all = TRUE)

# Show the combined parameter estimates as an interactive table.
DT::datatable(params)

# Attach each subreddit's author count to its parameter estimates, then keep
# only the first row per subreddit. The join key is named explicitly so the
# join does not depend on (or print a message about) implicitly detected
# common columns.
params_with_author <- params %>%
  left_join(
    author_count_measures %>% select(subreddit, author_n),
    by = "subreddit"
  ) %>%
  distinct(subreddit, .keep_all = TRUE)

# Zipf parameter against number of authors (log10 x-axis), with points
# coloured by p_val and a linear fit over all points.
ggplot(params_with_author, aes(x = author_n, y = param)) +
  # Colour is mapped on the points only: the smoother cannot use a continuous
  # colour, so keeping it out of the global mapping avoids it being dropped
  # in the fitted layer.
  geom_point(aes(color = p_val), size = 4) + # alpha = .5) +
  scale_x_log10(
    name = "N Authors (log)",
    labels = scales::trans_format("log10", scales::math_format(10^.x))
  ) +
  scale_color_gradient(low = "grey", high = "red") +
  # Dashed reference line at a parameter value of 1. Passed as a fixed
  # parameter so a single line is drawn rather than one per data row.
  geom_hline(yintercept = 1, linetype = 2) +
  geom_smooth(method = "lm") +
  ylab("Zipf Parameter") +
  theme_classic()

# xmin against p_val: semi-transparent points with a linear fit.
params_with_author %>%
  ggplot(aes(x = p_val, y = xmin)) +
  theme_classic() +
  geom_point(alpha = 0.5, size = 4) +
  geom_smooth(method = "lm")

Estimating Zipf with the Clauset et al. method

Molly Lewis

2020-02-07