Zipf Exponent

# Per-subreddit summary measures (author counts, word statistics, score
# statistics, comment/post counts), read from a project-relative path.
AUTHOR_COUNTS <- here(
  "exploratory_analyses/03_systematic_sample/data/subreddit_counts_scores.csv"
)

# NOTE(review): because `col_names` is supplied, readr treats the first line
# of the file as data -- confirm the CSV has no header row.
author_count_measures <- read_csv(
  AUTHOR_COUNTS,
  col_names = c(
    "subreddit", "author_n",
    "word_H", "word_mean_n", "word_sd", "word_total",
    "score_mean", "score_sd", "score_H",
    "comments_n_long", "comments_n_all",
    "posts_n_all", "comments_posts_ratio"
  )
) %>%
  # Keep communities with more than 100 authors and drop "newsokur".
  filter(author_n > 100, subreddit != "newsokur")

# Word-count table for the sampled subreddits. The code below reads the
# columns `subreddit`, `corpus_1_counts` and `corpus_2_counts` from it.
# NOTE(review): this is an absolute, machine-specific path under /Volumes;
# the script only runs where that location is available.
COUNTS <- "/Volumes/wilbur_the_great/LANGSCALES_subreddit_sample/misc/all_word_counts.csv"
corpus_counts <- read_csv(file = COUNTS)

# Rank every word within its subreddit by its corpus-2 count (rank 1 = largest
# count) and pair that rank with the word's corpus-1 count, then attach the
# community size (`author_n`).
# NOTE(review): presumably corpus 1 and corpus 2 are independent samples, so
# that rank and count are not estimated from the same data -- confirm.
corpus_counts_with_freq <- corpus_counts %>%
  group_by(subreddit) %>%
  # arrange() sorts the whole table (it ignores groups by default), which
  # leaves each subreddit's rows in descending count order; the grouped
  # row_number() below is therefore the within-subreddit frequency rank.
  arrange(desc(corpus_2_counts)) %>%
  mutate(corpus_2_freq_rank = row_number()) %>%
  # Grouping is only needed for the ranking step.
  ungroup() %>%
  select(subreddit, corpus_1_counts, corpus_2_freq_rank) %>%
  # Explicit join key. The inner join also drops subreddits that were
  # filtered out of author_count_measures.
  inner_join(
    author_count_measures %>% select(subreddit, author_n),
    by = "subreddit"
  ) %>%
  # Order the subreddit factor by community size so facets are sorted by it.
  mutate(subreddit = fct_reorder(subreddit, author_n))

# Fit a power law to each subreddit's rank/count data and collect one row of
# fit parameters per subreddit.
# NOTE(review): get_power_law_exponent() is defined outside this section; the
# code further down reads `slope_value`, `intercept_value` and `slope_print`
# from its result, so it is expected to return a data frame with those columns.
zipf_params <- corpus_counts_with_freq %>%
  # Named nest()/unnest() arguments are the tidyr >= 1.0.0 call form; the
  # unnamed forms `nest(-subreddit)` / `unnest()` are deprecated and warn.
  nest(data = -subreddit) %>%
  mutate(temp = map(
    data,
    ~ get_power_law_exponent(.x, "corpus_2_freq_rank", "corpus_1_counts")
  )) %>%
  select(-data) %>%
  unnest(cols = c(temp)) %>%
  # Plot helpers: (x, y) is where the slope label is drawn in each facet and
  # `reference_slope` is the slope of the dashed reference line.
  mutate(
    x = 10000,
    y = 10000,
    reference_slope = -1
  )

# Rank-frequency plot, one facet per subreddit, on log10-log10 axes:
#   * hexbin density of the (rank, count) points, fill on a log scale;
#   * red line: linear fit drawn by geom_smooth();
#   * dashed line: reference line with slope `reference_slope` and intercept
#     `intercept_value` from zipf_params;
#   * text: `slope_print` from zipf_params, placed at (x, y).
p <- ggplot(
  data = corpus_counts_with_freq,
  mapping = aes(x = corpus_2_freq_rank, y = corpus_1_counts)
) +
  # Layers (drawing order matters, so it is kept as-is).
  geom_hex(binwidth = 0.08) +
  geom_smooth(method = "lm", size = 0.8, color = "red") +
  # annotation_logticks(short = unit(0.01, "cm"),
  #                     mid = unit(0.02, "cm"),
  #                     long = unit(0.03, "cm")) +
  geom_abline(
    data = zipf_params,
    mapping = aes(intercept = intercept_value, slope = reference_slope),
    linetype = 2
  ) +
  geom_text(
    data = zipf_params,
    mapping = aes(label = slope_print, x = x, y = y),
    color = "black",
    size = 2
  ) +
  # Facets and scales.
  facet_wrap(~subreddit) +
  scale_x_log10(
    name = "Rank Frequency (log)",
    labels = scales::trans_format("log10", scales::math_format(10^.x))
  ) +
  scale_y_log10(
    name = "N total words (log)",
    labels = scales::trans_format("log10", scales::math_format(10^.x))
  ) +
  scale_fill_viridis_c(
    trans = "log",
    breaks = c(10, 100, 1000, 10000),
    direction = -1
  ) +
  theme_classic()

# Uncomment the pdf()/dev.off() pair to write the figure to disk instead.
#pdf("/Users/mollylewis/Documents/research/Projects/1_in_progress/LANGSCALES/exploratory_analyses/03_systematic_sample/plots/zipfs_by_community.pdf", width = 12.5, height = 12)
p

#dev.off()

# One row per subreddit: the fitted Zipf parameters next to the community
# measures, keeping numeric columns only.
community_zipf <- zipf_params %>%
  # Join on the subreddit name explicitly rather than on whatever columns the
  # two tables happen to share.
  left_join(author_count_measures, by = "subreddit") %>%
  select_if(is.numeric) %>%
  # x, y and reference_slope are plotting helpers, not measures.
  select(-x, -y, -reference_slope)

# Fitted slope per community against number of authors (log10 x axis), with a
# linear fit and a dashed horizontal reference line at -1.
ggplot(
  data = community_zipf,
  mapping = aes(x = author_n, y = slope_value)
) +
  geom_point(size = 4, alpha = 0.5) +
  geom_hline(mapping = aes(yintercept = -1), linetype = 2) +
  geom_smooth(method = "lm") +
  scale_x_log10(
    name = "N Authors (log)",
    labels = scales::trans_format("log10", scales::math_format(10^.x))
  ) +
  ylab("Zipf Parameter") +
  theme_classic()

# Correlation test between the fitted slope and the natural log of the
# number of authors, across communities.
cor.test(
  x = community_zipf$slope_value,
  y = log(community_zipf$author_n)
)

## 
##  Pearson's product-moment correlation
## 
## data:  community_zipf$slope_value and log(community_zipf$author_n)
## t = -8.7147, df = 62, p-value = 2.299e-12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.835426 -0.606863
## sample estimates:
##       cor 
## -0.741988

Zipf Exponent

Molly Lewis

2020-02-05