AUTHOR_COUNTS <- here("exploratory_analyses/04_systematic_sample_tidy/data/subreddit_meta.csv")
author_count_measures <- read_csv(AUTHOR_COUNTS, col_names = c("subreddit", "author_n", "word_H", "word_mean_n", "word_sd", "word_total", "score_mean", "score_sd", "score_H", "comments_n_all", "posts_n_all", "comments_posts_ratio")) %>%
filter(author_n > 100) %>%
arrange(-author_n) %>%
filter(subreddit != "newsokur")
COUNTS <- "/Volumes/wilbur_the_great/LANGSCALES_subreddit_sample/misc/all_word_counts2.csv"
corpus_counts <- read_csv(COUNTS)
corpus_counts_with_freq <- corpus_counts %>%
group_by(subreddit) %>%
arrange(-corpus_2_counts) %>%
mutate(corpus_2_freq_rank = 1:n()) %>%
select(subreddit, total_counts, corpus_1_counts, corpus_2_freq_rank) %>%
inner_join(author_count_measures %>% select(subreddit, author_n)) %>%
filter(author_n > 100) %>%
ungroup() %>%
mutate(subreddit = fct_reorder(subreddit, author_n))
This dataset includes all comments of length at least 50 words (after cleaning). Website links and usernames have been removed (though some usernames remain). Rank frequency and total word counts are calculated for each community from two separate corpora: each word's total count is split between the two corpora by a binomial draw (p = .5), and 1 is added to each half (to deal with zeros). Estimating rank from one corpus and frequency from the other avoids the correlated errors that bias Zipf fits when both are estimated from the same sample. A sketch of the splitting procedure follows the table. Here's what the data look like:
kable(corpus_counts %>% slice(1:10))
| subreddit | word | total_counts | corpus_1_counts | corpus_2_counts |
|---|---|---|---|---|
| AdrianaChechik | the | 513 | 257 | 258 |
| AdrianaChechik | to | 421 | 218 | 205 |
| AdrianaChechik | and | 414 | 202 | 214 |
| AdrianaChechik | this | 317 | 151 | 168 |
| AdrianaChechik | i | 262 | 140 | 124 |
| AdrianaChechik | a | 246 | 120 | 128 |
| AdrianaChechik | of | 230 | 109 | 123 |
| AdrianaChechik | you | 209 | 98 | 113 |
| AdrianaChechik | was | 199 | 112 | 89 |
| AdrianaChechik | not | 195 | 106 | 91 |
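To make the splitting step concrete, here is a minimal sketch (an assumed reconstruction, not the original preprocessing script; the helper name split_counts is ours). It produces counts consistent with the table above, where corpus_1_counts + corpus_2_counts = total_counts + 2:

# Assumed reconstruction of the corpus-splitting step (dplyr loaded, as
# elsewhere in this document). Each word's total count is divided
# between two corpora by a binomial draw, and 1 is added to each half
# so that neither corpus has zero counts.
split_counts <- function(counts_df) {
  counts_df %>%
    mutate(draw = rbinom(n(), size = total_counts, prob = .5),
           corpus_1_counts = draw + 1,
           corpus_2_counts = (total_counts - draw) + 1) %>%
    select(-draw)
}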
zipf_params <- corpus_counts_with_freq %>%
nest(-subreddit) %>%
mutate(temp = map(data,
~get_power_law_exponent(.x, "corpus_2_freq_rank", "corpus_1_counts"))) %>%
select(-data) %>%
unnest() %>%
mutate(x = 10000,
y = 10000,
reference_slope = -1)
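get_power_law_exponent() is defined elsewhere in this repository. Judging from the columns used downstream (slope_value, intercept_value, and the slope_print label drawn on log10 scales), a minimal stand-in would be an OLS fit in log-log space; the sketch below is an assumption about that implementation, not the repository's actual code.

# Assumed stand-in for get_power_law_exponent: fit log10(y) ~ log10(x)
# by OLS and return the slope, intercept, and a printable slope label.
get_power_law_exponent <- function(df, x_var, y_var) {
  fit <- lm(log10(df[[y_var]]) ~ log10(df[[x_var]]))
  data.frame(intercept_value = unname(coef(fit)[1]),
             slope_value = unname(coef(fit)[2]),
             slope_print = paste0("slope = ", round(coef(fit)[2], 2)))
}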
ggplot(corpus_counts_with_freq, aes(x = corpus_2_freq_rank, y = corpus_1_counts)) +
geom_hex(binwidth = .08) +
facet_wrap(~subreddit) +
scale_fill_viridis_c(trans = "log", breaks = c(10,100,1000, 10000), direction = -1) +
scale_y_log10(name = "N total words (log)",
labels = scales::trans_format("log10",
scales::math_format(10^.x))) +
scale_x_log10(name = "Rank Frequency (log)",
labels = scales::trans_format("log10",
scales::math_format(10^.x)))+
geom_text(data = zipf_params, aes(label = slope_print, x = x, y = y),
color = "black", size = 2) +
geom_abline(data = zipf_params,
aes(intercept = intercept_value, slope = slope_value), size = .5, color = "red") +
geom_abline(data = zipf_params, linetype = 2,
aes(intercept = intercept_value, slope = reference_slope)) +
theme_classic()
corpus_counts_with_freq %>%
ggplot(aes(x = corpus_2_freq_rank, y = corpus_1_counts)) +
facet_wrap(~subreddit) +
geom_line(color = "grey") +
scale_y_log10(name = "N total words (log)",
labels = scales::trans_format("log10",
scales::math_format(10^.x))) +
scale_x_log10(name = "Rank Frequency (log)",
labels = scales::trans_format("log10",
scales::math_format(10^.x)))+
geom_abline(data = zipf_params,
aes(intercept = intercept_value, slope = slope_value), size = .5, color = "red") +
geom_abline(data = zipf_params, linetype = 2,
aes(intercept = intercept_value, slope = reference_slope)) +
geom_text(data = zipf_params, aes(label = slope_print, x = x, y = y),
color = "black", size = 2) +
theme_classic()
community_zipf <- zipf_params %>%
left_join(author_count_measures) %>%
select_if(is.numeric) %>%
select(-x, -y, -reference_slope)
ggplot(community_zipf, aes(x = author_n, y = slope_value)) +
geom_point(size = 4, alpha = .5) +
scale_x_log10(name = "N Authors (log)",
labels = scales::trans_format("log10",
scales::math_format(10^.x)))+
geom_hline(aes(yintercept = -1), linetype = 2) +
geom_smooth(method = "lm") +
ylab("Zipf Parameter") +
theme_classic()
cor.test(community_zipf$slope_value, log(community_zipf$author_n))
##
## Pearson's product-moment correlation
##
## data: community_zipf$slope_value and log(community_zipf$author_n)
## t = -11.484, df = 63, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.8883065 -0.7240031
## sample estimates:
## cor
## -0.8226378
word_tokens_type <- corpus_counts_with_freq %>%
group_by(subreddit) %>%
summarize(n_tokens = sum(total_counts),
n_types = n()) %>%
inner_join(author_count_measures)
exp_value <- get_power_law_exponent(word_tokens_type, "author_n","n_types")
word_tokens_type %>%
ggplot(aes(x = author_n, y = n_types)) +
geom_point(size = 4, alpha = .4) +
geom_abline(slope = exp_value$slope_value, intercept = exp_value$intercept_value, color = "red") +
geom_abline(slope = 1, linetype = 2, intercept = exp_value$intercept_value) +
ggtitle("Number of Word Types vs. Community Size") +
scale_y_log10(name = "N words types (log)",
labels = scales::trans_format("log10", scales::math_format(10^.x))) +
scale_x_log10(name = "N authors (log)",
labels = scales::trans_format("log10", scales::math_format(10^.x)))+
annotation_logticks() +
annotate("text", label = exp_value$slope_print,
x = 50000, y = 20000, color= "red", size = 5) +
theme_classic(base_size = 12)
exp_value <- get_power_law_exponent(word_tokens_type, "author_n","n_tokens")
word_tokens_type %>%
ggplot(aes(x = author_n, y = n_tokens)) +
geom_point(size = 4, alpha = .4) +
geom_abline(slope = exp_value$slope_value, intercept = exp_value$intercept_value, color = "red") +
geom_abline(slope = 1, linetype = 2, intercept = exp_value$intercept_value) +
ggtitle("Number of Word Tokens vs. Community Size") +
scale_y_log10(name = "N tokens (log)",
labels = scales::trans_format("log10", scales::math_format(10^.x))) +
scale_x_log10(name = "N authors (log)",
labels = scales::trans_format("log10", scales::math_format(10^.x)))+
annotation_logticks() +
annotate("text", label = exp_value$slope_print,
x = 50000, y = 50000, color= "red", size = 5) +
theme_classic(base_size = 12)
word_tokens_type %>%
left_join(zipf_params %>% select(subreddit, slope_value)) %>%
select(slope_value, author_n, n_tokens, n_types) %>%
mutate_all(~log(abs(.x))) %>%
make_corr_plot()
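make_corr_plot() is likewise defined elsewhere in the repository; a plausible stand-in (an assumption, not the actual helper) displays the pairwise Pearson correlation matrix of the already log-transformed measures:

# Assumed stand-in for make_corr_plot: pairwise Pearson correlations of
# all columns, drawn as an annotated upper-triangle matrix.
make_corr_plot <- function(df) {
  corrplot::corrplot(cor(df, use = "pairwise.complete.obs"),
                     method = "number", type = "upper")
}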
zipf_params_200 <- corpus_counts_with_freq %>%
filter(corpus_2_freq_rank >= 200) %>%
nest(-subreddit) %>%
mutate(temp = map(data,
~get_power_law_exponent(.x, "corpus_2_freq_rank", "corpus_1_counts"))) %>%
select(-data) %>%
unnest() %>%
mutate(x = 10000,
y = 10000,
reference_slope = -1)
corpus_counts_with_freq %>%
filter(corpus_2_freq_rank >= 200) %>%
ggplot(aes(x = corpus_2_freq_rank, y = corpus_1_counts)) +
geom_hex(binwidth = .08) +
facet_wrap(~subreddit) +
scale_fill_viridis_c(trans = "log", breaks = c(10,100,1000, 10000), direction = -1) +
scale_y_log10(name = "N total words (log)",
labels = scales::trans_format("log10",
scales::math_format(10^.x))) +
scale_x_log10(name = "Rank Frequency (log)",
labels = scales::trans_format("log10",
scales::math_format(10^.x)))+
geom_text(data = zipf_params_200, aes(label = slope_print, x = x, y = y),
color = "black", size = 2) +
geom_abline(data = zipf_params_200,
aes(intercept = intercept_value, slope = slope_value), size = .5, color = "red") +
geom_abline(data = zipf_params_200, linetype = 2,
aes(intercept = intercept_value, slope = reference_slope)) +
theme_classic()
corpus_counts_with_freq %>%
filter(corpus_2_freq_rank >= 200) %>%
ggplot(aes(x = corpus_2_freq_rank, y = corpus_1_counts)) +
facet_wrap(~subreddit) +
geom_line(color = "grey") +
scale_y_log10(name = "N total words (log)",
labels = scales::trans_format("log10",
scales::math_format(10^.x))) +
scale_x_log10(name = "Rank Frequency (log)",
labels = scales::trans_format("log10",
scales::math_format(10^.x)))+
geom_abline(data = zipf_params_200,
aes(intercept = intercept_value, slope = slope_value), size = .5, color = "red") +
geom_abline(data = zipf_params_200, linetype = 2,
aes(intercept = intercept_value, slope = reference_slope)) +
geom_text(data = zipf_params_200, aes(label = slope_print, x = x, y = y),
color = "black", size = 2) +
theme_classic()
community_zipf_200 <- zipf_params_200 %>%
left_join(author_count_measures) %>%
select_if(is.numeric) %>%
select(-x, -y, -reference_slope)
ggplot(community_zipf_200, aes(x = author_n, y = slope_value)) +
geom_point(size = 4, alpha = .5) +
scale_x_log10(name = "N Authors (log)",
labels = scales::trans_format("log10",
scales::math_format(10^.x)))+
geom_hline(aes(yintercept = -1), linetype = 2) +
geom_smooth(method = "lm") +
ylab("Zipf Parameter") +
theme_classic()
cor.test(community_zipf_200$slope_value,
log(community_zipf_200$author_n))
##
## Pearson's product-moment correlation
##
## data: community_zipf_200$slope_value and log(community_zipf_200$author_n)
## t = -14.99, df = 63, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.9276947 -0.8156762
## sample estimates:
## cor
## -0.8837581
word_tokens_type_200 <- corpus_counts_with_freq %>%
filter(corpus_2_freq_rank >= 200) %>%
group_by(subreddit) %>%
summarize(n_tokens = sum(total_counts),
n_types = n()) %>%
inner_join(author_count_measures)
exp_value <- get_power_law_exponent(word_tokens_type_200, "author_n","n_types")
word_tokens_type_200 %>%
ggplot(aes(x = author_n, y = n_types)) +
geom_point(size = 4, alpha = .4) +
geom_abline(slope = exp_value$slope_value, intercept = exp_value$intercept_value, color = "red") +
geom_abline(slope = 1, linetype = 2, intercept =exp_value$intercept_value) +
ggtitle("Number of Word Types vs. Community Size") +
scale_y_log10(name = "N words types (log)",
labels = scales::trans_format("log10", scales::math_format(10^.x))) +
scale_x_log10(name = "N authors (log)",
labels = scales::trans_format("log10", scales::math_format(10^.x)))+
annotation_logticks() +
annotate("text", label = exp_value$slope_print,
x = 50000, y = 20000, color= "red", size = 5) +
theme_classic(base_size = 12)
exp_value <- get_power_law_exponent(word_tokens_type_200, "author_n","n_tokens")
word_tokens_type_200 %>%
ggplot(aes(x = author_n, y = n_tokens)) +
geom_point(size = 4, alpha = .4) +
geom_abline(slope = exp_value$slope_value, intercept = exp_value$intercept_value, color = "red") +
geom_abline(slope = 1, linetype = 2, intercept = exp_value$intercept_value) +
ggtitle("Number of Word Tokens vs. Community Size") +
scale_y_log10(name = "N tokens (log)",
labels = scales::trans_format("log10", scales::math_format(10^.x))) +
scale_x_log10(name = "N authors (log)",
labels = scales::trans_format("log10", scales::math_format(10^.x)))+
annotation_logticks() +
annotate("text", label = exp_value$slope_print,
x = 50000, y = 50000, color= "red", size = 5) +
theme_classic(base_size = 12)
word_tokens_type_200 %>%
left_join(zipf_params_200 %>% select(subreddit, slope_value)) %>%
select(slope_value, author_n, n_tokens, n_types) %>%
mutate_all(~log(abs(.x))) %>%
make_corr_plot()
grady_words <- data.frame(word = GradyAugmented) # English word list from qdapDictionaries
corpus_counts_with_freq_grady <- corpus_counts %>%
inner_join(grady_words) %>% # keep only words in the Grady list
group_by(subreddit) %>%
arrange(-corpus_2_counts) %>%
mutate(corpus_2_freq_rank = 1:n()) %>%
select(subreddit, total_counts, corpus_1_counts, corpus_2_freq_rank) %>%
inner_join(author_count_measures %>% select(subreddit, author_n)) %>%
ungroup() %>%
mutate(subreddit = fct_reorder(subreddit, author_n))
zipf_params_grady <- corpus_counts_with_freq_grady %>%
nest(-subreddit) %>%
mutate(temp = map(data,
~get_power_law_exponent(.x, "corpus_2_freq_rank", "corpus_1_counts"))) %>%
select(-data) %>%
unnest() %>%
mutate(x = 10000,
y = 10000,
reference_slope = -1)
corpus_counts_with_freq_grady %>%
ggplot(aes(x = corpus_2_freq_rank, y = corpus_1_counts)) +
geom_hex(binwidth = .08) +
facet_wrap(~subreddit) +
scale_fill_viridis_c(trans = "log", breaks = c(10,100,1000, 10000), direction = -1) +
scale_y_log10(name = "N total words (log)",
labels = scales::trans_format("log10",
scales::math_format(10^.x))) +
scale_x_log10(name = "Rank Frequency (log)",
labels = scales::trans_format("log10",
scales::math_format(10^.x)))+
geom_text(data = zipf_params_grady, aes(label = slope_print, x = x, y = y),
color = "black", size = 2) +
geom_abline(data = zipf_params_grady,
aes(intercept = intercept_value, slope = slope_value), size = .5, color = "red") +
geom_abline(data = zipf_params_grady, linetype = 2,
aes(intercept = intercept_value, slope = reference_slope)) +
theme_classic()
corpus_counts_with_freq_grady %>%
ggplot(aes(x = corpus_2_freq_rank, y = corpus_1_counts)) +
facet_wrap(~subreddit) +
geom_line(color = "grey") +
scale_y_log10(name = "N total words (log)",
labels = scales::trans_format("log10",
scales::math_format(10^.x))) +
scale_x_log10(name = "Rank Frequency (log)",
labels = scales::trans_format("log10",
scales::math_format(10^.x)))+
geom_abline(data = zipf_params_grady,
aes(intercept = intercept_value, slope = slope_value), size = .5, color = "red") +
geom_abline(data = zipf_params_grady, linetype = 2,
aes(intercept = intercept_value, slope = reference_slope)) +
geom_text(data = zipf_params_grady, aes(label = slope_print, x = x, y = y),
color = "black", size = 2) +
theme_classic()
community_zipf_grady <- zipf_params_grady %>%
left_join(author_count_measures) %>%
select_if(is.numeric) %>%
select(-x, -y, -reference_slope)
ggplot(community_zipf_grady, aes(x = author_n, y = slope_value)) +
geom_point(size = 4, alpha = .5) +
scale_x_log10(name = "N Authors (log)",
labels = scales::trans_format("log10",
scales::math_format(10^.x)))+
geom_hline(aes(yintercept = -1), linetype = 2) +
geom_smooth(method = "lm") +
ylab("Zipf Parameter") +
theme_classic()
cor.test(community_zipf_grady$slope_value,
log(community_zipf_grady$author_n))
##
## Pearson's product-moment correlation
##
## data: community_zipf_grady$slope_value and log(community_zipf_grady$author_n)
## t = -18.088, df = 63, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.9479052 -0.8650050
## sample estimates:
## cor
## -0.9157126
word_tokens_type_grady <- corpus_counts_with_freq_grady %>%
group_by(subreddit) %>%
summarize(n_tokens = sum(total_counts),
n_types = n()) %>%
inner_join(author_count_measures)
exp_value <- get_power_law_exponent(word_tokens_type_grady, "author_n","n_types")
ggplot(word_tokens_type_grady, aes(x = author_n, y = n_types)) +
geom_point(size = 4, alpha = .4) +
geom_abline(slope = exp_value$slope_value, intercept = exp_value$intercept_value, color = "red") +
geom_abline(slope = 1, linetype = 2, intercept = exp_value$intercept_value) +
ggtitle("Number of Word Types vs. Community Size") +
scale_y_log10(name = "N words types (log)",
labels = scales::trans_format("log10", scales::math_format(10^.x))) +
scale_x_log10(name = "N authors (log)",
labels = scales::trans_format("log10", scales::math_format(10^.x)))+
annotation_logticks() +
annotate("text", label = exp_value$slope_print,
x = 50000, y = 20000, color= "red", size = 5) +
theme_classic(base_size = 12)
exp_value <- get_power_law_exponent(word_tokens_type_grady, "author_n","n_tokens")
ggplot(word_tokens_type_grady, aes(x = author_n, y = n_tokens)) +
geom_point(size = 4, alpha = .4) +
geom_abline(slope = exp_value$slope_value, intercept = exp_value$intercept_value, color = "red") +
geom_abline(slope = 1, linetype = 2, intercept = exp_value$intercept_value) +
ggtitle("Number of Word Tokens vs. Community Size") +
scale_y_log10(name = "N tokens (log)",
labels = scales::trans_format("log10", scales::math_format(10^.x))) +
scale_x_log10(name = "N authors (log)",
labels = scales::trans_format("log10", scales::math_format(10^.x)))+
annotation_logticks() +
annotate("text", label = exp_value$slope_print,
x = 50000, y = 50000, color= "red", size = 5) +
theme_classic(base_size = 12)
word_tokens_type_grady %>%
left_join(zipf_params_grady %>% select(subreddit, slope_value)) %>%
select(slope_value, author_n, n_tokens, n_types) %>%
mutate_all(~log(abs(.x))) %>%
make_corr_plot()