Q: How unique are authors in a subreddit in their comments with respect to that subreddit?
This is one comment per author per subreddit per week (min length = 100 words; min community size = 10).
Measure:
Log comment uniqueness:
\[LCU = \frac{\sum_{w\in c} \log p(w)}{|c|}\]
Log author uniqueness: \[LAU = \frac{\sum_{c\in C}(LCU)}{|C|}\]
Log subreddit uniqueness: \[LSU = \frac{\sum_{a\in A}(LAU)}{|A|}\]
# language variables
LCU_PATH <- "/Volumes/wilbur_the_great/LANGSCALES_subreddit_sample/lau_by_author_comment/"
lcu_measures <- map_df(list.files(LCU_PATH, full.names = T), read_csv)
lcu_measures_tidy <- lcu_measures %>%
select(subreddit, everything()) %>%
rename(lcu = lau) %>%
filter(!(subreddit == "newsokur"))
lau_measures <- lcu_measures_tidy %>%
group_by(subreddit, author) %>%
summarize(lau = mean(lcu))
# demo variables
comment_counts <- lcu_measures_tidy %>%
count(subreddit, name = "comment_n")
word_counts <- lcu_measures_tidy %>%
group_by(subreddit) %>%
summarize(mean_word_n = mean(n_post_words),
sum_word_n = sum(n_post_words))
actual_author_counts <- lcu_measures_tidy %>%
distinct(subreddit, author) %>%
count(subreddit, name = "actual_author_n")
all_subreddit_measures <- lau_measures %>%
group_by(subreddit) %>%
summarize(lsu = mean(lau)) %>%
left_join(comment_counts) %>%
left_join(word_counts) %>%
left_join(actual_author_counts) %>%
filter(actual_author_n >= 10) %>%
left_join(author_time) %>%
mutate_at(vars(actual_author_n, mean_word_n, sum_word_n, comment_n), log)
This is what the data look like:
head(all_subreddit_measures) %>%
kable()
| subreddit | lsu | comment_n | mean_word_n | sum_word_n | actual_author_n | author_longevity_mean | author_sd_mean | author_longevity_H | author_lag_sd | author_lag_H | author_lag_mean |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Android | -6.866991 | 11.212496 | 5.175184 | 16.387680 | 10.274603 | 34523046 | 22523937 | 11.578390 | 8414424 | 8.6967253 | 6137443 |
| apple | -6.847983 | 11.221570 | 5.181000 | 16.402570 | 10.279764 | 33909395 | 22332285 | 11.564538 | 8378308 | 8.6949264 | 6027877 |
| areolas | -5.837151 | 2.484907 | 5.101390 | 7.586296 | 2.302585 | 1067071 | 2714364 | 2.129607 | 8028396 | 0.0154264 | 5701585 |
| Art | -7.020414 | 9.299358 | 5.204477 | 14.503835 | 9.087268 | 49815281 | 16390299 | 11.787406 | 7085627 | 6.2350882 | 3393298 |
| beauty | -6.697476 | 8.390496 | 5.168514 | 13.559009 | 7.951207 | 8381088 | 13600250 | 7.518137 | 6539784 | 5.9157518 | 3843072 |
| beneater | -6.349540 | 4.624973 | 5.200067 | 9.825039 | 4.158883 | 7037813 | 12437631 | 3.657413 | 3493614 | 2.2927837 | 2901036 |
all_subreddit_measures %>%
select(-subreddit) %>%
make_corr_plot()
ggplot(all_subreddit_measures, aes(x = actual_author_n, y = lsu)) +
geom_point() +
ylab("LSU") +
geom_smooth(method= "lm") +
xlab("Log number of authors") +
theme_classic()
ggplot(all_subreddit_measures, aes(x = actual_author_n, y = lsu)) +
geom_label(aes(label = subreddit)) +
ylab("LSU") +
geom_smooth(method= "lm") +
xlab("Log number of authors") +
theme_classic()
lm(lsu ~ actual_author_n + comment_n + sum_word_n, all_subreddit_measures) %>%
summary()
##
## Call:
## lm(formula = lsu ~ actual_author_n + comment_n + sum_word_n,
## data = all_subreddit_measures)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.42402 -0.10676 0.00715 0.09838 0.55607
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.62345 0.91987 -3.939 0.000195 ***
## actual_author_n -0.19833 0.07363 -2.694 0.008894 **
## comment_n 0.53622 0.22767 2.355 0.021401 *
## sum_word_n -0.44788 0.17676 -2.534 0.013591 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1899 on 68 degrees of freedom
## Multiple R-squared: 0.6801, Adjusted R-squared: 0.666
## F-statistic: 48.2 on 3 and 68 DF, p-value: < 2.2e-16
Controling for longevity:
lm(lsu ~ actual_author_n + comment_n + sum_word_n + author_longevity_mean, all_subreddit_measures) %>%
summary()
##
## Call:
## lm(formula = lsu ~ actual_author_n + comment_n + sum_word_n +
## author_longevity_mean, data = all_subreddit_measures)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.42577 -0.10565 0.00511 0.09861 0.55449
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.610e+00 9.274e-01 -3.893 0.000231 ***
## actual_author_n -2.122e-01 8.978e-02 -2.364 0.020984 *
## comment_n 5.550e-01 2.392e-01 2.320 0.023398 *
## sum_word_n -4.513e-01 1.784e-01 -2.530 0.013776 *
## author_longevity_mean -9.375e-10 3.414e-09 -0.275 0.784470
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1912 on 67 degrees of freedom
## Multiple R-squared: 0.6805, Adjusted R-squared: 0.6614
## F-statistic: 35.68 on 4 and 67 DF, p-value: 5.97e-16
LCU as a function of nth post by a users.
mean_lcu <- lcu_measures_tidy %>%
nest(-nth_comment, -subreddit) %>%
mutate(num_comments = map_dbl(data, nrow)) %>%
filter(num_comments > 5) %>% # can't calculate
mutate(test = map(data, ~t.test(.x$lcu)),
tidied = map(test, tidy)) %>%
unnest(tidied, drop = T) %>%
select(subreddit, nth_comment, num_comments,
estimate, conf.low, conf.high) %>%
filter(nth_comment <= 50) %>%
left_join(actual_author_counts) %>%
mutate(subreddit = fct_reorder(subreddit, actual_author_n))
ggplot(mean_lcu, aes(x = nth_comment, y = estimate)) +
geom_point() +
facet_wrap(~subreddit) +
geom_smooth(method = "lm")+
theme_classic()