Q: How unique is each author's language within a subreddit, relative to that subreddit's overall word distribution?

The data are one comment sampled per author per subreddit per week (minimum comment length = 100 words; minimum community size = 10 authors).
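A minimal sketch of that sampling step (the raw-comments frame comments and the week column are assumptions; the actual preprocessing lives upstream, and only n_post_words appears in the data read in below):

library(dplyr)

# hypothetical sketch of the sampling constraints described above
sampled_comments <- comments %>%
  filter(n_post_words >= 100) %>%            # min comment length
  group_by(subreddit, author, week) %>%
  slice_sample(n = 1) %>%                    # one comment per author/subreddit/week
  ungroup() %>%
  group_by(subreddit) %>%
  filter(n_distinct(author) >= 10) %>%       # min community size
  ungroup()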

Measures:

  • \(p(w)\): probability of word \(w\) in a subreddit (word frequency / total word count)
  • \(c\): the set of words in one comment
  • \(C\): the set of comments by one author
  • \(A\): the set of authors in one subreddit

Log comment uniqueness:

\[LCU_c = \frac{\sum_{w\in c} \log p(w)}{|c|}\]

Log author uniqueness:

\[LAU = \frac{\sum_{c\in C} LCU_c}{|C|}\]

Log subreddit uniqueness:

\[LSU = \frac{\sum_{a\in A} LAU_a}{|A|}\]
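As a toy illustration of how the three measures chain together (made-up word probabilities, not the real pipeline):

library(purrr)

# toy example: LCU -> LAU -> LSU with made-up probabilities
p <- c(the = 0.05, cat = 0.001, sat = 0.002)   # p(w) for one subreddit

# LCU: mean log-probability of the words in one comment
lcu <- function(comment, p) mean(log(p[comment]))

# LAU: mean LCU over one author's comments
comments_a1 <- list(c("the", "cat"), c("cat", "sat"))
lau_a1 <- mean(map_dbl(comments_a1, lcu, p = p))

# LSU: mean LAU over a subreddit's authors
authors <- list(a1 = comments_a1, a2 = list(c("the", "sat")))
lsu_toy <- mean(map_dbl(authors, ~ mean(map_dbl(.x, lcu, p = p))))

With these numbers, the first comment's LCU is \((\log 0.05 + \log 0.001)/2 \approx -4.95\); rarer words push the measure further below zero.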

# language variables
library(tidyverse)  # dplyr, purrr, readr, ggplot2, forcats
library(broom)      # tidy() for the t.test output below
library(knitr)      # kable()

LCU_PATH <- "/Volumes/wilbur_the_great/LANGSCALES_subreddit_sample/lau_by_author_comment/"

# read all per-comment files into one data frame
lcu_measures <- map_df(list.files(LCU_PATH, full.names = TRUE), read_csv)

lcu_measures_tidy <- lcu_measures %>%
  select(subreddit, everything()) %>%
  rename(lcu = lau) %>%              # the stored column is per-comment, so lau -> lcu
  filter(subreddit != "newsokur")

# LAU: mean LCU across each author's comments in a subreddit
lau_measures <- lcu_measures_tidy %>%
  group_by(subreddit, author) %>%
  summarize(lau = mean(lcu))

# descriptive variables per subreddit
comment_counts <- lcu_measures_tidy %>%
  count(subreddit, name = "comment_n")

word_counts <- lcu_measures_tidy %>%
  group_by(subreddit) %>%
  summarize(mean_word_n = mean(n_post_words),
            sum_word_n = sum(n_post_words))

actual_author_counts <- lcu_measures_tidy %>%
  distinct(subreddit, author) %>%
  count(subreddit, name = "actual_author_n")

LSU

# LSU: mean LAU across authors, joined with the descriptive covariates
all_subreddit_measures <- lau_measures %>%
  group_by(subreddit) %>%
  summarize(lsu = mean(lau)) %>%
  left_join(comment_counts) %>%
  left_join(word_counts) %>%
  left_join(actual_author_counts) %>%
  filter(actual_author_n >= 10) %>%   # enforce min community size
  left_join(author_time) %>%          # author longevity/lag measures (computed elsewhere)
  mutate_at(vars(actual_author_n, mean_word_n, sum_word_n, comment_n), log)  # log-transform counts

This is what the data look like:

head(all_subreddit_measures) %>%
  kable()
| subreddit | lsu       | comment_n | mean_word_n | sum_word_n | actual_author_n | author_longevity_mean | author_sd_mean | author_longevity_H | author_lag_sd | author_lag_H | author_lag_mean |
|:----------|----------:|----------:|------------:|-----------:|----------------:|----------------------:|---------------:|-------------------:|--------------:|-------------:|----------------:|
| Android   | -6.866991 | 11.212496 |    5.175184 |  16.387680 |       10.274603 |              34523046 |       22523937 |          11.578390 |       8414424 |    8.6967253 |         6137443 |
| apple     | -6.847983 | 11.221570 |    5.181000 |  16.402570 |       10.279764 |              33909395 |       22332285 |          11.564538 |       8378308 |    8.6949264 |         6027877 |
| areolas   | -5.837151 |  2.484907 |    5.101390 |   7.586296 |        2.302585 |               1067071 |        2714364 |           2.129607 |       8028396 |    0.0154264 |         5701585 |
| Art       | -7.020414 |  9.299358 |    5.204477 |  14.503835 |        9.087268 |              49815281 |       16390299 |          11.787406 |       7085627 |    6.2350882 |         3393298 |
| beauty    | -6.697476 |  8.390496 |    5.168514 |  13.559009 |        7.951207 |               8381088 |       13600250 |           7.518137 |       6539784 |    5.9157518 |         3843072 |
| beneater  | -6.349540 |  4.624973 |    5.200067 |   9.825039 |        4.158883 |               7037813 |       12437631 |           3.657413 |       3493614 |    2.2927837 |         2901036 |
all_subreddit_measures %>% 
  select(-subreddit)  %>%
  make_corr_plot()
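
make_corr_plot() is a project helper defined elsewhere; a plausible stand-in, assuming it wraps corrplot::corrplot() on a pairwise-complete correlation matrix:

library(corrplot)

# hypothetical sketch of the make_corr_plot() helper used above
make_corr_plot <- function(df) {
  corr_mat <- cor(df, use = "pairwise.complete.obs")
  corrplot(corr_mat, method = "circle", type = "upper", tl.cex = 0.7)
}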

ggplot(all_subreddit_measures, aes(x = actual_author_n, y = lsu)) +
  geom_point() +
  geom_smooth(method = "lm") +
  xlab("Log number of authors") +
  ylab("LSU") +
  theme_classic()

ggplot(all_subreddit_measures, aes(x = actual_author_n, y = lsu)) +
  geom_label(aes(label = subreddit)) +
  geom_smooth(method = "lm") +
  xlab("Log number of authors") +
  ylab("LSU") +
  theme_classic()

lm(lsu ~ actual_author_n + comment_n + sum_word_n, all_subreddit_measures) %>%
  summary()
## 
## Call:
## lm(formula = lsu ~ actual_author_n + comment_n + sum_word_n, 
##     data = all_subreddit_measures)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.42402 -0.10676  0.00715  0.09838  0.55607 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -3.62345    0.91987  -3.939 0.000195 ***
## actual_author_n -0.19833    0.07363  -2.694 0.008894 ** 
## comment_n        0.53622    0.22767   2.355 0.021401 *  
## sum_word_n      -0.44788    0.17676  -2.534 0.013591 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1899 on 68 degrees of freedom
## Multiple R-squared:  0.6801, Adjusted R-squared:  0.666 
## F-statistic:  48.2 on 3 and 68 DF,  p-value: < 2.2e-16

Controlling for mean author longevity:

lm(lsu ~ actual_author_n + comment_n + sum_word_n + author_longevity_mean, all_subreddit_measures) %>%
  summary()
## 
## Call:
## lm(formula = lsu ~ actual_author_n + comment_n + sum_word_n + 
##     author_longevity_mean, data = all_subreddit_measures)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.42577 -0.10565  0.00511  0.09861  0.55449 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -3.610e+00  9.274e-01  -3.893 0.000231 ***
## actual_author_n       -2.122e-01  8.978e-02  -2.364 0.020984 *  
## comment_n              5.550e-01  2.392e-01   2.320 0.023398 *  
## sum_word_n            -4.513e-01  1.784e-01  -2.530 0.013776 *  
## author_longevity_mean -9.375e-10  3.414e-09  -0.275 0.784470    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1912 on 67 degrees of freedom
## Multiple R-squared:  0.6805, Adjusted R-squared:  0.6614 
## F-statistic: 35.68 on 4 and 67 DF,  p-value: 5.97e-16

LCU

LCU as a function of the nth comment by a user within a subreddit:

mean_lcu <- lcu_measures_tidy %>%
  nest(data = -c(nth_comment, subreddit)) %>%
  mutate(num_comments = map_dbl(data, nrow)) %>%
  filter(num_comments > 5) %>%                 # too few comments to estimate a reliable CI
  mutate(test = map(data, ~ t.test(.x$lcu)),   # mean LCU with a 95% CI
         tidied = map(test, tidy)) %>%
  unnest(tidied) %>%
  select(subreddit, nth_comment, num_comments,
         estimate, conf.low, conf.high) %>%
  filter(nth_comment <= 50) %>%
  left_join(actual_author_counts) %>%
  mutate(subreddit = fct_reorder(subreddit, actual_author_n))  # order facets by community size

ggplot(mean_lcu, aes(x = nth_comment, y = estimate)) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_wrap(~subreddit) +
  theme_classic()