Q: How unique is each author's language within a subreddit, relative to that subreddit's overall word distribution?

The data are one comment sampled per author per subreddit per week (minimum comment length = 100 words; minimum community size = 10 authors).
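A minimal sketch of that sampling step (the raw-comments frame comments and the week column are assumptions; the actual preprocessing lives upstream, and only n_post_words appears in the data read in below):

library(dplyr)

# hypothetical sketch of the sampling constraints described above
sampled_comments <- comments %>%
  filter(n_post_words >= 100) %>%            # min comment length
  group_by(subreddit, author, week) %>%
  slice_sample(n = 1) %>%                    # one comment per author/subreddit/week
  ungroup() %>%
  group_by(subreddit) %>%
  filter(n_distinct(author) >= 10) %>%       # min community size
  ungroup()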

Measures:

  • \(p(w)\): probability of word \(w\) in a subreddit (word frequency / total word count)
  • \(c\): the set of words in one comment
  • \(C\): the set of comments by one author
  • \(A\): the set of authors in one subreddit

Log comment uniqueness:

\[LCU_c = \frac{\sum_{w\in c} \log p(w)}{|c|}\]

Log author uniqueness:

\[LAU = \frac{\sum_{c\in C} LCU_c}{|C|}\]

Log subreddit uniqueness:

\[LSU = \frac{\sum_{a\in A} LAU_a}{|A|}\]
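As a toy illustration of how the three measures chain together (made-up word probabilities, not the real pipeline):

library(purrr)

# toy example: LCU -> LAU -> LSU with made-up probabilities
p <- c(the = 0.05, cat = 0.001, sat = 0.002)   # p(w) for one subreddit

# LCU: mean log-probability of the words in one comment
lcu <- function(comment, p) mean(log(p[comment]))

# LAU: mean LCU over one author's comments
comments_a1 <- list(c("the", "cat"), c("cat", "sat"))
lau_a1 <- mean(map_dbl(comments_a1, lcu, p = p))

# LSU: mean LAU over a subreddit's authors
authors <- list(a1 = comments_a1, a2 = list(c("the", "sat")))
lsu_toy <- mean(map_dbl(authors, ~ mean(map_dbl(.x, lcu, p = p))))

With these numbers, the first comment's LCU is \((\log 0.05 + \log 0.001)/2 \approx -4.95\); rarer words push the measure further below zero.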

# language variables
library(tidyverse)  # dplyr, purrr, readr, ggplot2, forcats
library(broom)      # tidy() for the t.test output below
library(knitr)      # kable()

LCU_PATH <- "/Volumes/wilbur_the_great/LANGSCALES_subreddit_sample/lau_by_author_comment/"

# read all per-comment files into one data frame
lcu_measures <- map_df(list.files(LCU_PATH, full.names = TRUE), read_csv)

lcu_measures_tidy <- lcu_measures %>%
  select(subreddit, everything()) %>%
  rename(lcu = lau) %>%              # the stored column is per-comment, so lau -> lcu
  filter(subreddit != "newsokur")

# LAU: mean LCU across each author's comments in a subreddit
lau_measures <- lcu_measures_tidy %>%
  group_by(subreddit, author) %>%
  summarize(lau = mean(lcu))

# descriptive variables per subreddit
comment_counts <- lcu_measures_tidy %>%
  count(subreddit, name = "comment_n")

word_counts <- lcu_measures_tidy %>%
  group_by(subreddit) %>%
  summarize(mean_word_n = mean(n_post_words),
            sum_word_n = sum(n_post_words))

actual_author_counts <- lcu_measures_tidy %>%
  distinct(subreddit, author) %>%
  count(subreddit, name = "actual_author_n")

LSU

# LSU: mean LAU across authors, joined with the descriptive covariates
all_subreddit_measures <- lau_measures %>%
  group_by(subreddit) %>%
  summarize(lsu = mean(lau)) %>%
  left_join(comment_counts) %>%
  left_join(word_counts) %>%
  left_join(actual_author_counts) %>%
  filter(actual_author_n >= 10) %>%   # enforce min community size
  left_join(author_time) %>%          # author longevity/lag measures (computed elsewhere)
  mutate_at(vars(actual_author_n, mean_word_n, sum_word_n, comment_n), log)  # log-transform counts

This is what the data look like:

head(all_subreddit_measures) %>%
  kable()
| subreddit | lsu       | comment_n | mean_word_n | sum_word_n | actual_author_n | author_longevity_mean | author_sd_mean | author_longevity_H | author_lag_sd | author_lag_H | author_lag_mean |
|:----------|----------:|----------:|------------:|-----------:|----------------:|----------------------:|---------------:|-------------------:|--------------:|-------------:|----------------:|
| Android   | -6.866991 | 11.212496 |    5.175184 |  16.387680 |       10.274603 |              34523046 |       22523937 |          11.578390 |       8414424 |    8.6967253 |         6137443 |
| apple     | -6.847983 | 11.221570 |    5.181000 |  16.402570 |       10.279764 |              33909395 |       22332285 |          11.564538 |       8378308 |    8.6949264 |         6027877 |
| areolas   | -5.837151 |  2.484907 |    5.101390 |   7.586296 |        2.302585 |               1067071 |        2714364 |           2.129607 |       8028396 |    0.0154264 |         5701585 |
| Art       | -7.020414 |  9.299358 |    5.204477 |  14.503835 |        9.087268 |              49815281 |       16390299 |          11.787406 |       7085627 |    6.2350882 |         3393298 |
| beauty    | -6.697476 |  8.390496 |    5.168514 |  13.559009 |        7.951207 |               8381088 |       13600250 |           7.518137 |       6539784 |    5.9157518 |         3843072 |
| beneater  | -6.349540 |  4.624973 |    5.200067 |   9.825039 |        4.158883 |               7037813 |       12437631 |           3.657413 |       3493614 |    2.2927837 |         2901036 |
all_subreddit_measures %>% 
  select(-subreddit)  %>%
  make_corr_plot()
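
make_corr_plot() is a project helper defined elsewhere; a plausible stand-in, assuming it wraps corrplot::corrplot() on a pairwise-complete correlation matrix:

library(corrplot)

# hypothetical sketch of the make_corr_plot() helper used above
make_corr_plot <- function(df) {
  corr_mat <- cor(df, use = "pairwise.complete.obs")
  corrplot(corr_mat, method = "circle", type = "upper", tl.cex = 0.7)
}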

ggplot(all_subreddit_measures, aes(x = actual_author_n, y = lsu)) +
  geom_point() +
  geom_smooth(method = "lm") +
  xlab("Log number of authors") +
  ylab("LSU") +
  theme_classic()

ggplot(all_subreddit_measures, aes(x = actual_author_n, y = lsu)) +
  geom_label(aes(label = subreddit)) +
  geom_smooth(method = "lm") +
  xlab("Log number of authors") +
  ylab("LSU") +
  theme_classic()

lm(lsu ~ actual_author_n + comment_n + sum_word_n, all_subreddit_measures) %>%
  summary()
## 
## Call:
## lm(formula = lsu ~ actual_author_n + comment_n + sum_word_n, 
##     data = all_subreddit_measures)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.42402 -0.10676  0.00715  0.09838  0.55607 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -3.62345    0.91987  -3.939 0.000195 ***
## actual_author_n -0.19833    0.07363  -2.694 0.008894 ** 
## comment_n        0.53622    0.22767   2.355 0.021401 *  
## sum_word_n      -0.44788    0.17676  -2.534 0.013591 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1899 on 68 degrees of freedom
## Multiple R-squared:  0.6801, Adjusted R-squared:  0.666 
## F-statistic:  48.2 on 3 and 68 DF,  p-value: < 2.2e-16

Controlling for mean author longevity:

lm(lsu ~ actual_author_n + comment_n + sum_word_n + author_longevity_mean, all_subreddit_measures) %>%
  summary()
## 
## Call:
## lm(formula = lsu ~ actual_author_n + comment_n + sum_word_n + 
##     author_longevity_mean, data = all_subreddit_measures)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.42577 -0.10565  0.00511  0.09861  0.55449 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -3.610e+00  9.274e-01  -3.893 0.000231 ***
## actual_author_n       -2.122e-01  8.978e-02  -2.364 0.020984 *  
## comment_n              5.550e-01  2.392e-01   2.320 0.023398 *  
## sum_word_n            -4.513e-01  1.784e-01  -2.530 0.013776 *  
## author_longevity_mean -9.375e-10  3.414e-09  -0.275 0.784470    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1912 on 67 degrees of freedom
## Multiple R-squared:  0.6805, Adjusted R-squared:  0.6614 
## F-statistic: 35.68 on 4 and 67 DF,  p-value: 5.97e-16

LCU

LCU as a function of the nth comment by a user within a subreddit:

mean_lcu <- lcu_measures_tidy %>%
  nest(data = -c(nth_comment, subreddit)) %>%
  mutate(num_comments = map_dbl(data, nrow)) %>%
  filter(num_comments > 5) %>%                 # too few comments to estimate a reliable CI
  mutate(test = map(data, ~ t.test(.x$lcu)),   # mean LCU with a 95% CI
         tidied = map(test, tidy)) %>%
  unnest(tidied) %>%
  select(subreddit, nth_comment, num_comments,
         estimate, conf.low, conf.high) %>%
  filter(nth_comment <= 50) %>%
  left_join(actual_author_counts) %>%
  mutate(subreddit = fct_reorder(subreddit, actual_author_n))  # order facets by community size

ggplot(mean_lcu, aes(x = nth_comment, y = estimate)) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_wrap(~subreddit) +
  theme_classic()