Sampled subreddits

META_PATH <- here("exploratory_analyses/03_systematic_sample/data/subreddit_meta_data.csv")
meta_data <- read_csv(META_PATH, col_names = c("subreddit", "author_H","author_n","word_H","word_mean_n","word_sd","word_total","score_mean","score_sd","score_H","comments_n_long","comments_n_all","posts_n_all","comments_posts_ratio","author_longevity_mean","author_longevity_sd","author_longevity_H","author_lag_sd","author_lag_H", "author_lag_mean"))


meta_data2 <- meta_data %>%
  mutate(word_H = word_H/log(comments_n_long),
         score_H = score_H/log(comments_n_long),
         author_longevity_H = author_longevity_H/log(author_n),
         author_lag_H = author_lag_H/log(author_n)) %>%
  drop_na() 

DT::datatable(meta_data2 %>% arrange(-author_n))

We ended up with 83.

Descriptive subreddit stats

meta_data2 %>%
  gather("measure", "value", -subreddit) %>%
  ggplot(aes(x = value)) +
  geom_histogram() +
  facet_wrap(~measure, scales = "free_x") +
  scale_x_log10() +
  theme_classic()

Pairwise correlations

meta_data2 %>%
  mutate_if(is.numeric, log) %>%
  mutate_if(is.numeric, list(~na_if(., -Inf))) %>%
  make_corr_plot()

JSD at individual level

PAIRWISE_TOPIC_JSD <- "/Volumes/wilbur_the_great/LANGSCALES_subreddit_sample/jsd_nth_post/"


nth_post_data <- map_df(list.files(PAIRWISE_TOPIC_JSD, full.names = T), ~{read_csv(.x) %>% mutate(subreddit = .x)}) %>%
  mutate(subreddit = str_replace(subreddit, paste0(PAIRWISE_TOPIC_JSD, "/"), ""),
         subreddit = str_replace(subreddit, "_jsd_nth_post.csv", ""))

over_individual_time <- nth_post_data  %>%
  select(author, nth_post, previous_author_JSD, current_community_JSD, subreddit) %>%
  #mutate(previous_to_current  = previous_author_JSD + current_community_JSD) %>%
  gather("measure", "value", -nth_post, -subreddit, -author) 
over_individual_time_ms <- over_individual_time %>%
  group_by(subreddit, nth_post, measure) %>%
  summarize(mean_JSD = mean(value, na.rm = T),
            n = n()) %>%
  left_join(meta_data2) %>%
  ungroup() %>%
  mutate(subreddit = fct_reorder(subreddit, author_n))

over_individual_time_ms %>%
  ggplot(aes(x = nth_post, y = mean_JSD, group = measure, color = measure)) +
  geom_point() +
  geom_smooth() +
  ggtitle("Post similarity over indvidual time (with community reference)") +
  xlab("Nth post") +
   facet_wrap(~subreddit, scales = "free_y",  nrow = 12) +
  ylab("Mean JSD") +
  theme_classic() +
  theme(legend.position = "bottom")

good_subreddits <- over_individual_time %>%
    group_by(subreddit, nth_post, measure) %>%
    summarize(mean_JSD = mean(value, na.rm = T),
              n = n()) %>%
    filter(!is.na(mean_JSD)) %>% ungroup() %>%
    count(subreddit)  %>%
    filter(n>25) %>%
    pull(subreddit)

over_individual_time_ms <- over_individual_time %>%
  filter(subreddit %in% good_subreddits) %>%
  group_by(subreddit, nth_post, measure) %>%
  summarize(mean_JSD = mean(value, na.rm = T),
            n = n()) %>%
  group_by(subreddit, measure) %>%
  nest() %>%
  mutate(temp = map(data, ~ tidy(cor.test(.$nth_post, .$mean_JSD))))  %>%
  select(-data) %>%
  unnest() %>%
  left_join(meta_data2)

over_individual_time_ms %>%
  select(-word_H, -word_mean_n, -word_sd, -author_longevity_sd, -author_lag_sd, -comments_n_all) %>%
  select(1,2,3,  11:23) %>%
  gather("measure2", "value", -1:-3) %>%
  ggplot(aes(x = value, y = estimate, group = measure, color = measure))+
  geom_point(alpha = .3) +
#  geom_label(aes(label = subreddit)) +
  geom_smooth(method = "lm") +
  facet_wrap(~measure2, scales = "free") +
  scale_x_log10()  +
  geom_hline(aes(yintercept = 0), linetype = 2) +
  #geom_pointrange(aes(ymin = conf.low, ymax = conf.high), alpha =  .2) +
  theme_classic()

over_individual_time_ms_community <- over_individual_time_ms %>%
  filter(measure == "current_community_JSD")

over_individual_time_ms_individual <- over_individual_time_ms %>%
  filter(measure == "previous_author_JSD")

over_individual_time_ms_p_to_c <- over_individual_time_ms %>%
  filter(measure == "previous_to_current")
meta_data2 %>%
  select(author_n, word_total, author_H, author_longevity_mean, score_mean, score_sd, comments_n_long) %>%
  mutate_if(is.numeric, log) %>%
  mutate_if(is.numeric, list(~na_if(., -Inf))) %>%
  make_corr_plot()

Predicting author_i_t1 to author_i_t2 post distance over “time” (posts); larger estimate values mean that posts grow more disimiliar to each other over time. Number of words in a subreddit (word_total) is a strong predictor of this measure. Controling for number of words, author entropy and mean score, there’s some evidence to suggest that in larger communities, authors change (with respect to themselves) to a greater degree. The effect goes away if you don’t control for score.

lm(estimate ~ 
     log(author_n) # how many commenters are there
        + author_H  # how much is each author contributing to the subreddit 

     + log(word_total)  # how many words in the comments, total?
     + log(score_mean)  # what's the mean score of comments
     #+ log(score_sd) # what's the variance in score of comments
   ,data = over_individual_time_ms_individual) %>%
  summary() %>%
  tidy() %>%
  kable()
term estimate std.error statistic p.value
(Intercept) 2.5679107 0.7741765 3.316958 0.0016024
log(author_n) 0.1669674 0.0784787 2.127550 0.0377913
author_H -0.4243229 0.3869417 -1.096607 0.2775067
log(word_total) -0.2646483 0.0749937 -3.528941 0.0008418
log(score_mean) -0.1175929 0.0719180 -1.635097 0.1076381

Some evidence that other “social” variables might matter

lm(estimate ~ 
     log(author_n) # how many commenters are there
        + author_H  # how much is each author contributing to the subreddit 

   + author_longevity_H
   + author_lag_H
   + log(author_longevity_mean)
    + log(author_lag_mean)
     + log(word_total)  # how many words in the comments, total?
     + log(score_mean)  # what's the mean score of comments
     #+ log(score_sd) # what's the variance in score of comments
   ,data = over_individual_time_ms_individual) %>%
  summary() %>%
  tidy() %>%
  kable()
term estimate std.error statistic p.value
(Intercept) 2.4619625 2.3500607 1.0476165 0.2996604
log(author_n) 0.3425259 0.1050078 3.2619089 0.0019573
author_H -0.6938361 0.5561010 -1.2476800 0.2177378
author_longevity_H 2.0375119 0.6243736 3.2632896 0.0019494
author_lag_H -0.5116690 1.1321277 -0.4519535 0.6531821
log(author_longevity_mean) -0.5451585 0.1711666 -3.1849585 0.0024473
log(author_lag_mean) 0.5188171 0.1663040 3.1196909 0.0029513
log(word_total) -0.3680464 0.1220633 -3.0152107 0.0039654
log(score_mean) -0.2478159 0.0974541 -2.5428994 0.0140088

N = 53 subreddits

JSD over community time

This is preliminary - missing data for about 15 communities.

PAIRWISE_TOPIC_JSD <- "/Volumes/wilbur_the_great/LANGSCALES_subreddit_sample/jsd_over_community_time/"


over_community_time <- map_df(list.files(PAIRWISE_TOPIC_JSD, full.names = T), ~{read_csv(.x) %>% mutate(subreddit = .x)}) %>%
  mutate(subreddit = str_replace(subreddit, paste0(PAIRWISE_TOPIC_JSD, "/"), ""),
         subreddit = str_replace(subreddit, "_jsd_over_community_time.csv", "")) %>%
  filter(n > 1)

over_community_time_with_meta <- over_community_time %>%
  left_join(meta_data2) %>%
  mutate(subreddit = fct_reorder(subreddit, author_n))

good_subreddits2 <- over_community_time_with_meta %>%
    count(subreddit) %>%
    filter(n >2) %>%
    pull(subreddit)

jsd_time_correlation <- over_community_time_with_meta %>%
  #filter(!is.na(mean_jsd)) %>%
  filter(subreddit %in% good_subreddits2) %>%
  group_by(subreddit) %>%
  nest() %>%
  mutate(temp = map(data, ~tidy(cor.test(as.numeric(.$group), .$mean_jsd))))  %>%
  select(-data) %>%
  unnest()  %>%
  left_join(meta_data2)

jsd_time_correlation %>%
    select(-word_H, -word_mean_n, -word_sd, -author_longevity_sd, -author_lag_sd, -comments_n_all) %>%
  select(1,2,10:22) %>%
  gather("measure", "value", -1:-2) %>%
  ggplot(aes(x = value, y = estimate))+
  #geom_label(aes(label = subreddit)) +
  geom_smooth(method = "lm") +
  facet_wrap(~measure, scales = "free_x") +
  scale_x_log10() +
  geom_hline(aes(yintercept = 0), linetype = 2) +
  geom_point() +
 # geom_pointrange(aes(ymin = conf.low, ymax = conf.high), alpha =  .2) +
  theme_classic()

lm(estimate ~ 
     log(author_n) # how many commenters are there
        + author_H  # how much is each author contributing to the subreddit 

   + author_longevity_H
   + author_lag_H
   + log(author_longevity_mean)
    + log(author_lag_mean)
     + log(word_total)  # how many words in the comments, total?
     + log(score_mean)  # what's the mean score of comments
     #+ log(score_sd) # what's the variance in score of comments
   ,data = jsd_time_correlation) %>%
  summary() %>%
  tidy() %>%
  kable()
term estimate std.error statistic p.value
(Intercept) 0.3935388 1.1231446 0.3503901 0.7277179
log(author_n) -0.1406038 0.0748984 -1.8772609 0.0671220
author_H 1.1904764 0.4389935 2.7118316 0.0095096
author_longevity_H -0.8193841 0.3894437 -2.1039863 0.0411290
author_lag_H 0.8755055 0.4855346 1.8031786 0.0782108
log(author_longevity_mean) 0.0701314 0.0660619 1.0616018 0.2942089
log(author_lag_mean) -0.1781902 0.0731044 -2.4374748 0.0189008
log(word_total) 0.1160264 0.0740887 1.5660464 0.1245023
log(score_mean) -0.0771389 0.0568828 -1.3561014 0.1819858