Meta-data

pairwise_topic_jsd_path <- here("/exploratory_analyses/02_reddit_scaled_up/data/jsd_over_community_time/")

over_community_time <- map_df(list.files(pairwise_topic_jsd_path, full.names = T), ~{read_csv(.x) %>% mutate(subreddit = .x)}) %>%
  mutate(subreddit = str_replace(subreddit, paste0(pairwise_topic_jsd_path, "/"), ""),
         subreddit = str_replace(subreddit, "_jsd_over_community_time.csv", ""))

META_DATA <-  here("/exploratory_analyses/02_reddit_scaled_up/data/misc/subreddit_meta_data.csv")
subreddit_meta <- read_csv(META_DATA) %>%
  mutate(subreddit = case_when(subreddit == "TheRedPill" ~ "redpill", TRUE ~ subreddit)) %>%
  select(-n_large_authors) %>%
  arrange(n_authors) %>%
  select(subreddit, n_authors, everything())

kable(subreddit_meta)
subreddit n_authors n_posts n_comments mean_comments_per_thread mean_comments_per_author comment_length
DarkEnlightenment 1853 3900 9981 3.389744 26.47143 386.3618
akron 1883 1345 10595 8.228808 26.52113 180.4486
fatFIRE 6808 1345 51546 32.349235 47.94366 317.3898
Cleveland 10722 7560 94589 12.060094 52.30244 169.8967
kansascity 16422 12803 270933 19.682020 46.00000 164.5620
3dshacks 17434 7544 109175 13.791681 106.95706 248.0620
Watchexchange 22438 30646 247975 6.887183 75.80405 210.5787
pittsburgh 24745 24372 454054 17.612502 53.10086 184.3571
OkCupid 31476 23168 1004554 31.930715 48.95179 128.3380
philadelphia 33133 26181 558152 19.596653 66.00199 149.5078
slowcooking 38202 7435 113802 14.912300 335.10526 139.2719
smallbusiness 41515 21283 208379 9.610897 203.50490 303.6743
chicago 43905 31798 694099 19.128225 78.12278 182.5836
nyc 52925 31877 667957 19.029121 99.29644 171.4625
redpill 56435 22822 622692 23.564065 122.95207 330.3788
Piracy 122522 61627 628614 9.108004 310.96954 148.8488
askscience 220151 193682 461762 2.987221 948.92672 505.3883
make_corr_plot(log(subreddit_meta[,-1]))

JSD over community time

post similarity over commuity time

over_community_time_with_meta <- over_community_time %>%
  left_join(subreddit_meta) %>%
  mutate(subreddit = fct_reorder(subreddit, n_authors))

ggplot(over_community_time_with_meta, 
       aes(x = group, y = mean_jsd )) +
  facet_wrap(~subreddit, scale = "free") +
  geom_point(aes(size = n), alpha = .3) +
  xlab("Date (in weeks)") +
  ylab("Mean JSD") +
  ggtitle("Post similarity over community time") +
  geom_smooth(color = "blue" ) +
  geom_smooth(method = "lm",color = "red" ) +
  theme_classic(base_size = 14)

slope and demographic variables

jsd_time_correlation <- over_community_time_with_meta %>%
  group_by(subreddit) %>%
  nest() %>%
  mutate(temp = map(data, ~tidy(cor.test(as.numeric(.$group), .$mean_jsd))))  %>%
  select(-data) %>%
  unnest() %>%
  left_join(subreddit_meta)

jsd_time_correlation %>%
  select(1,2,6,7, 10:15) %>%
  gather("measure", "value", -1:-4) %>%
  ggplot(aes(x = value, y = estimate))+
  geom_label(aes(label = subreddit)) +
  geom_smooth(method = "lm") +
  facet_wrap(~measure, scales = "free_x") +
  geom_hline(aes(yintercept = 0), linetype = 2) +
  geom_pointrange(aes(ymin = conf.low, ymax = conf.high), alpha =  .2) +
  theme_classic()

slope and demographic variables (log x)

jsd_time_correlation %>%
  select(1,2,6,7, 10:15) %>%
  gather("measure", "value", -1:-4) %>%
  ggplot(aes(x = value, y = estimate))+
  geom_label(aes(label = subreddit)) +
  geom_smooth(method = "lm") +
  scale_x_log10() +
  facet_wrap(~measure, scales = "free_x") +
  geom_hline(aes(yintercept = 0), linetype = 2) +
  geom_pointrange(aes(ymin = conf.low, ymax = conf.high), alpha =  .2) +
  theme_classic()

JSD over individual time

pink - mean distance of current post to all other posts in that time period in community;

green - distance to previous author post

PAIRWISE_TOPIC_JSD <- here("/exploratory_analyses/02_reddit_scaled_up/data/jsd_nth_post/")


nth_post_data <- map_df(list.files(PAIRWISE_TOPIC_JSD, full.names = T), ~{read_csv(.x) %>% mutate(subreddit = .x)}) %>%
  mutate(subreddit = str_replace(subreddit, paste0(PAIRWISE_TOPIC_JSD, "/"), ""),
         subreddit = str_replace(subreddit, "_jsd_nth_post.csv", ""))

over_individual_time <- nth_post_data  %>%
  select(author, nth_post, previous_author_JSD, current_community_JSD, subreddit) %>%
  gather("measure", "value", -nth_post, -subreddit, -author)

n = all

over_individual_time_ms <- over_individual_time %>%
  group_by(subreddit, nth_post, measure) %>%
  summarize(mean_JSD = mean(value, na.rm = T),
            n = n()) %>%
    left_join(subreddit_meta) %>%
  ungroup() %>%
  mutate(subreddit = fct_reorder(subreddit, n_authors))

over_individual_time_ms %>%
  ggplot(aes(x = nth_post, y = mean_JSD, group = measure, color = measure)) +
  geom_point() +
  geom_smooth() +
  ggtitle("Post similarity over indvidual time (with community reference)") +
  xlab("Nth post") +
   facet_wrap(~subreddit, scales = "free_y") +
  ylab("Mean JSD") +
  theme_classic()

n >= 50

over_individual_time_ms <- over_individual_time %>%
  group_by(subreddit, nth_post, measure) %>%
  summarize(mean_JSD = mean(value, na.rm = T),
            n = n()) %>%
  filter(n >= 50) %>%
  left_join(subreddit_meta) %>%
  ungroup() %>%
  mutate(subreddit = fct_reorder(subreddit, n_authors))


over_individual_time_ms %>%
  ggplot(aes(x = nth_post, y = mean_JSD, group = measure, color = measure)) +
  geom_point() +
  geom_smooth() +
  ggtitle("Post similarity over indvidual time (with community reference)") +
  xlab("Nth post") +
  ylab("Mean JSD") +
  facet_wrap(~subreddit, scales = "free_y") +
  theme_classic()

slope and population variables

1-25 posts

over_individual_time_ms <- over_individual_time %>%
  filter(nth_post <= 25) %>%
  group_by(subreddit, nth_post, measure) %>%
  summarize(mean_JSD = mean(value, na.rm = T),
            n = n()) %>%
  group_by(subreddit, measure) %>%
  nest() %>%
  mutate(temp = map(data, ~tidy(cor.test(.$nth_post, .$mean_JSD))))  %>%
  select(-data) %>%
  unnest() %>%
  left_join(subreddit_meta)


over_individual_time_ms %>%
  select(1,2,3, 7, 8, 11:16) %>%
  gather("measure2", "value", -1:-5) %>%
  ggplot(aes(x = value, y = estimate, group = measure, color = measure))+
  geom_label(aes(label = subreddit)) +
  geom_smooth(method = "lm") +
  facet_wrap(~measure2, scales = "free_x") +
  scale_x_log10()  +
  geom_hline(aes(yintercept = 0), linetype = 2) +
  geom_pointrange(aes(ymin = conf.low, ymax = conf.high), alpha =  .2) +
  theme_classic()

1-50 posts

over_individual_time_ms <- over_individual_time %>%
  filter(nth_post <= 50) %>%
  group_by(subreddit, nth_post, measure) %>%
  summarize(mean_JSD = mean(value, na.rm = T),
            n = n()) %>%
  group_by(subreddit, measure) %>%
  nest() %>%
  mutate(temp = map(data, ~tidy(cor.test(.$nth_post, .$mean_JSD))))  %>%
  select(-data) %>%
  unnest() %>%
  left_join(subreddit_meta)


over_individual_time_ms %>%
  select(1,2,3, 7, 8, 11:16) %>%
  gather("measure2", "value", -1:-5) %>%
  ggplot(aes(x = value, y = estimate, group = measure, color = measure))+
  geom_label(aes(label = subreddit)) +
  geom_smooth(method = "lm") +
  facet_wrap(~measure2, scales = "free_x") +
  scale_x_log10()  +
  geom_hline(aes(yintercept = 0), linetype = 2) +
  geom_pointrange(aes(ymin = conf.low, ymax = conf.high), alpha =  .2) +
  theme_classic()

1-100 posts

over_individual_time_ms <- over_individual_time %>%
  filter(nth_post <= 100) %>%
  group_by(subreddit, nth_post, measure) %>%
  summarize(mean_JSD = mean(value, na.rm = T),
            n = n()) %>%
  group_by(subreddit, measure) %>%
  nest() %>%
  mutate(temp = map(data, ~tidy(cor.test(.$nth_post, .$mean_JSD))))  %>%
  select(-data) %>%
  unnest() %>%
  left_join(subreddit_meta)


over_individual_time_ms %>%
  select(1,2,3, 7, 8, 11:16) %>%
  gather("measure2", "value", -1:-5) %>%
  ggplot(aes(x = value, y = estimate, group = measure, color = measure))+
  geom_label(aes(label = subreddit)) +
  geom_smooth(method = "lm") +
  facet_wrap(~measure2, scales = "free_x") +
  scale_x_log10()  +
  geom_hline(aes(yintercept = 0), linetype = 2) +
  geom_pointrange(aes(ymin = conf.low, ymax = conf.high), alpha =  .2) +
  theme_classic()