pairwise_topic_jsd_path <- here("/exploratory_analyses/02_reddit_scaled_up/data/jsd_over_community_time/")
over_community_time <- map_df(list.files(pairwise_topic_jsd_path, full.names = T), ~{read_csv(.x) %>% mutate(subreddit = .x)}) %>%
mutate(subreddit = str_replace(subreddit, paste0(pairwise_topic_jsd_path, "/"), ""),
subreddit = str_replace(subreddit, "_jsd_over_community_time.csv", ""))
META_DATA <- here("/exploratory_analyses/02_reddit_scaled_up/data/misc/subreddit_meta_data.csv")
subreddit_meta <- read_csv(META_DATA) %>%
mutate(subreddit = case_when(subreddit == "TheRedPill" ~ "redpill", TRUE ~ subreddit)) %>%
select(-n_large_authors) %>%
arrange(n_authors) %>%
select(subreddit, n_authors, everything())
kable(subreddit_meta)
| subreddit | n_authors | n_posts | n_comments | mean_comments_per_thread | mean_comments_per_author | comment_length |
|---|---|---|---|---|---|---|
| DarkEnlightenment | 1853 | 3900 | 9981 | 3.389744 | 26.47143 | 386.3618 |
| akron | 1883 | 1345 | 10595 | 8.228808 | 26.52113 | 180.4486 |
| fatFIRE | 6808 | 1345 | 51546 | 32.349235 | 47.94366 | 317.3898 |
| Cleveland | 10722 | 7560 | 94589 | 12.060094 | 52.30244 | 169.8967 |
| kansascity | 16422 | 12803 | 270933 | 19.682020 | 46.00000 | 164.5620 |
| 3dshacks | 17434 | 7544 | 109175 | 13.791681 | 106.95706 | 248.0620 |
| Watchexchange | 22438 | 30646 | 247975 | 6.887183 | 75.80405 | 210.5787 |
| pittsburgh | 24745 | 24372 | 454054 | 17.612502 | 53.10086 | 184.3571 |
| OkCupid | 31476 | 23168 | 1004554 | 31.930715 | 48.95179 | 128.3380 |
| philadelphia | 33133 | 26181 | 558152 | 19.596653 | 66.00199 | 149.5078 |
| slowcooking | 38202 | 7435 | 113802 | 14.912300 | 335.10526 | 139.2719 |
| smallbusiness | 41515 | 21283 | 208379 | 9.610897 | 203.50490 | 303.6743 |
| chicago | 43905 | 31798 | 694099 | 19.128225 | 78.12278 | 182.5836 |
| nyc | 52925 | 31877 | 667957 | 19.029121 | 99.29644 | 171.4625 |
| redpill | 56435 | 22822 | 622692 | 23.564065 | 122.95207 | 330.3788 |
| Piracy | 122522 | 61627 | 628614 | 9.108004 | 310.96954 | 148.8488 |
| askscience | 220151 | 193682 | 461762 | 2.987221 | 948.92672 | 505.3883 |
make_corr_plot(log(subreddit_meta[,-1]))
over_community_time_with_meta <- over_community_time %>%
left_join(subreddit_meta) %>%
mutate(subreddit = fct_reorder(subreddit, n_authors))
ggplot(over_community_time_with_meta,
aes(x = group, y = mean_jsd )) +
facet_wrap(~subreddit, scale = "free") +
geom_point(aes(size = n), alpha = .3) +
xlab("Date (in weeks)") +
ylab("Mean JSD") +
ggtitle("Post similarity over community time") +
geom_smooth(color = "blue" ) +
geom_smooth(method = "lm",color = "red" ) +
theme_classic(base_size = 14)
jsd_time_correlation <- over_community_time_with_meta %>%
group_by(subreddit) %>%
nest() %>%
mutate(temp = map(data, ~tidy(cor.test(as.numeric(.$group), .$mean_jsd)))) %>%
select(-data) %>%
unnest() %>%
left_join(subreddit_meta)
jsd_time_correlation %>%
select(1,2,6,7, 10:15) %>%
gather("measure", "value", -1:-4) %>%
ggplot(aes(x = value, y = estimate))+
geom_label(aes(label = subreddit)) +
geom_smooth(method = "lm") +
facet_wrap(~measure, scales = "free_x") +
geom_hline(aes(yintercept = 0), linetype = 2) +
geom_pointrange(aes(ymin = conf.low, ymax = conf.high), alpha = .2) +
theme_classic()
jsd_time_correlation %>%
select(1,2,6,7, 10:15) %>%
gather("measure", "value", -1:-4) %>%
ggplot(aes(x = value, y = estimate))+
geom_label(aes(label = subreddit)) +
geom_smooth(method = "lm") +
scale_x_log10() +
facet_wrap(~measure, scales = "free_x") +
geom_hline(aes(yintercept = 0), linetype = 2) +
geom_pointrange(aes(ymin = conf.low, ymax = conf.high), alpha = .2) +
theme_classic()
pink - mean distance of current post to all other posts in that time period in community;
green - distance to previous author post
PAIRWISE_TOPIC_JSD <- here("/exploratory_analyses/02_reddit_scaled_up/data/jsd_nth_post/")
nth_post_data <- map_df(list.files(PAIRWISE_TOPIC_JSD, full.names = T), ~{read_csv(.x) %>% mutate(subreddit = .x)}) %>%
mutate(subreddit = str_replace(subreddit, paste0(PAIRWISE_TOPIC_JSD, "/"), ""),
subreddit = str_replace(subreddit, "_jsd_nth_post.csv", ""))
over_individual_time <- nth_post_data %>%
select(author, nth_post, previous_author_JSD, current_community_JSD, subreddit) %>%
gather("measure", "value", -nth_post, -subreddit, -author)
over_individual_time_ms <- over_individual_time %>%
group_by(subreddit, nth_post, measure) %>%
summarize(mean_JSD = mean(value, na.rm = T),
n = n()) %>%
left_join(subreddit_meta) %>%
ungroup() %>%
mutate(subreddit = fct_reorder(subreddit, n_authors))
over_individual_time_ms %>%
ggplot(aes(x = nth_post, y = mean_JSD, group = measure, color = measure)) +
geom_point() +
geom_smooth() +
ggtitle("Post similarity over indvidual time (with community reference)") +
xlab("Nth post") +
facet_wrap(~subreddit, scales = "free_y") +
ylab("Mean JSD") +
theme_classic()
over_individual_time_ms <- over_individual_time %>%
group_by(subreddit, nth_post, measure) %>%
summarize(mean_JSD = mean(value, na.rm = T),
n = n()) %>%
filter(n >= 50) %>%
left_join(subreddit_meta) %>%
ungroup() %>%
mutate(subreddit = fct_reorder(subreddit, n_authors))
over_individual_time_ms %>%
ggplot(aes(x = nth_post, y = mean_JSD, group = measure, color = measure)) +
geom_point() +
geom_smooth() +
ggtitle("Post similarity over indvidual time (with community reference)") +
xlab("Nth post") +
ylab("Mean JSD") +
facet_wrap(~subreddit, scales = "free_y") +
theme_classic()
over_individual_time_ms <- over_individual_time %>%
filter(nth_post <= 25) %>%
group_by(subreddit, nth_post, measure) %>%
summarize(mean_JSD = mean(value, na.rm = T),
n = n()) %>%
group_by(subreddit, measure) %>%
nest() %>%
mutate(temp = map(data, ~tidy(cor.test(.$nth_post, .$mean_JSD)))) %>%
select(-data) %>%
unnest() %>%
left_join(subreddit_meta)
over_individual_time_ms %>%
select(1,2,3, 7, 8, 11:16) %>%
gather("measure2", "value", -1:-5) %>%
ggplot(aes(x = value, y = estimate, group = measure, color = measure))+
geom_label(aes(label = subreddit)) +
geom_smooth(method = "lm") +
facet_wrap(~measure2, scales = "free_x") +
scale_x_log10() +
geom_hline(aes(yintercept = 0), linetype = 2) +
geom_pointrange(aes(ymin = conf.low, ymax = conf.high), alpha = .2) +
theme_classic()
over_individual_time_ms <- over_individual_time %>%
filter(nth_post <= 50) %>%
group_by(subreddit, nth_post, measure) %>%
summarize(mean_JSD = mean(value, na.rm = T),
n = n()) %>%
group_by(subreddit, measure) %>%
nest() %>%
mutate(temp = map(data, ~tidy(cor.test(.$nth_post, .$mean_JSD)))) %>%
select(-data) %>%
unnest() %>%
left_join(subreddit_meta)
over_individual_time_ms %>%
select(1,2,3, 7, 8, 11:16) %>%
gather("measure2", "value", -1:-5) %>%
ggplot(aes(x = value, y = estimate, group = measure, color = measure))+
geom_label(aes(label = subreddit)) +
geom_smooth(method = "lm") +
facet_wrap(~measure2, scales = "free_x") +
scale_x_log10() +
geom_hline(aes(yintercept = 0), linetype = 2) +
geom_pointrange(aes(ymin = conf.low, ymax = conf.high), alpha = .2) +
theme_classic()
over_individual_time_ms <- over_individual_time %>%
filter(nth_post <= 100) %>%
group_by(subreddit, nth_post, measure) %>%
summarize(mean_JSD = mean(value, na.rm = T),
n = n()) %>%
group_by(subreddit, measure) %>%
nest() %>%
mutate(temp = map(data, ~tidy(cor.test(.$nth_post, .$mean_JSD)))) %>%
select(-data) %>%
unnest() %>%
left_join(subreddit_meta)
over_individual_time_ms %>%
select(1,2,3, 7, 8, 11:16) %>%
gather("measure2", "value", -1:-5) %>%
ggplot(aes(x = value, y = estimate, group = measure, color = measure))+
geom_label(aes(label = subreddit)) +
geom_smooth(method = "lm") +
facet_wrap(~measure2, scales = "free_x") +
scale_x_log10() +
geom_hline(aes(yintercept = 0), linetype = 2) +
geom_pointrange(aes(ymin = conf.low, ymax = conf.high), alpha = .2) +
theme_classic()