META_PATH <- here("exploratory_analyses/03_systematic_sample/data/subreddit_meta_data.csv")
meta_data <- read_csv(META_PATH, col_names = c("subreddit", "author_H","author_n","word_H","word_mean_n","word_sd","word_total","score_mean","score_sd","score_H","comments_n_long","comments_n_all","posts_n_all","comments_posts_ratio","author_longevity_mean","author_longevity_sd","author_longevity_H","author_lag_sd","author_lag_H", "author_lag_mean"))
meta_data2 <- meta_data %>%
mutate(word_H = word_H/log(comments_n_long),
score_H = score_H/log(comments_n_long),
author_longevity_H = author_longevity_H/log(author_n),
author_lag_H = author_lag_H/log(author_n)) %>%
drop_na()
DT::datatable(meta_data2 %>% arrange(-author_n))
We ended up with 83.
meta_data2 %>%
gather("measure", "value", -subreddit) %>%
ggplot(aes(x = value)) +
geom_histogram() +
facet_wrap(~measure, scales = "free_x") +
scale_x_log10() +
theme_classic()
Pairwise correlations
meta_data2 %>%
mutate_if(is.numeric, log) %>%
mutate_if(is.numeric, list(~na_if(., -Inf))) %>%
make_corr_plot()
PAIRWISE_TOPIC_JSD <- "/Volumes/wilbur_the_great/LANGSCALES_subreddit_sample/jsd_nth_post/"
nth_post_data <- map_df(list.files(PAIRWISE_TOPIC_JSD, full.names = T), ~{read_csv(.x) %>% mutate(subreddit = .x)}) %>%
mutate(subreddit = str_replace(subreddit, paste0(PAIRWISE_TOPIC_JSD, "/"), ""),
subreddit = str_replace(subreddit, "_jsd_nth_post.csv", ""))
over_individual_time <- nth_post_data %>%
select(author, nth_post, previous_author_JSD, current_community_JSD, subreddit) %>%
#mutate(previous_to_current = previous_author_JSD + current_community_JSD) %>%
gather("measure", "value", -nth_post, -subreddit, -author)
over_individual_time_ms <- over_individual_time %>%
group_by(subreddit, nth_post, measure) %>%
summarize(mean_JSD = mean(value, na.rm = T),
n = n()) %>%
left_join(meta_data2) %>%
ungroup() %>%
mutate(subreddit = fct_reorder(subreddit, author_n))
over_individual_time_ms %>%
ggplot(aes(x = nth_post, y = mean_JSD, group = measure, color = measure)) +
geom_point() +
geom_smooth() +
ggtitle("Post similarity over indvidual time (with community reference)") +
xlab("Nth post") +
facet_wrap(~subreddit, scales = "free_y", nrow = 12) +
ylab("Mean JSD") +
theme_classic() +
theme(legend.position = "bottom")
good_subreddits <- over_individual_time %>%
group_by(subreddit, nth_post, measure) %>%
summarize(mean_JSD = mean(value, na.rm = T),
n = n()) %>%
filter(!is.na(mean_JSD)) %>% ungroup() %>%
count(subreddit) %>%
filter(n>25) %>%
pull(subreddit)
over_individual_time_ms <- over_individual_time %>%
filter(subreddit %in% good_subreddits) %>%
group_by(subreddit, nth_post, measure) %>%
summarize(mean_JSD = mean(value, na.rm = T),
n = n()) %>%
group_by(subreddit, measure) %>%
nest() %>%
mutate(temp = map(data, ~ tidy(cor.test(.$nth_post, .$mean_JSD)))) %>%
select(-data) %>%
unnest() %>%
left_join(meta_data2)
over_individual_time_ms %>%
select(-word_H, -word_mean_n, -word_sd, -author_longevity_sd, -author_lag_sd, -comments_n_all) %>%
select(1,2,3, 11:23) %>%
gather("measure2", "value", -1:-3) %>%
ggplot(aes(x = value, y = estimate, group = measure, color = measure))+
geom_point(alpha = .3) +
# geom_label(aes(label = subreddit)) +
geom_smooth(method = "lm") +
facet_wrap(~measure2, scales = "free") +
scale_x_log10() +
geom_hline(aes(yintercept = 0), linetype = 2) +
#geom_pointrange(aes(ymin = conf.low, ymax = conf.high), alpha = .2) +
theme_classic()
over_individual_time_ms_community <- over_individual_time_ms %>%
filter(measure == "current_community_JSD")
over_individual_time_ms_individual <- over_individual_time_ms %>%
filter(measure == "previous_author_JSD")
over_individual_time_ms_p_to_c <- over_individual_time_ms %>%
filter(measure == "previous_to_current")
meta_data2 %>%
select(author_n, word_total, author_H, author_longevity_mean, score_mean, score_sd, comments_n_long) %>%
mutate_if(is.numeric, log) %>%
mutate_if(is.numeric, list(~na_if(., -Inf))) %>%
make_corr_plot()
Predicting author_i_t1 to author_i_t2 post distance over “time” (posts); larger estimate values mean that posts grow more disimiliar to each other over time. Number of words in a subreddit (word_total) is a strong predictor of this measure. Controling for number of words, author entropy and mean score, there’s some evidence to suggest that in larger communities, authors change (with respect to themselves) to a greater degree. The effect goes away if you don’t control for score.
lm(estimate ~
log(author_n) # how many commenters are there
+ author_H # how much is each author contributing to the subreddit
+ log(word_total) # how many words in the comments, total?
+ log(score_mean) # what's the mean score of comments
#+ log(score_sd) # what's the variance in score of comments
,data = over_individual_time_ms_individual) %>%
summary() %>%
tidy() %>%
kable()
term | estimate | std.error | statistic | p.value |
---|---|---|---|---|
(Intercept) | 2.5679107 | 0.7741765 | 3.316958 | 0.0016024 |
log(author_n) | 0.1669674 | 0.0784787 | 2.127550 | 0.0377913 |
author_H | -0.4243229 | 0.3869417 | -1.096607 | 0.2775067 |
log(word_total) | -0.2646483 | 0.0749937 | -3.528941 | 0.0008418 |
log(score_mean) | -0.1175929 | 0.0719180 | -1.635097 | 0.1076381 |
Some evidence that other “social” variables might matter
lm(estimate ~
log(author_n) # how many commenters are there
+ author_H # how much is each author contributing to the subreddit
+ author_longevity_H
+ author_lag_H
+ log(author_longevity_mean)
+ log(author_lag_mean)
+ log(word_total) # how many words in the comments, total?
+ log(score_mean) # what's the mean score of comments
#+ log(score_sd) # what's the variance in score of comments
,data = over_individual_time_ms_individual) %>%
summary() %>%
tidy() %>%
kable()
term | estimate | std.error | statistic | p.value |
---|---|---|---|---|
(Intercept) | 2.4619625 | 2.3500607 | 1.0476165 | 0.2996604 |
log(author_n) | 0.3425259 | 0.1050078 | 3.2619089 | 0.0019573 |
author_H | -0.6938361 | 0.5561010 | -1.2476800 | 0.2177378 |
author_longevity_H | 2.0375119 | 0.6243736 | 3.2632896 | 0.0019494 |
author_lag_H | -0.5116690 | 1.1321277 | -0.4519535 | 0.6531821 |
log(author_longevity_mean) | -0.5451585 | 0.1711666 | -3.1849585 | 0.0024473 |
log(author_lag_mean) | 0.5188171 | 0.1663040 | 3.1196909 | 0.0029513 |
log(word_total) | -0.3680464 | 0.1220633 | -3.0152107 | 0.0039654 |
log(score_mean) | -0.2478159 | 0.0974541 | -2.5428994 | 0.0140088 |
N = 53 subreddits
This is preliminary - missing data for about 15 communities.
PAIRWISE_TOPIC_JSD <- "/Volumes/wilbur_the_great/LANGSCALES_subreddit_sample/jsd_over_community_time/"
over_community_time <- map_df(list.files(PAIRWISE_TOPIC_JSD, full.names = T), ~{read_csv(.x) %>% mutate(subreddit = .x)}) %>%
mutate(subreddit = str_replace(subreddit, paste0(PAIRWISE_TOPIC_JSD, "/"), ""),
subreddit = str_replace(subreddit, "_jsd_over_community_time.csv", "")) %>%
filter(n > 1)
over_community_time_with_meta <- over_community_time %>%
left_join(meta_data2) %>%
mutate(subreddit = fct_reorder(subreddit, author_n))
good_subreddits2 <- over_community_time_with_meta %>%
count(subreddit) %>%
filter(n >2) %>%
pull(subreddit)
jsd_time_correlation <- over_community_time_with_meta %>%
#filter(!is.na(mean_jsd)) %>%
filter(subreddit %in% good_subreddits2) %>%
group_by(subreddit) %>%
nest() %>%
mutate(temp = map(data, ~tidy(cor.test(as.numeric(.$group), .$mean_jsd)))) %>%
select(-data) %>%
unnest() %>%
left_join(meta_data2)
jsd_time_correlation %>%
select(-word_H, -word_mean_n, -word_sd, -author_longevity_sd, -author_lag_sd, -comments_n_all) %>%
select(1,2,10:22) %>%
gather("measure", "value", -1:-2) %>%
ggplot(aes(x = value, y = estimate))+
#geom_label(aes(label = subreddit)) +
geom_smooth(method = "lm") +
facet_wrap(~measure, scales = "free_x") +
scale_x_log10() +
geom_hline(aes(yintercept = 0), linetype = 2) +
geom_point() +
# geom_pointrange(aes(ymin = conf.low, ymax = conf.high), alpha = .2) +
theme_classic()
lm(estimate ~
log(author_n) # how many commenters are there
+ author_H # how much is each author contributing to the subreddit
+ author_longevity_H
+ author_lag_H
+ log(author_longevity_mean)
+ log(author_lag_mean)
+ log(word_total) # how many words in the comments, total?
+ log(score_mean) # what's the mean score of comments
#+ log(score_sd) # what's the variance in score of comments
,data = jsd_time_correlation) %>%
summary() %>%
tidy() %>%
kable()
term | estimate | std.error | statistic | p.value |
---|---|---|---|---|
(Intercept) | 0.3935388 | 1.1231446 | 0.3503901 | 0.7277179 |
log(author_n) | -0.1406038 | 0.0748984 | -1.8772609 | 0.0671220 |
author_H | 1.1904764 | 0.4389935 | 2.7118316 | 0.0095096 |
author_longevity_H | -0.8193841 | 0.3894437 | -2.1039863 | 0.0411290 |
author_lag_H | 0.8755055 | 0.4855346 | 1.8031786 | 0.0782108 |
log(author_longevity_mean) | 0.0701314 | 0.0660619 | 1.0616018 | 0.2942089 |
log(author_lag_mean) | -0.1781902 | 0.0731044 | -2.4374748 | 0.0189008 |
log(word_total) | 0.1160264 | 0.0740887 | 1.5660464 | 0.1245023 |
log(score_mean) | -0.0771389 | 0.0568828 | -1.3561014 | 0.1819858 |