# Locations of the two tidied CSV exports, resolved through here().
PGH_DATA_PATH <- here(
  "exploratory_analyses/01_reddit_pilot/data/pittsburgh2years_tidy.csv"
)
RP_DATA_PATH <- here(
  "exploratory_analyses/01_reddit_pilot/data/redpill2years_tidy.csv"
)

# Read each export and discard its `body` column. `guess_max` lets readr
# inspect up to 1,000,000 rows when guessing column types.
tidy_pgh <- select(read_csv(PGH_DATA_PATH, guess_max = 1000000), -body)
tidy_rp <- select(read_csv(RP_DATA_PATH, guess_max = 1000000), -body)
# Stack both exports into one table, then:
#   * drop rows whose author is "[deleted]",
#   * keep columns 1-8 and 20 by position
#     (NOTE(review): positional selection depends on the CSV column order;
#     confirm which columns these are before changing the inputs),
#   * keep only rows that are comments and whose cleaned body is not "removed",
#   * record the character length of the cleaned body.
reddit_text <- bind_rows(tidy_pgh, tidy_rp) %>%
  filter(author != "[deleted]") %>%
  select(1:8, 20) %>%
  filter(text_type == "comment") %>%
  filter(body_clean != "removed") %>%
  mutate(text_length = nchar(body_clean))
# Words counted as first-person pronouns.
FIRST_PRONOUNS <- c(
  "i", "me", "my", "mine"
)

# Cache file holding the (subreddit, comment_id) pairs of comments that
# contain at least one of FIRST_PRONOUNS.
UNNESTED_TOKENS <- here(
  "exploratory_analyses/01_reddit_pilot/data/unnested_tokens_first.csv"
)

# The disabled code below is what builds that cache: it tokenizes
# `body_clean` into words, keeps comments containing a first-person pronoun,
# and writes one row per such comment with `contains_first_pronoun = TRUE`.
# Uncomment it to regenerate the file.
#
# unnested_reddit <- reddit_text %>%
#   select(subreddit, comment_id, body_clean) %>%
#   unnest_tokens(word, body_clean)
# first_posts <- unnested_reddit %>%
#   filter(word %in% FIRST_PRONOUNS) %>%
#   distinct(subreddit, comment_id) %>%
#   mutate(contains_first_pronoun = TRUE)
# write_csv(first_posts, UNNESTED_TOKENS)

# Load the cached result instead of recomputing it.
first_posts <- data.table::fread(input = UNNESTED_TOKENS)
# One row per comment, annotated with:
#   * comment_num          - chronological index of the comment within its
#                            (subreddit, author) group,
#   * time_since_birth     - created_utc minus the author's earliest
#                            created_utc in that subreddit
#                            (NOTE(review): units follow created_utc; confirm
#                            whether it is epoch seconds or a date-time),
#   * time_since_birth_bin - time_since_birth cut into 100 equal-width
#                            intervals over the pooled data,
#   * contains_first_pronoun - TRUE when the comment is listed in
#                            `first_posts`, FALSE otherwise.
nested_authors_text <- reddit_text %>%
  group_by(subreddit, author) %>%
  mutate(birth = min(created_utc)) %>%
  # arrange() ignores the grouping by default; sorting by author then time
  # still leaves each (subreddit, author) group in chronological order.
  arrange(author, created_utc) %>%
  # row_number() replaces 1:n(): same values, idiomatic dplyr.
  mutate(comment_num = row_number()) %>%
  ungroup() %>%
  mutate(
    time_since_birth = as.numeric(created_utc - birth),
    time_since_birth_bin = cut(time_since_birth, 100)
  ) %>%
  # comment_id was previously listed twice; once is enough.
  select(
    subreddit, author, comment_num, comment_id,
    time_since_birth, time_since_birth_bin, created_utc
  ) %>%
  # Spell out the join keys rather than relying on an implicit natural join,
  # so an unexpected extra shared column cannot silently change the match.
  left_join(first_posts, by = c("subreddit", "comment_id")) %>%
  # Comments absent from first_posts come back as NA; treat them as FALSE.
  mutate(
    contains_first_pronoun = ifelse(
      is.na(contains_first_pronoun),
      FALSE,
      contains_first_pronoun
    )
  )
# Stage 1: for every author within a subreddit and time bin, the proportion
# of their comments flagged as containing a first-person pronoun.
author_first_person <- summarize(
  group_by(nested_authors_text, subreddit, author, time_since_birth_bin),
  contains_first_pronoun = mean(contains_first_pronoun)
)

# Stage 2: average those per-author proportions within each subreddit and
# time bin, so every author contributes equally regardless of comment count.
time_since_birth <- summarize(
  group_by(author_first_person, subreddit, time_since_birth_bin),
  contains_first_pronoun = mean(contains_first_pronoun)
)
# Plot the mean first-person-pronoun proportion against the time-since-birth
# bin, one panel per subreddit. `group = subreddit` makes the smoother treat
# each subreddit's bins as a single series.
ggplot(
  data = time_since_birth,
  mapping = aes(
    x = time_since_birth_bin,
    y = contains_first_pronoun,
    group = subreddit
  )
) +
  geom_point() +
  geom_smooth() +
  facet_wrap(~subreddit) +
  theme_classic() +
  # Rotate the bin labels so they stay legible.
  theme(axis.text.x = element_text(angle = 90))
