From Matt: the age variable (`earliest_advert_level_months`) was generated from the age ratings given by Scholastic and Amazon. When the two sources conflicted, the earlier of the two was taken.
# Load the corpus metadata, keeping only its 2nd and 4th columns (selected by
# position, so this depends on the CSV's column order). The histogram below
# needs `earliest_advert_level_months` to be one of the two.
corpus_metadata <- select(read_csv("data/corpus_metadata.csv"), 2, 4)

# Histogram of the advertised age level (in months), in 12-month-wide bins.
ggplot(
  data = corpus_metadata,
  mapping = aes(x = earliest_advert_level_months)
) +
  geom_histogram(binwidth = 12) +
  labs(title = "Age Distribution") +
  theme_bw()
Books get less female as the advertised age level increases, but then the pattern looks U-shaped?
# Per-book gender scores; the `gender_measure` and `token_mean` columns are
# used below.
gender_by_book <- read_csv("data/gender_by_book_token.csv")

# Mean gender score per age bin and gender measure.
# - The join has no explicit `by`, so it matches on whatever column names the
#   two tables share (NOTE(review): presumably a book identifier -- confirm).
# - `cut()` builds right-closed intervals (12,24], (24,36], (36,48], (48,60];
#   values of exactly 12 or below, or above 60, become NA and are dropped.
# - `multi_boot_standard()` is not defined in this file; the code below and
#   the plot expect it to return `mean`, `ci_lower` and `ci_upper` per group.
gender_by_age <- gender_by_book %>%
  left_join(corpus_metadata) %>%
  mutate(
    age_bin = cut(earliest_advert_level_months, breaks = seq(12, 60, by = 12))
  ) %>%
  filter(!is.na(age_bin)) %>%
  group_by(age_bin, gender_measure) %>%
  multi_boot_standard(col = "token_mean") %>%
  rename(mean_gender = mean)
# Plot the mean gender score with its confidence interval for each age bin,
# one panel per gender measure (free axis scales per panel).
ggplot(gender_by_age, aes(x = age_bin, y = mean_gender)) +
  # `age_bin` is a factor, so by default ggplot2 places every bin in its own
  # group; each group then has a single x value and the linear fit draws
  # nothing. `group = 1` fits one trend line across the bins in each panel.
  geom_smooth(aes(group = 1), method = "lm") +
  geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
  facet_wrap(~gender_measure, scales = "free") +
  theme_bw()