Age metadata

From Matt: the age variable (`earliest_advert_level_months`) was generated from ratings by Scholastic and Amazon. When the two sources conflicted, the earlier one was taken.

# Load the corpus metadata and keep only its 2nd and 4th columns. The columns
# are picked by position, so this depends on the column order of the CSV.
# NOTE(review): presumably these are the book identifier used for joining and
# earliest_advert_level_months -- confirm against the file.
corpus_metadata <- select(read_csv("data/corpus_metadata.csv"), 2, 4)

# Histogram of earliest_advert_level_months using bins that are 12 units wide
# (one year per bar, if the column is in months as its name suggests).
ggplot(data = corpus_metadata) +
  geom_histogram(
    mapping = aes(x = earliest_advert_level_months),
    binwidth = 12
  ) +
  labs(title = "Age Distribution") +
  theme_bw()

Genderness by age

Books get less female as the age level increases, but then the trend looks U-shaped?

# Load the per-book gender scores; the code below reads its token_mean and
# gender_measure columns.
gender_by_book <- read_csv(file = "data/gender_by_book_token.csv")

# Summarise token_mean by 12-unit age bin and gender measure.
gender_by_age <- gender_by_book %>%
  # No `by` is given, so the join uses whatever columns the two tables share.
  left_join(y = corpus_metadata) %>%
  # Bins are right-closed: (12,24], (24,36], (36,48], (48,60]. A value of
  # exactly 12, or anything above 60, falls outside every bin and becomes NA.
  mutate(
    age_bin = cut(earliest_advert_level_months, breaks = seq(12, 60, by = 12))
  ) %>%
  # Drop rows that fell outside the bins or have no age level at all.
  filter(!is.na(age_bin)) %>%
  group_by(age_bin, gender_measure) %>%
  # NOTE(review): multi_boot_standard is defined outside this file; presumably
  # it returns a bootstrapped `mean` with `ci_lower` / `ci_upper` per group
  # (those columns are used below) -- confirm.
  multi_boot_standard(col = "token_mean") %>%
  rename(mean_gender = mean)

# Mean gender score per age bin with its ci_lower / ci_upper interval, one
# panel per gender measure, plus a linear trend across the bins.
ggplot(gender_by_age, aes(x = age_bin, y = mean_gender)) +
  # age_bin is a factor, so by default each bin forms its own group and the
  # linear fit would see a single point per group (no line is drawn).
  # group = 1 makes the fit run across all bins within each panel.
  geom_smooth(aes(group = 1), method = "lm") +
  geom_pointrange(aes(ymin = ci_lower,
                      ymax = ci_upper)) +
  facet_wrap(~gender_measure, scales = "free") +
  theme_bw()

Sentiment