These analyses are based on a list of 65 children’s “mathy” words that I generated in an ad hoc way. You can see them in a Google Sheet here.

# Input locations: the Google Sheet key for the math-word list and the
# project-relative paths of the two CSV inputs.
MATH_WORDS <- "1GTdLmxLhIEa4hWCGf_relV54CcuVrqORKE1dS3Ajy0Q"
CORPUS_PATH <- here("data/processed/books/tidy_full_corpus_all.csv")
REVIEWS_DATA_PATH <- here("data/processed/other/amazon_gender_scores.csv")

# Math-word list, read from the sheet identified by MATH_WORDS.
math_words <- MATH_WORDS %>%
  read_sheet()

# Book corpus; the corpus_type column is not used below and is dropped.
corpus <- CORPUS_PATH %>%
  read_csv() %>%
  select(-corpus_type)

# Amazon review table with the gendered token counts used further down.
review_data <- REVIEWS_DATA_PATH %>%
  read_csv()

# One audience-gender score per book: the share of gendered review tokens that
# are female, i.e. female / (female + male), summed within each group.
# The grouping keeps the review-count columns alongside book_id, as before;
# only book_id and the score are retained in the result.
by_book_review_data <- review_data %>%
  group_by(book_id, n_reviews_total, n_reviews_gendered,
           prop_review_gendered) %>%
  summarize(female_tokens = sum(n_female_token),
            male_tokens = sum(n_male_token)) %>%
  ungroup() %>%
  transmute(book_id,
            addressee_gender_score_token =
              female_tokens / (female_tokens + male_tokens))

Proportion math words predicted by audience gender

Each point in these analyses is a book. The audience gender measure is the one described in the paper, derived from Amazon Reviews.

# Count, for every book, how many distinct words fall under each math type.
# Words with no match in math_words get type = NA after the left join, so the
# NA row of a book holds its count of non-math distinct words.
# NOTE(review): assumes math_words is keyed by a `word` column with one row per
# word -- confirm against the sheet; duplicate words would inflate the counts.
total_spatial_words_by_type_types <- corpus %>%
  distinct(book_id, word) %>%
  # explicit key: avoids the natural-join message and accidental extra keys
  left_join(math_words, by = "word") %>%
  group_by(book_id, type) %>%
  summarize(n_mathy = n()) %>%
  ungroup() %>%
  # add explicit zero rows for book/type combinations that never occur
  complete(book_id, type, fill = list(n_mathy = 0))

# Total number of distinct words per book: the per-type counts summed over
# every type, including the NA (non-math) row.
all_word_counts_types <- total_spatial_words_by_type_types %>%
  group_by(book_id) %>%
  # TRUE rather than T: T is an ordinary variable and can be reassigned
  summarize(total_words = sum(n_mathy, na.rm = TRUE))

# Per book and math type: proportion of the book's distinct words that are
# math words of that type, plus the book's audience-gender score.
spatial_words_by_type_types <- total_spatial_words_by_type_types %>%
  # drop the NA row, which counts the non-math words
  filter(!is.na(type)) %>%
  left_join(all_word_counts_types, by = "book_id") %>%
  mutate(prop_mathy = n_mathy / total_words,
         # -Inf when a book has no words of this type; filtered out downstream
         log_prop_mathy = log(prop_mathy),
         # small offset keeps zero proportions finite
         log_prop_mathy_corrected = log(prop_mathy + .001)) %>%
  left_join(by_book_review_data, by = "book_id")

# Collapse the per-type rows to one row per book: math-word counts are summed
# over types, while total_words and the gender score are taken from the
# book's first row.
spatial_words_all_types <- spatial_words_by_type_types %>%
  group_by(book_id) %>%
  summarize(
    n_mathy = sum(n_mathy),
    total_words = first(total_words),
    addressee_gender_score_token = first(addressee_gender_score_token)
  ) %>%
  mutate(
    prop_mathy = n_mathy / total_words,
    log_prop_mathy = log(prop_mathy),
    log_prop_mathy_corrected = log(prop_mathy + .001)
  )

All words

# Scatter of log math-word proportion against audience gender score, one point
# per book, with a linear fit.
ggplot(spatial_words_all_types,
       aes(x = addressee_gender_score_token, y = log_prop_mathy)) +
  geom_point(alpha = .8) +
  geom_smooth(method = "lm") +
  labs(x = "Prop. female book audience",
       y = "Log proportion math words")

# Pearson correlation between audience gender score and log math-word
# proportion across books. Rows with a non-finite log proportion (e.g. books
# with no math words, where log(0) = -Inf) are dropped first.
# cor.test is called directly instead of via argument-less nest()/unnest(),
# which are deprecated calling forms in current tidyr.
spatial_words_all_types %>%
  filter(is.finite(log_prop_mathy)) %>%
  with(cor.test(addressee_gender_score_token, log_prop_mathy)) %>%
  tidy() %>%
  kable()
estimate statistic p.value parameter conf.low conf.high method alternative
-0.2022876 -3.173203 0.001708 236 -0.3211874 -0.0771093 Pearson’s product-moment correlation two.sided

Words by math type

# Same scatter as for all words, drawn separately for each math type; each
# panel gets its own y scale.
ggplot(spatial_words_by_type_types,
       aes(x = addressee_gender_score_token, y = log_prop_mathy)) +
  geom_point(alpha = .8) +
  geom_smooth(method = "lm") +
  facet_wrap(~type, scales = "free_y") +
  labs(x = "Prop. female book audience",
       y = "Log proportion math words")

# Pearson correlation between audience gender score and log math-word
# proportion, computed separately within each math type. Rows with a
# non-finite log proportion are dropped first.
spatial_words_by_type_types %>%
  filter(is.finite(log_prop_mathy)) %>%
  # named nest/unnest columns: the unnamed forms are deprecated in tidyr >= 1.0
  nest(data = -type) %>%
  mutate(temp = map(data, ~tidy(cor.test(.$addressee_gender_score_token,
                                         .$log_prop_mathy)))) %>%
  select(-data) %>%
  unnest(cols = temp) %>%
  kable()
type estimate statistic p.value parameter conf.low conf.high method alternative
number -0.0094058 -0.1240769 0.9013976 174 -0.1571073 0.1387072 Pearson’s product-moment correlation two.sided
spatial -0.1994039 -3.0927887 0.0022269 231 -0.3197314 -0.0727468 Pearson’s product-moment correlation two.sided
arithmetic -0.2073081 -2.2526523 0.0262121 113 -0.3761402 -0.0251524 Pearson’s product-moment correlation two.sided
shape -0.2724671 -1.0595673 0.3072805 14 -0.6767661 0.2580978 Pearson’s product-moment correlation two.sided

Proportion math words predicted by main character gender

# Main-character gender coded per book.
CHARACTER_DATA <- here("data/raw/other/character_gender_by_book.csv")
character_data <- CHARACTER_DATA %>%
  read_csv()

# Keep only the book id and main-character gender, recoding the raw codes
# (F / M / AND / MIXED) to readable factor levels.
tidy_character_data <- character_data %>%
  transmute(
    book_id,
    char_main_gender = fct_recode(
      char_main_gender,
      female = "F",
      male = "M",
      indeterminate = "AND",
      mixed = "MIXED"
    )
  )

All words

# Bootstrapped summary of log math-word proportion for each main-character
# gender (the plot below uses its mean, ci_lower and ci_upper columns).
# Books with a non-finite log proportion are excluded.
all_words_by_book_character <- spatial_words_all_types %>%
  filter(is.finite(log_prop_mathy)) %>%
  # explicit key instead of a natural join
  left_join(tidy_character_data, by = "book_id") %>%
  group_by(char_main_gender) %>%
  multi_boot_standard(col = "log_prop_mathy")

# Mean log math-word proportion with bootstrapped interval for each
# main-character gender.
ggplot(all_words_by_book_character,
       aes(x = char_main_gender, y = mean)) +
  geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
  labs(x = "Main character gender",
       y = "Log proportion math words")

Words by math type

# Bootstrapped summary of log math-word proportion for each combination of
# main-character gender and math type. Rows with a non-finite log proportion
# are excluded.
words_type_by_book_character <- spatial_words_by_type_types %>%
  filter(is.finite(log_prop_mathy)) %>%
  # explicit key instead of a natural join
  left_join(tidy_character_data, by = "book_id") %>%
  group_by(char_main_gender, type) %>%
  multi_boot_standard(col = "log_prop_mathy")
  
# Per-type panels of mean log math-word proportion with bootstrapped interval
# for each main-character gender; x labels are rotated to fit.
ggplot(words_type_by_book_character,
       aes(x = char_main_gender, y = mean)) +
  geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
  facet_wrap(~type, scales = "free_y") +
  labs(x = "Main character gender",
       y = "Log proportion math words") +
  theme(axis.text.x = element_text(angle = 90))