Math words in kidbook corpus

Proportion math words predicted by audience gender
- All words
- Words by math type
Proportion math words predicted by main character gender
- All words
- Words by math type

These analyses are based on a list of 65 children’s “mathy” words that I generated ad hoc. You can see them in a Google Sheet here.

# Google Sheets ID of the hand-built list of "mathy" words.
MATH_WORDS <- "1GTdLmxLhIEa4hWCGf_relV54CcuVrqORKE1dS3Ajy0Q"

# Word list; downstream code joins it onto the corpus and groups on `type`.
math_words <- MATH_WORDS %>%
  read_sheet()

# Tidy book corpus; `corpus_type` is dropped on load. Later chunks use the
# `book_id` and `word` columns.
CORPUS_PATH <- here("data/processed/books/tidy_full_corpus_all.csv")
corpus <- CORPUS_PATH %>%
  read_csv() %>%
  select(-corpus_type)

# Amazon review data; the female/male token counts in it are turned into a
# per-book audience gender score below.
REVIEWS_DATA_PATH <- here("data/processed/other/amazon_gender_scores.csv")
review_data <- REVIEWS_DATA_PATH %>%
  read_csv()

# Audience gender score per book: female tokens as a share of all gendered
# (female + male) tokens. The review-count columns are part of the grouping
# key only and are dropped from the result.
# NOTE(review): presumably the review-count columns are constant within a
# book, giving one row per book_id -- confirm against the input file.
by_book_review_data <- review_data %>%
  group_by(
    book_id,
    n_reviews_total,
    n_reviews_gendered,
    prop_review_gendered
  ) %>%
  summarize(
    female_tokens = sum(n_female_token),
    male_tokens = sum(n_male_token)
  ) %>%
  ungroup() %>%
  transmute(
    book_id,
    addressee_gender_score_token =
      female_tokens / (female_tokens + male_tokens)
  )

Proportion math words predicted by audience gender

Each point in these analyses is a book. The audience gender measure is the one described in the paper, derived from Amazon Reviews.

# Number of distinct words (types) per book and math-word category.
# Words that are not in `math_words` get `type = NA` from the left join, so
# the NA row of each book counts its non-math word types.
# NOTE(review): the join key is implicit; presumably `word` is the only column
# shared with `math_words` -- confirm against the sheet.
total_spatial_words_by_type_types <- corpus %>%
  distinct(book_id, word) %>%
  left_join(math_words) %>%
  count(book_id, type, name = "n_mathy") %>%
  # Make missing book x type combinations explicit with a zero count.
  complete(book_id, type, fill = list(n_mathy = 0))

# Total number of distinct words per book. The sum runs over every `type`
# row, including the NA (non-math) row, so it is the denominator for the
# proportion of math words.
all_word_counts_types <- total_spatial_words_by_type_types %>%
  group_by(book_id) %>%
  summarize(total_words = sum(n_mathy, na.rm = TRUE))

# One row per book x math-word type: proportion of the book's distinct words
# that are math words of that type, joined with the audience gender score.
# Join keys are spelled out (`book_id` is the only column shared in each
# join) so the joins cannot silently change if a table gains a column.
spatial_words_by_type_types <- total_spatial_words_by_type_types %>%
  # Drop the NA row, which holds the non-math words.
  filter(!is.na(type)) %>%
  left_join(all_word_counts_types, by = "book_id") %>%
  mutate(
    prop_mathy = n_mathy / total_words,
    # -Inf when a book has no words of this type; filtered with is.finite()
    # before the correlations and bootstraps below.
    log_prop_mathy = log(prop_mathy),
    # Small constant keeps zero proportions finite.
    log_prop_mathy_corrected = log(prop_mathy + .001)
  ) %>%
  left_join(by_book_review_data, by = "book_id")

# Collapse over math-word type: one row per book with the total math-word
# count. `total_words` and the gender score are taken from the first row of
# each book.
spatial_words_all_types <- spatial_words_by_type_types %>%
  group_by(book_id) %>%
  summarize(
    n_mathy = sum(n_mathy),
    total_words = first(total_words),
    addressee_gender_score_token = first(addressee_gender_score_token)
  ) %>%
  mutate(
    prop_mathy = n_mathy / total_words,
    log_prop_mathy = log(prop_mathy),
    log_prop_mathy_corrected = log(prop_mathy + .001)
  )

All words

# Scatter plot of log proportion math words against audience gender, one
# point per book. Restricted to finite log proportions so that the points,
# the fitted line and the correlation reported below all use the same books
# (books with zero math words have log_prop_mathy = -Inf).
spatial_words_all_types %>%
  filter(is.finite(log_prop_mathy)) %>%
  ggplot(aes(x = addressee_gender_score_token, y = log_prop_mathy)) +
  geom_point(alpha = .8) +
  geom_smooth(method = "lm") +
  ylab("Log proportion math words") +
  xlab("Prop. female book audience")

# Pearson correlation between audience gender score and log proportion math
# words, over books with a finite log proportion. A single test needs no
# nest()/unnest() round trip (calling both with no arguments is deprecated
# in tidyr >= 1.0), so the test is run directly on the filtered data.
spatial_words_all_types %>%
  filter(is.finite(log_prop_mathy)) %>%
  with(cor.test(addressee_gender_score_token, log_prop_mathy)) %>%
  tidy() %>%
  kable()

estimate	statistic	p.value	parameter	conf.low	conf.high	method	alternative
-0.2022876	-3.173203	0.001708	236	-0.3211874	-0.0771093	Pearson’s product-moment correlation	two.sided

Words by math type

# Same scatter plot, faceted by math-word type. Restricted to finite log
# proportions so the points match the fitted lines and the per-type
# correlations below (a book with no words of a type has -Inf there).
spatial_words_by_type_types %>%
  filter(is.finite(log_prop_mathy)) %>%
  ggplot(aes(x = addressee_gender_score_token, y = log_prop_mathy)) +
  facet_wrap(~type, scales = "free_y") +
  geom_point(alpha = .8) +
  geom_smooth(method = "lm") +
  ylab("Log proportion math words") +
  xlab("Prop. female book audience")

# Pearson correlation between audience gender score and log proportion math
# words, computed separately for each math-word type. Uses the tidyr >= 1.0
# forms of nest()/unnest(); the bare `nest(-type)` / `unnest()` calls are
# deprecated and warn.
spatial_words_by_type_types %>%
  filter(is.finite(log_prop_mathy)) %>%
  nest(data = -type) %>%
  mutate(temp = map(data, ~ tidy(cor.test(.$addressee_gender_score_token,
                                          .$log_prop_mathy)))) %>%
  select(-data) %>%
  unnest(cols = temp) %>%
  kable()

type	estimate	statistic	p.value	parameter	conf.low	conf.high	method	alternative
number	-0.0094058	-0.1240769	0.9013976	174	-0.1571073	0.1387072	Pearson’s product-moment correlation	two.sided
spatial	-0.1994039	-3.0927887	0.0022269	231	-0.3197314	-0.0727468	Pearson’s product-moment correlation	two.sided
arithmetic	-0.2073081	-2.2526523	0.0262121	113	-0.3761402	-0.0251524	Pearson’s product-moment correlation	two.sided
shape	-0.2724671	-1.0595673	0.3072805	14	-0.6767661	0.2580978	Pearson’s product-moment correlation	two.sided

Proportion math words predicted by main character gender

# Per-book main character gender codes.
CHARACTER_DATA <- here("data/raw/other/character_gender_by_book.csv")
character_data <- CHARACTER_DATA %>%
  read_csv()

# Keep only the main character gender and replace the raw codes with
# readable labels.
tidy_character_data <- character_data %>%
  select(book_id, char_main_gender) %>%
  mutate(
    char_main_gender = fct_recode(
      char_main_gender,
      female = "F",
      male = "M",
      indeterminate = "AND",
      mixed = "MIXED"
    )
  )

All words

# Summary of log proportion math words for each main character gender, over
# books with a finite log proportion. The `mean`, `ci_lower` and `ci_upper`
# columns produced here are what the plot below reads.
# The join key is spelled out; `book_id` is the only shared column.
all_words_by_book_character <- spatial_words_all_types %>%
  filter(is.finite(log_prop_mathy)) %>%
  left_join(tidy_character_data, by = "book_id") %>%
  group_by(char_main_gender) %>%
  multi_boot_standard(col = "log_prop_mathy")

# Mean log proportion math words, with its interval, by main character gender.
ggplot(
  all_words_by_book_character,
  aes(x = char_main_gender, y = mean, ymin = ci_lower, ymax = ci_upper)
) +
  geom_pointrange() +
  labs(x = "Main character gender", y = "Log proportion math words")

Words by math type

# Summary of log proportion math words for each main character gender x
# math-word type, over rows with a finite log proportion.
# The join key is spelled out; `book_id` is the only shared column.
words_type_by_book_character <- spatial_words_by_type_types %>%
  filter(is.finite(log_prop_mathy)) %>%
  left_join(tidy_character_data, by = "book_id") %>%
  group_by(char_main_gender, type) %>%
  multi_boot_standard(col = "log_prop_mathy")
  
# Mean log proportion math words, with its interval, by main character
# gender, one panel per math-word type. Each panel has its own y scale.
ggplot(
  words_type_by_book_character,
  aes(x = char_main_gender, y = mean, ymin = ci_lower, ymax = ci_upper)
) +
  geom_pointrange() +
  facet_wrap(~type, scales = "free_y") +
  labs(x = "Main character gender", y = "Log proportion math words") +
  theme(axis.text.x = element_text(angle = 90))

Math words in kidbook corpus

gender analysis

Molly Lewis

2020-08-04

Proportion math words predicted by audience gender

All words

Words by math type

Proportion math words predicted by main character gender

All words

Words by math type