These analyses are based on a list of 65 children’s “mathy” words that I generated in an ad hoc way. You can see them in a Google Sheet here.
# ---- Input locations ----
# Google Sheets identifier of the hand-built list of "mathy" words.
MATH_WORDS <- "1GTdLmxLhIEa4hWCGf_relV54CcuVrqORKE1dS3Ajy0Q"
# Tidy book corpus and the Amazon-review gender scores, resolved relative to
# the project root via here().
CORPUS_PATH <- here("data/processed/books/tidy_full_corpus_all.csv")
REVIEWS_DATA_PATH <- here("data/processed/other/amazon_gender_scores.csv")

# ---- Read inputs ----
# NOTE(review): math_words is later joined onto (book_id, word) rows and is
# expected to supply a `type` column -- confirm against the sheet.
math_words <- read_sheet(MATH_WORDS)
# The corpus_type column is not used in this analysis, so drop it on load.
corpus <- select(read_csv(CORPUS_PATH), -corpus_type)
review_data <- read_csv(REVIEWS_DATA_PATH)
# One row per book: the audience gender score, computed as the summed female
# tokens divided by the summed female + male tokens in that book's reviews.
# The review-count columns are part of the grouping key only; they are
# presumably constant within a book (verify against the input file).
by_book_review_data <- review_data %>%
  group_by(book_id, n_reviews_total, n_reviews_gendered,
           prop_review_gendered) %>%
  summarize(female_tokens = sum(n_female_token),
            male_tokens = sum(n_male_token)) %>%
  ungroup() %>%
  transmute(book_id,
            addressee_gender_score_token =
              female_tokens / (female_tokens + male_tokens))
Each point in these analyses is a book. The audience gender measure is the one described in the paper, derived from Amazon Reviews.
# Count distinct word types per book within each math-word category.
# Words that do not match the math word list get type = NA after the left
# join, so the NA "category" holds the count of non-math word types.
# NOTE: despite the "spatial" in the name, this covers every category in
# math_words.
total_spatial_words_by_type_types <-
  distinct(corpus, book_id, word) %>%   # one row per (book, word type)
  left_join(math_words) %>%
  group_by(book_id, type) %>%
  summarize(n_mathy = n()) %>%
  ungroup() %>%
  # Make every book x category combination explicit, with a zero count
  # where a book has no words of that category.
  complete(book_id, type,
           fill = list(n_mathy = 0))
# Total number of distinct word types per book, summed over every category
# in the upstream table (including the NA category of words that did not
# match the math word list). Used as the denominator for proportions.
all_word_counts_types <- total_spatial_words_by_type_types %>%
  group_by(book_id) %>%
  # Spell out TRUE: the shorthand `T` is an ordinary variable that can be
  # reassigned, which would silently change this call.
  summarize(total_words = sum(n_mathy, na.rm = TRUE))
# Per book and math-word category: the proportion of the book's distinct
# word types that fall in that category, plus log-transformed versions and
# the book's audience gender score.
spatial_words_by_type_types <- total_spatial_words_by_type_types %>%
  # Keep only real categories; the NA rows are the non-math words.
  filter(!is.na(type)) %>%
  left_join(all_word_counts_types) %>%
  mutate(prop_mathy = n_mathy / total_words) %>%
  # log(0) is -Inf for books with no words in a category; downstream code
  # filters on is.finite(). The "corrected" variant adds a small constant
  # so it stays finite.
  mutate(log_prop_mathy = log(prop_mathy)) %>%
  mutate(log_prop_mathy_corrected = log(prop_mathy + 0.001)) %>%
  left_join(by_book_review_data)
# Collapse the per-category table to one row per book: math words summed
# over all categories, then the same proportion / log-proportion measures.
spatial_words_all_types <- spatial_words_by_type_types %>%
  group_by(book_id) %>%
  summarize(
    n_mathy = sum(n_mathy),
    # total_words and the gender score are repeated on every category row
    # of a book, so taking the first value recovers the per-book value.
    total_words = first(total_words),
    addressee_gender_score_token = first(addressee_gender_score_token)
  ) %>%
  mutate(prop_mathy = n_mathy / total_words) %>%
  mutate(log_prop_mathy = log(prop_mathy)) %>%
  mutate(log_prop_mathy_corrected = log(prop_mathy + 0.001))
# Scatter plot, one point per book: audience gender score against the log
# proportion of math words, with a linear fit overlaid.
ggplot(spatial_words_all_types) +
  aes(x = addressee_gender_score_token,
      y = log_prop_mathy) +
  geom_point(alpha = .8) +
  geom_smooth(method = "lm") +
  labs(x = "Prop. female book audience",
       y = "Log proportion math words")
# Pearson correlation between audience gender score and log proportion of
# math words across books, rendered as a one-row table.
# Books with a non-finite log proportion (log(0) = -Inf) are excluded first.
# A single test needs no nest()/unnest() round trip; calling cor.test()
# directly avoids the bare nest()/unnest() calls that tidyr >= 1.0
# deprecates, and yields the same tidy() columns.
spatial_words_all_types %>%
  filter(is.finite(log_prop_mathy)) %>%
  with(cor.test(addressee_gender_score_token, log_prop_mathy)) %>%
  tidy() %>%
  kable()
| estimate | statistic | p.value | parameter | conf.low | conf.high | method | alternative |
|---|---|---|---|---|---|---|---|
| -0.2022876 | -3.173203 | 0.001708 | 236 | -0.3211874 | -0.0771093 | Pearson’s product-moment correlation | two.sided |
# Same scatter plot as the overall one, split into one panel per math-word
# category; each panel gets its own y-axis range.
ggplot(spatial_words_by_type_types) +
  aes(x = addressee_gender_score_token,
      y = log_prop_mathy) +
  geom_point(alpha = .8) +
  geom_smooth(method = "lm") +
  facet_wrap(~type, scales = "free_y") +
  labs(x = "Prop. female book audience",
       y = "Log proportion math words")
# Pearson correlation between audience gender score and log proportion of
# math words, computed separately for each math-word category and rendered
# as one table row per category.
spatial_words_by_type_types %>%
  # Drop book x category rows where the log proportion is -Inf (no words).
  filter(is.finite(log_prop_mathy)) %>%
  # tidyr >= 1.0 requires the nested column to be named explicitly;
  # the unnamed form `nest(-type)` is deprecated.
  nest(data = -type) %>%
  mutate(temp = map(data, ~tidy(cor.test(.x$addressee_gender_score_token,
                                         .x$log_prop_mathy)))) %>%
  select(-data) %>%
  # Likewise, unnest() must be told which list-column to expand.
  unnest(cols = temp) %>%
  kable()
| type | estimate | statistic | p.value | parameter | conf.low | conf.high | method | alternative |
|---|---|---|---|---|---|---|---|---|
| number | -0.0094058 | -0.1240769 | 0.9013976 | 174 | -0.1571073 | 0.1387072 | Pearson’s product-moment correlation | two.sided |
| spatial | -0.1994039 | -3.0927887 | 0.0022269 | 231 | -0.3197314 | -0.0727468 | Pearson’s product-moment correlation | two.sided |
| arithmetic | -0.2073081 | -2.2526523 | 0.0262121 | 113 | -0.3761402 | -0.0251524 | Pearson’s product-moment correlation | two.sided |
| shape | -0.2724671 | -1.0595673 | 0.3072805 | 14 | -0.6767661 | 0.2580978 | Pearson’s product-moment correlation | two.sided |
# Main-character gender coding for each book.
CHARACTER_DATA <- here("data/raw/other/character_gender_by_book.csv")
character_data <- read_csv(CHARACTER_DATA)

# Keep only the book id and main-character gender, replacing the raw codes
# (F, M, AND, MIXED) with readable factor labels.
tidy_character_data <- character_data %>%
  transmute(
    book_id,
    char_main_gender = fct_recode(
      char_main_gender,
      female = "F",
      male = "M",
      indeterminate = "AND",
      mixed = "MIXED"
    )
  )
# Mean log proportion of math words for each main-character gender, as
# returned by multi_boot_standard() (mean, ci_lower and ci_upper are the
# columns used for plotting below).
# Filtering after the left join keeps exactly the same rows as filtering
# before it, because the filter only looks at a left-hand column.
all_words_by_book_character <- spatial_words_all_types %>%
  left_join(tidy_character_data) %>%
  filter(is.finite(log_prop_mathy)) %>%
  group_by(char_main_gender) %>%
  multi_boot_standard(col = "log_prop_mathy")

# Point = group mean, vertical bar = ci_lower to ci_upper.
ggplot(all_words_by_book_character) +
  aes(x = char_main_gender, y = mean) +
  geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
  labs(x = "Main character gender",
       y = "Log proportion math words")
# Same summary as the overall main-character analysis, but computed within
# each math-word category: one multi_boot_standard() row per
# (main-character gender, category) pair.
# Filtering after the left join keeps exactly the same rows as filtering
# before it, because the filter only looks at a left-hand column.
words_type_by_book_character <- spatial_words_by_type_types %>%
  left_join(tidy_character_data) %>%
  filter(is.finite(log_prop_mathy)) %>%
  group_by(char_main_gender, type) %>%
  multi_boot_standard(col = "log_prop_mathy")

# One panel per category with independent y-axes; x labels are rotated so
# the gender labels do not overlap in the narrow panels.
ggplot(words_type_by_book_character) +
  aes(x = char_main_gender, y = mean) +
  geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
  facet_wrap(~type, scales = "free_y") +
  labs(x = "Main character gender",
       y = "Log proportion math words") +
  theme(axis.text.x = element_text(angle = 90))