Matt worked with RAs to code information about the characters in each book (more information here: https://github.com/mllewis/KIDBOOK_GENDER/blob/master/data/raw/other/character_README.md). Here’s a summary of the counts. Notably, there are about twice as many male lead characters as female ones (92 M vs. 48 F in the summary below).
# Load the hand-coded character information for each book.
CHARACTER_PATH <- here("data/raw/other/character_gender_by_book.csv")
characters <- read_csv(CHARACTER_PATH)

# Derive two simplified codings of the main character's gender:
#   main_char_gender  - the coded gender when char_main_singular is "YES" and
#                       the gender code is not "AND"; otherwise "none".
#   main_char_gender2 - "F" or "M" when coded exactly so (regardless of
#                       char_main_singular); otherwise "none".
# A comparison that evaluates to NA does not match its case_when() branch, so
# rows with missing codes fall through to the TRUE branch and become "none".
characters_tidy <- characters %>%
  # select(book_id, char_main_singular, char_main_gender) %>%
  mutate(
    main_char_gender = case_when(
      char_main_singular == "YES" & char_main_gender != "AND" ~ char_main_gender,
      TRUE ~ "none"
    ),
    main_char_gender2 = case_when(
      char_main_gender == "F" ~ char_main_gender,
      char_main_gender == "M" ~ char_main_gender,
      TRUE ~ "none"
    )
  ) %>%
  # Factors make summary() print per-level counts instead of "character".
  mutate_if(is.character, as.factor)

# Summarise the four coded character columns. They are selected by name rather
# than by position (previously 4:7) so the summary does not silently change if
# the column order of the CSV changes.
summary(
  characters_tidy %>%
    select(char_main_singular, char_main_gender,
           char_second_singular, char_second_gender)
)
## char_main_singular char_main_gender char_second_singular
## NO : 70 AND :71 NO :176
## YES :177 F :48 YES : 71
## NA's: 2 M :92 NA's: 2
## MIXED:17
## NA's :21
## char_second_gender
## AND :48
## F :33
## M :23
## MIXED:67
## NA's :78
Then, for each book, I calculated a new overall gender score that excluded the main character name(s), other proper names (from the set of words we normed; N = 71), pronouns, and other common words referring to people.
Pronouns: "she", "he", "her", "him", "hers", "his", "herself", "himself"
Generic people words: "mother", "mommy", "mom", "father", "daddy", "dad", "brother", "boy", "sister", "girl", "grandma", "grandmother", "grandpa", "grandfather", "aunt", "uncle", "ma", "pa", "madame", "mister", "man", "woman", "lady", "gentleman", "mr.", "mrs."
# Book-level gender estimates computed from all words.
BOOK_MEANS_PATH <- here("data/processed/books/gender_token_type_by_book.csv")
gender_rating_by_book_mean_only <- read_csv(BOOK_MEANS_PATH)

# Book-level gender estimates from the "no chars" file. Only the join key and
# the four token-level columns are kept; each is given a `_no_char` suffix so
# it can sit next to its all-words counterpart after a join on book_id.
BOOK_MEANS_NO_CHARS_PATH <- here("data/processed/books/gender_token_type_by_book_no_chars.csv")
gender_rating_by_book_mean_only_no_chars <- read_csv(BOOK_MEANS_NO_CHARS_PATH) %>%
  select(
    book_id,
    prop_present_token_no_char = prop_present_token,
    token_gender_mean_no_char = token_gender_mean,
    token_ci_lower_no_char = token_ci_lower,
    token_ci_upper_no_char = token_ci_upper
  )
This is the correlation between the all-words and content-only estimates of gender bias by book.
# Scatter plot of each book's all-words gender mean (x) against its
# content-only gender mean (y), with a fitted linear trend line.
gender_rating_by_book_mean_only %>%
  # Join key stated explicitly: book_id is the only column the renamed
  # content-only table is built to share with the all-words table.
  left_join(gender_rating_by_book_mean_only_no_chars, by = "book_id") %>%
  ggplot(aes(x = token_gender_mean, y = token_gender_mean_no_char)) +
  geom_point(size = .5) +
  geom_smooth(method = "lm") +
  xlab("gender rating - all words") +
  ylab("gender rating - content only ") +
  theme_classic()
Below is the overall gender mean of each book with all words (top, same as before) and with all words referring to people excluded (bottom). Color corresponds to the gender of the main character. What’s noticeable to me is that in the content-only plot, female characters are associated with female content, and male characters with male content. This suggests that kids could be learning these associations FROM the books themselves.
# Data for the forest plots: all-words and content-only estimates side by
# side, plus the simplified main-character gender for colouring.
forest_data1 <- gender_rating_by_book_mean_only %>%
  # Join keys are stated explicitly so the joins cannot silently pick up an
  # extra shared column if either input gains one.
  left_join(gender_rating_by_book_mean_only_no_chars, by = "book_id") %>%
  # Order the title factor by the all-words mean so plots sort books by score.
  mutate(title = fct_reorder(title, token_gender_mean)) %>%
  left_join(
    characters_tidy %>% select(book_id, main_char_gender),
    by = "book_id"
  ) %>%
  # Keep the first 50 rows in their existing row order (the rows themselves
  # are not sorted by score; only the factor levels of `title` are).
  slice(1:50)
# all
# Grand mean of the all-words gender score over every book (not only the
# books in forest_data1); drawn as the dashed reference line. Missing scores
# are ignored so a single NA cannot remove the reference line.
overall_token_mean <- mean(gender_rating_by_book_mean_only$token_gender_mean,
                           na.rm = TRUE)
ggplot(forest_data1) +
  # A constant intercept is passed as a parameter, not through aes(), so one
  # line is drawn instead of one identical line per data row.
  geom_hline(yintercept = overall_token_mean, linetype = 2) +
  coord_flip() +
  # Both layers use the same x mapping: titles ordered by all-words mean.
  geom_point(aes(x = reorder(title, token_gender_mean),
                 y = token_gender_mean, color = main_char_gender), size = .6) +
  geom_linerange(aes(x = reorder(title, token_gender_mean),
                     ymin = token_ci_lower,
                     ymax = token_ci_upper), alpha = .4) +
  # Colours are matched to levels by name. Unnamed values are assigned by
  # position among the levels present, which would mis-colour the points if a
  # level were absent or an extra level (e.g. "MIXED") appeared.
  scale_color_manual(values = c("F" = "red", "M" = "blue", "none" = "grey"),
                     name = "Gender of main character") +
  ylim(2.5, 3.5) +
  xlab("Book Title") +
  ylab("Book Gender Score (female-ness)") +
  ggtitle("Mean Gender Rating by Book (all words)") +
  theme_classic() +
  theme(axis.text.y = element_text(size = 4))
# content only
# Grand mean of the content-only gender score over every book in the
# content-only table; drawn as the dashed reference line. Missing scores are
# ignored so a single NA cannot remove the reference line.
overall_token_mean_no_char <- mean(
  gender_rating_by_book_mean_only_no_chars$token_gender_mean_no_char,
  na.rm = TRUE
)
ggplot(forest_data1) +
  # A constant intercept is passed as a parameter, not through aes(), so one
  # line is drawn instead of one identical line per data row.
  geom_hline(yintercept = overall_token_mean_no_char, linetype = 2) +
  coord_flip() +
  # Titles are ordered by the ALL-WORDS mean (not the content-only mean), so
  # books keep the same vertical position as in the all-words plot.
  geom_point(aes(x = reorder(title, token_gender_mean),
                 y = token_gender_mean_no_char, color = main_char_gender),
             size = .6) +
  geom_linerange(aes(x = reorder(title, token_gender_mean),
                     ymin = token_ci_lower_no_char,
                     ymax = token_ci_upper_no_char), alpha = .4) +
  xlab("Book Title") +
  ylab("Book Gender Score (female-ness)") +
  # Colours are matched to levels by name. Unnamed values are assigned by
  # position among the levels present, which would mis-colour the points if a
  # level were absent or an extra level (e.g. "MIXED") appeared.
  scale_color_manual(values = c("F" = "red", "M" = "blue", "none" = "grey"),
                     name = "Gender of main character") +
  ylim(2.5, 3.5) +
  ggtitle("Mean Gender Rating by Book (content only)") +
  theme_classic() +
  theme(axis.text.y = element_text(size = 4))
This is the same data as above, just showing the all-words and content-only means on the same figure.
# Long-format version of the two per-book means (one row per book x measure),
# restricted to the books that appear in forest_data1.
forest_data2 <- gender_rating_by_book_mean_only %>%
  # Join key stated explicitly rather than inferred from shared column names.
  left_join(gender_rating_by_book_mean_only_no_chars, by = "book_id") %>%
  mutate(title = fct_reorder(title, token_gender_mean)) %>%
  select(book_id, title, token_gender_mean, token_gender_mean_no_char) %>%
  # Stack the two score columns, named explicitly instead of by position
  # (-1:-2): `measure` holds the source column name, `value` the score.
  gather("measure", "value", token_gender_mean, token_gender_mean_no_char) %>%
  filter(book_id %in% forest_data1$book_id)
#filter(title %in% c("HUG", "CHRYSANTHEMUM", "JOURNEY", "OLIVIA"))
# For each book, plot the all-words and content-only means as two points
# (coloured by measure) joined by a line segment.
ggplot(forest_data2) +
  # Dashed reference line at the all-words grand mean. Passed as a constant
  # parameter rather than through aes(), so a single line is drawn instead of
  # one identical line per data row.
  geom_hline(yintercept = overall_token_mean, linetype = 2) +
  coord_flip() +
  # group = title connects the two measures belonging to the same book.
  geom_line(aes(x = title, y = value, group = title)) +
  ylim(2.5, 3.5) +
  geom_point(aes(x = title, y = value, color = measure), size = .6) +
  xlab("Book Title") +
  ylab("Book Gender Score (female-ness)") +
  ggtitle("Mean Gender Rating by Book") +
  theme_classic() +
  theme(axis.text.y = element_text(size = 4))