Exploration of character names

Mean gender by book
- all
- content only
all words/content only

Matt worked with RAs to code information about the characters in each book (more information here: https://github.com/mllewis/KIDBOOK_GENDER/blob/master/data/raw/other/character_README.md). Here’s a summary of the counts. Notably, there are about twice as many male lead characters as female (92 vs. 48 in the table below).

# Load the RA-coded character information for each book.
CHARACTER_PATH <- here("data/raw/other/character_gender_by_book.csv")
characters <- read_csv(CHARACTER_PATH)

# Derive two simplified codes for the main character's gender:
#   main_char_gender  - the coded gender when the book has a singular main
#                       character and the code is not "AND"; "none" otherwise.
#                       Rows where either code is NA also fall through to
#                       "none", because case_when() skips NA conditions.
#   main_char_gender2 - the coded gender when it is "F" or "M" (regardless of
#                       whether the main character is singular); "none"
#                       otherwise.
# All character columns are then converted to factors so summary() reports
# level counts.
characters_tidy <- characters %>%
  mutate(
    main_char_gender = case_when(
      char_main_singular == "YES" & char_main_gender != "AND" ~ char_main_gender,
      TRUE ~ "none"
    ),
    main_char_gender2 = case_when(
      char_main_gender %in% c("F", "M") ~ char_main_gender,
      TRUE ~ "none"
    )
  ) %>%
  mutate_if(is.character, as.factor)

# Summarise the four coded columns. They are selected by name rather than by
# position (4:7) so the summary does not silently change if the column order
# of the CSV changes.
characters_tidy %>%
  select(char_main_singular, char_main_gender,
         char_second_singular, char_second_gender) %>%
  summary()

##  char_main_singular char_main_gender char_second_singular
##  NO  : 70           AND  :71         NO  :176            
##  YES :177           F    :48         YES : 71            
##  NA's:  2           M    :92         NA's:  2            
##                     MIXED:17                             
##                     NA's :21                             
##  char_second_gender
##  AND  :48          
##  F    :33          
##  M    :23          
##  MIXED:67          
##  NA's :78

Then, for each book, I calculated a new overall gender score that excluded the main character name(s), other proper names (from the set of words we normed; N = 71), pronouns, and other common words referring to people.

Pronouns: "she", "he", "her", "him", "hers", "his", "herself", "himself"

Generic people words: "mother", "mommy", "mom", "father", "daddy", "dad", "brother", "boy", "sister", "girl", "grandma", "grandmother", "grandpa", "grandfather", "aunt", "uncle", "ma", "pa", "madame", "mister", "man", "woman", "lady", "gentleman", "mr.", "mrs."

# Per-book gender estimates computed over all words.
BOOK_MEANS_PATH <- here("data/processed/books/gender_token_type_by_book.csv")
gender_rating_by_book_mean_only <- read_csv(BOOK_MEANS_PATH)

# Per-book gender estimates from the "no_chars" file. Only the token-level
# columns are kept, and each is given a `_no_char` suffix so the two sets of
# estimates can sit side by side after a join on book_id.
BOOK_MEANS_NO_CHARS_PATH <- here("data/processed/books/gender_token_type_by_book_no_chars.csv")
gender_rating_by_book_mean_only_no_chars <-
  BOOK_MEANS_NO_CHARS_PATH %>%
  read_csv() %>%
  select(
    book_id,
    prop_present_token_no_char = prop_present_token,
    token_gender_mean_no_char = token_gender_mean,
    token_ci_lower_no_char = token_ci_lower,
    token_ci_upper_no_char = token_ci_upper
  )

This is the correlation between the all words and content only estimates of gender bias by book.

# Scatter plot of the all-words gender mean against the content-only
# (`_no_char`) gender mean, one point per book, with a linear fit.
# The join key is given explicitly: book_id is the only column the two tables
# share (every other column of the second table carries a `_no_char` suffix),
# and naming it avoids the "Joining, by = ..." message and guards against a
# future shared column silently changing the join.
gender_rating_by_book_mean_only %>%
  left_join(gender_rating_by_book_mean_only_no_chars, by = "book_id") %>%
  ggplot(aes(x = token_gender_mean, y = token_gender_mean_no_char)) +
  geom_point(size = .5) +
  geom_smooth(method = "lm") +
  xlab("gender rating - all words") +
  ylab("gender rating - content only ") +
  theme_classic()

Below is the overall gender mean of each book with all words (top, same as before) and with all words referring to people excluded (bottom). Color corresponds to gender of the main character. What’s noticeable to me is that in the content-only plot, female characters are associated with female content, and males with male content. This suggests that kids could be learning these associations FROM the books themselves.

Mean gender by book

all

# Data for the forest plots: all-words and content-only estimates per book,
# plus the main-character gender code. `title` becomes a factor ordered by the
# all-words gender mean, which fixes the order of books on the plot axis.
# Only the first 50 rows (in file order) are kept for this "mini" version.
# Join keys are spelled out so the joins cannot silently pick up extra keys.
forest_data1 <- gender_rating_by_book_mean_only %>%
  left_join(gender_rating_by_book_mean_only_no_chars, by = "book_id") %>%
  mutate(title = fct_reorder(title, token_gender_mean)) %>%
  left_join(
    characters_tidy %>% select(book_id, main_char_gender),
    by = "book_id"
  ) %>%
  slice(1:50)

# all
# Reference line: mean of the all-words gender score across ALL books
# (not just the 50 plotted).
overall_token_mean <- mean(gender_rating_by_book_mean_only$token_gender_mean)
ggplot(forest_data1, aes(x = title)) +
  # The reference value is a constant, so it is passed outside aes(); inside
  # aes() one identical line would be drawn per data row.
  geom_hline(yintercept = overall_token_mean, linetype = 2) +
  coord_flip() +
  # Both layers use the pre-ordered `title` factor so points and CI ranges are
  # guaranteed to share the same axis positions.
  geom_point(aes(y = token_gender_mean, color = main_char_gender),
             size = .6) +
  geom_linerange(aes(ymin = token_ci_lower, ymax = token_ci_upper),
                 alpha = .4) +
  # Named colours are matched to factor levels by name; an unnamed vector is
  # matched by position and would mis-colour points if a level were absent
  # from (or an extra level present in) the plotted subset.
  scale_color_manual(values = c(F = "red", M = "blue", none = "grey"),
                     name = "Gender of main character") +
  ylim(2.5, 3.5) +
  xlab("Book Title") +
  ylab("Book Gender Score (female-ness)") +
  ggtitle("Mean Gender Rating by Book (all words)") +
  theme_classic() +
  theme(axis.text.y = element_text(size = 4))

content only

# content only
# Reference line: mean of the content-only gender score across all books.
overall_token_mean_no_char <- mean(gender_rating_by_book_mean_only_no_chars$token_gender_mean_no_char)
# Books are positioned by the `title` factor of forest_data1, which is ordered
# by the ALL-WORDS gender mean, so this plot keeps the same book order as the
# all-words plot and the two can be compared row by row.
ggplot(forest_data1, aes(x = title)) +
  # Constant reference value goes outside aes(); inside aes() one identical
  # line would be drawn per data row.
  geom_hline(yintercept = overall_token_mean_no_char, linetype = 2) +
  coord_flip() +
  geom_point(aes(y = token_gender_mean_no_char, color = main_char_gender),
             size = .6) +
  geom_linerange(aes(ymin = token_ci_lower_no_char,
                     ymax = token_ci_upper_no_char),
                 alpha = .4) +
  xlab("Book Title") +
  ylab("Book Gender Score (female-ness)") +
  # Named colours are matched to factor levels by name rather than by
  # position, so a missing or additional level cannot shift the mapping.
  scale_color_manual(values = c(F = "red", M = "blue", none = "grey"),
                     name = "Gender of main character") +
  ylim(2.5, 3.5) +
  ggtitle("Mean Gender Rating by Book (content only)") +
  theme_classic() +
  theme(axis.text.y = element_text(size = 4))

all words/content only

This is the same data as above, just showing the all words and content only means on the same figure.

# Long-format data for the combined plot: one row per book per measure
# (`token_gender_mean` = all words, `token_gender_mean_no_char` = content
# only), restricted to the books present in forest_data1.
# The join key and the gathered columns are named explicitly instead of being
# inferred / selected by position, so the reshape does not depend on column
# order.
forest_data2 <- gender_rating_by_book_mean_only %>%
  left_join(gender_rating_by_book_mean_only_no_chars, by = "book_id") %>%
  mutate(title = fct_reorder(title, token_gender_mean)) %>%
  select(book_id, title, token_gender_mean, token_gender_mean_no_char) %>%
  gather("measure", "value", token_gender_mean, token_gender_mean_no_char) %>%
  filter(book_id %in% forest_data1$book_id)
  #filter(title %in% c("HUG", "CHRYSANTHEMUM", "JOURNEY", "OLIVIA"))

ggplot(forest_data2, aes(x = title, y = value)) +
  # Dashed reference line at the all-words mean across all books
  # (`overall_token_mean`, computed for the all-words plot). Passed outside
  # aes() because it is a constant.
  geom_hline(yintercept = overall_token_mean, linetype = 2) +
  coord_flip() +
  # A segment per book connecting its two measures.
  geom_line(aes(group = title)) +
  ylim(2.5, 3.5) +
  geom_point(aes(color = measure), size = .6) +
  xlab("Book Title") +
  ylab("Book Gender Score (female-ness)") +
  ggtitle("Mean Gender Rating by Book") +
  theme_classic() +
  theme(axis.text.y = element_text(size = 4))

Exploration of character names - mini version

Molly Lewis

2019-04-04

Mean gender by book

all

content only

all words/content only