Human gender rating by-word and by-book analyses

Word level:
- Compared to Glasgow et al. (2018) norms
- Gender norms and other variables
Book level:

Word level:

Compared to Glasgow et al. (2018) norms

# Word-level data: our mean gender ratings plus the control measures that are
# used in the analyses below.
INFILE  <- "data/control_measures_by_word.csv"
resid_data <- read_csv(INFILE)

# NOTE(review): absolute path to a local copy of the Glasgow norms; it only
# resolves on the original author's machine.
GLASGOW <- "/Users/mollylewis/Documents/research/Projects/1_in_progress/IATLANG/data/study1a/raw/GlasgowNorms.csv"

# Keep only the word and its Glasgow gender rating (column GEND_M), renamed so
# it is distinguishable from our own rating after the join.
glasgow_data <- read_csv(GLASGOW) %>%
  select(word, glasgow_gender_rating = GEND_M)

# Join on the word explicitly instead of relying on an implicit natural join;
# words without a Glasgow rating keep NA in glasgow_gender_rating.
our_vs_glasgow_norms <- resid_data %>%
  left_join(glasgow_data, by = "word")

# Scatter plot of our ratings against the Glasgow ratings with a linear fit.
ggplot(
  our_vs_glasgow_norms,
  aes(x = mean_gender_rating, y = glasgow_gender_rating)
) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    title = "Our gender norms vs. Glasgow",
    x = "Our norms (femaleness)",
    y = "Glasgow norms (maleness)"
  ) +
  theme_bw()

# Pearson correlation between our ratings and the Glasgow ratings. The axis
# labels above describe the two scales as femaleness vs. maleness, so a strong
# agreement between the norms shows up as a strongly negative r.
cor.test(our_vs_glasgow_norms$mean_gender_rating,
         our_vs_glasgow_norms$glasgow_gender_rating)

## 
##  Pearson's product-moment correlation
## 
## data:  our_vs_glasgow_norms$mean_gender_rating and our_vs_glasgow_norms$glasgow_gender_rating
## t = -52.131, df = 575, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.9217672 -0.8931005
## sample estimates:
##        cor 
## -0.9084968

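They’re highly correlated (r = -.91). The correlation is negative because the two scales run in opposite directions: our norms code femaleness, while the Glasgow norms code maleness.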

Gender norms and other variables

valence (the pleasantness of the stimulus) from Warriner, Brysbaert and Kuperman (2013)
arousal (the intensity of emotion provoked by the stimulus)
dominance (the degree of control exerted by the stimulus)
concreteness (brysbaert)
AOA (kuperman)
TASA word frequencies (our word frequencies are highly correlated with this)

# Load the make_corr_plot() helper.
# The directory name contains a literal space; "\ " is not a valid escape
# sequence inside an R string, so the space is written unescaped.
# NOTE(review): absolute path that only resolves on the original author's
# machine.
source("/Users/mollylewis/Documents/research/Misc/R stuff/make_corr_plot.R")

# Correlation plot over the first nine columns of the word-level data.
# NOTE(review): columns are selected by position, so this depends on the
# column order of the input CSV.
make_corr_plot(resid_data %>% select(1:9))

 # Regress our word-level gender rating on all control measures at once and
 # print the model summary.
 summary(
   lm(
     formula = mean_gender_rating ~ aoa_rating + word_length +
       log_tasa_frequency + valence_rating + dominance_rating +
       arousal_rating + conc_rating,
     data = resid_data
   )
 )

## 
## Call:
## lm(formula = mean_gender_rating ~ aoa_rating + word_length + 
##     log_tasa_frequency + valence_rating + dominance_rating + 
##     arousal_rating + conc_rating, data = resid_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.16121 -0.31698  0.00859  0.29894  2.00419 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         3.398191   0.357244   9.512  < 2e-16 ***
## aoa_rating         -0.093391   0.021222  -4.401 1.26e-05 ***
## word_length         0.029275   0.017529   1.670 0.095376 .  
## log_tasa_frequency -0.072957   0.019997  -3.648 0.000285 ***
## valence_rating      0.207774   0.025960   8.004 5.59e-15 ***
## dominance_rating    0.009209   0.037302   0.247 0.805079    
## arousal_rating     -0.076789   0.026872  -2.858 0.004405 ** 
## conc_rating        -0.137718   0.026402  -5.216 2.46e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5986 on 648 degrees of freedom
##   (495 observations deleted due to missingness)
## Multiple R-squared:  0.2052, Adjusted R-squared:  0.1966 
## F-statistic:  23.9 on 7 and 648 DF,  p-value: < 2.2e-16

Words more associated with girls tend to be (1) learned earlier, (2) lower frequency, (3) more positively valenced, (4) lower arousal, and (5) less concrete. Note that when we look at all the predictors together we lose lots of data.

Book level:

Missing data

# Book-level gender measures, in long format keyed by gender_measure.
INFILE <- "data/gender_by_book_token.csv"
gender_rating_by_book <- read_csv(INFILE)

# Rows for the raw mean gender rating only.
gender_rating_by_book_mean_only <- filter(
  gender_rating_by_book,
  gender_measure == "mean_gender_rating"
)

# Histogram of prop_na_token across books, with the x axis fixed to [0, 1].
ggplot(gender_rating_by_book_mean_only, aes(x = prop_na_token)) +
  geom_histogram() +
  labs(x = "Prop missing word types by book") +
  xlim(0, 1) +
  theme_classic()

# Keep every row (all gender measures) of the books whose prop_na_token is
# below .5 in the mean-rating rows.
gender_rating_by_book_clean <- gender_rating_by_book %>%
  filter(
    doc_id %in% pull(
      filter(gender_rating_by_book_mean_only, prop_na_token < .5),
      doc_id
    )
  )

We’re missing a fair number of words for each book. Let’s exclude the books for which we have ratings for less than .5 of the types.

Gender ratings by book

# Recompute the mean-rating subset, this time from the cleaned book set.
gender_rating_by_book_mean_only <- filter(
  gender_rating_by_book_clean,
  gender_measure == "mean_gender_rating"
)

# Histogram of the per-book token mean, with the x axis fixed to [1, 5].
ggplot(gender_rating_by_book_mean_only, aes(x = token_mean)) +
  geom_histogram() +
  labs(x = "Gender rating (femaleness)") +
  xlim(1, 5) +
  theme_classic()

For each book, plot the mean gender bias. Ranges are bootstrapped 95% CIs. Note that I’m plotting token means here since they’re more variable.

# Grand mean of the per-book token means; used both as the dashed reference
# line and as the midpoint of the diverging colour scale.
overall_token_mean <- mean(gender_rating_by_book_mean_only$token_mean)

# One point range per book (token mean with its CI bounds), books ordered by
# token mean and drawn horizontally.
gender_rating_by_book_mean_only %>%
  ggplot(aes(x = reorder(title, token_mean),
             y = token_mean, color = token_mean)) +
  # Pass the constant as a parameter rather than through aes(), so a single
  # reference line is drawn instead of one identical line per data row.
  geom_hline(yintercept = overall_token_mean, linetype = 2) +
  coord_flip() +
  geom_pointrange(aes(ymin = token_lower_ci, ymax = token_upper_ci), size = .1) +
  # guide = "none" hides the colour legend; the logical form (guide = F) is
  # deprecated and F is a reassignable variable, not a constant.
  scale_colour_gradient2(low = "blue", mid = "grey",
                         high = "red", midpoint = overall_token_mean,
                         space = "Lab", na.value = "grey50",
                         guide = "none") +
  #ylim(2, 4) +
  theme_classic() +
  xlab("Book Title") +
  ylab("Word Token Gender Mean (femaleness)") +
  ggtitle("Mean Gender Rating by Book") +
  theme(axis.text.y = element_text(size = 6))

Correlations with other variables

# Book-level covariates.
length_metadata <- read_csv("data/length_metadata.csv")
other_vars <- read_csv("data/other_vars_by_book_token.csv")

# Rename the token mean to gender_score, then attach both covariate tables.
# Neither join names its keys, so they match on whatever columns are shared.
all_vars_by_book <- left_join(
  left_join(
    rename(gender_rating_by_book_mean_only, gender_score = token_mean),
    length_metadata
  ),
  other_vars
)

# Correlation plot over columns 1 and 6-17 of the joined table (selected by
# position).
make_corr_plot(select(all_vars_by_book, 1, 6:17))

 # Regress the book-level gender score on all book-level covariates at once
 # and print the model summary.
 summary(
   lm(
     formula = gender_score ~ TTR + word_length + log_tasa_frequency +
       valence_rating + dominance_rating + arousal_rating + aoa_rating +
       conc_rating,
     data = all_vars_by_book
   )
 )

## 
## Call:
## lm(formula = gender_score ~ TTR + word_length + log_tasa_frequency + 
##     valence_rating + dominance_rating + arousal_rating + aoa_rating + 
##     conc_rating, data = all_vars_by_book)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.43882 -0.08463  0.00298  0.09995  0.37849 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         2.724036   0.913448   2.982  0.00322 ** 
## TTR                -0.048755   0.086849  -0.561  0.57517    
## word_length        -0.009862   0.044416  -0.222  0.82452    
## log_tasa_frequency -0.088934   0.034465  -2.580  0.01058 *  
## valence_rating      0.328186   0.042183   7.780 3.79e-13 ***
## dominance_rating   -0.073032   0.073643  -0.992  0.32254    
## arousal_rating      0.003526   0.059653   0.059  0.95292    
## aoa_rating          0.005615   0.053921   0.104  0.91717    
## conc_rating        -0.160549   0.070373  -2.281  0.02358 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1558 on 200 degrees of freedom
## Multiple R-squared:  0.3766, Adjusted R-squared:  0.3517 
## F-statistic:  15.1 on 8 and 200 DF,  p-value: < 2.2e-16

Books that are more highly associated with girls (1) have lower frequency words, (2) are more dominant, (3) more positive, (4) higher arousal, (5) have words learned earlier.

But: if you control for everything, girls books have words that are overall more abstract, more positively valenced, and lower frequency (e.g. “love”?).

Gender ratings, residualizing out other variables

Residualizing out len, freq, conc

# Per-book gender scores after residualizing out length, frequency and
# concreteness (as named by the gender_measure value).
resid_df <- gender_rating_by_book_clean %>%
  filter(gender_measure == "gender_residuals_len_freq_conc")

# Grand mean of the per-book values; reference line and colour-scale midpoint.
overall_token_mean <- mean(resid_df$token_mean)

resid_df %>%
  ggplot(aes(x = reorder(title, token_mean),
             y = token_mean, color = token_mean)) +
  coord_flip() +
  # Pass the constant as a parameter rather than through aes(), so a single
  # reference line is drawn instead of one identical line per data row.
  geom_hline(yintercept = overall_token_mean, linetype = 2) +
  geom_pointrange(aes(ymin = token_lower_ci, ymax = token_upper_ci), size = .1) +
  # guide = "none" hides the colour legend; the logical form (guide = F) is
  # deprecated and F is a reassignable variable, not a constant.
  scale_colour_gradient2(low = "blue", mid = "grey",
                         high = "red", midpoint = overall_token_mean,
                         space = "Lab", na.value = "grey50",
                         guide = "none") +
  #ylim(2, 4) +
  theme_classic() +
  xlab("Book Title") +
  ylab("Word Token Gender Mean (femaleness)") +
  ggtitle("Mean Gender Rating by Book") +
  theme(axis.text.y = element_text(size = 6))

Residualizing out aoa, len, freq, conc

# Per-book gender scores after residualizing out AoA, length, frequency and
# concreteness (as named by the gender_measure value).
resid_df <- gender_rating_by_book_clean %>%
  filter(gender_measure == "gender_residuals_aoa_len_freq_conc")

# Grand mean of the per-book values; reference line and colour-scale midpoint.
overall_token_mean <- mean(resid_df$token_mean)

resid_df %>%
  ggplot(aes(x = reorder(title, token_mean),
             y = token_mean, color = token_mean)) +
  coord_flip() +
  # Pass the constant as a parameter rather than through aes(), so a single
  # reference line is drawn instead of one identical line per data row.
  geom_hline(yintercept = overall_token_mean, linetype = 2) +
  geom_pointrange(aes(ymin = token_lower_ci, ymax = token_upper_ci), size = .1) +
  # guide = "none" hides the colour legend; the logical form (guide = F) is
  # deprecated and F is a reassignable variable, not a constant.
  scale_colour_gradient2(low = "blue", mid = "grey",
                         high = "red", midpoint = overall_token_mean,
                         space = "Lab", na.value = "grey50",
                         guide = "none") +
  #ylim(2, 4) +
  theme_classic() +
  xlab("Book Title") +
  ylab("Word Token Gender Mean (femaleness)") +
  ggtitle("Mean Gender Rating by Book") +
  theme(axis.text.y = element_text(size = 6))

Residualizing out aoa, len, freq, conc, val

# Per-book gender scores after residualizing out AoA, length, frequency,
# valence and concreteness (as named by the gender_measure value).
resid_df <- gender_rating_by_book_clean %>%
  filter(gender_measure == "gender_residuals_aoa_len_freq_val_conc")

# Grand mean of the per-book values; reference line and colour-scale midpoint.
overall_token_mean <- mean(resid_df$token_mean)

resid_df %>%
  ggplot(aes(x = reorder(title, token_mean),
             y = token_mean, color = token_mean)) +
  coord_flip() +
  # Pass the constant as a parameter rather than through aes(), so a single
  # reference line is drawn instead of one identical line per data row.
  geom_hline(yintercept = overall_token_mean, linetype = 2) +
  geom_pointrange(aes(ymin = token_lower_ci, ymax = token_upper_ci), size = .1) +
  # guide = "none" hides the colour legend; the logical form (guide = F) is
  # deprecated and F is a reassignable variable, not a constant.
  scale_colour_gradient2(low = "blue", mid = "grey",
                         high = "red", midpoint = overall_token_mean,
                         space = "Lab", na.value = "grey50",
                         guide = "none") +
  #ylim(2, 4) +
  theme_classic() +
  xlab("Book Title") +
  ylab("Word Token Gender Mean (femaleness)") +
  ggtitle("Mean Gender Rating by Book") +
  theme(axis.text.y = element_text(size = 6))

THINGS TO DO NEXT:

Account for gender of proper names somehow (e.g. https://github.com/ropensci/gender)
Add in book metadata (author, target, age, popularity, kid gender (from Hudson-Kam))
Look at how these estimates are related to vector representation estimates of bias (if highly correlated, we could use them to estimate bias for missing words)