# Load the per-word control measures (which also carry our own
# mean_gender_rating for each word).
INFILE <- "data/control_measures_by_word.csv"
resid_data <- read_csv(INFILE)

# Load the Glasgow norms, keeping only the word and its gender rating
# (GEND_M), renamed so it is not confused with our own rating.
# NOTE(review): this is an absolute, machine-specific path; the script will
# only run where that directory exists. Consider a project-relative path.
GLASGOW <- "/Users/mollylewis/Documents/research/Projects/1_in_progress/IATLANG/data/study1a/raw/GlasgowNorms.csv"
glasgow_data <- read_csv(GLASGOW) %>%
  select(word, glasgow_gender_rating = GEND_M)

# Attach the Glasgow rating to every word we have measures for. The key is
# spelled out so the join cannot silently change if a column is later added
# to either table; words absent from Glasgow get NA.
# NOTE(review): assumes `word` is the only column the two tables share
# (glasgow_data holds just `word` and `glasgow_gender_rating`) -- confirm.
our_vs_glasgow_norms <- resid_data %>%
  left_join(glasgow_data, by = "word")
# Scatter plot of our gender ratings against the Glasgow gender ratings,
# with a fitted linear trend line on top of the points.
ggplot(
  data = our_vs_glasgow_norms,
  mapping = aes(x = mean_gender_rating, y = glasgow_gender_rating)
) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    title = "Our gender norms vs. Glasgow",
    x = "Our norms (femaleness)",
    y = "Glasgow norms (maleness)"
  ) +
  theme_bw()
# Correlation test between our gender ratings and the Glasgow gender
# ratings for the words in the joined table. The argument expressions are
# left as-is because cor.test() echoes them in the "data:" line it prints.
cor.test(our_vs_glasgow_norms$mean_gender_rating,
our_vs_glasgow_norms$glasgow_gender_rating)
##
## Pearson's product-moment correlation
##
## data: our_vs_glasgow_norms$mean_gender_rating and our_vs_glasgow_norms$glasgow_gender_rating
## t = -52.131, df = 575, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.9217672 -0.8931005
## sample estimates:
## cor
## -0.9084968
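The two sets of norms are highly correlated (r = -.91). The correlation is negative because our scale codes femaleness whereas the Glasgow scale codes maleness.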
# Bring make_corr_plot() into scope from an external helper script.
# NOTE(review): absolute, machine-specific path -- confirm it exists where
# this is run.
source("/Users/mollylewis/Documents/research/Misc/R\ stuff/make_corr_plot.R")

# Correlation plot over the first nine columns of the word-level data
# (columns are picked by position, so this depends on the CSV column order).
make_corr_plot(select(resid_data, 1:9))
# Regress each word's gender rating on all word-level control measures at
# once and print the model summary. Rows with a missing value in any
# predictor are dropped by lm().
summary(
  lm(
    mean_gender_rating ~ aoa_rating + word_length +
      log_tasa_frequency + valence_rating + dominance_rating +
      arousal_rating + conc_rating,
    resid_data
  )
)
##
## Call:
## lm(formula = mean_gender_rating ~ aoa_rating + word_length +
## log_tasa_frequency + valence_rating + dominance_rating +
## arousal_rating + conc_rating, data = resid_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.16121 -0.31698 0.00859 0.29894 2.00419
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.398191 0.357244 9.512 < 2e-16 ***
## aoa_rating -0.093391 0.021222 -4.401 1.26e-05 ***
## word_length 0.029275 0.017529 1.670 0.095376 .
## log_tasa_frequency -0.072957 0.019997 -3.648 0.000285 ***
## valence_rating 0.207774 0.025960 8.004 5.59e-15 ***
## dominance_rating 0.009209 0.037302 0.247 0.805079
## arousal_rating -0.076789 0.026872 -2.858 0.004405 **
## conc_rating -0.137718 0.026402 -5.216 2.46e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5986 on 648 degrees of freedom
## (495 observations deleted due to missingness)
## Multiple R-squared: 0.2052, Adjusted R-squared: 0.1966
## F-statistic: 23.9 on 7 and 648 DF, p-value: < 2.2e-16
Words more associated with girls tend to be (1) learned earlier, (2) lower frequency, (3) more positively valenced, (4) lower arousal, and (5) less concrete. Note that when we include all the predictors together we lose a lot of data (495 observations are dropped due to missingness).
# Load the per-book gender measures (one row per book and gender measure).
INFILE <- "data/gender_by_book_token.csv"
gender_rating_by_book <- read_csv(INFILE)

# Rows for the raw mean gender rating only.
gender_rating_by_book_mean_only <- filter(
  gender_rating_by_book,
  gender_measure == "mean_gender_rating"
)

# Distribution of the proportion of missing words across books.
ggplot(
  data = gender_rating_by_book_mean_only,
  mapping = aes(x = prop_na_token)
) +
  geom_histogram() +
  xlab("Prop missing word types by book") +
  xlim(0, 1) +
  theme_classic()

# Keep every row (all gender measures) for books whose proportion of
# missing words, taken from the mean-rating rows, is below .5.
gender_rating_by_book_clean <- filter(
  gender_rating_by_book,
  doc_id %in% pull(
    filter(gender_rating_by_book_mean_only, prop_na_token < .5),
    doc_id
  )
)
We’re missing a fair number of words for each book. Let’s exclude the books for which we have ratings for less than .5 of the types.
# Re-derive the mean-rating rows, this time from the cleaned set of books.
gender_rating_by_book_mean_only <- filter(
  gender_rating_by_book_clean,
  gender_measure == "mean_gender_rating"
)

# Distribution of the per-book token-level mean gender rating.
ggplot(
  data = gender_rating_by_book_mean_only,
  mapping = aes(x = token_mean)
) +
  geom_histogram() +
  xlab("Gender rating (femaleness)") +
  xlim(1, 5) +
  theme_classic()
For each book, plot the mean gender bias. Ranges are bootstrapped 95% CIs. Note that I’m plotting token means here since it’s more variable.
# Mean of the per-book token means; used as the reference line and as the
# midpoint of the colour scale.
overall_token_mean <- mean(gender_rating_by_book_mean_only$token_mean)

# One point-range per book (token mean with its lower/upper CI), ordered by
# token mean and coloured by how far the book sits from the overall mean.
gender_rating_by_book_mean_only %>%
  ggplot(aes(x = reorder(title, token_mean),
             y = token_mean, color = token_mean)) +
  # Fixed yintercept draws a single reference line rather than one
  # identical line per row of data.
  geom_hline(yintercept = overall_token_mean, linetype = 2) +
  coord_flip() +
  geom_pointrange(aes(ymin = token_lower_ci, ymax = token_upper_ci),
                  size = .1) +
  # guide = "none" hides the legend; the logical form (guide = F) is
  # deprecated in ggplot2 and `F` can be reassigned.
  scale_colour_gradient2(low = "blue", mid = "grey", high = "red",
                         midpoint = overall_token_mean, space = "Lab",
                         na.value = "grey50", guide = "none") +
  #ylim(2, 4) +
  theme_classic() +
  xlab("Book Title") +
  ylab("Word Token Gender Mean (femaleness)") +
  ggtitle("Mean Gender Rating by Book") +
  theme(axis.text.y = element_text(size = 6))
# Book-level covariates read from two further CSV files.
length_metadata <- read_csv("data/length_metadata.csv")
other_vars <- read_csv("data/other_vars_by_book_token.csv")

# Rename the token mean to gender_score, then attach both covariate tables.
# NOTE(review): neither join names its key, so each joins on whatever
# columns the two tables share -- confirm the intended key(s).
all_vars_by_book <- left_join(
  left_join(
    rename(gender_rating_by_book_mean_only, gender_score = token_mean),
    length_metadata
  ),
  other_vars
)

# Correlation plot over columns 1 and 6-17 (picked by position, so this
# depends on the column order produced by the joins above).
make_corr_plot(select(all_vars_by_book, 1, 6:17))
# Regress each book's gender score on all book-level covariates at once
# and print the model summary.
summary(
  lm(
    gender_score ~ TTR + word_length +
      log_tasa_frequency + valence_rating + dominance_rating +
      arousal_rating + aoa_rating +
      conc_rating,
    all_vars_by_book
  )
)
##
## Call:
## lm(formula = gender_score ~ TTR + word_length + log_tasa_frequency +
## valence_rating + dominance_rating + arousal_rating + aoa_rating +
## conc_rating, data = all_vars_by_book)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.43882 -0.08463 0.00298 0.09995 0.37849
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.724036 0.913448 2.982 0.00322 **
## TTR -0.048755 0.086849 -0.561 0.57517
## word_length -0.009862 0.044416 -0.222 0.82452
## log_tasa_frequency -0.088934 0.034465 -2.580 0.01058 *
## valence_rating 0.328186 0.042183 7.780 3.79e-13 ***
## dominance_rating -0.073032 0.073643 -0.992 0.32254
## arousal_rating 0.003526 0.059653 0.059 0.95292
## aoa_rating 0.005615 0.053921 0.104 0.91717
## conc_rating -0.160549 0.070373 -2.281 0.02358 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1558 on 200 degrees of freedom
## Multiple R-squared: 0.3766, Adjusted R-squared: 0.3517
## F-statistic: 15.1 on 8 and 200 DF, p-value: < 2.2e-16
Books that are more highly associated with girls (1) have lower frequency words, (2) are more dominant, (3) are more positive, (4) are higher arousal, and (5) have words learned earlier.
But: if you control for everything, girls’ books have words that are overall more abstract, more positively valenced, and lower frequency (e.g. “love”?).
# Draw one point-range per book (token mean with its lower/upper CI),
# ordered by token mean and coloured relative to `midpoint`.
#
# book_df     Data frame with columns title, token_mean, token_lower_ci and
#             token_upper_ci (one row per book).
# midpoint    Value for the dashed reference line and the colour-scale
#             midpoint.
# plot_title  Plot title; defaults to the title used throughout this file.
# y_label     Label for the token-mean axis.
#
# Returns the ggplot object, so a top-level call prints the plot.
plot_token_means_by_book <- function(
    book_df,
    midpoint,
    plot_title = "Mean Gender Rating by Book",
    y_label = "Word Token Gender Mean (femaleness)") {
  ggplot(book_df,
         aes(x = reorder(title, token_mean),
             y = token_mean, color = token_mean)) +
    coord_flip() +
    # Fixed yintercept draws a single reference line rather than one
    # identical line per row of data.
    geom_hline(yintercept = midpoint, linetype = 2) +
    geom_pointrange(aes(ymin = token_lower_ci, ymax = token_upper_ci),
                    size = .1) +
    # guide = "none" hides the legend; the logical form (guide = F) is
    # deprecated in ggplot2 and `F` can be reassigned.
    scale_colour_gradient2(low = "blue", mid = "grey", high = "red",
                           midpoint = midpoint, space = "Lab",
                           na.value = "grey50", guide = "none") +
    theme_classic() +
    xlab("Book Title") +
    ylab(y_label) +
    ggtitle(plot_title) +
    theme(axis.text.y = element_text(size = 6))
}

# The same plot for three residualised gender measures. resid_df and
# overall_token_mean are still (re)assigned at top level, as before, so any
# later code that reads them sees the same values.
# NOTE(review): the title and axis label still say "Mean Gender Rating"
# although these rows are residual measures -- consider relabelling.

# Residual measure "gender_residuals_len_freq_conc".
resid_df <- gender_rating_by_book_clean %>%
  filter(gender_measure == "gender_residuals_len_freq_conc")
overall_token_mean <- mean(resid_df$token_mean)
plot_token_means_by_book(resid_df, overall_token_mean)

# Residual measure "gender_residuals_aoa_len_freq_conc".
resid_df <- gender_rating_by_book_clean %>%
  filter(gender_measure == "gender_residuals_aoa_len_freq_conc")
overall_token_mean <- mean(resid_df$token_mean)
plot_token_means_by_book(resid_df, overall_token_mean)

# Residual measure "gender_residuals_aoa_len_freq_val_conc".
resid_df <- gender_rating_by_book_clean %>%
  filter(gender_measure == "gender_residuals_aoa_len_freq_val_conc")
overall_token_mean <- mean(resid_df$token_mean)
plot_token_means_by_book(resid_df, overall_token_mean)
THINGS TO DO NEXT: