IAT LANG - gender-genius

Language data
Behavioral data
Behavioral IAT vs. Language IAT
- By language
- By participant

Here I’m controlling for block order and combining datasets from exp1 and exp2. I’ve also included median country age as a covariate, because that seems to matter from other analyses we’ve done.

Take aways:

when datasets are combined there’s no effect
there’s a trend when you look at only the turk data, controlling for us
residualizing out order, age, and sex doesn’t change much

# Minimum number of participants a language must have to be kept in the
# language-restricted analyses (applied below as `n >= MINPARTICIPANTS`).
MINPARTICIPANTS <- 8

Language data

# Files holding the language-IAT effect sizes (full and restricted word lists).
# NOTE(review): `IAT_lang_restricted`, which later joins rely on, is not
# created in the visible code -- presumably it is read from
# IAT_lang_restricted_path in a chunk that is not shown; confirm.
IAT_lang_full_path <- "genius_effect_sizes_google_full.csv"
IAT_lang_restricted_path <- "genius_effect_sizes_google_restricted.csv"
IAT_lang_restricted_path2 <- "genius_effect_sizes_google_restricted2.csv"

# Lookup table relating language names (`lang`) to `language_code`.
# It is parsed as tab-separated even though the file name ends in .csv.
lang_key <- read_tsv(file = "language_name_to_google.csv")

I calculated language IAT as in Caliskan et al. (2017) for 40 languages in the behavioral data set, using translations from Google Translate. The effect sizes from the restricted and full word lists are highly correlated, so in the following analysis, I only look at the effect sizes calculated from the restricted set.

Behavioral data

# Study 2
# Load the Study 2 behavioral IAT data and reshape it into the
# participant-level columns that are shared with Study 1.
IAT_behavioral_path1 <-"../../data/IAT/Gender-Genius/InternationalIAT_LanguageData.csv"

IAT_behavioral_path_raw1 <- read_csv(IAT_behavioral_path1)

IAT_behavioral_tidy1 <- IAT_behavioral_path_raw1 %>%
  mutate(lang1 = tolower(PrimaryLanguage),
         # NOTE(review): `lang2` is referenced further down, but its
         # definition is commented out here, so it has to exist already as a
         # column of the raw file -- confirm.
         #lang2 = tolower(lang_other_clean),
         Gender = tolower(Gender),
         # Participants are numbered by row. seq_len() is used instead of
         # 1:n() because 1:0 would produce c(1, 0) for a zero-row table.
         subid = as.factor(seq_len(n())))  %>%
  rename(iat_score = IATScore, 
         gender = Gender,
         age = Age,
         country = Country,
        # ses = SES, 
         condition = ConditionC,
         conservatism = Conservatism,
         status = Status, 
         children = Children) %>%
  # Participants whose primary language is "other" take `lang2` instead.
  mutate(lang = ifelse(lang1 == "other", lang2, lang1)) %>%
       #  region = as.factor(countrycode::countrycode(country, "country.name", "region"))) %>%
  # Attach the language code; the join uses the columns the two tables share.
  left_join(lang_key %>% select(language_code, lang)) %>%
  mutate_if(is.character, as.factor) %>%
  mutate(condition = as.factor(condition), 
         log_age = log(age)) %>%
  select(subid, iat_score, lang, language_code, gender, condition, log_age, country, conservatism, status, children) 


## Study 1
# Read the Study 1 behavioral IAT data and bring it into the same column
# layout as the Study 2 table so the two can be stacked.
IAT_behavioral_path2 <- "../../data/IAT/Gender-Genius/IAT_Study1_Combined_Master_Dataset_LanguageData.csv"

IAT_behavioral_path_raw2 <- read_csv(file = IAT_behavioral_path2)

IAT_behavioral_tidy2 <- IAT_behavioral_path_raw2 %>%
  # Harmonise the column names first. Sexism, Race, PoliticalParty and Income
  # are not renamed and are dropped by the final select().
  rename(
    iat_score = IATScore,
    gender = Gender,
    age = Age,
    country = Country,
    condition = Condition,
    conservatism = Conservatism,
    status = Status,
    children = Children
  ) %>%
  mutate(
    lang = tolower(PrimaryLanguage),
    gender = tolower(gender),
    subid = as.factor(SubjectID)
  ) %>%
  # Attach the language code; the join uses the columns the two tables share.
  left_join(select(lang_key, language_code, lang)) %>%
  mutate_if(is.character, as.factor) %>%
  # NOTE(review): conservatism is stored as a factor here, while the mixed
  # models later wrap it in as.numeric(); on a factor that yields level codes,
  # so check the column type after the two studies are combined.
  mutate(
    condition = as.factor(condition),
    conservatism = as.factor(conservatism),
    children = as.numeric(children),
    log_age = log(age)
  ) %>%
  select(
    subid, iat_score, lang, language_code, gender, condition,
    log_age, country, conservatism, status, children
  )

# Stack both studies into a single participant-level table.
IAT_behavioral_tidy <- list(IAT_behavioral_tidy1, IAT_behavioral_tidy2) %>%
  bind_rows()
# Disabled alternative: analyse the Study 2 table on its own.
#IAT_behavioral_tidy <- IAT_behavioral_tidy1

Let’s residualize out order, age and gender, as in career, and add in mean country age, and objective bias measures

# add residuals
# Participant-level linear model of the IAT score on gender, log age and
# condition; its residuals become `iat_resid` below.
mod <- lm(
  iat_score ~ as.factor(gender) + log_age + as.factor(condition),
  data = IAT_behavioral_tidy
)

# Country-level age table. Country labels are recoded (new = old) --
# presumably to match the spellings used in the behavioral data; confirm.
AGE_DATA_PATH <- "../7_age_controls/median_country_age_world_factbook.csv"
country_age <- AGE_DATA_PATH %>%
  read_csv() %>%
  rename(country = country_name) %>%
  mutate(
    country = fct_recode(
      country,
      "United States" = "United States of America",
      "United Kingdom" = "UK",
      "Czech Republic" = "Czechia",
      "Tanzania, United Republic of" = "Tanzania",
      "Moldova, Republic of" = "Moldova",
      "Bahamas" = "Bahamas, The"
    )
  )

# Country-level gender measures; only ggi and wps are kept.
GENDER_MEASURE_PATH <- "../../analyses/4_gender_measures/data/gender_measures/all_gender_measures2.csv"

objective_country_measures_by_country <- GENDER_MEASURE_PATH %>%
  read_csv() %>%
  rename(country = country_name) %>%
  select(ggi, wps, country)

# Participant table with residuals plus both country-level tables attached.
# Both joins use whichever columns the tables have in common.
IAT_behavioral_tidy_with_resids <- IAT_behavioral_tidy %>%
  add_residuals(mod, "iat_resid") %>%
  left_join(country_age) %>%
  left_join(objective_country_measures_by_country)

There are 918 participants in the data, speaking 46 languages.

# Number of participants per language, shown as an interactive table with the
# largest groups first.
lang_counts <- IAT_behavioral_tidy_with_resids %>%
  count(lang)

DT::datatable(arrange(lang_counts, -n))

Here are the scores, raw and residualized.

# Distribution of the raw behavioral IAT scores.
# bins = 30 is the value ggplot2 falls back to anyway; stating it explicitly
# only silences the "pick better value" message.
ggplot(IAT_behavioral_tidy_with_resids, aes(x = iat_score)) +
  geom_histogram(bins = 30) +
  ggtitle("Behavioral IAT score distributions (raw)") +
  theme_classic()

# Distribution of the residualized scores. The title now says so; it used to
# repeat the title of the raw-score plot.
ggplot(IAT_behavioral_tidy_with_resids, aes(x = iat_resid)) +
  geom_histogram(bins = 30) +
  ggtitle("Behavioral IAT score distributions (residualized)") +
  theme_classic()

Behavioral IAT vs. Language IAT

By language

I subset to only those languages that have sufficient speakers. Here the value is set to 8. Here are the mean behavioral IATs by language as a function of language IAT. The ranges are 95% CIs.

Predicting raw IAT scores

# Languages with at least MINPARTICIPANTS participants.
targ_langs <- pull(filter(lang_counts, n >= MINPARTICIPANTS), lang)

# Bootstrapped mean and CI of the raw IAT score for each retained language,
# with the language code attached.
behavioral_means_tidy <- IAT_behavioral_tidy_with_resids %>%
  filter(lang %in% targ_langs) %>%
  group_by(lang) %>%
  multi_boot_standard(col = "iat_score") %>%
  ungroup() %>%
  rename(
    behavioral_mean = mean,
    behavioral_ci_lower = ci_lower,
    behavioral_ci_upper = ci_upper
  ) %>%
  left_join(select(lang_key, language_code, lang))

# Participants per language (all languages), keyed by language code as well.
sample_sizes <- IAT_behavioral_tidy_with_resids %>%
  count(lang) %>%
  left_join(select(lang_key, language_code, lang))

# Language-level covariates: averages of the participants' country values.
# NOTE(review): mean() is called without na.rm = TRUE, so one participant with
# a missing country value turns the whole language into NA. That may explain
# the many observations dropped by the wps model -- confirm whether intended.
median_age_by_lang <- IAT_behavioral_tidy_with_resids %>%
  group_by(language_code) %>%
  summarize(median_age = mean(median_age))

mean_objectives_by_lang <- IAT_behavioral_tidy_with_resids %>%
  select(language_code, wps, ggi) %>%
  group_by(language_code) %>%
  summarize(
    wps = mean(wps),
    ggi = mean(ggi)
  )

# One row per language: behavioral mean, language IAT, n, and covariates.
full_df <- behavioral_means_tidy %>%
  left_join(IAT_lang_restricted, by = "language_code") %>%
  left_join(sample_sizes, by = "language_code") %>%
  left_join(median_age_by_lang, by = "language_code") %>%
  left_join(mean_objectives_by_lang, by = "language_code")

# Behavioral mean (with CI) against the language effect size, labelled by
# language code, with a linear fit and a dashed reference line at zero.
ggplot(
  full_df,
  aes(x = effect_size_restricted, y = behavioral_mean)
) +
  geom_pointrange(
    aes(
      color = language_code,
      ymin = behavioral_ci_lower,
      ymax = behavioral_ci_upper
    )
  ) +
  geom_text(aes(label = language_code)) +
  xlab("Language IAT effect size") +
  ylab("Behavioral IAT effect size") +
  geom_smooth(method = "lm") +
  geom_hline(aes(yintercept = 0), linetype = 2) +
  theme_classic() +
  theme(legend.position = "none")

Models

# Pearson correlation between by-language behavioral means and language IAT.
cor.test(full_df$behavioral_mean, full_df$effect_size_restricted)

## 
##  Pearson's product-moment correlation
## 
## data:  full_df$behavioral_mean and full_df$effect_size_restricted
## t = 0.24281, df = 11, p-value = 0.8126
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.4980061  0.5998667
## sample estimates:
##        cor 
## 0.07301368

# By-language regression of the behavioral mean on language IAT and median age.
summary(
  lm(behavioral_mean ~ effect_size_restricted + median_age, data = full_df)
)

## 
## Call:
## lm(formula = behavioral_mean ~ effect_size_restricted + median_age, 
##     data = full_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.13906 -0.05073 -0.02720  0.06562  0.18831 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)   
## (Intercept)            -0.474387   0.169597  -2.797  0.01889 * 
## effect_size_restricted -0.032584   0.049825  -0.654  0.52788   
## median_age              0.020922   0.004697   4.454  0.00123 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1065 on 10 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.6667, Adjusted R-squared:    0.6 
## F-statistic:    10 on 2 and 10 DF,  p-value: 0.004115

# Same by-language regression with wps added as a further predictor.
summary(
  lm(
    behavioral_mean ~ effect_size_restricted + median_age + wps,
    data = full_df
  )
)

## 
## Call:
## lm(formula = behavioral_mean ~ effect_size_restricted + median_age + 
##     wps, data = full_df)
## 
## Residuals:
##        4        7        9       11       13 
## -0.01948 -0.10272  0.01240  0.02055  0.08923 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)
## (Intercept)             2.11959    2.72647   0.777    0.579
## effect_size_restricted  0.63331    0.82897   0.764    0.585
## median_age              0.09901    0.06823   1.451    0.384
## wps                    -7.68377    7.21104  -1.066    0.480
## 
## Residual standard error: 0.1395 on 1 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.8804, Adjusted R-squared:  0.5217 
## F-statistic: 2.454 on 3 and 1 DF,  p-value: 0.4314

Predicting residualized IAT scores

# Bootstrapped mean and CI per retained language, this time of the
# residualized score (`iat_resid`); overwrites the raw-score version.
behavioral_means_tidy <- IAT_behavioral_tidy_with_resids %>%
  filter(lang %in% targ_langs) %>%
  group_by(lang) %>%
  multi_boot_standard(col = "iat_resid") %>%
  ungroup() %>%
  rename(
    behavioral_mean = mean,
    behavioral_ci_lower = ci_lower,
    behavioral_ci_upper = ci_upper
  ) %>%
  left_join(select(lang_key, language_code, lang))

# Rebuild the one-row-per-language table from the residualized means, reusing
# the sample sizes and covariate tables computed earlier.
full_df <- behavioral_means_tidy %>%
  left_join(IAT_lang_restricted, by = "language_code") %>%
  left_join(sample_sizes, by = "language_code") %>%
  left_join(median_age_by_lang, by = "language_code") %>%
  left_join(mean_objectives_by_lang, by = "language_code")

# Residualized behavioral mean (with CI) against the language effect size.
ggplot(
  full_df,
  aes(x = effect_size_restricted, y = behavioral_mean)
) +
  geom_pointrange(
    aes(
      color = language_code,
      ymin = behavioral_ci_lower,
      ymax = behavioral_ci_upper
    )
  ) +
  geom_text(aes(label = language_code)) +
  xlab("Language IAT effect size") +
  ylab("Behavioral IAT effect size") +
  geom_smooth(method = "lm") +
  geom_hline(aes(yintercept = 0), linetype = 2) +
  theme_classic() +
  theme(legend.position = "none")

# Pearson correlation between residualized by-language means and language IAT.
cor.test(full_df$behavioral_mean, full_df$effect_size_restricted)

## 
##  Pearson's product-moment correlation
## 
## data:  full_df$behavioral_mean and full_df$effect_size_restricted
## t = 0.18247, df = 11, p-value = 0.8585
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.5115355  0.5881172
## sample estimates:
##        cor 
## 0.05493243

# By-language regression of the residualized mean on language IAT, median age
# and wps.
summary(
  lm(
    behavioral_mean ~ effect_size_restricted + median_age + wps,
    data = full_df
  )
)

## 
## Call:
## lm(formula = behavioral_mean ~ effect_size_restricted + median_age + 
##     wps, data = full_df)
## 
## Residuals:
##         4         7         9        11        13 
## -0.015625 -0.082407  0.009952  0.016490  0.071590 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)
## (Intercept)             2.46706    2.18737   1.128    0.462
## effect_size_restricted  0.81180    0.66506   1.221    0.437
## median_age              0.10234    0.05474   1.870    0.313
## wps                    -8.68139    5.78522  -1.501    0.374
## 
## Residual standard error: 0.1119 on 1 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.9021, Adjusted R-squared:  0.6085 
## F-statistic: 3.072 on 3 and 1 DF,  p-value: 0.3918

By participant

This is the same analysis as above, but at the participant level using mixed-effect models.

# Participant-level table for the mixed models: attach the language IAT effect
# size and z-score the continuous variables listed below.
# scale() returns a one-column matrix (with centering/scaling attributes);
# as.numeric() stores the same z-scores as plain numeric columns instead of
# matrix columns inside the data frame.
by_participant_df <-  IAT_behavioral_tidy_with_resids  %>%
   left_join(IAT_lang_restricted, by = "language_code") %>%
   mutate(effect_size_restricted = as.numeric(scale(effect_size_restricted)),
          log_age = as.numeric(scale(log_age)),
          iat_score = as.numeric(scale(iat_score)),
          iat_resid = as.numeric(scale(iat_resid)),
          condition = as.factor(condition),
          median_age = as.numeric(scale(median_age))) 

# Scaled behavioral IAT against scaled language IAT, one point per
# participant, coloured by language, with a linear fit and a zero line.
ggplot(by_participant_df, aes(x = effect_size_restricted, 
                     y = iat_score)) +
  geom_point(aes( color = language_code)) +
  geom_smooth(method = "lm") +
  ylab("Behavioral IAT") +
  xlab("Language IAT") +
  geom_hline(aes(yintercept = 0), linetype = 2) +
  theme_classic() +
  theme(legend.position = "none")

In a mixed-effect model with country and language as random intercepts and gender and age (among other covariates) as fixed effects, there’s no relationship between language IAT and behavioral IAT when you include all languages. Language IAT, behavioral IAT, log age, and median country age are scaled.

# Mixed model over all languages: language IAT plus participant- and
# country-level covariates, with random intercepts for country and language.
# NOTE(review): as.numeric(conservatism) returns level codes if the column is
# a factor at this point -- confirm its type.
summary(
  lme4::lmer(
    iat_score ~ effect_size_restricted + gender + condition + children +
      log_age + median_age + wps + as.numeric(conservatism) +
      (1 | country) + (1 | lang),
    by_participant_df
  )
)

## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## iat_score ~ effect_size_restricted + gender + condition + children +  
##     log_age + median_age + wps + as.numeric(conservatism) + (1 |  
##     country) + (1 | lang)
##    Data: by_participant_df
## 
## REML criterion at convergence: 1095.9
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.65725 -0.64455  0.00685  0.70369  2.97554 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  country  (Intercept) 0.000000 0.00000 
##  lang     (Intercept) 0.001758 0.04193 
##  Residual             0.968997 0.98438 
## Number of obs: 383, groups:  country, 61; lang, 38
## 
## Fixed effects:
##                          Estimate Std. Error t value
## (Intercept)              -1.00319    0.70980  -1.413
## effect_size_restricted   -0.01931    0.04619  -0.418
## gendermale                0.20312    0.11231   1.809
## condition1                0.59797    0.10115   5.912
## children                 -0.16573    0.07021  -2.360
## log_age                   0.06761    0.07176   0.942
## median_age                0.04612    0.07983   0.578
## wps                       0.98571    0.95177   1.036
## as.numeric(conservatism) -0.02201    0.02407  -0.914
## 
## Correlation of Fixed Effects:
##             (Intr) effc__ gndrml cndtn1 chldrn log_ag medn_g wps   
## effct_sz_rs  0.266                                                 
## gendermale  -0.095  0.043                                          
## condition1  -0.076  0.020  0.020                                   
## children    -0.114 -0.062  0.105 -0.020                            
## log_age      0.092  0.055 -0.005 -0.027 -0.511                     
## median_age   0.819  0.126  0.018 -0.016  0.034  0.049              
## wps         -0.976 -0.242 -0.018 -0.002  0.076 -0.107 -0.847       
## as.nmrc(cn) -0.184 -0.053 -0.046  0.010  0.003  0.011  0.068  0.051

# Mixed model over all languages with a language IAT x gender interaction;
# wps is not included in this specification.
summary(
  lme4::lmer(
    iat_score ~ effect_size_restricted * gender + condition + children +
      log_age + median_age + as.numeric(conservatism) +
      (1 | country) + (1 | lang),
    by_participant_df
  )
)

## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## iat_score ~ effect_size_restricted * gender + condition + children +  
##     log_age + median_age + as.numeric(conservatism) + (1 | country) +  
##     (1 | lang)
##    Data: by_participant_df
## 
## REML criterion at convergence: 2408.6
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.9892 -0.6245  0.0520  0.6634  3.2377 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  country  (Intercept) 0.00000  0.0000  
##  lang     (Intercept) 0.02424  0.1557  
##  Residual             0.88021  0.9382  
## Number of obs: 874, groups:  country, 75; lang, 39
## 
## Fixed effects:
##                                    Estimate Std. Error t value
## (Intercept)                       -0.444316   0.109252  -4.067
## effect_size_restricted            -0.018699   0.071149  -0.263
## gendermale                         0.329392   0.068250   4.826
## condition1                         0.528208   0.064099   8.241
## children                          -0.042220   0.038789  -1.088
## log_age                           -0.070011   0.035814  -1.955
## median_age                         0.120182   0.037158   3.234
## as.numeric(conservatism)           0.002541   0.016077   0.158
## effect_size_restricted:gendermale -0.034542   0.069046  -0.500
## 
## Correlation of Fixed Effects:
##             (Intr) effc__ gndrml cndtn1 chldrn log_ag medn_g as.n()
## effct_sz_rs  0.122                                                 
## gendermale  -0.405  0.069                                          
## condition1  -0.245  0.010 -0.034                                   
## children    -0.229 -0.018  0.215 -0.041                            
## log_age      0.108  0.040 -0.160 -0.029 -0.387                     
## median_age  -0.123 -0.136  0.051 -0.020 -0.060  0.120              
## as.nmrc(cn) -0.543  0.016 -0.066 -0.052 -0.055 -0.035  0.149       
## effct_sz_r:  0.088 -0.670 -0.082  0.008 -0.019 -0.035 -0.017 -0.027

This is true even when you exclude participants whose language has fewer than MINPARTICIPANTS (8) speakers in the data.

# Languages with at least MINPARTICIPANTS participants.
targ_langs <- pull(filter(lang_counts, n >= MINPARTICIPANTS), lang)

# Main-effects mixed model refitted on participants of those languages only.
summary(
  lme4::lmer(
    iat_score ~ effect_size_restricted + gender + condition + children +
      log_age + median_age + wps + as.numeric(conservatism) +
      (1 | country) + (1 | lang),
    by_participant_df %>% filter(lang %in% targ_langs)
  )
)

## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## iat_score ~ effect_size_restricted + gender + condition + children +  
##     log_age + median_age + wps + as.numeric(conservatism) + (1 |  
##     country) + (1 | lang)
##    Data: by_participant_df %>% filter(lang %in% targ_langs)
## 
## REML criterion at convergence: 956.3
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.64044 -0.65148  0.00661  0.70103  3.02795 
## 
## Random effects:
##  Groups   Name        Variance  Std.Dev. 
##  country  (Intercept) 2.217e-15 4.709e-08
##  lang     (Intercept) 0.000e+00 0.000e+00
##  Residual             9.594e-01 9.795e-01
## Number of obs: 335, groups:  country, 47; lang, 13
## 
## Fixed effects:
##                           Estimate Std. Error t value
## (Intercept)              -0.902588   0.764881  -1.180
## effect_size_restricted    0.004172   0.048279   0.086
## gendermale                0.181098   0.116884   1.549
## condition1                0.598277   0.107838   5.548
## children                 -0.164721   0.074562  -2.209
## log_age                   0.081932   0.076238   1.075
## median_age                0.102368   0.086879   1.178
## wps                       0.899488   1.020635   0.881
## as.numeric(conservatism) -0.020219   0.025424  -0.795
## 
## Correlation of Fixed Effects:
##             (Intr) effc__ gndrml cndtn1 chldrn log_ag medn_g wps   
## effct_sz_rs  0.273                                                 
## gendermale  -0.071  0.051                                          
## condition1  -0.102 -0.029 -0.009                                   
## children    -0.148 -0.048  0.061 -0.030                            
## log_age      0.125  0.067  0.001 -0.014 -0.526                     
## median_age   0.828  0.143  0.033 -0.046 -0.002  0.077              
## wps         -0.977 -0.243 -0.031  0.021  0.119 -0.143 -0.850       
## as.nmrc(cn) -0.210 -0.104 -0.043  0.043  0.008  0.002  0.045  0.075

# Language IAT x gender interaction model on the language-restricted subset;
# children and conservatism are not included in this specification.
summary(
  lme4::lmer(
    iat_score ~ effect_size_restricted * gender + condition + log_age +
      median_age + wps + (1 | country) + (1 | lang),
    by_participant_df %>% filter(lang %in% targ_langs)
  )
)

## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## iat_score ~ effect_size_restricted * gender + condition + log_age +  
##     median_age + wps + (1 | country) + (1 | lang)
##    Data: by_participant_df %>% filter(lang %in% targ_langs)
## 
## REML criterion at convergence: 977.1
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.6479 -0.6434  0.0195  0.6822  3.1691 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  country  (Intercept) 0.0000   0.0000  
##  lang     (Intercept) 0.0000   0.0000  
##  Residual             0.9856   0.9928  
## Number of obs: 341, groups:  country, 47; lang, 13
## 
## Fixed effects:
##                                   Estimate Std. Error t value
## (Intercept)                       -1.10781    0.75174  -1.474
## effect_size_restricted             0.05934    0.08518   0.697
## gendermale                         0.19080    0.12550   1.520
## condition1                         0.59664    0.10857   5.496
## log_age                           -0.01555    0.06561  -0.237
## median_age                         0.10968    0.08734   1.256
## wps                                0.98006    1.02150   0.959
## effect_size_restricted:gendermale -0.08597    0.10104  -0.851
## 
## Correlation of Fixed Effects:
##             (Intr) effc__ gndrml cndtn1 log_ag medn_g wps   
## effct_sz_rs  0.257                                          
## gendermale  -0.123 -0.274                                   
## condition1  -0.123 -0.068  0.014                            
## log_age      0.062  0.031  0.039 -0.033                     
## median_age   0.868  0.154 -0.002 -0.069  0.092              
## wps         -0.986 -0.212  0.009  0.042 -0.101 -0.865       
## effct_sz_r: -0.142 -0.823  0.367  0.059 -0.003 -0.087  0.100