Wiki cc comparision

# behavioral
BEHAVIORAL_IAT_PATH <- here("data/study0/processed/by_language_df.csv")
iat_behavioral_es <- read_csv(BEHAVIORAL_IAT_PATH) %>%
  rename(language_code = "wiki_language_code") %>%
  select(language_code, median_country_age, 
         es_iat_sex_age_order_explicit_resid,
         es_iat_sex_age_order_implicit_resid, 
         per_women_stem_2012_2017, 
         n_participants)

# study 1b
LANG_IAT_PATH <- here("data/study1b/iat_es_lang.csv")
iat_lang_es <- read_csv(LANG_IAT_PATH)

LANG_FAMILY_PATH <- here("data/study0/processed/top_lang_by_country_ethnologue.csv")
lang_family <- read_csv(LANG_FAMILY_PATH) %>%
  select(wiki_language_code, family) %>%
  rename(language_code = "wiki_language_code") %>%
  distinct()

# study 2 measures (included here for making single grand table)
BY_LANGUAGE_OCCUPATION_PATH  <- here("data/study2/occupation_gender_score_by_language.csv")
occupation_semantics <- read_csv(BY_LANGUAGE_OCCUPATION_PATH) 

OCCUPATION_OVERLAP_PATH <- here('data/study2/occupation_gender_scores.csv')
by_lang_scores <- read_csv(OCCUPATION_OVERLAP_PATH)

LANGUAGE_NAME_PATH <- here("data/study0/processed/lang_name_to_wiki_iso.csv")
language_names <- read_csv(LANGUAGE_NAME_PATH) %>%
  rename(language_code = wiki_language_code) %>%
  distinct(language_code, .keep_all = TRUE)

# combine lang and behavioral and family info
all_es <- left_join(iat_behavioral_es, iat_lang_es, by = "language_code") %>%
  left_join(lang_family)   %>%
  left_join(occupation_semantics)  %>% # include study 2 measure here so can make table
  left_join(by_lang_scores) %>%
  left_join(language_names) %>%
  select(language_code,language_name,family,n_participants,es_iat_sex_age_order_implicit_resid, es_iat_sex_age_order_explicit_resid, median_country_age, per_women_stem_2012_2017, lang_es_sub, lang_es_wiki, lang_es_wiki_cc, mean_prop_distinct_occs, subt_occu_semantics_fm, wiki_occu_semantics_fm)

# remove exclusions and fix croatian to be mean of hr and sr (only in wiki)
EXCLUSIONS_PATH <- here("data/study1b/language_exclusions.csv") 
exclusions <- read_csv(EXCLUSIONS_PATH)

hr_new_wiki <- mean(c(filter(iat_lang_es, language_code == "hr") %>%  pull(lang_es_wiki),
         filter(iat_lang_es, language_code == "sr") %>%  pull(lang_es_wiki)))

all_es_tidy <- all_es %>%
  left_join(exclusions) %>%
  mutate(lang_es_wiki = case_when(exclude_wiki == TRUE ~ NA_real_,
                                  TRUE ~ lang_es_wiki),
         lang_es_sub = case_when(exclude_sub == TRUE ~ NA_real_,
                                  TRUE ~ lang_es_sub),
         lang_es_wiki_cc = case_when(exclude_wiki_cc == TRUE ~ NA_real_,
                                  TRUE ~ lang_es_wiki_cc)) %>%
  select(-exclude_wiki, -exclude_sub, -exclude_wiki_cc) %>%
  mutate(lang_es_wiki = case_when(language_code == "hr" ~ hr_new_wiki,
                                  TRUE ~ lang_es_wiki),
         lang_es_sub = case_when(language_code == "hr" ~ NA_real_, # sr is missing from sub
                                  TRUE ~ lang_es_sub),
         lang_es_wiki_cc= case_when(language_code == "hr" ~ NA_real_, # sr is missing from sub/wiki_cc #check this
                                  TRUE ~ lang_es_wiki_cc))

# corr of lang, behavioral, etc.
all_corr_vars <- all_es_tidy %>%
  select(lang_es_sub, lang_es_wiki_cc, lang_es_wiki,subt_occu_semantics_fm, wiki_occu_semantics_fm, mean_prop_distinct_occs, es_iat_sex_age_order_explicit_resid, 
         es_iat_sex_age_order_implicit_resid, per_women_stem_2012_2017, median_country_age) %>%
  rename(`Residualized Implicit Bias (IAT)` = "es_iat_sex_age_order_implicit_resid",
          `Residualized Explicit Bias` = "es_iat_sex_age_order_explicit_resid",
          `Language IAT (Subtitle)` = "lang_es_sub",
          `Language IAT (Wikipedia)` = "lang_es_wiki",
          `Language IAT (Wikipedia CC)` = "lang_es_wiki_cc",
          `Occupation Bias (Subtitle)` = "subt_occu_semantics_fm",
          `Occupation Bias (Wikipedia)` = "wiki_occu_semantics_fm",
          `Prop. Gendered Occupation Labels` = "mean_prop_distinct_occs",
          `Percent Women in STEM` = "per_women_stem_2012_2017",
          `Median Country Age` = "median_country_age") 

simple_corr <- psych::corr.test(all_corr_vars, adjust = "none")$r %>%
  as_tibble(rownames = "rowname") %>%
  gather("var2", "simple_r", -rowname)
  
simple_corr_p <- psych::corr.test(all_corr_vars, adjust = "none")$p %>%
  as_tibble(rownames = "rowname") %>%
  gather("var2", "simple_p", -rowname)
  
partial_psych_obj <- psych::partial.r(data = all_corr_vars, 
                                      x = 1:8, y = "Median Country Age" ) 
partial_corr <- psych::corr.p(partial_psych_obj, n = nrow(all_corr_vars) - 1, 
                              adjust = "none")$r %>%
  psych_to_mat() %>%
  as_tibble(rownames = "rowname") %>%
  gather("var2", "partial_r", -rowname)

partial_corr_p <- psych::corr.p(partial_psych_obj, n = nrow(all_corr_vars) - 1, 
                                adjust = "none")$p %>%
  psych_to_mat() %>%
  as_tibble(rownames = "rowname") %>%
  gather("var2", "partial_p", -rowname)

tidy_corrs <- simple_corr %>%
                left_join(simple_corr_p) %>%
                left_join(partial_corr) %>%
                left_join(partial_corr_p)

corr_text_df <- tidy_corrs %>%
  filter(rowname == "Residualized Implicit Bias (IAT)", 
         var2 %in% c("Language IAT (Subtitle)",
                     "Language IAT (Wikipedia)",
                      "Language IAT (Wikipedia CC)")) %>%
    mutate(model = fct_recode(var2, "Subtitle Embeddings" = "Language IAT (Subtitle)",
                                   "Wikipedia Embeddings" = "Language IAT (Wikipedia)",
                                  "Wikipedia CC Embeddings" = "Language IAT (Wikipedia CC)"))  %>%
  select(model, simple_r) %>%
  mutate(simple_r = paste0("r = ", f_num(simple_r, 2))) %>%
  mutate(x = .85, y = -.07)

# plot lang vs behavioral
all_es_tidy %>%
  select(language_name, lang_es_sub, lang_es_wiki,lang_es_wiki_cc,
         es_iat_sex_age_order_implicit_resid, n_participants) %>%
  gather("model", "lang_es", -language_name,
         -es_iat_sex_age_order_implicit_resid, -n_participants) %>%
  mutate(model = fct_recode(model, "Subtitle Embeddings" = "lang_es_sub",
                                   "Wikipedia Embeddings" = "lang_es_wiki",
                            "Wikipedia CC Embeddings" = "lang_es_wiki_cc")) %>%
  ggplot(aes(x = lang_es, y = es_iat_sex_age_order_implicit_resid, size = n_participants)) +
  facet_wrap( . ~ model) +
  geom_smooth(method = "lm", alpha = .1, size = .9) +
  geom_point(alpha = .2) +
  ggrepel::geom_text_repel(aes(label = language_name), 
                           size = 2, box.padding = 0.1) + 
  scale_x_continuous(breaks = c(-.3, -0, .5, 1), 
                     label = c("\n(male-\nfamily)", "0", ".5","1\n(male-\ncareer)") , limits = c(-.35, 1.1)) +
  scale_y_continuous(breaks = c(-.075, -.05, -.025, 0, .025, .05), 
                     label = c("-.075\n(male-\nfamily)", "-.05", "-.025", "0", ".025", ".05\n(male-\ncareer)") , limits = c(-.08, .06) ) +
  scale_size(trans = "log10", labels = scales::comma, name = "N participants") +
  geom_text(data = corr_text_df, aes(label = simple_r, x = x, y = y), 
            color = "red", size = 4) +
  ggtitle("Psychological and  Linguistic Gender Biases") +
  ylab("Implicit  Gender Bias (residualized)\n") +
  xlab("\nLinguistic Gender Bias\n (effect size)") +
  theme_classic()  +
  theme(legend.position = "bottom")

Implicit gender bias (adjusted for age, sex, and block order) as a function of the linguistic gender bias derived from word-embeddings (Study 1b). Each point corresponds to a language, with the size of the point corresponding to the number of participants speaking that langauge. Linguistic biases are estimated from models trained on text in each language from Subtitle (left) and Wikipedia (right) corpora. Larger values indicate a larger bias to associate men with the concept of career and women with the concept of family. Error bands indicate standard error of the linear model estimate.

print_tidy_corrs <- tidy_corrs %>%
  filter(rowname != var2) %>%
  mutate_at(vars(simple_r, partial_r), ~ format(round(., 2), nsmall = 2) %>%  f_num(., digits = 2)) %>%
  mutate_at(vars(simple_r, partial_r), ~ 
               case_when(str_detect(.,"^-") ~ ., TRUE ~ paste0("\\ ", .))) %>% # add leading space so decimals align
  mutate_at(vars(simple_p, partial_p), ~ case_when(
    . < .01 ~ "**", . < .05 ~ "*",  . < .1 ~ "+", TRUE ~ "")) %>%
  mutate(r_partial_print = case_when(
    !is.na(partial_r) ~ paste0(partial_r, partial_p),TRUE ~ ""),
    r_simple_print = paste0(simple_r, simple_p)) %>%
  select(rowname, var2, r_simple_print, r_partial_print)

tidy_corrs_to_print_simple <- print_tidy_corrs %>%
  select(-r_partial_print) %>%
  spread(var2, r_simple_print)  %>%
  mutate_all(funs(replace_na(., ""))) %>%
  select("rowname", 
         contains("Residualized"), contains("STEM"), contains("IAT"),
         contains("Occupation Labels"), contains("Occupation Bias"),
         contains("Age")) %>%
  select("rowname",contains("Language IAT"), contains("Median"), contains("Occupation Bias"), contains("STEM"), contains("Occupation Labels"),  contains("Residualized")) %>%
  rename(" " = "rowname")

kable(tidy_corrs_to_print_simple,  booktabs = T, escape = F,
      caption = "Correlation (Pearson's r) for all measures in Study 1 and 2 at the level of languages. Top panel shows simple correlations; bottom panel shows partial correlations controlling for median country age. Single asterisks indicate p < .05 and double asterisks indicate p < .01. The + symbol indicates a marginally significant p-value, p < .1.",
      align = "l")

Correlation (Pearson’s r) for all measures in Study 1 and 2 at the level of languages. Top panel shows simple correlations; bottom panel shows partial correlations controlling for median country age. Single asterisks indicate p < .05 and double asterisks indicate p < .01. The + symbol indicates a marginally significant p-value, p < .1.
	Language IAT (Subtitle)	Language IAT (Wikipedia CC)	Language IAT (Wikipedia)	Median Country Age	Occupation Bias (Subtitle)	Occupation Bias (Wikipedia)	Percent Women in STEM	Prop. Gendered Occupation Labels	Residualized Explicit Bias	Residualized Implicit Bias (IAT)
Language IAT (Subtitle)		.51*	.51*	.31	.42+	.40+	-.55*	.28	-.08	.50*
Language IAT (Wikipedia CC)	.51*		.67**	.16	.35	.30	.10	.30	.15	.26
Language IAT (Wikipedia)	.51*	.67**		.25	.28	.44*	-.19	.18	.34+	.48*
Median Country Age	.31	.16	.25		.36	.34+	-.37+	.38+	-.13	.61**
Occupation Bias (Subtitle)	.42+	.35	.28	.36		.80**	-.39	.75**	.28	.64**
Occupation Bias (Wikipedia)	.40+	.30	.44*	.34+	.80**		-.32	.70**	.29	.59**
Percent Women in STEM	-.55*	.10	-.19	-.37+	-.39	-.32		-.32	.15	-.52*
Prop. Gendered Occupation Labels	.28	.30	.18	.38+	.75**	.70**	-.32		.06	.58**
Residualized Explicit Bias	-.08	.15	.34+	-.13	.28	.29	.15	.06		.15
Residualized Implicit Bias (IAT)	.50*	.26	.48*	.61**	.64**	.59**	-.52*	.58**	.15

Wiki cc comparision

Molly Lewis

2019-08-26