# Behavioral by-language measures (study 0). Only the columns used further
# down are kept, and the language key is renamed to `language_code` so it
# can be used as the join key below.
BEHAVIORAL_IAT_PATH <- here("data/study0/processed/by_language_df.csv")
iat_behavioral_es <- select(
  read_csv(BEHAVIORAL_IAT_PATH),
  language_code = wiki_language_code,
  median_country_age,
  es_iat_sex_age_order_explicit_resid,
  es_iat_sex_age_order_implicit_resid,
  per_women_stem_2012_2017,
  n_participants
)

# Language effect sizes from the exploratory "wiki native" study. Its wiki
# column is renamed so it can sit next to study 1b's `lang_es_wiki` after the
# tables are joined.
LANG_PATH <- here("exploratory_studies/16_wiki_native/data/06_iat_es/iat_es_lang.csv")
lang_iat_native <- rename(
  read_csv(LANG_PATH),
  lang_es_wiki_native = lang_es_wiki
)

# study 1b
LANG_IAT_PATH <- here("data/study1b/iat_es_lang.csv")
iat_lang_es <- LANG_IAT_PATH %>%
  read_csv()

# Language family lookup: one distinct (language_code, family) pair per row.
LANG_FAMILY_PATH <- here("data/study0/processed/top_lang_by_country_ethnologue.csv")
lang_family <- LANG_FAMILY_PATH %>%
  read_csv() %>%
  select(language_code = wiki_language_code, family) %>%
  distinct()

# study 2 measures (included here for making single grand table)
# Both files are read as-is; they are merged into `all_es` further down.
BY_LANGUAGE_OCCUPATION_PATH <- here("data/study2/occupation_gender_score_by_language.csv")
OCCUPATION_OVERLAP_PATH <- here("data/study2/occupation_gender_scores.csv")

occupation_semantics <- BY_LANGUAGE_OCCUPATION_PATH %>%
  read_csv()
by_lang_scores <- OCCUPATION_OVERLAP_PATH %>%
  read_csv()

# Language-name lookup. When a wiki language code appears more than once,
# only its first row is kept; the key is then renamed to `language_code`.
LANGUAGE_NAME_PATH <- here("data/study0/processed/lang_name_to_wiki_iso.csv")
language_names <- LANGUAGE_NAME_PATH %>%
  read_csv() %>%
  distinct(wiki_language_code, .keep_all = TRUE) %>%
  rename(language_code = wiki_language_code)

# combine lang and behavioral and family info
# Everything is left-joined onto the behavioral table, so only languages that
# have behavioral data appear in the result. The joins without `by` fall back
# on dplyr's natural join (all shared column names).
all_es <- iat_behavioral_es %>%
  left_join(iat_lang_es, by = "language_code") %>%
  left_join(lang_iat_native, by = "language_code") %>%
  left_join(lang_family) %>%
  # include study 2 measure here so can make table
  left_join(occupation_semantics) %>%
  left_join(by_lang_scores) %>%
  left_join(language_names) %>%
  select(
    language_code,
    language_name,
    family,
    n_participants,
    es_iat_sex_age_order_implicit_resid,
    es_iat_sex_age_order_explicit_resid,
    median_country_age,
    per_women_stem_2012_2017,
    lang_es_sub,
    lang_es_wiki,
    lang_es_wiki_native,
    mean_prop_distinct_occs,
    subt_occu_semantics_fm,
    wiki_occu_semantics_fm
  )
# remove exclusions and fix croatian to be mean of hr and sr (only in wiki)
# Exclusion flags for study 1b.
EXCLUSIONS_PATH <- here("data/study1b/language_exclusions.csv")
exclusions <- read_csv(EXCLUSIONS_PATH)

# Exclusion flags for the wiki-native measure. The path is given relative to
# the project root (no leading "/"), matching LANG_PATH, so it cannot be taken
# for an absolute filesystem path.
EXCLUSIONS_WIKI_NATIVE_PATH <- here("exploratory_studies/16_wiki_native/data/06_iat_es/language_exclusions.csv")
exclusions_wiki_native <- read_csv(EXCLUSIONS_WIKI_NATIVE_PATH)

# Replacement values for Croatian: the mean of the "hr" and "sr" effect sizes,
# computed separately for the study 1b wiki measure and the wiki-native one.
hr_new_wiki <- mean(unlist(lapply(
  c("hr", "sr"),
  function(code) {
    iat_lang_es %>%
      filter(language_code == code) %>%
      pull(lang_es_wiki)
  }
)))

hr_new_wiki_native <- mean(unlist(lapply(
  c("hr", "sr"),
  function(code) {
    lang_iat_native %>%
      filter(language_code == code) %>%
      pull(lang_es_wiki_native)
  }
)))

# Apply the exclusions and the Croatian fix in a single pass. For each
# language measure the rules are checked in this order:
#   1. "hr" rows: the two wiki measures take the hr/sr mean; the subtitle
#      measure is set to NA (sr is missing from sub);
#   2. rows whose exclusion flag is TRUE are set to NA;
#   3. every other row keeps its value.
all_es_tidy <- all_es %>%
  left_join(exclusions) %>%
  left_join(exclusions_wiki_native) %>%
  mutate(
    lang_es_wiki = case_when(
      language_code == "hr" ~ hr_new_wiki,
      exclude_wiki == TRUE ~ NA_real_,
      TRUE ~ lang_es_wiki
    ),
    lang_es_wiki_native = case_when(
      language_code == "hr" ~ hr_new_wiki_native,
      exclude_wiki_native == TRUE ~ NA_real_,
      TRUE ~ lang_es_wiki_native
    ),
    lang_es_sub = case_when(
      language_code == "hr" ~ NA_real_, # sr is missing from sub
      exclude_sub == TRUE ~ NA_real_,
      TRUE ~ lang_es_sub
    )
  ) %>%
  select(-exclude_wiki, -exclude_sub, -exclude_wiki_native) %>%
  filter(language_code != "zu") # exclude proportion overlap measure (study 2) in zulu
# corr of lang, behavioral, etc.
# Pick the ten measures and give them display names in one step. Column order
# matters: the partial-correlation call below refers to columns by position.
all_corr_vars <- all_es_tidy %>%
  select(
    `Language IAT (Subtitle)` = lang_es_sub,
    `Language IAT (Wikipedia)` = lang_es_wiki,
    `Language IAT (Wikipedia, native)` = lang_es_wiki_native,
    `Occupation Bias (Subtitle)` = subt_occu_semantics_fm,
    `Occupation Bias (Wikipedia)` = wiki_occu_semantics_fm,
    `Prop. Gendered Occupation Labels` = mean_prop_distinct_occs,
    `Residualized Explicit Bias` = es_iat_sex_age_order_explicit_resid,
    `Residualized Implicit Bias (IAT)` = es_iat_sex_age_order_implicit_resid,
    `Percent Women in STEM` = per_women_stem_2012_2017,
    `Median Country Age` = median_country_age
  )

# Pairwise correlations with unadjusted p-values. The test is run once and its
# r and p matrices are each reshaped to long format (rowname, var2, value).
simple_psych_obj <- psych::corr.test(all_corr_vars, adjust = "none")

simple_corr <- simple_psych_obj$r %>%
  as_tibble(rownames = "rowname") %>%
  gather("var2", "simple_r", -rowname)

simple_corr_p <- simple_psych_obj$p %>%
  as_tibble(rownames = "rowname") %>%
  gather("var2", "simple_p", -rowname)
  
# Partial correlations among the first eight columns of `all_corr_vars`,
# partialling out "Median Country Age". Columns 9 and 10 ("Percent Women in
# STEM" and the control variable itself) are not in `x`, so they have no
# partial correlations.
partial_psych_obj <- psych::partial.r(
  data = all_corr_vars,
  x = 1:8,
  y = "Median Country Age"
)

# Unadjusted significance tests for the partial correlations, computed once
# and reused for both the r and p matrices.
partial_corr_test <- psych::corr.p(
  partial_psych_obj,
  n = nrow(all_corr_vars) - 1,
  adjust = "none"
)

partial_corr <- partial_corr_test$r %>%
  psych_to_mat() %>%
  as_tibble(rownames = "rowname") %>%
  gather("var2", "partial_r", -rowname)

partial_corr_p <- partial_corr_test$p %>%
  psych_to_mat() %>%
  as_tibble(rownames = "rowname") %>%
  gather("var2", "partial_p", -rowname)

# One row per ordered variable pair with simple and partial r and p. The join
# keys are spelled out so the merge does not depend on whichever column names
# the tables happen to share. Pairs without a partial correlation get NA.
tidy_corrs <- simple_corr %>%
  left_join(simple_corr_p, by = c("rowname", "var2")) %>%
  left_join(partial_corr, by = c("rowname", "var2")) %>%
  left_join(partial_corr_p, by = c("rowname", "var2"))

# Text annotations for the plot: the simple correlation between implicit bias
# and each language measure, labelled with the facet name and placed at a
# fixed (x, y) position in each panel.
corr_text_df <- tidy_corrs %>%
  filter(
    rowname == "Residualized Implicit Bias (IAT)",
    var2 %in% c(
      "Language IAT (Subtitle)",
      "Language IAT (Wikipedia)",
      "Language IAT (Wikipedia, native)"
    )
  ) %>%
  transmute(
    model = fct_recode(
      var2,
      "Subtitle Embeddings" = "Language IAT (Subtitle)",
      "Wikipedia Embeddings" = "Language IAT (Wikipedia)",
      "Wikipedia Embeddings, native" = "Language IAT (Wikipedia, native)"
    ),
    simple_r = paste0("r = ", f_num(simple_r, 2)),
    x = .85,
    y = -.07
  )

# plot lang vs behavioral
# One facet per language measure: linguistic effect size (x) against
# residualized implicit bias (y), point size scaled by log10(n participants),
# with the simple correlation from `corr_text_df` printed in red.
all_es_tidy %>%
  select(language_name, lang_es_sub, lang_es_wiki, lang_es_wiki_native,
         es_iat_sex_age_order_implicit_resid, n_participants) %>%
  gather("model", "lang_es", -language_name,
         -es_iat_sex_age_order_implicit_resid, -n_participants) %>%
  mutate(model = fct_recode(model,
                            "Subtitle Embeddings" = "lang_es_sub",
                            "Wikipedia Embeddings" = "lang_es_wiki",
                            "Wikipedia Embeddings, native" = "lang_es_wiki_native")) %>%
  ggplot(aes(x = lang_es, y = es_iat_sex_age_order_implicit_resid, size = n_participants)) +
  facet_wrap(. ~ model) +
  geom_smooth(method = "lm", alpha = .1, size = .9) +
  geom_point(alpha = .2) +
  ggrepel::geom_text_repel(aes(label = language_name),
                           size = 2, box.padding = 0.1) +
  # `labels` is spelled out in full rather than relying on partial argument
  # matching of `label`.
  scale_x_continuous(breaks = c(-.3, -0, .5, 1),
                     labels = c("\n(male-\nfamily)", "0", ".5", "1\n(male-\ncareer)"),
                     limits = c(-.35, 1.1)) +
  scale_y_continuous(breaks = c(-.075, -.05, -.025, 0, .025, .05),
                     labels = c("-.075\n(male-\nfamily)", "-.05", "-.025", "0", ".025", ".05\n(male-\ncareer)"),
                     limits = c(-.08, .06)) +
  scale_size(trans = "log10", labels = scales::comma, name = "N participants") +
  geom_text(data = corr_text_df, aes(label = simple_r, x = x, y = y),
            color = "red", size = 4) +
  ggtitle("Psychological and  Linguistic Gender Biases") +
  ylab("Implicit  Gender Bias (residualized)\n") +
  xlab("\nLinguistic Gender Bias\n (effect size)") +
  theme_classic() +
  theme(legend.position = "bottom")

# Printable version of the correlations (self-pairs dropped):
#   * r values are rounded to two decimals and passed through f_num();
#   * p values become markers: "**" (< .01), "*" (< .05), "+" (< .1), else "";
#   * each cell is the r string followed by its marker, and the partial cell
#     is empty where there is no partial correlation.
print_tidy_corrs <- tidy_corrs %>%
  filter(rowname != var2) %>%
  mutate_at(
    vars(simple_r, partial_r),
    ~ f_num(format(round(., 2), nsmall = 2), digits = 2)
  ) %>%
  mutate_at(
    vars(simple_p, partial_p),
    ~ case_when(
      . < .01 ~ "**",
      . < .05 ~ "*",
      . < .1 ~ "+",
      TRUE ~ ""
    )
  ) %>%
  mutate(
    r_partial_print = if_else(
      is.na(partial_r),
      "",
      paste0(partial_r, partial_p)
    ),
    r_simple_print = paste0(simple_r, simple_p)
  ) %>%
  select(rowname, var2, r_simple_print, r_partial_print)

# Wide (matrix-style) table of simple correlations: one row per variable and
# one column per partner variable, with missing cells shown as blanks.
# A formula lambda replaces the deprecated `funs()` helper.
tidy_corrs_to_print_simple <- print_tidy_corrs %>%
  select(-r_partial_print) %>%
  spread(var2, r_simple_print) %>%
  mutate_all(~ replace_na(., "")) %>%
  select("rowname",
         "Language IAT (Subtitle)",
         "Language IAT (Wikipedia, native)",
         "Language IAT (Wikipedia)",
         "Median Country Age",
         "Occupation Bias (Subtitle)",
         "Occupation Bias (Wikipedia)",
         "Percent Women in STEM",
         "Prop. Gendered Occupation Labels",
         "Residualized Explicit Bias",
         "Residualized Implicit Bias (IAT)") %>%
  rename(" " = "rowname")

# Same layout for the partial correlations. "Percent Women in STEM" is not
# selected here, and the "Median Country Age" column (the control variable)
# is overwritten with a blank.
tidy_corrs_to_print_partial <- print_tidy_corrs %>%
  select(-r_simple_print) %>%
  spread(var2, r_partial_print) %>%
  mutate_all(~ replace_na(., "")) %>%
  select("rowname",
         "Language IAT (Subtitle)",
         "Language IAT (Wikipedia, native)",
         "Language IAT (Wikipedia)",
         "Median Country Age",
         "Occupation Bias (Subtitle)",
         "Occupation Bias (Wikipedia)",
         "Prop. Gendered Occupation Labels",
         "Residualized Explicit Bias",
         "Residualized Implicit Bias (IAT)") %>%
  rename(" " = "rowname") %>%
  mutate("Median Country Age" = " ")

#tidy_corrs_to_print_reordered_simple <- tidy_corrs_to_print_simple[c(8,9,6,1,2,7,4,5,3),]
#tidy_corrs_to_print_reordered_partial <- tidy_corrs_to_print_partial[c(8,9,6,1,2,7,4,5,3),]

# Stack the simple panel on top of the partial panel. The partial table has no
# "Percent Women in STEM" column, so bind_rows() fills it with NA for those
# rows; the NAs are blanked so they do not print as "NA" in the table.
tidy_corrs_to_print_reordered <- bind_rows(tidy_corrs_to_print_simple,
                                           tidy_corrs_to_print_partial) %>%
  mutate_all(~ replace_na(., ""))

# Row-group boundaries are derived from the panel sizes so that each label
# covers every row of its panel.
kable(tidy_corrs_to_print_reordered, booktabs = TRUE, escape = FALSE,
      caption = "Correlation (Pearson's r) for all measures in Study 1 and 2 at the level of languages. Top panel shows simple correlations; bottom panel shows partial correlations controlling for median country age. Single asterisks indicate p < .05 and double asterisks indicate p < .01. The + symbol indicates a marginally significant p-value, p < .1.",
      align = "l",
      longtable = TRUE) %>% # this makes it so the table doesn't go at end
  kable_styling(full_width = FALSE, font_size = 10) %>%
  kableExtra::group_rows(group_label = "Simple Correlations",
                         start_row = 1,
                         end_row = nrow(tidy_corrs_to_print_simple)) %>%
  kableExtra::group_rows(group_label = "Partial Correlations",
                         start_row = nrow(tidy_corrs_to_print_simple) + 1,
                         end_row = nrow(tidy_corrs_to_print_reordered)) %>%
  row_spec(0, angle = 90)
Correlation (Pearson’s r) for all measures in Study 1 and 2 at the level of languages. Top panel shows simple correlations; bottom panel shows partial correlations controlling for median country age. Single asterisks indicate p < .05 and double asterisks indicate p < .01. The + symbol indicates a marginally significant p-value, p < .1.
Language IAT (Subtitle) Language IAT (Wikipedia, native) Language IAT (Wikipedia) Median Country Age Occupation Bias (Subtitle) Occupation Bias (Wikipedia) Percent Women in STEM Prop. Gendered Occupation Labels Residualized Explicit Bias Residualized Implicit Bias (IAT)
Simple Correlations
Language IAT (Subtitle) .49* .51* .31 .42+ .40+ -.55* .28 -.08 .50*
Language IAT (Wikipedia, native) .49* .90** .48* .25 .46* -.39+ .23 .21 .60**
Language IAT (Wikipedia) .51* .90** .25 .28 .44* -.19 .18 .34+ .48*
Median Country Age .31 .48* .25 .36 .34+ -.42+ .35+ -.07 .61**
Occupation Bias (Subtitle) .42+ .25 .28 .36 .80** -.39 .75** .28 .64**
Occupation Bias (Wikipedia) .40+ .46* .44* .34+ .80** -.32 .70** .29 .59**
Percent Women in STEM -.55* -.39+ -.19 -.42+ -.39 -.32 -.35 .18 -.53*
Prop. Gendered Occupation Labels .28 .23 .18 .35+ .75** .70** -.35 .11 .57**
Residualized Explicit Bias -.08 .21 .34+ -.07 .28 .29 .18 .11 .18
Residualized Implicit Bias (IAT) .50* .60** .48* .61** .64** .59** -.53* .57** .18
Partial Correlations
Language IAT (Subtitle) .41* .47* .35+ .33 NA .20 -.06 .42*
Language IAT (Wikipedia, native) .41* .92** .09 .36+ NA .07 .28 .44*
Language IAT (Wikipedia) .47* .92** .21 .39+ NA .11 .38+ .43*
Median Country Age NA
Occupation Bias (Subtitle) .35+ .09 .21 .77** NA .71** .33 .57**
Occupation Bias (Wikipedia) .33 .36+ .39+ .77** NA .66** .34 .52**
Percent Women in STEM NA
Prop. Gendered Occupation Labels .20 .07 .11 .71** .66** NA .14 .48*
Residualized Explicit Bias -.06 .28 .38+ .33 .34 NA .14 .28
Residualized Implicit Bias (IAT) .42* .44* .43* .57** .52** NA .48* .28