# Significance threshold shared by the correlation tests (conf.level = 1 - ALPHA)
# and the correlation plot (sig.level = ALPHA).
ALPHA <- 0.1

Six gender parity indicators:

We exclude GPI because it is missing for about half of the countries - we should look into this.

IAT bias measures

# Read the country -> language lookup table.
all_countries <- read_csv("data/other/country_to_lang.csv")

# Recode the row named "UK" to the country code "GB".
# `%in%` never returns NA, so a missing country_name cannot turn the row mask
# into an NA subscript (which data-frame / tibble assignment rejects).
all_countries[all_countries$country_name %in% "UK", "country_code"] <- "GB"
          
# Load the bias measures (wide format) and drop the wps_index column.
bias_measures <- read_csv("data/other/all_es_wide.csv") %>%
  select(-wps_index)

# Disabled filters, kept for reference:
#bias_measures[bias_measures$weapons_google < .5, "weapons_google"] = NA
#bias_measures[bias_measures$flowers_google < .9, "flowers_google"] = NA

# Boxplot of every bias measure: reshape to long format, leaving columns 1
# and 6 out of the reshape, then draw one box per measure.
bias_measures %>%
  gather(key = "measure", value = "value", -1, -6) %>%
  ggplot(mapping = aes(x = measure, y = value)) +
  theme_bw() +
  geom_boxplot()

Objective gender measures

# Read the HDI indicator table and reduce it to one median-age value per
# ISO two-letter country code.
hdi <- read_csv("data/gender_measures/HDI_complete.csv") %>%
  # Columns 3-28 hold the indicator values (presumably one column per
  # reporting period - TODO confirm); coerce them to numeric and average them.
  mutate_at(3:28, as.numeric) %>%
  mutate(mean_value = rowMeans(.[, 3:28], na.rm = TRUE)) %>%
  mutate(country_code = countrycode(country, "country.name", "iso2c")) %>%
  select(kpi_name, country_code, mean_value) %>%
  # One column per indicator. No `fill` is supplied: the third positional
  # argument of spread() is `fill`, so passing -2 there replaced every missing
  # (or NaN) cell with the number -2 instead of leaving it NA.
  spread(kpi_name, mean_value) %>%
  select(country_code, `Median Age`) %>%
  rename(median_age = `Median Age`)

# Load the gender measures, discard the indicators that are not analysed
# here, and attach median_age from `hdi` (natural join on the shared columns).
all_gender_measures <- read_csv("data/gender_measures/all_gender_measures.csv") %>%
  select(
    -sigi, -sigi_physical, -wb_cpia,
    -contains("schooling"),
    -gpi_literacy,
    -contains("ggi_"),
    -sigi_son
  ) %>%
  left_join(hdi)

# Histogram of each raw measure, one free-scaled panel per measure.
# The first three columns are kept out of the reshape.
all_gender_measures %>%
  gather(key = measure, value = value, -1:-3) %>%
  ggplot(mapping = aes(x = value)) +
  geom_histogram() +
  ggtitle("raw") +
  facet_wrap(~measure, scales = "free", ncol = 4) +
  theme_bw()

Transform skewed measures.

# Replace the skewed measures with transformed versions:
#   sigi_fam -> log (infinite results, e.g. from log(0), become NA)
#   gii      -> log
#   gdi      -> 10th power
all_gender_measures_transformed <- all_gender_measures %>%
  mutate(
    sigi_fam_log = log(sigi_fam),
    sigi_fam_log = ifelse(is.infinite(sigi_fam_log), NA, sigi_fam_log),
    gii_log = log(gii),
    gdi_exp = gdi^10
  ) %>%
  select(-sigi_fam, -gii, -gdi)

# Same per-measure histograms as for the raw data, now on the transformed set.
all_gender_measures_transformed %>%
  gather(key = measure, value = value, -1:-3) %>%
  ggplot(mapping = aes(x = value)) +
  geom_histogram() +
  facet_wrap(~measure, scales = "free", ncol = 4) +
  ggtitle("transformed") +
  theme_bw()

Correlation between measures

# Merge the country lookup with the bias measures (by language) and the
# transformed gender measures (by country), then average every measure
# within each Wikipedia language code.
full_df_partial <- all_countries %>%
  left_join(bias_measures, by = "wiki_language_code") %>%
  left_join(all_gender_measures_transformed, by = "country_code") %>%
  # Drop columns whose names contain ".y" (the right-hand duplicates that the
  # joins suffix with ".y").
  select(-contains(".y")) %>%
  #select(c(-10:-13, -16)) %>%
  # Move columns 1-5 and 8 to the front; all remaining columns follow.
  select(1:5, 8, everything()) %>%
  group_by(wiki_language_code.x) %>%
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  summarize_at(vars(career_google:gdi_exp), mean, na.rm = TRUE)

# Linear model of the behavioral IAT career measure on median age; its
# residuals are used further down as an age-adjusted measure.
mod1 <- lm(career_behavioral_iat ~ median_age,
           data = full_df_partial)


# Add the residuals of `mod1` as career_behavioral_iat_resid and place that
# column directly after the first four columns.
full_df <- full_df_partial %>%
  modelr::add_residuals(mod1) %>%
  rename(career_behavioral_iat_resid = resid) %>%
  select(1:4, career_behavioral_iat_resid, 5, 6, everything())

# Pairwise correlations between all measures; the first column (the grouping
# key) is left out.
corr_mat <- cor(full_df[, -1], use = "pairwise.complete.obs")

# Matrix of p-values for the same set of columns.
p.mat <- cor.mtest(
  full_df[, -1],
  conf.level = 1 - ALPHA,
  use = "pairwise.complete.obs"
)$p

# 100-step colour ramp, reversed so it runs blue -> white -> red.
cols <- rev(colorRampPalette(c("red", "white", "blue"))(100))

# Upper-triangle correlation plot; cells whose p-value exceeds ALPHA are
# left blank.
corrplot(
  corr_mat,
  method = "color",
  col = cols,
  type = "upper",
  order = "original",
  diag = FALSE,
  addCoef.col = "black",
  number.cex = .7,
  p.mat = p.mat,
  sig.level = ALPHA,
  insig = "blank",
  tl.col = "black",
  tl.srt = 90
)