Correlations between all measures

gdp_2013 is the measure Bill sent; it is missing three values. gdp_2017 is a measure I obtained from the World Bank (GDP per capita, indicator NY.GDP.PCAP.CD, for 2017).

# Fetch World Bank indicator NY.GDP.PCAP.CD for 2017 only, keeping just the
# two-letter country code and the value (renamed to gdp_2017).
gdp_data <- wb(
  indicator = "NY.GDP.PCAP.CD",
  startdate = 2017,
  enddate = 2017
) %>%
  select(iso2c, gdp_2017 = value)

#wbsearch("gdp per capita")
#NY.GDP.MKTP.CD
#NY.GDP.PCAP.PP.CD

# Read Bill's country-level data, attach the 2017 GDP measure by country code,
# and z-score every numeric column.
PATH <- "Molly data2.csv"
bill_data <- read_csv(PATH) %>%
  janitor::clean_names() %>%
  left_join(gdp_data, by = c("country_code" = "iso2c")) %>%
  # scale() returns a one-column matrix carrying center/scale attributes;
  # as.numeric() turns it back into a plain vector so each column stays an
  # ordinary numeric column.
  mutate_if(is.numeric, function(x) as.numeric(scale(x)))

# Numeric columns only: the input to the correlation matrix and p-value matrix.
plot_data <- bill_data %>%
  select_if(is.numeric)

# Pairwise correlation matrix of the numeric columns (each pair uses the rows
# where both variables are present), reshaped to long form: one row per
# (v2 = matrix row, v1 = matrix column) pair with its estimate.
long_corr <- plot_data %>%
  cor(use = "pairwise.complete.obs") %>%
  as.data.frame() %>%
  rownames_to_column(var = "v2") %>%
  gather(key = "v1", value = "estimate", -v2)

# Matrix of p-values for the same variable pairs, labelled with the variable
# names on both rows and columns, then reshaped to the same long layout as
# long_corr (v2 = matrix row, v1 = matrix column).
long_p <- plot_data %>%
  corrplot::cor.mtest(use = "pairwise.complete.obs") %>%
  .$p %>%
  as.data.frame(row.names = names(plot_data)) %>%
  setNames(names(plot_data)) %>%
  rownames_to_column(var = "v2") %>%
  gather(key = "v1", value = "p", -v2)

# Combine estimates and p-values into one long table for the heatmap.
# The join keys are spelled out (v2 and v1 are the only columns the two tables
# share) so the join does not rely on dplyr guessing them and emits no
# "Joining, by = ..." message.
corr_df <- full_join(long_corr, long_p, by = c("v2", "v1")) %>%
  mutate(
    # Text label for each tile: blank on the diagonal, otherwise r to 2 places.
    estimate_char = case_when(
      v1 == v2 ~ "",
      TRUE ~ as.character(round(estimate, 2))
    ),
    # Blank out the diagonal (a variable paired with itself).
    estimate = case_when(
      v1 == v2 ~ NA_real_,
      TRUE ~ estimate
    ),
    # Fill value: the estimate where p < .05, otherwise 0. This uses the
    # already-blanked estimate, so diagonal rows can only yield NA or 0.
    estimate_color = case_when(p < .05 ~ estimate, TRUE ~ 0)
  )

# Heatmap of the pairwise correlations: one tile per variable pair, filled by
# estimate_color and labelled with estimate_char. fct_rev() reverses the order
# of v2 on the y-axis. The legend is suppressed.
corr_df %>%
  ggplot(aes(x = v1, y = fct_rev(v2), fill = estimate_color)) +
  # one rectangle per correlation
  geom_tile() +
  # print the correlation value inside its rectangle
  geom_text(aes(label = estimate_char), size = 3) +
  scale_fill_gradient2(
    name = "Pearson's r",
    low = "blue", mid = "white", high = "red",
    midpoint = 0, space = "Lab", guide = "colourbar"
  ) +
  ggtitle("Pairwise Correlation Coefficients") +
  theme_classic(base_size = 12) +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.ticks = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )

# Country-level scatter of women_stem against lang_es_sub with a linear fit.
ggplot(bill_data, aes(x = lang_es_sub, y = women_stem)) +
  geom_point(size = 3) +
  geom_smooth(method = "lm", alpha = .2) +
  theme_classic()

# Country-level scatter of gdp_2017 against lang_es_sub with a linear fit.
ggplot(bill_data, aes(x = lang_es_sub, y = gdp_2017)) +
  geom_point(size = 3) +
  geom_smooth(method = "lm", alpha = .2) +
  theme_classic()

Country-level (subtitles)

Predicting women in STEM with embedding measures, controlling for GDP.

Bill’s measure:

# Mixed model: women_stem on lang_es_sub and gdp_2013 (Bill's GDP measure),
# with a random intercept for each language_name.
summary(
  lmer(
    women_stem ~ lang_es_sub + gdp_2013 + (1 | language_name),
    data = bill_data
  )
)
## Linear mixed model fit by REML ['lmerMod']
## Formula: women_stem ~ lang_es_sub + gdp_2013 + (1 | language_name)
##    Data: bill_data
## 
## REML criterion at convergence: 55
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.34115 -0.35934  0.00647  0.28640  1.99820 
## 
## Random effects:
##  Groups        Name        Variance Std.Dev.
##  language_name (Intercept) 0.4927   0.7019  
##  Residual                  0.1725   0.4153  
## Number of obs: 26, groups:  language_name, 16
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  -0.1068     0.2009  -0.532
## lang_es_sub  -0.1045     0.2163  -0.483
## gdp_2013     -0.4438     0.1850  -2.398
## 
## Correlation of Fixed Effects:
##             (Intr) lng_s_
## lang_es_sub  0.117       
## gdp_2013    -0.129 -0.524

My measure:

# Same mixed model with gdp_2017 in place of gdp_2013; random intercept for
# each language_name.
summary(
  lmer(
    women_stem ~ lang_es_sub + gdp_2017 + (1 | language_name),
    data = bill_data
  )
)
## Linear mixed model fit by REML ['lmerMod']
## Formula: women_stem ~ lang_es_sub + gdp_2017 + (1 | language_name)
##    Data: bill_data
## 
## REML criterion at convergence: 60.4
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.22030 -0.34556 -0.06633  0.47187  1.87289 
## 
## Random effects:
##  Groups        Name        Variance Std.Dev.
##  language_name (Intercept) 0.3913   0.6255  
##  Residual                  0.2104   0.4587  
## Number of obs: 28, groups:  language_name, 18
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  -0.0953     0.1817  -0.524
## lang_es_sub  -0.2643     0.1814  -1.458
## gdp_2017     -0.3018     0.1756  -1.718
## 
## Correlation of Fixed Effects:
##             (Intr) lng_s_
## lang_es_sub  0.215       
## gdp_2017    -0.160 -0.471

Predicting women in stem with embedding measure, controlling for median age and gdp.

additive:

# Additive mixed model: lang_es_sub, median_country_age and gdp_2017 as main
# effects, with a random intercept for each language_name.
summary(
  lmer(
    women_stem ~ lang_es_sub + median_country_age + gdp_2017 +
      (1 | language_name),
    data = bill_data
  )
)
## Linear mixed model fit by REML ['lmerMod']
## Formula: women_stem ~ lang_es_sub + median_country_age + gdp_2017 + (1 |  
##     language_name)
##    Data: bill_data
## 
## REML criterion at convergence: 61.7
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.16106 -0.36787 -0.05786  0.48451  1.78441 
## 
## Random effects:
##  Groups        Name        Variance Std.Dev.
##  language_name (Intercept) 0.3948   0.6283  
##  Residual                  0.2236   0.4728  
## Number of obs: 28, groups:  language_name, 18
## 
## Fixed effects:
##                    Estimate Std. Error t value
## (Intercept)        -0.10492    0.18728  -0.560
## lang_es_sub        -0.27544    0.18853  -1.461
## median_country_age  0.04798    0.20063   0.239
## gdp_2017           -0.31225    0.18814  -1.660
## 
## Correlation of Fixed Effects:
##             (Intr) lng_s_ mdn_c_
## lang_es_sub  0.247              
## mdn_cntry_g -0.186 -0.216       
## gdp_2017    -0.094 -0.376 -0.304

interactive:

# Interactive mixed model: lang_es_sub x median_country_age (main effects plus
# their interaction), adjusting for gdp_2017, random intercept per language_name.
summary(
  lmer(
    women_stem ~ lang_es_sub * median_country_age + gdp_2017 +
      (1 | language_name),
    data = bill_data
  )
)
## Linear mixed model fit by REML ['lmerMod']
## Formula: women_stem ~ lang_es_sub * median_country_age + gdp_2017 + (1 |  
##     language_name)
##    Data: bill_data
## 
## REML criterion at convergence: 56.7
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.29879 -0.60406  0.02726  0.34444  1.52770 
## 
## Random effects:
##  Groups        Name        Variance Std.Dev.
##  language_name (Intercept) 0.2259   0.4753  
##  Residual                  0.2192   0.4681  
## Number of obs: 28, groups:  language_name, 18
## 
## Fixed effects:
##                                Estimate Std. Error t value
## (Intercept)                     -0.3702     0.1814  -2.041
## lang_es_sub                     -0.3272     0.1626  -2.012
## median_country_age               0.3967     0.2230   1.779
## gdp_2017                        -0.3848     0.1734  -2.220
## lang_es_sub:median_country_age   0.5089     0.1851   2.748
## 
## Correlation of Fixed Effects:
##             (Intr) lng_s_ mdn_c_ g_2017
## lang_es_sub  0.261                     
## mdn_cntry_g -0.425 -0.234              
## gdp_2017     0.020 -0.364 -0.381       
## lng_s_sb:__ -0.504 -0.089  0.567 -0.210
# Same interactive mixed model, using lang_es_wiki instead of lang_es_sub as
# the embedding measure.
summary(
  lmer(
    women_stem ~ lang_es_wiki * median_country_age + gdp_2017 +
      (1 | language_name),
    data = bill_data
  )
)
## Linear mixed model fit by REML ['lmerMod']
## Formula: women_stem ~ lang_es_wiki * median_country_age + gdp_2017 + (1 |  
##     language_name)
##    Data: bill_data
## 
## REML criterion at convergence: 81.8
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.24681 -0.25976 -0.00787  0.25881  1.73528 
## 
## Random effects:
##  Groups        Name        Variance Std.Dev.
##  language_name (Intercept) 0.7772   0.8816  
##  Residual                  0.2288   0.4783  
## Number of obs: 32, groups:  language_name, 22
## 
## Fixed effects:
##                                 Estimate Std. Error t value
## (Intercept)                       0.3003     0.2388   1.258
## lang_es_wiki                      0.2206     0.2436   0.905
## median_country_age               -0.2935     0.2306  -1.272
## gdp_2017                         -0.4988     0.2216  -2.251
## lang_es_wiki:median_country_age  -0.1767     0.2213  -0.798
## 
## Correlation of Fixed Effects:
##             (Intr) lng_s_ mdn_c_ g_2017
## lang_es_wik  0.244                     
## mdn_cntry_g -0.268 -0.258              
## gdp_2017    -0.140 -0.527 -0.103       
## lng_s_wk:__ -0.450 -0.251  0.510  0.243
# Regress women_stem on gdp_2017 so its residuals can be plotted below.
mod <- lm(women_stem ~ gdp_2017, data = bill_data)

# Plot the GDP-residualised women_stem against lang_es_sub, with separate
# linear fits for the two halves of a split on median_country_age
# (ntile 1 = "younger", ntile 2 = "older").
modelr::add_residuals(bill_data, mod, "women_stem_gdp_resid") %>%
  mutate(
    median_country_age_tile = ntile(median_country_age, 2),
    country_age = ifelse(median_country_age_tile == 1, "younger", "older")
  ) %>%
  ggplot(aes(
    x = lang_es_sub,
    y = women_stem_gdp_resid,
    color = country_age
  )) +
  geom_point(size = 3) +
  geom_smooth(method = "lm", alpha = .2) +
  theme_classic()

Language-level (subtitles)

# Collapse the country-level data to one row per language: the mean of every
# numeric column within each language_name, ignoring missing values.
# Uses TRUE rather than T, because T is an ordinary variable that can be
# reassigned.
lang_level <- bill_data %>%
  group_by(language_name) %>%
  summarize_if(is.numeric, mean, na.rm = TRUE)

Predicting women in STEM with embedding measures, controlling for GDP.

# Language-level linear model: women_stem on lang_es_sub, adjusting for gdp_2017.
summary(
  lm(women_stem ~ lang_es_sub + gdp_2017, data = lang_level)
)
## 
## Call:
## lm(formula = women_stem ~ lang_es_sub + gdp_2017, data = lang_level)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.39590 -0.30211 -0.03374  0.37335  1.47381 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  -0.1036     0.1829  -0.567   0.5793  
## lang_es_sub  -0.3425     0.1885  -1.817   0.0893 .
## gdp_2017     -0.1506     0.2307  -0.653   0.5238  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.752 on 15 degrees of freedom
##   (8 observations deleted due to missingness)
## Multiple R-squared:  0.3237, Adjusted R-squared:  0.2336 
## F-statistic:  3.59 on 2 and 15 DF,  p-value: 0.0532

Predicting women in stem with embedding measure, controlling for median age and gdp.

additive:

# Language-level additive model: lang_es_sub, median_country_age and gdp_2017
# entered as main effects.
summary(
  lm(
    women_stem ~ lang_es_sub + median_country_age + gdp_2017,
    data = lang_level
  )
)
## 
## Call:
## lm(formula = women_stem ~ lang_es_sub + median_country_age + 
##     gdp_2017, data = lang_level)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.41746 -0.25454 -0.05535  0.35757  1.42377 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)  
## (Intercept)        -0.11819    0.19701  -0.600   0.5581  
## lang_es_sub        -0.35673    0.20229  -1.764   0.0996 .
## median_country_age  0.07654    0.29547   0.259   0.7994  
## gdp_2017           -0.17817    0.26095  -0.683   0.5059  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7765 on 14 degrees of freedom
##   (8 observations deleted due to missingness)
## Multiple R-squared:  0.327,  Adjusted R-squared:  0.1827 
## F-statistic: 2.267 on 3 and 14 DF,  p-value: 0.1256

interactive:

# Language-level interactive model: lang_es_sub x median_country_age (main
# effects plus their interaction), adjusting for gdp_2017.
summary(
  lm(
    women_stem ~ lang_es_sub * median_country_age + gdp_2017,
    data = lang_level
  )
)
## 
## Call:
## lm(formula = women_stem ~ lang_es_sub * median_country_age + 
##     gdp_2017, data = lang_level)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.4148 -0.3087  0.1236  0.3030  0.9878 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                     -0.4186     0.1958  -2.138   0.0521 .
## lang_es_sub                     -0.3810     0.1671  -2.280   0.0401 *
## median_country_age               0.5999     0.3092   1.940   0.0744 .
## gdp_2017                        -0.4585     0.2382  -1.925   0.0764 .
## lang_es_sub:median_country_age   0.5431     0.1974   2.751   0.0165 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6406 on 13 degrees of freedom
##   (8 observations deleted due to missingness)
## Multiple R-squared:  0.5746, Adjusted R-squared:  0.4438 
## F-statistic: 4.391 on 4 and 13 DF,  p-value: 0.01829
# Regress women_stem on gdp_2017 at the language level so its residuals can be
# plotted below. This reassigns `mod`.
mod <- lm(women_stem ~ gdp_2017, data = lang_level)

# Plot the GDP-residualised women_stem against lang_es_sub, with separate
# linear fits for the two halves of a split on median_country_age
# (ntile 1 = "younger", ntile 2 = "older").
modelr::add_residuals(lang_level, mod, "women_stem_gdp_resid") %>%
  mutate(
    median_country_age_tile = ntile(median_country_age, 2),
    country_age = ifelse(median_country_age_tile == 1, "younger", "older")
  ) %>%
  ggplot(aes(
    x = lang_es_sub,
    y = women_stem_gdp_resid,
    color = country_age
  )) +
  geom_point(size = 3) +
  geom_smooth(method = "lm", alpha = .2) +
  theme_classic()

# Language-level scatter of women_stem against lang_es_sub with a linear fit.
ggplot(lang_level, aes(x = lang_es_sub, y = women_stem)) +
  geom_point(size = 3) +
  geom_smooth(method = "lm", alpha = .2) +
  theme_classic()

# Language-level scatter of gdp_2017 against lang_es_sub with a linear fit.
ggplot(lang_level, aes(x = lang_es_sub, y = gdp_2017)) +
  geom_point(size = 3) +
  geom_smooth(method = "lm", alpha = .2) +
  theme_classic()