Init

Packages and options.

#options
options(digits = 2,
        tibble.print_max = 30)

#minimum group size
min_n = 25

#packages
library(pacman)
p_load(kirkegaard, readxl, rms, haven, sf, tmap, ggeffects)

#default theme
theme_set(theme_bw())

#override describe so psych output prints as a plain matrix (respecting the digits option)
describe = function(...) psych::describe(...) %>% as.matrix()

Data

#all years mixed together
#we don't use these because we cannot adjust for grade inflation
#but they cover a few more countries
raw_all_years = read_excel("data/Resultater - Folkeskolens Afgangseksamen all years together.xlsx",
                           range = "A5:D151") %>% 
  set_colnames(c("country_region", "country", "GPA", "n")) %>% 
  filter(!is.na(country), country != "<ukendt>", country != "Mellemøsten, uoplyst land")

#read by year data
raw = read_excel("data/data.xlsx", range = "A5:E728") %>% 
  set_colnames(c("year", "country_region", "country", "GPA", "n"))

#countries only (remove regions and unknowns)
countries = raw %>% filter(!is.na(country), country != "<ukendt>", country != "Mellemøsten, uoplyst land")

#mutate
countries %<>% 
  mutate(
    year = str_match(year, "\\/(\\d+)")[, 2] %>% as.numeric(),
    year_fct = factor(year),
    ISO = pu_translate(country)
  )
## No exact match: Saudi Arabien
## Best fuzzy match found: Saudi Arabien -> Saudi-Arabien with distance 1.00
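A quick illustration of the year parsing above (a sketch, assuming school-year labels like "2017/2018"):

#example: keep the calendar year after the slash
str_match("2017/2018", "\\/(\\d+)")[, 2] %>% as.numeric()
## [1] 2018
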
#adjust for grade inflation before averaging across years
#grades drift upward over time, so we fit a model and regress out the year effect
#this preserves the original grade scale
GPA_inflation_adjust_fit = ols(GPA ~ year_fct, data = countries, weights = countries$n)
countries$GPA_adj = GPA_inflation_adjust_fit %>% resid() %>% add(coef(GPA_inflation_adjust_fit)[1])
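
As a sanity check (a sketch using the objects just created), regressing the adjusted GPA on year should return essentially zero year coefficients:

#sanity check (sketch): year effects should be ~0 after adjustment
sanity_fit = ols(GPA_adj ~ year_fct, data = countries, weights = countries$n)
round(coef(sanity_fit)[-1], 10)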

countries_avg = plyr::ddply(countries, c("country", "ISO"), function(dd) {
  tibble(
    n = wtd_sum(dd$n),
    n_sqrt = sqrt(n),
    GPA = wtd_mean(dd$GPA_adj, dd$n)
  )
})
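
wtd_sum and wtd_mean come from the kirkegaard package; a minimal sketch of their assumed semantics (NA-tolerant weighted sum and mean, not the package's actual source):

#assumed semantics (sketch)
wtd_sum_sketch = function(x, w = rep(1, length(x))) sum(x * w, na.rm = T)
wtd_mean_sketch = function(x, w = rep(1, length(x))) weighted.mean(x, w, na.rm = T)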

#by generation
raw_bg = read_excel("data/data generation.xlsx", range = "A5:F1107") %>% 
  set_colnames(c("year", "generation", "country_region", "country", "GPA", "n"))

#countries only (remove regions and unknowns)
countries_bg = raw_bg %>% filter(!is.na(country), country != "<ukendt>", country != "Mellemøsten, uoplyst land")

#mutate
countries_bg %<>% 
  mutate(
    year = str_match(year, "\\/(\\d+)")[, 2] %>% as.numeric(),
    year_fct = factor(year),
    ISO = pu_translate(country),
    first_gen = (generation == "Indvandrer")
  )
## No exact match: Saudi Arabien
## Best fuzzy match found: Saudi Arabien -> Saudi-Arabien with distance 1.00
#inflation-adjust as above, then average across years
GPA_inflation_adjust_bg_fit = ols(GPA ~ year_fct, data = countries_bg, weights = countries_bg$n)
countries_bg$GPA_adj = GPA_inflation_adjust_bg_fit %>% resid() %>% add(coef(GPA_inflation_adjust_bg_fit)[1])

countries_avg_bg = plyr::ddply(countries_bg, c("country", "ISO", "first_gen"), function(dd) {
  tibble(
    n = wtd_sum(dd$n),
    n_sqrt = sqrt(n),
    GPA = wtd_mean(dd$GPA_adj, dd$n)
  )
})

#national data from prior studies
#Danish immigrant and stereotype study
# dk_immigrant_pref_study = read_rds("data/dk_immigrant_pref_global_envir.rds")
#Dutch stereotype study
nl_stereotype_study = read_rds("data/dutch-stereotype_data_out.rds")
#cognitive elite study
natl_time_pref_study = read_rds("data/time_prefdata_out.rds")
#Danish GPA 2015 study https://openpsych.net/paper/36
# dk_gpa_study = read_rds("data/")

#World Bank Human Capital Index (2018)
worldbank_hci_2018 = read_excel("data/hcidataoctober2018.xlsx", sheet = 4) %>% 
  df_legalize_names() %>% 
  mutate(
    ISO = pu_translate(Country_Name),
    IQwb = HUMAN_CAPITAL_INDEX
  )
## No exact match: eSwatini
## Best fuzzy match found: eSwatini -> Eswatini with distance 2.00
#PIAAC and employment gaps
PIAAC_employment_gaps = read_excel("data/PIAAC employment.xlsx")

#spatial data
worldmap = read_sf("data/TM_WORLD_BORDERS-0.3/TM_WORLD_BORDERS-0.3.shp") %>% 
  rename(
    ISO = ISO3
  )

#brain drain
brain_drain_oecd = read_dta("data/brain drain/iabbd_8010_v1.dta") %>% 
  mutate(
    ISO = ccode_origin,
    high_fraction = high / tot
  )

#Denmark as destination, years 1995-2005
brain_drain_oecd_dk = brain_drain_oecd %>% 
  filter(destination == "Denmark")

#sum across years and sexes
brain_drain_oecd_dk_sum = brain_drain_oecd_dk %>% 
  plyr::ddply(c("origin", "ISO"), function(dd) {
    tibble(
      total = sum(dd$tot),
      low = sum(dd$low),
      med = sum(dd$med),
      high = sum(dd$high),
      high_fraction = high / total
    )
  }) %>% 
  #keep origins with total >= min_n
  filter(total >= min_n)

#merge
brain_drain_oecd_dk_sum = brain_drain_oecd_dk_sum %>% left_join(natl_time_pref_study %>% select(ISO, Mean_years_of_schooling), by = "ISO")

#model
(brain_drain_model = ols(high_fraction ~ Mean_years_of_schooling, data = brain_drain_oecd_dk_sum))
## Frequencies of Missing Values Due to Each Variable
##           high_fraction Mean_years_of_schooling 
##                       0                      10 
## 
## Linear Regression Model
##  
##  ols(formula = high_fraction ~ Mean_years_of_schooling, data = brain_drain_oecd_dk_sum)
##  
##  
##                  Model Likelihood    Discrimination    
##                        Ratio Test           Indexes    
##  Obs     156    LR chi2      9.46    R2       0.059    
##  sigma0.0816    d.f.            1    R2 adj   0.053    
##  d.f.    154    Pr(> chi2) 0.0021    g        0.023    
##  
##  Residuals
##  
##        Min        1Q    Median        3Q       Max 
##  -0.199226 -0.051772 -0.001017  0.049583  0.226157 
##  
##  
##                          Coef   S.E.   t     Pr(>|t|)
##  Intercept               0.1993 0.0188 10.59 <0.0001 
##  Mean_years_of_schooling 0.0064 0.0021  3.10 0.0023  
## 
brain_drain_oecd_dk_sum$edu_selection = resid(brain_drain_model) %>% standardize()

#check: by construction, the selection index is uncorrelated with schooling; then plot
brain_drain_oecd_dk_sum %>% select(edu_selection, high_fraction, Mean_years_of_schooling) %>% wtd.cors()
##                         edu_selection high_fraction Mean_years_of_schooling
## edu_selection                 1.0e+00          0.97                -5.2e-17
## high_fraction                 9.7e-01          1.00                 2.4e-01
## Mean_years_of_schooling      -5.2e-17          0.24                 1.0e+00
GG_scatter(brain_drain_oecd_dk_sum, "Mean_years_of_schooling", "high_fraction") +
  ylab("High education % among emigrants to Denmark")
## `geom_smooth()` using formula 'y ~ x'

GG_save("figs/brain_brain_to_dk.png")
## `geom_smooth()` using formula 'y ~ x'
#selection by year
brain_drain_oecd_dk_sum_by_year = brain_drain_oecd %>% 
  filter(destination == "Denmark") %>% 
    plyr::ddply(c("origin", "ISO", "year"), function(dd) {
    tibble(
      total = sum(dd$tot),
      low = sum(dd$low),
      med = sum(dd$med),
      high = sum(dd$high),
      high_fraction = high / total
    )
  }) %>% 
  #keep origins with total >= min_n
  filter(total >= min_n)

#variance between countries over time?
lm(high_fraction ~ ISO + year, data = brain_drain_oecd_dk_sum_by_year, weights = brain_drain_oecd_dk_sum_by_year$total) %>% 
  stats::aov() %>% 
  sjstats::anova_stats()
#merge
d = left_join(countries_avg, natl_time_pref_study %>% select(ISO, NAME, IQlv, IQr, IQb, IslamPewResearch2010), by = "ISO") %>% 
  left_join(nl_stereotype_study %>% select(ISO, Muslim), by = "ISO") %>% 
  left_join(brain_drain_oecd_dk_sum %>% select(ISO, high_fraction, edu_selection)) %>% 
  left_join(worldbank_hci_2018 %>% select(ISO, IQwb), by = "ISO") %>% 
  mutate(
    NAME = pu_translate(ISO, reverse = T)
  )
## Joining, by = "ISO"
assert_that(!any(duplicated(d$ISO)))
## [1] TRUE
#merge bg
d_bg = left_join(countries_avg_bg, natl_time_pref_study %>% select(ISO, NAME, IQlv, IQr, IQb, IslamPewResearch2010), by = "ISO") %>% 
  left_join(nl_stereotype_study %>% select(ISO, Muslim), by = "ISO") %>% 
  left_join(brain_drain_oecd_dk_sum %>% select(ISO, high_fraction, edu_selection)) %>% 
  left_join(worldbank_hci_2018 %>% select(ISO, IQwb), by = "ISO") %>% 
  mutate(
    NAME = pu_translate(ISO, reverse = T)
  )
## Joining, by = "ISO"
#fill in Muslim % from the Pew 2010 estimates where missing
d$Muslim = miss_fill(d$Muslim, d$IslamPewResearch2010)
d_bg$Muslim = miss_fill(d_bg$Muslim, d_bg$IslamPewResearch2010)
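
miss_fill is assumed to fill missing values in its first argument from its second, like a coalesce; a minimal sketch of that behavior:

#assumed behavior (sketch): keep x where present, else fall back to y
miss_fill_sketch = function(x, y) ifelse(is.na(x), y, x)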

#add missing data
#Kosovo is 95.6% Muslim
#https://www.britannica.com/place/Kosovo/Religion
d$Muslim[which(d$ISO == "KSV")] = .956
d_bg$Muslim[which(d_bg$ISO == "KSV")] = .956

#Serbia and Montenegro
#Montenegro 3.3% muslim https://www.cia.gov/library/publications/the-world-factbook/geos/print_mj.html
#Serbia 3.1% https://en.wikipedia.org/wiki/Islam_in_Serbia
d$Muslim[which(d$ISO == "SRBM")] = .031
d_bg$Muslim[which(d_bg$ISO == "SRBM")] = .031

#use Serbia's IQ for Kosovo, Serbia and Montenegro, and Yugoslavia
d$IQlv[which(d$ISO %in% c("KSV", "SRBM", "YUG"))] = 90.3
d_bg$IQlv[which(d_bg$ISO %in% c("KSV", "SRBM", "YUG"))] = 90.3

#use Becker's geographic estimate for the Cayman Islands
d$IQlv[which(d$ISO %in% c("CYM"))] = 82
d_bg$IQlv[which(d_bg$ISO %in% c("CYM"))] = 82

Analyses

Descriptives

#input data
countries$n %>% describe()
##    vars   n mean   sd median trimmed mad min   max range skew kurtosis  se
## X1    1 663  609 5301     13      33  13   3 53313 53310  9.6       89 206
countries$n %>% sum()
## [1] 4e+05
countries_avg$n %>% describe()
##    vars   n mean    sd median trimmed mad min    max  range skew kurtosis   se
## X1    1 116 3483 33685     60     168  79   3 363055 363052   10      109 3128
countries_avg$n %>% sum()
## [1] 4e+05
#without Denmark
countries_avg %>% 
  filter(ISO != "DNK") %>% 
  .$n %>% describe()
##    vars   n mean  sd median trimmed mad min  max range skew kurtosis se
## X1    1 115  356 811     59     156  76   3 5677  5674  3.9       18 76

Combined generations

#main predictors
main_preds = c("IQlv", "IQr", "IQb", "IQwb", "Muslim", "edu_selection")

#unweighted
GG_scatter(d, "IQlv", "GPA", case_names = "NAME")
## `geom_smooth()` using formula 'y ~ x'

GG_save("figs/IQlv GPA unwt.png")
## `geom_smooth()` using formula 'y ~ x'
#weights
GG_scatter(d, "IQlv", "GPA", case_names = "NAME", weights = "n_sqrt")
## `geom_smooth()` using formula 'y ~ x'

GG_save("figs/IQlv GPA wt.png")
## `geom_smooth()` using formula 'y ~ x'
#no dk
GG_scatter(d %>% filter(ISO != "DNK"), "IQlv", "GPA", case_names = "NAME", weights = "n_sqrt")
## `geom_smooth()` using formula 'y ~ x'

GG_save("figs/IQlv GPA wt noDK.png")
## `geom_smooth()` using formula 'y ~ x'
#large only, no dk
GG_scatter(d %>% filter(ISO != "DNK", n >= min_n), "IQlv", "GPA", case_names = "NAME", weights = "n_sqrt")
## `geom_smooth()` using formula 'y ~ x'

GG_save("figs/IQlv GPA wt noDK large.png")
## `geom_smooth()` using formula 'y ~ x'
#correlations
d %>% 
  filter(ISO != "DNK", n >= min_n) %>% 
  {
    print(wtd.cors(select(., GPA, !!main_preds), weight = .$n_sqrt))
    print(pairwiseCount(select(., GPA, !!main_preds)))
  }
##                 GPA   IQlv    IQr    IQb   IQwb Muslim edu_selection
## GPA            1.00  0.469  0.531  0.520  0.451  -0.40         0.351
## IQlv           0.47  1.000  0.978  0.897  0.842  -0.41        -0.042
## IQr            0.53  0.978  1.000  0.902  0.858  -0.44         0.015
## IQb            0.52  0.897  0.902  1.000  0.770  -0.47        -0.013
## IQwb           0.45  0.842  0.858  0.770  1.000  -0.51        -0.086
## Muslim        -0.40 -0.407 -0.442 -0.470 -0.511   1.00        -0.136
## edu_selection  0.35 -0.042  0.015 -0.013 -0.086  -0.14         1.000
##               GPA IQlv IQr IQb IQwb Muslim edu_selection
## GPA            81   81  77  78   72     81            71
## IQlv           81   81  77  78   72     81            71
## IQr            77   77  77  77   71     77            71
## IQb            78   78  77  78   71     78            71
## IQwb           72   72  71  71   72     72            66
## Muslim         81   81  77  78   72     81            71
## edu_selection  71   71  71  71   66     71            71
#combined correlation matrix: unweighted, no n filter (upper triangle); weighted, n >= min_n (lower triangle)
combine_upperlower(
  wtd.cors(select(d %>% filter(ISO != "DNK"), GPA, !!main_preds)),
  
  #weights
  d %>% 
    filter(ISO != "DNK", n >= min_n) %>% 
    {
      wtd.cors(select(., GPA, !!main_preds), weight = .$n_sqrt)
    }
)
##                 GPA   IQlv    IQr    IQb   IQwb Muslim edu_selection
## GPA              NA  0.553  0.591  0.567  0.525  -0.24        0.2619
## IQlv           0.47     NA  0.981  0.929  0.897  -0.36       -0.0024
## IQr            0.53  0.978     NA  0.932  0.923  -0.36        0.0194
## IQb            0.52  0.897  0.902     NA  0.874  -0.43       -0.0029
## IQwb           0.45  0.842  0.858  0.770     NA  -0.39       -0.0316
## Muslim        -0.40 -0.407 -0.442 -0.470 -0.511     NA       -0.1389
## edu_selection  0.35 -0.042  0.015 -0.013 -0.086  -0.14            NA
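combine_upperlower presumably places its first argument in the upper triangle and its second in the lower, with NA on the diagonal (as the output above shows); a sketch of that operation:

#sketch of the assumed combine_upperlower behavior
combine_upperlower_sketch = function(upper, lower) {
  out = matrix(NA, nrow(upper), ncol(upper), dimnames = dimnames(upper))
  out[upper.tri(out)] = upper[upper.tri(upper)]
  out[lower.tri(out)] = lower[lower.tri(lower)]
  out
}
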
#datasets for models
model_preds = c("IQlv", "Muslim", "edu_selection")

d_std_noDK = d %>% 
  df_standardize(messages = F, exclude = c("n", "n_sqrt")) %>% 
  filter(ISO != "DNK") %>% 
  filter(!is.na(IQlv), !is.na(Muslim), !is.na(edu_selection), n >= min_n)

d_noDK = d %>% 
  filter(ISO != "DNK") %>% 
  filter(!is.na(IQlv), !is.na(Muslim), !is.na(edu_selection), n >= min_n)

#models
model_forms = list(
  GPA ~ IQlv,
  GPA ~ Muslim,
  GPA ~ IQlv + Muslim,
  GPA ~ IQlv + Muslim + edu_selection
)

#fits
model_forms %>% 
  map(~ols(., data = d_std_noDK, weights = d_std_noDK$n_sqrt)) %>% 
  summarize_models()
#unstandardized
model_forms %>% 
  map(~ols(., data = d_noDK, weights = d_noDK$n_sqrt)) %>% 
  summarize_models()

By generation

#unweighted
GG_scatter(d_bg, "IQlv", "GPA", case_names = "NAME", color = "first_gen") +
  scale_color_discrete("First generation?") +
  theme(legend.position="bottom")
## `geom_smooth()` using formula 'y ~ x'

GG_save("figs/bg/IQlv GPA unwt.png")
## `geom_smooth()` using formula 'y ~ x'
#weights
GG_scatter(d_bg, "IQlv", "GPA", case_names = "NAME", weights = "n_sqrt", color = "first_gen") +
  scale_color_discrete("First generation?") +
  theme(legend.position="bottom")
## `geom_smooth()` using formula 'y ~ x'

GG_save("figs/bg/IQlv GPA wt.png")
## `geom_smooth()` using formula 'y ~ x'
#weights + no dk
GG_scatter(d_bg %>% filter(ISO != "DNK"), "IQlv", "GPA", case_names = "NAME", weights = "n_sqrt", color = "first_gen") +
  scale_color_discrete("First generation?") +
  theme(legend.position="bottom")
## `geom_smooth()` using formula 'y ~ x'

GG_save("figs/bg/IQlv GPA wt noDK.png")
## `geom_smooth()` using formula 'y ~ x'
#large only, no dk
GG_scatter(d_bg %>% filter(ISO != "DNK", n >= min_n), "IQlv", "GPA", case_names = "NAME", weights = "n_sqrt", color = "first_gen") +
  scale_color_discrete("First generation?") +
  scale_x_continuous("National intelligence of origin country") +
  scale_y_continuous("GPA in Denmark") +
  theme(legend.position="bottom")
## `geom_smooth()` using formula 'y ~ x'

GG_save("figs/bg/IQlv GPA wt noDK large.png")
## `geom_smooth()` using formula 'y ~ x'
#Muslim
GG_scatter(d_bg %>% filter(ISO != "DNK", n >= min_n), "Muslim", "GPA", case_names = "NAME", weights = "n_sqrt", color = "first_gen") +
  scale_color_discrete("First generation?") +
  scale_x_continuous("Muslims in origin country", labels = scales::percent) +
  scale_y_continuous("GPA in Denmark") +
  theme(legend.position="bottom")
## `geom_smooth()` using formula 'y ~ x'

GG_save("figs/bg/Muslim GPA wt noDK large.png")
## `geom_smooth()` using formula 'y ~ x'
#Education selection
GG_scatter(d_bg %>% filter(ISO != "DNK", n >= min_n), "edu_selection", "GPA", case_names = "NAME", weights = "n_sqrt", color = "first_gen") +
  scale_color_discrete("First generation?") +
  scale_x_continuous("Educational selection of migrants") +
  scale_y_continuous("GPA in Denmark") +
  theme(legend.position="bottom")
## `geom_smooth()` using formula 'y ~ x'

GG_save("figs/bg/edu selection GPA wt noDK large.png")
## `geom_smooth()` using formula 'y ~ x'
#datasets for models (standardized and raw)
model_preds = c("IQlv", "Muslim", "first_gen", "edu_selection")

d_bg_std_noDK = d_bg %>% 
  df_standardize(messages = F, exclude = c("n", "n_sqrt")) %>% 
  filter(ISO != "DNK") %>% 
  filter(!is.na(IQlv), !is.na(Muslim), !is.na(first_gen), !is.na(edu_selection), n >= min_n)

d_bg_noDK = d_bg %>% 
  filter(ISO != "DNK") %>% 
  filter(!is.na(IQlv), !is.na(Muslim), !is.na(first_gen), !is.na(edu_selection), n >= min_n)

#models
model_forms = list(
  GPA ~ first_gen + IQlv,
  GPA ~ first_gen + Muslim,
  GPA ~ first_gen + IQlv + Muslim,
  GPA ~ first_gen + IQlv + Muslim + edu_selection
)

#fits
model_forms %>% 
  map(~ols(., data = d_bg_std_noDK, weights = d_bg_std_noDK$n_sqrt)) %>% 
  summarize_models()
#unstandardized
model_forms %>% 
  map(~ols(., data = d_bg_noDK, weights = d_bg_noDK$n_sqrt)) %>% 
  summarize_models()
#model predictions
ols(model_forms[[4]], data = d_bg_noDK %>% mutate(first_gen = factor(first_gen)), weights = d_bg_noDK$n_sqrt) %>% 
  ggeffects::ggpredict(c("IQlv", "first_gen")) %>% 
  plot()

ols(model_forms[[4]], data = d_bg_noDK %>% mutate(first_gen = factor(first_gen)), weights = d_bg_noDK$n_sqrt) %>% 
  ggeffects::ggpredict(c("Muslim", "first_gen")) %>% 
  plot()

ols(model_forms[[4]], data = d_bg_noDK %>% mutate(first_gen = factor(first_gen)), weights = d_bg_noDK$n_sqrt) %>% 
  ggeffects::ggpredict(c("edu_selection", "first_gen")) %>% 
  plot()

By included countries

Vary the minimum sample size and see how this affects results.

#compute
cor_results_by_minimum_n = map_df(5:200, function(min_n) {
  #subset
  d %>% 
    filter(ISO != "DNK", n >= min_n) ->
    d_local
  
  #compute correlations
  cors = wtd.cors(select(d_local, GPA, !!main_preds), weight = d_local$n_sqrt)
  
  #save as long form
  cors %>% 
    reshape2::melt() %>% 
    mutate(
      min_n = min_n,
      n = nrow(d_local)
      )
})

#plot
cor_results_by_minimum_n %>% 
  filter(Var2 == "GPA", Var1 != "GPA") %>% 
  ggplot(aes(min_n, value, color = Var1)) +
  geom_line() +
  scale_color_discrete("Predictor") +
  scale_y_continuous("Correlation with GPA") +
  scale_x_continuous("Minimum required sample size for GPA")

GG_save("figs/min_n_results.png")

#model results
model_results_by_minimum_n = map_df(5:200, function(min_n) {
  #subset
  #standardize inside the loop so the scaling is not affected by the earlier filtering
  d_bg %>% 
    df_standardize(messages = F, exclude = c("n", "n_sqrt")) %>% 
    filter(ISO != "DNK", n >= min_n) ->
    d_local
  
  #fit the model
  model_fit = ols(GPA ~ first_gen + IQlv + Muslim + edu_selection, data = d_local, weights = d_local$n_sqrt)

  #save as long form
  model_fit %>% 
    summary.lm() %>% 
    broom::tidy() %>% 
    mutate(
      min_n = min_n,
      n = nrow(d_local)
    )
})

#plot
model_results_by_minimum_n %>% 
  filter(term != "Intercept") %>% 
  mutate(
    p_low = p.value < .01
  ) %>% 
  ggplot(aes(min_n, estimate, color = term, linetype = p_low)) +
  geom_line() +
  scale_color_discrete("Predictor") +
  scale_linetype_discrete("p < .01") +
  scale_y_continuous("Standardized beta") +
  scale_x_continuous("Minimum required sample size for GPA")

GG_save("figs/min_n_results_model.png")

#correlations of predictors with sample size
d %>% select(!!main_preds, n) %>% wtd.cors()
##                  IQlv    IQr     IQb   IQwb Muslim edu_selection      n
## IQlv           1.0000  0.981  0.9296  0.898 -0.367       -0.0024  0.096
## IQr            0.9811  1.000  0.9323  0.924 -0.361        0.0194  0.108
## IQb            0.9296  0.932  1.0000  0.876 -0.431       -0.0029  0.106
## IQwb           0.8981  0.924  0.8759  1.000 -0.393       -0.0316  0.121
## Muslim        -0.3666 -0.361 -0.4306 -0.393  1.000       -0.1389 -0.053
## edu_selection -0.0024  0.019 -0.0029 -0.032 -0.139        1.0000 -0.321
## n              0.0958  0.108  0.1061  0.121 -0.053       -0.3206  1.000

Maps

Merge as needed and then plot. This is tricky because some origin countries no longer exist.

#GPA data
#which are we missing?
setdiff(d$ISO, worldmap$ISO)
## [1] "YUG"  "KSV"  "SRBM"
#ignore the Yugoslavia problem
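
Had the defunct codes mattered, a hypothetical workaround (not used here) would be to pool them into present-day Serbia before joining, at the cost of conflating Kosovo with Serbia proper:

#hypothetical workaround (not run): pool defunct codes into SRB, weighting by n
# d %>% 
#   mutate(ISO = ifelse(ISO %in% c("YUG", "KSV", "SRBM"), "SRB", ISO)) %>% 
#   group_by(ISO) %>% 
#   summarise(GPA = weighted.mean(GPA, n), n = sum(n))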

#prep GPA data
d_combined = d %>% 
  select(ISO, GPA, n, IQlv, Muslim, edu_selection) %>% 
  left_join(
    d_bg %>% 
      select(ISO, GPA, first_gen) %>% 
      mutate(first_gen = first_gen %>% mapvalues(c(T, F), c("GPA_1st_gen", "GPA_2nd_gen"))) %>% 
      pivot_wider(names_from = "first_gen", values_from = "GPA"),
    by = "ISO"
  )

left_join(worldmap, d_combined, by = "ISO") %>% 
  st_make_valid() ->
  GPA_map_data

#GPA
GPA_map_data %>% 
  tm_shape() +
  tm_polygons("GPA", title = "Mean GPA") ->
  GPA_map

GPA_map

tmap_save(GPA_map, filename = "figs/GPA_map.png")
## Map saved to /media/luks8tb2/Projects/Denmark GPA origin country/figs/GPA_map.png
## Resolution: 3054 by 1444 pixels
## Size: 10 by 4.8 inches (300 dpi)
#1st gen
GPA_map_data %>% 
  tm_shape() +
  tm_polygons("GPA_1st_gen", title = "Mean GPA, 1st gen.") ->
  GPA_1st_map

GPA_1st_map

tmap_save(GPA_1st_map, filename = "figs/GPA_1st_map.png")
## Map saved to /media/luks8tb2/Projects/Denmark GPA origin country/figs/GPA_1st_map.png
## Resolution: 3054 by 1444 pixels
## Size: 10 by 4.8 inches (300 dpi)
#2nd gen
GPA_map_data %>% 
  tm_shape() +
  tm_polygons("GPA_2nd_gen", title = "Mean GPA, 2nd gen.") ->
  GPA_2nd_map

GPA_2nd_map

tmap_save(GPA_2nd_map, filename = "figs/GPA_2nd_map.png")
## Map saved to /media/luks8tb2/Projects/Denmark GPA origin country/figs/GPA_2nd_map.png
## Resolution: 3054 by 1444 pixels
## Size: 10 by 4.8 inches (300 dpi)
#selection index into Denmark
left_join(worldmap, brain_drain_oecd_dk_sum, by = "ISO") %>% 
  st_make_valid() ->
  selection_map_data

selection_map_data %>% 
  tm_shape() +
  tm_polygons("edu_selection", title = "Selection index into Denmark") ->
  selection_map

selection_map
## Variable(s) "edu_selection" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.

tmap_save(selection_map, filename = "figs/selection_map.png")
## Variable(s) "edu_selection" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.
## Map saved to /media/luks8tb2/Projects/Denmark GPA origin country/figs/selection_map.png
## Resolution: 3054 by 1444 pixels
## Size: 10 by 4.8 inches (300 dpi)

PIAAC and employment

#original plot, axes reversed relative to the presumed causal direction
GG_scatter(PIAAC_employment_gaps, "Employment_gap", "PIAAC_gap", case_names = "Country") +
  xlab("Employment rate gap") +
  ylab("PIAAC test score gap")
## `geom_smooth()` using formula 'y ~ x'

#axes in the presumed causal direction: test score gap predicting employment gap
GG_scatter(PIAAC_employment_gaps, "PIAAC_gap", "Employment_gap", case_names = "Country") +
  ylab("Employment rate gap") +
  xlab("PIAAC test score gap")
## `geom_smooth()` using formula 'y ~ x'

GG_save("figs/PIAAC_employment.png")
## `geom_smooth()` using formula 'y ~ x'

Appendix data table

#combine the variables by generation to wide format
d %>% 
  select(ISO, GPA, n, IQlv, Muslim, edu_selection) %>% 
  left_join(
    d_bg %>% 
      select(ISO, GPA, first_gen) %>% 
      mutate(first_gen = first_gen %>% mapvalues(c(T, F), c("GPA_1st_gen", "GPA_2nd_gen"))) %>% 
      pivot_wider(names_from = "first_gen", values_from = "GPA"),
    by = "ISO"
  ) %>% 
  select(ISO, IQlv, Muslim, edu_selection, n, GPA, GPA_1st_gen, GPA_2nd_gen)

Meta

#versions
write_sessioninfo()
## R version 4.0.1 (2020-06-06)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 19.3
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=de_DE.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=de_DE.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggeffects_0.14.3      tmap_3.0              sf_0.9-3             
##  [4] haven_2.3.1           rms_5.1-4             SparseM_1.78         
##  [7] readxl_1.3.1          kirkegaard_2020-07-02 metafor_2.4-0        
## [10] Matrix_1.2-18         psych_1.9.12.31       magrittr_1.5         
## [13] assertthat_0.2.1      weights_1.0.1         mice_3.9.0           
## [16] gdata_2.18.0          Hmisc_4.4-0           Formula_1.2-3        
## [19] survival_3.1-12       lattice_0.20-41       forcats_0.5.0        
## [22] stringr_1.4.0         dplyr_0.8.99.9003     purrr_0.3.4          
## [25] readr_1.3.1           tidyr_1.1.0           tibble_3.0.1         
## [28] ggplot2_3.3.2         tidyverse_1.3.0       pacman_0.5.1         
## 
## loaded via a namespace (and not attached):
##   [1] backports_1.1.8     lwgeom_0.2-4        plyr_1.8.6         
##   [4] sp_1.4-2            splines_4.0.1       crosstalk_1.1.0.1  
##   [7] leaflet_2.0.3       TH.data_1.0-10      digest_0.6.25      
##  [10] htmltools_0.5.0     fansi_0.4.1         checkmate_2.0.0    
##  [13] cluster_2.1.0       modelr_0.1.8        sandwich_2.5-1     
##  [16] jpeg_0.1-8.1        colorspace_1.4-1    blob_1.2.1         
##  [19] rvest_0.3.5         xfun_0.15           leafem_0.1.1       
##  [22] crayon_1.3.4        jsonlite_1.7.0      lme4_1.1-23        
##  [25] zoo_1.8-8           glue_1.4.1          stars_0.4-1        
##  [28] gtable_0.3.0        emmeans_1.4.6       MatrixModels_0.4-1 
##  [31] sjmisc_2.8.4        sjstats_0.18.0      abind_1.4-5        
##  [34] scales_1.1.1        mvtnorm_1.1-0       DBI_1.1.0          
##  [37] Rcpp_1.0.4.6        performance_0.4.6   xtable_1.8-4       
##  [40] viridisLite_0.3.0   htmlTable_2.0.0     tmvnsim_1.0-2      
##  [43] units_0.6-6         foreign_0.8-79      htmlwidgets_1.5.1  
##  [46] httr_1.4.1          RColorBrewer_1.1-2  acepack_1.4.1      
##  [49] ellipsis_0.3.1      pkgconfig_2.0.3     XML_3.99-0.3       
##  [52] farver_2.0.3        nnet_7.3-14         dbplyr_1.4.4       
##  [55] reshape2_1.4.4      tidyselect_1.1.0    labeling_0.3       
##  [58] rlang_0.4.6         effectsize_0.3.1    tmaptools_3.0      
##  [61] munsell_0.5.0       multilevel_2.6      cellranger_1.1.0   
##  [64] tools_4.0.1         cli_2.0.2           generics_0.0.2     
##  [67] sjlabelled_1.1.4    broom_0.5.6         evaluate_0.14      
##  [70] yaml_2.2.1          leafsync_0.1.0      knitr_1.29         
##  [73] fs_1.4.2            nlme_3.1-147        quantreg_5.55      
##  [76] xml2_1.3.2          psychometric_2.2    compiler_4.0.1     
##  [79] rstudioapi_0.11     png_0.1-7           e1071_1.7-3        
##  [82] reprex_0.3.0        statmod_1.4.34      stringi_1.4.6      
##  [85] parameters_0.7.0    nloptr_1.2.2.1      classInt_0.4-3     
##  [88] vctrs_0.3.1         stringdist_0.9.5.5  pillar_1.4.4       
##  [91] lifecycle_0.2.0     pwr_1.3-0           estimability_1.3   
##  [94] data.table_1.12.8   insight_0.8.4       raster_3.1-5       
##  [97] R6_2.4.1            latticeExtra_0.6-29 KernSmooth_2.23-17 
## [100] gridExtra_2.3       codetools_0.2-16    polspline_1.1.19   
## [103] dichromat_2.0-0     boot_1.3-25         MASS_7.3-51.6      
## [106] gtools_3.8.2        withr_2.2.0         mnormt_2.0.1       
## [109] multcomp_1.4-13     mgcv_1.8-31         bayestestR_0.6.0   
## [112] parallel_4.0.1      hms_0.5.3           grid_4.0.1         
## [115] rpart_4.1-15        minqa_1.2.4         coda_0.19-3        
## [118] class_7.3-17        rmarkdown_2.3       lubridate_1.7.9    
## [121] base64enc_0.1-3     rematch_1.0.1
#output data
d %>% write_rds("data/combined.rds", compress = "xz")
d_bg %>% write_rds("data/by_generation.rds", compress = "xz")

#upload to OSF
if (F) {
  library(osfr)
  osf_auth(read_lines("~/.config/osf_token"))
  osf_proj = osf_retrieve_node("https://osf.io/2vz8u/")
  osf_upload(
    osf_proj,
    conflicts = "overwrite", 
    path = c("notebook.Rmd", "notebook.html", "data", "figs"),
    recurse = T
    )
}