Packages and options.
#options
options(digits = 2,
tibble.print_max = 30)
#minimum group size
min_n = 25
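#groups smaller than this are excluded from the main analyses; sensitivity to this threshold is examined below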
#packages
library(pacman)
p_load(kirkegaard, readxl, rms, haven, sf, tmap, ggeffects)
#default theme
theme_set(theme_bw())
#overwrite describe
describe = function(...) psych::describe(...) %>% as.matrix()
#all years mixed together
#we don't use these because we cannot adjust for grade inflation
#but they cover a few more countries
raw_all_years = read_excel("data/Resultater - Folkeskolens Afgangseksamen all years together.xlsx",
range = "A5:D151") %>%
set_colnames(c("country_region", "country", "GPA", "n")) %>%
filter(!is.na(country), country != "<ukendt>", country != "Mellemøsten, uoplyst land")
#read by year data
raw = read_excel("data/data.xlsx", range = "A5:E728") %>%
set_colnames(c("year", "country_region", "country", "GPA", "n"))
#countries only (remove regions and unknowns; "<ukendt>" = unknown, "Mellemøsten, uoplyst land" = Middle East, country unspecified)
countries = raw %>% filter(!is.na(country), country != "<ukendt>", country != "Mellemøsten, uoplyst land")
#mutate: extract the year after the slash in the school-year label, convert to factor, and translate country names to ISO codes
countries %<>%
mutate(
year = str_match(year, "\\/(\\d+)")[, 2] %>% as.numeric(),
year_fct = factor(year),
ISO = pu_translate(country)
)
## No exact match: Saudi Arabien
## Best fuzzy match found: Saudi Arabien -> Saudi-Arabien with distance 1.00
#average across years
#beware grade inflation: scores must be put on a common basis across years
#we fit a model and regress out the year effect
#adding back the intercept preserves the original grade scale
GPA_inflation_adjust_fit = ols(GPA ~ year_fct, data = countries, weights = countries$n)
countries$GPA_adj = GPA_inflation_adjust_fit %>% resid() %>% add(coef(GPA_inflation_adjust_fit)[1])
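#a minimal sketch with toy (hypothetical) data of why this works: the residuals
#remove the year effect, and adding back the intercept recenters scores on the
#reference year's level, keeping the original grade scale
# toy = data.frame(GPA = c(6.0, 6.5, 7.0, 7.5), year_fct = factor(c(2015, 2015, 2016, 2016)))
# toy_fit = lm(GPA ~ year_fct, data = toy)
# resid(toy_fit) + coef(toy_fit)[1] #2016 scores shift down by the year effect (1.0), 2015 unchanged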
countries_avg = plyr::ddply(countries, c("country", "ISO"), function(dd) {
tibble(
n = wtd_sum(dd$n),
n_sqrt = sqrt(n),
GPA = wtd_mean(dd$GPA_adj, dd$n)
)
})
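#n_sqrt is used as the regression/correlation weight below; square-root weighting tempers the influence of very large groups such as Denmark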
#by generation
raw_bg = read_excel("data/data generation.xlsx", range = "A5:F1107") %>%
set_colnames(c("year", "generation", "country_region", "country", "GPA", "n"))
#countries only (remove regions and unknowns)
countries_bg = raw_bg %>% filter(!is.na(country), country != "<ukendt>", country != "Mellemøsten, uoplyst land")
#mutate; "Indvandrer" is Danish for immigrant, i.e. first generation
countries_bg %<>%
mutate(
year = str_match(year, "\\/(\\d+)")[, 2] %>% as.numeric(),
year_fct = factor(year),
ISO = pu_translate(country),
first_gen = (generation == "Indvandrer")
)
## No exact match: Saudi Arabien
## Best fuzzy match found: Saudi Arabien -> Saudi-Arabien with distance 1.00
#average across years
GPA_inflation_adjust_bg_fit = ols(GPA ~ year_fct, data = countries_bg, weights = countries_bg$n)
countries_bg$GPA_adj = GPA_inflation_adjust_bg_fit %>% resid() %>% add(coef(GPA_inflation_adjust_bg_fit)[1])
countries_avg_bg = plyr::ddply(countries_bg, c("country", "ISO", "first_gen"), function(dd) {
tibble(
n = wtd_sum(dd$n),
n_sqrt = sqrt(n),
GPA = wtd_mean(dd$GPA_adj, dd$n)
)
})
#national data from prior studies
#danish immigrant and stereotype study
# dk_immigrant_pref_study = read_rds("data/dk_immigrant_pref_global_envir.rds")
#dutch stereotype study
nl_stereotype_study = read_rds("data/dutch-stereotype_data_out.rds")
#cognitive elite study
natl_time_pref_study = read_rds("data/time_prefdata_out.rds")
#danish GPA 2015 study https://openpsych.net/paper/36
# dk_gpa_study = read_rds("data/")
#worldbank human capital index
worldbank_hci_2018 = read_excel("data/hcidataoctober2018.xlsx", sheet = 4) %>%
df_legalize_names() %>%
mutate(
ISO = pu_translate(Country_Name),
IQwb = HUMAN_CAPITAL_INDEX
)
## No exact match: eSwatini
## Best fuzzy match found: eSwatini -> Eswatini with distance 2.00
#PIAAC and employment gaps
PIAAC_employment_gaps = read_excel("data/PIAAC employment.xlsx")
#spatial data
worldmap = read_sf("data/TM_WORLD_BORDERS-0.3/TM_WORLD_BORDERS-0.3.shp") %>%
rename(
ISO = ISO3
)
#brain drain
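#IAB brain drain dataset: emigrant counts by education level (low/med/high) per origin, destination, year, and sex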
brain_drain_oecd = read_dta("data/brain drain/iabbd_8010_v1.dta") %>%
mutate(
ISO = ccode_origin,
high_fraction = high / tot
)
#Denmark as destination, years 1995-2005
brain_drain_oecd_dk = brain_drain_oecd %>%
filter(destination == "Denmark")
#sum across years and sexes
brain_drain_oecd_dk_sum = brain_drain_oecd_dk %>%
plyr::ddply(c("origin", "ISO"), function(dd) {
tibble(
total = sum(dd$tot),
low = sum(dd$low),
med = sum(dd$med),
high = sum(dd$high),
high_fraction = high / total
)
}) %>%
#filter to origins with n >= min_n
filter(total >= min_n)
#merge
brain_drain_oecd_dk_sum = brain_drain_oecd_dk_sum %>% left_join(natl_time_pref_study %>% select(ISO, Mean_years_of_schooling), by = "ISO")
#model
(brain_drain_model = ols(high_fraction ~ Mean_years_of_schooling, data = brain_drain_oecd_dk_sum))
## Frequencies of Missing Values Due to Each Variable
## high_fraction Mean_years_of_schooling
## 0 10
##
## Linear Regression Model
##
## ols(formula = high_fraction ~ Mean_years_of_schooling, data = brain_drain_oecd_dk_sum)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 156 LR chi2 9.46 R2 0.059
## sigma 0.0816 d.f. 1 R2 adj 0.053
## d.f. 154 Pr(> chi2) 0.0021 g 0.023
##
## Residuals
##
## Min 1Q Median 3Q Max
## -0.199226 -0.051772 -0.001017 0.049583 0.226157
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 0.1993 0.0188 10.59 <0.0001
## Mean_years_of_schooling 0.0064 0.0021 3.10 0.0023
##
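#the standardized residual measures how much more (or less) educated emigrants to Denmark are than their origin country's mean schooling predicts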
brain_drain_oecd_dk_sum$edu_selection = resid(brain_drain_model) %>% standardize()
#check correlations, then plot
brain_drain_oecd_dk_sum %>% select(edu_selection, high_fraction, Mean_years_of_schooling) %>% wtd.cors()
## edu_selection high_fraction Mean_years_of_schooling
## edu_selection 1.0e+00 0.97 -5.2e-17
## high_fraction 9.7e-01 1.00 2.4e-01
## Mean_years_of_schooling -5.2e-17 0.24 1.0e+00
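#note: edu_selection correlates ~0 with Mean_years_of_schooling by construction (OLS residuals are orthogonal to the predictor)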
GG_scatter(brain_drain_oecd_dk_sum, "Mean_years_of_schooling", "high_fraction") +
ylab("High education % among emigrants to Denmark")
## `geom_smooth()` using formula 'y ~ x'
GG_save("figs/brain_brain_to_dk.png")
## `geom_smooth()` using formula 'y ~ x'
#selection by year
brain_drain_oecd_dk_sum_by_year = brain_drain_oecd %>%
filter(destination == "Denmark") %>%
plyr::ddply(c("origin", "ISO", "year"), function(dd) {
tibble(
total = sum(dd$tot),
low = sum(dd$low),
med = sum(dd$med),
high = sum(dd$high),
high_fraction = high / total
)
}) %>%
#filter to origins with n >= min_n
filter(total >= min_n)
#how much of the variance is between countries vs. between years?
lm(high_fraction ~ ISO + year, data = brain_drain_oecd_dk_sum_by_year, weights = brain_drain_oecd_dk_sum_by_year$total) %>%
stats::aov() %>%
sjstats::anova_stats()
#merge
d = left_join(countries_avg, natl_time_pref_study %>% select(ISO, NAME, IQlv, IQr, IQb, IslamPewResearch2010), by = "ISO") %>%
left_join(nl_stereotype_study %>% select(ISO, Muslim), by = "ISO") %>%
left_join(brain_drain_oecd_dk_sum %>% select(ISO, high_fraction, edu_selection)) %>%
left_join(worldbank_hci_2018 %>% select(ISO, IQwb), by = "ISO") %>%
mutate(
NAME = pu_translate(ISO, reverse = T)
)
## Joining, by = "ISO"
assert_that(!any(duplicated(d$ISO)))
## [1] TRUE
#merge bg
d_bg = left_join(countries_avg_bg, natl_time_pref_study %>% select(ISO, NAME, IQlv, IQr, IQb, IslamPewResearch2010), by = "ISO") %>%
left_join(nl_stereotype_study %>% select(ISO, Muslim), by = "ISO") %>%
left_join(brain_drain_oecd_dk_sum %>% select(ISO, high_fraction, edu_selection)) %>%
left_join(worldbank_hci_2018 %>% select(ISO, IQwb), by = "ISO") %>%
mutate(
NAME = pu_translate(ISO, reverse = T)
)
## Joining, by = "ISO"
#fill in missing Muslim shares using Pew Research 2010 data
d$Muslim = miss_fill(d$Muslim, d$IslamPewResearch2010)
d_bg$Muslim = miss_fill(d_bg$Muslim, d_bg$IslamPewResearch2010)
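#miss_fill presumably fills NAs in the first vector from the second, i.e. roughly:
# d$Muslim = ifelse(is.na(d$Muslim), d$IslamPewResearch2010, d$Muslim)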
#fill in remaining missing values by hand
#Kosovo is 95.6% Muslim
#https://www.britannica.com/place/Kosovo/Religion
d$Muslim[which(d$ISO == "KSV")] = .956
d_bg$Muslim[which(d_bg$ISO == "KSV")] = .956
#Serbia and Montenegro
#Montenegro 3.3% muslim https://www.cia.gov/library/publications/the-world-factbook/geos/print_mj.html
#Serbia 3.1% https://en.wikipedia.org/wiki/Islam_in_Serbia
d$Muslim[which(d$ISO == "SRBM")] = .031
d_bg$Muslim[which(d_bg$ISO == "SRBM")] = .031
#use Serbia's IQ for Kosovo, Serbia and Montenegro, and Yugoslavia
d$IQlv[which(d$ISO %in% c("KSV", "SRBM", "YUG"))] = 90.3
d_bg$IQlv[which(d_bg$ISO %in% c("KSV", "SRBM", "YUG"))] = 90.3
#use Becker's geographic estimate for the Cayman Islands
d$IQlv[which(d$ISO %in% c("CYM"))] = 82
d_bg$IQlv[which(d_bg$ISO %in% c("CYM"))] = 82
#describe the input data
countries$n %>% describe()
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 663 609 5301 13 33 13 3 53313 53310 9.6 89 206
countries$n %>% sum()
## [1] 4e+05
countries_avg$n %>% describe()
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 116 3483 33685 60 168 79 3 363055 363052 10 109 3128
countries_avg$n %>% sum()
## [1] 4e+05
#without Denmark
countries_avg %>%
filter(ISO != "DNK") %>%
.$n %>% describe()
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 115 356 811 59 156 76 3 5677 5674 3.9 18 76
#main predictors
main_preds = c("IQlv", "IQr", "IQb", "IQwb", "Muslim", "edu_selection")
#unweighted
GG_scatter(d, "IQlv", "GPA", case_names = "NAME")
## `geom_smooth()` using formula 'y ~ x'
GG_save("figs/IQlv GPA unwt.png")
## `geom_smooth()` using formula 'y ~ x'
#weights
GG_scatter(d, "IQlv", "GPA", case_names = "NAME", weights = "n_sqrt")
## `geom_smooth()` using formula 'y ~ x'
GG_save("figs/IQlv GPA wt.png")
## `geom_smooth()` using formula 'y ~ x'
#no dk
GG_scatter(d %>% filter(ISO != "DNK"), "IQlv", "GPA", case_names = "NAME", weights = "n_sqrt")
## `geom_smooth()` using formula 'y ~ x'
GG_save("figs/IQlv GPA wt noDK.png")
## `geom_smooth()` using formula 'y ~ x'
#large only, no dk
GG_scatter(d %>% filter(ISO != "DNK", n >= min_n), "IQlv", "GPA", case_names = "NAME", weights = "n_sqrt")
## `geom_smooth()` using formula 'y ~ x'
GG_save("figs/IQlv GPA wt noDK large.png")
## `geom_smooth()` using formula 'y ~ x'
#correlations
d %>%
filter(ISO != "DNK", n >= min_n) %>%
{
print(wtd.cors(select(., GPA, !!main_preds), weight = .$n_sqrt))
print(pairwiseCount(select(., GPA, !!main_preds)))
}
## GPA IQlv IQr IQb IQwb Muslim edu_selection
## GPA 1.00 0.469 0.531 0.520 0.451 -0.40 0.351
## IQlv 0.47 1.000 0.978 0.897 0.842 -0.41 -0.042
## IQr 0.53 0.978 1.000 0.902 0.858 -0.44 0.015
## IQb 0.52 0.897 0.902 1.000 0.770 -0.47 -0.013
## IQwb 0.45 0.842 0.858 0.770 1.000 -0.51 -0.086
## Muslim -0.40 -0.407 -0.442 -0.470 -0.511 1.00 -0.136
## edu_selection 0.35 -0.042 0.015 -0.013 -0.086 -0.14 1.000
## GPA IQlv IQr IQb IQwb Muslim edu_selection
## GPA 81 81 77 78 72 81 71
## IQlv 81 81 77 78 72 81 71
## IQr 77 77 77 77 71 77 71
## IQb 78 78 77 78 71 78 71
## IQwb 72 72 71 71 72 72 66
## Muslim 81 81 77 78 72 81 71
## edu_selection 71 71 71 71 66 71 71
#combined matrix: unweighted, all countries excl. Denmark (upper triangle) vs. weighted, n >= min_n (lower triangle)
combine_upperlower(
wtd.cors(select(d %>% filter(ISO != "DNK"), GPA, !!main_preds)),
#weighted, n >= min_n (lower triangle)
d %>%
filter(ISO != "DNK", n >= min_n) %>%
{
wtd.cors(select(., GPA, !!main_preds), weight = .$n_sqrt)
}
)
## GPA IQlv IQr IQb IQwb Muslim edu_selection
## GPA NA 0.553 0.591 0.567 0.525 -0.24 0.2619
## IQlv 0.47 NA 0.981 0.929 0.897 -0.36 -0.0024
## IQr 0.53 0.978 NA 0.932 0.923 -0.36 0.0194
## IQb 0.52 0.897 0.902 NA 0.874 -0.43 -0.0029
## IQwb 0.45 0.842 0.858 0.770 NA -0.39 -0.0316
## Muslim -0.40 -0.407 -0.442 -0.470 -0.511 NA -0.1389
## edu_selection 0.35 -0.042 0.015 -0.013 -0.086 -0.14 NA
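#a base-R sketch of what the upper/lower combination presumably does (combine_tri is a hypothetical name):
# combine_tri = function(upper_mat, lower_mat) {
#   out = upper_mat
#   out[lower.tri(out)] = lower_mat[lower.tri(lower_mat)]
#   diag(out) = NA
#   out
# }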
#datasets for models
model_preds = c("IQlv", "Muslim", "edu_selection")
d_std_noDK = d %>% df_standardize(messages = F, exclude = c("n", "n_sqrt")) %>% filter(ISO != "DNK") %>% filter(!is.na(IQlv), !is.na(Muslim), !is.na(edu_selection), n >= min_n)
d_noDK = d %>% filter(ISO != "DNK") %>% filter(!is.na(IQlv), !is.na(Muslim), !is.na(edu_selection), n >= min_n)
#models
model_forms = list(
GPA ~ IQlv,
GPA ~ Muslim,
GPA ~ IQlv + Muslim,
GPA ~ IQlv + Muslim + edu_selection
)
#fits, standardized
model_forms %>%
map(~ols(., data = d_std_noDK, weights = d_std_noDK$n_sqrt)) %>%
summarize_models()
#unstandardized
model_forms %>%
map(~ols(., data = d_noDK, weights = d_noDK$n_sqrt)) %>%
summarize_models()
#unweighted
GG_scatter(d_bg, "IQlv", "GPA", case_names = "NAME", color = "first_gen") +
scale_color_discrete("First generation?") +
theme(legend.position="bottom")
## `geom_smooth()` using formula 'y ~ x'
GG_save("figs/bg/IQlv GPA unwt.png")
## `geom_smooth()` using formula 'y ~ x'
#weights
GG_scatter(d_bg, "IQlv", "GPA", case_names = "NAME", weights = "n_sqrt", color = "first_gen") +
scale_color_discrete("First generation?") +
theme(legend.position="bottom")
## `geom_smooth()` using formula 'y ~ x'
GG_save("figs/bg/IQlv GPA wt.png")
## `geom_smooth()` using formula 'y ~ x'
#weights + no dk
GG_scatter(d_bg %>% filter(ISO != "DNK"), "IQlv", "GPA", case_names = "NAME", weights = "n_sqrt", color = "first_gen") +
scale_color_discrete("First generation?") +
theme(legend.position="bottom")
## `geom_smooth()` using formula 'y ~ x'
GG_save("figs/bg/IQlv GPA wt noDK.png")
## `geom_smooth()` using formula 'y ~ x'
#large only, no dk
GG_scatter(d_bg %>% filter(ISO != "DNK", n >= min_n), "IQlv", "GPA", case_names = "NAME", weights = "n_sqrt", color = "first_gen") +
scale_color_discrete("First generation?") +
scale_x_continuous("National intelligence of origin country") +
scale_y_continuous("GPA in Denmark") +
theme(legend.position="bottom")
## `geom_smooth()` using formula 'y ~ x'
GG_save("figs/bg/IQlv GPA wt noDK large.png")
## `geom_smooth()` using formula 'y ~ x'
#Muslim
GG_scatter(d_bg %>% filter(ISO != "DNK", n >= min_n), "Muslim", "GPA", case_names = "NAME", weights = "n_sqrt", color = "first_gen") +
scale_color_discrete("First generation?") +
scale_x_continuous("Muslims in origin country", labels = scales::percent) +
scale_y_continuous("GPA in Denmark") +
theme(legend.position="bottom")
## `geom_smooth()` using formula 'y ~ x'
GG_save("figs/bg/Muslim GPA wt noDK large.png")
## `geom_smooth()` using formula 'y ~ x'
#Education selection
GG_scatter(d_bg %>% filter(ISO != "DNK", n >= min_n), "edu_selection", "GPA", case_names = "NAME", weights = "n_sqrt", color = "first_gen") +
scale_color_discrete("First generation?") +
scale_x_continuous("Educational selection of migrants") +
scale_y_continuous("GPA in Denmark") +
theme(legend.position="bottom")
## `geom_smooth()` using formula 'y ~ x'
GG_save("figs/bg/edu selection GPA wt noDK large.png")
## `geom_smooth()` using formula 'y ~ x'
#datasets for models
model_preds = c("IQlv", "Muslim", "first_gen", "edu_selection")
d_bg_std_noDK = d_bg %>% df_standardize(messages = F, exclude = c("n", "n_sqrt")) %>% filter(ISO != "DNK") %>% filter(!is.na(IQlv), !is.na(Muslim), !is.na(first_gen), !is.na(edu_selection), n >= min_n)
d_bg_noDK = d_bg %>% filter(ISO != "DNK") %>% filter(!is.na(IQlv), !is.na(Muslim), !is.na(first_gen), !is.na(edu_selection), n >= min_n)
#models
model_forms = list(
GPA ~ first_gen + IQlv,
GPA ~ first_gen + Muslim,
GPA ~ first_gen + IQlv + Muslim,
GPA ~ first_gen + IQlv + Muslim + edu_selection
)
#fits, standardized
model_forms %>%
map(~ols(., data = d_bg_std_noDK, weights = d_bg_std_noDK$n_sqrt)) %>%
summarize_models()
#unstandardized
model_forms %>%
map(~ols(., data = d_bg_noDK, weights = d_bg_noDK$n_sqrt)) %>%
summarize_models()
#model predictions: fit the full model once, with first_gen as a factor for plotting
full_bg_fit = ols(model_forms[[4]], data = d_bg_noDK %>% mutate(first_gen = factor(first_gen)), weights = d_bg_noDK$n_sqrt)
full_bg_fit %>% ggeffects::ggpredict(c("IQlv", "first_gen")) %>% plot()
full_bg_fit %>% ggeffects::ggpredict(c("Muslim", "first_gen")) %>% plot()
full_bg_fit %>% ggeffects::ggpredict(c("edu_selection", "first_gen")) %>% plot()
Vary the minimum sample size requirement and see how this affects the results.
#compute
cor_results_by_minimum_n = map_df(5:200, function(min_n) {
#subset
d %>%
filter(ISO != "DNK", n >= min_n) ->
d_local
#compute correlations
cors = wtd.cors(select(d_local, GPA, !!main_preds), weight = d_local$n_sqrt)
#save as long form
cors %>%
reshape2::melt() %>%
mutate(
min_n = min_n,
n = nrow(d_local)
)
})
#plot
cor_results_by_minimum_n %>%
filter(Var2 == "GPA", Var1 != "GPA") %>%
ggplot(aes(min_n, value, color = Var1)) +
geom_line() +
scale_color_discrete("Predictor") +
scale_y_continuous("Correlation with GPA") +
scale_x_continuous("Minimum required sample size for GPA")
GG_save("figs/min_n_results.png")
#model results
model_results_by_minimum_n = map_df(5:200, function(min_n) {
#subset
#re-standardize from the full dataset so the scaling is not affected by the earlier min_n filtering
d_bg %>%
df_standardize(messages = F, exclude = c("n", "n_sqrt")) %>%
filter(ISO != "DNK", n >= min_n) ->
d_local
#fit model
model_fit = ols(GPA ~ first_gen + IQlv + Muslim + edu_selection, data = d_local, weights = d_local$n_sqrt)
#save as long form
model_fit %>%
summary.lm() %>%
broom::tidy() %>%
mutate(
min_n = min_n,
n = nrow(d_local)
)
})
#plot
model_results_by_minimum_n %>%
filter(term != "Intercept") %>%
mutate(
p_low = p.value < .01
) %>%
ggplot(aes(min_n, estimate, color = term, linetype = p_low)) +
geom_line() +
scale_color_discrete("Predictor") +
scale_linetype_discrete("p < .01") +
scale_y_continuous("Standardized beta") +
scale_x_continuous("Minimum required sample size for GPA")
GG_save("figs/min_n_results_model.png")
#correlations of predictors with sample size
d %>% select(!!main_preds, n) %>% wtd.cors()
## IQlv IQr IQb IQwb Muslim edu_selection n
## IQlv 1.0000 0.981 0.9296 0.898 -0.367 -0.0024 0.096
## IQr 0.9811 1.000 0.9323 0.924 -0.361 0.0194 0.108
## IQb 0.9296 0.932 1.0000 0.876 -0.431 -0.0029 0.106
## IQwb 0.8981 0.924 0.8759 1.000 -0.393 -0.0316 0.121
## Muslim -0.3666 -0.361 -0.4306 -0.393 1.000 -0.1389 -0.053
## edu_selection -0.0024 0.019 -0.0029 -0.032 -0.139 1.0000 -0.321
## n 0.0958 0.108 0.1061 0.121 -0.053 -0.3206 1.000
Merge as needed and then plot. This is tricky because some of the origin countries no longer exist.
#GPA data
#which ISO codes are missing from the world map?
setdiff(d$ISO, worldmap$ISO)
## [1] "YUG" "KSV" "SRBM"
#ignore the Yugoslavia problem
#prep GPA data
d_combined = d %>% select(ISO, GPA, n, IQlv, Muslim, edu_selection) %>%
left_join(d_bg %>% select(ISO, GPA, first_gen) %>% mutate(first_gen = first_gen %>% mapvalues(c(T, F), c("GPA_1st_gen", "GPA_2nd_gen"))) %>% pivot_wider(names_from = "first_gen", values_from = "GPA"), by = "ISO")
left_join(worldmap, d_combined, by = "ISO") %>%
st_make_valid() ->
GPA_map_data
#GPA
GPA_map_data %>%
tm_shape() +
tm_polygons("GPA", title = "Mean GPA") ->
GPA_map
GPA_map
tmap_save(GPA_map, filename = "figs/GPA_map.png")
## Map saved to /media/luks8tb2/Projects/Denmark GPA origin country/figs/GPA_map.png
## Resolution: 3054 by 1444 pixels
## Size: 10 by 4.8 inches (300 dpi)
#1st gen
GPA_map_data %>%
tm_shape() +
tm_polygons("GPA_1st_gen", title = "Mean GPA, 1st gen.") ->
GPA_1st_map
GPA_1st_map
tmap_save(GPA_1st_map, filename = "figs/GPA_1st_map.png")
## Map saved to /media/luks8tb2/Projects/Denmark GPA origin country/figs/GPA_1st_map.png
## Resolution: 3054 by 1444 pixels
## Size: 10 by 4.8 inches (300 dpi)
#2nd gen
GPA_map_data %>%
tm_shape() +
tm_polygons("GPA_2nd_gen", title = "Mean GPA, 2nd gen.") ->
GPA_2nd_map
GPA_2nd_map
tmap_save(GPA_2nd_map, filename = "figs/GPA_2nd_map.png")
## Map saved to /media/luks8tb2/Projects/Denmark GPA origin country/figs/GPA_2nd_map.png
## Resolution: 3054 by 1444 pixels
## Size: 10 by 4.8 inches (300 dpi)
#selection index into Denmark
left_join(worldmap, brain_drain_oecd_dk_sum, by = "ISO") %>%
st_make_valid() ->
selection_map_data
selection_map_data %>%
tm_shape() +
tm_polygons("edu_selection", title = "Selection index into Denmark") ->
selection_map
selection_map
## Variable(s) "edu_selection" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.
tmap_save(selection_map, filename = "figs/selection_map.png")
## Variable(s) "edu_selection" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.
## Map saved to /media/luks8tb2/Projects/Denmark GPA origin country/figs/selection_map.png
## Resolution: 3054 by 1444 pixels
## Size: 10 by 4.8 inches (300 dpi)
#original plot, with axes reversed relative to the presumed causal direction
GG_scatter(PIAAC_employment_gaps, "Employment_gap", "PIAAC_gap", case_names = "Country") +
xlab("Employment rate gap") +
ylab("PIAAC test score gap")
## `geom_smooth()` using formula 'y ~ x'
#replotted in the causal direction
GG_scatter(PIAAC_employment_gaps, "PIAAC_gap", "Employment_gap", case_names = "Country") +
ylab("Employment rate gap") +
xlab("PIAAC test score gap")
## `geom_smooth()` using formula 'y ~ x'
GG_save("figs/PIAAC_employment.png")
## `geom_smooth()` using formula 'y ~ x'
#combine the variables by generation to wide format
d %>% select(ISO, GPA, n, IQlv, Muslim, edu_selection) %>%
left_join(d_bg %>% select(ISO, GPA, first_gen) %>% mutate(first_gen = first_gen %>% mapvalues(c(T, F), c("GPA_1st_gen", "GPA_2nd_gen"))) %>% pivot_wider(names_from = "first_gen", values_from = "GPA"), by = "ISO") %>%
select(ISO, IQlv, Muslim, edu_selection, n, GPA, GPA_1st_gen, GPA_2nd_gen)
#versions
write_sessioninfo()
## R version 4.0.1 (2020-06-06)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 19.3
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=de_DE.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=de_DE.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggeffects_0.14.3 tmap_3.0 sf_0.9-3
## [4] haven_2.3.1 rms_5.1-4 SparseM_1.78
## [7] readxl_1.3.1 kirkegaard_2020-07-02 metafor_2.4-0
## [10] Matrix_1.2-18 psych_1.9.12.31 magrittr_1.5
## [13] assertthat_0.2.1 weights_1.0.1 mice_3.9.0
## [16] gdata_2.18.0 Hmisc_4.4-0 Formula_1.2-3
## [19] survival_3.1-12 lattice_0.20-41 forcats_0.5.0
## [22] stringr_1.4.0 dplyr_0.8.99.9003 purrr_0.3.4
## [25] readr_1.3.1 tidyr_1.1.0 tibble_3.0.1
## [28] ggplot2_3.3.2 tidyverse_1.3.0 pacman_0.5.1
##
## loaded via a namespace (and not attached):
## [1] backports_1.1.8 lwgeom_0.2-4 plyr_1.8.6
## [4] sp_1.4-2 splines_4.0.1 crosstalk_1.1.0.1
## [7] leaflet_2.0.3 TH.data_1.0-10 digest_0.6.25
## [10] htmltools_0.5.0 fansi_0.4.1 checkmate_2.0.0
## [13] cluster_2.1.0 modelr_0.1.8 sandwich_2.5-1
## [16] jpeg_0.1-8.1 colorspace_1.4-1 blob_1.2.1
## [19] rvest_0.3.5 xfun_0.15 leafem_0.1.1
## [22] crayon_1.3.4 jsonlite_1.7.0 lme4_1.1-23
## [25] zoo_1.8-8 glue_1.4.1 stars_0.4-1
## [28] gtable_0.3.0 emmeans_1.4.6 MatrixModels_0.4-1
## [31] sjmisc_2.8.4 sjstats_0.18.0 abind_1.4-5
## [34] scales_1.1.1 mvtnorm_1.1-0 DBI_1.1.0
## [37] Rcpp_1.0.4.6 performance_0.4.6 xtable_1.8-4
## [40] viridisLite_0.3.0 htmlTable_2.0.0 tmvnsim_1.0-2
## [43] units_0.6-6 foreign_0.8-79 htmlwidgets_1.5.1
## [46] httr_1.4.1 RColorBrewer_1.1-2 acepack_1.4.1
## [49] ellipsis_0.3.1 pkgconfig_2.0.3 XML_3.99-0.3
## [52] farver_2.0.3 nnet_7.3-14 dbplyr_1.4.4
## [55] reshape2_1.4.4 tidyselect_1.1.0 labeling_0.3
## [58] rlang_0.4.6 effectsize_0.3.1 tmaptools_3.0
## [61] munsell_0.5.0 multilevel_2.6 cellranger_1.1.0
## [64] tools_4.0.1 cli_2.0.2 generics_0.0.2
## [67] sjlabelled_1.1.4 broom_0.5.6 evaluate_0.14
## [70] yaml_2.2.1 leafsync_0.1.0 knitr_1.29
## [73] fs_1.4.2 nlme_3.1-147 quantreg_5.55
## [76] xml2_1.3.2 psychometric_2.2 compiler_4.0.1
## [79] rstudioapi_0.11 png_0.1-7 e1071_1.7-3
## [82] reprex_0.3.0 statmod_1.4.34 stringi_1.4.6
## [85] parameters_0.7.0 nloptr_1.2.2.1 classInt_0.4-3
## [88] vctrs_0.3.1 stringdist_0.9.5.5 pillar_1.4.4
## [91] lifecycle_0.2.0 pwr_1.3-0 estimability_1.3
## [94] data.table_1.12.8 insight_0.8.4 raster_3.1-5
## [97] R6_2.4.1 latticeExtra_0.6-29 KernSmooth_2.23-17
## [100] gridExtra_2.3 codetools_0.2-16 polspline_1.1.19
## [103] dichromat_2.0-0 boot_1.3-25 MASS_7.3-51.6
## [106] gtools_3.8.2 withr_2.2.0 mnormt_2.0.1
## [109] multcomp_1.4-13 mgcv_1.8-31 bayestestR_0.6.0
## [112] parallel_4.0.1 hms_0.5.3 grid_4.0.1
## [115] rpart_4.1-15 minqa_1.2.4 coda_0.19-3
## [118] class_7.3-17 rmarkdown_2.3 lubridate_1.7.9
## [121] base64enc_0.1-3 rematch_1.0.1
#output data
d %>% write_rds("data/combined.rds", compress = "xz")
d_bg %>% write_rds("data/by_generation.rds", compress = "xz")
#upload to OSF
if (F) {
library(osfr)
osf_auth(read_lines("~/.config/osf_token"))
osf_proj = osf_retrieve_node("https://osf.io/2vz8u/")
osf_upload(
osf_proj,
conflicts = "overwrite",
path = c("notebook.Rmd", "notebook.html", "data", "figs"),
recurse = T
)
}