Simulation
#simulate data that correlates with NIQs at random
set.seed(1)
for (i in 1:50) {
d_niq[[str_glue("cor_{i}")]] = standardize(d_niq$NIQ) + rnorm(n = nrow(d_niq), sd = runif(1, min = 0.5, max = 4))
}
#distribution of correlations with NIQ
d_niq_cors = d_niq %>% select(NIQ, starts_with("cor_")) %>% wtd.cors() %>% .[-1, 1]
d_niq_cors %>% describe2()
d_niq_cors %>% GG_denhist()
## Input seems like a fraction, set `boundary=0` and `binwidth=1/30` to avoid issues near the limits. Disable this with `auto_fraction_bounary=F`

#pick 5 random varialbes, and impute IQs below 75- based on the 75+ data
set.seed(1)
d1_imp_res = map_df(1:1000, function(sim) {
# browser()
#which vars
pred_vars = str_glue("cor_{sample(1:50, 5)}")
#formula
form = str_glue("NIQ ~ {str_c(pred_vars, collapse = ' + ')}")
#subset idx
subset_idx = (d_niq$NIQ >= 75)
#fit
niq_lm = lm(form, data = d_niq[subset_idx, ])
#impute
niq_lm_pred = predict(niq_lm, newdata = d_niq[!subset_idx, ])
tibble(
sim = sim,
mean_imputed = mean(niq_lm_pred),
model_rsq = niq_lm %>% broom::glance() %>% .[["adj.r.squared"]]
)
})
#true value
true_mean_below_75 = d_niq %>% filter(NIQ <= 75) %>% pull(NIQ) %>% mean()
#imputed IQ vs. model accuracy
p1 = d1_imp_res %>%
ggplot(aes(model_rsq, mean_imputed)) +
geom_point() +
geom_smooth(method = "lm", fullrange = T) +
geom_hline(yintercept = true_mean_below_75, linetype = 2) +
scale_x_continuous(limits = c(0, 1)) +
labs(
title = "Predict countries below 75 IQ",
x = "Model R²",
y = "Average predicted IQ"
)
#predicted value if perfect model accuracy
lm_pred_models = lm(mean_imputed ~ model_rsq, data = d1_imp_res)
predict(lm_pred_models, newdata = data.frame(model_rsq = 1))
## 1
## 69.6
#how many below 75?
(d_niq$NIQ <= 75) %>% sum()
## [1] 50
(d_niq$NIQ >= 90) %>% sum()
## [1] 50
#repeat for those
set.seed(1)
d1_imp_res2 = map_df(1:1000, function(sim) {
# browser()
#which vars
pred_vars = str_glue("cor_{sample(1:50, 5)}")
#formula
form = str_glue("NIQ ~ {str_c(pred_vars, collapse = ' + ')}")
#subset idx
subset_idx = (d_niq$NIQ <= 90)
#fit
niq_lm = lm(form, data = d_niq[subset_idx, ])
#impute
niq_lm_pred = predict(niq_lm, newdata = d_niq[!subset_idx, ])
tibble(
sim = sim,
mean_imputed = mean(niq_lm_pred),
model_rsq = niq_lm %>% broom::glance() %>% .[["adj.r.squared"]]
)
})
#true value
true_mean_above_90 = d_niq %>% filter(NIQ >= 90) %>% pull(NIQ) %>% mean()
#imputed IQ vs. model accuracy
p2 = d1_imp_res2 %>%
ggplot(aes(model_rsq, mean_imputed)) +
geom_point() +
geom_smooth(method = "lm", fullrange = T) +
geom_hline(yintercept = true_mean_above_90, linetype = 2) +
scale_x_continuous(limits = c(0, 1)) +
labs(
title = "Predict countries above 90 IQ",
x = "Model R²",
y = "Average predicted IQ"
)
#predicted value if perfect model accuracy
lm_pred_models2 = lm(mean_imputed ~ model_rsq, data = d1_imp_res2)
predict(lm_pred_models2, newdata = data.frame(model_rsq = 1))
## 1
## 97.8
patchwork::wrap_plots(p1, p2)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

GG_save("figs/model_preds.png")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
Impute African IQs
#impute African IQs
set.seed(1)
d_pred_african = map_df(1:10000, function(sim) {
# browser()
#how many vars
pred_var_count = sample(2:7, size = 1)
#which vars
pred_vars = sample(spi_vars, size = pred_var_count)
#formula
form = str_glue("NIQ ~ {str_c(pred_vars, collapse = ' + ')}")
#make subset with imputed SPI and IQs
d_sub = bind_cols(
d_spi_imp,
d %>% select(ISO, NIQ)
)
#subset idx
subset_idx = !d_sub$ISO %in% African_ISOs
#fit
niq_lm = lm(form, data = d_sub[subset_idx, ])
#impute
niq_lm_pred = predict(niq_lm, newdata = d_sub[!subset_idx, ])
tibble(
sim = sim,
mean_imputed = mean(niq_lm_pred, na.rm = T),
pred_vars_n = length(pred_vars),
pred_vars = list(pred_vars),
imputed = list(tibble(ISO = African_ISOs, NIQ_pred = niq_lm_pred)),
model_rsq = niq_lm %>% broom::glance() %>% .[["adj.r.squared"]]
)
})
#true value
true_mean_SSA = d %>% filter(ISO %in% African_ISOs) %>% pull(NIQ) %>% mean()
#plot meta-regression
d_pred_african %>%
ggplot(aes(model_rsq, mean_imputed)) +
geom_point() +
geom_smooth(method = "lm", fullrange = T) +
geom_hline(yintercept = true_mean_SSA, linetype = 2) +
scale_x_continuous(limits = c(0, 1)) +
labs(
title = "Predict African NIQs",
subtitle = "From random models with 2-7 variables from Social Progress Index database",
x = "Model R²",
y = "Average predicted IQ"
)
## `geom_smooth()` using formula = 'y ~ x'

GG_save("figs/pred_African.png")
## `geom_smooth()` using formula = 'y ~ x'
#investigate causes of very low predicted IQs
#we need the matrix of included variables
# d_pred_african_vars = matrix(F, nrow = nrow(d_pred_african), ncol = length(spi_vars))
# colnames(d_pred_african_vars) = spi_vars
# d_pred_african_vars %<>% as_tibble()
#
# #fill in
# for (model in 1:nrow(d_pred_african)) {
# this_model_preds = d_pred_african$pred_vars %>% unlist()
#
# for(v in this_model_preds) {
# d_pred_african_vars[model, v] = T
# }
# }
#above code is very slow
#GPT5.1
n_models <- nrow(d_pred_african)
p <- length(spi_vars)
# start with all FALSE
d_pred_african_vars <- matrix(FALSE, nrow = n_models, ncol = p,
dimnames = list(NULL, spi_vars))
# list-column of predictors
pv <- d_pred_african$pred_vars
# row indices: model 1 repeated length(pv[[1]]) times, etc.
i <- rep(seq_len(n_models), lengths(pv))
# column indices: match variable names to spi_vars
j <- match(unlist(pv), spi_vars)
# set included cells to TRUE
d_pred_african_vars[cbind(i, j)] <- TRUE
# if you want a tibble:
d_pred_african_vars <- as_tibble(d_pred_african_vars)
#verify by checking with predictor counts
(d_pred_african$pred_vars_n == (d_pred_african_vars %>% rowSums())) %>% all()
## [1] TRUE
#change to factor coding
d_pred_african_vars = map_df(d_pred_african_vars, as.factor)
#join
d_pred_african = bind_cols(
d_pred_african,
d_pred_african_vars
)
#so figure out which mnodels predict higher and lower
african_models = list(
simplest = lm(mean_imputed ~ model_rsq, data = d_pred_african),
pred_count = lm(mean_imputed ~ model_rsq + pred_vars_n, data = d_pred_african),
full = lm(str_glue("mean_imputed ~ model_rsq + pred_vars_n + {str_c(spi_vars, collapse = ' + ')}"), data = d_pred_african),
simplest_wo_bad_vars = lm(mean_imputed ~ model_rsq, data = d_pred_african %>% filter(Deaths_from_infectious_diseases_deaths_100_000_people == F))
)
ggpredict(african_models$simplest, terms = c("model_rsq"))
ggpredict(african_models$pred_count, terms = c("model_rsq"))
ggpredict(african_models$full, terms = c("model_rsq"))
## Warning in predict.lm(model, newdata = data_grid, type = "response", se.fit =
## se, : prediction from rank-deficient fit; attr(*, "non-estim") has doubtful
## cases
ggpredict(african_models$simplest_wo_bad_vars, terms = c("model_rsq"))
african_models %>%
summarize_models() %>%
filter(!str_detect(`Predictor/Model`, "TRUE"))
#imputed means by variable inclusion or not
african_models_means_by_pred = map_df(spi_vars, function(v) {
# browser()
#models with this variable
d2 = bind_cols(
model = 1:nrow(d_pred_african_vars),
d_pred_african_vars
)
d2 = d2 %>% filter(.data[[v]] == T)
tibble(
var = v,
n_models = nrow(d2),
mean_imputed = mean(d_pred_african[d2$model,] %>% pull(mean_imputed))
)
})
#color models by variables that cause bias
d_pred_african %>%
ggplot(aes(model_rsq, mean_imputed, color = Deaths_from_infectious_diseases_deaths_100_000_people)) +
geom_point() +
geom_smooth(method = "lm", fullrange = T) +
geom_hline(yintercept = true_mean_SSA, linetype = 2) +
scale_x_continuous(limits = c(0, 1)) +
labs(
title = "Predict African NIQs",
subtitle = "From random models with 2-7 variables from Social Progress Index database",
x = "Model R²",
y = "Average predicted IQ"
)
## `geom_smooth()` using formula = 'y ~ x'

GG_save("figs/pred_African_infec_diseases.png")
## `geom_smooth()` using formula = 'y ~ x'
d_pred_african %>%
filter(Deaths_from_infectious_diseases_deaths_100_000_people == T) %>%
ggplot(aes(model_rsq, mean_imputed, color = Access_to_essential_services_0_none_100_full_coverage)) +
geom_point() +
geom_smooth(method = "lm", fullrange = T) +
geom_hline(yintercept = true_mean_SSA, linetype = 2) +
scale_x_continuous(limits = c(0, 1)) +
labs(
title = "Predict African NIQs",
subtitle = "From random models with 2-7 variables from Social Progress Index database",
x = "Model R²",
y = "Average predicted IQ"
)
## `geom_smooth()` using formula = 'y ~ x'

#code models for these two variables
d_pred_african %>%
mutate(
var_combo = case_when(
Deaths_from_infectious_diseases_deaths_100_000_people==T & Access_to_essential_services_0_none_100_full_coverage == T ~ "infec. dis. yes, ess. serv. yes",
Deaths_from_infectious_diseases_deaths_100_000_people==T & Access_to_essential_services_0_none_100_full_coverage == F ~ "infec. dis. yes, ess. serv. no",
Deaths_from_infectious_diseases_deaths_100_000_people==F & Access_to_essential_services_0_none_100_full_coverage == T ~ "infec. dis. no, ess. serv. yes",
Deaths_from_infectious_diseases_deaths_100_000_people==F & Access_to_essential_services_0_none_100_full_coverage == F ~ "infec. dis. no, ess. serv. no",
)
) %>%
ggplot(aes(model_rsq, mean_imputed, color = var_combo)) +
geom_point() +
geom_smooth(method = "lm", fullrange = T) +
geom_hline(yintercept = true_mean_SSA, linetype = 2) +
scale_x_continuous(limits = c(0, 1)) +
labs(
title = "Predict African NIQs",
subtitle = "From random models with 2-7 variables from Social Progress Index database",
x = "Model R²",
y = "Average predicted IQ"
)
## `geom_smooth()` using formula = 'y ~ x'

GG_save("figs/pred_African_bad_vars.png")
## `geom_smooth()` using formula = 'y ~ x'