library(kirkegaard)
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: magrittr
##
##
## Attaching package: 'magrittr'
##
##
## The following object is masked from 'package:purrr':
##
## set_names
##
##
## The following object is masked from 'package:tidyr':
##
## extract
##
##
## Loading required package: weights
##
## Loading required package: assertthat
##
##
## Attaching package: 'assertthat'
##
##
## The following object is masked from 'package:tibble':
##
## has_name
##
##
## Loading required package: psych
##
##
## Attaching package: 'psych'
##
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
##
##
## Loading required package: robustbase
##
##
## Attaching package: 'kirkegaard'
##
##
## The following object is masked from 'package:psych':
##
## rescale
##
##
## The following object is masked from 'package:assertthat':
##
## are_equal
##
##
## The following object is masked from 'package:purrr':
##
## is_logical
##
##
## The following object is masked from 'package:base':
##
## +
load_packages(
sf,
googlesheets4,
rnaturalearth
)
## Linking to GEOS 3.10.2, GDAL 3.4.1, PROJ 8.2.1; sf_use_s2() is TRUE
theme_set(theme_bw())
options(
digits = 3
)
#multithreading
#library(future)
#plan(multisession(workers = 8))
insert_row_values <- function(df, values, at = 2, col_start = 1) {
stopifnot(is.data.frame(df))
n_vals <- length(values)
col_end <- col_start + n_vals - 1
if (col_end > ncol(df)) {
stop("Values exceed number of columns available from col_start.")
}
# Create a 1-row blank template
new_row <- df[1, , drop = FALSE]
new_row[,] <- NA
# Fill the desired columns
new_row[1, col_start:col_end] <- values
# Insert the row
dplyr::bind_rows(
df[seq_len(at - 1), , drop = FALSE],
new_row,
df[at:nrow(df), , drop = FALSE]
)
}
insert_row = function(df, values, at = 1) {
# browser()
assertthat::assert_that(ncol(df) == length(values))
if (at == 1) {
idx_before = numeric()
idx_after = 1:nrow(df)
} else if (at == nrow(df)) {
idx_before = 1:nrow(df)
idx_after = numeric()
} else {
idx_before = 1:(at-1)
idx_after = at:nrow(df)
}
bind_rows(
df[idx_before, ],
matrix(values, nrow = 1) %>% {colnames(.) = colnames(df); .} %>% as_tibble(),
df[idx_after, ]
)
}
insert_row(
iris[1:5, -5],
values = 1:4
)
#NIQ and S
gs4_deauth()
d_niq_s = read_sheet("https://docs.google.com/spreadsheets/d/1cReoeIZLlbxOrN4_wnj52Q6UWFepXSJGi3Gj5m6ZAZg/edit?gid=528842851#gid=528842851", sheet = 3)
## ✔ Reading from "National IQ datasets".
## ✔ Range ''Seb2024b''.
#ISO3
d_niq_s$ISO = pu_translate(d_niq_s$Name)
## No exact match: Hong Kong SAR China
## No exact match: Macao SAR China
## No exact match: Palestinian Territories
## No exact match: Congo - Brazzaville
## No exact match: Congo - Kinshasa
## No exact match: São Tomé & PrÃncipe
## Best fuzzy match found: Hong Kong SAR China -> Hong Kong SAR, China with distance 1.00
## Best fuzzy match found: Macao SAR China -> Macao SAR, China with distance 1.00
## Best fuzzy match found: Palestinian Territories -> Palestinian territories with distance 1.00
## Best fuzzy match found: Congo - Brazzaville -> Congo (Brazzaville) with distance 3.00
## Best fuzzy match found: Congo - Kinshasa -> Congo (Zaire) with distance 9.00
## Best fuzzy match found: São Tomé & PrÃncipe -> São Tomé and PrÃncipe with distance 3.00
#GDPpc
d_gdppc = read_csv("data/gdp-per-capita-worldbank/gdp-per-capita-worldbank.csv") %>% df_legalize_names()
## Rows: 7311 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Entity, Code, World regions according to OWID
## dbl (2): Year, GDP per capita, PPP (constant 2021 international $)
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#rename
d_gdppc %<>%
rename(
ISO = Code,
GDPpcppp = GDP_per_capita_PPP_constant_2021_international
)
#productivity
d_prod = read_csv("data/labor-productivity-per-hour-pennworldtable/labor-productivity-per-hour-pennworldtable.csv") %>% df_legalize_names()
## Rows: 4965 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Entity, Code
## dbl (2): Year, Productivity: output per hour worked
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#fix
d_prod %<>%
rename(
ISO = Code,
Productivity = Productivity_output_per_hour_worked
)
#population
d_pop = read_csv("data/population/population.csv") %>% df_legalize_names()
## Rows: 58824 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Entity, Code
## dbl (2): Year, Population (historical)
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#fix
d_pop %<>% rename(
ISO = Code,
Population = Population_historical
)
# Use geometries as sf
d_worldmap <- rnaturalearth::ne_countries(returnclass = "sf")
# 1) Reproject to a projected CRS (e.g. Web Mercator 3857)
d_worldmap_proj <- st_transform(d_worldmap, 3857)
# 2) Compute centroids in projected space
d_worldmap_cent <- d_worldmap_proj %>%
st_centroid()
## Warning: st_centroid assumes attributes are constant over geometries
# 3) Transform centroids back to lon/lat (EPSG:4326)
d_worldmap_cent <- st_transform(d_worldmap_cent, 4326)
# 4) Extract lon/lat as vars
coords <- st_coordinates(d_worldmap_cent)
d_worldmap_cent <- d_worldmap_cent %>%
mutate(
lon = coords[, "X"],
lat = coords[, "Y"]
)
#ISO
d_worldmap_cent %<>% rename(
ISO = adm0_a3
)
#join all data
d = smart_join(
d_niq_s %>% select(-Name),
d_gdppc %>% filter(Year == 2023, !is.na(ISO)) %>% select(ISO, GDPpcppp),
id_col = "ISO"
) %>% smart_join(
d_prod %>% filter(Year == 2023, !is.na(ISO)) %>% select(ISO, Productivity),
id_col = "ISO"
) %>% smart_join(
d_pop %>% filter(Year == 2023, !is.na(ISO)) %>% select(ISO, Population),
id_col = "ISO"
) %>% smart_join(
d_worldmap_cent %>% st_drop_geometry() %>% select(ISO, lat, lon),
id_col = "ISO"
)
#drop non-countries
d %<>% filter(!str_detect(ISO, "OWID_"))
#mutate
d %<>% mutate(
log_population = log10(Population),
log_GDPpcppp = log10(GDPpcppp)
)
#spatial lag
d %<>% spatial_knn(
vars = c("Productivity", "GDPpcppp", "log_GDPpcppp", "SDI")
)
#spatial cors
d %>% spatial_lag_cors()
## SDI GDPpcppp Productivity log_GDPpcppp
## 0.867 0.769 0.792 0.804
#cors
d %>%
select(-ISO, -ends_with("_lag"), -lat, -lon) %>%
GG_heatmap()
GG_save("figs/nat_cors.png")
#scatters
p_pop_gdp = d %>%
GG_scatter("Population", "GDPpcppp", case_names = "ISO") +
labs(
y = "GDP per person PPP",
x = "Population count"
)
p_pop_log_gdp = d %>%
GG_scatter("Population", "log_GDPpcppp", case_names = "ISO") +
labs(
y = "Log10 GDP per person PPP",
x = "Population count"
)
p_log_pop_gdp = d %>%
GG_scatter("log_population", "GDPpcppp", case_names = "ISO") +
labs(
y = "GDP per person PPP",
x = "Log10 population count"
)
p_log_pop_log_gdp = d %>%
GG_scatter("log_population", "log_GDPpcppp", case_names = "ISO") +
labs(
y = "Log10 GDP per person PPP",
x = "Log10 population count"
)
patchwork::wrap_plots(
p_pop_gdp,
p_pop_log_gdp,
p_log_pop_gdp,
p_log_pop_log_gdp
)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
GG_save("figs/4 scatter pop gdp.png")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
#models
#gdp
d_std = d %>% df_standardize()
## Skipped ISO because it is a character (string)
mods = list(
lm(GDPpcppp ~ log_population, data = d_std),
lm(GDPpcppp ~ GDPpcppp_lag + log_population, data = d_std),
lm(GDPpcppp ~ GDPpcppp_lag + log_population + NIQ, data = d_std),
lm(log_GDPpcppp ~ log_population, data = d_std),
lm(log_GDPpcppp ~ log_GDPpcppp_lag + log_population, data = d_std),
lm(log_GDPpcppp ~ log_GDPpcppp_lag + log_population + NIQ, data = d_std),
lm(Productivity ~ log_population, data = d_std),
lm(Productivity ~ Productivity_lag + log_population, data = d_std),
lm(Productivity ~ Productivity_lag + log_population + NIQ, data = d_std),
lm(SDI ~ log_population, data = d_std),
lm(SDI ~ SDI_lag + log_population, data = d_std),
lm(SDI ~ SDI_lag + log_population + NIQ, data = d_std)
)
mods %>%
summarize_models(asterisks_only = F) %>%
insert_row(
values = c("outcome", rep("GDPpcppp", 3), rep("log10 GDPpcppp", 3), rep("productivity", 3), rep("S", 3)),
at = 1
) %>%
flextable::flextable()
Predictor/Model | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
outcome | GDPpcppp | GDPpcppp | GDPpcppp | log10 GDPpcppp | log10 GDPpcppp | log10 GDPpcppp | productivity | productivity | productivity | S | S | S |
(Intercept) | 0.05 (0.074, 0.54) | 0.03 (0.062, 0.636) | 0.05 (0.055, 0.402) | 0.05 (0.074, 0.496) | -0.02 (0.063, 0.738) | 0.02 (0.052, 0.689) | 0.33 (0.132, 0.014) | 0.02 (0.094, 0.869) | -0.06 (0.084, 0.489) | 0.03 (0.079, 0.701) | -0.05 (0.053, 0.381) | 0.00 (0.038, 0.97) |
log_population | -0.19 (0.088, 0.034) | -0.17 (0.081, 0.032) | -0.22 (0.072, 0.003**) | -0.21 (0.087, 0.019) | -0.08 (0.082, 0.349) | -0.17 (0.069, 0.012) | -0.52 (0.160, 0.001**) | -0.06 (0.114, 0.621) | -0.07 (0.101, 0.463) | -0.11 (0.098, 0.27) | 0.03 (0.069, 0.667) | -0.07 (0.050, 0.167) |
SDI_lag | 0.89 (0.040, <0.001***) | 0.34 (0.052, <0.001***) | ||||||||||
NIQ | 0.38 (0.057, <0.001***) | 0.54 (0.062, <0.001***) | 0.40 (0.068, <0.001***) | 0.65 (0.052, <0.001***) | ||||||||
GDPpcppp_lag | 0.71 (0.047, <0.001***) | 0.46 (0.057, <0.001***) | ||||||||||
log_GDPpcppp_lag | 0.81 (0.048, <0.001***) | 0.39 (0.062, <0.001***) | ||||||||||
Productivity_lag | 0.79 (0.059, <0.001***) | 0.52 (0.070, <0.001***) | ||||||||||
R2 adj. | 0.018 | 0.598 | 0.688 | 0.023 | 0.643 | 0.761 | 0.070 | 0.622 | 0.704 | 0.001 | 0.748 | 0.872 |
N | 196 | 161 | 160 | 196 | 161 | 160 | 129 | 124 | 124 | 193 | 166 | 166 |
#without china and india
d_std_sub1 = d %>% filter(!ISO %in% c("CHN", "IND")) %>% df_standardize()
## Skipped ISO because it is a character (string)
#refit
mods_sub1 = list(
lm(GDPpcppp ~ log_population, data = d_std_sub1),
lm(GDPpcppp ~ GDPpcppp_lag + log_population, data = d_std_sub1),
lm(GDPpcppp ~ GDPpcppp_lag + log_population + NIQ, data = d_std_sub1),
lm(log_GDPpcppp ~ log_population, data = d_std_sub1),
lm(log_GDPpcppp ~ log_GDPpcppp_lag + log_population, data = d_std_sub1),
lm(log_GDPpcppp ~ log_GDPpcppp_lag + log_population + NIQ, data = d_std_sub1),
lm(Productivity ~ log_population, data = d_std_sub1),
lm(Productivity ~ Productivity_lag + log_population, data = d_std_sub1),
lm(Productivity ~ Productivity_lag + log_population + NIQ, data = d_std_sub1),
lm(SDI ~ log_population, data = d_std_sub1),
lm(SDI ~ SDI_lag + log_population, data = d_std_sub1),
lm(SDI ~ SDI_lag + log_population + NIQ, data = d_std_sub1)
)
mods_sub1 %>%
summarize_models(asterisks_only = F) %>%
insert_row(
values = c("outcome", rep("GDPpcppp", 3), rep("log10 GDPpcppp", 3), rep("productivity", 3), rep("S", 3)),
at = 1
) %>%
flextable::flextable()
Predictor/Model | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
outcome | GDPpcppp | GDPpcppp | GDPpcppp | log10 GDPpcppp | log10 GDPpcppp | log10 GDPpcppp | productivity | productivity | productivity | S | S | S |
(Intercept) | 0.04 (0.074, 0.551) | 0.04 (0.064, 0.507) | 0.05 (0.057, 0.374) | 0.05 (0.074, 0.48) | -0.01 (0.065, 0.91) | 0.03 (0.054, 0.613) | 0.33 (0.138, 0.018) | 0.00 (0.099, 0.983) | -0.10 (0.088, 0.282) | 0.03 (0.080, 0.681) | -0.03 (0.054, 0.536) | 0.00 (0.039, 0.968) |
log_population | -0.18 (0.089, 0.043) | -0.20 (0.084, 0.02) | -0.22 (0.074, 0.003**) | -0.21 (0.089, 0.017) | -0.10 (0.085, 0.223) | -0.18 (0.071, 0.011) | -0.52 (0.170, 0.003**) | -0.04 (0.121, 0.769) | -0.02 (0.106, 0.816) | -0.11 (0.100, 0.253) | 0.00 (0.071, 0.958) | -0.07 (0.052, 0.189) |
SDI_lag | 0.89 (0.040, <0.001***) | 0.33 (0.053, <0.001***) | ||||||||||
NIQ | 0.38 (0.058, <0.001***) | 0.54 (0.063, <0.001***) | 0.42 (0.069, <0.001***) | 0.66 (0.053, <0.001***) | ||||||||
GDPpcppp_lag | 0.71 (0.048, <0.001***) | 0.46 (0.058, <0.001***) | ||||||||||
log_GDPpcppp_lag | 0.81 (0.048, <0.001***) | 0.39 (0.063, <0.001***) | ||||||||||
Productivity_lag | 0.79 (0.060, <0.001***) | 0.51 (0.071, <0.001***) | ||||||||||
R2 adj. | 0.016 | 0.600 | 0.688 | 0.024 | 0.646 | 0.762 | 0.062 | 0.619 | 0.707 | 0.002 | 0.752 | 0.872 |
N | 194 | 159 | 158 | 194 | 159 | 158 | 127 | 122 | 122 | 191 | 164 | 164 |
#NIQ vs. S
d %>%
GG_scatter("NIQ", "SDI", case_names = "ISO") +
labs(
x = "IQ mean",
y = "Socioeconomic development",
title = "National intelligence means predict most international variation in development",
subtitle = "Data source: Jensen & Kirkegaard 2024"
)
## `geom_smooth()` using formula = 'y ~ x'
GG_save("figs/scatter_NIQ_S.png")
## `geom_smooth()` using formula = 'y ~ x'
#versions
write_sessioninfo()
## R version 4.5.2 (2025-10-31)
## Platform: x86_64-pc-linux-gnu
## Running under: Linux Mint 21.1
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0 LAPACK version 3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_DK.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_DK.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_DK.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Europe/Brussels
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] rnaturalearth_1.1.0 googlesheets4_1.1.2 sf_1.0-21
## [4] kirkegaard_2025-11-19 robustbase_0.99-6 psych_2.5.6
## [7] assertthat_0.2.1 weights_1.1.2 magrittr_2.0.4
## [10] lubridate_1.9.4 forcats_1.0.1 stringr_1.5.2
## [13] dplyr_1.1.4 purrr_1.1.0 readr_2.1.5
## [16] tidyr_1.3.1 tibble_3.3.0 ggplot2_4.0.0
## [19] tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] RColorBrewer_1.1-3 rstudioapi_0.17.1 jsonlite_2.0.0
## [4] shape_1.4.6.1 jomo_2.7-6 farver_2.1.2
## [7] nloptr_2.2.1 rmarkdown_2.30 fs_1.6.6
## [10] ragg_1.5.0 vctrs_0.6.5 minqa_1.2.8
## [13] askpass_1.2.1 base64enc_0.1-3 terra_1.8-70
## [16] htmltools_0.5.8.1 curl_7.0.0 broom_1.0.10
## [19] cellranger_1.1.0 Formula_1.2-5 mitml_0.4-5
## [22] sass_0.4.10 KernSmooth_2.23-26 bslib_0.9.0
## [25] htmlwidgets_1.6.4 plyr_1.8.9 cachem_1.1.0
## [28] uuid_1.2-1 lifecycle_1.0.4 iterators_1.0.14
## [31] pkgconfig_2.0.3 Matrix_1.7-4 R6_2.6.1
## [34] fastmap_1.2.0 rbibutils_2.3 digest_0.6.37
## [37] colorspace_2.1-2 patchwork_1.3.2 textshaping_1.0.4
## [40] Hmisc_5.2-4 labeling_0.4.3 timechange_0.3.0
## [43] gdata_3.0.1 httr_1.4.7 mgcv_1.9-3
## [46] compiler_4.5.2 gargle_1.6.0 proxy_0.4-27
## [49] bit64_4.6.0-1 fontquiver_0.2.1 withr_3.0.2
## [52] htmlTable_2.4.3 S7_0.2.0 backports_1.5.0
## [55] DBI_1.2.3 pan_1.9 MASS_7.3-65
## [58] openssl_2.3.4 classInt_0.4-11 gtools_3.9.5
## [61] tools_4.5.2 units_1.0-0 foreign_0.8-90
## [64] googledrive_2.1.2 zip_2.3.3 nnet_7.3-20
## [67] glue_1.8.0 nlme_3.1-168 grid_4.5.2
## [70] stringdist_0.9.15 checkmate_2.3.3 cluster_2.1.8.1
## [73] generics_0.1.4 gtable_0.3.6 tzdb_0.5.0
## [76] class_7.3-23 data.table_1.17.8 hms_1.1.3
## [79] xml2_1.4.0 foreach_1.5.2 pillar_1.11.1
## [82] vroom_1.6.6 splines_4.5.2 lattice_0.22-7
## [85] survival_3.8-3 bit_4.6.0 tidyselect_1.2.1
## [88] fontLiberation_0.1.0 knitr_1.50 fontBitstreamVera_0.1.1
## [91] reformulas_0.4.1 gridExtra_2.3 xfun_0.53
## [94] DEoptimR_1.1-4 stringi_1.8.7 yaml_2.3.10
## [97] boot_1.3-32 evaluate_1.0.5 codetools_0.2-20
## [100] officer_0.7.0 gdtools_0.4.4 archive_1.1.12
## [103] cli_3.6.5 rpart_4.1.24 systemfonts_1.3.1
## [106] Rdpack_2.6.4 jquerylib_0.1.4 Rcpp_1.1.0
## [109] readxl_1.4.5 parallel_4.5.2 lme4_1.1-37
## [112] glmnet_4.1-10 scales_1.4.0 e1071_1.7-16
## [115] crayon_1.5.3 flextable_0.9.10 rlang_1.1.6
## [118] mnormt_2.1.1 mice_3.18.0
#write data to file for reuse
d %>% write_rds("data/data_for_reuse.rds")
#OSF
if (F) {
library(osfr)
#login
osf_auth(readr::read_lines("~/.config/osf_token"))
#the project we will use
osf_proj = osf_retrieve_node("https://osf.io/XXX/")
#upload all files in project
#overwrite existing (versioning)
osf_upload(
osf_proj,
path = c("data", "figures", "papers", "notebook.Rmd", "notebook.html", "sessions_info.txt"),
conflicts = "overwrite"
)
}