Functions
summarize_participation = function(d) {
totals = d %>% group_by(season) %>% summarize(n_games_total = n_distinct(year), .groups = "drop")
d %>%
group_by(iso3, country, season) %>%
summarize(n_games = n(), .groups = "drop") %>%
left_join(totals, by = "season") %>%
mutate(frac_games = n_games / n_games_total)
}
add_per_capita = function(d, pop, by) {
d %>%
inner_join(pop, by = by) %>%
mutate(
gold_pc = gold / avg_pop,
silver_pc = silver / avg_pop,
bronze_pc = bronze / avg_pop,
total_pc = total / avg_pop
)
}
# empirical Bayes shrinkage via ebpm (Poisson means with Gamma prior)
add_eb = function(d) {
d %>%
mutate(
gold_eb = ebpm_gamma(gold, avg_pop)$posterior$mean,
silver_eb = ebpm_gamma(silver, avg_pop)$posterior$mean,
bronze_eb = ebpm_gamma(bronze, avg_pop)$posterior$mean,
total_eb = ebpm_gamma(total, avg_pop)$posterior$mean
)
}
plot_map = function(d, iso3_col = "iso3", title = "") {
map_data = world %>%
filter(iso_a3_eh != "ATA") %>%
left_join(d %>% select(iso3 = !!sym(iso3_col), total_eb), by = c("iso_a3_eh" = "iso3"))
ggplot(map_data) +
geom_sf(aes(fill = total_eb), color = "grey40", linewidth = 0.1) +
scale_fill_viridis_c(
name = "Medals per capita (EB)",
na.value = "grey90",
trans = "log10",
limits = eb_range,
labels = scales::label_scientific()
) +
coord_sf(ylim = c(-55, 85), expand = FALSE) +
labs(title = title) +
theme_void(base_size = 14) +
theme(
plot.title = element_text(hjust = 0.5),
legend.position = "bottom",
legend.key.width = unit(2, "cm"),
plot.background = element_rect(fill = "white", color = NA),
panel.background = element_rect(fill = "white", color = NA),
plot.margin = margin(2, 2, 2, 2)
)
}
plot_resid_map = function(d, iso3_col = "iso3", resid_col, title = "") {
map_data = world %>%
filter(iso_a3_eh != "ATA") %>%
left_join(d %>% dplyr::select(iso3 = !!sym(iso3_col), resid = !!sym(resid_col)),
by = c("iso_a3_eh" = "iso3"))
ggplot(map_data) +
geom_sf(aes(fill = resid), color = "grey40", linewidth = 0.1) +
scale_fill_gradient2(
name = "Residual\n(log scale)",
na.value = "grey90",
low = "#d73027", mid = "white", high = "#1a9850",
midpoint = 0, limits = c(-5, 5), oob = scales::squish
) +
coord_sf(ylim = c(-55, 85), expand = FALSE) +
labs(title = title) +
theme_void(base_size = 14) +
theme(
plot.title = element_text(hjust = 0.5),
legend.position = "bottom",
legend.key.width = unit(2, "cm"),
plot.background = element_rect(fill = "white", color = NA),
panel.background = element_rect(fill = "white", color = NA),
plot.margin = margin(2, 2, 2, 2)
)
}
plot_raw_vs_eb = function(d, n = 30, title = "") {
d %>%
arrange(desc(total_eb)) %>%
head(n) %>%
mutate(country = fct_reorder(country, total_eb)) %>%
select(country, Raw = total_pc, EB = total_eb) %>%
pivot_longer(-country, names_to = "type", values_to = "rate") %>%
ggplot(aes(rate, country, fill = type)) +
geom_bar(stat = "identity", position = "dodge") +
labs(
title = title,
x = "Medals per capita",
fill = "Estimate"
)
}
Data
Medal tallies
# stable borders, all-time (1896-2024)
d_all_summer = read_csv("data/stable_borders_summer.csv")
## Rows: 116 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): iso3, country
## dbl (4): gold, silver, bronze, total
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
d_all_winter = read_csv("data/stable_borders_winter.csv")
## Rows: 35 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): iso3, country
## dbl (4): gold, silver, bronze, total
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
d_all_combined = read_csv("data/stable_borders_combined.csv")
## Rows: 119 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): iso3, country
## dbl (4): gold, silver, bronze, total
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# post-Cold War (1994-2024)
d_post1990_summer = read_csv("data/post1994_summer.csv")
## Rows: 134 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): modern_iso3, country
## dbl (4): gold, silver, bronze, total
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
d_post1990_winter = read_csv("data/post1994_winter.csv")
## Rows: 49 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): modern_iso3, country
## dbl (4): gold, silver, bronze, total
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
d_post1990_combined = read_csv("data/post1994_combined.csv")
## Rows: 138 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): modern_iso3, country
## dbl (4): gold, silver, bronze, total
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# participation: count and fraction of Games attended per country per season
d_all_participation = read_csv("data/stable_borders_participation.csv") %>% summarize_participation()
## Rows: 4028 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): iso3, country, season
## dbl (1): year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
d_post1990_participation = read_csv("data/post1994_participation.csv") %>%
rename(iso3 = modern_iso3) %>%
summarize_participation()
## Rows: 2569 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): modern_iso3, country, season
## dbl (1): year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Population
# UN World Population Prospects 2024 via OWID
d_un_pop = read_csv("data/un_pop_raw/population.csv") %>%
rename(iso3 = Code, country = Entity, year = Year, pop = Population) %>%
filter(!is.na(iso3), iso3 != "")
## Rows: 58824 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Entity, Code
## dbl (2): Year, Population
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# average population for the full period (1896-2023)
d_un_pop_all = d_un_pop %>%
filter(year >= 1896, year <= 2023) %>%
group_by(iso3) %>%
summarize(avg_pop = mean(pop, na.rm = TRUE), .groups = "drop")
# average population for the post-Cold War period (1994-2023)
d_un_pop_post1990 = d_un_pop %>%
filter(year >= 1994, year <= 2023) %>%
group_by(iso3) %>%
summarize(avg_pop = mean(pop, na.rm = TRUE), .groups = "drop")
HDI
# UNDP HDI 1990-2023
d_hdi_raw <- read_csv("data/hdi_raw/HDR_composite_indices.csv")
## Rows: 206 Columns: 1112
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): iso3, country, hdicode, region
## dbl (1108): hdi_rank_2023, hdi_1990, hdi_1991, hdi_1992, hdi_1993, hdi_1994,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
d_hdi <- d_hdi_raw %>%
dplyr::select(iso3, starts_with("hdi_19"), starts_with("hdi_20")) %>%
pivot_longer(-iso3, names_to = "year", values_to = "hdi", names_prefix = "hdi_") %>%
mutate(year = as.integer(year)) %>%
filter(!is.na(hdi))
d_hdi_post1990 <- d_hdi %>%
filter(year >= 1994, year <= 2023) %>%
group_by(iso3) %>%
summarize(avg_hdi = mean(hdi, na.rm = TRUE), .groups = "drop") %>%
# impute missing countries (period averages estimated from current values)
bind_rows(tibble(
iso3 = c("TWN", "PRI", "BMU", "PRK"),
avg_hdi = c(0.81, 0.81, 0.90, 0.56)
))
GDP per capita
# Maddison Project GDP per capita (2011 int-$), 1-2022
d_gdppc_raw <- read_xlsx("data/gdppc_raw/maddison_gdppc.xlsx", sheet = "GDPpc", skip = 2)
names(d_gdppc_raw)[1] <- "year"
d_gdppc <- d_gdppc_raw %>%
pivot_longer(-year, names_to = "iso3", values_to = "gdppc") %>%
filter(!is.na(gdppc), year >= 1896)
d_gdppc_all <- d_gdppc %>%
group_by(iso3) %>%
summarize(avg_gdppc = mean(gdppc, na.rm = TRUE), .groups = "drop")
d_gdppc_post1990 <- d_gdppc %>%
filter(year >= 1994) %>%
group_by(iso3) %>%
summarize(avg_gdppc = mean(gdppc, na.rm = TRUE), .groups = "drop")
World shapefile
world <- ne_countries(scale = "medium", returnclass = "sf") %>%
select(iso_a3_eh, geometry)
Analysis
Per capita rates
d_all_summer = add_per_capita(d_all_summer, d_un_pop_all, "iso3") %>% add_eb()
d_all_winter = add_per_capita(d_all_winter, d_un_pop_all, "iso3") %>% add_eb()
d_all_combined = add_per_capita(d_all_combined, d_un_pop_all, "iso3") %>% add_eb()
d_post1990_summer = add_per_capita(d_post1990_summer, d_un_pop_post1990, c("modern_iso3" = "iso3")) %>% add_eb()
d_post1990_winter = add_per_capita(d_post1990_winter, d_un_pop_post1990, c("modern_iso3" = "iso3")) %>% add_eb()
d_post1990_combined = add_per_capita(d_post1990_combined, d_un_pop_post1990, c("modern_iso3" = "iso3")) %>% add_eb()
Raw vs. EB rates
p1 <- plot_raw_vs_eb(d_all_summer, title = "Summer, all-time (stable borders)"); p1

p2 <- plot_raw_vs_eb(d_post1990_summer, title = "Summer, post-1994"); p2

p3 <- plot_raw_vs_eb(d_all_winter, title = "Winter, all-time (stable borders)"); p3

p4 <- plot_raw_vs_eb(d_post1990_winter, title = "Winter, post-1994"); p4

p_combined <- (p1 + p2) / (p3 + p4) + plot_layout(guides = "collect")
p_combined

GG_save("figures/raw_vs_eb_summer_alltime.png", p1)
GG_save("figures/raw_vs_eb_summer_post1994.png", p2)
GG_save("figures/raw_vs_eb_winter_alltime.png", p3)
GG_save("figures/raw_vs_eb_winter_post1994.png", p4)
GG_save("figures/raw_vs_eb_2x2.png", p_combined, width = 16, height = 14)
World maps
eb_range <- range(c(d_all_summer$total_eb, d_post1990_summer$total_eb,
d_all_winter$total_eb, d_post1990_winter$total_eb))
m1 <- plot_map(d_all_summer, title = "Summer, all-time (stable borders)"); m1

m2 <- plot_map(d_post1990_summer, iso3_col = "modern_iso3", title = "Summer, post-1994"); m2

m3 <- plot_map(d_all_winter, title = "Winter, all-time (stable borders)"); m3

m4 <- plot_map(d_post1990_winter, iso3_col = "modern_iso3", title = "Winter, post-1994"); m4

m_combined <- (m1 + m2) / (m3 + m4) + plot_layout(guides = "collect") &
theme(legend.position = "bottom")
m_combined

GG_save("figures/map_summer_alltime.png", m1)
GG_save("figures/map_summer_post1994.png", m2)
GG_save("figures/map_winter_alltime.png", m3)
GG_save("figures/map_winter_post1994.png", m4)
GG_save("figures/map_2x2.png", m_combined, width = 16, height = 10)
Post-1994 Summer models
d_reg_post <- d_post1990_summer %>%
left_join(
d_post1990_participation %>% filter(season == "Summer"),
by = c("modern_iso3" = "iso3")
) %>%
left_join(d_hdi_post1990, by = c("modern_iso3" = "iso3")) %>%
left_join(d_gdppc_post1990, by = c("modern_iso3" = "iso3")) %>%
dplyr::select(modern_iso3, country = country.x, total,
avg_pop, frac_games, avg_hdi, avg_gdppc) %>%
mutate(log_gdppc = log(avg_gdppc)) %>%
filter(!is.na(avg_hdi), !is.na(avg_gdppc))
# negative binomial models with offset(log(pop))
m_post_part <- MASS::glm.nb(total ~ frac_games + offset(log(avg_pop)), data = d_reg_post)
m_post_hdi <- MASS::glm.nb(total ~ avg_hdi + offset(log(avg_pop)), data = d_reg_post)
m_post_gdp <- MASS::glm.nb(total ~ log_gdppc + offset(log(avg_pop)), data = d_reg_post)
m_post_ph <- MASS::glm.nb(total ~ frac_games + avg_hdi + offset(log(avg_pop)), data = d_reg_post)
m_post_pg <- MASS::glm.nb(total ~ frac_games + log_gdppc + offset(log(avg_pop)), data = d_reg_post)
models_post <- list(
"Participation" = m_post_part,
"HDI" = m_post_hdi,
"log(GDPpc)" = m_post_gdp,
"Part + HDI" = m_post_ph,
"Part + log(GDPpc)" = m_post_pg
)
map2_dfr(models_post, names(models_post), ~ {
s <- summary(.x)$coefficients
tibble(term = rownames(s), est = s[,1], se = s[,2], z = s[,3], p = s[,4], model = .y)
}) %>% filter(term != "(Intercept)") %>%
mutate(sig = case_when(p < 0.001 ~ "***", p < 0.01 ~ "**", p < 0.05 ~ "*", p < 0.1 ~ ".", TRUE ~ "")) %>%
dplyr::select(model, term, est, se, z, p, sig)
tibble(
model = names(models_post),
AIC = sapply(models_post, AIC),
theta = sapply(models_post, function(m) m$theta)
)
# standardized coefficients for relative importance
d_reg_post_z <- d_reg_post %>%
mutate(across(c(frac_games, avg_hdi, log_gdppc), ~ scale(.)[,1], .names = "{.col}_z"))
m_post_ph_z <- MASS::glm.nb(total ~ frac_games_z + avg_hdi_z + offset(log(avg_pop)), data = d_reg_post_z)
m_post_pg_z <- MASS::glm.nb(total ~ frac_games_z + log_gdppc_z + offset(log(avg_pop)), data = d_reg_post_z)
coef(summary(m_post_ph_z))[2:3, ]
## Estimate Std. Error z value Pr(>|z|)
## frac_games_z 0.0483 0.116 0.416 6.78e-01
## avg_hdi_z 1.2662 0.118 10.727 7.57e-27
coef(summary(m_post_pg_z))[2:3, ]
## Estimate Std. Error z value Pr(>|z|)
## frac_games_z 0.0882 0.126 0.699 4.84e-01
## log_gdppc_z 0.9962 0.125 7.980 1.46e-15
All-time Summer models
d_reg_all <- d_all_summer %>%
left_join(
d_all_participation %>% filter(season == "Summer"),
by = "iso3"
) %>%
left_join(d_gdppc_all, by = "iso3") %>%
dplyr::select(iso3, country = country.x, total, avg_pop, frac_games, avg_gdppc) %>%
mutate(log_gdppc = log(avg_gdppc)) %>%
filter(!is.na(avg_gdppc))
m_all_part <- MASS::glm.nb(total ~ frac_games + offset(log(avg_pop)), data = d_reg_all)
m_all_gdp <- MASS::glm.nb(total ~ log_gdppc + offset(log(avg_pop)), data = d_reg_all)
m_all_pg <- MASS::glm.nb(total ~ frac_games + log_gdppc + offset(log(avg_pop)), data = d_reg_all)
models_all <- list(
"Participation" = m_all_part,
"log(GDPpc)" = m_all_gdp,
"Part + log(GDPpc)" = m_all_pg
)
map2_dfr(models_all, names(models_all), ~ {
s <- summary(.x)$coefficients
tibble(term = rownames(s), est = s[,1], se = s[,2], z = s[,3], p = s[,4], model = .y)
}) %>% filter(term != "(Intercept)") %>%
mutate(sig = case_when(p < 0.001 ~ "***", p < 0.01 ~ "**", p < 0.05 ~ "*", p < 0.1 ~ ".", TRUE ~ "")) %>%
dplyr::select(model, term, est, se, z, p, sig)
tibble(
model = names(models_all),
AIC = sapply(models_all, AIC),
theta = sapply(models_all, function(m) m$theta)
)
# standardized coefficients for relative importance
d_reg_all_z <- d_reg_all %>%
mutate(across(c(frac_games, log_gdppc), ~ scale(.)[,1], .names = "{.col}_z"))
m_all_pg_z <- MASS::glm.nb(total ~ frac_games_z + log_gdppc_z + offset(log(avg_pop)), data = d_reg_all_z)
coef(summary(m_all_pg_z))[2:3, ]
## Estimate Std. Error z value Pr(>|z|)
## frac_games_z 0.776 0.139 5.59 2.22e-08
## log_gdppc_z 0.692 0.134 5.18 2.22e-07
Model comparison (standardized betas)
# refit all models with standardized predictors
m_post_part_z <- MASS::glm.nb(total ~ frac_games_z + offset(log(avg_pop)), data = d_reg_post_z)
m_post_hdi_z <- MASS::glm.nb(total ~ avg_hdi_z + offset(log(avg_pop)), data = d_reg_post_z)
m_post_gdp_z <- MASS::glm.nb(total ~ log_gdppc_z + offset(log(avg_pop)), data = d_reg_post_z)
m_all_part_z <- MASS::glm.nb(total ~ frac_games_z + offset(log(avg_pop)), data = d_reg_all_z)
m_all_gdp_z <- MASS::glm.nb(total ~ log_gdppc_z + offset(log(avg_pop)), data = d_reg_all_z)
all_models <- list(
"Post-94: Part" = m_post_part_z,
"Post-94: HDI" = m_post_hdi_z,
"Post-94: GDPpc" = m_post_gdp_z,
"Post-94: Part+HDI" = m_post_ph_z,
"Post-94: Part+GDPpc" = m_post_pg_z,
"All: Part" = m_all_part_z,
"All: GDPpc" = m_all_gdp_z,
"All: Part+GDPpc" = m_all_pg_z
)
# McFadden pseudo R²
m_post_null <- MASS::glm.nb(total ~ 1 + offset(log(avg_pop)), data = d_reg_post_z)
m_all_null <- MASS::glm.nb(total ~ 1 + offset(log(avg_pop)), data = d_reg_all_z)
null_models <- c(rep(list(m_post_null), 5), rep(list(m_all_null), 3))
pseudo_r2 <- function(model, null_model) 1 - as.numeric(logLik(model)) / as.numeric(logLik(null_model))
r2_values <- map2_chr(all_models, null_models, ~ sprintf("%.3f", pseudo_r2(.x, .y)))
aic_values <- map_chr(all_models, ~ sprintf("%.1f", AIC(.x)))
extra_rows <- data.frame(matrix(c("AIC", aic_values, "McFadden R²", r2_values), nrow = 2, byrow = TRUE))
names(extra_rows) <- c(" ", names(all_models))
modelsummary(
all_models,
coef_map = c("frac_games_z" = "Participation", "avg_hdi_z" = "HDI", "log_gdppc_z" = "log(GDPpc)"),
statistic = "({std.error})",
stars = c("*" = 0.05, "**" = 0.01, "***" = 0.001),
gof_map = c("nobs"),
add_rows = extra_rows
)
| |
Post-94: Part |
Post-94: HDI |
Post-94: GDPpc |
Post-94: Part+HDI |
Post-94: Part+GDPpc |
All: Part |
All: GDPpc |
All: Part+GDPpc |
| * p < 0.05, ** p < 0.01, *** p < 0.001 |
| Participation |
0.214 |
|
|
0.048 |
0.088 |
1.035*** |
|
0.776*** |
|
(0.132) |
|
|
(0.116) |
(0.126) |
(0.133) |
|
(0.139) |
| HDI |
|
1.281*** |
|
1.266*** |
|
|
|
|
|
|
(0.110) |
|
(0.118) |
|
|
|
|
| log(GDPpc) |
|
|
1.019*** |
|
0.996*** |
|
1.219*** |
0.692*** |
|
|
|
(0.118) |
|
(0.125) |
|
(0.130) |
(0.134) |
| Num.Obs. |
125 |
125 |
125 |
125 |
125 |
105 |
105 |
105 |
| AIC |
1309.0 |
1237.7 |
1269.9 |
1239.5 |
1271.3 |
1136.4 |
1139.0 |
1119.0 |
| McFadden R² |
0.003 |
0.057 |
0.032 |
0.057 |
0.033 |
0.045 |
0.043 |
0.061 |
Model residuals
# post-1994: residuals from HDI model (best)
d_reg_post <- d_reg_post %>%
mutate(
fitted_hdi = fitted(m_post_hdi),
resid_hdi = log(total) - log(fitted_hdi)
)
d_reg_post %>% arrange(desc(resid_hdi)) %>%
dplyr::select(modern_iso3, country, total, fitted_hdi, resid_hdi) %>% head(10)
d_reg_post %>% arrange(resid_hdi) %>%
dplyr::select(modern_iso3, country, total, fitted_hdi, resid_hdi) %>% head(10)
# all-time: residuals from Part + log(GDPpc) model (best)
d_reg_all <- d_reg_all %>%
mutate(
fitted_pg = fitted(m_all_pg),
resid_pg = log(total) - log(fitted_pg)
)
d_reg_all %>% arrange(desc(resid_pg)) %>%
dplyr::select(iso3, country, total, fitted_pg, resid_pg) %>% head(10)
d_reg_all %>% arrange(resid_pg) %>%
dplyr::select(iso3, country, total, fitted_pg, resid_pg) %>% head(10)
r1 <- plot_resid_map(d_reg_post, "modern_iso3", "resid_hdi",
title = "Post-1994 Summer residuals (HDI model)"); r1

r2 <- plot_resid_map(d_reg_all, "iso3", "resid_pg",
title = "All-time Summer residuals (Part + GDPpc model)"); r2

r_combined <- r1 / r2 + plot_layout(guides = "collect") &
theme(legend.position = "bottom")
r_combined

GG_save("figures/resid_map_post1994_hdi.png", r1)
GG_save("figures/resid_map_alltime_pg.png", r2)
GG_save("figures/resid_map_combined.png", r_combined, width = 12, height = 12)