This notebook reproduces, from the prepped data, every figure and number in the post Top Universities and Human Talent. It (1) harmonises three global ranking systems into a single university-quality factor (uni-g), and (2) relates a country’s stock and quality of top universities to national IQ (NIQ) and population size.

Data

latest <- function(d){ yc <- intersect(c("year","Year"), names(d)); d[d[[yc]] == max(d[[yc]]), ] }
num    <- function(x) suppressWarnings(as.numeric(x))
rp     <- file.path(PROJ, "data/raw/rankings")
xw     <- read_csv(file.path(PROJ, "data/raw/ror_crosswalk.csv"), show_col_types = FALSE)
addror <- function(df, nc){ df$name <- df[[nc]]; df %>% left_join(xw, by = "name") %>% filter(!is.na(ror_id)) }

rankings_long <- read_csv(file.path(PROJ, "data/interim/rankings_long.csv"), show_col_types = FALSE)
cov <- read_csv(file.path(PROJ, "data/interim/country_covariates.csv"), show_col_types = FALSE)

# ror -> country crosswalk (modal country per university across the long table)
nm2c <- rankings_long %>% inner_join(xw, by = c("uni_name" = "name")) %>%
  filter(!is.na(country_iso3)) %>% count(ror_id, country_iso3, sort = TRUE) %>%
  group_by(ror_id) %>% slice(1) %>% ungroup() %>% select(ror_id, country_iso3)

Three ranking systems and their (im)perfect agreement

TOPN_TAB <- 15
short_name <- function(x){
  x <- sub("\\s*\\(.*$","",x); x <- sub("\\s*-\\s*Swiss.*$","",x, ignore.case=TRUE)
  map <- c("Massachusetts Institute of Technology"="MIT","California Institute of Technology"="Caltech",
           "^ETH ?Zurich|Swiss Federal Institute of Technology"="ETH Zurich",
           "Ecole Polytechnique Federale|EPF ?Lausanne|^EPFL"="EPFL",
           "University of California,? ?-?Berkeley"="UC Berkeley","University of California,? ?-?Los Angeles"="UCLA",
           "University College London|^UCL$"="UCL","National University of Singapore"="NUS",
           "Nanyang Technological"="NTU Singapore","Imperial College London"="Imperial",
           "University of Chicago"="U. Chicago","University of Pennsylvania"="U. Penn",
           "Johns Hopkins"="Johns Hopkins","Tsinghua"="Tsinghua","Peking"="Peking U.")
  for(p in names(map)) if(grepl(p,x,ignore.case=TRUE)) return(unname(map[[p]]))
  if(!grepl("[a-z]",x)) x <- gsub("\\b(\\w)(\\w*)","\\U\\1\\L\\2",x,perl=TRUE)
  x <- gsub("University of ","U. ",x,ignore.case=TRUE); x <- gsub("University","U.",x,ignore.case=TRUE)
  x <- gsub("College","Coll.",x,ignore.case=TRUE)
  if(nchar(x)>20) x <- paste0(substr(x,1,19),"…"); x
}
tab <- rankings_long %>% group_by(system) %>% filter(year==max(year)) %>% ungroup() %>%
  filter(!is.na(rank)) %>% mutate(rank=as.numeric(rank)) %>%
  group_by(system) %>% arrange(rank,.by_group=TRUE) %>% slice_head(n=TOPN_TAB) %>%
  mutate(pos=row_number()) %>% ungroup() %>%
  mutate(system=factor(system,levels=names(SYS_COL)),
         iso2=tolower(countrycode(country_iso3,"iso3c","iso2c")),
         lab=vapply(uni_name, short_name, ""))
yrs <- tab %>% group_by(system) %>% summarise(y=max(.data[["year"]]),.groups="drop")
hdr <- function(s) sprintf("%s  (%d)", s, yrs$y[yrs$system==s])
colx <- c(ARWU=0.55, QS=2.15, THE=3.75); colw <- 1.55
tab <- tab %>% mutate(x=colx[as.character(system)], y=-pos)
stripe <- data.frame(pos=seq(2,TOPN_TAB,by=2))
ggplot(tab) +
  geom_rect(data=stripe, aes(xmin=-0.45,xmax=colx["THE"]+colw,ymin=-pos-0.5,ymax=-pos+0.5),
            fill="grey94", inherit.aes=FALSE) +
  geom_text(data=distinct(tab,pos,y), aes(x=-0.35,y=y,label=pos), fontface="bold", colour="grey55", size=4) +
  ggflags::geom_flag(aes(x=x,y=y,country=iso2), size=6) +
  geom_text(aes(x=x+0.18,y=y,label=lab), hjust=0, size=3.7) +
  annotate("text", x=colx, y=0.3, label=vapply(names(colx),hdr,""), fontface="bold", hjust=0, size=4.6) +
  annotate("segment", x=-0.45, xend=colx["THE"]+colw, y=-0.5, yend=-0.5, colour="grey30") +
  scale_country(guide="none") +
  coord_cartesian(xlim=c(-0.55,colx["THE"]+colw), ylim=c(-TOPN_TAB-0.7,1.0), clip="off") +
  labs(title="The three systems agree on the world's top universities",
       subtitle=sprintf("Each system's own top %d, latest year.", TOPN_TAB), x=NULL, y=NULL) +
  theme_void(base_size=12) + theme(plot.title=element_text(face="bold"))

Counts by country differ across systems

short_country <- function(iso3){ nm <- countrycode(iso3,"iso3c","country.name.en")
  ov <- c(USA="USA",GBR="UK",KOR="South Korea",HKG="Hong Kong",TWN="Taiwan",CHN="China",RUS="Russia")
  ifelse(iso3 %in% names(ov), ov[iso3], nm) }
cnt_sys <- rankings_long %>% group_by(system) %>% filter(year==max(year)) %>% ungroup() %>%
  filter(!is.na(rank), as.numeric(rank)<=200, !is.na(country_iso3)) %>% count(system, country_iso3, name="count")
totals <- cnt_sys %>% group_by(country_iso3) %>% summarise(tot=sum(count),.groups="drop") %>%
  arrange(desc(tot)) %>% slice_head(n=15) %>% mutate(idx=rev(row_number()))
dat <- totals %>% select(country_iso3, idx) %>% crossing(system=names(SYS_COL)) %>%
  left_join(cnt_sys, by=c("country_iso3","system")) %>%
  mutate(count=replace_na(count,0L), system=factor(system,levels=names(SYS_COL)),
         off=c(ARWU=0.26,QS=0,THE=-0.26)[as.character(system)], ypos=idx+off)
meta <- totals %>% mutate(iso2=tolower(countrycode(country_iso3,"iso3c","iso2c")), cname=short_country(country_iso3))
xmax<-max(dat$count); gut<-xmax*0.42
ggplot(dat) +
  geom_col(aes(x=count,y=ypos,fill=system), orientation="y", width=0.24) +
  ggflags::geom_flag(data=meta, aes(x=-gut,y=idx,country=iso2), size=5.5) +
  geom_text(data=meta, aes(x=-gut+xmax*0.035,y=idx,label=cname), hjust=0, size=3.7) +
  scale_fill_manual(values=SYS_COL, breaks=names(SYS_COL), name=NULL) +
  scale_country(guide="none") +
  scale_x_continuous(limits=c(-gut,xmax*1.03), breaks=scales::breaks_width(20)(c(0,xmax)),
                     expand=expansion(mult=c(0,0.01))) +
  labs(title="Top-200 universities by country, by ranking system",
       subtitle="Each system's own top 200, counted per country.", x="number of top-200 universities", y=NULL) +
  theme(legend.position="top", legend.justification="left", panel.grid.major.y=element_blank(),
        panel.grid.minor=element_blank(), axis.text.y=element_blank(), plot.title=element_text(face="bold"))

Harmonising the systems: a university-g factor

We pool the published quantitative indicators (ARWU 6, QS 6, THE 5), matched by ROR ID on the universities present in all three systems.

A <- read_csv(file.path(rp,"ARWU_World_Rankings.csv"),show_col_types=FALSE,name_repair="unique")%>%latest()%>%addror("univNameEn")%>%
  transmute(ror_id,Alumni=num(Alumni),Award=num(Award),HiCi=num(HiCi),`N&S`=num(`N&S`),PUB=num(PUB),PCP=num(PCP))%>%group_by(ror_id)%>%slice(1)%>%ungroup()
Q <- read_csv(file.path(rp,"QS_World_Rankings.csv"),show_col_types=FALSE,name_repair="unique")%>%latest()%>%addror("University")%>%
  transmute(ror_id,`Acad Rep`=num(`Academic Reputation`),`Emp Rep`=num(`Employer Reputation`),`Cit/Fac`=num(`Citations Per Faculty`),`Fac/Stu`=num(`Faculty Student`),`Int Fac`=num(`International Faculty`),`Int Stu`=num(`International Students`))%>%group_by(ror_id)%>%slice(1)%>%ungroup()
T <- read_csv(file.path(rp,"THE_World_Rankings.csv"),show_col_types=FALSE,name_repair="unique")%>%latest()%>%addror("name")%>%
  transmute(ror_id,Research=num(scores_research),Citations=num(scores_citations),Teaching=num(scores_teaching),`Int Outlook`=num(scores_international_outlook),Industry=num(scores_industry_income))%>%group_by(ror_id)%>%slice(1)%>%ungroup()
m17 <- A%>%inner_join(Q,"ror_id")%>%inner_join(T,"ror_id")
X <- m17%>%select(-ror_id)%>%mutate(across(everything(),as.numeric)); X <- X[complete.cases(X),]
R <- cor(X)
sprintf("%d universities present in all three systems; %d indicators.", nrow(X), ncol(X))
## [1] "621 universities present in all three systems; 17 indicators."
sys_cols <- c(rep("#d1495b",6),rep("#b5860b",6),rep("#3f6fa3",5))
corrplot(R, method="color", order="original", col=COL2("RdBu",200), addCoef.col="grey25",
         number.cex=0.62, tl.col=sys_cols, tl.srt=45, tl.cex=1.0, cl.cex=0.85, mar=c(0,0,4,0))
title("Pooled ranking indicators correlate positively (n = 621)", line=2.6, cex.main=1.05)
n<-ncol(R); blk<-function(lo,hi,col) rect(lo-0.5,n-hi+0.5,hi+0.5,n-lo+0.5,border=col,lwd=3)
blk(1,6,"#d1495b"); blk(7,12,"#b5860b"); blk(13,17,"#3f6fa3")
mtext("ARWU",3,at=3.5,line=0.5,col="#d1495b",font=2,cex=1.05)
mtext("QS",3,at=9.5,line=0.5,col="#b5860b",font=2,cex=1.05)
mtext("THE",3,at=15,line=0.5,col="#3f6fa3",font=2,cex=1.05)

Every indicator loads positively on every other — a positive manifold — so a single general factor is justified.

Bifactor model

Xz <- as.data.frame(scale(X))
om <- omega(Xz, nfactors=5, fm="ml", rotate="oblimin", plot=FALSE)
sl <- as.data.frame(om$schmid$sl); fac <- c("g", grep("^F[0-9]\\*?$", colnames(sl), value=TRUE))
ord <- colnames(X); L <- sl[ord,fac]; L$indicator <- factor(rownames(L), levels=rev(ord))
long <- pivot_longer(L, all_of(fac), names_to="factor", values_to="loading") %>% mutate(factor=factor(factor,levels=fac))
relvar <- (colSums(as.matrix(sl[,fac])^2)/nrow(sl)); relvar <- relvar/sum(relvar)
omg <- c(om$omega.group["g","general"], om$omega.group[fac[-1],"group"])
foot <- data.frame(factor=factor(fac,levels=fac), pvar=sprintf("%.0f%%",100*relvar), pomg=sprintf("ω %.2f",omg))
ggplot(long, aes(factor, indicator, fill=loading)) +
  geom_tile(color="white", linewidth=.6) +
  geom_text(aes(label=sprintf("%.2f",loading)), size=3.1, color=ifelse(abs(long$loading)>0.55,"white","grey15")) +
  geom_vline(xintercept=1.5, linewidth=1.1, color="grey25") +
  annotate("segment",x=0.4,xend=6.6,y=11.5,yend=11.5,color="grey55",linetype=3) +
  annotate("segment",x=0.4,xend=6.6,y=5.5,yend=5.5,color="grey55",linetype=3) +
  annotate("segment",x=0.4,xend=6.6,y=0.4,yend=0.4,color="grey40") +
  geom_text(data=foot, aes(x=factor,y=-0.7,label=pvar), inherit.aes=FALSE, fontface="bold", size=3.2) +
  geom_text(data=foot, aes(x=factor,y=-1.8,label=pomg), inherit.aes=FALSE, fontface="bold", size=3.2, color="#555555") +
  scale_fill_gradient2(low="#b2182b",mid="white",high="#2166ac",midpoint=0,limits=c(-1,1),name="loading") +
  scale_x_discrete(position="top", labels=c("g\n(general)","F1\nLaureates","F2\nInter-\nnational","F3\nCitations\n& output","F4\nReputation","F5\nIndustry\nfunding")) +
  scale_y_discrete(expand=expansion(add=c(2.5,0.6))) + coord_cartesian(clip="off") +
  labs(title="Bifactor (Schmid-Leiman) loadings (n = 621)",
       subtitle=sprintf("General factor omega_h = %.2f (ECV = %.0f%%); footer = %%common-var and group omega per factor", om$omega_h, 100*relvar[1]),
       x=NULL, y=NULL) +
  theme(panel.grid=element_blank(), axis.text.x.top=element_text(face="bold"), plot.title=element_text(face="bold"))

Coverage: how many universities, and how much overlap?

mem <- function(tbl) tbl$ror_id[complete.cases(tbl)]
Acc<-mem(A); Qcc<-mem(Q); Tcc<-mem(T)
U <- function(...) length(Reduce(union, list(...))); I <- function(...) length(Reduce(intersect, list(...)))
coverage <- tibble(
  combination=c("ARWU","QS","THE","ARWU + QS","ARWU + THE","QS + THE","ARWU + QS + THE"),
  union_N=c(length(Acc),length(Qcc),length(Tcc),U(Acc,Qcc),U(Acc,Tcc),U(Qcc,Tcc),U(Acc,Qcc,Tcc)),
  intersection_N=c(length(Acc),length(Qcc),length(Tcc),I(Acc,Qcc),I(Acc,Tcc),I(Qcc,Tcc),I(Acc,Qcc,Tcc)))
coverage

The all-three intersection (621) is what the factor analysis above uses; the union of any-one (~2,000) is what we score next.

Scoring the union with FIML

A single-factor model fit by FIML uses each university’s available indicators and returns a score for every university, including partial-data ones. We use the quality-core indicators (internationalisation / industry dropped, as they index national context rather than quality).

zc <- function(d) d %>% mutate(across(-ror_id, ~ as.numeric(scale(.))))
A2 <- read_csv(file.path(rp,"ARWU_World_Rankings.csv"),show_col_types=FALSE,name_repair="unique")%>%latest()%>%addror("univNameEn")%>%
  transmute(ror_id,a_alumni=num(Alumni),a_award=num(Award),a_hici=num(HiCi),a_ns=num(`N&S`),a_pub=num(PUB),a_pcp=num(PCP))%>%group_by(ror_id)%>%slice(1)%>%ungroup()%>%zc()
Q2 <- read_csv(file.path(rp,"QS_World_Rankings.csv"),show_col_types=FALSE,name_repair="unique")%>%latest()%>%addror("University")%>%
  transmute(ror_id,q_acrep=num(`Academic Reputation`),q_emprep=num(`Employer Reputation`),q_citfac=num(`Citations Per Faculty`),q_facstu=num(`Faculty Student`))%>%group_by(ror_id)%>%slice(1)%>%ungroup()%>%zc()
T2 <- read_csv(file.path(rp,"THE_World_Rankings.csv"),show_col_types=FALSE,name_repair="unique")%>%latest()%>%addror("name")%>%
  transmute(ror_id,t_research=num(scores_research),t_cit=num(scores_citations),t_teach=num(scores_teaching))%>%group_by(ror_id)%>%slice(1)%>%ungroup()%>%zc()
hA<-A2[complete.cases(A2),]; hQ<-Q2[complete.cases(Q2),]; hT<-T2[complete.cases(T2),]
allror <- Reduce(union, list(hA$ror_id,hQ$ror_id,hT$ror_id))
W <- tibble(ror_id=allror) %>% left_join(hA,"ror_id") %>% left_join(hQ,"ror_id") %>% left_join(hT,"ror_id")
ind <- setdiff(names(W),"ror_id")
fit <- cfa(paste("g =~", paste(ind,collapse=" + ")), data=as.data.frame(W[,ind]), missing="fiml", std.lv=TRUE)
gscore <- as.numeric(lavPredict(fit)[,"g"])
mz <- rowMeans(as.matrix(W[,ind]), na.rm=TRUE); if(cor(gscore,mz)<0) gscore <- -gscore
unig <- tibble(ror_id=W$ror_id, g=gscore) %>% left_join(nm2c,"ror_id")
sprintf("Scored %d universities; %d mapped to a country.", nrow(unig), sum(!is.na(unig$country_iso3)))
## [1] "Scored 2044 universities; 2044 mapped to a country."

National IQ, population, and the count of top universities

gu <- unig %>% filter(!is.na(country_iso3)) %>% arrange(desc(g)) %>% mutate(grank=row_number())
agg <- gu %>% group_by(country_iso3) %>% summarise(mean_g=mean(g), max_g=max(g), n_uni=n(), .groups="drop")
base <- cov %>% filter(!is.na(national_iq), population>0) %>%
  select(country_iso3, niq=national_iq, population) %>%
  left_join(agg,"country_iso3") %>% mutate(z=as.numeric(scale(niq)))

Population inflates counts more as the threshold widens

Nmax<-nrow(gu); grid<-sort(unique(c(seq(25,2000,by=25),Nmax)))
curve <- purrr::map_dfr(grid, function(N){
  cn <- gu[seq_len(min(N,Nmax)),] %>% count(country_iso3,name="count")
  d <- base %>% select(country_iso3,population) %>% left_join(cn,"country_iso3") %>%
    mutate(count=replace_na(count,0L), lp=log(population))
  d2 <- base %>% select(country_iso3,niq) %>% inner_join(d,"country_iso3")
  tibble(N=N, count=cor(d2$count,d2$lp), `log(count)`=cor(log1p(d2$count),d2$lp),
         `rank (Spearman)`=cor(d2$count,d2$lp,method="spearman"))
})
lc <- pivot_longer(curve,-N,names_to="measure",values_to="r") %>%
  mutate(measure=factor(measure,levels=c("count","log(count)","rank (Spearman)")))
ggplot(lc, aes(N,r,color=measure)) + geom_hline(yintercept=0,color="grey80") +
  geom_line(linewidth=1.1) + scale_x_log10(breaks=c(25,50,100,200,500,1000,2000)) +
  scale_y_continuous(limits=c(0,0.65)) +
  scale_color_manual(values=c("count"="#d1495b","log(count)"="#3f6fa3","rank (Spearman)"="#5a8f5a"),name=NULL) +
  labs(title="Population drives counts more as the 'top' threshold widens",
       subtitle="Correlation of country top-N count (FIML g) with log(population), 214 countries",
       x="“top N” threshold (log scale)", y="correlation with log(population)") +
  theme(legend.position=c(0.16,0.84), legend.background=element_rect(fill="white",color=NA),
        panel.grid.minor=element_blank(), plot.title=element_text(face="bold"))

Absolute vs per-capita (top 200)

pc <- gu %>% slice_head(n=200) %>% count(country_iso3, name="count") %>%
  left_join(base%>%select(country_iso3,population),"country_iso3") %>%
  mutate(per_million=count/(population/1e6), cname=short_country(country_iso3)) %>% arrange(desc(per_million))
prep <- function(metric) pc %>% arrange(desc(.data[[metric]])) %>% slice_head(n=15) %>%
  mutate(val=.data[[metric]], iso2=tolower(countrycode(country_iso3,"iso3c","iso2c")), idx=rev(row_number()))
dA <- prep("count") %>% mutate(lab=paste0(cname,"  ",count), faint=FALSE)
dB <- prep("per_million") %>% mutate(lab=sprintf("%s  %.2f (%d)",cname,per_million,count), faint=count<=1)
barP <- function(d,title,xlab,fill){ xmax<-max(d$val); pad<-xmax*0.02
  ggplot(d, aes(y=idx)) + geom_col(aes(x=val,fill=faint),width=.72,orientation="y") +
    ggflags::geom_flag(aes(x=val+pad*1.5,country=iso2),size=4.8) +
    geom_text(aes(x=val+pad*3.6,label=lab,color=faint),hjust=0,size=3.3) +
    scale_fill_manual(values=c(`FALSE`=fill,`TRUE`="#b8c4d0"),guide="none") +
    scale_color_manual(values=c(`FALSE`="grey10",`TRUE`="grey55"),guide="none") +
    scale_country(guide="none") + scale_x_continuous(expand=expansion(mult=c(0,0.62)),limits=c(0,xmax*1.55)) +
    labs(title=title,x=xlab,y=NULL) + theme(legend.position="none",panel.grid.major.y=element_blank(),
      panel.grid.minor=element_blank(),axis.text.y=element_blank(),plot.title=element_text(face="bold",size=13)) }
(barP(dA,"Most in total (count)","top-200 universities","#4878a8") |
 barP(dB,"Most per capita (per million)","per million population","#2e7d57")) +
  plot_annotation(title="Top-200 universities by country: absolute vs. per capita",
                  theme=theme(plot.title=element_text(face="bold",size=15)))

Three ways to relate university quality to NIQ

1. Mean uni-g (no population term)

o <- base %>% filter(n_uni>=1) %>% mutate(z_lpop=as.numeric(scale(log(population))),
                                          iso2=tolower(countrycode(country_iso3,"iso3c","iso2c")))
r_mean <- cor(o$niq,o$mean_g)
ggplot(o, aes(niq,mean_g)) + geom_hline(yintercept=0,color="grey88") +
  geom_smooth(method="lm",formula=y~x,color="#b2182b",fill="grey82") +
  ggflags::geom_flag(aes(country=iso2,size=n_uni)) +
  scale_size(range=c(2.2,9),name="# universities",breaks=c(1,5,20,50)) + scale_country(guide="none") +
  annotate("label",x=min(o$niq),y=max(o$mean_g),hjust=0,vjust=1,size=4.3,fontface="bold",
           label=sprintf("r = %.2f   R² = %.2f   n = %d", r_mean, r_mean^2, nrow(o))) +
  labs(title="Mean university quality vs national IQ",
       subtitle="Flag size = number of ranked universities (small-n countries are noisy / truncation-biased)",
       x="National IQ", y="Mean university g") +
  theme(plot.title=element_text(face="bold"), panel.grid.minor=element_blank())

2. Max uni-g (best university), controlling for population

m_max <- lm(max_g ~ z + z_lpop, o)
b <- coef(m_max)[["z"]]; o$pres <- resid(m_max) + b*o$z
rp_max <- cor(o$pres, o$niq)
ggplot(o, aes(niq,pres)) + geom_smooth(method="lm",formula=y~x,color="#b2182b",fill="grey82") +
  ggflags::geom_flag(aes(country=iso2),size=4.3) + scale_country(guide="none") +
  annotate("label",x=min(o$niq),y=max(o$pres),hjust=0,vjust=1,size=4.2,fontface="bold",
           label=sprintf("partial r = %.2f   partial R² = %.2f   n = %d", rp_max, rp_max^2, nrow(o))) +
  labs(title="A country's best university vs NIQ, controlling for population",
       subtitle="Partial-residual plot from  max uni-g ~ NIQ + log(population)",
       x="National IQ", y="Best-university g (population-adjusted)") +
  theme(plot.title=element_text(face="bold"), panel.grid.minor=element_blank())

summary(m_max)$coefficients
##                Estimate Std. Error    t value     Pr(>|t|)
## (Intercept) -0.02773564  0.1144988 -0.2422353 8.090842e-01
## z            1.29913619  0.1142077 11.3752092 7.517615e-20
## z_lpop       0.83403055  0.1004237  8.3051183 4.433726e-13

3. Count model (negative binomial, top 200, offset = log population)

N <- 200
cn <- gu %>% filter(grank<=N) %>% count(country_iso3, name="cnt")
d200 <- base %>% left_join(cn,"country_iso3") %>% mutate(cnt=replace_na(cnt,0L))
nb  <- glm.nb(cnt ~ z + offset(log(population)), d200)
nb0 <- glm.nb(cnt ~ 1 + offset(log(population)), d200)
sd_niq <- sd(d200$niq); f15 <- 15/sd_niq
tibble(term="NIQ",
  IRR_per_natSD = exp(coef(nb)[["z"]]),
  IRR_per_15pts = exp(coef(nb)[["z"]]*f15),
  theta = nb$theta,
  pseudoR2 = as.numeric(1 - logLik(nb)/logLik(nb0)),
  national_IQ_SD = sd_niq)

Negative binomial (not Poisson — variance/mean = 23; not zero-inflated — NB already reproduces the 167 zeros). Visualised as the empirical-Bayes-shrunk per-capita rate, which is defined for every country:

size <- nb0$theta; mu <- exp(unname(coef(nb0)[1]))
d200 <- d200 %>% mutate(eb=(cnt+size)/(population+size/mu)*1e6, haveuni=cnt>0,
                        iso2=tolower(countrycode(country_iso3,"iso3c","iso2c")))
nz<-filter(d200,haveuni); zr<-filter(d200,!haveuni); rr<-cor(d200$niq,log(d200$eb))
ggplot(d200, aes(niq,eb)) + geom_smooth(method="lm",formula=y~x,color="#b2182b",fill="grey82") +
  geom_point(data=zr,color="grey55",size=1.9,alpha=0.6) +
  ggflags::geom_flag(data=nz, aes(country=iso2), size=4.3) + scale_country(guide="none") +
  scale_y_log10(breaks=c(0.001,0.01,0.1,1)) +
  annotate("label",x=min(d200$niq),y=0.6,hjust=0,vjust=1,size=4,fontface="bold",fill="white",
           label=sprintf("r = %.2f   R² = %.2f", rr, rr^2)) +
  labs(title="EB-shrunk top-200 rate vs national IQ",
       subtitle="Grey = 0 observed (shrunk to a population-dependent floor); flags = ≥1 top-200 university",
       x="National IQ", y="EB-shrunk top-200 per million (log scale)") +
  theme(plot.title=element_text(face="bold"), panel.grid.minor=element_blank())

Summary of the NIQ effect

tibble(
  approach = c("Mean uni-g (no pop)","Max uni-g (| log pop)","Per-capita count (NB + offset)"),
  metric   = c("R²","partial R²","McFadden pseudo-R²"),
  value    = c(round(r_mean^2,2), round(rp_max^2,2), round(as.numeric(1-logLik(nb)/logLik(nb0)),2)),
  effect_per_15_IQ_points = c(sprintf("+%.2f g", coef(m_max)[["z"]]*0 + cor(o$niq,o$mean_g)*0 + coef(lm(mean_g~z,o))[["z"]]*f15),
                              sprintf("+%.2f g", coef(m_max)[["z"]]*f15),
                              sprintf("IRR = %.0f×", exp(coef(nb)[["z"]]*f15))))

Across all three outcome definitions, national IQ is a strong predictor of where the world’s top universities are and how good they are. Max uni-g, with population as a covariate, gives the cleanest signal (partial R² ≈ 0.56) and is immune to the truncation bias that inflates mean uni-g for poorly-covered countries. The count model (IRR ≈ 50× per +15 IQ points) handles all countries, including the ~85% with zero top-200 universities.

## Rendered 2026-06-05 05:24:24
## [1] "R version 4.6.0 (2026-04-24)"