Introduction

Briefly put, this post visualizes the distributions and time trends of human accomplishment by means of significant figures. Who are the significant figures? To compile a list of such persons, one begins by collecting a number of well-regarded reference works on the field one wants to identify significant figures in, be it science or art. One then finds all the persons mentioned in the works and assigns them ratings that reflect their importance in those works. The rating can be the number of references, the number of paragraphs devoted to the person, or the number of pieces of art reproduced. Aggregating over the sources, it is possible to construct a combined score for each person in a given field. For more details, read the book.
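As a toy illustration of the aggregation step (the column names and scaling below are made up for the example, not Murray's exact method; the dplyr verbs are loaded in the next section), one could scale each source's counts to a common range and then average across sources:

#hypothetical raw counts: one row per person x reference work
toy = tibble::tribble(
  ~person, ~source, ~n_pages,
  "A", "ref1", 12,
  "B", "ref1", 3,
  "A", "ref2", 7,
  "B", "ref2", 7
)

toy %>%
  #scale within each source so all sources weigh equally
  group_by(source) %>%
  mutate(score = 100 * n_pages / max(n_pages)) %>%
  #combined index = mean score across sources
  group_by(person) %>%
  summarise(index = mean(score))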

Packages

library(pacman)
#purrr, stringr, magrittr and ggplot2 are also needed below (map_chr, str_*, %$%, ggplot)
p_load(kirkegaard, readr, dplyr, purrr, stringr, magrittr, ggplot2, doParallel, foreach, tidyr)
options(digits = 2)

Load HA data, fix names

The data has been made publicly available by Charles Murray. (Note: OSF down for maintenance as of writing.)

#load data
d = read_csv("HA.csv")

d$LNFM = d$Name
d$Name = NULL
#convert "last name, first name" (LNFM) to "first name last name" (FNLN)
d$FNLN = map_chr(d$LNFM, function(x) {
  #names without a comma (e.g. mononyms) are left as is
  if(!str_detect(x, ",")) return(x)
  
  #split into the parts before and after the comma
  y = str_match(x, "(.*),(.*)")
  
  #reassemble in first-last order
  str_c(y[, 3], y[, 2], sep = " ") %>% str_trim()
})
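To sanity check the conversion, here is the same logic wrapped in a small helper and applied to made-up inputs (the helper exists only for illustration):

#hypothetical helper wrapping the same logic as above
fix_name = function(x) {
  if (!str_detect(x, ",")) return(x)
  y = str_match(x, "(.*),(.*)")
  str_c(y[, 3], y[, 2], sep = " ") %>% str_trim()
}

fix_name("Murray, Charles") #"Charles Murray"
fix_name("Aristotle")       #no comma, returned as is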

Code ethnicity

Some readers complained about the lack of (insert favorite group). I used all the available data and aggregated the ethnicities into three major groups to avoid visual clutter.

#basic counts per ethnicity
table2(d$Ethnicity, include_NA = F) %>% arrange(desc(Count)) %>% as.data.frame
##            Group Count Percent
## 1       Germanic   592  14.793
## 2         French   565  14.118
## 3        English   441  11.019
## 4        Italian   397   9.920
## 5            USA   276   6.897
## 6        Chinese   240   5.997
## 7         Jewish   184   4.598
## 8       Japanese   170   4.248
## 9  Ancient Greek   147   3.673
## 10        Slavic   115   2.874
## 11         Dutch   101   2.524
## 12        Indian    97   2.424
## 13        Arabic    94   2.349
## 14       Flemish    83   2.074
## 15         Scots    76   1.899
## 16       Spanish    69   1.724
## 17 Ancient Roman    59   1.474
## 18       Swedish    45   1.124
## 19         Czech    39   0.975
## 20        Danish    35   0.875
## 21         Irish    32   0.800
## 22     Norwegian    23   0.575
## 23        Polish    19   0.475
## 24        Latino    17   0.425
## 25     Hungarian    13   0.325
## 26      Canadian    11   0.275
## 27    Portuguese    11   0.275
## 28         Greek    10   0.250
## 29       Finnish     7   0.175
## 30    Australian     5   0.125
## 31         Black     5   0.125
## 32      Croatian     5   0.125
## 33      Romanian     5   0.125
## 34   New Zealand     3   0.075
## 35         Swiss     3   0.075
## 36        Basque     2   0.050
## 37     Icelandic     2   0.050
## 38     Slovenian     2   0.050
## 39     Bulgarian     1   0.025
## 40      Estonian     1   0.025
## 41          <NA>     0   0.000
#aggregate ethnicities
euro_ethnicities = c("Ancient Greek",
                     "Ancient Roman",
                     "Australian",
                     "Basque",
                     "Bulgarian",
                     "Canadian",
                     "Croatian",
                     "Germanic",
                     "French",
                     "English",
                     "Italian",
                     "USA",
                     "Slavic",
                     "Jewish",
                     "Dutch",
                     "Flemish",
                     "Scots",
                     "Spanish",
                     "Swedish",
                     "Irish",
                     "Norwegian",
                     "Polish",
                     "Hungarian",
                     "Portuguese",
                     "Greek",
                     "Finnish",
                     "Romanian",
                     "Swiss",
                     "New Zealand",
                     "Slovenian",
                     "Icelandic",
                     "Estonian")

asian_ethnicities = c("Japanese", "Chinese")

#code
d$european = (d$Ethnicity %in% euro_ethnicities) %>% as.numeric
d$asian = (d$Ethnicity %in% asian_ethnicities) %>% as.numeric
d$other = 1 - (d$european + d$asian)
d$macro_ethnicity = map_chr(d$Ethnicity, function(x) {
  if (x %in% euro_ethnicities) return("European")
  if (x %in% asian_ethnicities) return("Asian")
  "Other"
})
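The same coding can be done in a single vectorized pass with dplyr's case_when instead of looping over elements; a sketch:

#equivalent vectorized coding (NA ethnicities also fall through to "Other")
d = d %>% mutate(
  macro_ethnicity = case_when(
    Ethnicity %in% euro_ethnicities ~ "European",
    Ethnicity %in% asian_ethnicities ~ "Asian",
    TRUE ~ "Other"
  )
)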

#long form version for plotting
# d_long = d %>% select(Birth, macro_ethnicity, european, asian, other)
#no longer used

#counts per year for absolute plot
d_year = map_df(d$Birth %>% unique, function(yr) {
  tmp = d %>% dplyr::filter(Birth == yr)
  data_frame(european = sum(tmp$european),
             asian = sum(tmp$asian),
             other = sum(tmp$other),
             Birth = yr
             )
})
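The same per-year counts can also be had without the explicit loop, via group_by/summarise; a sketch that should match the table above (up to row order and the handling of missing birth years):

#equivalent aggregation with dplyr
d_year_alt = d %>%
  group_by(Birth) %>%
  summarise(european = sum(european),
            asian = sum(asian),
            other = sum(other))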

Plot trend

How were the significant figures distributed across the ethnic groups, and did this change over time? We can visualize it in both relative and absolute terms to find out.

#Relative trend
ggplot(d, aes(Birth, european)) +
  geom_smooth(color = "blue") +
  geom_smooth(aes(y = asian), color = "yellow") +
  geom_smooth(aes(y = other), color = "brown") +
  scale_y_continuous(labels = scales::percent, name = "Significant figures %", breaks = seq(0, 1, .2)) +
  scale_x_continuous(name = "Year of birth\nBlue = European, Yellow = Asian, Brown = Other", breaks = seq(-1000, 2000, 200)) +
  #add arrow for dark ages
  geom_segment(aes(x = 1300, xend = 1050, y = .15, yend = .15), arrow = arrow(length = unit(0.03, "npc"))) +
  geom_text(aes(label = "Dark ages", x = 1500, y = .15)) +
  theme_bw()

ggsave("figures/euro_trend.png")

#absolute trend
ggplot(d_year, aes(Birth)) +
  geom_smooth(aes(y = european), color = "blue") +
  geom_smooth(aes(y = asian), color = "yellow") +
  geom_smooth(aes(y = other), color = "brown") +
  scale_y_continuous(name = "Significant figures born per year, absolute numbers") +
  scale_x_continuous(name = "Year of birth\nBlue = European, Yellow = Asian, Brown = Other", breaks = seq(-1000, 2000, 200)) +
  #add arrow for dark ages
  geom_segment(aes(x = 700, xend = 700, y = 4, yend = 2), arrow = arrow(length = unit(0.03, "npc"))) +
  geom_text(aes(label = "Dark ages", x = 700, y = 4)) +
  theme_bw()

ggsave("figures/euro_trend_abs.png")

Realistic error bars

In the analysis of the percentages of significant figures from each group, the error bars were generated in the usual way, that is, without constraints on the possible values. As a result, some of the error bars extend into impossible negative values. Error bars should not always be taken literally, but rather as guides to the uncertainty in the results. If we want more realistic error bars, we need to calculate them another way.

Bootstrapped error bars

A simple approach to calculating error bars is to resample the data repeatedly. For a very simple and fast model, one can calculate the proportions for each year of birth. To avoid excessive sampling error, one can include datapoints ±n years on each side. In the early ages there are very few datapoints, so small values of n will result in excessive missing-data problems. Below I use the value 100, so the frequencies are 201-year moving averages.
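To see why small windows are a problem in the early period, one can count the datapoints falling in a ±100-year window at different points in time (a quick check; output not shown):

#window density: datapoints within ±100 years of -500 vs. 1800
sum(kirkegaard::is_between(d$Birth, -600, -400), na.rm = TRUE)
sum(kirkegaard::is_between(d$Birth, 1700, 1900), na.rm = TRUE)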

set.seed(1)
#how many times to resample
n_samples = 1000

#function
get_freqs = function(df, year_width, run = NA) {
  #subset and resample
  df = sample_n(df[c("Birth", "european", "asian", "other")], size = nrow(df), replace = T)
  
  #freqs
  freqs = purrr::map_df(min(df$Birth, na.rm=T):max(df$Birth, na.rm=T), function(y) {
    df %>% 
      #subset to relevant datapoints
      dplyr::filter(kirkegaard::is_between(Birth, y - year_width, y + year_width)) %$%
      data_frame(
        year = y,
        european = mean(european),
        asian = mean(asian),
        other = mean(other),
        run = run
      )
    
  })
  
  freqs
}

#parallel computing
cl = makeCluster(detectCores(), type='PSOCK')
registerDoParallel(cl)
#export variables and functions to the workers (note: get_freqs also uses %>%, so export it too)
clusterExport(cl, varlist=c("d", "sample_n", "%>%", "%$%", "data_frame"))

#replicate
# freqs_w = replicate(n = n_samples, expr = get_freqs(d, 100)) #single thread version
freqs_w = foreach(i = 1:n_samples) %dopar% get_freqs(d, year_width = 100, run = i)
#gives error every second time it is run (???)
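The intermittent error may come from the workers lacking attached packages; an alternative worth trying (an untested sketch, mirroring the commented-out single-thread version above) is to let foreach load the packages on each worker via its .packages argument:

#alternative: have foreach attach the needed packages on each worker
# freqs_w = foreach(i = 1:n_samples, .packages = c("dplyr", "purrr", "magrittr", "kirkegaard")) %dopar%
#   get_freqs(d, year_width = 100, run = i)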

#rbind
freqs_w_long = bind_rows(freqs_w) %>% 
  #filter NaN rows due to 0 persons in a window
  miss_filter()

#shut down the parallel workers now that resampling is done
stopCluster(cl)

#calculate 90% confidence intervals as the 5th and 95th percentiles of the bootstrap distribution
#(for actual 95% intervals, use .025 and .975 instead)
freqs_w_sum = freqs_w_long %>% group_by(year) %>% summarize(
  european_mean = mean(european),
  european_upper = quantile(european, .95),
  european_lower = quantile(european, .05),
  asian_mean = mean(asian),
  asian_upper = quantile(asian, .95),
  asian_lower = quantile(asian, .05),
  other_mean = mean(other),
  other_upper = quantile(other, .95),
  other_lower = quantile(other, .05)
  )

#plot as is
#because getting to proper long format is difficult! (a reshaping sketch follows below)
ggplot(freqs_w_sum, aes(year)) +
  geom_line(aes(y = european_mean), color = "blue", size = 1) +
  geom_line(aes(y = european_upper), color = "blue", linetype = "dashed", alpha = .5) +
  geom_line(aes(y = european_lower), color = "blue", linetype = "dashed", alpha = .5) +
  geom_line(aes(y = asian_mean), color = "yellow", size = 1) +
  geom_line(aes(y = asian_upper), color = "yellow", linetype = "dashed", alpha = .5) +
  geom_line(aes(y = asian_lower), color = "yellow", linetype = "dashed", alpha = .5) +
  geom_line(aes(y = other_mean), color = "brown", size = 1) +
  geom_line(aes(y = other_upper), color = "brown", linetype = "dashed", alpha = .5) +
  geom_line(aes(y = other_lower), color = "brown", linetype = "dashed", alpha = .5) +
  xlab("Year of birth") +
  scale_y_continuous(name = "Significant figures %", labels = scales::percent) +
  theme_bw()

ggsave("figures/bootstrapped_ci.png")
## Saving 7 x 5 in image
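For the record, the long format is attainable with two tidyr steps (assuming tidyr >= 1.0.0), after which the nine geom_line calls collapse into one line layer plus a ribbon; a sketch:

#reshape: european_mean etc. -> one row per year x group, with mean/upper/lower columns
freqs_w_tidy = freqs_w_sum %>%
  pivot_longer(-year, names_to = c("group", "stat"), names_sep = "_") %>%
  pivot_wider(names_from = stat, values_from = value)

ggplot(freqs_w_tidy, aes(year, mean, color = group)) +
  geom_line(size = 1) +
  geom_ribbon(aes(ymin = lower, ymax = upper, fill = group), alpha = .2, color = NA) +
  scale_color_manual(values = c(european = "blue", asian = "yellow", other = "brown")) +
  scale_fill_manual(values = c(european = "blue", asian = "yellow", other = "brown")) +
  scale_y_continuous(name = "Significant figures %", labels = scales::percent) +
  xlab("Year of birth") +
  theme_bw()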