Briefly put, this visualizes the distributions and time trends of human accomplishment by means of significant figures. Who are the significant figures? To compile a list of such persons, one begins by compiling a number of well-regarded reference works on the field one wants to identify significant figures in, be it science or art. Then one finds all the persons mentioned in the books and assigns them ratings in various ways that reflect their importance in the works. This can be number of references, number of paragraphs devoted to the person, or number of pieces of art reproduce. Aggregating over the sources, it is possible to construct a combined score for each person in a given field. For more details, read the book.
library(pacman)
p_load(kirkegaard, readr, dplyr, doParallel, foreach, tidyr)
options(digits = 2)
The data has been made publicly available by Charles Murray. (Note: OSF down for maintenance as of writing.)
#load data
d = read_csv("HA.csv")
d$LNFM = d$Name
d$Name = NULL
d$FNLN = map_chr(d$LNFM, function(x) {
#detect comma
if(!str_detect(x, ",")) return(x)
#match both parts
y = str_match(x, "(.*),(.*)")
# browser()
#add together again
str_c(y[, 3], y[, 2], sep = " ") %>% str_trim()
})
Some complained about the lack of (insert favorite group). I used all the data available and aggregated them into 3 major groups to avoid visual clutter.
#basic counts per ethnicity
table2(d$Ethnicity, include_NA = F) %>% arrange(desc(Count)) %>% as.data.frame
## Group Count Percent
## 1 Germanic 592 14.793
## 2 French 565 14.118
## 3 English 441 11.019
## 4 Italian 397 9.920
## 5 USA 276 6.897
## 6 Chinese 240 5.997
## 7 Jewish 184 4.598
## 8 Japanese 170 4.248
## 9 Ancient Greek 147 3.673
## 10 Slavic 115 2.874
## 11 Dutch 101 2.524
## 12 Indian 97 2.424
## 13 Arabic 94 2.349
## 14 Flemish 83 2.074
## 15 Scots 76 1.899
## 16 Spanish 69 1.724
## 17 Ancient Roman 59 1.474
## 18 Swedish 45 1.124
## 19 Czech 39 0.975
## 20 Danish 35 0.875
## 21 Irish 32 0.800
## 22 Norwegian 23 0.575
## 23 Polish 19 0.475
## 24 Latino 17 0.425
## 25 Hungarian 13 0.325
## 26 Canadian 11 0.275
## 27 Portuguese 11 0.275
## 28 Greek 10 0.250
## 29 Finnish 7 0.175
## 30 Australian 5 0.125
## 31 Black 5 0.125
## 32 Croatian 5 0.125
## 33 Romanian 5 0.125
## 34 New Zealand 3 0.075
## 35 Swiss 3 0.075
## 36 Basque 2 0.050
## 37 Icelandic 2 0.050
## 38 Slovenian 2 0.050
## 39 Bulgarian 1 0.025
## 40 Estonian 1 0.025
## 41 <NA> 0 0.000
#aggregate ethnicities
euro_ethnicities = c("Ancient Greek",
"Ancient Roman",
"Australian",
"Basque",
"Bulgarian",
"Canadian",
"Croatian",
"Germanic",
"French",
"English",
"Italian",
"USA",
"Slavic",
"Jewish",
"Dutch",
"Flemish",
"Scots",
"Spanish",
"Swedish",
"Irish",
"Norwegian",
"Polish",
"Hungarian",
"Portuguese",
"Greek",
"Finnish",
"Romanian",
"Swiss",
"New Zealand",
"Slovenian",
"Icelandic",
"Estonian")
asian_ethnicities = c("Japanese", "Chinese")
#code
d$european = (d$Ethnicity %in% euro_ethnicities) %>% as.numeric
d$asian = (d$Ethnicity %in% asian_ethnicities) %>% as.numeric
d$other = 1 - (d$european + d$asian)
d$macro_ethnicity = map_chr(d$Ethnicity, function(x) {
if (x %in% euro_ethnicities) return("European")
if (x %in% asian_ethnicities) return("Asian")
"Other"
})
#long form version for plotting
# d_long = d %>% select(Birth, macro_ethnicity, european, asian, other)
#no longer used
#counts per year for absolute plot
d_year = map_df(d$Birth %>% unique, function(yr) {
tmp = d %>% dplyr::filter(Birth == yr)
data_frame(european = sum(tmp$european),
asian = sum(tmp$asian),
other = sum(tmp$other),
Birth = yr
)
})
How were the significant figures distributed with regards to the ethnic groups? Did this change over time? We can visualize it both in relative terms and in absolute terms to find out.
#Relative trend
ggplot(d, aes(Birth, european)) +
geom_smooth(color = "blue") +
geom_smooth(aes(y = asian), color = "yellow") +
geom_smooth(aes(y = other), color = "brown") +
scale_y_continuous(labels = scales::percent, name = "Significant figures %", breaks = seq(0, 1, .2)) +
scale_x_continuous(name = "Year of birth\nBlue = European, Yellow = Asian, Brown = Other", breaks = seq(-1000, 2000, 200)) +
#add arrow for dark ages
geom_segment(aes(x = 1300, xend = 1050, y = .15, yend = .15), arrow = arrow(length = unit(0.03, "npc"))) +
geom_text(aes(label = "Dark ages", x = 1500, y = .15)) +
theme_bw()
ggsave("figures/euro_trend.png")
#absolute trend
ggplot(d_year, aes(Birth)) +
geom_smooth(aes(y = european), color = "blue") +
geom_smooth(aes(y = asian), color = "yellow") +
geom_smooth(aes(y = other), color = "brown") +
scale_y_continuous(name = "Significant figures born per year, absolute numbers") +
scale_x_continuous(name = "Year of birth\nBlue = European, Yellow = Asian, Brown = Other", breaks = seq(-1000, 2000, 200)) +
#add arrow for dark ages
geom_segment(aes(x = 700, xend = 700, y = 4, yend = 2), arrow = arrow(length = unit(0.03, "npc"))) +
geom_text(aes(label = "Dark ages", x = 700, y = 4)) +
theme_bw()
ggsave("figures/euro_trend_abs.png")
In the analysis of the percentages of significant figures from each group, errors bars were generated in the usual way meaning no assumptions about the numbers. As such, some of the error bars have impossible negative values. Error bars should not always be taken literally, but more as guidelines of uncertainty in the results. If we want more realistic error bars, we need to calculate them another way.
A simple approach to calculate error bars is to just resample the data repeatedly. For a very simple and fast model, one could calcualte the proportions for each year of birth. To avoid excessive sample error, one can include datapoints ±n years on each side. In the early ages, there are very few datapoints, so one small values of n will result in missing excessive data problems. In the below I use the value 75, so the frquencies are 151 year moving averages.
set.seed(1)
#how many times to resample
n_samples = 1000
#function
get_freqs = function(df, year_width, run = NA) {
#subset and resample
df = sample_n(df[c("Birth", "european", "asian", "other")], size = nrow(df), replace = T)
#freqs
freqs = purrr::map_df(min(df$Birth, na.rm=T):max(df$Birth, na.rm=T), function(y) {
df %>%
#subset to relevant datapoints
dplyr::filter(kirkegaard::is_between(Birth, y - year_width, y + year_width)) %$%
data_frame(
year = y,
european = mean(european),
asian = mean(asian),
other = mean(other),
run = run
)
})
freqs
}
#parallel computing
cl = makeCluster(detectCores(), type='PSOCK')
registerDoParallel(cl)
#export variables and functions to nodes
clusterExport(cl, varlist=c("d", "sample_n", "%$%", "data_frame"))
#replicate
# freqs_w = replicate(n = n_samples, expr = get_freqs(d, 100)) #single thread version
freqs_w = foreach(i = 1:n_samples) %dopar% get_freqs(d, year_width = 100, run = i)
#gives error every second time it is run (???)
#rbind
freqs_w_long = bind_rows(freqs_w) %>%
#filter NaN rows due to 0 persons
miss_filter()
#calculate 95% confidence intervals as the top 5% and lowest 5% of intervals
freqs_w_sum = freqs_w_long %>% group_by(year) %>% summarize(
european_mean = mean(european),
european_upper = quantile(european, .95),
european_lower = quantile(european, .05),
asian_mean = mean(asian),
asian_upper = quantile(asian, .95),
asian_lower = quantile(asian, .05),
other_mean = mean(other),
other_upper = quantile(other, .95),
other_lower = quantile(other, .05)
)
#plot as is
#because getting to proper long format is diffcult!
ggplot(freqs_w_sum, aes(year)) +
geom_line(aes(y = european_mean), color = "blue", size = 1) +
geom_line(aes(y = european_upper), color = "blue", linetype = "dashed", alpha = .5) +
geom_line(aes(y = european_lower), color = "blue", linetype = "dashed", alpha = .5) +
geom_line(aes(y = asian_mean), color = "yellow", size = 1) +
geom_line(aes(y = asian_upper), color = "yellow", linetype = "dashed", alpha = .5) +
geom_line(aes(y = asian_lower), color = "yellow", linetype = "dashed", alpha = .5) +
geom_line(aes(y = other_mean), color = "brown", size = 1) +
geom_line(aes(y = other_upper), color = "brown", linetype = "dashed", alpha = .5) +
geom_line(aes(y = other_lower), color = "brown", linetype = "dashed", alpha = .5) +
xlab("Year of birth") +
scale_y_continuous(name = "% significant figure", labels = scales::percent) +
theme_bw()
ggsave("figures/bootstrapped_ci.png")
## Saving 7 x 5 in image