Data and cleaning
# Read the Pantheon export into `pan`.
pan <- read_csv("data/Pantheon_2016_12_03.csv")
Parsed with column specification:
cols(
Rank = col_integer(),
Name = col_character(),
`Place of Birth*` = col_character(),
`Birth Year` = col_integer(),
Gender = col_character(),
Occupation = col_character(),
L = col_integer(),
`L*` = col_double(),
PV = col_character(),
PVe = col_character(),
PVne = col_character(),
`sPV` = col_character(),
HPI = col_double()
)
# Reassign all thirteen column names. Compared with the parsed CSV headers the
# only one that changes is `L*`, which becomes the syntactic name "Lstar".
names(pan) <- c(
  "Rank", "Name", "Place of Birth*", "Birth Year", "Gender", "Occupation",
  "L", "Lstar", "PV", "PVe", "PVne", "sPV", "HPI"
)

# Store the person's name under FNLN (the key used for the join further down)
# and drop the original Name column.
pan[["FNLN"]] <- pan[["Name"]]
pan[["Name"]] <- NULL

# Row positions of names that repeat an earlier row; these few duplicates are
# removed in the next step for now.
which(duplicated(pan[["FNLN"]]))
[1] 2813 3476 8242 8591 8611 8659 8885 8948 9231 9597 10332 10899
# Keep only the first row for every FNLN value.
pan <- dplyr::filter(pan, !duplicated(FNLN))

# Before converting PV to a number, check what share of its values contain
# the " M" suffix.
proportion_true(str_detect(pan$PV, " M"))
[1] 1
# Same check for sPV, whose values are expected to contain the " K" suffix.
proportion_true(str_detect(pan$sPV, " K"))
[1] 1
# Strip the unit suffix and convert the page-view columns to numeric. PV and
# sPV were checked above to carry the same suffix on every row.
# NOTE(review): PVe and PVne were not checked; any value with a different
# suffix would be turned into NA by as.numeric() (with a warning).
pan$PV %<>% str_replace(" M", "") %>% as.numeric()
pan$PVe %<>% str_replace(" M", "") %>% as.numeric()
pan$PVne %<>% str_replace(" M", "") %>% as.numeric()
pan$sPV %<>% str_replace(" K", "") %>% as.numeric()

# Merge Pantheon into the Human Accomplishment data (`d`) on exact name match.
# Joining the whole of `d` causes duplicates, so work per inventory.
#
# The inventory filter has to run BEFORE de-duplication: when both conditions
# are given to a single filter() call, duplicated(FNLN) is evaluated over all
# rows of `d`, so a scientist whose name first appears under another inventory
# would be dropped from the Science subset.
#
# The join key is spelled out so that a column name shared by accident between
# the two tables can never silently become part of the key.
sci <- d %>%
  dplyr::filter(Inventory == "Science") %>%
  dplyr::filter(!duplicated(FNLN)) %>%
  left_join(pan, by = "FNLN")
Joining, by = "FNLN"
# Matches: the joined table must not contain any repeated name.
assert_that(!any(duplicated(sci$FNLN)))
[1] TRUE
# How many scientists got a Pantheon match? HPI comes from `pan`, so it is
# missing for unmatched rows; reverse = TRUE presumably counts the non-missing
# values (confirm against count_NA's definition).
count_NA(sci$HPI, reverse = TRUE)
[1] 338
# The same count expressed as a fraction of all rows in `sci`.
count_NA(sci$HPI, reverse = TRUE) / nrow(sci)
[1] 0.25
EDA
# Columns compared against each other: the Human Accomplishment index plus the
# numeric Pantheon measures.
index_vars <- c(
  "Index", "Rank", "L", "Lstar", "PV", "PVe", "PVne", "sPV", "HPI"
)

# Correlation matrix of those columns, passed on to write_clipboard().
write_clipboard(wtd.cors(sci[index_vars]))
Index Rank L Lstar PV PVe PVne sPV HPI
Index 1.00 -0.45 0.59 -0.12 0.47 0.45 0.48 0.29 0.51
Rank -0.45 1.00 -0.73 -0.11 -0.45 -0.43 -0.44 -0.17 -0.93
L 0.59 -0.73 1.00 0.08 0.83 0.81 0.83 0.47 0.86
Lstar -0.12 -0.11 0.08 1.00 -0.03 -0.08 0.00 -0.03 0.09
PV 0.47 -0.45 0.83 -0.03 1.00 0.98 0.99 0.53 0.66
PVe 0.45 -0.43 0.81 -0.08 0.98 1.00 0.94 0.53 0.62
PVne 0.48 -0.44 0.83 0.00 0.99 0.94 1.00 0.52 0.67
sPV 0.29 -0.17 0.47 -0.03 0.53 0.53 0.52 1.00 0.25
HPI 0.51 -0.93 0.86 0.09 0.66 0.62 0.67 0.25 1.00
# Scatter plot: Human Accomplishment index against the number of Wikipedia
# language editions covering the person; points are labelled by FNLN.
GG_scatter(sci, "Index", "L", case_names_vector = "FNLN") +
  labs(
    x = "Human Accomplishment index of scientists (Charles Murray)",
    y = "Number of Wikipedias that cover this person\n(Pantheon dataset)"
  )
# No plot is passed, so ggsave() writes the most recently displayed plot.
ggsave("figures/HA_L.png")
Saving 7.29 x 4.5 in image

# Scatter plot: Human Accomplishment index against Pantheon's HPI measure;
# points are labelled by FNLN. The y-axis label previously read "based of".
GG_scatter(sci, "Index", "HPI", case_names_vector = "FNLN") +
  xlab("Human Accomplishment index of scientists (Charles Murray)") +
  ylab("Complex measure of historical importance based on Wikipedia\n(Pantheon dataset)")
# No plot is passed, so ggsave() writes the most recently displayed plot.
ggsave("figures/HA_HPI.png")
Saving 7.29 x 4.5 in image

# Scatter plot: Human Accomplishment index against total Wikipedia page views;
# points are labelled by FNLN.
GG_scatter(sci, "Index", "PV", case_names_vector = "FNLN") +
  labs(
    x = "Human Accomplishment index of scientists (Charles Murray)",
    y = "Page views (millions), Wikipedia\n(Pantheon dataset)"
  )
# No plot is passed, so ggsave() writes the most recently displayed plot.
ggsave("figures/HA_PV.png")
Saving 7.29 x 4.5 in image

# Scatter plot: English against non-English Wikipedia page views; points are
# labelled by FNLN.
GG_scatter(sci, "PVe", "PVne", case_names_vector = "FNLN") +
  labs(
    x = "Page views (millions), English Wikipedia\n(Pantheon dataset)",
    y = "Page views (millions), Non-English Wikipedia\n(Pantheon dataset)"
  )
# No plot is passed, so ggsave() writes the most recently displayed plot.
ggsave("figures/PVe_PVne.png")
Saving 7.29 x 4.5 in image

LS0tDQp0aXRsZTogIkNyb3NzLXZhbGlkYXRpb24gdGhlIEh1bWFuIEFjY29tcGxpc2htZW50IGRhdGFzZXQgd2l0aCBXaWtpcGVkaWEgZGF0YSINCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KLS0tDQoNCiNEYXRhIGFuZCBjbGVhbmluZw0KYGBge3IgZGF0YSwgbWVzc2FnZT1GfQ0KI2xvYWQgcGFuDQpwYW4gPSByZWFkX2NzdigiZGF0YS9QYW50aGVvbl8yMDE2XzEyXzAzLmNzdiIpDQoNCiNzZW5zaWJsZSBuYW1lcw0KbmFtZXMocGFuKSA9IGMoIlJhbmsiLCAiTmFtZSIsICJQbGFjZSBvZiBCaXJ0aCoiLCAiQmlydGggWWVhciIsICJHZW5kZXIiLCAiT2NjdXBhdGlvbiIsIA0KIkwiLCAiTHN0YXIiLCAiUFYiLCAiUFZlIiwgIlBWbmUiLCAic1BWIiwgIkhQSSIpDQoNCiNuYW1lIHZhcmlhYmxlDQpwYW4kRk5MTiA9IHBhbiROYW1lDQpwYW4kTmFtZSA9IE5VTEwNCg0KI2ZvciBub3csIHdlIHJlbW92ZSB0aGUgZmV3IGR1cGxpY2F0ZXMNCmR1cGxpY2F0ZWQocGFuJEZOTE4pICU+JSB3aGljaA0KcGFuICU8PiUgZHBseXI6OmZpbHRlcighZHVwbGljYXRlZChGTkxOKSkNCg0KI2NvbnZlcnQgdG8gcmVhbCBudW1lcmljcw0KcGFuJFBWICU+JSBzdHJfZGV0ZWN0KCIgTSIpICU+JSBwcm9wb3J0aW9uX3RydWUoKQ0KcGFuJHNQViAlPiUgc3RyX2RldGVjdCgiIEsiKSAlPiUgcHJvcG9ydGlvbl90cnVlKCkNCg0KI2p1c3QgcmVtb3ZlIHRoZSBzdHJpbmcgcGFydCBzaW5jZSB0aGV5IGFsbCBoYXZlIHRoZSBzYW1lDQpwYW4kUFYgJTw+JSBzdHJfcmVwbGFjZSgiIE0iLCAiIikgJT4lIGFzLm51bWVyaWMoKQ0KcGFuJFBWZSAlPD4lIHN0cl9yZXBsYWNlKCIgTSIsICIiKSAlPiUgYXMubnVtZXJpYygpDQpwYW4kUFZuZSAlPD4lIHN0cl9yZXBsYWNlKCIgTSIsICIiKSAlPiUgYXMubnVtZXJpYygpDQpwYW4kc1BWICU8PiUgc3RyX3JlcGxhY2UoIiBLIiwgIiIpICU+JSBhcy5udW1lcmljKCkNCg0KI21lcmdlIEhBIGludG8gcGFuIHdoZW4gZXhhY3QgbWF0Y2gNCiNubyB0aGlzIGNhdXNlcyBkdXBsaWNhdGVzDQojd2UgaGF2ZSB0byB1c2UgdGhlIGludmVudG9yaWVzDQpzY2kgPSBkcGx5cjo6ZmlsdGVyKGQsIEludmVudG9yeSA9PSAiU2NpZW5jZSIsICFkdXBsaWNhdGVkKEZOTE4pKSAlPiUgbGVmdF9qb2luKHBhbikNCg0KI21hdGNoZXMNCiNhc3NlcnQgbm8gZHVwbGljYXRlcw0KYXNzZXJ0X3RoYXQoIXNjaSRGTkxOICU+JSBkdXBsaWNhdGVkKCkgJT4lIGFueSkNCiNob3cgbWFueT8NCnNjaSRIUEkgJT4lIGNvdW50X05BKHJldmVyc2UgPSBUKQ0Kc2NpJEhQSSAlPiUgY291bnRfTkEocmV2ZXJzZSA9IFQpICU+JSBkaXZpZGVfYnkobnJvdyhzY2kpKQ0KYGBgDQoNCiNFREENCmBgYHtyIGVkYX0NCmluZGV4X3ZhcnMgPSBjKCJJbmRleCIsICJSYW5rIiwgIkwiLCAiTHN0YXIiLCAiUFYiLCAiUFZlIiwgIlBWbmUiLCAic1BWIiwgIkhQSSIpDQp3dGQuY29ycyhzY2lbaW5kZXhfdmFyc10pICU+JSB3cml0ZV9jbGlwYm9hcmQoKQ0KDQpHR19zY2F0dGVy
KHNjaSwgIkluZGV4IiwgIkwiLCBjYXNlX25hbWVzX3ZlY3RvciA9ICJGTkxOIikgKw0KICB4bGFiKCJIdW1hbiBBY2NvbXBsaXNobWVudCBpbmRleCBvZiBzY2llbnRpc3RzIChDaGFybGVzIE11cnJheSkiKSArDQogIHlsYWIoIk51bWJlciBvZiBXaWtpcGVkaWFzIHRoYXQgY292ZXIgdGhpcyBwZXJzb25cbihQYW50aGVvbiBkYXRhc2V0KSIpDQpnZ3NhdmUoImZpZ3VyZXMvSEFfTC5wbmciKQ0KDQpHR19zY2F0dGVyKHNjaSwgIkluZGV4IiwgIkhQSSIsIGNhc2VfbmFtZXNfdmVjdG9yID0gIkZOTE4iKSArDQogIHhsYWIoIkh1bWFuIEFjY29tcGxpc2htZW50IGluZGV4IG9mIHNjaWVudGlzdHMgKENoYXJsZXMgTXVycmF5KSIpICsNCiAgeWxhYigiQ29tcGxleCBtZWFzdXJlIG9mIGhpc3RvcmljYWwgaW1wb3J0YW5jZSBiYXNlZCBvZiBXaWtpcGVkaWFcbihQYW50aGVvbiBkYXRhc2V0KSIpDQpnZ3NhdmUoImZpZ3VyZXMvSEFfSFBJLnBuZyIpDQoNCkdHX3NjYXR0ZXIoc2NpLCAiSW5kZXgiLCAiUFYiLCBjYXNlX25hbWVzX3ZlY3RvciA9ICJGTkxOIikgKw0KICB4bGFiKCJIdW1hbiBBY2NvbXBsaXNobWVudCBpbmRleCBvZiBzY2llbnRpc3RzIChDaGFybGVzIE11cnJheSkiKSArDQogIHlsYWIoIlBhZ2Ugdmlld3MgKG1pbGxpb25zKSwgV2lraXBlZGlhXG4oUGFudGhlb24gZGF0YXNldCkiKQ0KZ2dzYXZlKCJmaWd1cmVzL0hBX1BWLnBuZyIpDQoNCkdHX3NjYXR0ZXIoc2NpLCAiUFZlIiwgIlBWbmUiLCBjYXNlX25hbWVzX3ZlY3RvciA9ICJGTkxOIikgKw0KICB4bGFiKCJQYWdlIHZpZXdzIChtaWxsaW9ucyksIEVuZ2xpc2ggV2lraXBlZGlhXG4oUGFudGhlb24gZGF0YXNldCkiKSArDQogIHlsYWIoIlBhZ2Ugdmlld3MgKG1pbGxpb25zKSwgTm9uLUVuZ2xpc2ggV2lraXBlZGlhXG4oUGFudGhlb24gZGF0YXNldCkiKQ0KZ2dzYXZlKCJmaWd1cmVzL1BWZV9QVm5lLnBuZyIpDQpgYGANCg0KDQo=