Data and cleaning

#read the Pantheon export from disk; the reader infers the column types itself
pan <- read_csv(file = "data/Pantheon_2016_12_03.csv")
Parsed with column specification:
cols(
  Rank = col_integer(),
  Name = col_character(),
  `Place of Birth*` = col_character(),
  `Birth Year` = col_integer(),
  Gender = col_character(),
  Occupation = col_character(),
  L = col_integer(),
  `L*` = col_double(),
  PV = col_character(),
  PVe = col_character(),
  PVne = col_character(),
  `sPV` = col_character(),
  HPI = col_double()
)
#rename the columns; the only header that actually changes is `L*`, which becomes Lstar
names(pan) <- c(
  "Rank", "Name", "Place of Birth*", "Birth Year", "Gender", "Occupation",
  "L", "Lstar", "PV", "PVe", "PVne", "sPV", "HPI"
)
#store the person's name under FNLN and drop the original Name column
pan[["FNLN"]] <- pan[["Name"]]
pan[["Name"]] <- NULL
#row positions whose name already occurred earlier in the table
#(a few names are repeated; they are removed for now)
which(duplicated(pan$FNLN))
 [1]  2813  3476  8242  8591  8611  8659  8885  8948  9231  9597 10332 10899
#keep only the first row seen for each name
pan <- dplyr::filter(pan, !duplicated(FNLN))
#the page-view columns were read as text; before converting them to numbers,
#check how many PV entries contain the " M" suffix
proportion_true(str_detect(pan$PV, " M"))
[1] 1
#check how many sPV entries contain the " K" suffix
proportion_true(str_detect(pan$sPV, " K"))
[1] 1
#every value carries the same unit suffix, so strip it and parse what is left as a number
pan$PV <- as.numeric(str_replace(pan$PV, " M", ""))
pan$PVe <- as.numeric(str_replace(pan$PVe, " M", ""))
pan$PVne <- as.numeric(str_replace(pan$PVne, " M", ""))
pan$sPV <- as.numeric(str_replace(pan$sPV, " K", ""))
#merging all of HA into pan on exact name match was tried first,
#but it causes duplicates, so the join is done per inventory instead
#NOTE(review): assuming d is ungrouped, duplicated(FNLN) is evaluated over all rows of d,
#not only the Science rows - confirm that this is the intended de-duplication
#left_join() is given no `by`, so it matches on the columns the two tables share
sci <- d %>%
  dplyr::filter(Inventory == "Science", !duplicated(FNLN)) %>%
  left_join(pan)
Joining, by = "FNLN"
#matches
#stop here if any name occurs more than once in the joined table
assert_that(!any(duplicated(sci$FNLN)))
[1] TRUE
#how many?
#HPI only comes from pan, so rows without a Pantheon match have a missing HPI;
#reverse = TRUE presumably makes count_NA() count the non-missing values - confirm
#TRUE is written out because T is an ordinary variable that can be reassigned
count_NA(sci$HPI, reverse = TRUE)
[1] 338
#the same count expressed as a fraction of all rows in sci
#TRUE is written out because T is an ordinary variable that can be reassigned
count_NA(sci$HPI, reverse = TRUE) / nrow(sci)
[1] 0.25

EDA

#the Index column plus the measures that came from the Pantheon table
index_vars <- c(
  "Index", "Rank", "L", "Lstar", "PV", "PVe", "PVne", "sPV", "HPI"
)
#correlation matrix of those columns, passed on to write_clipboard()
write_clipboard(wtd.cors(sci[index_vars]))
      Index  Rank     L Lstar    PV   PVe  PVne   sPV   HPI
Index  1.00 -0.45  0.59 -0.12  0.47  0.45  0.48  0.29  0.51
Rank  -0.45  1.00 -0.73 -0.11 -0.45 -0.43 -0.44 -0.17 -0.93
L      0.59 -0.73  1.00  0.08  0.83  0.81  0.83  0.47  0.86
Lstar -0.12 -0.11  0.08  1.00 -0.03 -0.08  0.00 -0.03  0.09
PV     0.47 -0.45  0.83 -0.03  1.00  0.98  0.99  0.53  0.66
PVe    0.45 -0.43  0.81 -0.08  0.98  1.00  0.94  0.53  0.62
PVne   0.48 -0.44  0.83  0.00  0.99  0.94  1.00  0.52  0.67
sPV    0.29 -0.17  0.47 -0.03  0.53  0.53  0.52  1.00  0.25
HPI    0.51 -0.93  0.86  0.09  0.66  0.62  0.67  0.25  1.00
#scatter plot of Index (x axis) against L (y axis)
GG_scatter(sci, "Index", "L", case_names_vector = "FNLN") +
  labs(
    x = "Human Accomplishment index of scientists (Charles Murray)",
    y = "Number of Wikipedias that cover this person\n(Pantheon dataset)"
  )
#no plot is passed, so ggsave() writes the most recent plot
ggsave(filename = "figures/HA_L.png")
Saving 7.29 x 4.5 in image

#scatter plot of Index (x axis) against HPI (y axis)
#the y label previously read "based of Wikipedia"; the typo is corrected here
GG_scatter(sci, "Index", "HPI", case_names_vector = "FNLN") +
  labs(
    x = "Human Accomplishment index of scientists (Charles Murray)",
    y = "Complex measure of historical importance based on Wikipedia\n(Pantheon dataset)"
  )
#no plot is passed, so ggsave() writes the most recent plot
ggsave(filename = "figures/HA_HPI.png")
Saving 7.29 x 4.5 in image

#scatter plot of Index (x axis) against PV (y axis)
GG_scatter(sci, "Index", "PV", case_names_vector = "FNLN") +
  labs(
    x = "Human Accomplishment index of scientists (Charles Murray)",
    y = "Page views (millions), Wikipedia\n(Pantheon dataset)"
  )
#no plot is passed, so ggsave() writes the most recent plot
ggsave(filename = "figures/HA_PV.png")
Saving 7.29 x 4.5 in image

#scatter plot of PVe (x axis) against PVne (y axis)
GG_scatter(sci, "PVe", "PVne", case_names_vector = "FNLN") +
  labs(
    x = "Page views (millions), English Wikipedia\n(Pantheon dataset)",
    y = "Page views (millions), Non-English Wikipedia\n(Pantheon dataset)"
  )
#no plot is passed, so ggsave() writes the most recent plot
ggsave(filename = "figures/PVe_PVne.png")
Saving 7.29 x 4.5 in image

LS0tDQp0aXRsZTogIkNyb3NzLXZhbGlkYXRpb24gdGhlIEh1bWFuIEFjY29tcGxpc2htZW50IGRhdGFzZXQgd2l0aCBXaWtpcGVkaWEgZGF0YSINCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KLS0tDQoNCiNEYXRhIGFuZCBjbGVhbmluZw0KYGBge3IgZGF0YSwgbWVzc2FnZT1GfQ0KI2xvYWQgcGFuDQpwYW4gPSByZWFkX2NzdigiZGF0YS9QYW50aGVvbl8yMDE2XzEyXzAzLmNzdiIpDQoNCiNzZW5zaWJsZSBuYW1lcw0KbmFtZXMocGFuKSA9IGMoIlJhbmsiLCAiTmFtZSIsICJQbGFjZSBvZiBCaXJ0aCoiLCAiQmlydGggWWVhciIsICJHZW5kZXIiLCAiT2NjdXBhdGlvbiIsIA0KIkwiLCAiTHN0YXIiLCAiUFYiLCAiUFZlIiwgIlBWbmUiLCAic1BWIiwgIkhQSSIpDQoNCiNuYW1lIHZhcmlhYmxlDQpwYW4kRk5MTiA9IHBhbiROYW1lDQpwYW4kTmFtZSA9IE5VTEwNCg0KI2ZvciBub3csIHdlIHJlbW92ZSB0aGUgZmV3IGR1cGxpY2F0ZXMNCmR1cGxpY2F0ZWQocGFuJEZOTE4pICU+JSB3aGljaA0KcGFuICU8PiUgZHBseXI6OmZpbHRlcighZHVwbGljYXRlZChGTkxOKSkNCg0KI2NvbnZlcnQgdG8gcmVhbCBudW1lcmljcw0KcGFuJFBWICU+JSBzdHJfZGV0ZWN0KCIgTSIpICU+JSBwcm9wb3J0aW9uX3RydWUoKQ0KcGFuJHNQViAlPiUgc3RyX2RldGVjdCgiIEsiKSAlPiUgcHJvcG9ydGlvbl90cnVlKCkNCg0KI2p1c3QgcmVtb3ZlIHRoZSBzdHJpbmcgcGFydCBzaW5jZSB0aGV5IGFsbCBoYXZlIHRoZSBzYW1lDQpwYW4kUFYgJTw+JSBzdHJfcmVwbGFjZSgiIE0iLCAiIikgJT4lIGFzLm51bWVyaWMoKQ0KcGFuJFBWZSAlPD4lIHN0cl9yZXBsYWNlKCIgTSIsICIiKSAlPiUgYXMubnVtZXJpYygpDQpwYW4kUFZuZSAlPD4lIHN0cl9yZXBsYWNlKCIgTSIsICIiKSAlPiUgYXMubnVtZXJpYygpDQpwYW4kc1BWICU8PiUgc3RyX3JlcGxhY2UoIiBLIiwgIiIpICU+JSBhcy5udW1lcmljKCkNCg0KI21lcmdlIEhBIGludG8gcGFuIHdoZW4gZXhhY3QgbWF0Y2gNCiNubyB0aGlzIGNhdXNlcyBkdXBsaWNhdGVzDQojd2UgaGF2ZSB0byB1c2UgdGhlIGludmVudG9yaWVzDQpzY2kgPSBkcGx5cjo6ZmlsdGVyKGQsIEludmVudG9yeSA9PSAiU2NpZW5jZSIsICFkdXBsaWNhdGVkKEZOTE4pKSAlPiUgbGVmdF9qb2luKHBhbikNCg0KI21hdGNoZXMNCiNhc3NlcnQgbm8gZHVwbGljYXRlcw0KYXNzZXJ0X3RoYXQoIXNjaSRGTkxOICU+JSBkdXBsaWNhdGVkKCkgJT4lIGFueSkNCiNob3cgbWFueT8NCnNjaSRIUEkgJT4lIGNvdW50X05BKHJldmVyc2UgPSBUKQ0Kc2NpJEhQSSAlPiUgY291bnRfTkEocmV2ZXJzZSA9IFQpICU+JSBkaXZpZGVfYnkobnJvdyhzY2kpKQ0KYGBgDQoNCiNFREENCmBgYHtyIGVkYX0NCmluZGV4X3ZhcnMgPSBjKCJJbmRleCIsICJSYW5rIiwgIkwiLCAiTHN0YXIiLCAiUFYiLCAiUFZlIiwgIlBWbmUiLCAic1BWIiwgIkhQSSIpDQp3dGQuY29ycyhzY2lbaW5kZXhfdmFyc10pICU+JSB3cml0ZV9jbGlwYm9hcmQoKQ0KDQpHR19zY2F0dGVy
KHNjaSwgIkluZGV4IiwgIkwiLCBjYXNlX25hbWVzX3ZlY3RvciA9ICJGTkxOIikgKw0KICB4bGFiKCJIdW1hbiBBY2NvbXBsaXNobWVudCBpbmRleCBvZiBzY2llbnRpc3RzIChDaGFybGVzIE11cnJheSkiKSArDQogIHlsYWIoIk51bWJlciBvZiBXaWtpcGVkaWFzIHRoYXQgY292ZXIgdGhpcyBwZXJzb25cbihQYW50aGVvbiBkYXRhc2V0KSIpDQpnZ3NhdmUoImZpZ3VyZXMvSEFfTC5wbmciKQ0KDQpHR19zY2F0dGVyKHNjaSwgIkluZGV4IiwgIkhQSSIsIGNhc2VfbmFtZXNfdmVjdG9yID0gIkZOTE4iKSArDQogIHhsYWIoIkh1bWFuIEFjY29tcGxpc2htZW50IGluZGV4IG9mIHNjaWVudGlzdHMgKENoYXJsZXMgTXVycmF5KSIpICsNCiAgeWxhYigiQ29tcGxleCBtZWFzdXJlIG9mIGhpc3RvcmljYWwgaW1wb3J0YW5jZSBiYXNlZCBvZiBXaWtpcGVkaWFcbihQYW50aGVvbiBkYXRhc2V0KSIpDQpnZ3NhdmUoImZpZ3VyZXMvSEFfSFBJLnBuZyIpDQoNCkdHX3NjYXR0ZXIoc2NpLCAiSW5kZXgiLCAiUFYiLCBjYXNlX25hbWVzX3ZlY3RvciA9ICJGTkxOIikgKw0KICB4bGFiKCJIdW1hbiBBY2NvbXBsaXNobWVudCBpbmRleCBvZiBzY2llbnRpc3RzIChDaGFybGVzIE11cnJheSkiKSArDQogIHlsYWIoIlBhZ2Ugdmlld3MgKG1pbGxpb25zKSwgV2lraXBlZGlhXG4oUGFudGhlb24gZGF0YXNldCkiKQ0KZ2dzYXZlKCJmaWd1cmVzL0hBX1BWLnBuZyIpDQoNCkdHX3NjYXR0ZXIoc2NpLCAiUFZlIiwgIlBWbmUiLCBjYXNlX25hbWVzX3ZlY3RvciA9ICJGTkxOIikgKw0KICB4bGFiKCJQYWdlIHZpZXdzIChtaWxsaW9ucyksIEVuZ2xpc2ggV2lraXBlZGlhXG4oUGFudGhlb24gZGF0YXNldCkiKSArDQogIHlsYWIoIlBhZ2Ugdmlld3MgKG1pbGxpb25zKSwgTm9uLUVuZ2xpc2ggV2lraXBlZGlhXG4oUGFudGhlb24gZGF0YXNldCkiKQ0KZ2dzYXZlKCJmaWd1cmVzL1BWZV9QVm5lLnBuZyIpDQpgYGANCg0KDQo=