Study

This is the R notebook for a work-in-progress study. The results are probably final, but no paper has been written yet.

Study files

Available on OSF.

Data sources:

Startup

Load packages and set options.

options(digits = 2)
library(pacman)
p_load(kirkegaard, readr, rms)

Data

Load data and merge with existing dataset. This is a little tricky because we have to use the gendered versions of the names to merge, but we don’t necessarily want these for other purposes.

#ad hoc gender function
#
# Adds a gender tag to first names, or strips such a tag again.
#
# x:      character vector of first names.
# gender: vector marking the gender of each name. "Male" or 1 gives the tag
#         "_M", any other non-missing value gives "_F", and NA gives "_?".
#         Required unless `remove` is TRUE.
# remove: if TRUE, strip tags instead of adding them: the first " (...)"
#         part and a trailing "_" followed by one character are removed.
#
# Returns the modified copy of `x`. Stops with an error if a tag is to be
# added but `gender` is NULL.
genderize = function(x, gender = NULL, remove = FALSE) {
  if (remove) {
    #drop a parenthesised tag first, then a trailing "_<char>" tag
    x = sub(" \\(.+\\)", "", x, perl = TRUE)
    x = sub("_.$", "", x, perl = TRUE)
  } else {
    #check missing
    if (is.null(gender)) stop("`gender` was NULL!")

    #identify gender (TRUE = male, FALSE = anything else, NA = unknown)
    .gender = (gender == "Male" | gender == 1)

    #make a vec to add; unknown gender gets its own tag
    .gender_str = ifelse(.gender, "_M", "_F")
    .gender_str[is.na(.gender_str)] = "_?"

    #add (paste0 rather than `+`, which base R does not define for
    #character vectors)
    x = paste0(x, .gender_str)
  }

  x
}

#main name data from the previous study, read from the stored rds file
kirk_tran = read_rds("data/kirkegaard_tranberg_2015.rds")

#the name column is called firstname from here on
kirk_tran = rename(kirk_tran, firstname = name)

#keep two versions of each name: the plain one (gender tag stripped) and a
#gendered one, rebuilt from the stripped name and the gender column
kirk_tran = mutate(
  kirk_tran,
  firstname = genderize(firstname, remove = TRUE),
  firstname_gendered = genderize(firstname, gender)
)

#Thomas' first name data
thomas_data = read_csv("data/thomas_firstnames_data.csv")

#drop rows that have no name
thomas_data = filter(thomas_data, !is.na(firstname))

#add the gender-tagged version of the name, built from the sex column
thomas_data = mutate(
  thomas_data,
  firstname_gendered = genderize(firstname, sex)
)
## Parsed with column specification:
## cols(
##   sex = col_integer(),
##   firstname = col_character(),
##   region = col_character(),
##   IQ_raw = col_double(),
##   n = col_integer()
## )
#collapse rows that share a first name into a single row per name
thomas_data = plyr::ddply(thomas_data, "firstname", .fun = function(.d) {
  #all rows of one name must agree on sex
  assert_that(kirkegaard::all_elements_the_same(.d$sex))

  #pooled values, computed from all rows of the name:
  #n-weighted mean IQ and total n
  pooled_IQ = wtd_mean(.d$IQ_raw, .d$n)
  total_n = sum(.d$n)

  #the first row supplies every other column
  merged = .d[1, ]
  merged$IQ_raw = pooled_IQ
  merged$n = total_n

  #out
  merged
})

#each first name should occur only once in thomas' data by now
assert_that(!any(duplicated(thomas_data$firstname)))
## [1] TRUE
#merge the two datasets on the gendered name; thomas' ungendered firstname
#column is dropped before joining
d = full_join(
  kirk_tran,
  select(thomas_data, -firstname),
  by = "firstname_gendered"
)

#synonyms
d$S = d$S.both.no.age
d$CA = standardize(d$IQ_raw)

#convert to IQ, assuming Danish = 100, 15
#(the two constants are presumably the Danish mean and SD of the raw score;
#verify against the norm data)
d$IQ = 100 + 15 * ((d$IQ_raw - 42.79833641) / 9.832409119)

#sample sizes for IQ data
#total
sum(thomas_data$n)
## [1] 65137
#descrip
thomas_data$n %>% describe()
##    vars   n mean  sd median trimmed mad min  max range skew kurtosis se
## X1    1 265  246 429     65     132  58  21 2140  2119  2.6      6.5 26

Note that the S scores are standardized at the firstname level, not the individual level. An S score of -2 does NOT mean that the group mean is at the 2nd centile of society. However, the IQs are scaled to individual-level Danish norms, so they do have the usual interpretation. It’s not easy to rescale the S scores.

Plot cognitive ability x S

Now we are ready to examine the relationship between the mean cognitive ability and S score of each first name.

#labels for reuse
#axis labels shared by both plots below: S on the y axis, IQ on the x axis
S_ylab = ylab("Average S\n(general socioeconomic factor based on 5 indicators; z-score)")
CA_xlab = xlab("Average IQ on Danish military test (Danish norms)")

#plot
#scatterplot of the IQ and S columns of d, one point per first name,
#coloured by the gender column; the colour legend is titled "Sex"
#NOTE(review): case_names presumably labels each point with its first name;
#confirm against the GG_scatter documentation
GG_scatter(d, "IQ", "S", case_names = "firstname", color = "gender") +
  scale_color_manual("Sex", values = c("Red", "Blue")) +
  CA_xlab +
  S_ylab

#write the figure to disk
#NOTE(review): no plot object is passed, so this presumably saves the most
#recently created plot, i.e. the unweighted one above; confirm
GG_save("figs/CA_S.png")

#same plot, but with weights set to the square root of n and alpha = .2
#NOTE(review): alpha presumably controls point transparency; confirm against
#the GG_scatter documentation
GG_scatter(d, "IQ", "S", case_names = "firstname", weights = sqrt(d$n), color = "gender", alpha = .2) +
  scale_color_manual("Sex", values = c("Red", "Blue")) +
  CA_xlab +
  S_ylab