Start

options(digits = 3)
library(pacman)
p_load(kirkegaard, haven, rms, googlesheets, glmnet)

#ad hoc function to get best model from cv.glmnet
get_glmnet_coefs = function(x) {
  #get coefs at two criteria
  coefs_min <- coef(x, s = "lambda.min")
  coefs_1se <- coef(x, s = "lambda.1se")
  
  #get values
  d1 = data.frame(predictor = coefs_min@Dimnames[[1]][coefs_min@i + 1], beta_min = coefs_min@x)
  d2 = data.frame(predictor = coefs_1se@Dimnames[[1]][coefs_1se@i + 1], beta_1se = coefs_1se@x)
  
  #merge
  full_join(d1, d2, by = "predictor")
}

describe = function(x, ...) {
  y = psych::describe(x, ...)
  class(y) = "data.frame"
  y
}

Data

#spss
spss = read_sav("data/kw_final.sav")

#googlesheets
gs_auth()
## Auto-refreshing stale OAuth token.
gs = gs_url("https://docs.google.com/spreadsheets/d/19b_b6IBk1uh33npk6KB8fsk4qmcJvQBwcNsgVBW16sA/edit#gid=547666987")
## Sheet-identifying info appears to be a browser URL.
## googlesheets will attempt to extract sheet key from the URL.
## Putative key: 19b_b6IBk1uh33npk6KB8fsk4qmcJvQBwcNsgVBW16sA
## Sheet successfully identified: "Keywords study Intelligence journal"
d = gs_read(gs)
## Accessing worksheet titled 'data'.
## Parsed with column specification:
## cols(
##   Year = col_integer(),
##   FirstAuthor = col_character(),
##   Title = col_character(),
##   WoS = col_integer(),
##   Keyword = col_character(),
##   Category = col_character(),
##   `Category numeric` = col_integer(),
##   `Recode Redundant (best guess from list)` = col_character(),
##   `Recode redundant preferred` = col_character(),
##   `Change Code` = col_integer()
## )
#mutate
d %<>% 
  #add/fix columns
  mutate(
    FirstAuthor = plyr::mapvalues(FirstAuthor, from = "", to = NA),
    Title = plyr::mapvalues(Title, from = "", to = NA),
    #citations per year
    wos_per_year =  WoS / (2018 - Year)
  ) %>% 
  #fill in repeated values
  fill(FirstAuthor, Title) %>% 
  mutate(
    #unique id
    author_year = glue::glue("{FirstAuthor} - {Year} ") %>% as.character(),
    author_year_title = glue::glue("{author_year} - {Title} ") %>% as.character()
  )
## The following `from` values were not present in `x`:
## The following `from` values were not present in `x`:
#duplicates?
#tricky because of the multi-row format
#the author year combos do not form 'unique sequences'
#i.e. some authors published multiple times a single year
#if not, the RLE count for sorted and unsorted would be identical
rle(d$author_year)
## Run Length Encoding
##   lengths: int [1:911] 5 3 4 5 4 3 4 5 5 5 ...
##   values : chr [1:911] "Colom, R - 2000 " "Stankov, L - 2001 " ...
rle(d$author_year %>% sort())
## Run Length Encoding
##   lengths: int [1:793] 6 10 4 4 3 4 4 5 5 5 ...
##   values : chr [1:793] "Abad, F - 2003 " "Ackerman - 2014 " ...
#but should be ok for the author year title approach
rle(d$author_year_title)
## Run Length Encoding
##   lengths: int [1:919] 5 3 4 5 4 3 4 5 5 5 ...
##   values : chr [1:919] "Colom, R - 2000  - Negligible Sex Differences in General Intelligence " ...
rle(d$author_year_title %>% sort())
## Run Length Encoding
##   lengths: int [1:919] 6 3 3 4 4 4 3 4 4 5 ...
##   values : chr [1:919] "Abad, F - 2003  - Intelligence differentiation in adult samples " ...
#force error if false
assert_that(length(rle(d$author_year_title)$length) == length(rle(d$author_year_title %>% sort())$length))
## [1] TRUE
#extract studies table
studies = d %>% filter(!duplicated(author_year_title))

#remove missing keywords placeholder
#this also removes the studies without keywords
#but we can get them back later by merging with studies table
d = d %>% filter(Keyword != "xxx")

#overwrite 'redundant' category codings
d$Category_orig = d$Category
d$Category = case_when(
  d$Category == "redundant" ~ d$`Recode Redundant (best guess from list)`,
  TRUE ~ d$Category
  )

#remove any problem linebreaks
d$Category = d$Category %>% str_replace_all("\\n", "")

#assert no missing or redundant
assert_that(!(d$Category == "redudant") %>% any())
## [1] TRUE

Basic citation stats

#citation counts
GG_denhist(studies, "WoS")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

describe(studies$WoS) %>% (t)
##              X1
## vars       1.00
## n        919.00
## mean      24.73
## sd        40.22
## median    13.00
## trimmed   17.06
## mad       13.34
## min        0.00
## max      560.00
## range    560.00
## skew       6.11
## kurtosis  58.59
## se         1.33
#citation rate
GG_denhist(studies, "wos_per_year")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

describe(studies$wos_per_year) %>% (t)
##               X1
## vars       1.000
## n        919.000
## mean       3.059
## sd         3.757
## median     2.000
## trimmed    2.370
## mad        1.730
## min        0.000
## max       50.909
## range     50.909
## skew       4.791
## kurtosis  40.201
## se         0.124
#plot the relationship to publication year
GG_scatter(studies, "Year", "wos_per_year") +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Data transformation example.

Before we can model data, we need to convert to tidy format – 1 row per article. For this, we need to move to wide format dummies for the keywords.

#test data
#illustrate how the spreading works
set.seed(1)
tt = data_frame(
  id = c(rep("A", 3), rep("B", 4), rep("C", 2)),
  keyword = sample(letters, 9, replace = T),
  true = T,
  ...count = 1:9
)

#print example data
tt
#show how wide-format transform works
tt2 = tt %>% spread(key = keyword, value = true, fill = F)
tt2
#collapse to single row
tt2 %>% 
  plyr::ddply("id", function(x) {
    # browser()
    #keep constants
    y1 = data_frame(
      id = x$id[1]
    )
    
    #call any on each data col
    y2 = colSums(x[-c(1:2)]) %>% as.list() %>% as_data_frame()
    
    cbind(y1, y2)
  })

Keywords

Direct keyword analyses.

#metadata vars
kw_metavars = c("author_year_title", "...count")

#lower case, remove whitespace, remove any trailing digits
#save orig
d$keyword_orig = d$Keyword
d$keyword = d$keyword_orig %>% str_replace("\\s+\\d*\\s*$", "") %>% str_to_lower()

#unique count before and after
d$keyword_orig %>% unique() %>% length()
## [1] 2989
d$keyword %>% unique() %>% length()
## [1] 2134
#do a count
(kw_counts = table2(d$keyword))
#wide format
#assert no overlap with existing colnames
#this means we can just insert the dummies and dont have to worry about replacement
#but we need to accept 'illegal names'
assert_that(!any(names(d) %in% kw_counts$Group))
## [1] TRUE
#spread kws
kws = d %>%
  #get id and keyword col
  select(author_year_title, keyword) %>% 
  #add TRUE col
  mutate(
    #we want a logical argument for presence of keyword or not
    true = T,
    #add a count variable
    #needed for uniqueness, tho we dont need it for anything
    ...count = 1:nrow(.)
    ) %>% 
    #as wide
    tidyr::spread(key = keyword, value = true, fill = F)

#calculate proportions by keyword
kw_dists = kws %>% select(-!!kw_metavars) %>% colMeans()
kw_dists %>% sort(decreasing = T) %>% head(50)
##              intelligence                        iq 
##                   0.08776                   0.02108 
##            working memory         cognitive ability 
##                   0.01604                   0.01054 
##              flynn effect        fluid intelligence 
##                   0.01054                   0.01031 
##      general intelligence           sex differences 
##                   0.01031                   0.00940 
##                 education       cognitive abilities 
##                   0.00894                   0.00687 
##                         g                     aging 
##                   0.00687                   0.00481 
##           inspection time                  g factor 
##                   0.00481                   0.00458 
##                 reasoning               personality 
##                   0.00458                   0.00412 
##          processing speed                 cognition 
##                   0.00412                   0.00390 
## crystallized intelligence               development 
##                   0.00367                   0.00344 
##    emotional intelligence                creativity 
##                   0.00344                   0.00321 
## general cognitive ability    individual differences 
##                   0.00321                   0.00321 
##     spearman's hypothesis   working memory capacity 
##                   0.00321                   0.00321 
##                  children    cognitive epidemiology 
##                   0.00298                   0.00298 
##           factor analysis              longitudinal 
##                   0.00298                   0.00298 
##                 attention              heritability 
##                   0.00275                   0.00275 
##                 expertise                 fertility 
##                   0.00252                   0.00252 
##        gender differences                    health 
##                   0.00252                   0.00252 
##           mental rotation      socioeconomic status 
##                   0.00252                   0.00252 
##     cognitive development   complex problem solving 
##                   0.00229                   0.00229 
##                    income              mental speed 
##                   0.00229                   0.00229 
##               national iq             reaction time 
##                   0.00229                   0.00229 
##                     twins    educational attainment 
##                   0.00229                   0.00206 
##                 evolution                    gender 
##                   0.00206                   0.00206 
##            mental ability             meta-analysis 
##                   0.00206                   0.00206

Categories

#cats metavar
cat_metavars = c("author_year_title", "...count")

#do a count
(cat_counts = table2(d$Category))
#spread
cats = d %>%
  #get id and cat col
  select(author_year_title, Category) %>% 
  #filter redundant?
  # filter(Category != "redundant") %>% 
  #add TRUE col
  mutate(
    #we want a logical argument for presence of keyword or not
    true = T,
    #add a count variable
    #needed for uniqueness, tho we dont need it for anything
    ...count = 1:nrow(.)
    ) %>% 
    #as wide
    tidyr::spread(key = Category, value = true, fill = F)

cat_counts = cats %>% plyr::ddply("author_year_title", function(x) {
    # browser()
    #keep constants
    y1 = data_frame(
      author_year_title = x$author_year_title[1]
    )
    
    #call any on each data col
    y2 = x %>% select(-!!cat_metavars) %>% colSums() %>% as.list() %>% as_data_frame()
    
    cbind(y1, y2, count = sum(y2))
  })

cat_lgls = cats %>% plyr::ddply("author_year_title", function(x) {
    # browser()
    #keep constants
    y1 = data_frame(
      author_year_title = x$author_year_title[1]
    )
    
    #call any on each data col
    y2 = x %>% select(-!!cat_metavars) %>% colSums() %>% as.list() %>% as_data_frame() %>% map_df(as.logical)
    
    cbind(y1, y2, count = y2 %>% map_df(as.numeric) %>% sum())
  })

#calculate proportions by keyword
cat_dists = cat_lgls %>% select(-author_year_title, -count) %>% colMeans()
cat_dists_count = cat_lgls %>% select(-author_year_title, -count) %>% colSums()
cat_dists %>% sort(decreasing = T) %>% head(50)
##  intelligence / cognitive ability                          g factor 
##                            0.5862                            0.1507 
##        psychometrics / statistics                         education 
##                            0.1266                            0.1234 
##  iq / achievement / aptitude test                  race / ethnicity 
##                            0.1092                            0.1081 
##                    working memory                     brain / neuro 
##                            0.1048                            0.0906 
##      children / child development                memory / cognition 
##                            0.0830                            0.0808 
##                   sex differences             income / status / ses 
##                            0.0797                            0.0786 
##                            health                     adult / aging 
##                            0.0753                            0.0688 
##                fluid intelligence                      flynn effect 
##                            0.0655                            0.0644 
##                          modeling                 genes / evolution 
##                            0.0622                            0.0600 
##             genes and environment                               ECT 
##                            0.0590                            0.0579 
##                      mental speed          aggregate / regional iqs 
##                            0.0557                            0.0491 
##                           raven's                       iq theories 
##                            0.0491                            0.0480 
##         crystallized intelligence                         attention 
##                            0.0426                            0.0393 
##                       personality                         reasoning 
##                            0.0371                            0.0360 
##                   factor analysis                   spatial ability 
##                            0.0349                            0.0338 
##             spearman's hypothesis                executive function 
##                            0.0317                            0.0306 
##                  item level / IRT                          politics 
##                            0.0306                            0.0251 
##                      longitudinal                         economics 
##                            0.0240                            0.0207 
##                                EQ     individual change / stability 
##                            0.0207                            0.0207 
## problem solving / decision making               talent / giftedness 
##                            0.0207                            0.0207 
##                        creativity                         expertise 
##                            0.0197                            0.0197 
##                              work                             slodr 
##                            0.0197                            0.0186 
##                         dysgenics            individual differences 
##                            0.0175                            0.0175 
##            sensation / perception          culture / cross cultural 
##                            0.0175                            0.0164 
##                     meta-analysis                       religiosity 
##                            0.0164                            0.0164

Modeling

#join to the tables, wide format, no duplicated rows
cat_counts2 = left_join(cat_counts, studies %>% select(author_year_title, wos_per_year, WoS, Year), by = "author_year_title")
cat_lgls2 = left_join(cat_lgls, studies %>% select(author_year_title, wos_per_year, WoS, Year), by = "author_year_title")

cat_year_sums = cat_counts2 %>% plyr::ddply("Year", function(x) {
  # browser()
  x %>% select(-author_year_title, -Year, -wos_per_year) %>% colSums()
})

cat_year_sums[c("Year", cat_dists %>% sort(decreasing = T) %>% head(10) %>% names())]
cat_lgls2_years = cat_lgls2 %>% filter(Year != 2000) %>% plyr::ddply("Year", function(x) {
  # browser()
  x %>% select(-author_year_title, -Year, -wos_per_year) %>% colMeans()
})

cat_lgls2_years[c("Year", cat_dists %>% sort(decreasing = T) %>% .[-1] %>% head(10) %>% names())] %>% 
  #to long form
  gather(key = category, value = prop, -Year) %>% 
  ggplot(aes(Year, prop, color = category)) +
  # geom_line()
  geom_smooth(se = F) +
  scale_x_continuous(breaks = 0:9999) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = -30))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'