Start

options(digits = 3)
library(pacman)
p_load(kirkegaard, haven, rms, googlesheets, glmnet)

#ad hoc function to get best model from cv.glmnet
get_glmnet_coefs = function(x) {
  #get coefs at two criteria
  coefs_min <- coef(x, s = "lambda.min")
  coefs_1se <- coef(x, s = "lambda.1se")
  
  #get values
  d1 = data.frame(predictor = coefs_min@Dimnames[[1]][coefs_min@i + 1], beta_min = coefs_min@x)
  d2 = data.frame(predictor = coefs_1se@Dimnames[[1]][coefs_1se@i + 1], beta_1se = coefs_1se@x)
  
  #merge
  full_join(d1, d2, by = "predictor")
}

describe = function(x, ...) {
  y = psych::describe(x, ...)
  class(y) = "data.frame"
  y
}

Data

#spss
spss = read_sav("data/kw_final.sav")

#googlesheets
gs_auth()
## Auto-refreshing stale OAuth token.
gs = gs_url("https://docs.google.com/spreadsheets/d/19b_b6IBk1uh33npk6KB8fsk4qmcJvQBwcNsgVBW16sA/edit#gid=547666987")
## Sheet-identifying info appears to be a browser URL.
## googlesheets will attempt to extract sheet key from the URL.
## Putative key: 19b_b6IBk1uh33npk6KB8fsk4qmcJvQBwcNsgVBW16sA
## Sheet successfully identified: "Keywords study Intelligence journal"
d = gs_read(gs)
## Accessing worksheet titled 'data'.
## Parsed with column specification:
## cols(
##   Year = col_integer(),
##   FirstAuthor = col_character(),
##   Title = col_character(),
##   WoS = col_integer(),
##   Keyword = col_character(),
##   Category = col_character(),
##   `Category numeric` = col_integer(),
##   `Recode Redundant (best guess from list)` = col_character(),
##   `Recode redundant preferred` = col_character(),
##   `Change Code` = col_integer()
## )
#mutate
d %<>% 
  #add/fix columns
  mutate(
    FirstAuthor = plyr::mapvalues(FirstAuthor, from = "", to = NA),
    Title = plyr::mapvalues(Title, from = "", to = NA),
    #citations per year
    wos_per_year =  WoS / (2018 - Year)
  ) %>% 
  #fill in repeated values
  fill(FirstAuthor, Title) %>% 
  mutate(
    #unique id
    author_year = glue::glue("{FirstAuthor} - {Year} ") %>% as.character(),
    author_year_title = glue::glue("{author_year} - {Title} ") %>% as.character()
  )
## The following `from` values were not present in `x`:
## The following `from` values were not present in `x`:
#duplicates?
#tricky because of the multi-row format
#the author year combos do not form 'unique sequences'
#i.e. some authors published multiple times a single year
#if not, the RLE count for sorted and unsorted would be identical
rle(d$author_year)
## Run Length Encoding
##   lengths: int [1:911] 5 3 4 5 4 3 4 5 5 5 ...
##   values : chr [1:911] "Colom, R - 2000 " "Stankov, L - 2001 " ...
rle(d$author_year %>% sort())
## Run Length Encoding
##   lengths: int [1:793] 6 10 4 4 3 4 4 5 5 5 ...
##   values : chr [1:793] "Abad, F - 2003 " "Ackerman - 2014 " ...
#but should be ok for the author year title approach
rle(d$author_year_title)
## Run Length Encoding
##   lengths: int [1:919] 5 3 4 5 4 3 4 5 5 5 ...
##   values : chr [1:919] "Colom, R - 2000  - Negligible Sex Differences in General Intelligence " ...
rle(d$author_year_title %>% sort())
## Run Length Encoding
##   lengths: int [1:919] 6 3 3 4 4 4 3 4 4 5 ...
##   values : chr [1:919] "Abad, F - 2003  - Intelligence differentiation in adult samples " ...
#force error if false
assert_that(length(rle(d$author_year_title)$length) == length(rle(d$author_year_title %>% sort())$length))
## [1] TRUE
#extract studies table
studies = d %>% filter(!duplicated(author_year_title))

#remove missing keywords placeholder
#this also removes the studies without keywords
#but we can get them back later by merging with studies table
d = d %>% filter(Keyword != "xxx")

#overwrite 'redundant' category codings
d$Category_orig = d$Category
d$Category = case_when(
  d$Category == "redundant" ~ d$`Recode Redundant (best guess from list)`,
  TRUE ~ d$Category
  )

#remove any problem linebreaks
d$Category = d$Category %>% str_replace_all("\\n", "")

#assert no missing or redundant
assert_that(!(d$Category == "redudant") %>% any())
## [1] TRUE

Basic citation stats

#citation counts
GG_denhist(studies, "WoS")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

describe(studies$WoS) %>% (t)
##              X1
## vars       1.00
## n        919.00
## mean      24.73
## sd        40.22
## median    13.00
## trimmed   17.06
## mad       13.34
## min        0.00
## max      560.00
## range    560.00
## skew       6.11
## kurtosis  58.59
## se         1.33
#citation rate
GG_denhist(studies, "wos_per_year")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

describe(studies$wos_per_year) %>% (t)
##               X1
## vars       1.000
## n        919.000
## mean       3.059
## sd         3.757
## median     2.000
## trimmed    2.370
## mad        1.730
## min        0.000
## max       50.909
## range     50.909
## skew       4.791
## kurtosis  40.201
## se         0.124
#plot the relationship to publication year
GG_scatter(studies, "Year", "wos_per_year") +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Data transformation example.

Before we can model data, we need to convert to tidy format – 1 row per article. For this, we need to move to wide format dummies for the keywords.

#test data
#illustrate how the spreading works
set.seed(1)
tt = data_frame(
  id = c(rep("A", 3), rep("B", 4), rep("C", 2)),
  keyword = sample(letters, 9, replace = T),
  true = T,
  ...count = 1:9
)

#print example data
tt
#show how wide-format transform works
tt2 = tt %>% spread(key = keyword, value = true, fill = F)
tt2
#collapse to single row
tt2 %>% 
  plyr::ddply("id", function(x) {
    # browser()
    #keep constants
    y1 = data_frame(
      id = x$id[1]
    )
    
    #call any on each data col
    y2 = colSums(x[-c(1:2)]) %>% as.list() %>% as_data_frame()
    
    cbind(y1, y2)
  })

Keywords

Direct keyword analyses.

#metadata vars
kw_metavars = c("author_year_title", "...count")

#lower case, remove whitespace, remove any trailing digits
#save orig
d$keyword_orig = d$Keyword
d$keyword = d$keyword_orig %>% str_replace("\\s+\\d*\\s*$", "") %>% str_to_lower()

#unique count before and after
d$keyword_orig %>% unique() %>% length()
## [1] 2989
d$keyword %>% unique() %>% length()
## [1] 2134
#do a count
(kw_counts = table2(d$keyword))
#wide format
#assert no overlap with existing colnames
#this means we can just insert the dummies and dont have to worry about replacement
#but we need to accept 'illegal names'
assert_that(!any(names(d) %in% kw_counts$Group))
## [1] TRUE
#spread kws
kws = d %>%
  #get id and keyword col
  select(author_year_title, keyword) %>% 
  #add TRUE col
  mutate(
    #we want a logical argument for presence of keyword or not
    true = T,
    #add a count variable
    #needed for uniqueness, tho we dont need it for anything
    ...count = 1:nrow(.)
    ) %>% 
    #as wide
    tidyr::spread(key = keyword, value = true, fill = F)

#calculate proportions by keyword
kw_dists = kws %>% select(-!!kw_metavars) %>% colMeans()
kw_dists %>% sort(decreasing = T) %>% head(50)
##              intelligence                        iq 
##                   0.08776                   0.02108 
##            working memory         cognitive ability 
##                   0.01604                   0.01054 
##              flynn effect        fluid intelligence 
##                   0.01054                   0.01031 
##      general intelligence           sex differences 
##                   0.01031                   0.00940 
##                 education       cognitive abilities 
##                   0.00894                   0.00687 
##                         g                     aging 
##                   0.00687                   0.00481 
##           inspection time                  g factor 
##                   0.00481                   0.00458 
##                 reasoning               personality 
##                   0.00458                   0.00412 
##          processing speed                 cognition 
##                   0.00412                   0.00390 
## crystallized intelligence               development 
##                   0.00367                   0.00344 
##    emotional intelligence                creativity 
##                   0.00344                   0.00321 
## general cognitive ability    individual differences 
##                   0.00321                   0.00321 
##     spearman's hypothesis   working memory capacity 
##                   0.00321                   0.00321 
##                  children    cognitive epidemiology 
##                   0.00298                   0.00298 
##           factor analysis              longitudinal 
##                   0.00298                   0.00298 
##                 attention              heritability 
##                   0.00275                   0.00275 
##                 expertise                 fertility 
##                   0.00252                   0.00252 
##        gender differences                    health 
##                   0.00252                   0.00252 
##           mental rotation      socioeconomic status 
##                   0.00252                   0.00252 
##     cognitive development   complex problem solving 
##                   0.00229                   0.00229 
##                    income              mental speed 
##                   0.00229                   0.00229 
##               national iq             reaction time 
##                   0.00229                   0.00229 
##                     twins    educational attainment 
##                   0.00229                   0.00206 
##                 evolution                    gender 
##                   0.00206                   0.00206 
##            mental ability             meta-analysis 
##                   0.00206                   0.00206

Categories

#cats metavar
cat_metavars = c("author_year_title", "...count")

#do a count
(cat_counts = table2(d$Category))
#spread
cats = d %>%
  #get id and cat col
  select(author_year_title, Category) %>% 
  #filter redundant?
  # filter(Category != "redundant") %>% 
  #add TRUE col
  mutate(
    #we want a logical argument for presence of keyword or not
    true = T,
    #add a count variable
    #needed for uniqueness, tho we dont need it for anything
    ...count = 1:nrow(.)
    ) %>% 
    #as wide
    tidyr::spread(key = Category, value = true, fill = F)

cat_counts = cats %>% plyr::ddply("author_year_title", function(x) {
    # browser()
    #keep constants
    y1 = data_frame(
      author_year_title = x$author_year_title[1]
    )
    
    #call any on each data col
    y2 = x %>% select(-!!cat_metavars) %>% colSums() %>% as.list() %>% as_data_frame()
    
    cbind(y1, y2, count = sum(y2))
  })

cat_lgls = cats %>% plyr::ddply("author_year_title", function(x) {
    # browser()
    #keep constants
    y1 = data_frame(
      author_year_title = x$author_year_title[1]
    )
    
    #call any on each data col
    y2 = x %>% select(-!!cat_metavars) %>% colSums() %>% as.list() %>% as_data_frame() %>% map_df(as.logical)
    
    cbind(y1, y2, count = y2 %>% map_df(as.numeric) %>% sum())
  })

#calculate proportions by keyword
cat_dists = cat_lgls %>% select(-author_year_title, -count) %>% colMeans()
cat_dists_count = cat_lgls %>% select(-author_year_title, -count) %>% colSums()
cat_dists %>% sort(decreasing = T) %>% head(50)
##  intelligence / cognitive ability                          g factor 
##                            0.5862                            0.1507 
##        psychometrics / statistics                         education 
##                            0.1266                            0.1234 
##  iq / achievement / aptitude test                  race / ethnicity 
##                            0.1092                            0.1081 
##                    working memory                     brain / neuro 
##                            0.1048                            0.0906 
##      children / child development                memory / cognition 
##                            0.0830                            0.0808 
##                   sex differences             income / status / ses 
##                            0.0797                            0.0786 
##                            health                     adult / aging 
##                            0.0753                            0.0688 
##                fluid intelligence                      flynn effect 
##                            0.0655                            0.0644 
##                          modeling                 genes / evolution 
##                            0.0622                            0.0600 
##             genes and environment                               ECT 
##                            0.0590                            0.0579 
##                      mental speed          aggregate / regional iqs 
##                            0.0557                            0.0491 
##                           raven's                       iq theories 
##                            0.0491                            0.0480 
##         crystallized intelligence                         attention 
##                            0.0426                            0.0393 
##                       personality                         reasoning 
##                            0.0371                            0.0360 
##                   factor analysis                   spatial ability 
##                            0.0349                            0.0338 
##             spearman's hypothesis                executive function 
##                            0.0317                            0.0306 
##                  item level / IRT                          politics 
##                            0.0306                            0.0251 
##                      longitudinal                         economics 
##                            0.0240                            0.0207 
##                                EQ     individual change / stability 
##                            0.0207                            0.0207 
## problem solving / decision making               talent / giftedness 
##                            0.0207                            0.0207 
##                        creativity                         expertise 
##                            0.0197                            0.0197 
##                              work                             slodr 
##                            0.0197                            0.0186 
##                         dysgenics            individual differences 
##                            0.0175                            0.0175 
##            sensation / perception          culture / cross cultural 
##                            0.0175                            0.0164 
##                     meta-analysis                       religiosity 
##                            0.0164                            0.0164

Modeling

#join to the tables, wide format, no duplicated rows
cat_counts2 = left_join(cat_counts, studies %>% select(author_year_title, wos_per_year, WoS, Year), by = "author_year_title")
cat_lgls2 = left_join(cat_lgls, studies %>% select(author_year_title, wos_per_year, WoS, Year), by = "author_year_title")

cat_year_sums = cat_counts2 %>% plyr::ddply("Year", function(x) {
  # browser()
  x %>% select(-author_year_title, -Year, -wos_per_year) %>% colSums()
})

cat_year_sums[c("Year", cat_dists %>% sort(decreasing = T) %>% head(10) %>% names())]
cat_lgls2_years = cat_lgls2 %>% filter(Year != 2000) %>% plyr::ddply("Year", function(x) {
  # browser()
  x %>% select(-author_year_title, -Year, -wos_per_year) %>% colMeans()
})

cat_lgls2_years[c("Year", cat_dists %>% sort(decreasing = T) %>% .[-1] %>% head(10) %>% names())] %>% 
  #to long form
  gather(key = category, value = prop, -Year) %>% 
  ggplot(aes(Year, prop, color = category)) +
  # geom_line()
  geom_smooth(se = F) +
  scale_x_continuous(breaks = 0:9999) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = -30))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#subsets
#Bryan wants because reasons
#subset 1: flat
cat_lgls2_years[c("Year", "EQ", "reasoning", "modeling", "fluid intelligence", "attention", "factor analysis", "spatial ability")] %>% 
  #to long form
  gather(key = category, value = prop, -Year) %>% 
  ggplot(aes(Year, prop, color = category)) +
  # geom_line()
  geom_smooth(se = F) +
  scale_x_continuous(breaks = 0:9999) +
  ylab("Proportion of papers") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = -30)) +
  ylim(0, .25)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

GG_save("figs/timeline1.png")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#subset 2: n/u shaped
cat_lgls2_years[c("Year", "working memory", "g factor", "iq / achievement / aptitude test", "sex differences", "brain / neuro", "crystallized intelligence")] %>% 
  #to long form
  gather(key = category, value = prop, -Year) %>% 
  ggplot(aes(Year, prop, color = category)) +
  # geom_line()
  geom_smooth(se = F) +
  scale_x_continuous(breaks = 0:9999) +
  ylab("Proportion of papers") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = -30)) +
  ylim(0, .25)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 3 rows containing non-finite values (stat_smooth).

GG_save("figs/timeline2.png")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
#subset 3: rising falling
cat_lgls2_years[c("Year", "mental speed", "flynn effect", "memory / cognition", "education", "executive function", "iq theories", "psychometrics / statistics")] %>% 
  #to long form
  gather(key = category, value = prop, -Year) %>% 
  ggplot(aes(Year, prop, color = category)) +
  # geom_line()
  geom_smooth(se = F) +
  scale_x_continuous(breaks = 0:9999) +
  ylab("Proportion of papers") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = -30)) +
  ylim(0, .25)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_smooth).

GG_save("figs/timeline3.png")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).

## Warning: Removed 2 rows containing missing values (geom_smooth).
#build formulas
#top 20 categories
form_model = glue::glue("wos_per_year ~ `{str_c(cat_dists %>% sort(decreasing = T) %>% head(20) %>% names(), collapse = '` + `')}` + count") %>% as.formula()

#with year as control
form_model2 = glue::glue("wos_per_year ~ `{str_c(cat_dists %>% sort(decreasing = T) %>% head(20) %>% names(), collapse = '` + `')}` + count + rcs(Year)") %>% as.formula()

#all categories with >= 20 articles
(cats_at_least_20 = cat_dists_count[cat_dists_count >= 20] %>% names())
##  [1] "adult / aging"                    "aggregate / regional iqs"        
##  [3] "attention"                        "brain / neuro"                   
##  [5] "children / child development"     "crystallized intelligence"       
##  [7] "ECT"                              "education"                       
##  [9] "executive function"               "factor analysis"                 
## [11] "fluid intelligence"               "flynn effect"                    
## [13] "genes and environment"            "genes / evolution"               
## [15] "g factor"                         "health"                          
## [17] "income / status / ses"            "intelligence / cognitive ability"
## [19] "iq / achievement / aptitude test" "iq theories"                     
## [21] "item level / IRT"                 "longitudinal"                    
## [23] "memory / cognition"               "mental speed"                    
## [25] "modeling"                         "personality"                     
## [27] "politics"                         "psychometrics / statistics"      
## [29] "race / ethnicity"                 "raven's"                         
## [31] "reasoning"                        "sex differences"                 
## [33] "spatial ability"                  "spearman's hypothesis"           
## [35] "working memory"
form_model3 = glue::glue("wos_per_year ~ `{str_c(cats_at_least_20, collapse = '` + `')}` + count + rcs(Year)") %>% as.formula()

#fit
rms::ols(form_model, data = cat_counts2)
## Linear Regression Model
##  
##  rms::ols(formula = form_model, data = cat_counts2)
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs     916    LR chi2     56.48    R2       0.060    
##  sigma3.6897    d.f.           21    R2 adj   0.038    
##  d.f.    894    Pr(> chi2) 0.0000    g        1.007    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -5.5487 -1.9148 -0.8652  0.7808 46.4351 
##  
##  
##                                   Coef    S.E.   t     Pr(>|t|)
##  Intercept                         1.7007 0.4252  4.00 <0.0001 
##  intelligence / cognitive ability  0.2854 0.2144  1.33 0.1835  
##  g factor                         -0.2364 0.3226 -0.73 0.4639  
##  psychometrics / statistics       -0.2183 0.2393 -0.91 0.3618  
##  education                         0.1704 0.3187  0.53 0.5930  
##  iq / achievement / aptitude test  0.3360 0.2629  1.28 0.2016  
##  race / ethnicity                 -0.8718 0.2465 -3.54 0.0004  
##  working memory                    0.9087 0.3324  2.73 0.0064  
##  brain / neuro                    -0.0559 0.1811 -0.31 0.7577  
##  children / child development     -0.4068 0.3594 -1.13 0.2580  
##  memory / cognition                0.0004 0.4438  0.00 0.9992  
##  sex differences                  -0.0844 0.4171 -0.20 0.8397  
##  income / status / ses             0.1842 0.3408  0.54 0.5890  
##  health                           -0.4738 0.2236 -2.12 0.0344  
##  adult / aging                    -0.5218 0.4006 -1.30 0.1931  
##  fluid intelligence                0.8196 0.5342  1.53 0.1253  
##  flynn effect                      0.6700 0.4071  1.65 0.1002  
##  modeling                          0.5181 0.3747  1.38 0.1671  
##  genes / evolution                -0.2082 0.3012 -0.69 0.4897  
##  genes and environment            -0.4061 0.2447 -1.66 0.0974  
##  ECT                              -0.5679 0.3654 -1.55 0.1205  
##  count                             0.2720 0.1033  2.63 0.0086  
## 
rms::ols(form_model2, data = cat_counts2)
## Linear Regression Model
##  
##  rms::ols(formula = form_model2, data = cat_counts2)
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs     916    LR chi2     79.73    R2       0.083    
##  sigma3.6513    d.f.           25    R2 adj   0.058    
##  d.f.    890    Pr(> chi2) 0.0000    g        1.209    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -5.6105 -1.9149 -0.7779  0.7468 45.7296 
##  
##  
##                                   Coef      S.E.     t     Pr(>|t|)
##  Intercept                        -626.4974 289.4705 -2.16 0.0307  
##  intelligence / cognitive ability    0.2357   0.2140  1.10 0.2711  
##  g factor                           -0.4200   0.3245 -1.29 0.1959  
##  psychometrics / statistics         -0.2433   0.2375 -1.02 0.3060  
##  education                           0.2659   0.3169  0.84 0.4016  
##  iq / achievement / aptitude test    0.1409   0.2639  0.53 0.5936  
##  race / ethnicity                   -0.8235   0.2444 -3.37 0.0008  
##  working memory                      0.7448   0.3311  2.25 0.0247  
##  brain / neuro                      -0.1458   0.1806 -0.81 0.4197  
##  children / child development       -0.3928   0.3570 -1.10 0.2715  
##  memory / cognition                 -0.1183   0.4427 -0.27 0.7893  
##  sex differences                    -0.2935   0.4173 -0.70 0.4820  
##  income / status / ses               0.0973   0.3383  0.29 0.7738  
##  health                             -0.5453   0.2237 -2.44 0.0150  
##  adult / aging                      -0.4403   0.3970 -1.11 0.2678  
##  fluid intelligence                  0.9064   0.5291  1.71 0.0870  
##  flynn effect                        0.6728   0.4042  1.66 0.0963  
##  modeling                            0.5613   0.3718  1.51 0.1315  
##  genes / evolution                  -0.2414   0.2984 -0.81 0.4188  
##  genes and environment              -0.4352   0.2431 -1.79 0.0738  
##  ECT                                -0.5669   0.3662 -1.55 0.1219  
##  count                               0.2761   0.1033  2.67 0.0076  
##  Year                                0.3138   0.1445  2.17 0.0301  
##  Year'                              -0.9696   0.4329 -2.24 0.0254  
##  Year''                              5.3829   3.1058  1.73 0.0834  
##  Year'''                           -10.2308   6.8756 -1.49 0.1371  
## 
rms::ols(form_model3, data = cat_counts2)
## Linear Regression Model
##  
##  rms::ols(formula = form_model3, data = cat_counts2)
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs     916    LR chi2    140.92    R2       0.143    
##  sigma3.5615    d.f.           40    R2 adj   0.103    
##  d.f.    875    Pr(> chi2) 0.0000    g        1.515    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -7.6618 -1.8408 -0.6408  0.8643 45.5900 
##  
##  
##                                   Coef      S.E.     t     Pr(>|t|)
##  Intercept                        -588.3727 284.8436 -2.07 0.0392  
##  adult / aging                      -0.2001   0.4022 -0.50 0.6189  
##  aggregate / regional iqs           -0.1730   0.5163 -0.34 0.7377  
##  attention                          -0.3372   0.5565 -0.61 0.5448  
##  brain / neuro                      -0.1038   0.1842 -0.56 0.5734  
##  children / child development       -0.3985   0.3539 -1.13 0.2605  
##  crystallized intelligence          -0.5609   0.4878 -1.15 0.2505  
##  ECT                                -0.3663   0.3643 -1.01 0.3151  
##  education                           0.4396   0.3179  1.38 0.1671  
##  executive function                  1.8500   0.4398  4.21 <0.0001 
##  factor analysis                     1.3763   0.5374  2.56 0.0106  
##  fluid intelligence                  1.2154   0.5366  2.27 0.0237  
##  flynn effect                        0.7693   0.4050  1.90 0.0578  
##  genes and environment              -0.3425   0.2450 -1.40 0.1625  
##  genes / evolution                  -0.1184   0.2984 -0.40 0.6916  
##  g factor                           -0.5323   0.3296 -1.61 0.1067  
##  health                             -0.4603   0.2265 -2.03 0.0424  
##  income / status / ses               0.1972   0.3421  0.58 0.5644  
##  intelligence / cognitive ability    0.3118   0.2193  1.42 0.1555  
##  iq / achievement / aptitude test    0.2011   0.2650  0.76 0.4480  
##  iq theories                         1.2657   0.3798  3.33 0.0009  
##  item level / IRT                   -0.6251   0.5085 -1.23 0.2193  
##  longitudinal                       -0.3981   0.8019 -0.50 0.6197  
##  memory / cognition                 -0.1987   0.4376 -0.45 0.6498  
##  mental speed                       -0.9802   0.4554 -2.15 0.0316  
##  modeling                            0.5069   0.3679  1.38 0.1686  
##  personality                        -0.5836   0.4633 -1.26 0.2082  
##  politics                            0.1458   0.3567  0.41 0.6827  
##  psychometrics / statistics         -0.1905   0.2429 -0.78 0.4331  
##  race / ethnicity                   -0.7080   0.2505 -2.83 0.0048  
##  raven's                            -0.7148   0.5624 -1.27 0.2041  
##  reasoning                          -0.2371   0.5882 -0.40 0.6869  
##  sex differences                    -0.2613   0.4149 -0.63 0.5289  
##  spatial ability                     0.9560   0.4342  2.20 0.0279  
##  spearman's hypothesis               0.2515   0.6582  0.38 0.7025  
##  working memory                      0.9322   0.3362  2.77 0.0057  
##  count                               0.2043   0.1149  1.78 0.0756  
##  Year                                0.2949   0.1422  2.07 0.0383  
##  Year'                              -0.9612   0.4257 -2.26 0.0242  
##  Year''                              5.4870   3.0587  1.79 0.0732  
##  Year'''                           -10.5190   6.7767 -1.55 0.1210  
## 
#lasso
(cats_at_least_10 = cat_dists_count[cat_dists_count >= 10] %>% names())
##  [1] "adult / aging"                    
##  [2] "aggregate / regional iqs"         
##  [3] "assessment"                       
##  [4] "attention"                        
##  [5] "brain / neuro"                    
##  [6] "children / child development"     
##  [7] "cohort"                           
##  [8] "creativity"                       
##  [9] "crime"                            
## [10] "crystallized intelligence"        
## [11] "culture / cross cultural"         
## [12] "dysgenics"                        
## [13] "economics"                        
## [14] "ECT"                              
## [15] "education"                        
## [16] "EQ"                               
## [17] "executive function"               
## [18] "expertise"                        
## [19] "factor analysis"                  
## [20] "fertility"                        
## [21] "fluid intelligence"               
## [22] "flynn effect"                     
## [23] "genes and environment"            
## [24] "genes / evolution"                
## [25] "g factor"                         
## [26] "health"                           
## [27] "income / status / ses"            
## [28] "individual change / stability"    
## [29] "individual differences"           
## [30] "intelligence / cognitive ability" 
## [31] "iq / achievement / aptitude test" 
## [32] "iq theories"                      
## [33] "item level / IRT"                 
## [34] "learning"                         
## [35] "longitudinal"                     
## [36] "mathematics"                      
## [37] "memory / cognition"               
## [38] "mental speed"                     
## [39] "meta-analysis"                    
## [40] "methods"                          
## [41] "modeling"                         
## [42] "motivation"                       
## [43] "multiple categories"              
## [44] "personality"                      
## [45] "politics"                         
## [46] "practice"                         
## [47] "problem solving / decision making"
## [48] "psychometrics / statistics"       
## [49] "race / ethnicity"                 
## [50] "raven's"                          
## [51] "reasoning"                        
## [52] "religiosity"                      
## [53] "sensation / perception"           
## [54] "sex differences"                  
## [55] "slodr"                            
## [56] "spatial ability"                  
## [57] "spearman's hypothesis"            
## [58] "talent / giftedness"              
## [59] "training"                         
## [60] "variability"                      
## [61] "work"                             
## [62] "working memory"
lasso_cv = cv.glmnet(y = cat_counts2$wos_per_year, x = cat_counts2[c(cats_at_least_10, "count")] %>% as.matrix(), nfolds = nrow(cat_counts2))
## Warning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations
## per fold
plot(lasso_cv)

get_glmnet_coefs(lasso_cv) %>% arrange(-beta_min)
## Warning: Column `predictor` joining factors with different levels, coercing
## to character vector
#get citation summary stats by category
cats_wos_long_all = map_df(names(cat_dists), function(x) {
  # browser()
  #varname as symbol
  xx = rlang::sym(x)
  
  #filter to articles with that category
  y = cat_counts2 %>% filter(as.logical(!!xx)) %>% select(wos_per_year) %>% mutate(category = x)
  
  data_frame(
    category = x,
    mean = mean(y$wos_per_year),
    median = median(y$wos_per_year),
    sd = sd(y$wos_per_year),
    n = nrow(y)
  )
})

#print
cats_wos_long_all %>% filter(n >= 10)
#scatterplot of mean citations per year vs. number of papers with that category
GG_scatter(cats_wos_long_all, "mean", "n")

cats_wos_long_all[-1] %>% wtd.cors()
##         mean  median    sd       n
## mean   1.000  0.9549 0.767  0.0470
## median 0.955  1.0000 0.301 -0.0358
## sd     0.767  0.3010 1.000  0.1837
## n      0.047 -0.0358 0.184  1.0000

Save data

write_rds(d, "data/main.rds")
write_rds(kws, "data/keywords.rds")
write_rds(cats, "data/categories.rds")
write_rds(cats_wos_long_all, "data/cats_wos_long_all.rds")