options(digits = 3)
library(pacman)
p_load(kirkegaard, haven, rms, googlesheets, glmnet)
#ad hoc helper to extract the retained coefficients from a cv.glmnet fit at both lambda criteria
get_glmnet_coefs = function(x) {
  #get coefs at two criteria
  coefs_min = coef(x, s = "lambda.min")
  coefs_1se = coef(x, s = "lambda.1se")
  #get values
  d1 = data.frame(predictor = coefs_min@Dimnames[[1]][coefs_min@i + 1], beta_min = coefs_min@x)
  d2 = data.frame(predictor = coefs_1se@Dimnames[[1]][coefs_1se@i + 1], beta_1se = coefs_1se@x)
  #merge
  full_join(d1, d2, by = "predictor")
}
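A quick sanity check of this helper on toy data (mtcars here, purely illustrative and not part of the analysis):
#toy example: lasso of mpg on the other mtcars columns
toy_cv = cv.glmnet(x = as.matrix(mtcars[, -1]), y = mtcars$mpg)
get_glmnet_coefs(toy_cv)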
#wrapper so psych::describe() returns a plain data frame (easier to pipe and print)
describe = function(x, ...) {
  y = psych::describe(x, ...)
  class(y) = "data.frame"
  y
}
#spss
spss = read_sav("data/kw_final.sav")
#googlesheets
gs_auth()
## Auto-refreshing stale OAuth token.
gs = gs_url("https://docs.google.com/spreadsheets/d/19b_b6IBk1uh33npk6KB8fsk4qmcJvQBwcNsgVBW16sA/edit#gid=547666987")
## Sheet-identifying info appears to be a browser URL.
## googlesheets will attempt to extract sheet key from the URL.
## Putative key: 19b_b6IBk1uh33npk6KB8fsk4qmcJvQBwcNsgVBW16sA
## Sheet successfully identified: "Keywords study Intelligence journal"
d = gs_read(gs)
## Accessing worksheet titled 'data'.
## Parsed with column specification:
## cols(
## Year = col_integer(),
## FirstAuthor = col_character(),
## Title = col_character(),
## WoS = col_integer(),
## Keyword = col_character(),
## Category = col_character(),
## `Category numeric` = col_integer(),
## `Recode Redundant (best guess from list)` = col_character(),
## `Recode redundant preferred` = col_character(),
## `Change Code` = col_integer()
## )
#clean and augment the data
d %<>%
  #add/fix columns
  mutate(
    FirstAuthor = plyr::mapvalues(FirstAuthor, from = "", to = NA),
    Title = plyr::mapvalues(Title, from = "", to = NA),
    #citations per year
    wos_per_year = WoS / (2018 - Year)
  ) %>%
  #fill in repeated values
  fill(FirstAuthor, Title) %>%
  mutate(
    #unique id
    author_year = glue::glue("{FirstAuthor} - {Year} ") %>% as.character(),
    author_year_title = glue::glue("{author_year} - {Title} ") %>% as.character()
  )
## The following `from` values were not present in `x`:
## The following `from` values were not present in `x`:
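For example, a paper published in 2014 with WoS = 20 gets wos_per_year = 20 / (2018 - 2014) = 5 citations per year.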
#duplicates?
#tricky because of the multi-row format
#the author-year combos do not form unique consecutive blocks,
#i.e. some authors published more than once in a single year
#if they had not, the run counts from rle() on the sorted and unsorted vectors would be identical
rle(d$author_year)
## Run Length Encoding
## lengths: int [1:911] 5 3 4 5 4 3 4 5 5 5 ...
## values : chr [1:911] "Colom, R - 2000 " "Stankov, L - 2001 " ...
rle(d$author_year %>% sort())
## Run Length Encoding
## lengths: int [1:793] 6 10 4 4 3 4 4 5 5 5 ...
## values : chr [1:793] "Abad, F - 2003 " "Ackerman - 2014 " ...
#but should be ok for the author year title approach
rle(d$author_year_title)
## Run Length Encoding
## lengths: int [1:919] 5 3 4 5 4 3 4 5 5 5 ...
## values : chr [1:919] "Colom, R - 2000 - Negligible Sex Differences in General Intelligence " ...
rle(d$author_year_title %>% sort())
## Run Length Encoding
## lengths: int [1:919] 6 3 3 4 4 4 3 4 4 5 ...
## values : chr [1:919] "Abad, F - 2003 - Intelligence differentiation in adult samples " ...
#force error if false
assert_that(length(rle(d$author_year_title)$length) == length(rle(d$author_year_title %>% sort())$length))
## [1] TRUE
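To see why this check works, consider a toy vector (purely illustrative): when a value reappears after a gap, sorting merges its separate runs, so the sorted vector has fewer runs.
#toy illustration: "A" occurs in two separate runs before sorting
rle(c("A", "A", "B", "A"))$lengths %>% length()  #3 runs unsorted
rle(c("A", "A", "B", "A") %>% sort())$lengths %>% length()  #2 runs sorted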
#extract studies table
studies = d %>% filter(!duplicated(author_year_title))
#remove missing keywords placeholder
#this also removes the studies without keywords
#but we can get them back later by merging with studies table
d = d %>% filter(Keyword != "xxx")
#overwrite 'redundant' category codings
d$Category_orig = d$Category
d$Category = case_when(
  d$Category == "redundant" ~ d$`Recode Redundant (best guess from list)`,
  TRUE ~ d$Category
)
#remove any problem linebreaks
d$Category = d$Category %>% str_replace_all("\\n", "")
#assert no 'redundant' codes remain
assert_that(!(d$Category == "redundant") %>% any())
## [1] TRUE
#citation counts
GG_denhist(studies, "WoS")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
describe(studies$WoS) %>% (t)
## X1
## vars 1.00
## n 919.00
## mean 24.73
## sd 40.22
## median 13.00
## trimmed 17.06
## mad 13.34
## min 0.00
## max 560.00
## range 560.00
## skew 6.11
## kurtosis 58.59
## se 1.33
#citation rate
GG_denhist(studies, "wos_per_year")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
describe(studies$wos_per_year) %>% (t)
## X1
## vars 1.000
## n 919.000
## mean 3.059
## sd 3.757
## median 2.000
## trimmed 2.370
## mad 1.730
## min 0.000
## max 50.909
## range 50.909
## skew 4.791
## kurtosis 40.201
## se 0.124
#plot the relationship to publication year
GG_scatter(studies, "Year", "wos_per_year") +
geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Before we can model the data, we need to convert it to a tidy format with one row per article. For this, we spread the keywords into wide-format dummy variables.
#test data
#illustrate how the spreading works
set.seed(1)
tt = data_frame(
  id = c(rep("A", 3), rep("B", 4), rep("C", 2)),
  keyword = sample(letters, 9, replace = T),
  true = T,
  ...count = 1:9
)
#print example data
tt
#show how wide-format transform works
tt2 = tt %>% spread(key = keyword, value = true, fill = F)
tt2
#collapse to single row
tt2 %>%
  plyr::ddply("id", function(x) {
    #keep constants
    y1 = data_frame(
      id = x$id[1]
    )
    #sum each dummy column within the group
    y2 = colSums(x[-c(1:2)]) %>% as.list() %>% as_data_frame()
    cbind(y1, y2)
  })
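The same collapse could also be done directly with dplyr verbs (an equivalent sketch, not what the script uses below):
tt2 %>%
  select(-`...count`) %>%
  group_by(id) %>%
  summarise_all(sum)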
Direct keyword analyses.
#metadata vars
kw_metavars = c("author_year_title", "...count")
#normalize keywords: strip trailing digits and whitespace, then lower-case
#save orig
d$keyword_orig = d$Keyword
d$keyword = d$keyword_orig %>% str_replace("\\s+\\d*\\s*$", "") %>% str_to_lower()
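#to illustrate the normalization on made-up keywords (not from the data):
#  c("Working Memory 2", "Flynn Effect   ") %>% str_replace("\\s+\\d*\\s*$", "") %>% str_to_lower()
#  gives "working memory" "flynn effect"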
#unique count before and after
d$keyword_orig %>% unique() %>% length()
## [1] 2989
d$keyword %>% unique() %>% length()
## [1] 2134
#do a count
(kw_counts = table2(d$keyword))
#wide format
#assert no overlap with existing colnames
#this means we can just insert the dummies and don't have to worry about replacing anything
#though we do have to allow syntactically 'illegal' column names
assert_that(!any(names(d) %in% kw_counts$Group))
## [1] TRUE
#spread kws
kws = d %>%
  #get id and keyword col
  select(author_year_title, keyword) %>%
  #add TRUE col
  mutate(
    #logical indicator for the presence of the keyword
    true = T,
    #row counter, needed only to keep rows unique during the spread
    ...count = 1:nrow(.)
  ) %>%
  #as wide
  tidyr::spread(key = keyword, value = true, fill = F)
#calculate proportions by keyword
kw_dists = kws %>% select(-!!kw_metavars) %>% colMeans()
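#note: in newer dplyr (>= 1.0) the same column drop would typically be written as
#  kws %>% select(-all_of(kw_metavars)) %>% colMeans()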
kw_dists %>% sort(decreasing = T) %>% head(50)
## intelligence iq
## 0.08776 0.02108
## working memory cognitive ability
## 0.01604 0.01054
## flynn effect fluid intelligence
## 0.01054 0.01031
## general intelligence sex differences
## 0.01031 0.00940
## education cognitive abilities
## 0.00894 0.00687
## g aging
## 0.00687 0.00481
## inspection time g factor
## 0.00481 0.00458
## reasoning personality
## 0.00458 0.00412
## processing speed cognition
## 0.00412 0.00390
## crystallized intelligence development
## 0.00367 0.00344
## emotional intelligence creativity
## 0.00344 0.00321
## general cognitive ability individual differences
## 0.00321 0.00321
## spearman's hypothesis working memory capacity
## 0.00321 0.00321
## children cognitive epidemiology
## 0.00298 0.00298
## factor analysis longitudinal
## 0.00298 0.00298
## attention heritability
## 0.00275 0.00275
## expertise fertility
## 0.00252 0.00252
## gender differences health
## 0.00252 0.00252
## mental rotation socioeconomic status
## 0.00252 0.00252
## cognitive development complex problem solving
## 0.00229 0.00229
## income mental speed
## 0.00229 0.00229
## national iq reaction time
## 0.00229 0.00229
## twins educational attainment
## 0.00229 0.00206
## evolution gender
## 0.00206 0.00206
## mental ability meta-analysis
## 0.00206 0.00206
#cats metavar
cat_metavars = c("author_year_title", "...count")
#do a count
(cat_counts = table2(d$Category))
#spread
cats = d %>%
  #get id and cat col
  select(author_year_title, Category) %>%
  #filter redundant?
  # filter(Category != "redundant") %>%
  #add TRUE col
  mutate(
    #logical indicator for the presence of the category
    true = T,
    #row counter, needed only to keep rows unique during the spread
    ...count = 1:nrow(.)
  ) %>%
  #as wide
  tidyr::spread(key = Category, value = true, fill = F)
#per-article category counts (note: this overwrites the simple count table above)
cat_counts = cats %>% plyr::ddply("author_year_title", function(x) {
  #keep constants
  y1 = data_frame(
    author_year_title = x$author_year_title[1]
  )
  #sum each category dummy within the article
  y2 = x %>% select(-!!cat_metavars) %>% colSums() %>% as.list() %>% as_data_frame()
  cbind(y1, y2, count = sum(y2))
})
#per-article category indicators (logical: category present or not)
cat_lgls = cats %>% plyr::ddply("author_year_title", function(x) {
  #keep constants
  y1 = data_frame(
    author_year_title = x$author_year_title[1]
  )
  #sum each category dummy within the article, then convert to logical
  y2 = x %>% select(-!!cat_metavars) %>% colSums() %>% as.list() %>% as_data_frame() %>% map_df(as.logical)
  cbind(y1, y2, count = y2 %>% map_df(as.numeric) %>% sum())
})
#calculate proportions by category
cat_dists = cat_lgls %>% select(-author_year_title, -count) %>% colMeans()
cat_dists_count = cat_lgls %>% select(-author_year_title, -count) %>% colSums()
cat_dists %>% sort(decreasing = T) %>% head(50)
## intelligence / cognitive ability g factor
## 0.5862 0.1507
## psychometrics / statistics education
## 0.1266 0.1234
## iq / achievement / aptitude test race / ethnicity
## 0.1092 0.1081
## working memory brain / neuro
## 0.1048 0.0906
## children / child development memory / cognition
## 0.0830 0.0808
## sex differences income / status / ses
## 0.0797 0.0786
## health adult / aging
## 0.0753 0.0688
## fluid intelligence flynn effect
## 0.0655 0.0644
## modeling genes / evolution
## 0.0622 0.0600
## genes and environment ECT
## 0.0590 0.0579
## mental speed aggregate / regional iqs
## 0.0557 0.0491
## raven's iq theories
## 0.0491 0.0480
## crystallized intelligence attention
## 0.0426 0.0393
## personality reasoning
## 0.0371 0.0360
## factor analysis spatial ability
## 0.0349 0.0338
## spearman's hypothesis executive function
## 0.0317 0.0306
## item level / IRT politics
## 0.0306 0.0251
## longitudinal economics
## 0.0240 0.0207
## EQ individual change / stability
## 0.0207 0.0207
## problem solving / decision making talent / giftedness
## 0.0207 0.0207
## creativity expertise
## 0.0197 0.0197
## work slodr
## 0.0197 0.0186
## dysgenics individual differences
## 0.0175 0.0175
## sensation / perception culture / cross cultural
## 0.0175 0.0164
## meta-analysis religiosity
## 0.0164 0.0164
#join citation info from the studies table (wide format, no duplicated rows)
cat_counts2 = left_join(cat_counts, studies %>% select(author_year_title, wos_per_year, WoS, Year), by = "author_year_title")
cat_lgls2 = left_join(cat_lgls, studies %>% select(author_year_title, wos_per_year, WoS, Year), by = "author_year_title")
cat_year_sums = cat_counts2 %>% plyr::ddply("Year", function(x) {
  x %>% select(-author_year_title, -Year, -wos_per_year) %>% colSums()
})
cat_year_sums[c("Year", cat_dists %>% sort(decreasing = T) %>% head(10) %>% names())]
cat_lgls2_years = cat_lgls2 %>% filter(Year != 2000) %>% plyr::ddply("Year", function(x) {
  x %>% select(-author_year_title, -Year, -wos_per_year) %>% colMeans()
})
cat_lgls2_years[c("Year", cat_dists %>% sort(decreasing = T) %>% .[-1] %>% head(10) %>% names())] %>%
#to long form
gather(key = category, value = prop, -Year) %>%
ggplot(aes(Year, prop, color = category)) +
# geom_line()
geom_smooth(se = F) +
scale_x_continuous(breaks = 0:9999) +
theme_classic() +
theme(axis.text.x = element_text(angle = -30))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#subsets of categories, requested by Bryan
#subset 1: roughly flat trends
cat_lgls2_years[c("Year", "EQ", "reasoning", "modeling", "fluid intelligence", "attention", "factor analysis", "spatial ability")] %>%
  #to long form
  gather(key = category, value = prop, -Year) %>%
  ggplot(aes(Year, prop, color = category)) +
  # geom_line()
  geom_smooth(se = F) +
  scale_x_continuous(breaks = 0:9999) +
  ylab("Proportion of papers") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = -30)) +
  ylim(0, .25)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
GG_save("figs/timeline1.png")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#subset 2: n- or u-shaped trends
cat_lgls2_years[c("Year", "working memory", "g factor", "iq / achievement / aptitude test", "sex differences", "brain / neuro", "crystallized intelligence")] %>%
  #to long form
  gather(key = category, value = prop, -Year) %>%
  ggplot(aes(Year, prop, color = category)) +
  # geom_line()
  geom_smooth(se = F) +
  scale_x_continuous(breaks = 0:9999) +
  ylab("Proportion of papers") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = -30)) +
  ylim(0, .25)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
GG_save("figs/timeline2.png")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
#subset 3: rising then falling trends
cat_lgls2_years[c("Year", "mental speed", "flynn effect", "memory / cognition", "education", "executive function", "iq theories", "psychometrics / statistics")] %>%
  #to long form
  gather(key = category, value = prop, -Year) %>%
  ggplot(aes(Year, prop, color = category)) +
  # geom_line()
  geom_smooth(se = F) +
  scale_x_continuous(breaks = 0:9999) +
  ylab("Proportion of papers") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = -30)) +
  ylim(0, .25)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_smooth).
GG_save("figs/timeline3.png")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_smooth).
#build formulas
#top 20 categories
form_model = glue::glue("wos_per_year ~ `{str_c(cat_dists %>% sort(decreasing = T) %>% head(20) %>% names(), collapse = '` + `')}` + count") %>% as.formula()
#with year as control
form_model2 = glue::glue("wos_per_year ~ `{str_c(cat_dists %>% sort(decreasing = T) %>% head(20) %>% names(), collapse = '` + `')}` + count + rcs(Year)") %>% as.formula()
#all categories with >= 20 articles
(cats_at_least_20 = cat_dists_count[cat_dists_count >= 20] %>% names())
## [1] "adult / aging" "aggregate / regional iqs"
## [3] "attention" "brain / neuro"
## [5] "children / child development" "crystallized intelligence"
## [7] "ECT" "education"
## [9] "executive function" "factor analysis"
## [11] "fluid intelligence" "flynn effect"
## [13] "genes and environment" "genes / evolution"
## [15] "g factor" "health"
## [17] "income / status / ses" "intelligence / cognitive ability"
## [19] "iq / achievement / aptitude test" "iq theories"
## [21] "item level / IRT" "longitudinal"
## [23] "memory / cognition" "mental speed"
## [25] "modeling" "personality"
## [27] "politics" "psychometrics / statistics"
## [29] "race / ethnicity" "raven's"
## [31] "reasoning" "sex differences"
## [33] "spatial ability" "spearman's hypothesis"
## [35] "working memory"
form_model3 = glue::glue("wos_per_year ~ `{str_c(cats_at_least_20, collapse = '` + `')}` + count + rcs(Year)") %>% as.formula()
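#for illustration, with hypothetical categories "g factor" and "education" the template would expand to:
#  wos_per_year ~ `g factor` + `education` + count + rcs(Year)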
#fit
rms::ols(form_model, data = cat_counts2)
## Linear Regression Model
##
## rms::ols(formula = form_model, data = cat_counts2)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 916 LR chi2 56.48 R2 0.060
## sigma3.6897 d.f. 21 R2 adj 0.038
## d.f. 894 Pr(> chi2) 0.0000 g 1.007
##
## Residuals
##
## Min 1Q Median 3Q Max
## -5.5487 -1.9148 -0.8652 0.7808 46.4351
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 1.7007 0.4252 4.00 <0.0001
## intelligence / cognitive ability 0.2854 0.2144 1.33 0.1835
## g factor -0.2364 0.3226 -0.73 0.4639
## psychometrics / statistics -0.2183 0.2393 -0.91 0.3618
## education 0.1704 0.3187 0.53 0.5930
## iq / achievement / aptitude test 0.3360 0.2629 1.28 0.2016
## race / ethnicity -0.8718 0.2465 -3.54 0.0004
## working memory 0.9087 0.3324 2.73 0.0064
## brain / neuro -0.0559 0.1811 -0.31 0.7577
## children / child development -0.4068 0.3594 -1.13 0.2580
## memory / cognition 0.0004 0.4438 0.00 0.9992
## sex differences -0.0844 0.4171 -0.20 0.8397
## income / status / ses 0.1842 0.3408 0.54 0.5890
## health -0.4738 0.2236 -2.12 0.0344
## adult / aging -0.5218 0.4006 -1.30 0.1931
## fluid intelligence 0.8196 0.5342 1.53 0.1253
## flynn effect 0.6700 0.4071 1.65 0.1002
## modeling 0.5181 0.3747 1.38 0.1671
## genes / evolution -0.2082 0.3012 -0.69 0.4897
## genes and environment -0.4061 0.2447 -1.66 0.0974
## ECT -0.5679 0.3654 -1.55 0.1205
## count 0.2720 0.1033 2.63 0.0086
##
rms::ols(form_model2, data = cat_counts2)
## Linear Regression Model
##
## rms::ols(formula = form_model2, data = cat_counts2)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 916 LR chi2 79.73 R2 0.083
## sigma3.6513 d.f. 25 R2 adj 0.058
## d.f. 890 Pr(> chi2) 0.0000 g 1.209
##
## Residuals
##
## Min 1Q Median 3Q Max
## -5.6105 -1.9149 -0.7779 0.7468 45.7296
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -626.4974 289.4705 -2.16 0.0307
## intelligence / cognitive ability 0.2357 0.2140 1.10 0.2711
## g factor -0.4200 0.3245 -1.29 0.1959
## psychometrics / statistics -0.2433 0.2375 -1.02 0.3060
## education 0.2659 0.3169 0.84 0.4016
## iq / achievement / aptitude test 0.1409 0.2639 0.53 0.5936
## race / ethnicity -0.8235 0.2444 -3.37 0.0008
## working memory 0.7448 0.3311 2.25 0.0247
## brain / neuro -0.1458 0.1806 -0.81 0.4197
## children / child development -0.3928 0.3570 -1.10 0.2715
## memory / cognition -0.1183 0.4427 -0.27 0.7893
## sex differences -0.2935 0.4173 -0.70 0.4820
## income / status / ses 0.0973 0.3383 0.29 0.7738
## health -0.5453 0.2237 -2.44 0.0150
## adult / aging -0.4403 0.3970 -1.11 0.2678
## fluid intelligence 0.9064 0.5291 1.71 0.0870
## flynn effect 0.6728 0.4042 1.66 0.0963
## modeling 0.5613 0.3718 1.51 0.1315
## genes / evolution -0.2414 0.2984 -0.81 0.4188
## genes and environment -0.4352 0.2431 -1.79 0.0738
## ECT -0.5669 0.3662 -1.55 0.1219
## count 0.2761 0.1033 2.67 0.0076
## Year 0.3138 0.1445 2.17 0.0301
## Year' -0.9696 0.4329 -2.24 0.0254
## Year'' 5.3829 3.1058 1.73 0.0834
## Year''' -10.2308 6.8756 -1.49 0.1371
##
rms::ols(form_model3, data = cat_counts2)
## Linear Regression Model
##
## rms::ols(formula = form_model3, data = cat_counts2)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 916 LR chi2 140.92 R2 0.143
## sigma3.5615 d.f. 40 R2 adj 0.103
## d.f. 875 Pr(> chi2) 0.0000 g 1.515
##
## Residuals
##
## Min 1Q Median 3Q Max
## -7.6618 -1.8408 -0.6408 0.8643 45.5900
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -588.3727 284.8436 -2.07 0.0392
## adult / aging -0.2001 0.4022 -0.50 0.6189
## aggregate / regional iqs -0.1730 0.5163 -0.34 0.7377
## attention -0.3372 0.5565 -0.61 0.5448
## brain / neuro -0.1038 0.1842 -0.56 0.5734
## children / child development -0.3985 0.3539 -1.13 0.2605
## crystallized intelligence -0.5609 0.4878 -1.15 0.2505
## ECT -0.3663 0.3643 -1.01 0.3151
## education 0.4396 0.3179 1.38 0.1671
## executive function 1.8500 0.4398 4.21 <0.0001
## factor analysis 1.3763 0.5374 2.56 0.0106
## fluid intelligence 1.2154 0.5366 2.27 0.0237
## flynn effect 0.7693 0.4050 1.90 0.0578
## genes and environment -0.3425 0.2450 -1.40 0.1625
## genes / evolution -0.1184 0.2984 -0.40 0.6916
## g factor -0.5323 0.3296 -1.61 0.1067
## health -0.4603 0.2265 -2.03 0.0424
## income / status / ses 0.1972 0.3421 0.58 0.5644
## intelligence / cognitive ability 0.3118 0.2193 1.42 0.1555
## iq / achievement / aptitude test 0.2011 0.2650 0.76 0.4480
## iq theories 1.2657 0.3798 3.33 0.0009
## item level / IRT -0.6251 0.5085 -1.23 0.2193
## longitudinal -0.3981 0.8019 -0.50 0.6197
## memory / cognition -0.1987 0.4376 -0.45 0.6498
## mental speed -0.9802 0.4554 -2.15 0.0316
## modeling 0.5069 0.3679 1.38 0.1686
## personality -0.5836 0.4633 -1.26 0.2082
## politics 0.1458 0.3567 0.41 0.6827
## psychometrics / statistics -0.1905 0.2429 -0.78 0.4331
## race / ethnicity -0.7080 0.2505 -2.83 0.0048
## raven's -0.7148 0.5624 -1.27 0.2041
## reasoning -0.2371 0.5882 -0.40 0.6869
## sex differences -0.2613 0.4149 -0.63 0.5289
## spatial ability 0.9560 0.4342 2.20 0.0279
## spearman's hypothesis 0.2515 0.6582 0.38 0.7025
## working memory 0.9322 0.3362 2.77 0.0057
## count 0.2043 0.1149 1.78 0.0756
## Year 0.2949 0.1422 2.07 0.0383
## Year' -0.9612 0.4257 -2.26 0.0242
## Year'' 5.4870 3.0587 1.79 0.0732
## Year''' -10.5190 6.7767 -1.55 0.1210
##
#lasso regression on all categories with at least 10 articles
(cats_at_least_10 = cat_dists_count[cat_dists_count >= 10] %>% names())
## [1] "adult / aging"
## [2] "aggregate / regional iqs"
## [3] "assessment"
## [4] "attention"
## [5] "brain / neuro"
## [6] "children / child development"
## [7] "cohort"
## [8] "creativity"
## [9] "crime"
## [10] "crystallized intelligence"
## [11] "culture / cross cultural"
## [12] "dysgenics"
## [13] "economics"
## [14] "ECT"
## [15] "education"
## [16] "EQ"
## [17] "executive function"
## [18] "expertise"
## [19] "factor analysis"
## [20] "fertility"
## [21] "fluid intelligence"
## [22] "flynn effect"
## [23] "genes and environment"
## [24] "genes / evolution"
## [25] "g factor"
## [26] "health"
## [27] "income / status / ses"
## [28] "individual change / stability"
## [29] "individual differences"
## [30] "intelligence / cognitive ability"
## [31] "iq / achievement / aptitude test"
## [32] "iq theories"
## [33] "item level / IRT"
## [34] "learning"
## [35] "longitudinal"
## [36] "mathematics"
## [37] "memory / cognition"
## [38] "mental speed"
## [39] "meta-analysis"
## [40] "methods"
## [41] "modeling"
## [42] "motivation"
## [43] "multiple categories"
## [44] "personality"
## [45] "politics"
## [46] "practice"
## [47] "problem solving / decision making"
## [48] "psychometrics / statistics"
## [49] "race / ethnicity"
## [50] "raven's"
## [51] "reasoning"
## [52] "religiosity"
## [53] "sensation / perception"
## [54] "sex differences"
## [55] "slodr"
## [56] "spatial ability"
## [57] "spearman's hypothesis"
## [58] "talent / giftedness"
## [59] "training"
## [60] "variability"
## [61] "work"
## [62] "working memory"
lasso_cv = cv.glmnet(y = cat_counts2$wos_per_year, x = cat_counts2[c(cats_at_least_10, "count")] %>% as.matrix(), nfolds = nrow(cat_counts2))
## Warning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations
## per fold
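#note: nfolds = nrow(cat_counts2) gives leave-one-out cross-validation, hence the grouped = FALSE warning above (fewer than 3 observations per fold)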
plot(lasso_cv)
get_glmnet_coefs(lasso_cv) %>% arrange(-beta_min)
## Warning: Column `predictor` joining factors with different levels, coercing
## to character vector
#get citation summary stats by category
cats_wos_long_all = map_df(names(cat_dists), function(x) {
  #varname as symbol
  xx = rlang::sym(x)
  #filter to articles with that category
  y = cat_counts2 %>% filter(as.logical(!!xx)) %>% select(wos_per_year) %>% mutate(category = x)
  data_frame(
    category = x,
    mean = mean(y$wos_per_year),
    median = median(y$wos_per_year),
    sd = sd(y$wos_per_year),
    n = nrow(y)
  )
})
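#note: the rlang::sym(x) + !! pattern selects the column named in x; in newer dplyr the same
#filter could be written as filter(as.logical(.data[[x]]))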
#print
cats_wos_long_all %>% filter(n >= 10)
#scatterplot of mean citations per year vs. number of papers with that category
GG_scatter(cats_wos_long_all, "mean", "n")
cats_wos_long_all[-1] %>% wtd.cors()
## mean median sd n
## mean 1.000 0.9549 0.767 0.0470
## median 0.955 1.0000 0.301 -0.0358
## sd 0.767 0.3010 1.000 0.1837
## n 0.047 -0.0358 0.184 1.0000
write_rds(d, "data/main.rds")
write_rds(kws, "data/keywords.rds")
write_rds(cats, "data/categories.rds")
write_rds(cats_wos_long_all, "data/cats_wos_long_all.rds")