Init

Sys.setenv(LANG = "en") #set R messages and warnings to English

library(kirkegaard)
load_packages(
  readxl,
  mirt,
  haven,
  googlesheets4,
  rms,
  nnet,
  ggeffects,
  mgcv,
  future
)

theme_set(theme_bw())

#exclusion rules
exclude_failed_attention_checks = T
exclude_below_min = 17.5 #minimum completion time; faster respondents are excluded

options(contrasts = c("contr.treatment", "contr.treatment")) #treatment contrasts for both unordered and ordered factors

#multi-threading
plan(multisession(workers = 4))

Data

Data cleaning and coding

#read
# d = read_spss("data/SpssExport-1645882702/spss.sav")
#only the CSV export has browser data, which we need!
d_raw = read_csv("C:/Users/mh198/OneDrive/Documents/Data/Other data/A new, free to use Danish vocabulary test with norms/data/20250113142029-SurveyExport.csv") %>% 
  df_legalize_names(func = function(x) str_legalize(x, ascii = F)) 
## Rows: 2904 Columns: 585
## ── Column specification ──────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (572): Status, Language, Referer, SessionID, User Agent, IP Address, Country, City, State...
## dbl    (6): Response ID, Longitude, Latitude, Hvor mange bøger læste du sidste år? Inklusiv ly...
## lgl    (5): Contact ID, Legacy Comments, Comments, Tags, End redirect
## dttm   (2): Time Started, Date Submitted
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
d = d_raw %>% 
  #filter to completed responses from the panel (ssisurveys referer) by native Danish speakers
  filter(Status == "Complete", 
         str_detect(Referer, "ssisurveys"),
         Taler_du_dansk_som_modersmål == "Ja")

d_vars = df_var_table(d)

#add own response back (the last row of the raw export)
d = bind_rows(
  d,
  d_raw %>% tail(1)
)

#copy the attention checks
attention_checks = d %>% select(contains("Vælg_batteri_og_musik"), contains("monster_og_terrasse"))

#remove these variables from main
for (v in names(attention_checks)) d[[v]] = NULL

#add attention check pass indicators back (pass = both target words selected)
d$attention_check_1 = (!is.na(attention_checks$batteri_Vælg_batteri_og_musik) & !is.na(attention_checks$musik_Vælg_batteri_og_musik))
d$attention_check_2 = (!is.na(attention_checks$monster_Vælg_monster_og_terrasse) & !is.na(attention_checks$terrasse_Vælg_monster_og_terrasse))
d$attention_checksum = d$attention_check_1 + d$attention_check_2
d$attention_checksum %>% table2()
#manual inspection: count of the four check words left unselected (0 = passed both checks)
d$attention_checksum2 = attention_checks %>% select(batteri_Vælg_batteri_og_musik, musik_Vælg_batteri_og_musik, monster_Vælg_monster_og_terrasse, terrasse_Vælg_monster_og_terrasse) %>% 
  miss_by_case()

#recodings
d %<>% mutate(
  vote_political_party = Hvis_der_var_Folketingsvalg_i_dag_hvem_ville_du_så_stemme_på %>% fct_relevel("S - Socialdemokratiet"),
  age = Hvor_gammel_er_du %>% as.numeric(),
  attention_pass = (attention_checksum2 == 0),
  time_to_complete = (Date_Submitted - Time_Started) %>% as.numeric(),
  time_to_complete_winsor = time_to_complete %>% winsorise(upper = 60),
  time_below_min = time_to_complete_winsor <= exclude_below_min,
  sex = Hvilken_køn_er_du_Biologisk %>% mapvalues(c("Mand", "Kvinde"), c("Male", "Female")) %>% as.factor(),
  native_speaker = Taler_du_dansk_som_modersmål == "Ja",
  education = Hvad_er_din_højeste_fuldførte_uddannelse %>% mapvalues(
    c(
      "Grundskole eller under (op til 10. klasse)",
      "Gymnasial uddannelse (almen/STX, handelshøjskole/HHX, teknisk/HTX, HF)",
      "Erhvervsfaglig uddannelse (fx social- og sundhedshjælper, elektriker, gastronom, murer, EUD)",
      "Kort videregående uddannelse (fx tandplejer, handelsøkonom, farmakonom, laborant og byggetekniker, 2-årig)",
      "Mellemlang videregående uddannelse (fx pædagog, sygeplejerske, skolelærer, maskinmester, diplomingeniør, 3-4-årig)",
      "Universitets bachelor (fx BA i dansk, BsC i fysik, 3-årig)",
      "Universitets kandidat (fx cand.mag i tysk, cand.scient i geologi, 2 år ovenpå bachelor)",
      "Ph.d. og forskeruddannelser"
    ),
    c(
      "Less than high school",
      "High school",
      "Vocational",
      "Short education",
      "Medium education",
      "University bachelor",
      "University master",
      "PhD"
    )
  ) %>% ordered(levels =    c(
      "Less than high school",
      "High school",
      "Vocational",
      "Short education",
      "Medium education",
      "University bachelor",
      "University master",
      "PhD"
    )),
  books_read = Hvor_mange_bøger_læste_du_sidste_år_Inklusiv_lydbøger,
  children = Hvor_mange_biologiske_børn_har_du,
  ideal_fertility = Forestil_dig_din_drømmeverden_Hvor_mange_biologiske_børn_ville_du_have_i_den
)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `age = Hvor_gammel_er_du %>% as.numeric()`.
## Caused by warning in `Hvor_gammel_er_du %>% as.numeric()`:
## ! NAs introduced by coercion
#filter to those who passed the attention checks, did not complete implausibly fast, and are native speakers
d_orig = d
if (exclude_failed_attention_checks) {
  d %<>% filter(attention_checksum2 == 0)
  d %<>% filter(!time_below_min)
  d %<>% filter(native_speaker)
}
d_failed = d_orig %>% filter(attention_checksum2 > 0)

table2(d_orig$attention_checksum2)
#export fails
d_failed %>% select(What_is_your_user_ID, attention_checksum2) %>% write_csv("data/failed_users.csv")

Vocab data

#restructure so we get just 101 item columns
#each item spans 5 answer columns (one per word), so we need to find the sets of 5 columns to merge
vocab_vars = d %>% select(overvåge_Vælg_de_to_ord_med_samme_eller_næsten_samme_betydning:svejfet_Vælg_de_to_ord_med_samme_eller_næsten_samme_betydning) %>% names()
length(vocab_vars)
## [1] 505
#item table: one row per item with its five candidate words
items_words = map_df(split_every_k(vocab_vars, k = 5), function(dd) {
  # browser()
  #split into 5 cols
  tibble(
    item_1 = dd[1] %>% str_replace("_Vælg_de_to_ord_med_samme_eller_næsten_samme_betydning", ""),
    item_2 = dd[2] %>% str_replace("_Vælg_de_to_ord_med_samme_eller_næsten_samme_betydning", ""),
    item_3 = dd[3] %>% str_replace("_Vælg_de_to_ord_med_samme_eller_næsten_samme_betydning", ""),
    item_4 = dd[4] %>% str_replace("_Vælg_de_to_ord_med_samme_eller_næsten_samme_betydning", ""),
    item_5 = dd[5] %>% str_replace("_Vælg_de_to_ord_med_samme_eller_næsten_samme_betydning", ""),
  )
}) %>% mutate(
  item_number = 1:n()
)

#convert to columns with integer sets
item_int_sets = map(split_every_k(vocab_vars, k = 5), function(dd) {
  # browser()
  #subset variables, check which are not missing, concat with comma
  d %>% 
    select(!!unlist(dd)) %>% 
    as.matrix() %>% 
    {
      #missing, matrix format
      miss = is.na(.) %>% `!`() %>% matrix(ncol = ncol(.))
      miss
    } %>% 
    plyr::alply(.margins = 1, .fun = function(x) {
    which(x) %>% str_c(collapse = ",")
  }) %>% 
    unlist() -> 
    items_picked
  
  tibble(
    value = items_picked
  ) %>% set_colnames(unlist(dd))
}) %>% 
  bind_cols() %>% 
  set_colnames(str_glue("item_{1:(length(vocab_vars)/5)}"))
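#each cell now holds the comma-separated positions of the words the respondent picked, e.g. "1,3" if they chose the 1st and 3rd word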

#score them to T/F status using the key
gs4_deauth()
key = read_sheet("https://docs.google.com/spreadsheets/d/1H-4fkRYEVvc3jREOtkyOJCdWqxBQXoXkPpxkwsinIcs/edit#gid=0", skip = 2) %>% 
  df_legalize_names()
## ✔ Reading from "Danish vocab test (2023-06-23)".
## ✔ Range '3:10000000'.
#ensure they are in the same order
assert_that(all(
  key$Item_1 == items_words$item_1
))
## [1] TRUE
key %>% write_csv("data/vocab_items.csv")

#convert the correct answers to their positions (1-5) and build the score key
key %<>% rowwise() %>% 
  mutate(
    item_correct_1_number = which(Item_correct_1 == c(Item_1, Item_2, Item_3, Item_4, Item_5)),
    item_correct_2_number = which(Item_correct_2 == c(Item_1, Item_2, Item_3, Item_4, Item_5)),
    item_correct_scorekey = str_glue("{item_correct_1_number},{item_correct_2_number}")
  )
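#the score key uses the same format, e.g. "2,5" for an item whose correct pair sits in positions 2 and 5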

#score
items_TF = score_items(item_int_sets, key = key$item_correct_scorekey)

#list all words and check that none is reused across items (all counts should be 1)
vocab_vars %>% str_replace("_Vælg_de_to_ord_med_samme_eller_næsten_samme_betydning", "") %>% table2(include_NA = F) %>% pull(Count) %>% table2()

Analysis

Full item set

#pass rate and sample size (n) by item
item_stats = tibble(
  item = names(items_TF),
  pass_rate = items_TF %>% colMeans(na.rm = T),
  n = items_TF %>% plyr::alply(.margins = 2, .fun = function(x) {miss_count(x, reverse = T)})  %>% unlist(),
)

#general stats
item_stats$pass_rate %>% describe2()
item_stats$pass_rate %>% GG_denhist() + scale_x_continuous(breaks = seq(0, 1, .1))
## Input seems like a fraction, set `boundary=0` and `binwidth=1/30` to avoid issues near the limits. Disable this with `auto_fraction_bounary=F`
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.

#person level stats
d$vocab_sumscore = items_TF %>% rowSums(na.rm = T)
d$vocab_sumscore
##   [1]  23  64  57  10  74  74  72  69  74  27  87  74  98  34  89  55  95  39  89  58  83  88  66
##  [24]  77  74  68  48  35  90  75  38  89  75  87  94  84  80  84  73  77  60  99  50  90  80  76
##  [47]  58  54  23  84  75  33  75  58  99  76  90  64  65  84  69  71  89  74  87  92  71  37  81
##  [70]  44  77  88  56  92  80  67  51  77  74  65  90  55  74  59  98  74  86  58  44  81  88  92
##  [93]  81  82  86  82  91  76  66  94  94  85  76  63  95  90  96  75  35  84  83  76  74  69  88
## [116]  83  82  77  62  47  74  89  47  85  55  86  91  82  90  64  29  71  87  97  58  75  96  68
## [139]  11  80  55  83  90  91  90  78  77  53  77  93  78  90  36  88  68  74  82  78  99  85  86
## [162]  67  87  92  91  93  80  60  73  87  73  53  70  67  75  34  41  93  95  75  82  43  87  60
## [185]  62  67  83  65  82  71  81  63  72 101  79  79  34  22  60  88  52  72  90  72  72  75  87
## [208]  51  91  90  91  69  86  83  74  78  72  75  18  75  83  52  71  33  74  36  78  76  67  43
## [231]  81  84  74  59  69  57  78  46  76  43  96  58  80  74  75  89  44  84  86  79  84  64  55
## [254]  70  58  63  26  65  70  75  59  60  39  77  21  46  69  71  56  32  73  55  36  18  90  48
## [277]  71  64  72  91  50  86  72  57  24  82  81  80  43  33  68  90 100  64  63  26  86  64  76
## [300]  53  69  46  31  35  50  41  77  64  61  39  80  76  76  86  67  95  86  69  83  81  70  76
## [323]  49  83  62  51  77  79  87  91  91  78  57  31  98  77  30  61  59  48  71  62  75  63  71
## [346]  73  82  69  62  57  44  69  83  90  76  50  95  82  89  66  75  83  67  93  77  85  88  83
## [369]  52  99  76  72  83  78  61  89  93  96  91  89  95  89  65  79  78  74  37  85  88  51  81
## [392]  90  82  82  88  67  90  91  81  77  38  78  63  88  40  76  68  86  81  86  89  56  11  81
## [415]  63  59  88  77  87  64  75  93  85  78  56  78  90  43  53  86  70  60  86  48  33  77  61
## [438]  88  90  56  64  76  80  72  88  68  84  60  55  65  58  53  62  76  36  51  45  15  84  82
## [461]  81  54  62  46  61  69  70  86  86  88  39  81  61  80  65  54  42  76  75  97  86  68  71
## [484]  72  92  79  43  85  91  90  94  76  68  74  89  68  83  81  68  87  79  48  80  81  91  64
## [507]  83  73  72  79  54  99  86  97  40  75  86  86  74  64  46  75  53  73  47  50  80  66  55
## [530]  62  60  90  57  92  88  63  70  48  70  90  57  81  52  58  59  74  41  58  65  63  76  31
## [553]  72  50  61  73  68  97  83  80  90  81  73  79  69  74  76  55  65  70  87  95  86  54  72
## [576]  85  85  90  80  38  79  92  55  30  93  92  96  69  67  81  75  63  81  72  38  73  83  38
## [599]  81  79  73  55  88  76  52  71  55  64  85  76  39  89  72  54  41  80  41  82  78  48  67
## [622]  85  80
GG_denhist(d, "vocab_sumscore") +
  scale_x_continuous("Vocabulary sum score")
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

GG_save("figs/vocab_sumscore_dist.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#how many hit the ceiling?
sum(d$vocab_sumscore == 101)
## [1] 1
#compare people who failed attention checks
# GG_denhist(d_orig, "vocab_sumscore", group = "attention_checksum2")

d %>% pull(vocab_sumscore) %>% describe2()
#time to complete
d %>% pull(time_to_complete) %>% describe2()
#time use vs. score
d %>% GG_scatter("time_to_complete", "vocab_sumscore")
## `geom_smooth()` using formula = 'y ~ x'

d %>% GG_scatter("time_to_complete_winsor", "vocab_sumscore") +
  geom_smooth()
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

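#dashed line = expected chance score: guessing 2 of 5 words succeeds with probability 1/choose(5,2) = 10%, i.e. about 10 of the 101 items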
d %>% GG_scatter("time_to_complete_winsor", "vocab_sumscore", color = "time_below_min") +
  geom_smooth() +
  geom_hline(yintercept = 101 * 0.1, linetype = "dashed")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

d %>% GG_scatter("time_to_complete_winsor", "vocab_sumscore", color = "attention_pass") +
  geom_smooth()
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

#IRT: unidimensional 2PL model
irt_fit = mirt(
  items_TF,
  model = 1,
  verbose = F,
  technical = list(NCYCLES = 2000),
  SE = T
)

irt_fit
## 
## Call:
## mirt(data = items_TF, model = 1, SE = T, verbose = F, technical = list(NCYCLES = 2000))
## 
## Full-information item factor analysis with 1 factor(s).
## Converged within 1e-04 tolerance after 95 EM iterations.
## mirt version: 1.42 
## M-step optimizer: BFGS 
## EM acceleration: Ramsay 
## Number of rectangular quadrature: 61
## Latent density type: Gaussian 
## 
## Information matrix estimated with method: Oakes
## Second-order test: model is a possible local maximum
## Condition number of information matrix =  114.8989
## 
## Log-likelihood = -26412.21
## Estimated parameters: 202 
## AIC = 53228.43
## BIC = 54124.21; SABIC = 53482.89
## G2 (1e+10) = 44806.98, p = 1
## RMSEA = 0, CFI = NaN, TLI = NaN
irt_fit %>% summary()
##             F1     h2
## item_1   0.346 0.1195
## item_2   0.461 0.2121
## item_3   0.547 0.2988
## item_4   0.715 0.5107
## item_5   0.393 0.1548
## item_6   0.516 0.2661
## item_7   0.794 0.6301
## item_8   0.543 0.2952
## item_9   0.462 0.2133
## item_10  0.885 0.7839
## item_11  0.389 0.1515
## item_12  0.742 0.5503
## item_13  0.519 0.2693
## item_14  0.736 0.5411
## item_15  0.665 0.4424
## item_16  0.860 0.7403
## item_17  0.322 0.1039
## item_18  0.585 0.3419
## item_19  0.636 0.4047
## item_20  0.450 0.2028
## item_21  0.451 0.2033
## item_22  0.488 0.2386
## item_23  0.392 0.1533
## item_24  0.708 0.5016
## item_25  0.658 0.4327
## item_26  0.690 0.4763
## item_27  0.714 0.5098
## item_28  0.521 0.2714
## item_29  0.492 0.2425
## item_30  0.540 0.2914
## item_31  0.875 0.7659
## item_32  0.711 0.5050
## item_33  0.509 0.2593
## item_34  0.312 0.0974
## item_35  0.836 0.6994
## item_36  0.742 0.5507
## item_37  0.524 0.2742
## item_38  0.735 0.5398
## item_39  0.443 0.1966
## item_40  0.862 0.7432
## item_41  0.841 0.7077
## item_42  0.490 0.2405
## item_43  0.527 0.2775
## item_44  0.425 0.1804
## item_45  0.553 0.3056
## item_46  0.302 0.0911
## item_47  0.329 0.1079
## item_48  0.848 0.7199
## item_49  0.590 0.3478
## item_50  0.721 0.5195
## item_51  0.834 0.6956
## item_52  0.645 0.4161
## item_53  0.469 0.2198
## item_54  0.756 0.5719
## item_55  0.528 0.2793
## item_56  0.654 0.4283
## item_57  0.318 0.1013
## item_58  0.855 0.7316
## item_59  0.396 0.1570
## item_60  0.392 0.1538
## item_61  0.353 0.1247
## item_62  0.757 0.5735
## item_63  0.702 0.4932
## item_64  0.721 0.5199
## item_65  0.607 0.3681
## item_66  0.638 0.4073
## item_67  0.537 0.2879
## item_68  0.605 0.3654
## item_69  0.695 0.4833
## item_70  0.627 0.3936
## item_71  0.684 0.4675
## item_72  0.625 0.3911
## item_73  0.794 0.6305
## item_74  0.430 0.1853
## item_75  0.756 0.5709
## item_76  0.477 0.2275
## item_77  0.669 0.4472
## item_78  0.804 0.6469
## item_79  0.694 0.4810
## item_80  0.704 0.4955
## item_81  0.535 0.2865
## item_82  0.616 0.3790
## item_83  0.378 0.1429
## item_84  0.732 0.5354
## item_85  0.604 0.3651
## item_86  0.616 0.3790
## item_87  0.834 0.6964
## item_88  0.595 0.3546
## item_89  0.840 0.7059
## item_90  0.618 0.3824
## item_91  0.577 0.3331
## item_92  0.556 0.3086
## item_93  0.785 0.6158
## item_94  0.508 0.2580
## item_95  0.814 0.6630
## item_96  0.629 0.3950
## item_97  0.657 0.4313
## item_98  0.861 0.7407
## item_99  0.423 0.1791
## item_100 0.456 0.2075
## item_101 0.934 0.8725
## 
## SS loadings:  40.23 
## Proportion Var:  0.398 
## 
## Factor correlations: 
## 
##    F1
## F1  1
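For reference, conventional discrimination and difficulty parameters of the 2PL fit can also be pulled out directly with mirt's coef() (a quick optional check, not used further below):

#extract a/b item parameters from the 2PL fit
irt_params = coef(irt_fit, simplify = TRUE, IRTpars = TRUE)$items %>%
  as.data.frame() %>%
  rownames_to_column(var = "item")
head(irt_params)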
#refit with guessing fixed at the chance level (1/choose(5,2) = 0.10)
irt_guess_fit = mirt(
  items_TF,
  model = 1,
  verbose = F,
  guess = 0.10,
  technical = list(NCYCLES = 2000)
)

irt_guess_fit
## 
## Call:
## mirt(data = items_TF, model = 1, guess = 0.1, verbose = F, technical = list(NCYCLES = 2000))
## 
## Full-information item factor analysis with 1 factor(s).
## Converged within 1e-04 tolerance after 147 EM iterations.
## mirt version: 1.42 
## M-step optimizer: BFGS 
## EM acceleration: Ramsay 
## Number of rectangular quadrature: 61
## Latent density type: Gaussian 
## 
## Log-likelihood = -26353.14
## Estimated parameters: 202 
## AIC = 53110.28
## BIC = 54006.06; SABIC = 53364.74
## G2 (1e+10) = 44688.84, p = 1
## RMSEA = 0, CFI = NaN, TLI = NaN
irt_guess_fit %>% summary()
##             F1     h2
## item_1   0.329 0.1081
## item_2   0.891 0.7935
## item_3   0.610 0.3718
## item_4   0.678 0.4591
## item_5   0.562 0.3158
## item_6   0.496 0.2461
## item_7   0.863 0.7442
## item_8   0.659 0.4337
## item_9   0.506 0.2557
## item_10  0.876 0.7672
## item_11  0.380 0.1445
## item_12  0.804 0.6458
## item_13  0.548 0.2998
## item_14  0.704 0.4953
## item_15  0.619 0.3833
## item_16  0.870 0.7576
## item_17  0.413 0.1706
## item_18  0.550 0.3020
## item_19  0.629 0.3951
## item_20  0.536 0.2873
## item_21  0.517 0.2672
## item_22  0.479 0.2297
## item_23  0.531 0.2824
## item_24  0.745 0.5550
## item_25  0.683 0.4668
## item_26  0.723 0.5231
## item_27  0.706 0.4978
## item_28  0.705 0.4976
## item_29  0.520 0.2703
## item_30  0.500 0.2502
## item_31  0.866 0.7493
## item_32  0.795 0.6325
## item_33  0.653 0.4259
## item_34  0.304 0.0924
## item_35  0.835 0.6966
## item_36  0.767 0.5888
## item_37  0.566 0.3203
## item_38  0.833 0.6932
## item_39  0.428 0.1831
## item_40  0.842 0.7082
## item_41  0.895 0.8008
## item_42  0.681 0.4643
## item_43  0.648 0.4205
## item_44  0.432 0.1864
## item_45  0.530 0.2810
## item_46  0.314 0.0985
## item_47  0.334 0.1116
## item_48  0.814 0.6620
## item_49  0.570 0.3246
## item_50  0.740 0.5480
## item_51  0.807 0.6510
## item_52  0.662 0.4381
## item_53  0.451 0.2038
## item_54  0.732 0.5354
## item_55  0.565 0.3190
## item_56  0.685 0.4686
## item_57  0.509 0.2591
## item_58  0.822 0.6750
## item_59  0.389 0.1513
## item_60  0.509 0.2593
## item_61  0.377 0.1425
## item_62  0.776 0.6020
## item_63  0.738 0.5452
## item_64  0.818 0.6698
## item_65  0.612 0.3745
## item_66  0.726 0.5278
## item_67  0.672 0.4521
## item_68  0.655 0.4293
## item_69  0.669 0.4472
## item_70  0.689 0.4741
## item_71  0.652 0.4252
## item_72  0.761 0.5797
## item_73  0.799 0.6387
## item_74  0.427 0.1826
## item_75  0.760 0.5775
## item_76  0.561 0.3144
## item_77  0.741 0.5491
## item_78  0.800 0.6408
## item_79  0.719 0.5172
## item_80  0.698 0.4878
## item_81  0.522 0.2730
## item_82  0.741 0.5484
## item_83  0.419 0.1753
## item_84  0.808 0.6527
## item_85  0.668 0.4459
## item_86  0.756 0.5715
## item_87  0.832 0.6925
## item_88  0.574 0.3297
## item_89  0.868 0.7531
## item_90  0.722 0.5216
## item_91  0.584 0.3413
## item_92  0.684 0.4680
## item_93  0.764 0.5831
## item_94  0.475 0.2252
## item_95  0.840 0.7056
## item_96  0.777 0.6041
## item_97  0.694 0.4814
## item_98  0.840 0.7052
## item_99  0.424 0.1801
## item_100 0.770 0.5928
## item_101 0.946 0.8956
## 
## SS loadings:  45.489 
## Proportion Var:  0.45 
## 
## Factor correlations: 
## 
##    F1
## F1  1
irt_guess_fit_stats = kirkegaard::get_mirt_stats(irt_guess_fit)

#add to item stats
full_join(
  item_stats,
  irt_fit@Fit$`F` %>% as.data.frame() %>% rownames_to_column(var = "item") %>% rename(loading = F1)
) -> 
  item_stats
## Joining with `by = join_by(item)`
item_stats
#distribution of factor loadings
GG_denhist(item_stats$loading) +
  scale_x_continuous(breaks = seq(-1, 1, .2))
## Input seems like a fraction, set `boundary=0` and `binwidth=1/30` to avoid issues near the limits. Disable this with `auto_fraction_bounary=F`
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.

item_stats %>% describe2()
#item difficulty (the z-value corresponding to the pass rate) vs. loading
item_stats$difficulty = qnorm(item_stats$pass_rate, lower.tail = F)
item_stats %>% 
  GG_scatter("difficulty", "loading", case_names = "item")
## `geom_smooth()` using formula = 'y ~ x'

item_stats %>% 
  GG_scatter("pass_rate", "loading", case_names = "item") +
  labs(
    x = "Item pass rate",
    y = "Item factor loading"
  )
## `geom_smooth()` using formula = 'y ~ x'

GG_save("figs/item_passrate_loading.png")
## `geom_smooth()` using formula = 'y ~ x'
#score subjects
irt_fit_score = fscores(irt_fit, full.scores.SE = T)
d$vocab_g = irt_fit_score[, 1] %>% standardize()

#irt score dist
GG_denhist(d, "vocab_g") +
  scale_x_continuous("Vocabulary IRT score")
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

GG_save("figs/vocab_g_dist.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#compare distributions of the two scores
d %>% select(vocab_sumscore, vocab_g) %>% 
  describe2()
#reliability
marginal_rxx(irt_fit)
## [1] 0.9461351
marginal_rxx(irt_guess_fit)
## [1] 0.9558691
empirical_rxx(irt_fit_score)
##        F1 
## 0.9471914
get_reliabilities(irt_fit) %>% 
  filter(is_between(z, -4, 4)) %>% 
  ggplot(aes(z, rel)) +
  geom_line() +
  scale_x_continuous(breaks = seq(-4, 4, 1))

#range of latent trait (z) where reliability exceeds 0.90
rel_above_90_range = get_reliabilities(irt_fit) %>% 
  filter(rel > .90) %>% 
  pull(z) %>% 
  range()

rel_above_90_range
## [1] -3.165829  1.175879
mean(is_between(irt_fit_score[, 1], rel_above_90_range[1], rel_above_90_range[2]))
## [1] 0.8988764
#sum score vs. IRT score
GG_scatter(d, "vocab_sumscore", "vocab_g") +
  geom_smooth()
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

#age vs. vocab
GG_scatter(d, "age", "vocab_g") +
  geom_smooth() +
  labs(
    y = "Vocabulary IRT score",
    x = "Age"
  )
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

GG_save("figs/vocab_age.png")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
GG_scatter(d, "age", "vocab_sumscore") +
  geom_smooth()
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

#adjust for age and make norms
norms = make_norms(
  d$vocab_g,
  d$age
)
## Detected linear effect of age on the score (p = <0.001***). Model used.
## No detected variance effect of age on the score (p = 0.233). Model not used.
norms
## $data
## # A tibble: 623 × 7
##       score   age norm_group score_ageadj1 score_ageadj2 score_ageadj3    IQ
##       <dbl> <dbl> <lgl>              <dbl>         <dbl>         <dbl> <dbl>
##  1 -2.00       42 TRUE              -1.56         -1.56         -1.87   71.9
##  2 -0.363      55 TRUE              -0.377        -0.377        -0.452  93.2
##  3 -0.816      28 TRUE               0.110         0.110         0.132 102. 
##  4 -2.83       56 TRUE              -2.88         -2.88         -3.46   48.1
##  5  0.00396    43 TRUE               0.408         0.408         0.490 107. 
##  6  0.117      75 TRUE              -0.592        -0.592        -0.712  89.3
##  7 -0.0435     69 TRUE              -0.544        -0.544        -0.653  90.2
##  8 -0.241      69 TRUE              -0.742        -0.742        -0.891  86.6
##  9 -0.0374     36 TRUE               0.610         0.610         0.733 111. 
## 10 -2.04       21 TRUE              -0.866        -0.866        -1.04   84.4
## # ℹ 613 more rows
## 
## $age_model
## 
## Call:
## lm(formula = score ~ age, data = d_norm)
## 
## Coefficients:
## (Intercept)          age  
##    -1.89984      0.03479  
## 
## 
## $age_model_abs
## 
## Call:
## lm(formula = abs(score_ageadj1) ~ age, data = d_norm)
## 
## Coefficients:
## (Intercept)          age  
##     0.57180      0.00154  
## 
## 
## $age_model_used
## value 
##  TRUE 
## 
## $age_model_abs_used
## value 
## FALSE 
## 
## $norm_desc
## # A tibble: 1 × 10
##   var               n     mean  median    sd   mad   min   max  skew kurtosis
##   <chr>         <dbl>    <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>
## 1 score_ageadj2   623 8.68e-16 -0.0977 0.833 0.783 -2.88  3.17 0.315    0.635
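The IQ column appears to follow the usual construction: residualize the score on age, standardize the residual, and rescale to mean 100 and SD 15. A minimal sketch that approximately reproduces it from the returned object, assuming make_norms works this way (the variance model was not used here):

#sketch: rebuild the IQ column from the norms object (assumes age-residualize -> standardize -> 100 + 15 * z)
iq_sketch = with(norms, {
  ageadj = data$score - unname(predict(age_model, newdata = data))
  100 + 15 * ageadj / sd(ageadj)
})
all.equal(iq_sketch, norms$data$IQ, tolerance = 0.01) #TRUE, or nearly so, if the assumption holds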
#norms based on sum score
norms_sum = make_norms(
  d$vocab_sumscore,
  d$age
)
## Detected linear effect of age on the score (p = <0.001***). Model used.
## Detected variance effect of age on the score (p = <0.001***). Model used.
norms_sum
## $data
## # A tibble: 623 × 7
##    score   age norm_group score_ageadj1 score_ageadj2 score_ageadj3    IQ
##    <dbl> <dbl> <lgl>              <dbl>         <dbl>         <dbl> <dbl>
##  1    23    42 TRUE              -39.9         -2.99         -2.37   64.5
##  2    64    55 TRUE               -6.98        -0.599        -0.471  92.9
##  3    57    28 TRUE                2.82         0.187         0.152 102. 
##  4    10    56 TRUE              -61.6         -5.35         -4.23   36.5
##  5    74    43 TRUE               10.5          0.795         0.635 110. 
##  6    74    75 TRUE               -9.43        -1.04         -0.819  87.7
##  7    72    69 TRUE               -7.69        -0.781        -0.615  90.8
##  8    69    69 TRUE              -10.7         -1.09         -0.856  87.2
##  9    74    36 TRUE               14.8          1.05          0.840 113. 
## 10    27    21 TRUE              -22.8         -1.42         -1.13   83.1
## # ℹ 613 more rows
## 
## $age_model
## 
## Call:
## lm(formula = score ~ age, data = d_norm)
## 
## Coefficients:
## (Intercept)          age  
##     36.7513       0.6223  
## 
## 
## $age_model_abs
## 
## Call:
## lm(formula = abs(score_ageadj1) ~ age, data = d_norm)
## 
## Coefficients:
## (Intercept)          age  
##     18.7244      -0.1286  
## 
## 
## $age_model_used
## value 
##  TRUE 
## 
## $age_model_abs_used
## value 
##  TRUE 
## 
## $norm_desc
## # A tibble: 1 × 10
##   var               n     mean median    sd   mad   min   max   skew kurtosis
##   <chr>         <dbl>    <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>    <dbl>
## 1 score_ageadj2   623 -0.00543  0.133  1.26  1.28 -5.35  2.79 -0.720    0.948
#save full scale norms
write_rds(norms, "data/full_norms_IRT.rds", compress = "xz")
write_rds(norms_sum, "data/full_norms_CTT.rds", compress = "xz")

#store the normed IQ scores in the main data
d$vocab_IQ_orig = norms$data$IQ %>% as.numeric()
d$vocab_IQ_sum_orig = norms_sum$data$IQ %>% as.numeric()

#manually check for heteroscedasticity (HS) of the age-adjusted scores across age
d$vocab_g_age_resid = lm(vocab_g ~ age, data = d) %>% residuals()
test_HS(d$vocab_g_age_resid, d$age)
## number of knots in rcs defaulting to 5
## number of knots in rcs defaulting to 5
#plot IQs
d %>% 
  GG_denhist("vocab_IQ_orig") +
  scale_x_continuous(breaks = seq(0, 200, 10)) +
  labs(
    x = "Vocabulary IQ score"
  )
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

GG_save("figs/vocab_IQ_dist.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
d$vocab_IQ_orig %>% describe2()
#remove implausible values
sum(d$vocab_IQ_orig < 65)
## [1] 5
sum(d$vocab_IQ_orig > 145)
## [1] 3
d$vocab_IQ = if_else(!is_between(d$vocab_IQ_orig, 65, 145), NA_real_, d$vocab_IQ_orig)
d$vocab_IQ_sum = if_else(!is_between(d$vocab_IQ_sum_orig, 65, 145), NA_real_, d$vocab_IQ_sum_orig)

d %>% select(vocab_IQ, vocab_IQ_sum) %>% describe2()
#after filter
d %>% 
  GG_denhist("vocab_IQ") +
  scale_x_continuous(breaks = seq(0, 200, 10))
## Warning: Removed 8 rows containing non-finite outside the scale range (`stat_bin()`).
## Warning: Removed 8 rows containing non-finite outside the scale range (`stat_density()`).
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 8 rows containing non-finite outside the scale range (`stat_bin()`).
## Removed 8 rows containing non-finite outside the scale range (`stat_density()`).

d %>% 
  GG_denhist("vocab_IQ_sum") +
  scale_x_continuous(breaks = seq(0, 200, 10))
## Warning: Removed 13 rows containing non-finite outside the scale range (`stat_bin()`).
## Warning: Removed 13 rows containing non-finite outside the scale range (`stat_density()`).
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 13 rows containing non-finite outside the scale range (`stat_bin()`).
## Removed 13 rows containing non-finite outside the scale range (`stat_density()`).

#correlation between methods
GG_scatter(d, "vocab_IQ", "vocab_IQ_sum")
## `geom_smooth()` using formula = 'y ~ x'

Abbreviation

#filter items by pass rate
items_TF_goodpass = items_TF %>% select(item_stats$item[is_between(item_stats$pass_rate, .05, .95)])
dim(items_TF_goodpass)
## [1] 623  99
#apply the abbreviation algorithms
scale_forwards = cache_object(abbreviate_scale(items_TF_goodpass, item_target = 50, method = "forwards"), filename = "temp/abbrev_forwards.rds")
## Cache found, reading object from disk
scale_backwards = cache_object(abbreviate_scale(items_TF_goodpass, item_target = 5, method = "backwards"), filename = "temp/abbrev_backwards.rds")
## Cache found, reading object from disk
scale_max_basic = cache_object(abbreviate_scale(items_TF_goodpass, item_target = 50, method = "max_loading"), filename = "temp/abbrev_max_basic.rds")
## Cache found, reading object from disk
scale_max_balance = cache_object(abbreviate_scale(items_TF_goodpass, item_target = 50, method = "max_loading", difficulty_balance_groups = 5), filename = "temp/abbrev_max_balance.rds")
## Cache found, reading object from disk
scale_max_resid = cache_object(abbreviate_scale(items_TF_goodpass, item_target = 50, method = "max_loading", residualize_loadings = T), filename = "temp/abbrev_max_resid.rds")
## Cache found, reading object from disk
scale_genetic = cache_object(abbreviate_scale(items_TF_goodpass, item_target = 50, method = "genetic"), filename = "temp/abbrev_genetic.rds")
## Cache found, reading object from disk
#plot results
GG_scale_abbreviation(scale_forwards)

GG_scale_abbreviation(scale_backwards)

GG_scale_abbreviation(scale_max_basic)

GG_scale_abbreviation(scale_max_balance)

GG_scale_abbreviation(scale_max_resid)

GG_scale_abbreviation(scale_genetic)

#all methods together, showing only reliability and the correlation with the full-scale score
bind_rows(
  scale_forwards$best_sets %>% mutate(method = "forwards"),
  scale_backwards$best_sets %>% mutate(method = "backwards") %>% filter(items_in_scale %in% 3:50),
  scale_max_basic$best_sets %>% mutate(method = "max loading"),
  scale_max_balance$best_sets %>% mutate(method = "max loading balance"),
  scale_max_resid$best_sets %>% mutate(method = "max loading resid"),
  scale_genetic$best_sets %>% mutate(method = "genetic")
) %>% 
  #split reliability and full scale cor
  pivot_longer(cols = c(reliability, r_full_score), names_to = "var", values_to = "var_value") %>%
  ggplot(aes(items_in_scale, var_value, color = method)) +
  geom_line() +
  facet_wrap(~var, scales = "free_y") +
  labs(
    y = "Value",
    x = "Number of items in scale"
  )

GG_save("figs/abbrev_results.png")

CTT abbreviation

scale_backwards_CTT = cache_object(abbreviate_scale(items_TF_goodpass, item_target = 5, method = "backwards", IRT = F), filename = "temp/abbrev_backwards_CTT.rds")
## Cache found, reading object from disk
#compare 20 item versions
scale_backwards$best_sets %>% filter(items_in_scale == 20) %>% pull(item_set)
## [[1]]
##  [1] "item_7"   "item_12"  "item_16"  "item_32"  "item_33"  "item_35"  "item_38"  "item_50" 
##  [9] "item_64"  "item_67"  "item_68"  "item_72"  "item_73"  "item_84"  "item_86"  "item_90" 
## [17] "item_92"  "item_95"  "item_96"  "item_101"
scale_backwards_CTT$best_sets %>% filter(items_in_scale == 20) %>% pull(item_set)
## [[1]]
##  [1] "item_7"   "item_12"  "item_16"  "item_31"  "item_32"  "item_35"  "item_38"  "item_40" 
##  [9] "item_41"  "item_50"  "item_64"  "item_73"  "item_75"  "item_77"  "item_84"  "item_86" 
## [17] "item_87"  "item_89"  "item_95"  "item_101"
#count overlap
overlap_metrics(
  scale_backwards$best_sets %>% filter(items_in_scale == 20) %>% pull(item_set) %>% unlist(),
  scale_backwards_CTT$best_sets %>% filter(items_in_scale == 20) %>% pull(item_set) %>% unlist()
)
##               x_count               y_count        xy_union_count xy_intersection_count 
##            20.0000000            20.0000000            27.0000000            13.0000000 
## prop_overlap_of_union          x_in_y_count           x_in_y_prop          y_in_x_count 
##             0.4814815            13.0000000             0.6500000            13.0000000 
##           y_in_x_prop 
##             0.6500000
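To see which Danish words an abbreviated version actually contains, the item numbers can be mapped back to the item table built earlier (a sketch for the 20-item IRT-based set):

#look up the five candidate words of each item in the selected 20-item set
selected_20 = scale_backwards$best_sets %>% filter(items_in_scale == 20) %>% pull(item_set) %>% unlist()
items_words %>% filter(item_number %in% parse_number(selected_20))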

All norms

We need to make norms for the abbreviated IRT and CTT scales and save them to file so that others can reuse them easily.

scale_lengths = c(5, 10, 15, 20, 25, 30, 35, 40, 45, 50)

#IRT scales
norms_IRT = map_dfr(scale_lengths, function(scale_length) {
  #get scale
  i_scale = scale_backwards$best_sets %>% filter(items_in_scale == scale_length)
  
  #make norms
  i_norms = make_norms(
    i_scale[["scores"]][[1]][, 1],
    d$age
  )
  
  #save
  tibble(
    scale_length = scale_length,
    norms = list(i_norms)
  ) 
})
## Detected linear effect of age on the score (p = <0.001***). Model used.
## Detected variance effect of age on the score (p = 0.002**). Model used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## No detected variance effect of age on the score (p = 0.74). Model not used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## No detected variance effect of age on the score (p = 0.189). Model not used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## No detected variance effect of age on the score (p = 0.425). Model not used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## No detected variance effect of age on the score (p = 0.187). Model not used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## No detected variance effect of age on the score (p = 0.221). Model not used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## No detected variance effect of age on the score (p = 0.146). Model not used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## No detected variance effect of age on the score (p = 0.175). Model not used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## No detected variance effect of age on the score (p = 0.12). Model not used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## No detected variance effect of age on the score (p = 0.105). Model not used.
#CTT scales
norms_CTT = map_dfr(scale_lengths, function(scale_length) {
  #get scale
  i_scale = scale_backwards_CTT$best_sets %>% filter(items_in_scale == scale_length)
  
  #make norms
  i_norms = make_norms(
    i_scale[["scores"]][[1]][, 1],
    d$age
  )
  
  #save
  tibble(
    scale_length = scale_length,
    norms = list(i_norms)
  ) 
})
## Detected linear effect of age on the score (p = <0.001***). Model used.
## Detected variance effect of age on the score (p = <0.001***). Model used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## Detected variance effect of age on the score (p = <0.001***). Model used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## Detected variance effect of age on the score (p = <0.001***). Model used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## Detected variance effect of age on the score (p = <0.001***). Model used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## Detected variance effect of age on the score (p = <0.001***). Model used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## Detected variance effect of age on the score (p = <0.001***). Model used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## Detected variance effect of age on the score (p = <0.001***). Model used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## Detected variance effect of age on the score (p = <0.001***). Model used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## Detected variance effect of age on the score (p = <0.001***). Model used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## Detected variance effect of age on the score (p = <0.001***). Model used.
#save for reuse
write_rds(norms_IRT, "data/abbrev_norms_IRT.rds", compress = "xz")
write_rds(norms_CTT, "data/abbrev_norms_CTT.rds", compress = "xz")

Norm example

#worked example for the 20-item test
#example respondent: score 17/20, male, 30 years old
scale20CTT = scale_backwards_CTT$best_sets %>% filter(items_in_scale == 20)

#distribution
tibble(
  sum_score = scale20CTT$scores[[1]][, 1],
  group = "norm"
) %>% bind_rows(
  tibble(
    sum_score = 17,
    group = "new"
    )
) %>% GG_denhist("sum_score", group = "group")
## Warning: Groups with fewer than two data points have been dropped.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Groups with fewer than two data points have been dropped.
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -Inf

#centile
scale20CTT_ecdf = ecdf(scale20CTT$scores[[1]][, 1])
scale20CTT_ecdf(17)
## [1] 0.6067416
#age adjusted
norms_CTT_20 = norms_CTT %>% filter(scale_length == 20) %>% pull(norms) %>% first()

apply_norms(
  score = (0:20) %>% set_names(0:20),
  age = 30,
  prior_norms = norms_CTT_20
)
##         0         1         2         3         4         5         6         7         8 
##  74.50478  77.17116  79.83754  82.50391  85.17029  87.83666  90.50304  93.16942  95.83579 
##         9        10        11        12        13        14        15        16        17 
##  98.50217 101.16854 103.83492 106.50129 109.16767 111.83405 114.50042 117.16680 119.83317 
##        18        19        20 
## 122.49955 125.16593 127.83230
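The same lookup can be done from the saved file alone, which is all that a reuser of the test needs (a sketch, again for an age-30 respondent scoring 17 of 20 on the 20-item CTT version):

#same lookup using only the saved norms file
norms_CTT_saved = read_rds("data/abbrev_norms_CTT.rds")
apply_norms(
  score = 17,
  age = 30,
  prior_norms = norms_CTT_saved %>% filter(scale_length == 20) %>% pull(norms) %>% first()
)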

Validation

#age 35+
d_age35 = d %>% filter(age >= 35)

#education levels
d_age35 %>% 
    GG_group_means("vocab_IQ", "education", type = "point") +
  labs(
    y = "Vocabulary IQ",
    x = "Education level at age 35+"
  )
## Missing values were removed.

GG_save("figs/IQ_education.png")

edu_nom = nnet::multinom(education ~ vocab_IQ + age * sex, data = d_age35)
## # weights:  48 (35 variable)
## initial  value 1110.421783 
## iter  10 value 1011.020529
## iter  20 value 1009.460065
## iter  30 value 934.173854
## iter  40 value 931.756870
## iter  50 value 931.402348
## iter  60 value 931.252229
## final  value 931.252130 
## converged
ggpredict(edu_nom, "vocab_IQ [65:145]") %>% 
  plot()

#reverse direction: predict vocabulary IQ from education
ols(vocab_IQ ~ education, data = d_age35)
## Frequencies of Missing Values Due to Each Variable
##  vocab_IQ education 
##         8         0 
## 
## Linear Regression Model
## 
## ols(formula = vocab_IQ ~ education, data = d_age35)
## 
## 
##                  Model Likelihood    Discrimination    
##                        Ratio Test           Indexes    
## Obs      534    LR chi2     72.58    R2       0.127    
## sigma13.3660    d.f.            7    R2 adj   0.115    
## d.f.     526    Pr(> chi2) 0.0000    g        5.553    
## 
## Residuals
## 
##     Min      1Q  Median      3Q     Max 
## -36.098  -9.557  -1.014   8.296  37.597 
## 
## 
##                               Coef    S.E.   t     Pr(>|t|)
## Intercept                     91.7892 1.9496 47.08 <0.0001 
## education=High school         12.3456 2.7877  4.43 <0.0001 
## education=Vocational           3.1110 2.2659  1.37 0.1703  
## education=Short education      8.1413 2.7026  3.01 0.0027  
## education=Medium education    11.2318 2.2532  4.98 <0.0001 
## education=University bachelor 12.5349 2.8563  4.39 <0.0001 
## education=University master   14.9645 2.5064  5.97 <0.0001 
## education=PhD                 24.4999 6.9616  3.52 0.0005
describe2(d_age35$vocab_IQ, d_age35$education)
## New names:
## • `` -> `...1`
#books
d$books_read %>% describe2()
d %>% 
  GG_scatter("vocab_IQ", "books_read")
## `geom_smooth()` using formula = 'y ~ x'

d$books_read2 = d$books_read %>% winsorise(upper = 100)
d %>% 
  GG_scatter("vocab_IQ", "books_read2")
## `geom_smooth()` using formula = 'y ~ x'

#log
d$books_read_log = log(d$books_read + 1)
d %>% 
  GG_scatter("vocab_IQ", "books_read_log") +
  labs(
    y = "Log (1 + books read last year)",
    x = "Vocabulary IQ"
  )
## `geom_smooth()` using formula = 'y ~ x'

GG_save("figs/IQ_books.png")
## `geom_smooth()` using formula = 'y ~ x'
#attempt to clean and fix income data
d$income = d$Hvor_mange_penge_tjente_du_sidste_år_før_skat_Medtæl_alle_indkomstkilder_fx_sociale_ydelser_og_pension_Bedste_gæt %>% as.numeric()

#remove implausible values above 2 million
d$income = if_else(d$income > 2e6, NA_real_, d$income)

#distribution
GG_denhist(d, "income") +
  scale_x_log10()
## Warning: Removed 4 rows containing non-finite outside the scale range (`stat_bin()`).
## Warning: Removed 4 rows containing non-finite outside the scale range (`stat_density()`).
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 23 rows containing non-finite outside the scale range (`stat_bin()`).
## Warning: Removed 23 rows containing non-finite outside the scale range (`stat_density()`).

GG_scatter(d, "vocab_IQ", "income")
## `geom_smooth()` using formula = 'y ~ x'

#assume that if value is below 1000, it is in thousands
d$income2 = if_else(d$income < 1000, d$income * 1000, d$income)

#plot against score
d %>% 
  GG_scatter("vocab_IQ", "income2")
## `geom_smooth()` using formula = 'y ~ x'

#log income
d$income_log = log(d$income2 + 1)
d %>% 
  GG_scatter("vocab_IQ", "income_log")
## `geom_smooth()` using formula = 'y ~ x'

#regression model
inc_mod = gam(income ~ s(vocab_IQ, by = sex) + s(age, by = sex) + sex, data = d)
inc_mod %>% summary()
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## income ~ s(vocab_IQ, by = sex) + s(age, by = sex) + sex
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   240048      13720  17.496  < 2e-16 ***
## sexMale        87988      19546   4.502 8.11e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                         edf Ref.df     F p-value   
## s(vocab_IQ):sexFemale 1.000  1.000 0.439 0.50767   
## s(vocab_IQ):sexMale   2.060  2.621 3.158 0.04007 * 
## s(age):sexFemale      1.866  2.343 1.958 0.13889   
## s(age):sexMale        5.821  6.978 3.517 0.00111 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0767   Deviance explained = 9.45%
## GCV = 5.742e+10  Scale est. = 5.6222e+10  n = 611
#standardized values
d$income_z = standardize(d$income2)
d$vocab_IQ_z = d$vocab_IQ %>% standardize()
incz_mod = gam(income_z ~ s(vocab_IQ_z, by = sex) + s(age, by = sex) + sex, data = d)
incz_mod %>% summary()
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## income_z ~ s(vocab_IQ_z, by = sex) + s(age, by = sex) + sex
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.15662    0.05171  -3.029  0.00256 ** 
## sexMale      0.26379    0.07366   3.581  0.00037 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                           edf Ref.df     F  p-value    
## s(vocab_IQ_z):sexFemale 1.000  1.000 0.063 0.801920    
## s(vocab_IQ_z):sexMale   2.565  3.255 3.139 0.021515 *  
## s(age):sexFemale        2.039  2.560 2.049 0.148251    
## s(age):sexMale          5.901  7.059 3.887 0.000407 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0734   Deviance explained = 9.24%
## GCV = 0.81565  Scale est. = 0.79762   n = 611
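The sex-specific smooths can be visualized with the same ggeffects approach used for education above (a quick sketch; ggpredict also accepts mgcv fits):

#plot predicted income (z) across vocabulary IQ (z), by sex
ggpredict(incz_mod, terms = c("vocab_IQ_z", "sex")) %>% plot()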
#for those above 35
d_age35 = d %>% filter(age >= 35)
inc35_mod = gam(income_z ~ vocab_IQ_z + s(age, by = sex) + sex, data = d_age35)
inc35_mod %>% summary()
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## income_z ~ vocab_IQ_z + s(age, by = sex) + sex
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.14975    0.05129  -2.919  0.00366 ** 
## vocab_IQ_z   0.10825    0.03542   3.056  0.00235 ** 
## sexMale      0.32638    0.07208   4.528  7.4e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                    edf Ref.df     F p-value   
## s(age):sexFemale 1.911  2.398 2.715 0.06478 . 
## s(age):sexMale   5.980  7.150 2.994 0.00438 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0852   Deviance explained = 10.2%
## GCV = 0.67871  Scale est. = 0.66476   n = 530
#remove incomes that are 0
d$income0 <- ifelse(d$income > 0, d$income, NA)

GG_scatter(d, "vocab_IQ", "income0")
## `geom_smooth()` using formula = 'y ~ x'

#assume that if value is below 1000, it is in thousands
d$income02 = if_else(d$income0 < 1000, d$income0 * 1000, d$income0)

#plot against score
d %>% 
  GG_scatter("vocab_IQ", "income02")
## `geom_smooth()` using formula = 'y ~ x'

#log income
d$income0_log = log(d$income02 + 1)
d %>% 
  GG_scatter("vocab_IQ", "income0_log")
## `geom_smooth()` using formula = 'y ~ x'

#regression model
inc_mod = gam(income0 ~ s(vocab_IQ, by = sex) + s(age, by = sex) + sex, data = d)
inc_mod %>% summary()
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## income0 ~ s(vocab_IQ, by = sex) + s(age, by = sex) + sex
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   249236      13919  17.906  < 2e-16 ***
## sexMale        87266      19778   4.412 1.22e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                         edf Ref.df     F p-value   
## s(vocab_IQ):sexFemale 1.000  1.000 0.425 0.51452   
## s(vocab_IQ):sexMale   2.201  2.798 3.003 0.04507 * 
## s(age):sexFemale      1.812  2.274 1.896 0.15243   
## s(age):sexMale        4.516  5.552 3.900 0.00129 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0733   Deviance explained = 8.98%
## GCV = 5.6892e+10  Scale est. = 5.5784e+10  n = 592
#standardized values
d$income0_z = standardize(d$income02)
d$vocab_IQ_z = d$vocab_IQ %>% standardize()
incz_mod = gam(income0_z ~ s(vocab_IQ_z, by = sex) + s(age, by = sex) + sex, data = d)
incz_mod %>% summary()
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## income0_z ~ s(vocab_IQ_z, by = sex) + s(age, by = sex) + sex
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.15667    0.05239   -2.99 0.002905 ** 
## sexMale      0.26294    0.07448    3.53 0.000448 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                           edf Ref.df     F  p-value    
## s(vocab_IQ_z):sexFemale 1.000  1.000 0.047 0.827793    
## s(vocab_IQ_z):sexMale   2.612  3.312 2.961 0.027494 *  
## s(age):sexFemale        2.041  2.564 2.032 0.158978    
## s(age):sexMale          6.404  7.545 3.794 0.000269 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0757   Deviance explained = 9.62%
## GCV = 0.80838  Scale est. = 0.78919   n = 592
#for those above 35
d_age35 = d %>% filter(age >= 35)
inc35_mod = gam(income0_z ~ vocab_IQ_z + s(age, by = sex) + sex, data = d_age35)
inc35_mod %>% summary()
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## income0_z ~ vocab_IQ_z + s(age, by = sex) + sex
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.15497    0.05152  -3.008  0.00276 ** 
## vocab_IQ_z   0.10665    0.03567   2.990  0.00293 ** 
## sexMale      0.33041    0.07234   4.567 6.22e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                    edf Ref.df     F p-value   
## s(age):sexFemale 2.005  2.516 2.651 0.06682 . 
## s(age):sexMale   5.881  7.051 2.928 0.00512 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0867   Deviance explained = 10.4%
## GCV = 0.66402  Scale est. = 0.64998   n = 515
#sex difference
(inc_sex_desc = describe2(d_age35$income2, d_age35$sex))
## New names:
## • `` -> `...1`
inc_sex_desc$mean[2] / inc_sex_desc$mean[1]
## [1] 1.273635

Check possible causes for the outlier in education (high school category)

#education as a numeric rank (1 = less than high school, ..., 8 = PhD); the ordered factor already encodes this order
d$education_rank = as.numeric(d$education)
d_age35$education_rank = as.numeric(d_age35$education)

#books & age by education level
d %>% group_by(education_rank) %>% summarise(books_read = mean(books_read, na.rm = T), n = n())
d %>% group_by(education_rank) %>% summarise(books_read2 = mean(books_read2, na.rm = T), n = n())
d %>% group_by(education_rank) %>% summarise(age = mean(age, na.rm = T), n = n())
d_age35 %>% group_by(education_rank) %>% summarise(books_read = mean(books_read, na.rm = T), n = n())
d_age35 %>% group_by(education_rank) %>% summarise(books_read2 = mean(books_read2, na.rm = T), n = n())
d_age35 %>% group_by(education_rank) %>% summarise(age = mean(age, na.rm = T), n = n())

Meta

write_sessioninfo()
## R version 4.4.1 (2024-06-14 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=French_France.utf8  LC_CTYPE=French_France.utf8    LC_MONETARY=French_France.utf8
## [4] LC_NUMERIC=C                   LC_TIME=French_France.utf8    
## 
## time zone: Asia/Hong_Kong
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] future_1.33.2         mgcv_1.9-1            nlme_3.1-164          ggeffects_1.7.0      
##  [5] nnet_7.3-19           rms_7.0-0             googlesheets4_1.1.1   haven_2.5.4          
##  [9] mirt_1.42             lattice_0.22-6        readxl_1.4.3          kirkegaard_2024-06-07
## [13] psych_2.4.6.26        assertthat_0.2.1      weights_1.0.4         Hmisc_5.2-2          
## [17] magrittr_2.0.3        lubridate_1.9.3       forcats_1.0.0         stringr_1.5.1        
## [21] dplyr_1.1.4           purrr_1.0.2           readr_2.1.5           tidyr_1.3.1          
## [25] tibble_3.2.1          ggplot2_3.5.1         tidyverse_2.0.0      
## 
## loaded via a namespace (and not attached):
##   [1] rstudioapi_0.17.1    audio_0.1-11         jsonlite_1.8.8       shape_1.4.6.1       
##   [5] datawizard_0.13.0    TH.data_1.1-2        jomo_2.7-6           farver_2.1.2        
##   [9] nloptr_2.1.1         rmarkdown_2.27       ragg_1.3.2           fs_1.6.4            
##  [13] vctrs_0.6.5          minqa_1.2.7          base64enc_0.1-3      htmltools_0.5.8.1   
##  [17] polspline_1.1.25     curl_5.2.1           broom_1.0.6          cellranger_1.1.0    
##  [21] Formula_1.2-5        mitml_0.4-5          dcurver_0.9.2        sass_0.4.9          
##  [25] parallelly_1.37.1    bslib_0.7.0          htmlwidgets_1.6.4    plyr_1.8.9          
##  [29] sandwich_3.1-0       testthat_3.2.1.1     zoo_1.8-12           cachem_1.1.0        
##  [33] lifecycle_1.0.4      iterators_1.0.14     pkgconfig_2.0.3      sjlabelled_1.2.0    
##  [37] Matrix_1.7-0         R6_2.5.1             fastmap_1.2.0        snakecase_0.11.1    
##  [41] digest_0.6.36        colorspace_2.1-0     textshaping_0.4.0    vegan_2.6-6.1       
##  [45] labeling_0.4.3       progressr_0.14.0     fansi_1.0.6          timechange_0.3.0    
##  [49] gdata_3.0.0          httr_1.4.7           compiler_4.4.1       gargle_1.5.2        
##  [53] bit64_4.0.5          withr_3.0.2          htmlTable_2.4.3      backports_1.5.0     
##  [57] highr_0.11           R.utils_2.12.3       pan_1.9              MASS_7.3-60.2       
##  [61] quantreg_5.98        sessioninfo_1.2.2    GPArotation_2024.3-1 gtools_3.9.5        
##  [65] permute_0.9-7        tools_4.4.1          foreign_0.8-86       googledrive_2.1.1   
##  [69] future.apply_1.11.2  R.oo_1.26.0          glue_1.7.0           grid_4.4.1          
##  [73] rsconnect_1.3.1      checkmate_2.3.1      cluster_2.1.6        generics_0.1.3      
##  [77] gtable_0.3.5         RPushbullet_0.3.4    tzdb_0.4.0           R.methodsS3_1.8.2   
##  [81] data.table_1.15.4    hms_1.1.3            Deriv_4.1.3          utf8_1.2.4          
##  [85] foreach_1.5.2        pillar_1.9.0         vroom_1.6.5          splines_4.4.1       
##  [89] bit_4.0.5            survival_3.6-4       SparseM_1.84         tidyselect_1.2.1    
##  [93] pbapply_1.7-2        knitr_1.48           gridExtra_2.3        xfun_0.45           
##  [97] brio_1.1.5           stringi_1.8.4        yaml_2.3.9           boot_1.3-30         
## [101] evaluate_0.24.0      codetools_0.2-20     beepr_2.0            cli_3.6.3           
## [105] rpart_4.1.23         systemfonts_1.1.0    munsell_0.5.1        jquerylib_0.1.4     
## [109] Rcpp_1.0.12          globals_0.16.3       parallel_4.4.1       MatrixModels_0.5-3  
## [113] lme4_1.1-35.5        listenv_0.9.1        glmnet_4.1-8         mvtnorm_1.2-5       
## [117] SimDesign_2.16       scales_1.3.0         crayon_1.5.3         insight_0.20.5      
## [121] rlang_1.1.4          multcomp_1.4-26      mnormt_2.1.1         mice_3.16.0