Init

Sys.setenv(LANG = "en") #set R messages and warnings to English

library(kirkegaard)
load_packages(
  readxl,
  mirt,
  haven,
  googlesheets4,
  rms,
  nnet,
  ggeffects,
  mgcv,
  future
)

theme_set(theme_bw())

#exclusion rules
exclude_failed_attention_checks = T
exclude_below_min = 17.5 #minimum completion time; faster respondents are excluded

options(contrasts = c("contr.treatment", "contr.treatment")) #treatment contrasts for both unordered and ordered factors

#multi-threading
plan(multisession(workers = 4))

Data

Data cleaning and coding

#read
# d = read_spss("data/SpssExport-1645882702/spss.sav")
#only the CSV export has browser data, which we need!
d_raw = read_csv("C:/Users/mh198/OneDrive/Documents/Data/Other data/A new, free to use Danish vocabulary test with norms/data/20250113142029-SurveyExport.csv") %>% 
  df_legalize_names(func = function(x) str_legalize(x, ascii = F)) 
## Rows: 2904 Columns: 585
## ── Column specification ──────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (572): Status, Language, Referer, SessionID, User Agent, IP Address, Country, City, State...
## dbl    (6): Response ID, Longitude, Latitude, Hvor mange bøger læste du sidste år? Inklusiv ly...
## lgl    (5): Contact ID, Legacy Comments, Comments, Tags, End redirect
## dttm   (2): Time Started, Date Submitted
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
d = d_raw %>% 
  #filter to completed responses from the panel (ssisurveys referer) by native Danish speakers
  filter(Status == "Complete", 
         str_detect(Referer, "ssisurveys"),
         Taler_du_dansk_som_modersmål == "Ja")

d_vars = df_var_table(d)

#add own response back (the last row of the raw export)
d = bind_rows(
  d,
  d_raw %>% tail(1)
)

#copy the attention checks
attention_checks = d %>% select(contains("Vælg_batteri_og_musik"), contains("monster_og_terrasse"))

#remove these variables from main
for (v in names(attention_checks)) d[[v]] = NULL

#add attention check pass indicators back (pass = both target words selected)
d$attention_check_1 = (!is.na(attention_checks$batteri_Vælg_batteri_og_musik) & !is.na(attention_checks$musik_Vælg_batteri_og_musik))
d$attention_check_2 = (!is.na(attention_checks$monster_Vælg_monster_og_terrasse) & !is.na(attention_checks$terrasse_Vælg_monster_og_terrasse))
d$attention_checksum = d$attention_check_1 + d$attention_check_2
d$attention_checksum %>% table2()
#manual inspection: count of the four check words left unselected (0 = passed both checks)
d$attention_checksum2 = attention_checks %>% select(batteri_Vælg_batteri_og_musik, musik_Vælg_batteri_og_musik, monster_Vælg_monster_og_terrasse, terrasse_Vælg_monster_og_terrasse) %>% 
  miss_by_case()

#recodings
d %<>% mutate(
  vote_political_party = Hvis_der_var_Folketingsvalg_i_dag_hvem_ville_du_så_stemme_på %>% fct_relevel("S - Socialdemokratiet"),
  age = Hvor_gammel_er_du %>% as.numeric(),
  attention_pass = (attention_checksum2 == 0),
  time_to_complete = (Date_Submitted - Time_Started) %>% as.numeric(),
  time_to_complete_winsor = time_to_complete %>% winsorise(upper = 60),
  time_below_min = time_to_complete_winsor <= exclude_below_min,
  sex = Hvilken_køn_er_du_Biologisk %>% mapvalues(c("Mand", "Kvinde"), c("Male", "Female")) %>% as.factor(),
  native_speaker = Taler_du_dansk_som_modersmål == "Ja",
  education = Hvad_er_din_højeste_fuldførte_uddannelse %>% mapvalues(
    c(
      "Grundskole eller under (op til 10. klasse)",
      "Gymnasial uddannelse (almen/STX, handelshøjskole/HHX, teknisk/HTX, HF)",
      "Erhvervsfaglig uddannelse (fx social- og sundhedshjælper, elektriker, gastronom, murer, EUD)",
      "Kort videregående uddannelse (fx tandplejer, handelsøkonom, farmakonom, laborant og byggetekniker, 2-årig)",
      "Mellemlang videregående uddannelse (fx pædagog, sygeplejerske, skolelærer, maskinmester, diplomingeniør, 3-4-årig)",
      "Universitets bachelor (fx BA i dansk, BsC i fysik, 3-årig)",
      "Universitets kandidat (fx cand.mag i tysk, cand.scient i geologi, 2 år ovenpå bachelor)",
      "Ph.d. og forskeruddannelser"
    ),
    c(
      "Less than high school",
      "High school",
      "Vocational",
      "Short education",
      "Medium education",
      "University bachelor",
      "University master",
      "PhD"
    )
  ) %>% ordered(levels =    c(
      "Less than high school",
      "High school",
      "Vocational",
      "Short education",
      "Medium education",
      "University bachelor",
      "University master",
      "PhD"
    )),
  books_read = Hvor_mange_bøger_læste_du_sidste_år_Inklusiv_lydbøger,
  children = Hvor_mange_biologiske_børn_har_du,
  ideal_fertility = Forestil_dig_din_drømmeverden_Hvor_mange_biologiske_børn_ville_du_have_i_den
)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `age = Hvor_gammel_er_du %>% as.numeric()`.
## Caused by warning in `Hvor_gammel_er_du %>% as.numeric()`:
## ! NAs introduced by coercion
#filter to those who passed the attention checks, did not complete implausibly fast, and are native speakers
d_orig = d
if (exclude_failed_attention_checks) {
  d %<>% filter(attention_checksum2 == 0)
  d %<>% filter(!time_below_min)
  d %<>% filter(native_speaker)
}
d_failed = d_orig %>% filter(attention_checksum2 > 0)

table2(d_orig$attention_checksum2)
#export fails
d_failed %>% select(What_is_your_user_ID, attention_checksum2) %>% write_csv("data/failed_users.csv")

Vocab data

#restructure so we get just 101 item columns
#each item spans 5 answer columns (one per word), so we need to find the sets of 5 columns to merge
vocab_vars = d %>% select(overvåge_Vælg_de_to_ord_med_samme_eller_næsten_samme_betydning:svejfet_Vælg_de_to_ord_med_samme_eller_næsten_samme_betydning) %>% names()
length(vocab_vars)
## [1] 505
#item table: one row per item with its five candidate words
items_words = map_df(split_every_k(vocab_vars, k = 5), function(dd) {
  # browser()
  #split into 5 cols
  tibble(
    item_1 = dd[1] %>% str_replace("_Vælg_de_to_ord_med_samme_eller_næsten_samme_betydning", ""),
    item_2 = dd[2] %>% str_replace("_Vælg_de_to_ord_med_samme_eller_næsten_samme_betydning", ""),
    item_3 = dd[3] %>% str_replace("_Vælg_de_to_ord_med_samme_eller_næsten_samme_betydning", ""),
    item_4 = dd[4] %>% str_replace("_Vælg_de_to_ord_med_samme_eller_næsten_samme_betydning", ""),
    item_5 = dd[5] %>% str_replace("_Vælg_de_to_ord_med_samme_eller_næsten_samme_betydning", ""),
  )
}) %>% mutate(
  item_number = 1:n()
)

#convert to columns with integer sets
item_int_sets = map(split_every_k(vocab_vars, k = 5), function(dd) {
  # browser()
  #subset variables, check which are not missing, concat with comma
  d %>% 
    select(!!unlist(dd)) %>% 
    as.matrix() %>% 
    {
      #missing, matrix format
      miss = is.na(.) %>% `!`() %>% matrix(ncol = ncol(.))
      miss
    } %>% 
    plyr::alply(.margins = 1, .fun = function(x) {
    which(x) %>% str_c(collapse = ",")
  }) %>% 
    unlist() -> 
    items_picked
  
  tibble(
    value = items_picked
  ) %>% set_colnames(unlist(dd))
}) %>% 
  bind_cols() %>% 
  set_colnames(str_glue("item_{1:(length(vocab_vars)/5)}"))
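#each cell now holds the comma-separated positions of the words the respondent picked, e.g. "1,3" if they chose the 1st and 3rd word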

#score them to T/F status using the key
gs4_deauth()
key = read_sheet("https://docs.google.com/spreadsheets/d/1H-4fkRYEVvc3jREOtkyOJCdWqxBQXoXkPpxkwsinIcs/edit#gid=0", skip = 2) %>% 
  df_legalize_names()
## ✔ Reading from "Danish vocab test (2023-06-23)".
## ✔ Range '3:10000000'.
#ensure they are in the same order
assert_that(all(
  key$Item_1 == items_words$item_1
))
## [1] TRUE
key %>% write_csv("data/vocab_items.csv")

#convert the correct answers to their positions (1-5) and build the score key
key %<>% rowwise() %>% 
  mutate(
    item_correct_1_number = which(Item_correct_1 == c(Item_1, Item_2, Item_3, Item_4, Item_5)),
    item_correct_2_number = which(Item_correct_2 == c(Item_1, Item_2, Item_3, Item_4, Item_5)),
    item_correct_scorekey = str_glue("{item_correct_1_number},{item_correct_2_number}")
  )
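#the score key uses the same format, e.g. "2,5" for an item whose correct pair sits in positions 2 and 5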

#score
items_TF = score_items(item_int_sets, key = key$item_correct_scorekey)

#list all words and check that none is reused across items (all counts should be 1)
vocab_vars %>% str_replace("_Vælg_de_to_ord_med_samme_eller_næsten_samme_betydning", "") %>% table2(include_NA = F) %>% pull(Count) %>% table2()

Analysis

Full item set

#pass rate and sample size (n) by item
item_stats = tibble(
  item = names(items_TF),
  pass_rate = items_TF %>% colMeans(na.rm = T),
  n = items_TF %>% plyr::alply(.margins = 2, .fun = function(x) {miss_count(x, reverse = T)})  %>% unlist(),
)

#general stats
item_stats$pass_rate %>% describe2()
item_stats$pass_rate %>% GG_denhist() + scale_x_continuous(breaks = seq(0, 1, .1))
## Input seems like a fraction, set `boundary=0` and `binwidth=1/30` to avoid issues near the limits. Disable this with `auto_fraction_bounary=F`
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.

#person level stats
d$vocab_sumscore = items_TF %>% rowSums(na.rm = T)
d$vocab_sumscore
##   [1]  23  64  57  10  74  74  72  69  74  27  87  74  98  34  89  55  95  39  89  58  83  88  66
##  [24]  77  74  68  48  35  90  75  38  89  75  87  94  84  80  84  73  77  60  99  50  90  80  76
##  [47]  58  54  23  84  75  33  75  58  99  76  90  64  65  84  69  71  89  74  87  92  71  37  81
##  [70]  44  77  88  56  92  80  67  51  77  74  65  90  55  74  59  98  74  86  58  44  81  88  92
##  [93]  81  82  86  82  91  76  66  94  94  85  76  63  95  90  96  75  35  84  83  76  74  69  88
## [116]  83  82  77  62  47  74  89  47  85  55  86  91  82  90  64  29  71  87  97  58  75  96  68
## [139]  11  80  55  83  90  91  90  78  77  53  77  93  78  90  36  88  68  74  82  78  99  85  86
## [162]  67  87  92  91  93  80  60  73  87  73  53  70  67  75  34  41  93  95  75  82  43  87  60
## [185]  62  67  83  65  82  71  81  63  72 101  79  79  34  22  60  88  52  72  90  72  72  75  87
## [208]  51  91  90  91  69  86  83  74  78  72  75  18  75  83  52  71  33  74  36  78  76  67  43
## [231]  81  84  74  59  69  57  78  46  76  43  96  58  80  74  75  89  44  84  86  79  84  64  55
## [254]  70  58  63  26  65  70  75  59  60  39  77  21  46  69  71  56  32  73  55  36  18  90  48
## [277]  71  64  72  91  50  86  72  57  24  82  81  80  43  33  68  90 100  64  63  26  86  64  76
## [300]  53  69  46  31  35  50  41  77  64  61  39  80  76  76  86  67  95  86  69  83  81  70  76
## [323]  49  83  62  51  77  79  87  91  91  78  57  31  98  77  30  61  59  48  71  62  75  63  71
## [346]  73  82  69  62  57  44  69  83  90  76  50  95  82  89  66  75  83  67  93  77  85  88  83
## [369]  52  99  76  72  83  78  61  89  93  96  91  89  95  89  65  79  78  74  37  85  88  51  81
## [392]  90  82  82  88  67  90  91  81  77  38  78  63  88  40  76  68  86  81  86  89  56  11  81
## [415]  63  59  88  77  87  64  75  93  85  78  56  78  90  43  53  86  70  60  86  48  33  77  61
## [438]  88  90  56  64  76  80  72  88  68  84  60  55  65  58  53  62  76  36  51  45  15  84  82
## [461]  81  54  62  46  61  69  70  86  86  88  39  81  61  80  65  54  42  76  75  97  86  68  71
## [484]  72  92  79  43  85  91  90  94  76  68  74  89  68  83  81  68  87  79  48  80  81  91  64
## [507]  83  73  72  79  54  99  86  97  40  75  86  86  74  64  46  75  53  73  47  50  80  66  55
## [530]  62  60  90  57  92  88  63  70  48  70  90  57  81  52  58  59  74  41  58  65  63  76  31
## [553]  72  50  61  73  68  97  83  80  90  81  73  79  69  74  76  55  65  70  87  95  86  54  72
## [576]  85  85  90  80  38  79  92  55  30  93  92  96  69  67  81  75  63  81  72  38  73  83  38
## [599]  81  79  73  55  88  76  52  71  55  64  85  76  39  89  72  54  41  80  41  82  78  48  67
## [622]  85  80
GG_denhist(d, "vocab_sumscore") +
  scale_x_continuous("Vocabulary sum score")
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

GG_save("figs/vocab_sumscore_dist.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#how many hit the ceiling?
sum(d$vocab_sumscore == 101)
## [1] 1
#compare people who failed attention checks
# GG_denhist(d_orig, "vocab_sumscore", group = "attention_checksum2")

d %>% pull(vocab_sumscore) %>% describe2()
#time to complete
d %>% pull(time_to_complete) %>% describe2()
#time use vs. score
d %>% GG_scatter("time_to_complete", "vocab_sumscore")
## `geom_smooth()` using formula = 'y ~ x'

d %>% GG_scatter("time_to_complete_winsor", "vocab_sumscore") +
  geom_smooth()
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

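#dashed line = expected chance score: guessing 2 of 5 words succeeds with probability 1/choose(5,2) = 10%, i.e. about 10 of the 101 items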
d %>% GG_scatter("time_to_complete_winsor", "vocab_sumscore", color = "time_below_min") +
  geom_smooth() +
  geom_hline(yintercept = 101 * 0.1, linetype = "dashed")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

d %>% GG_scatter("time_to_complete_winsor", "vocab_sumscore", color = "attention_pass") +
  geom_smooth()
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

#IRT: unidimensional 2PL model
irt_fit = mirt(
  items_TF,
  model = 1,
  verbose = F,
  technical = list(NCYCLES = 2000),
  SE = T
)

irt_fit
## 
## Call:
## mirt(data = items_TF, model = 1, SE = T, verbose = F, technical = list(NCYCLES = 2000))
## 
## Full-information item factor analysis with 1 factor(s).
## Converged within 1e-04 tolerance after 95 EM iterations.
## mirt version: 1.42 
## M-step optimizer: BFGS 
## EM acceleration: Ramsay 
## Number of rectangular quadrature: 61
## Latent density type: Gaussian 
## 
## Information matrix estimated with method: Oakes
## Second-order test: model is a possible local maximum
## Condition number of information matrix =  114.8989
## 
## Log-likelihood = -26412.21
## Estimated parameters: 202 
## AIC = 53228.43
## BIC = 54124.21; SABIC = 53482.89
## G2 (1e+10) = 44806.98, p = 1
## RMSEA = 0, CFI = NaN, TLI = NaN
irt_fit %>% summary()
##             F1     h2
## item_1   0.346 0.1195
## item_2   0.461 0.2121
## item_3   0.547 0.2988
## item_4   0.715 0.5107
## item_5   0.393 0.1548
## item_6   0.516 0.2661
## item_7   0.794 0.6301
## item_8   0.543 0.2952
## item_9   0.462 0.2133
## item_10  0.885 0.7839
## item_11  0.389 0.1515
## item_12  0.742 0.5503
## item_13  0.519 0.2693
## item_14  0.736 0.5411
## item_15  0.665 0.4424
## item_16  0.860 0.7403
## item_17  0.322 0.1039
## item_18  0.585 0.3419
## item_19  0.636 0.4047
## item_20  0.450 0.2028
## item_21  0.451 0.2033
## item_22  0.488 0.2386
## item_23  0.392 0.1533
## item_24  0.708 0.5016
## item_25  0.658 0.4327
## item_26  0.690 0.4763
## item_27  0.714 0.5098
## item_28  0.521 0.2714
## item_29  0.492 0.2425
## item_30  0.540 0.2914
## item_31  0.875 0.7659
## item_32  0.711 0.5050
## item_33  0.509 0.2593
## item_34  0.312 0.0974
## item_35  0.836 0.6994
## item_36  0.742 0.5507
## item_37  0.524 0.2742
## item_38  0.735 0.5398
## item_39  0.443 0.1966
## item_40  0.862 0.7432
## item_41  0.841 0.7077
## item_42  0.490 0.2405
## item_43  0.527 0.2775
## item_44  0.425 0.1804
## item_45  0.553 0.3056
## item_46  0.302 0.0911
## item_47  0.329 0.1079
## item_48  0.848 0.7199
## item_49  0.590 0.3478
## item_50  0.721 0.5195
## item_51  0.834 0.6956
## item_52  0.645 0.4161
## item_53  0.469 0.2198
## item_54  0.756 0.5719
## item_55  0.528 0.2793
## item_56  0.654 0.4283
## item_57  0.318 0.1013
## item_58  0.855 0.7316
## item_59  0.396 0.1570
## item_60  0.392 0.1538
## item_61  0.353 0.1247
## item_62  0.757 0.5735
## item_63  0.702 0.4932
## item_64  0.721 0.5199
## item_65  0.607 0.3681
## item_66  0.638 0.4073
## item_67  0.537 0.2879
## item_68  0.605 0.3654
## item_69  0.695 0.4833
## item_70  0.627 0.3936
## item_71  0.684 0.4675
## item_72  0.625 0.3911
## item_73  0.794 0.6305
## item_74  0.430 0.1853
## item_75  0.756 0.5709
## item_76  0.477 0.2275
## item_77  0.669 0.4472
## item_78  0.804 0.6469
## item_79  0.694 0.4810
## item_80  0.704 0.4955
## item_81  0.535 0.2865
## item_82  0.616 0.3790
## item_83  0.378 0.1429
## item_84  0.732 0.5354
## item_85  0.604 0.3651
## item_86  0.616 0.3790
## item_87  0.834 0.6964
## item_88  0.595 0.3546
## item_89  0.840 0.7059
## item_90  0.618 0.3824
## item_91  0.577 0.3331
## item_92  0.556 0.3086
## item_93  0.785 0.6158
## item_94  0.508 0.2580
## item_95  0.814 0.6630
## item_96  0.629 0.3950
## item_97  0.657 0.4313
## item_98  0.861 0.7407
## item_99  0.423 0.1791
## item_100 0.456 0.2075
## item_101 0.934 0.8725
## 
## SS loadings:  40.23 
## Proportion Var:  0.398 
## 
## Factor correlations: 
## 
##    F1
## F1  1
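For reference, conventional discrimination and difficulty parameters of the 2PL fit can also be pulled out directly with mirt's coef() (a quick optional check, not used further below):

#extract a/b item parameters from the 2PL fit
irt_params = coef(irt_fit, simplify = TRUE, IRTpars = TRUE)$items %>%
  as.data.frame() %>%
  rownames_to_column(var = "item")
head(irt_params)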
#refit with guessing fixed at the chance level (1/choose(5,2) = 0.10)
irt_guess_fit = mirt(
  items_TF,
  model = 1,
  verbose = F,
  guess = 0.10,
  technical = list(NCYCLES = 2000)
)

irt_guess_fit
## 
## Call:
## mirt(data = items_TF, model = 1, guess = 0.1, verbose = F, technical = list(NCYCLES = 2000))
## 
## Full-information item factor analysis with 1 factor(s).
## Converged within 1e-04 tolerance after 147 EM iterations.
## mirt version: 1.42 
## M-step optimizer: BFGS 
## EM acceleration: Ramsay 
## Number of rectangular quadrature: 61
## Latent density type: Gaussian 
## 
## Log-likelihood = -26353.14
## Estimated parameters: 202 
## AIC = 53110.28
## BIC = 54006.06; SABIC = 53364.74
## G2 (1e+10) = 44688.84, p = 1
## RMSEA = 0, CFI = NaN, TLI = NaN
irt_guess_fit %>% summary()
##             F1     h2
## item_1   0.329 0.1081
## item_2   0.891 0.7935
## item_3   0.610 0.3718
## item_4   0.678 0.4591
## item_5   0.562 0.3158
## item_6   0.496 0.2461
## item_7   0.863 0.7442
## item_8   0.659 0.4337
## item_9   0.506 0.2557
## item_10  0.876 0.7672
## item_11  0.380 0.1445
## item_12  0.804 0.6458
## item_13  0.548 0.2998
## item_14  0.704 0.4953
## item_15  0.619 0.3833
## item_16  0.870 0.7576
## item_17  0.413 0.1706
## item_18  0.550 0.3020
## item_19  0.629 0.3951
## item_20  0.536 0.2873
## item_21  0.517 0.2672
## item_22  0.479 0.2297
## item_23  0.531 0.2824
## item_24  0.745 0.5550
## item_25  0.683 0.4668
## item_26  0.723 0.5231
## item_27  0.706 0.4978
## item_28  0.705 0.4976
## item_29  0.520 0.2703
## item_30  0.500 0.2502
## item_31  0.866 0.7493
## item_32  0.795 0.6325
## item_33  0.653 0.4259
## item_34  0.304 0.0924
## item_35  0.835 0.6966
## item_36  0.767 0.5888
## item_37  0.566 0.3203
## item_38  0.833 0.6932
## item_39  0.428 0.1831
## item_40  0.842 0.7082
## item_41  0.895 0.8008
## item_42  0.681 0.4643
## item_43  0.648 0.4205
## item_44  0.432 0.1864
## item_45  0.530 0.2810
## item_46  0.314 0.0985
## item_47  0.334 0.1116
## item_48  0.814 0.6620
## item_49  0.570 0.3246
## item_50  0.740 0.5480
## item_51  0.807 0.6510
## item_52  0.662 0.4381
## item_53  0.451 0.2038
## item_54  0.732 0.5354
## item_55  0.565 0.3190
## item_56  0.685 0.4686
## item_57  0.509 0.2591
## item_58  0.822 0.6750
## item_59  0.389 0.1513
## item_60  0.509 0.2593
## item_61  0.377 0.1425
## item_62  0.776 0.6020
## item_63  0.738 0.5452
## item_64  0.818 0.6698
## item_65  0.612 0.3745
## item_66  0.726 0.5278
## item_67  0.672 0.4521
## item_68  0.655 0.4293
## item_69  0.669 0.4472
## item_70  0.689 0.4741
## item_71  0.652 0.4252
## item_72  0.761 0.5797
## item_73  0.799 0.6387
## item_74  0.427 0.1826
## item_75  0.760 0.5775
## item_76  0.561 0.3144
## item_77  0.741 0.5491
## item_78  0.800 0.6408
## item_79  0.719 0.5172
## item_80  0.698 0.4878
## item_81  0.522 0.2730
## item_82  0.741 0.5484
## item_83  0.419 0.1753
## item_84  0.808 0.6527
## item_85  0.668 0.4459
## item_86  0.756 0.5715
## item_87  0.832 0.6925
## item_88  0.574 0.3297
## item_89  0.868 0.7531
## item_90  0.722 0.5216
## item_91  0.584 0.3413
## item_92  0.684 0.4680
## item_93  0.764 0.5831
## item_94  0.475 0.2252
## item_95  0.840 0.7056
## item_96  0.777 0.6041
## item_97  0.694 0.4814
## item_98  0.840 0.7052
## item_99  0.424 0.1801
## item_100 0.770 0.5928
## item_101 0.946 0.8956
## 
## SS loadings:  45.489 
## Proportion Var:  0.45 
## 
## Factor correlations: 
## 
##    F1
## F1  1
irt_guess_fit_stats = kirkegaard::get_mirt_stats(irt_guess_fit)

#add to item stats
full_join(
  item_stats,
  irt_fit@Fit$`F` %>% as.data.frame() %>% rownames_to_column(var = "item") %>% rename(loading = F1)
) -> 
  item_stats
## Joining with `by = join_by(item)`
item_stats
#distribution of factor loadings
GG_denhist(item_stats$loading) +
  scale_x_continuous(breaks = seq(-1, 1, .2))
## Input seems like a fraction, set `boundary=0` and `binwidth=1/30` to avoid issues near the limits. Disable this with `auto_fraction_bounary=F`
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.

item_stats %>% describe2()
#item difficulty (the z-value corresponding to the pass rate) vs. loading
item_stats$difficulty = qnorm(item_stats$pass_rate, lower.tail = F)
item_stats %>% 
  GG_scatter("difficulty", "loading", case_names = "item")
## `geom_smooth()` using formula = 'y ~ x'

item_stats %>% 
  GG_scatter("pass_rate", "loading", case_names = "item") +
  labs(
    x = "Item pass rate",
    y = "Item factor loading"
  )
## `geom_smooth()` using formula = 'y ~ x'

GG_save("figs/item_passrate_loading.png")
## `geom_smooth()` using formula = 'y ~ x'
#score subjects
irt_fit_score = fscores(irt_fit, full.scores.SE = T)
d$vocab_g = irt_fit_score[, 1] %>% standardize()

#irt score dist
GG_denhist(d, "vocab_g") +
  scale_x_continuous("Vocabulary IRT score")
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

GG_save("figs/vocab_g_dist.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#compare distributions of the two scores
d %>% select(vocab_sumscore, vocab_g) %>% 
  describe2()
#reliability
marginal_rxx(irt_fit)
## [1] 0.9461351
marginal_rxx(irt_guess_fit)
## [1] 0.9558691
empirical_rxx(irt_fit_score)
##        F1 
## 0.9471914
get_reliabilities(irt_fit) %>% 
  filter(is_between(z, -4, 4)) %>% 
  ggplot(aes(z, rel)) +
  geom_line() +
  scale_x_continuous(breaks = seq(-4, 4, 1))

#range of latent trait (z) where reliability exceeds 0.90
rel_above_90_range = get_reliabilities(irt_fit) %>% 
  filter(rel > .90) %>% 
  pull(z) %>% 
  range()

rel_above_90_range
## [1] -3.165829  1.175879
mean(is_between(irt_fit_score[, 1], rel_above_90_range[1], rel_above_90_range[2]))
## [1] 0.8988764
#sum score vs. IRT score
GG_scatter(d, "vocab_sumscore", "vocab_g") +
  geom_smooth()
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

#age vs. vocab
GG_scatter(d, "age", "vocab_g") +
  geom_smooth() +
  labs(
    y = "Vocabulary IRT score",
    x = "Age"
  )
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

GG_save("figs/vocab_age.png")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
GG_scatter(d, "age", "vocab_sumscore") +
  geom_smooth()
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

#adjust for age and make norms
norms = make_norms(
  d$vocab_g,
  d$age
)
## Detected linear effect of age on the score (p = <0.001***). Model used.
## No detected variance effect of age on the score (p = 0.233). Model not used.
norms
## $data
## # A tibble: 623 × 7
##       score   age norm_group score_ageadj1 score_ageadj2 score_ageadj3    IQ
##       <dbl> <dbl> <lgl>              <dbl>         <dbl>         <dbl> <dbl>
##  1 -2.00       42 TRUE              -1.56         -1.56         -1.87   71.9
##  2 -0.363      55 TRUE              -0.377        -0.377        -0.452  93.2
##  3 -0.816      28 TRUE               0.110         0.110         0.132 102. 
##  4 -2.83       56 TRUE              -2.88         -2.88         -3.46   48.1
##  5  0.00396    43 TRUE               0.408         0.408         0.490 107. 
##  6  0.117      75 TRUE              -0.592        -0.592        -0.712  89.3
##  7 -0.0435     69 TRUE              -0.544        -0.544        -0.653  90.2
##  8 -0.241      69 TRUE              -0.742        -0.742        -0.891  86.6
##  9 -0.0374     36 TRUE               0.610         0.610         0.733 111. 
## 10 -2.04       21 TRUE              -0.866        -0.866        -1.04   84.4
## # ℹ 613 more rows
## 
## $age_model
## 
## Call:
## lm(formula = score ~ age, data = d_norm)
## 
## Coefficients:
## (Intercept)          age  
##    -1.89984      0.03479  
## 
## 
## $age_model_abs
## 
## Call:
## lm(formula = abs(score_ageadj1) ~ age, data = d_norm)
## 
## Coefficients:
## (Intercept)          age  
##     0.57180      0.00154  
## 
## 
## $age_model_used
## value 
##  TRUE 
## 
## $age_model_abs_used
## value 
## FALSE 
## 
## $norm_desc
## # A tibble: 1 × 10
##   var               n     mean  median    sd   mad   min   max  skew kurtosis
##   <chr>         <dbl>    <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>
## 1 score_ageadj2   623 8.68e-16 -0.0977 0.833 0.783 -2.88  3.17 0.315    0.635
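The IQ column appears to follow the usual construction: residualize the score on age, standardize the residual, and rescale to mean 100 and SD 15. A minimal sketch that approximately reproduces it from the returned object, assuming make_norms works this way (the variance model was not used here):

#sketch: rebuild the IQ column from the norms object (assumes age-residualize -> standardize -> 100 + 15 * z)
iq_sketch = with(norms, {
  ageadj = data$score - unname(predict(age_model, newdata = data))
  100 + 15 * ageadj / sd(ageadj)
})
all.equal(iq_sketch, norms$data$IQ, tolerance = 0.01) #TRUE, or nearly so, if the assumption holds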
#norms based on sum score
norms_sum = make_norms(
  d$vocab_sumscore,
  d$age
)
## Detected linear effect of age on the score (p = <0.001***). Model used.
## Detected variance effect of age on the score (p = <0.001***). Model used.
norms_sum
## $data
## # A tibble: 623 × 7
##    score   age norm_group score_ageadj1 score_ageadj2 score_ageadj3    IQ
##    <dbl> <dbl> <lgl>              <dbl>         <dbl>         <dbl> <dbl>
##  1    23    42 TRUE              -39.9         -2.99         -2.37   64.5
##  2    64    55 TRUE               -6.98        -0.599        -0.471  92.9
##  3    57    28 TRUE                2.82         0.187         0.152 102. 
##  4    10    56 TRUE              -61.6         -5.35         -4.23   36.5
##  5    74    43 TRUE               10.5          0.795         0.635 110. 
##  6    74    75 TRUE               -9.43        -1.04         -0.819  87.7
##  7    72    69 TRUE               -7.69        -0.781        -0.615  90.8
##  8    69    69 TRUE              -10.7         -1.09         -0.856  87.2
##  9    74    36 TRUE               14.8          1.05          0.840 113. 
## 10    27    21 TRUE              -22.8         -1.42         -1.13   83.1
## # ℹ 613 more rows
## 
## $age_model
## 
## Call:
## lm(formula = score ~ age, data = d_norm)
## 
## Coefficients:
## (Intercept)          age  
##     36.7513       0.6223  
## 
## 
## $age_model_abs
## 
## Call:
## lm(formula = abs(score_ageadj1) ~ age, data = d_norm)
## 
## Coefficients:
## (Intercept)          age  
##     18.7244      -0.1286  
## 
## 
## $age_model_used
## value 
##  TRUE 
## 
## $age_model_abs_used
## value 
##  TRUE 
## 
## $norm_desc
## # A tibble: 1 × 10
##   var               n     mean median    sd   mad   min   max   skew kurtosis
##   <chr>         <dbl>    <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>    <dbl>
## 1 score_ageadj2   623 -0.00543  0.133  1.26  1.28 -5.35  2.79 -0.720    0.948
#save full scale norms
write_rds(norms, "data/full_norms_IRT.rds", compress = "xz")
write_rds(norms_sum, "data/full_norms_CTT.rds", compress = "xz")

#store the normed IQ scores in the main data
d$vocab_IQ_orig = norms$data$IQ %>% as.numeric()
d$vocab_IQ_sum_orig = norms_sum$data$IQ %>% as.numeric()

#manually check for heteroscedasticity (HS) of the age-adjusted scores across age
d$vocab_g_age_resid = lm(vocab_g ~ age, data = d) %>% residuals()
test_HS(d$vocab_g_age_resid, d$age)
## number of knots in rcs defaulting to 5
## number of knots in rcs defaulting to 5
#plot IQs
d %>% 
  GG_denhist("vocab_IQ_orig") +
  scale_x_continuous(breaks = seq(0, 200, 10)) +
  labs(
    x = "Vocabulary IQ score"
  )
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

GG_save("figs/vocab_IQ_dist.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
d$vocab_IQ_orig %>% describe2()
#remove implausible values
sum(d$vocab_IQ_orig < 65)
## [1] 5
sum(d$vocab_IQ_orig > 145)
## [1] 3
d$vocab_IQ = if_else(!is_between(d$vocab_IQ_orig, 65, 145), NA_real_, d$vocab_IQ_orig)
d$vocab_IQ_sum = if_else(!is_between(d$vocab_IQ_sum_orig, 65, 145), NA_real_, d$vocab_IQ_sum_orig)

d %>% select(vocab_IQ, vocab_IQ_sum) %>% describe2()
#after filter
d %>% 
  GG_denhist("vocab_IQ") +
  scale_x_continuous(breaks = seq(0, 200, 10))
## Warning: Removed 8 rows containing non-finite outside the scale range (`stat_bin()`).
## Warning: Removed 8 rows containing non-finite outside the scale range (`stat_density()`).
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 8 rows containing non-finite outside the scale range (`stat_bin()`).
## Removed 8 rows containing non-finite outside the scale range (`stat_density()`).

d %>% 
  GG_denhist("vocab_IQ_sum") +
  scale_x_continuous(breaks = seq(0, 200, 10))
## Warning: Removed 13 rows containing non-finite outside the scale range (`stat_bin()`).
## Warning: Removed 13 rows containing non-finite outside the scale range (`stat_density()`).
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 13 rows containing non-finite outside the scale range (`stat_bin()`).
## Removed 13 rows containing non-finite outside the scale range (`stat_density()`).

#correlation between methods
GG_scatter(d, "vocab_IQ", "vocab_IQ_sum")
## `geom_smooth()` using formula = 'y ~ x'

Abbreviation

#filter items by pass rate
items_TF_goodpass = items_TF %>% select(item_stats$item[is_between(item_stats$pass_rate, .05, .95)])
dim(items_TF_goodpass)
## [1] 623  99
#apply the abbreviation algorithms
scale_forwards = cache_object(abbreviate_scale(items_TF_goodpass, item_target = 50, method = "forwards"), filename = "temp/abbrev_forwards.rds")
## Cache found, reading object from disk
scale_backwards = cache_object(abbreviate_scale(items_TF_goodpass, item_target = 5, method = "backwards"), filename = "temp/abbrev_backwards.rds")
## Cache found, reading object from disk
scale_max_basic = cache_object(abbreviate_scale(items_TF_goodpass, item_target = 50, method = "max_loading"), filename = "temp/abbrev_max_basic.rds")
## Cache found, reading object from disk
scale_max_balance = cache_object(abbreviate_scale(items_TF_goodpass, item_target = 50, method = "max_loading", difficulty_balance_groups = 5), filename = "temp/abbrev_max_balance.rds")
## Cache found, reading object from disk
scale_max_resid = cache_object(abbreviate_scale(items_TF_goodpass, item_target = 50, method = "max_loading", residualize_loadings = T), filename = "temp/abbrev_max_resid.rds")
## Cache found, reading object from disk
scale_genetic = cache_object(abbreviate_scale(items_TF_goodpass, item_target = 50, method = "genetic"), filename = "temp/abbrev_genetic.rds")
## Cache found, reading object from disk
#plot results
GG_scale_abbreviation(scale_forwards)

GG_scale_abbreviation(scale_backwards)

GG_scale_abbreviation(scale_max_basic)

GG_scale_abbreviation(scale_max_balance)

GG_scale_abbreviation(scale_max_resid)

GG_scale_abbreviation(scale_genetic)

#all methods together, showing only reliability and the correlation with the full-scale score
bind_rows(
  scale_forwards$best_sets %>% mutate(method = "forwards"),
  scale_backwards$best_sets %>% mutate(method = "backwards") %>% filter(items_in_scale %in% 3:50),
  scale_max_basic$best_sets %>% mutate(method = "max loading"),
  scale_max_balance$best_sets %>% mutate(method = "max loading balance"),
  scale_max_resid$best_sets %>% mutate(method = "max loading resid"),
  scale_genetic$best_sets %>% mutate(method = "genetic")
) %>% 
  #split reliability and full scale cor
  pivot_longer(cols = c(reliability, r_full_score), names_to = "var", values_to = "var_value") %>%
  ggplot(aes(items_in_scale, var_value, color = method)) +
  geom_line() +
  facet_wrap(~var, scales = "free_y") +
  labs(
    y = "Value",
    x = "Number of items in scale"
  )

GG_save("figs/abbrev_results.png")

CTT abbreviation

scale_backwards_CTT = cache_object(abbreviate_scale(items_TF_goodpass, item_target = 5, method = "backwards", IRT = F), filename = "temp/abbrev_backwards_CTT.rds")
## Cache found, reading object from disk
#compare 20 item versions
scale_backwards$best_sets %>% filter(items_in_scale == 20) %>% pull(item_set)
## [[1]]
##  [1] "item_7"   "item_12"  "item_16"  "item_32"  "item_33"  "item_35"  "item_38"  "item_50" 
##  [9] "item_64"  "item_67"  "item_68"  "item_72"  "item_73"  "item_84"  "item_86"  "item_90" 
## [17] "item_92"  "item_95"  "item_96"  "item_101"
scale_backwards_CTT$best_sets %>% filter(items_in_scale == 20) %>% pull(item_set)
## [[1]]
##  [1] "item_7"   "item_12"  "item_16"  "item_31"  "item_32"  "item_35"  "item_38"  "item_40" 
##  [9] "item_41"  "item_50"  "item_64"  "item_73"  "item_75"  "item_77"  "item_84"  "item_86" 
## [17] "item_87"  "item_89"  "item_95"  "item_101"
#count overlap
overlap_metrics(
  scale_backwards$best_sets %>% filter(items_in_scale == 20) %>% pull(item_set) %>% unlist(),
  scale_backwards_CTT$best_sets %>% filter(items_in_scale == 20) %>% pull(item_set) %>% unlist()
)
##               x_count               y_count        xy_union_count xy_intersection_count 
##            20.0000000            20.0000000            27.0000000            13.0000000 
## prop_overlap_of_union          x_in_y_count           x_in_y_prop          y_in_x_count 
##             0.4814815            13.0000000             0.6500000            13.0000000 
##           y_in_x_prop 
##             0.6500000
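To see which Danish words an abbreviated version actually contains, the item numbers can be mapped back to the item table built earlier (a sketch for the 20-item IRT-based set):

#look up the five candidate words of each item in the selected 20-item set
selected_20 = scale_backwards$best_sets %>% filter(items_in_scale == 20) %>% pull(item_set) %>% unlist()
items_words %>% filter(item_number %in% parse_number(selected_20))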

All norms

We need to make norms for the abbreviated IRT and CTT scales and save them to file so that others can reuse them easily.

scale_lengths = c(5, 10, 15, 20, 25, 30, 35, 40, 45, 50)

#IRT scales
norms_IRT = map_dfr(scale_lengths, function(scale_length) {
  #get scale
  i_scale = scale_backwards$best_sets %>% filter(items_in_scale == scale_length)
  
  #make norms
  i_norms = make_norms(
    i_scale[["scores"]][[1]][, 1],
    d$age
  )
  
  #save
  tibble(
    scale_length = scale_length,
    norms = list(i_norms)
  ) 
})
## Detected linear effect of age on the score (p = <0.001***). Model used.
## Detected variance effect of age on the score (p = 0.002**). Model used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## No detected variance effect of age on the score (p = 0.74). Model not used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## No detected variance effect of age on the score (p = 0.189). Model not used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## No detected variance effect of age on the score (p = 0.425). Model not used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## No detected variance effect of age on the score (p = 0.187). Model not used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## No detected variance effect of age on the score (p = 0.221). Model not used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## No detected variance effect of age on the score (p = 0.146). Model not used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## No detected variance effect of age on the score (p = 0.175). Model not used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## No detected variance effect of age on the score (p = 0.12). Model not used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## No detected variance effect of age on the score (p = 0.105). Model not used.
#CTT scales
norms_CTT = map_dfr(scale_lengths, function(scale_length) {
  #get scale
  i_scale = scale_backwards_CTT$best_sets %>% filter(items_in_scale == scale_length)
  
  #make norms
  i_norms = make_norms(
    i_scale[["scores"]][[1]][, 1],
    d$age
  )
  
  #save
  tibble(
    scale_length = scale_length,
    norms = list(i_norms)
  ) 
})
## Detected linear effect of age on the score (p = <0.001***). Model used.
## Detected variance effect of age on the score (p = <0.001***). Model used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## Detected variance effect of age on the score (p = <0.001***). Model used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## Detected variance effect of age on the score (p = <0.001***). Model used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## Detected variance effect of age on the score (p = <0.001***). Model used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## Detected variance effect of age on the score (p = <0.001***). Model used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## Detected variance effect of age on the score (p = <0.001***). Model used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## Detected variance effect of age on the score (p = <0.001***). Model used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## Detected variance effect of age on the score (p = <0.001***). Model used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## Detected variance effect of age on the score (p = <0.001***). Model used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## Detected variance effect of age on the score (p = <0.001***). Model used.
#save for reuse
write_rds(norms_IRT, "data/abbrev_norms_IRT.rds", compress = "xz")
write_rds(norms_CTT, "data/abbrev_norms_CTT.rds", compress = "xz")

Norm example

#worked example for the 20-item test
#example respondent: score 17/20, male, 30 years old
scale20CTT = scale_backwards_CTT$best_sets %>% filter(items_in_scale == 20)

#distribution
tibble(
  sum_score = scale20CTT$scores[[1]][, 1],
  group = "norm"
) %>% bind_rows(
  tibble(
    sum_score = 17,
    group = "new"
    )
) %>% GG_denhist("sum_score", group = "group")
## Warning: Groups with fewer than two data points have been dropped.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Groups with fewer than two data points have been dropped.
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -Inf

#centile
scale20CTT_ecdf = ecdf(scale20CTT$scores[[1]][, 1])
scale20CTT_ecdf(17)
## [1] 0.6067416
#age adjusted
norms_CTT_20 = norms_CTT %>% filter(scale_length == 20) %>% pull(norms) %>% first()

apply_norms(
  score = (0:20) %>% set_names(0:20),
  age = 30,
  prior_norms = norms_CTT_20
)
##         0         1         2         3         4         5         6         7         8 
##  74.50478  77.17116  79.83754  82.50391  85.17029  87.83666  90.50304  93.16942  95.83579 
##         9        10        11        12        13        14        15        16        17 
##  98.50217 101.16854 103.83492 106.50129 109.16767 111.83405 114.50042 117.16680 119.83317 
##        18        19        20 
## 122.49955 125.16593 127.83230
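The same lookup can be done from the saved file alone, which is all that a reuser of the test needs (a sketch, again for an age-30 respondent scoring 17 of 20 on the 20-item CTT version):

#same lookup using only the saved norms file
norms_CTT_saved = read_rds("data/abbrev_norms_CTT.rds")
apply_norms(
  score = 17,
  age = 30,
  prior_norms = norms_CTT_saved %>% filter(scale_length == 20) %>% pull(norms) %>% first()
)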

Validation

#age 35+
d_age35 = d %>% filter(age >= 35)

#education levels
d_age35 %>% 
    GG_group_means("vocab_IQ", "education", type = "point") +
  labs(
    y = "Vocabulary IQ",
    x = "Education level at age 35+"
  )
## Missing values were removed.

GG_save("figs/IQ_education.png")

edu_nom = nnet::multinom(education ~ vocab_IQ + age * sex, data = d_age35)
## # weights:  48 (35 variable)
## initial  value 1110.421783 
## iter  10 value 1011.020529
## iter  20 value 1009.460065
## iter  30 value 934.173854
## iter  40 value 931.756870
## iter  50 value 931.402348
## iter  60 value 931.252229
## final  value 931.252130 
## converged
ggpredict(edu_nom, "vocab_IQ [65:145]") %>% 
  plot()

#reverse direction: predict vocabulary IQ from education
ols(vocab_IQ ~ education, data = d_age35)
## Frequencies of Missing Values Due to Each Variable
##  vocab_IQ education 
##         8         0 
## 
## Linear Regression Model
## 
## ols(formula = vocab_IQ ~ education, data = d_age35)
## 
## 
##                  Model Likelihood    Discrimination    
##                        Ratio Test           Indexes    
## Obs      534    LR chi2     72.58    R2       0.127    
## sigma13.3660    d.f.            7    R2 adj   0.115    
## d.f.     526    Pr(> chi2) 0.0000    g        5.553    
## 
## Residuals
## 
##     Min      1Q  Median      3Q     Max 
## -36.098  -9.557  -1.014   8.296  37.597 
## 
## 
##                               Coef    S.E.   t     Pr(>|t|)
## Intercept                     91.7892 1.9496 47.08 <0.0001 
## education=High school         12.3456 2.7877  4.43 <0.0001 
## education=Vocational           3.1110 2.2659  1.37 0.1703  
## education=Short education      8.1413 2.7026  3.01 0.0027  
## education=Medium education    11.2318 2.2532  4.98 <0.0001 
## education=University bachelor 12.5349 2.8563  4.39 <0.0001 
## education=University master   14.9645 2.5064  5.97 <0.0001 
## education=PhD                 24.4999 6.9616  3.52 0.0005
describe2(d_age35$vocab_IQ, d_age35$education)
## New names:
## • `` -> `...1`
#books
d$books_read %>% describe2()
d %>% 
  GG_scatter("vocab_IQ", "books_read")
## `geom_smooth()` using formula = 'y ~ x'

d$books_read2 = d$books_read %>% winsorise(upper = 100)
d %>% 
  GG_scatter("vocab_IQ", "books_read2")
## `geom_smooth()` using formula = 'y ~ x'

#log
d$books_read_log = log(d$books_read + 1)
d %>% 
  GG_scatter("vocab_IQ", "books_read_log") +
  labs(
    y = "Log (1 + books read last year)",
    x = "Vocabulary IQ"
  )
## `geom_smooth()` using formula = 'y ~ x'

GG_save("figs/IQ_books.png")
## `geom_smooth()` using formula = 'y ~ x'
#attempt to clean and fix income data
d$income = d$Hvor_mange_penge_tjente_du_sidste_år_før_skat_Medtæl_alle_indkomstkilder_fx_sociale_ydelser_og_pension_Bedste_gæt %>% as.numeric()

#remove implausible values above 2 million
d$income = if_else(d$income > 2e6, NA_real_, d$income)

#distribution
GG_denhist(d, "income") +
  scale_x_log10()
## Warning: Removed 4 rows containing non-finite outside the scale range (`stat_bin()`).
## Warning: Removed 4 rows containing non-finite outside the scale range (`stat_density()`).
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 23 rows containing non-finite outside the scale range (`stat_bin()`).
## Warning: Removed 23 rows containing non-finite outside the scale range (`stat_density()`).

GG_scatter(d, "vocab_IQ", "income")
## `geom_smooth()` using formula = 'y ~ x'

#assume that if value is below 1000, it is in thousands
d$income2 = if_else(d$income < 1000, d$income * 1000, d$income)

#plot against score
d %>% 
  GG_scatter("vocab_IQ", "income2")
## `geom_smooth()` using formula = 'y ~ x'

#log income
d$income_log = log(d$income2 + 1)
d %>% 
  GG_scatter("vocab_IQ", "income_log")
## `geom_smooth()` using formula = 'y ~ x'

#regression model
inc_mod = gam(income ~ s(vocab_IQ, by = sex) + s(age, by = sex) + sex, data = d)
inc_mod %>% summary()
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## income ~ s(vocab_IQ, by = sex) + s(age, by = sex) + sex
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   240048      13720  17.496  < 2e-16 ***
## sexMale        87988      19546   4.502 8.11e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                         edf Ref.df     F p-value   
## s(vocab_IQ):sexFemale 1.000  1.000 0.439 0.50767   
## s(vocab_IQ):sexMale   2.060  2.621 3.158 0.04007 * 
## s(age):sexFemale      1.866  2.343 1.958 0.13889   
## s(age):sexMale        5.821  6.978 3.517 0.00111 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0767   Deviance explained = 9.45%
## GCV = 5.742e+10  Scale est. = 5.6222e+10  n = 611
#standardized values
d$income_z = standardize(d$income2)
d$vocab_IQ_z = d$vocab_IQ %>% standardize()
incz_mod = gam(income_z ~ s(vocab_IQ_z, by = sex) + s(age, by = sex) + sex, data = d)
incz_mod %>% summary()
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## income_z ~ s(vocab_IQ_z, by = sex) + s(age, by = sex) + sex
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.15662    0.05171  -3.029  0.00256 ** 
## sexMale      0.26379    0.07366   3.581  0.00037 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                           edf Ref.df     F  p-value    
## s(vocab_IQ_z):sexFemale 1.000  1.000 0.063 0.801920    
## s(vocab_IQ_z):sexMale   2.565  3.255 3.139 0.021515 *  
## s(age):sexFemale        2.039  2.560 2.049 0.148251    
## s(age):sexMale          5.901  7.059 3.887 0.000407 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0734   Deviance explained = 9.24%
## GCV = 0.81565  Scale est. = 0.79762   n = 611
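The sex-specific smooths can be visualized with the same ggeffects approach used for education above (a quick sketch; ggpredict also accepts mgcv fits):

#plot predicted income (z) across vocabulary IQ (z), by sex
ggpredict(incz_mod, terms = c("vocab_IQ_z", "sex")) %>% plot()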
#for those above 35
d_age35 = d %>% filter(age >= 35)
inc35_mod = gam(income_z ~ vocab_IQ_z + s(age, by = sex) + sex, data = d_age35)
inc35_mod %>% summary()
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## income_z ~ vocab_IQ_z + s(age, by = sex) + sex
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.14975    0.05129  -2.919  0.00366 ** 
## vocab_IQ_z   0.10825    0.03542   3.056  0.00235 ** 
## sexMale      0.32638    0.07208   4.528  7.4e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                    edf Ref.df     F p-value   
## s(age):sexFemale 1.911  2.398 2.715 0.06478 . 
## s(age):sexMale   5.980  7.150 2.994 0.00438 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0852   Deviance explained = 10.2%
## GCV = 0.67871  Scale est. = 0.66476   n = 530
#remove incomes that are 0
d$income0 <- ifelse(d$income > 0, d$income, NA)

GG_scatter(d, "vocab_IQ", "income0")
## `geom_smooth()` using formula = 'y ~ x'

#assume that if value is below 1000, it is in thousands
d$income02 = if_else(d$income0 < 1000, d$income0 * 1000, d$income0)

#plot against score
d %>% 
  GG_scatter("vocab_IQ", "income02")
## `geom_smooth()` using formula = 'y ~ x'

#log income
d$income0_log = log(d$income02 + 1)
d %>% 
  GG_scatter("vocab_IQ", "income0_log")
## `geom_smooth()` using formula = 'y ~ x'

#regression model
inc_mod = gam(income0 ~ s(vocab_IQ, by = sex) + s(age, by = sex) + sex, data = d)
inc_mod %>% summary()
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## income0 ~ s(vocab_IQ, by = sex) + s(age, by = sex) + sex
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   249236      13919  17.906  < 2e-16 ***
## sexMale        87266      19778   4.412 1.22e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                         edf Ref.df     F p-value   
## s(vocab_IQ):sexFemale 1.000  1.000 0.425 0.51452   
## s(vocab_IQ):sexMale   2.201  2.798 3.003 0.04507 * 
## s(age):sexFemale      1.812  2.274 1.896 0.15243   
## s(age):sexMale        4.516  5.552 3.900 0.00129 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0733   Deviance explained = 8.98%
## GCV = 5.6892e+10  Scale est. = 5.5784e+10  n = 592
#standardized values
d$income0_z = standardize(d$income02)
d$vocab_IQ_z = d$vocab_IQ %>% standardize()
incz_mod = gam(income0_z ~ s(vocab_IQ_z, by = sex) + s(age, by = sex) + sex, data = d)
incz_mod %>% summary()
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## income0_z ~ s(vocab_IQ_z, by = sex) + s(age, by = sex) + sex
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.15667    0.05239   -2.99 0.002905 ** 
## sexMale      0.26294    0.07448    3.53 0.000448 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                           edf Ref.df     F  p-value    
## s(vocab_IQ_z):sexFemale 1.000  1.000 0.047 0.827793    
## s(vocab_IQ_z):sexMale   2.612  3.312 2.961 0.027494 *  
## s(age):sexFemale        2.041  2.564 2.032 0.158978    
## s(age):sexMale          6.404  7.545 3.794 0.000269 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0757   Deviance explained = 9.62%
## GCV = 0.80838  Scale est. = 0.78919   n = 592
#for those above 35
d_age35 = d %>% filter(age >= 35)
inc35_mod = gam(income0_z ~ vocab_IQ_z + s(age, by = sex) + sex, data = d_age35)
inc35_mod %>% summary()
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## income0_z ~ vocab_IQ_z + s(age, by = sex) + sex
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.15497    0.05152  -3.008  0.00276 ** 
## vocab_IQ_z   0.10665    0.03567   2.990  0.00293 ** 
## sexMale      0.33041    0.07234   4.567 6.22e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                    edf Ref.df     F p-value   
## s(age):sexFemale 2.005  2.516 2.651 0.06682 . 
## s(age):sexMale   5.881  7.051 2.928 0.00512 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0867   Deviance explained = 10.4%
## GCV = 0.66402  Scale est. = 0.64998   n = 515
#sex difference
(inc_sex_desc = describe2(d_age35$income2, d_age35$sex))
## New names:
## • `` -> `...1`
inc_sex_desc$mean[2] / inc_sex_desc$mean[1]
## [1] 1.273635

Check possible causes for the outlier in education (high school category)

#education as a numeric rank (1 = less than high school, ..., 8 = PhD); the ordered factor already encodes this order
d$education_rank = as.numeric(d$education)
d_age35$education_rank = as.numeric(d_age35$education)

#books & age by education level
d %>% group_by(education_rank) %>% summarise(books_read = mean(books_read, na.rm = T), n = n())
d %>% group_by(education_rank) %>% summarise(books_read2 = mean(books_read2, na.rm = T), n = n())
d %>% group_by(education_rank) %>% summarise(age = mean(age, na.rm = T), n = n())
d_age35 %>% group_by(education_rank) %>% summarise(books_read = mean(books_read, na.rm = T), n = n())
d_age35 %>% group_by(education_rank) %>% summarise(books_read2 = mean(books_read2, na.rm = T), n = n())
d_age35 %>% group_by(education_rank) %>% summarise(age = mean(age, na.rm = T), n = n())

Meta

write_sessioninfo()
## R version 4.4.1 (2024-06-14 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=French_France.utf8  LC_CTYPE=French_France.utf8    LC_MONETARY=French_France.utf8
## [4] LC_NUMERIC=C                   LC_TIME=French_France.utf8    
## 
## time zone: Asia/Hong_Kong
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] future_1.33.2         mgcv_1.9-1            nlme_3.1-164          ggeffects_1.7.0      
##  [5] nnet_7.3-19           rms_7.0-0             googlesheets4_1.1.1   haven_2.5.4          
##  [9] mirt_1.42             lattice_0.22-6        readxl_1.4.3          kirkegaard_2024-06-07
## [13] psych_2.4.6.26        assertthat_0.2.1      weights_1.0.4         Hmisc_5.2-2          
## [17] magrittr_2.0.3        lubridate_1.9.3       forcats_1.0.0         stringr_1.5.1        
## [21] dplyr_1.1.4           purrr_1.0.2           readr_2.1.5           tidyr_1.3.1          
## [25] tibble_3.2.1          ggplot2_3.5.1         tidyverse_2.0.0      
## 
## loaded via a namespace (and not attached):
##   [1] rstudioapi_0.17.1    audio_0.1-11         jsonlite_1.8.8       shape_1.4.6.1       
##   [5] datawizard_0.13.0    TH.data_1.1-2        jomo_2.7-6           farver_2.1.2        
##   [9] nloptr_2.1.1         rmarkdown_2.27       ragg_1.3.2           fs_1.6.4            
##  [13] vctrs_0.6.5          minqa_1.2.7          base64enc_0.1-3      htmltools_0.5.8.1   
##  [17] polspline_1.1.25     curl_5.2.1           broom_1.0.6          cellranger_1.1.0    
##  [21] Formula_1.2-5        mitml_0.4-5          dcurver_0.9.2        sass_0.4.9          
##  [25] parallelly_1.37.1    bslib_0.7.0          htmlwidgets_1.6.4    plyr_1.8.9          
##  [29] sandwich_3.1-0       testthat_3.2.1.1     zoo_1.8-12           cachem_1.1.0        
##  [33] lifecycle_1.0.4      iterators_1.0.14     pkgconfig_2.0.3      sjlabelled_1.2.0    
##  [37] Matrix_1.7-0         R6_2.5.1             fastmap_1.2.0        snakecase_0.11.1    
##  [41] digest_0.6.36        colorspace_2.1-0     textshaping_0.4.0    vegan_2.6-6.1       
##  [45] labeling_0.4.3       progressr_0.14.0     fansi_1.0.6          timechange_0.3.0    
##  [49] gdata_3.0.0          httr_1.4.7           compiler_4.4.1       gargle_1.5.2        
##  [53] bit64_4.0.5          withr_3.0.2          htmlTable_2.4.3      backports_1.5.0     
##  [57] highr_0.11           R.utils_2.12.3       pan_1.9              MASS_7.3-60.2       
##  [61] quantreg_5.98        sessioninfo_1.2.2    GPArotation_2024.3-1 gtools_3.9.5        
##  [65] permute_0.9-7        tools_4.4.1          foreign_0.8-86       googledrive_2.1.1   
##  [69] future.apply_1.11.2  R.oo_1.26.0          glue_1.7.0           grid_4.4.1          
##  [73] rsconnect_1.3.1      checkmate_2.3.1      cluster_2.1.6        generics_0.1.3      
##  [77] gtable_0.3.5         RPushbullet_0.3.4    tzdb_0.4.0           R.methodsS3_1.8.2   
##  [81] data.table_1.15.4    hms_1.1.3            Deriv_4.1.3          utf8_1.2.4          
##  [85] foreach_1.5.2        pillar_1.9.0         vroom_1.6.5          splines_4.4.1       
##  [89] bit_4.0.5            survival_3.6-4       SparseM_1.84         tidyselect_1.2.1    
##  [93] pbapply_1.7-2        knitr_1.48           gridExtra_2.3        xfun_0.45           
##  [97] brio_1.1.5           stringi_1.8.4        yaml_2.3.9           boot_1.3-30         
## [101] evaluate_0.24.0      codetools_0.2-20     beepr_2.0            cli_3.6.3           
## [105] rpart_4.1.23         systemfonts_1.1.0    munsell_0.5.1        jquerylib_0.1.4     
## [109] Rcpp_1.0.12          globals_0.16.3       parallel_4.4.1       MatrixModels_0.5-3  
## [113] lme4_1.1-35.5        listenv_0.9.1        glmnet_4.1-8         mvtnorm_1.2-5       
## [117] SimDesign_2.16       scales_1.3.0         crayon_1.5.3         insight_0.20.5      
## [121] rlang_1.1.4          multcomp_1.4-26      mnormt_2.1.1         mice_3.16.0