Sys.setenv(LANG = "en") # set R messages to English
library(kirkegaard)
load_packages(
readxl,
mirt,
haven,
googlesheets4,
rms,
nnet,
ggeffects,
mgcv,
future
)
theme_set(theme_bw())
#exclusion rules
exclude_failed_attention_checks = T
exclude_below_min = 17.5 #minimum completion time for inclusion
options(contrasts=c("contr.treatment", "contr.treatment"))
#multi-threading
plan(multisession(workers = 4))
#read
# d = read_spss("data/SpssExport-1645882702/spss.sav")
#only the CSV export has browser data, which we need!
d_raw = read_csv("C:/Users/mh198/OneDrive/Documents/Data/Other data/A new, free to use Danish vocabulary test with norms/data/20250113142029-SurveyExport.csv") %>%
df_legalize_names(func = function(x) str_legalize(x, ascii = F))
## Rows: 2904 Columns: 585
## ── Column specification ──────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr (572): Status, Language, Referer, SessionID, User Agent, IP Address, Country, City, State...
## dbl (6): Response ID, Longitude, Latitude, Hvor mange bøger læste du sidste år? Inklusiv ly...
## lgl (5): Contact ID, Legacy Comments, Comments, Tags, End redirect
## dttm (2): Time Started, Date Submitted
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
d = d_raw %>%
#filter to complete cases
filter(Status == "Complete",
str_detect(Referer, "ssisurveys"),
Taler_du_dansk_som_modersmål == "Ja")
d_vars = df_var_table(d)
#add the author's own response (the last row of the raw data)
d = bind_rows(
d,
d_raw %>% tail(1)
)
#copy the attention checks
attention_checks = d %>% select(contains("Vælg_batteri_og_musik"), contains("monster_og_terrasse"))
#remove these variables from main
for (v in names(attention_checks)) d[[v]] = NULL
#add attention check score back
d$attention_check_1 = (!is.na(attention_checks$batteri_Vælg_batteri_og_musik) & !is.na(attention_checks$musik_Vælg_batteri_og_musik))
d$attention_check_2 = (!is.na(attention_checks$monster_Vælg_monster_og_terrasse) & !is.na(attention_checks$terrasse_Vælg_monster_og_terrasse))
d$attention_checksum = d$attention_check_1 + d$attention_check_2
d$attention_checksum %>% table2()
#manual check: count of missed attention-check words per respondent (0 = all four selected)
d$attention_checksum2 = attention_checks %>% select(batteri_Vælg_batteri_og_musik, musik_Vælg_batteri_og_musik, monster_Vælg_monster_og_terrasse, terrasse_Vælg_monster_og_terrasse) %>%
miss_by_case()
#recodings
d %<>% mutate(
vote_political_party = Hvis_der_var_Folketingsvalg_i_dag_hvem_ville_du_så_stemme_på %>% fct_relevel("S - Socialdemokratiet"),
age = Hvor_gammel_er_du %>% as.numeric(),
attention_pass = (attention_checksum2 == 0),
time_to_complete = (Date_Submitted - Time_Started) %>% as.numeric(),
time_to_complete_winsor = time_to_complete %>% winsorise(upper = 60),
time_below_min = time_to_complete_winsor <= exclude_below_min,
sex = Hvilken_køn_er_du_Biologisk %>% mapvalues(c("Mand", "Kvinde"), c("Male", "Female")) %>% as.factor(),
native_speaker = Taler_du_dansk_som_modersmål == "Ja",
education = Hvad_er_din_højeste_fuldførte_uddannelse %>% mapvalues(
c(
"Grundskole eller under (op til 10. klasse)",
"Gymnasial uddannelse (almen/STX, handelshøjskole/HHX, teknisk/HTX, HF)",
"Erhvervsfaglig uddannelse (fx social- og sundhedshjælper, elektriker, gastronom, murer, EUD)",
"Kort videregående uddannelse (fx tandplejer, handelsøkonom, farmakonom, laborant og byggetekniker, 2-årig)",
"Mellemlang videregående uddannelse (fx pædagog, sygeplejerske, skolelærer, maskinmester, diplomingeniør, 3-4-årig)",
"Universitets bachelor (fx BA i dansk, BsC i fysik, 3-årig)",
"Universitets kandidat (fx cand.mag i tysk, cand.scient i geologi, 2 år ovenpå bachelor)",
"Ph.d. og forskeruddannelser"
),
c(
"Less than high school",
"High school",
"Vocational",
"Short education",
"Medium education",
"University bachelor",
"University master",
"PhD"
)
) %>% ordered(levels = c(
"Less than high school",
"High school",
"Vocational",
"Short education",
"Medium education",
"University bachelor",
"University master",
"PhD"
)),
books_read = Hvor_mange_bøger_læste_du_sidste_år_Inklusiv_lydbøger,
children = Hvor_mange_biologiske_børn_har_du,
ideal_fertility = Forestil_dig_din_drømmeverden_Hvor_mange_biologiske_børn_ville_du_have_i_den
)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `age = Hvor_gammel_er_du %>% as.numeric()`.
## Caused by warning in `Hvor_gammel_er_du %>% as.numeric()`:
## ! NAs introduced by coercion
#filter to attention-check passers, minimum completion time, and native speakers
d_orig = d
if (exclude_failed_attention_checks) {
d %<>% filter(attention_checksum2 == 0)
d %<>% filter(!time_below_min)
d %<>% filter(native_speaker)
}
d_failed = d_orig %>% filter(attention_checksum2 > 0)
table2(d_orig$attention_checksum2)
#export fails
d_failed %>% select(What_is_your_user_ID, attention_checksum2) %>% write_csv("data/failed_users.csv")
#restructure so we get just 101 columns
#each item is spread over 5 columns (one per response option), so we need to find the sets of 5 columns to merge
vocab_vars = d %>% select(overvåge_Vælg_de_to_ord_med_samme_eller_næsten_samme_betydning:svejfet_Vælg_de_to_ord_med_samme_eller_næsten_samme_betydning) %>% names()
length(vocab_vars)
## [1] 505
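As a sanity check of that structure (505 response columns = 101 items × 5 consecutive option columns), a minimal base R sketch of the grouping that split_every_k (presumably from the kirkegaard package) performs below:
vocab_groups = split(vocab_vars, ceiling(seq_along(vocab_vars) / 5))
length(vocab_groups) #should be 101
unique(lengths(vocab_groups)) #each group should contain exactly 5 columns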
#item list
items_words = map_df(split_every_k(vocab_vars, k = 5), function(dd) {
# browser()
#the five words of this item, one column each, with the question suffix stripped
tibble(
item_1 = dd[1] %>% str_replace("_Vælg_de_to_ord_med_samme_eller_næsten_samme_betydning", ""),
item_2 = dd[2] %>% str_replace("_Vælg_de_to_ord_med_samme_eller_næsten_samme_betydning", ""),
item_3 = dd[3] %>% str_replace("_Vælg_de_to_ord_med_samme_eller_næsten_samme_betydning", ""),
item_4 = dd[4] %>% str_replace("_Vælg_de_to_ord_med_samme_eller_næsten_samme_betydning", ""),
item_5 = dd[5] %>% str_replace("_Vælg_de_to_ord_med_samme_eller_næsten_samme_betydning", ""),
)
}) %>% mutate(
item_number = 1:n()
)
#convert to columns with integer sets
item_int_sets = map(split_every_k(vocab_vars, k = 5), function(dd) {
# browser()
#subset variables, check which are not missing, concat with comma
d %>%
select(!!unlist(dd)) %>%
as.matrix() %>%
{
#TRUE where the option was selected (i.e., not missing), matrix format
picked = is.na(.) %>% `!`() %>% matrix(ncol = ncol(.))
picked
} %>%
plyr::alply(.margins = 1, .fun = function(x) {
which(x) %>% str_c(collapse = ",")
}) %>%
unlist() ->
items_picked
tibble(
value = items_picked
) %>% set_colnames(unlist(dd))
}) %>%
bind_cols() %>%
set_colnames(str_glue("item_{1:(length(vocab_vars)/5)}"))
#score them to T/F status using the key
gs4_deauth()
key = read_sheet("https://docs.google.com/spreadsheets/d/1H-4fkRYEVvc3jREOtkyOJCdWqxBQXoXkPpxkwsinIcs/edit#gid=0", skip = 2) %>%
df_legalize_names()
## ✔ Reading from "Danish vocab test (2023-06-23)".
## ✔ Range '3:10000000'.
#ensure they are in the same order
assert_that(all(
key$Item_1 == items_words$item_1
))
## [1] TRUE
key %>% write_csv("data/vocab_items.csv")
#convert the correct answers to option indices and build the score key
key %<>% rowwise() %>%
mutate(
item_correct_1_number = which(Item_correct_1 == c(Item_1, Item_2, Item_3, Item_4, Item_5)),
item_correct_2_number = which(Item_correct_2 == c(Item_1, Item_2, Item_3, Item_4, Item_5)),
item_correct_scorekey = str_glue("{item_correct_1_number},{item_correct_2_number}")
)
#score
items_TF = score_items(item_int_sets, key = key$item_correct_scorekey)
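To make the scoring transparent: for each item the picked option indices are stored as a comma-separated string (built with which(), so in ascending order), and score_items() (from the kirkegaard package) presumably marks the item correct when the picked set equals the keyed set. A quick look at the raw pieces for the first item:
#what respondents picked (option indices, comma-separated)
item_int_sets$item_1 %>% head()
#the keyed correct pair for item 1
key$item_correct_scorekey[1]
#the scored result
items_TF$item_1 %>% head()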
#all words: tabulate how often each word occurs (duplicate check)
vocab_vars %>% str_replace("_Vælg_de_to_ord_med_samme_eller_næsten_samme_betydning", "") %>% table2(include_NA = F) %>% pull(Count) %>% table2()
#pass_rate by item
item_stats = tibble(
item = names(items_TF),
pass_rate = items_TF %>% colMeans(na.rm = T),
n = items_TF %>% plyr::alply(.margins = 2, .fun = function(x) {miss_count(x, reverse = T)}) %>% unlist(),
)
#general stats
item_stats$pass_rate %>% describe2()
item_stats$pass_rate %>% GG_denhist() + scale_x_continuous(breaks = seq(0, 1, .1))
## Input seems like a fraction, set `boundary=0` and `binwidth=1/30` to avoid issues near the limits. Disable this with `auto_fraction_bounary=F`
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
#person level stats
d$vocab_sumscore = items_TF %>% rowSums(na.rm = T)
d$vocab_sumscore
## [1] 23 64 57 10 74 74 72 69 74 27 87 74 98 34 89 55 95 39 89 58 83 88 66
## [24] 77 74 68 48 35 90 75 38 89 75 87 94 84 80 84 73 77 60 99 50 90 80 76
## [47] 58 54 23 84 75 33 75 58 99 76 90 64 65 84 69 71 89 74 87 92 71 37 81
## [70] 44 77 88 56 92 80 67 51 77 74 65 90 55 74 59 98 74 86 58 44 81 88 92
## [93] 81 82 86 82 91 76 66 94 94 85 76 63 95 90 96 75 35 84 83 76 74 69 88
## [116] 83 82 77 62 47 74 89 47 85 55 86 91 82 90 64 29 71 87 97 58 75 96 68
## [139] 11 80 55 83 90 91 90 78 77 53 77 93 78 90 36 88 68 74 82 78 99 85 86
## [162] 67 87 92 91 93 80 60 73 87 73 53 70 67 75 34 41 93 95 75 82 43 87 60
## [185] 62 67 83 65 82 71 81 63 72 101 79 79 34 22 60 88 52 72 90 72 72 75 87
## [208] 51 91 90 91 69 86 83 74 78 72 75 18 75 83 52 71 33 74 36 78 76 67 43
## [231] 81 84 74 59 69 57 78 46 76 43 96 58 80 74 75 89 44 84 86 79 84 64 55
## [254] 70 58 63 26 65 70 75 59 60 39 77 21 46 69 71 56 32 73 55 36 18 90 48
## [277] 71 64 72 91 50 86 72 57 24 82 81 80 43 33 68 90 100 64 63 26 86 64 76
## [300] 53 69 46 31 35 50 41 77 64 61 39 80 76 76 86 67 95 86 69 83 81 70 76
## [323] 49 83 62 51 77 79 87 91 91 78 57 31 98 77 30 61 59 48 71 62 75 63 71
## [346] 73 82 69 62 57 44 69 83 90 76 50 95 82 89 66 75 83 67 93 77 85 88 83
## [369] 52 99 76 72 83 78 61 89 93 96 91 89 95 89 65 79 78 74 37 85 88 51 81
## [392] 90 82 82 88 67 90 91 81 77 38 78 63 88 40 76 68 86 81 86 89 56 11 81
## [415] 63 59 88 77 87 64 75 93 85 78 56 78 90 43 53 86 70 60 86 48 33 77 61
## [438] 88 90 56 64 76 80 72 88 68 84 60 55 65 58 53 62 76 36 51 45 15 84 82
## [461] 81 54 62 46 61 69 70 86 86 88 39 81 61 80 65 54 42 76 75 97 86 68 71
## [484] 72 92 79 43 85 91 90 94 76 68 74 89 68 83 81 68 87 79 48 80 81 91 64
## [507] 83 73 72 79 54 99 86 97 40 75 86 86 74 64 46 75 53 73 47 50 80 66 55
## [530] 62 60 90 57 92 88 63 70 48 70 90 57 81 52 58 59 74 41 58 65 63 76 31
## [553] 72 50 61 73 68 97 83 80 90 81 73 79 69 74 76 55 65 70 87 95 86 54 72
## [576] 85 85 90 80 38 79 92 55 30 93 92 96 69 67 81 75 63 81 72 38 73 83 38
## [599] 81 79 73 55 88 76 52 71 55 64 85 76 39 89 72 54 41 80 41 82 78 48 67
## [622] 85 80
GG_denhist(d, "vocab_sumscore") +
scale_x_continuous("Vocabulary sum score")
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
GG_save("figs/vocab_sumscore_dist.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#how many hit the ceiling?
sum(d$vocab_sumscore == 101)
## [1] 1
#compare people who failed attention checks
# GG_denhist(d_orig, "vocab_sumscore", group = "attention_checksum2")
d %>% pull(vocab_sumscore) %>% describe2()
#time to complete
d %>% pull(time_to_complete) %>% describe2()
#time use vs. score
d %>% GG_scatter("time_to_complete", "vocab_sumscore")
## `geom_smooth()` using formula = 'y ~ x'
d %>% GG_scatter("time_to_complete_winsor", "vocab_sumscore") +
geom_smooth()
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
d %>% GG_scatter("time_to_complete_winsor", "vocab_sumscore", color = "time_below_min") +
geom_smooth() +
geom_hline(yintercept = 101 * 0.1, linetype = "dashed")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
d %>% GG_scatter("time_to_complete_winsor", "vocab_sumscore", color = "attention_pass") +
geom_smooth()
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
#IRT
irt_fit = mirt(
items_TF,
model = 1,
verbose = F,
technical = list(NCYCLES = 2000),
SE = T
)
irt_fit
##
## Call:
## mirt(data = items_TF, model = 1, SE = T, verbose = F, technical = list(NCYCLES = 2000))
##
## Full-information item factor analysis with 1 factor(s).
## Converged within 1e-04 tolerance after 95 EM iterations.
## mirt version: 1.42
## M-step optimizer: BFGS
## EM acceleration: Ramsay
## Number of rectangular quadrature: 61
## Latent density type: Gaussian
##
## Information matrix estimated with method: Oakes
## Second-order test: model is a possible local maximum
## Condition number of information matrix = 114.8989
##
## Log-likelihood = -26412.21
## Estimated parameters: 202
## AIC = 53228.43
## BIC = 54124.21; SABIC = 53482.89
## G2 (1e+10) = 44806.98, p = 1
## RMSEA = 0, CFI = NaN, TLI = NaN
irt_fit %>% summary()
## F1 h2
## item_1 0.346 0.1195
## item_2 0.461 0.2121
## item_3 0.547 0.2988
## item_4 0.715 0.5107
## item_5 0.393 0.1548
## item_6 0.516 0.2661
## item_7 0.794 0.6301
## item_8 0.543 0.2952
## item_9 0.462 0.2133
## item_10 0.885 0.7839
## item_11 0.389 0.1515
## item_12 0.742 0.5503
## item_13 0.519 0.2693
## item_14 0.736 0.5411
## item_15 0.665 0.4424
## item_16 0.860 0.7403
## item_17 0.322 0.1039
## item_18 0.585 0.3419
## item_19 0.636 0.4047
## item_20 0.450 0.2028
## item_21 0.451 0.2033
## item_22 0.488 0.2386
## item_23 0.392 0.1533
## item_24 0.708 0.5016
## item_25 0.658 0.4327
## item_26 0.690 0.4763
## item_27 0.714 0.5098
## item_28 0.521 0.2714
## item_29 0.492 0.2425
## item_30 0.540 0.2914
## item_31 0.875 0.7659
## item_32 0.711 0.5050
## item_33 0.509 0.2593
## item_34 0.312 0.0974
## item_35 0.836 0.6994
## item_36 0.742 0.5507
## item_37 0.524 0.2742
## item_38 0.735 0.5398
## item_39 0.443 0.1966
## item_40 0.862 0.7432
## item_41 0.841 0.7077
## item_42 0.490 0.2405
## item_43 0.527 0.2775
## item_44 0.425 0.1804
## item_45 0.553 0.3056
## item_46 0.302 0.0911
## item_47 0.329 0.1079
## item_48 0.848 0.7199
## item_49 0.590 0.3478
## item_50 0.721 0.5195
## item_51 0.834 0.6956
## item_52 0.645 0.4161
## item_53 0.469 0.2198
## item_54 0.756 0.5719
## item_55 0.528 0.2793
## item_56 0.654 0.4283
## item_57 0.318 0.1013
## item_58 0.855 0.7316
## item_59 0.396 0.1570
## item_60 0.392 0.1538
## item_61 0.353 0.1247
## item_62 0.757 0.5735
## item_63 0.702 0.4932
## item_64 0.721 0.5199
## item_65 0.607 0.3681
## item_66 0.638 0.4073
## item_67 0.537 0.2879
## item_68 0.605 0.3654
## item_69 0.695 0.4833
## item_70 0.627 0.3936
## item_71 0.684 0.4675
## item_72 0.625 0.3911
## item_73 0.794 0.6305
## item_74 0.430 0.1853
## item_75 0.756 0.5709
## item_76 0.477 0.2275
## item_77 0.669 0.4472
## item_78 0.804 0.6469
## item_79 0.694 0.4810
## item_80 0.704 0.4955
## item_81 0.535 0.2865
## item_82 0.616 0.3790
## item_83 0.378 0.1429
## item_84 0.732 0.5354
## item_85 0.604 0.3651
## item_86 0.616 0.3790
## item_87 0.834 0.6964
## item_88 0.595 0.3546
## item_89 0.840 0.7059
## item_90 0.618 0.3824
## item_91 0.577 0.3331
## item_92 0.556 0.3086
## item_93 0.785 0.6158
## item_94 0.508 0.2580
## item_95 0.814 0.6630
## item_96 0.629 0.3950
## item_97 0.657 0.4313
## item_98 0.861 0.7407
## item_99 0.423 0.1791
## item_100 0.456 0.2075
## item_101 0.934 0.8725
##
## SS loadings: 40.23
## Proportion Var: 0.398
##
## Factor correlations:
##
## F1
## F1 1
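For reference, the conventional IRT item parameters (discrimination, difficulty, guessing bounds) can be extracted from the fitted mirt model; a short sketch showing only the first items:
irt_pars = coef(irt_fit, simplify = TRUE, IRTpars = TRUE)$items
irt_pars %>% head()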
#with fixed guessing
irt_guess_fit = mirt(
items_TF,
model = 1,
verbose = F,
guess = 0.10,
technical = list(NCYCLES = 2000)
)
irt_guess_fit
##
## Call:
## mirt(data = items_TF, model = 1, guess = 0.1, verbose = F, technical = list(NCYCLES = 2000))
##
## Full-information item factor analysis with 1 factor(s).
## Converged within 1e-04 tolerance after 147 EM iterations.
## mirt version: 1.42
## M-step optimizer: BFGS
## EM acceleration: Ramsay
## Number of rectangular quadrature: 61
## Latent density type: Gaussian
##
## Log-likelihood = -26353.14
## Estimated parameters: 202
## AIC = 53110.28
## BIC = 54006.06; SABIC = 53364.74
## G2 (1e+10) = 44688.84, p = 1
## RMSEA = 0, CFI = NaN, TLI = NaN
irt_guess_fit %>% summary()
## F1 h2
## item_1 0.329 0.1081
## item_2 0.891 0.7935
## item_3 0.610 0.3718
## item_4 0.678 0.4591
## item_5 0.562 0.3158
## item_6 0.496 0.2461
## item_7 0.863 0.7442
## item_8 0.659 0.4337
## item_9 0.506 0.2557
## item_10 0.876 0.7672
## item_11 0.380 0.1445
## item_12 0.804 0.6458
## item_13 0.548 0.2998
## item_14 0.704 0.4953
## item_15 0.619 0.3833
## item_16 0.870 0.7576
## item_17 0.413 0.1706
## item_18 0.550 0.3020
## item_19 0.629 0.3951
## item_20 0.536 0.2873
## item_21 0.517 0.2672
## item_22 0.479 0.2297
## item_23 0.531 0.2824
## item_24 0.745 0.5550
## item_25 0.683 0.4668
## item_26 0.723 0.5231
## item_27 0.706 0.4978
## item_28 0.705 0.4976
## item_29 0.520 0.2703
## item_30 0.500 0.2502
## item_31 0.866 0.7493
## item_32 0.795 0.6325
## item_33 0.653 0.4259
## item_34 0.304 0.0924
## item_35 0.835 0.6966
## item_36 0.767 0.5888
## item_37 0.566 0.3203
## item_38 0.833 0.6932
## item_39 0.428 0.1831
## item_40 0.842 0.7082
## item_41 0.895 0.8008
## item_42 0.681 0.4643
## item_43 0.648 0.4205
## item_44 0.432 0.1864
## item_45 0.530 0.2810
## item_46 0.314 0.0985
## item_47 0.334 0.1116
## item_48 0.814 0.6620
## item_49 0.570 0.3246
## item_50 0.740 0.5480
## item_51 0.807 0.6510
## item_52 0.662 0.4381
## item_53 0.451 0.2038
## item_54 0.732 0.5354
## item_55 0.565 0.3190
## item_56 0.685 0.4686
## item_57 0.509 0.2591
## item_58 0.822 0.6750
## item_59 0.389 0.1513
## item_60 0.509 0.2593
## item_61 0.377 0.1425
## item_62 0.776 0.6020
## item_63 0.738 0.5452
## item_64 0.818 0.6698
## item_65 0.612 0.3745
## item_66 0.726 0.5278
## item_67 0.672 0.4521
## item_68 0.655 0.4293
## item_69 0.669 0.4472
## item_70 0.689 0.4741
## item_71 0.652 0.4252
## item_72 0.761 0.5797
## item_73 0.799 0.6387
## item_74 0.427 0.1826
## item_75 0.760 0.5775
## item_76 0.561 0.3144
## item_77 0.741 0.5491
## item_78 0.800 0.6408
## item_79 0.719 0.5172
## item_80 0.698 0.4878
## item_81 0.522 0.2730
## item_82 0.741 0.5484
## item_83 0.419 0.1753
## item_84 0.808 0.6527
## item_85 0.668 0.4459
## item_86 0.756 0.5715
## item_87 0.832 0.6925
## item_88 0.574 0.3297
## item_89 0.868 0.7531
## item_90 0.722 0.5216
## item_91 0.584 0.3413
## item_92 0.684 0.4680
## item_93 0.764 0.5831
## item_94 0.475 0.2252
## item_95 0.840 0.7056
## item_96 0.777 0.6041
## item_97 0.694 0.4814
## item_98 0.840 0.7052
## item_99 0.424 0.1801
## item_100 0.770 0.5928
## item_101 0.946 0.8956
##
## SS loadings: 45.489
## Proportion Var: 0.45
##
## Factor correlations:
##
## F1
## F1 1
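Judging by the information criteria printed above (AIC 53110 vs. 53228, BIC 54006 vs. 54124), the fixed-guessing model fits somewhat better. mirt's anova() method tabulates such a comparison directly; since both models estimate the same number of parameters, only the information criteria are informative here. A sketch:
anova(irt_fit, irt_guess_fit)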
irt_guess_fit_stats = kirkegaard::get_mirt_stats(irt_guess_fit)
#add to item stats
full_join(
item_stats,
irt_fit@Fit$`F` %>% as.data.frame() %>% rownames_to_column(var = "item") %>% rename(loading = F1)
) ->
item_stats
## Joining with `by = join_by(item)`
item_stats
#distribution of factor loadings
GG_denhist(item_stats$loading) +
scale_x_continuous(breaks = seq(-1, 1, .2))
## Input seems like a fraction, set `boundary=0` and `binwidth=1/30` to avoid issues near the limits. Disable this with `auto_fraction_bounary=F`
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
item_stats %>% describe2()
#item difficulty vs. loading
item_stats$difficulty = qnorm(item_stats$pass_rate, lower.tail = F)
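For instance, an item passed by 84% of the sample gets a difficulty of about -0.99 on this z scale, and one passed by only 16% gets about +0.99:
qnorm(c(.84, .16), lower.tail = F)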
item_stats %>%
GG_scatter("difficulty", "loading", case_names = "item")
## `geom_smooth()` using formula = 'y ~ x'
item_stats %>%
GG_scatter("pass_rate", "loading", case_names = "item") +
labs(
x = "Item pass rate",
y = "Item factor loading"
)
## `geom_smooth()` using formula = 'y ~ x'
GG_save("figs/item_passrate_loading.png")
## `geom_smooth()` using formula = 'y ~ x'
#score subjects
irt_fit_score = fscores(irt_fit, full.scores.SE = T)
d$vocab_g = irt_fit_score[, 1] %>% standardize()
#irt score dist
GG_denhist(d, "vocab_g") +
scale_x_continuous("Vocabulary IRT score")
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
GG_save("figs/vocab_g_dist.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#compare distributions
d %>% select(vocab_sumscore, vocab_g) %>%
describe2()
#reliability
marginal_rxx(irt_fit)
## [1] 0.9461351
marginal_rxx(irt_guess_fit)
## [1] 0.9558691
empirical_rxx(irt_fit_score)
## F1
## 0.9471914
get_reliabilities(irt_fit) %>%
filter(is_between(z, -4, 4)) %>%
ggplot(aes(z, rel)) +
geom_line() +
scale_x_continuous(breaks = seq(-4, 4, 1))
#latent score range with reliability above 0.90
rel_above_90_range = get_reliabilities(irt_fit) %>%
filter(rel > .90) %>%
pull(z) %>%
range()
rel_above_90_range
## [1] -3.165829 1.175879
mean(is_between(irt_fit_score[, 1], rel_above_90_range[1], rel_above_90_range[2]))
## [1] 0.8988764
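As a cross-check, a very similar curve can be derived from the test information function in mirt, using the common conditional reliability definition rel(z) = I(z) / (I(z) + 1) for a standard normal trait (this may not be exactly what get_reliabilities() uses); a sketch:
theta_grid = matrix(seq(-4, 4, .1))
test_info = testinfo(irt_fit, Theta = theta_grid)
tibble(z = theta_grid[, 1], rel = test_info / (test_info + 1)) %>%
  ggplot(aes(z, rel)) +
  geom_line() +
  geom_hline(yintercept = .90, linetype = "dashed")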
#sum score vs. IRT score
GG_scatter(d, "vocab_sumscore", "vocab_g") +
geom_smooth()
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
#age vs. vocab
GG_scatter(d, "age", "vocab_g") +
geom_smooth() +
labs(
y = "Vocabulary IRT score",
x = "Age"
)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
GG_save("figs/vocab_age.png")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
GG_scatter(d, "age", "vocab_sumscore") +
geom_smooth()
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
#adjust for age and make norms
norms = make_norms(
d$vocab_g,
d$age
)
## Detected linear effect of age on the score (p = <0.001***). Model used.
## No detected variance effect of age on the score (p = 0.233). Model not used.
norms
## $data
## # A tibble: 623 × 7
## score age norm_group score_ageadj1 score_ageadj2 score_ageadj3 IQ
## <dbl> <dbl> <lgl> <dbl> <dbl> <dbl> <dbl>
## 1 -2.00 42 TRUE -1.56 -1.56 -1.87 71.9
## 2 -0.363 55 TRUE -0.377 -0.377 -0.452 93.2
## 3 -0.816 28 TRUE 0.110 0.110 0.132 102.
## 4 -2.83 56 TRUE -2.88 -2.88 -3.46 48.1
## 5 0.00396 43 TRUE 0.408 0.408 0.490 107.
## 6 0.117 75 TRUE -0.592 -0.592 -0.712 89.3
## 7 -0.0435 69 TRUE -0.544 -0.544 -0.653 90.2
## 8 -0.241 69 TRUE -0.742 -0.742 -0.891 86.6
## 9 -0.0374 36 TRUE 0.610 0.610 0.733 111.
## 10 -2.04 21 TRUE -0.866 -0.866 -1.04 84.4
## # ℹ 613 more rows
##
## $age_model
##
## Call:
## lm(formula = score ~ age, data = d_norm)
##
## Coefficients:
## (Intercept) age
## -1.89984 0.03479
##
##
## $age_model_abs
##
## Call:
## lm(formula = abs(score_ageadj1) ~ age, data = d_norm)
##
## Coefficients:
## (Intercept) age
## 0.57180 0.00154
##
##
## $age_model_used
## value
## TRUE
##
## $age_model_abs_used
## value
## FALSE
##
## $norm_desc
## # A tibble: 1 × 10
## var n mean median sd mad min max skew kurtosis
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 score_ageadj2 623 8.68e-16 -0.0977 0.833 0.783 -2.88 3.17 0.315 0.635
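A hedged reconstruction of what the IQ column amounts to here (linear age model used, variance model not): residualize the score on age, standardize the residuals within the norm group, and rescale to the IQ metric (mean 100, SD 15).
age_resid = lm(vocab_g ~ age, data = d) %>% residuals()
IQ_manual = 100 + 15 * (age_resid - mean(age_resid)) / sd(age_resid)
cor(IQ_manual, norms$data$IQ) #should be close to 1 if the reconstruction is right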
#norms based on sum score
norms_sum = make_norms(
d$vocab_sumscore,
d$age
)
## Detected linear effect of age on the score (p = <0.001***). Model used.
## Detected variance effect of age on the score (p = <0.001***). Model used.
norms_sum
## $data
## # A tibble: 623 × 7
## score age norm_group score_ageadj1 score_ageadj2 score_ageadj3 IQ
## <dbl> <dbl> <lgl> <dbl> <dbl> <dbl> <dbl>
## 1 23 42 TRUE -39.9 -2.99 -2.37 64.5
## 2 64 55 TRUE -6.98 -0.599 -0.471 92.9
## 3 57 28 TRUE 2.82 0.187 0.152 102.
## 4 10 56 TRUE -61.6 -5.35 -4.23 36.5
## 5 74 43 TRUE 10.5 0.795 0.635 110.
## 6 74 75 TRUE -9.43 -1.04 -0.819 87.7
## 7 72 69 TRUE -7.69 -0.781 -0.615 90.8
## 8 69 69 TRUE -10.7 -1.09 -0.856 87.2
## 9 74 36 TRUE 14.8 1.05 0.840 113.
## 10 27 21 TRUE -22.8 -1.42 -1.13 83.1
## # ℹ 613 more rows
##
## $age_model
##
## Call:
## lm(formula = score ~ age, data = d_norm)
##
## Coefficients:
## (Intercept) age
## 36.7513 0.6223
##
##
## $age_model_abs
##
## Call:
## lm(formula = abs(score_ageadj1) ~ age, data = d_norm)
##
## Coefficients:
## (Intercept) age
## 18.7244 -0.1286
##
##
## $age_model_used
## value
## TRUE
##
## $age_model_abs_used
## value
## TRUE
##
## $norm_desc
## # A tibble: 1 × 10
## var n mean median sd mad min max skew kurtosis
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 score_ageadj2 623 -0.00543 0.133 1.26 1.28 -5.35 2.79 -0.720 0.948
#save full scale norms
write_rds(norms, "data/full_norms_IRT.rds", compress = "xz")
write_rds(norms_sum, "data/full_norms_CTT.rds", compress = "xz")
#apply the norms: store IQ scores in the data
d$vocab_IQ_orig = norms$data$IQ %>% as.numeric()
d$vocab_IQ_sum_orig = norms_sum$data$IQ %>% as.numeric()
#manually check for heteroscedasticity (HS) across age
d$vocab_g_age_resid = lm(vocab_g ~ age, data = d) %>% residuals()
test_HS(d$vocab_g_age_resid, d$age)
## number of knots in rcs defaulting to 5
## number of knots in rcs defaulting to 5
#plot IQs
d %>%
GG_denhist("vocab_IQ_orig") +
scale_x_continuous(breaks = seq(0, 200, 10)) +
labs(
x = "Vocabulary IQ score"
)
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
GG_save("figs/vocab_IQ_dist.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
d$vocab_IQ_orig %>% describe2()
#remove implausible values
sum(d$vocab_IQ_orig < 65)
## [1] 5
sum(d$vocab_IQ_orig > 145)
## [1] 3
d$vocab_IQ = if_else(!is_between(d$vocab_IQ_orig, 65, 145), NA_real_, d$vocab_IQ_orig)
d$vocab_IQ_sum = if_else(!is_between(d$vocab_IQ_sum_orig, 65, 145), NA_real_, d$vocab_IQ_sum_orig)
d %>% select(vocab_IQ, vocab_IQ_sum) %>% describe2()
#after filtering
d %>%
GG_denhist("vocab_IQ") +
scale_x_continuous(breaks = seq(0, 200, 10))
## Warning: Removed 8 rows containing non-finite outside the scale range (`stat_bin()`).
## Warning: Removed 8 rows containing non-finite outside the scale range (`stat_density()`).
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 8 rows containing non-finite outside the scale range (`stat_bin()`).
## Removed 8 rows containing non-finite outside the scale range (`stat_density()`).
d %>%
GG_denhist("vocab_IQ_sum") +
scale_x_continuous(breaks = seq(0, 200, 10))
## Warning: Removed 13 rows containing non-finite outside the scale range (`stat_bin()`).
## Warning: Removed 13 rows containing non-finite outside the scale range (`stat_density()`).
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 13 rows containing non-finite outside the scale range (`stat_bin()`).
## Removed 13 rows containing non-finite outside the scale range (`stat_density()`).
#correlation between methods
GG_scatter(d, "vocab_IQ", "vocab_IQ_sum")
## `geom_smooth()` using formula = 'y ~ x'
#filter items by pass rate
items_TF_goodpass = items_TF %>% select(item_stats$item[is_between(item_stats$pass_rate, .05, .95)])
dim(items_TF_goodpass)
## [1] 623 99
#apply the abbreviation algorithms
scale_forwards = cache_object(abbreviate_scale(items_TF_goodpass, item_target = 50, method = "forwards"), filename = "temp/abbrev_forwards.rds")
## Cache found, reading object from disk
scale_backwards = cache_object(abbreviate_scale(items_TF_goodpass, item_target = 5, method = "backwards"), filename = "temp/abbrev_backwards.rds")
## Cache found, reading object from disk
scale_max_basic = cache_object(abbreviate_scale(items_TF_goodpass, item_target = 50, method = "max_loading"), filename = "temp/abbrev_max_basic.rds")
## Cache found, reading object from disk
scale_max_balance = cache_object(abbreviate_scale(items_TF_goodpass, item_target = 50, method = "max_loading", difficulty_balance_groups = 5), filename = "temp/abbrev_max_balance.rds")
## Cache found, reading object from disk
scale_max_resid = cache_object(abbreviate_scale(items_TF_goodpass, item_target = 50, method = "max_loading", residualize_loadings = T), filename = "temp/abbrev_max_resid.rds")
## Cache found, reading object from disk
scale_genetic = cache_object(abbreviate_scale(items_TF_goodpass, item_target = 50, method = "genetic"), filename = "temp/abbrev_genetic.rds")
## Cache found, reading object from disk
#plot results
GG_scale_abbreviation(scale_forwards)
GG_scale_abbreviation(scale_backwards)
GG_scale_abbreviation(scale_max_basic)
GG_scale_abbreviation(scale_max_balance)
GG_scale_abbreviation(scale_max_resid)
GG_scale_abbreviation(scale_genetic)
#all together: just the reliability and the correlation with the full-scale score
bind_rows(
scale_forwards$best_sets %>% mutate(method = "forwards"),
scale_backwards$best_sets %>% mutate(method = "backwards") %>% filter(items_in_scale %in% 3:50),
scale_max_basic$best_sets %>% mutate(method = "max loading"),
scale_max_balance$best_sets %>% mutate(method = "max loading balance"),
scale_max_resid$best_sets %>% mutate(method = "max loading resid"),
scale_genetic$best_sets %>% mutate(method = "genetic")
) %>%
#split reliability and full scale cor
pivot_longer(cols = c(reliability, r_full_score), names_to = "var", values_to = "var_value") %>%
ggplot(aes(items_in_scale, var_value, color = method)) +
geom_line() +
facet_wrap(~var, scales = "free_y") +
labs(
y = "Value",
x = "Number of items in scale"
)
GG_save("figs/abbrev_results.png")
scale_backwards_CTT = cache_object(abbreviate_scale(items_TF_goodpass, item_target = 5, method = "backwards", IRT = F), filename = "temp/abbrev_backwards_CTT.rds")
## Cache found, reading object from disk
#compare 20-item versions
scale_backwards$best_sets %>% filter(items_in_scale == 20) %>% pull(item_set)
## [[1]]
## [1] "item_7" "item_12" "item_16" "item_32" "item_33" "item_35" "item_38" "item_50"
## [9] "item_64" "item_67" "item_68" "item_72" "item_73" "item_84" "item_86" "item_90"
## [17] "item_92" "item_95" "item_96" "item_101"
scale_backwards_CTT$best_sets %>% filter(items_in_scale == 20) %>% pull(item_set)
## [[1]]
## [1] "item_7" "item_12" "item_16" "item_31" "item_32" "item_35" "item_38" "item_40"
## [9] "item_41" "item_50" "item_64" "item_73" "item_75" "item_77" "item_84" "item_86"
## [17] "item_87" "item_89" "item_95" "item_101"
#count overlap
overlap_metrics(
scale_backwards$best_sets %>% filter(items_in_scale == 20) %>% pull(item_set) %>% unlist(),
scale_backwards_CTT$best_sets %>% filter(items_in_scale == 20) %>% pull(item_set) %>% unlist()
)
## x_count y_count xy_union_count xy_intersection_count
## 20.0000000 20.0000000 27.0000000 13.0000000
## prop_overlap_of_union x_in_y_count x_in_y_prop y_in_x_count
## 0.4814815 13.0000000 0.6500000 13.0000000
## y_in_x_prop
## 0.6500000
We need to make norms for the abbreviated IRT and CTT scales at each length, and save these to file so that others can use them easily.
scale_lengths = c(5, 10, 15, 20, 25, 30, 35, 40, 45, 50)
#IRT scales
norms_IRT = map_dfr(scale_lengths, function(scale_length) {
#get scale
i_scale = scale_backwards$best_sets %>% filter(items_in_scale == scale_length)
#make norms
i_norms = make_norms(
i_scale[["scores"]][[1]][, 1],
d$age
)
#save
tibble(
scale_length = scale_length,
norms = list(i_norms)
)
})
## Detected linear effect of age on the score (p = <0.001***). Model used.
## Detected variance effect of age on the score (p = 0.002**). Model used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## No detected variance effect of age on the score (p = 0.74). Model not used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## No detected variance effect of age on the score (p = 0.189). Model not used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## No detected variance effect of age on the score (p = 0.425). Model not used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## No detected variance effect of age on the score (p = 0.187). Model not used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## No detected variance effect of age on the score (p = 0.221). Model not used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## No detected variance effect of age on the score (p = 0.146). Model not used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## No detected variance effect of age on the score (p = 0.175). Model not used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## No detected variance effect of age on the score (p = 0.12). Model not used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## No detected variance effect of age on the score (p = 0.105). Model not used.
#CTT scales
norms_CTT = map_dfr(scale_lengths, function(scale_length) {
#get scale
i_scale = scale_backwards_CTT$best_sets %>% filter(items_in_scale == scale_length)
#make norms
i_norms = make_norms(
i_scale[["scores"]][[1]][, 1],
d$age
)
#save
tibble(
scale_length = scale_length,
norms = list(i_norms)
)
})
## Detected linear effect of age on the score (p = <0.001***). Model used.
## Detected variance effect of age on the score (p = <0.001***). Model used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## Detected variance effect of age on the score (p = <0.001***). Model used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## Detected variance effect of age on the score (p = <0.001***). Model used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## Detected variance effect of age on the score (p = <0.001***). Model used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## Detected variance effect of age on the score (p = <0.001***). Model used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## Detected variance effect of age on the score (p = <0.001***). Model used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## Detected variance effect of age on the score (p = <0.001***). Model used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## Detected variance effect of age on the score (p = <0.001***). Model used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## Detected variance effect of age on the score (p = <0.001***). Model used.
## Detected linear effect of age on the score (p = <0.001***). Model used.
## Detected variance effect of age on the score (p = <0.001***). Model used.
#save for reuse
write_rds(norms_IRT, "data/abbrev_norms_IRT.rds", compress = "xz")
write_rds(norms_CTT, "data/abbrev_norms_CTT.rds", compress = "xz")
#explore a 20-item test example
#example respondent: score 17/20, male, 30 years old
scale20CTT = scale_backwards_CTT$best_sets %>% filter(items_in_scale == 20)
#distribution
tibble(
sum_score = scale20CTT$scores[[1]][, 1],
group = "norm"
) %>% bind_rows(
tibble(
sum_score = 17,
group = "new"
)
) %>% GG_denhist("sum_score", group = "group")
## Warning: Groups with fewer than two data points have been dropped.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Groups with fewer than two data points have been dropped.
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -Inf
#centile
scale20CTT_ecdf = ecdf(scale20CTT$scores[[1]][, 1])
scale20CTT_ecdf(17)
## [1] 0.6067416
#age adjusted
norms_CTT_20 = norms_CTT %>% filter(scale_length == 20) %>% pull(norms) %>% first()
apply_norms(
score = (0:20) %>% set_names(0:20),
age = 30,
prior_norms = norms_CTT_20
)
## 0 1 2 3 4 5 6 7 8
## 74.50478 77.17116 79.83754 82.50391 85.17029 87.83666 90.50304 93.16942 95.83579
## 9 10 11 12 13 14 15 16 17
## 98.50217 101.16854 103.83492 106.50129 109.16767 111.83405 114.50042 117.16680 119.83317
## 18 19 20
## 122.49955 125.16593 127.83230
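The mapping in this age-adjusted table is essentially linear, so each additional raw point is worth a roughly constant number of IQ points:
(127.83230 - 74.50478) / 20 #about 2.67 IQ points per raw point for a 30-year-old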
#age 35+
d_age35 = d %>% filter(age >= 35)
#education levels
d_age35 %>%
GG_group_means("vocab_IQ", "education", type = "point") +
labs(
y = "Vocabulary IQ",
x = "Education level at age 35+"
)
## Missing values were removed.
GG_save("figs/IQ_education.png")
edu_nom = nnet::multinom(education ~ vocab_IQ + age * sex, data = d_age35)
## # weights: 48 (35 variable)
## initial value 1110.421783
## iter 10 value 1011.020529
## iter 20 value 1009.460065
## iter 30 value 934.173854
## iter 40 value 931.756870
## iter 50 value 931.402348
## iter 60 value 931.252229
## final value 931.252130
## converged
ggpredict(edu_nom, "vocab_IQ [65:145]") %>%
plot()
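The same predicted probabilities can also be tabulated at a few representative IQ values instead of plotted, using the same ggeffects syntax:
ggpredict(edu_nom, "vocab_IQ [85, 100, 115, 130]")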
#reverse direction: predict vocabulary IQ from education
ols(vocab_IQ ~ education, data = d_age35)
## Frequencies of Missing Values Due to Each Variable
## vocab_IQ education
## 8 0
##
## Linear Regression Model
##
## ols(formula = vocab_IQ ~ education, data = d_age35)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 534 LR chi2 72.58 R2 0.127
## sigma13.3660 d.f. 7 R2 adj 0.115
## d.f. 526 Pr(> chi2) 0.0000 g 5.553
##
## Residuals
##
## Min 1Q Median 3Q Max
## -36.098 -9.557 -1.014 8.296 37.597
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 91.7892 1.9496 47.08 <0.0001
## education=High school 12.3456 2.7877 4.43 <0.0001
## education=Vocational 3.1110 2.2659 1.37 0.1703
## education=Short education 8.1413 2.7026 3.01 0.0027
## education=Medium education 11.2318 2.2532 4.98 <0.0001
## education=University bachelor 12.5349 2.8563 4.39 <0.0001
## education=University master 14.9645 2.5064 5.97 <0.0001
## education=PhD 24.4999 6.9616 3.52 0.0005
describe2(d_age35$vocab_IQ, d_age35$education)
## New names:
## • `` -> `...1`
#books
d$books_read %>% describe2()
d %>%
GG_scatter("vocab_IQ", "books_read")
## `geom_smooth()` using formula = 'y ~ x'
d$books_read2 = d$books_read %>% winsorise(upper = 100)
d %>%
GG_scatter("vocab_IQ", "books_read2")
## `geom_smooth()` using formula = 'y ~ x'
#log
d$books_read_log = log(d$books_read + 1)
d %>%
GG_scatter("vocab_IQ", "books_read_log") +
labs(
y = "Log (1 + books read last year)",
x = "Vocabulary IQ"
)
## `geom_smooth()` using formula = 'y ~ x'
GG_save("figs/IQ_books.png")
## `geom_smooth()` using formula = 'y ~ x'
#attempt to clean and fix income data
d$income = d$Hvor_mange_penge_tjente_du_sidste_år_før_skat_Medtæl_alle_indkomstkilder_fx_sociale_ydelser_og_pension_Bedste_gæt %>% as.numeric()
#remove implausible values above 2 million
d$income = if_else(d$income > 2e6, NA_real_, d$income)
#distribution
GG_denhist(d, "income") +
scale_x_log10()
## Warning: Removed 4 rows containing non-finite outside the scale range (`stat_bin()`).
## Warning: Removed 4 rows containing non-finite outside the scale range (`stat_density()`).
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 23 rows containing non-finite outside the scale range (`stat_bin()`).
## Warning: Removed 23 rows containing non-finite outside the scale range (`stat_density()`).
GG_scatter(d, "vocab_IQ", "income")
## `geom_smooth()` using formula = 'y ~ x'
#assume that if value is below 1000, it is in thousands
d$income2 = if_else(d$income < 1000, d$income * 1000, d$income)
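A quick count of how many responses this heuristic rescales:
sum(d$income < 1000, na.rm = T) #responses below 1,000, treated as answers given in thousands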
#plot against score
d %>%
GG_scatter("vocab_IQ", "income2")
## `geom_smooth()` using formula = 'y ~ x'
#log income
d$income_log = log(d$income2 + 1)
d %>%
GG_scatter("vocab_IQ", "income_log")
## `geom_smooth()` using formula = 'y ~ x'
#regression model
inc_mod = gam(income ~ s(vocab_IQ, by = sex) + s(age, by = sex) + sex, data = d)
inc_mod %>% summary()
##
## Family: gaussian
## Link function: identity
##
## Formula:
## income ~ s(vocab_IQ, by = sex) + s(age, by = sex) + sex
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 240048 13720 17.496 < 2e-16 ***
## sexMale 87988 19546 4.502 8.11e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(vocab_IQ):sexFemale 1.000 1.000 0.439 0.50767
## s(vocab_IQ):sexMale 2.060 2.621 3.158 0.04007 *
## s(age):sexFemale 1.866 2.343 1.958 0.13889
## s(age):sexMale 5.821 6.978 3.517 0.00111 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0767 Deviance explained = 9.45%
## GCV = 5.742e+10 Scale est. = 5.6222e+10 n = 611
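The sex-specific smooths can be inspected with mgcv's built-in plot method; a quick sketch:
plot(inc_mod, pages = 1, shade = T)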
#standardized values
d$income_z = standardize(d$income2)
d$vocab_IQ_z = d$vocab_IQ %>% standardize()
incz_mod = gam(income_z ~ s(vocab_IQ_z, by = sex) + s(age, by = sex) + sex, data = d)
incz_mod %>% summary()
##
## Family: gaussian
## Link function: identity
##
## Formula:
## income_z ~ s(vocab_IQ_z, by = sex) + s(age, by = sex) + sex
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.15662 0.05171 -3.029 0.00256 **
## sexMale 0.26379 0.07366 3.581 0.00037 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(vocab_IQ_z):sexFemale 1.000 1.000 0.063 0.801920
## s(vocab_IQ_z):sexMale 2.565 3.255 3.139 0.021515 *
## s(age):sexFemale 2.039 2.560 2.049 0.148251
## s(age):sexMale 5.901 7.059 3.887 0.000407 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0734 Deviance explained = 9.24%
## GCV = 0.81565 Scale est. = 0.79762 n = 611
#for those above 35
d_age35 = d %>% filter(age >= 35)
inc35_mod = gam(income_z ~ vocab_IQ_z + s(age, by = sex) + sex, data = d_age35)
inc35_mod %>% summary()
##
## Family: gaussian
## Link function: identity
##
## Formula:
## income_z ~ vocab_IQ_z + s(age, by = sex) + sex
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.14975 0.05129 -2.919 0.00366 **
## vocab_IQ_z 0.10825 0.03542 3.056 0.00235 **
## sexMale 0.32638 0.07208 4.528 7.4e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(age):sexFemale 1.911 2.398 2.715 0.06478 .
## s(age):sexMale 5.980 7.150 2.994 0.00438 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0852 Deviance explained = 10.2%
## GCV = 0.67871 Scale est. = 0.66476 n = 530
#remove incomes that are 0
d$income0 <- ifelse(d$income > 0, d$income, NA)
GG_scatter(d, "vocab_IQ", "income0")
## `geom_smooth()` using formula = 'y ~ x'
#assume that if value is below 1000, it is in thousands
d$income02 = if_else(d$income0 < 1000, d$income0 * 1000, d$income0)
#plot against score
d %>%
GG_scatter("vocab_IQ", "income02")
## `geom_smooth()` using formula = 'y ~ x'
#log income
d$income0_log = log(d$income02 + 1)
d %>%
GG_scatter("vocab_IQ", "income0_log")
## `geom_smooth()` using formula = 'y ~ x'
#regression model
inc_mod = gam(income0 ~ s(vocab_IQ, by = sex) + s(age, by = sex) + sex, data = d)
inc_mod %>% summary()
##
## Family: gaussian
## Link function: identity
##
## Formula:
## income0 ~ s(vocab_IQ, by = sex) + s(age, by = sex) + sex
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 249236 13919 17.906 < 2e-16 ***
## sexMale 87266 19778 4.412 1.22e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(vocab_IQ):sexFemale 1.000 1.000 0.425 0.51452
## s(vocab_IQ):sexMale 2.201 2.798 3.003 0.04507 *
## s(age):sexFemale 1.812 2.274 1.896 0.15243
## s(age):sexMale 4.516 5.552 3.900 0.00129 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0733 Deviance explained = 8.98%
## GCV = 5.6892e+10 Scale est. = 5.5784e+10 n = 592
#standardized values
d$income0_z = standardize(d$income02)
d$vocab_IQ_z = d$vocab_IQ %>% standardize()
incz_mod = gam(income0_z ~ s(vocab_IQ_z, by = sex) + s(age, by = sex) + sex, data = d)
incz_mod %>% summary()
##
## Family: gaussian
## Link function: identity
##
## Formula:
## income0_z ~ s(vocab_IQ_z, by = sex) + s(age, by = sex) + sex
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.15667 0.05239 -2.99 0.002905 **
## sexMale 0.26294 0.07448 3.53 0.000448 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(vocab_IQ_z):sexFemale 1.000 1.000 0.047 0.827793
## s(vocab_IQ_z):sexMale 2.612 3.312 2.961 0.027494 *
## s(age):sexFemale 2.041 2.564 2.032 0.158978
## s(age):sexMale 6.404 7.545 3.794 0.000269 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0757 Deviance explained = 9.62%
## GCV = 0.80838 Scale est. = 0.78919 n = 592
#for those above 35
d_age35 = d %>% filter(age >= 35)
inc35_mod = gam(income0_z ~ vocab_IQ_z + s(age, by = sex) + sex, data = d_age35)
inc35_mod %>% summary()
##
## Family: gaussian
## Link function: identity
##
## Formula:
## income0_z ~ vocab_IQ_z + s(age, by = sex) + sex
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.15497 0.05152 -3.008 0.00276 **
## vocab_IQ_z 0.10665 0.03567 2.990 0.00293 **
## sexMale 0.33041 0.07234 4.567 6.22e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(age):sexFemale 2.005 2.516 2.651 0.06682 .
## s(age):sexMale 5.881 7.051 2.928 0.00512 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0867 Deviance explained = 10.4%
## GCV = 0.66402 Scale est. = 0.64998 n = 515
#sex difference
(inc_sex_desc = describe2(d_age35$income2, d_age35$sex))
## New names:
## • `` -> `...1`
inc_sex_desc$mean[2] / inc_sex_desc$mean[1]
## [1] 1.273635
#education as a numeric rank
d$education_rank <- case_when(
d$education == "Less than high school" ~ 1,
d$education == "High school" ~ 2,
d$education == "Vocational" ~ 3,
d$education == "Short education" ~ 4,
d$education == "Medium education" ~ 5,
d$education == "University bachelor" ~ 6,
d$education == "University master" ~ 7,
d$education == "PhD" ~ 8,
TRUE ~ NA_real_
)
d_age35$education_rank <- case_when(
d_age35$education == "Less than high school" ~ 1,
d_age35$education == "High school" ~ 2,
d_age35$education == "Vocational" ~ 3,
d_age35$education == "Short education" ~ 4,
d_age35$education == "Medium education" ~ 5,
d_age35$education == "University bachelor" ~ 6,
d_age35$education == "University master" ~ 7,
d_age35$education == "PhD" ~ 8,
TRUE ~ NA_real_
)
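Since education is an ordered factor with exactly these levels, the same ranks also follow from the factor codes; a quick check of the equivalence:
all(d$education_rank == as.integer(d$education), na.rm = T)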
#books & age by education level
d %>% group_by(education_rank) %>% summarise(books_read = mean(books_read), n = n())
d %>% group_by(education_rank) %>% summarise(books_read2 = mean(books_read2), n = n())
d %>% group_by(education_rank) %>% summarise(age = mean(age), n = n())
d_age35 %>% group_by(education_rank) %>% summarise(books_read = mean(books_read), n = n())
d_age35 %>% group_by(education_rank) %>% summarise(books_read2 = mean(books_read2), n = n())
d_age35 %>% group_by(education_rank) %>% summarise(age = mean(age), n = n())
write_sessioninfo()
## R version 4.4.1 (2024-06-14 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=French_France.utf8 LC_CTYPE=French_France.utf8 LC_MONETARY=French_France.utf8
## [4] LC_NUMERIC=C LC_TIME=French_France.utf8
##
## time zone: Asia/Hong_Kong
## tzcode source: internal
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] future_1.33.2 mgcv_1.9-1 nlme_3.1-164 ggeffects_1.7.0
## [5] nnet_7.3-19 rms_7.0-0 googlesheets4_1.1.1 haven_2.5.4
## [9] mirt_1.42 lattice_0.22-6 readxl_1.4.3 kirkegaard_2024-06-07
## [13] psych_2.4.6.26 assertthat_0.2.1 weights_1.0.4 Hmisc_5.2-2
## [17] magrittr_2.0.3 lubridate_1.9.3 forcats_1.0.0 stringr_1.5.1
## [21] dplyr_1.1.4 purrr_1.0.2 readr_2.1.5 tidyr_1.3.1
## [25] tibble_3.2.1 ggplot2_3.5.1 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] rstudioapi_0.17.1 audio_0.1-11 jsonlite_1.8.8 shape_1.4.6.1
## [5] datawizard_0.13.0 TH.data_1.1-2 jomo_2.7-6 farver_2.1.2
## [9] nloptr_2.1.1 rmarkdown_2.27 ragg_1.3.2 fs_1.6.4
## [13] vctrs_0.6.5 minqa_1.2.7 base64enc_0.1-3 htmltools_0.5.8.1
## [17] polspline_1.1.25 curl_5.2.1 broom_1.0.6 cellranger_1.1.0
## [21] Formula_1.2-5 mitml_0.4-5 dcurver_0.9.2 sass_0.4.9
## [25] parallelly_1.37.1 bslib_0.7.0 htmlwidgets_1.6.4 plyr_1.8.9
## [29] sandwich_3.1-0 testthat_3.2.1.1 zoo_1.8-12 cachem_1.1.0
## [33] lifecycle_1.0.4 iterators_1.0.14 pkgconfig_2.0.3 sjlabelled_1.2.0
## [37] Matrix_1.7-0 R6_2.5.1 fastmap_1.2.0 snakecase_0.11.1
## [41] digest_0.6.36 colorspace_2.1-0 textshaping_0.4.0 vegan_2.6-6.1
## [45] labeling_0.4.3 progressr_0.14.0 fansi_1.0.6 timechange_0.3.0
## [49] gdata_3.0.0 httr_1.4.7 compiler_4.4.1 gargle_1.5.2
## [53] bit64_4.0.5 withr_3.0.2 htmlTable_2.4.3 backports_1.5.0
## [57] highr_0.11 R.utils_2.12.3 pan_1.9 MASS_7.3-60.2
## [61] quantreg_5.98 sessioninfo_1.2.2 GPArotation_2024.3-1 gtools_3.9.5
## [65] permute_0.9-7 tools_4.4.1 foreign_0.8-86 googledrive_2.1.1
## [69] future.apply_1.11.2 R.oo_1.26.0 glue_1.7.0 grid_4.4.1
## [73] rsconnect_1.3.1 checkmate_2.3.1 cluster_2.1.6 generics_0.1.3
## [77] gtable_0.3.5 RPushbullet_0.3.4 tzdb_0.4.0 R.methodsS3_1.8.2
## [81] data.table_1.15.4 hms_1.1.3 Deriv_4.1.3 utf8_1.2.4
## [85] foreach_1.5.2 pillar_1.9.0 vroom_1.6.5 splines_4.4.1
## [89] bit_4.0.5 survival_3.6-4 SparseM_1.84 tidyselect_1.2.1
## [93] pbapply_1.7-2 knitr_1.48 gridExtra_2.3 xfun_0.45
## [97] brio_1.1.5 stringi_1.8.4 yaml_2.3.9 boot_1.3-30
## [101] evaluate_0.24.0 codetools_0.2-20 beepr_2.0 cli_3.6.3
## [105] rpart_4.1.23 systemfonts_1.1.0 munsell_0.5.1 jquerylib_0.1.4
## [109] Rcpp_1.0.12 globals_0.16.3 parallel_4.4.1 MatrixModels_0.5-3
## [113] lme4_1.1-35.5 listenv_0.9.1 glmnet_4.1-8 mvtnorm_1.2-5
## [117] SimDesign_2.16 scales_1.3.0 crayon_1.5.3 insight_0.20.5
## [121] rlang_1.1.4 multcomp_1.4-26 mnormt_2.1.1 mice_3.16.0