library(kirkegaard)
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: magrittr
##
##
## Attaching package: 'magrittr'
##
##
## The following object is masked from 'package:purrr':
##
## set_names
##
##
## The following object is masked from 'package:tidyr':
##
## extract
##
##
## Loading required package: weights
##
## Loading required package: assertthat
##
##
## Attaching package: 'assertthat'
##
##
## The following object is masked from 'package:tibble':
##
## has_name
##
##
## Loading required package: psych
##
##
## Attaching package: 'psych'
##
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
##
##
## Loading required package: robustbase
##
##
## Attaching package: 'kirkegaard'
##
##
## The following object is masked from 'package:psych':
##
## rescale
##
##
## The following object is masked from 'package:assertthat':
##
## are_equal
##
##
## The following object is masked from 'package:purrr':
##
## is_logical
##
##
## The following object is masked from 'package:base':
##
## +
load_packages(
rms,
mgcv,
mirt
)
## Loading required package: Hmisc
##
## Attaching package: 'Hmisc'
##
## The following object is masked from 'package:psych':
##
## describe
##
## The following objects are masked from 'package:dplyr':
##
## src, summarize
##
## The following objects are masked from 'package:base':
##
## format.pval, units
##
## Loading required package: nlme
##
## Attaching package: 'nlme'
##
## The following object is masked from 'package:dplyr':
##
## collapse
##
## This is mgcv 1.9-3. For overview type 'help("mgcv-package")'.
## Loading required package: stats4
## Loading required package: lattice
##
## Attaching package: 'mirt'
##
## The following object is masked from 'package:nlme':
##
## fixef
theme_set(theme_bw())
options(
digits = 3
)
#multithreading
#library(future)
#plan(multisession(workers = 8))
d_diamonds = diamonds
#vocab data
d_vocab_irt = read_rds("data/good_items_fit.rds")
d_vocab_items = d_vocab_irt@Data$data
d_vocab = read_rds("data/vocab study data.rds")
#plot quantiles
d_diamonds %>%
GG_quantiles("price", "carat", method = "Rq")
## number of knots in rcs defaulting to 5
## number of knots in rcs defaulting to 5
## number of knots in rcs defaulting to 5
## number of knots in rcs defaulting to 5
## number of knots in rcs defaulting to 5
GG_save("figs/diamond carat quantiles by price.png")
#mean regression as normal
d_diamonds %>%
ggplot(aes(price, carat)) +
geom_point(alpha = 0.1) +
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
GG_save("figs/diamond carat mean regression by price.png")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
d_diamonds[-c(2, 3, 4)] %>% wtd.cors()
## carat depth table price x y z
## carat 1.0000 0.0282 0.182 0.9216 0.9751 0.9517 0.9534
## depth 0.0282 1.0000 -0.296 -0.0106 -0.0253 -0.0293 0.0949
## table 0.1816 -0.2958 1.000 0.1271 0.1953 0.1838 0.1509
## price 0.9216 -0.0106 0.127 1.0000 0.8844 0.8654 0.8612
## x 0.9751 -0.0253 0.195 0.8844 1.0000 0.9747 0.9708
## y 0.9517 -0.0293 0.184 0.8654 0.9747 1.0000 0.9520
## z 0.9534 0.0949 0.151 0.8612 0.9708 0.9520 1.0000
#residualize carat by price
gam_fit_spline = gam(carat ~ s(price), data = d_diamonds)
gam_fit_spline %>% summary()
##
## Family: gaussian
## Link function: identity
##
## Formula:
## carat ~ s(price)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.797940 0.000649 1229 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(price) 8.61 8.95 53509 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.899 Deviance explained = 89.9%
## GCV = 0.022746 Scale est. = 0.022742 n = 53940
#cor equi
gam_fit_spline %>% summary() %>% .$r.sq %>% sqrt()
## [1] 0.948
d_diamonds = d_diamonds %>%
mutate(
carat_resid = resid(gam_fit_spline)
)
#test HS
test_HS(d_diamonds$carat_resid, d_diamonds$price)
#check residuals
d_diamonds %>%
ggplot(aes(price, carat_resid)) +
geom_point(alpha = 0.1) +
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
GG_save("figs/diamond carat residuals by price.png")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
#predict abs residual
lm_fit_resids = lm(abs(carat_resid) ~ price, data = d_diamonds)
lm_fit_resids %>% summary()
##
## Call:
## lm(formula = abs(carat_resid) ~ price, data = d_diamonds)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.3288 -0.0405 -0.0123 0.0269 2.6299
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.18e-02 5.90e-04 53.8 <2e-16 ***
## price 1.61e-05 1.05e-07 153.1 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0976 on 53938 degrees of freedom
## Multiple R-squared: 0.303, Adjusted R-squared: 0.303
## F-statistic: 2.34e+04 on 1 and 53938 DF, p-value: <2e-16
#save preds and
#divide by predicted abs resid to get z scores
d_diamonds = d_diamonds %>%
mutate(
carat_resid_abs_pred = predict(lm_fit_resids),
carat_z = carat_resid / carat_resid_abs_pred
)
#test HS
test_HS(d_diamonds$carat_z, d_diamonds$price)
#check z scores
d_diamonds %>%
ggplot(aes(price, carat_z)) +
geom_point(alpha = 0.1) +
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
GG_save("figs/diamond carat z scores by price.png")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
#quantile regression again
d_diamonds %>%
GG_quantiles("price", "carat_z", method = "Rq")
## number of knots in rcs defaulting to 5
## number of knots in rcs defaulting to 5
## number of knots in rcs defaulting to 5
## number of knots in rcs defaulting to 5
## number of knots in rcs defaulting to 5
GG_save("figs/diamond carat z score quantiles by price.png")
#convert to real z-scores
d_diamonds = d_diamonds %>%
mutate(
carat_z_real = carat_z %>% standardize()
)
#load vocab data
d_vocab$vocab_sumscore = d_vocab_items %>% rowSums()
#plot quantiles
d_vocab %>%
GG_quantiles("age", "vocab_sumscore")
d_vocab %>%
GG_quantiles("age", "vocab_sumscore", method = "Rq")
## number of knots in rcs defaulting to 5
## number of knots in rcs defaulting to 5
## number of knots in rcs defaulting to 5
## number of knots in rcs defaulting to 5
## number of knots in rcs defaulting to 5
#try the g scores
d_vocab$vocab_g = fscores(d_vocab_irt)[, 1] %>% standardize()
d_vocab %>%
GG_quantiles("age", "vocab_g") +
labs(
title = "Vocab g scores by age, quantile regression",
y = "Vocab g scores (IRT)",
x = "Age"
)
GG_save("figs/vocab g scores quantiles by age.png")
#age norms
vocab_normed = make_norms(
d_vocab$vocab_g,
d_vocab$age
)
## Detected linear effect of age on the score (p = <0.001***). Model used.
## Detected variance effect of age on the score (p = <0.001***). Model used.
vocab_normed$data
apply_norms(
-1,
20,
prior_norms = vocab_normed
)
## -1
## 92.1
#test HS
test_HS(
d_vocab$vocab_g,
d_vocab$age
)
#versions
write_sessioninfo()
## R version 4.5.1 (2025-06-13)
## Platform: x86_64-pc-linux-gnu
## Running under: Linux Mint 21.1
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0 LAPACK version 3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_DK.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_DK.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_DK.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Europe/Brussels
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] mirt_1.45.1 lattice_0.22-7 mgcv_1.9-3
## [4] nlme_3.1-168 rms_8.0-0 Hmisc_5.2-4
## [7] kirkegaard_2025-10-17 robustbase_0.99-6 psych_2.5.6
## [10] assertthat_0.2.1 weights_1.1.2 magrittr_2.0.4
## [13] lubridate_1.9.4 forcats_1.0.1 stringr_1.5.2
## [16] dplyr_1.1.4 purrr_1.1.0 readr_2.1.5
## [19] tidyr_1.3.1 tibble_3.3.0 ggplot2_4.0.0
## [22] tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] RColorBrewer_1.1-3 rstudioapi_0.17.1 audio_0.1-11
## [4] jsonlite_2.0.0 shape_1.4.6.1 TH.data_1.1-4
## [7] jomo_2.7-6 farver_2.1.2 nloptr_2.2.1
## [10] rmarkdown_2.30 ragg_1.5.0 vctrs_0.6.5
## [13] minqa_1.2.8 base64enc_0.1-3 htmltools_0.5.8.1
## [16] polspline_1.1.25 broom_1.0.10 Formula_1.2-5
## [19] mitml_0.4-5 dcurver_0.9.2 sass_0.4.10
## [22] parallelly_1.45.1 bslib_0.9.0 htmlwidgets_1.6.4
## [25] plyr_1.8.9 sandwich_3.1-1 testthat_3.2.3
## [28] zoo_1.8-14 cachem_1.1.0 mime_0.13
## [31] lifecycle_1.0.4 iterators_1.0.14 pkgconfig_2.0.3
## [34] Matrix_1.7-4 R6_2.6.1 fastmap_1.2.0
## [37] shiny_1.11.1 rbibutils_2.3 future_1.67.0
## [40] digest_0.6.37 colorspace_2.1-2 qgam_2.0.0
## [43] textshaping_1.0.4 vegan_2.7-2 labeling_0.4.3
## [46] progressr_0.16.0 timechange_0.3.0 gdata_3.0.1
## [49] compiler_4.5.1 doParallel_1.0.17 withr_3.0.2
## [52] htmlTable_2.4.3 S7_0.2.0 backports_1.5.0
## [55] R.utils_2.13.0 pan_1.9 MASS_7.3-65
## [58] quantreg_6.1 sessioninfo_1.2.3 GPArotation_2025.3-1
## [61] gtools_3.9.5 permute_0.9-8 tools_4.5.1
## [64] foreign_0.8-90 httpuv_1.6.16 future.apply_1.20.0
## [67] clipr_0.8.0 nnet_7.3-20 R.oo_1.27.1
## [70] glue_1.8.0 promises_1.3.3 grid_4.5.1
## [73] checkmate_2.3.3 cluster_2.1.8.1 generics_0.1.4
## [76] gtable_0.3.6 tzdb_0.5.0 R.methodsS3_1.8.2
## [79] data.table_1.17.8 hms_1.1.3 Deriv_4.2.0
## [82] foreach_1.5.2 pillar_1.11.1 later_1.4.4
## [85] splines_4.5.1 survival_3.8-3 SparseM_1.84-2
## [88] tidyselect_1.2.1 pbapply_1.7-4 knitr_1.50
## [91] reformulas_0.4.1 gridExtra_2.3 xfun_0.53
## [94] brio_1.1.5 DEoptimR_1.1-4 stringi_1.8.7
## [97] yaml_2.3.10 boot_1.3-32 evaluate_1.0.5
## [100] codetools_0.2-20 beepr_2.0 cli_3.6.5
## [103] rpart_4.1.24 xtable_1.8-4 systemfonts_1.3.1
## [106] Rdpack_2.6.4 jquerylib_0.1.4 Rcpp_1.1.0
## [109] globals_0.18.0 parallel_4.5.1 MatrixModels_0.5-4
## [112] lme4_1.1-37 listenv_0.9.1 glmnet_4.1-10
## [115] mvtnorm_1.3-3 SimDesign_2.21 scales_1.4.0
## [118] rlang_1.1.6 multcomp_1.4-28 mnormt_2.1.1
## [121] mice_3.18.0
#write data to file for reuse
# d %>% write_rds("data/data_for_reuse.rds")
#OSF
if (F) {
library(osfr)
#login
osf_auth(readr::read_lines("~/.config/osf_token"))
#the project we will use
osf_proj = osf_retrieve_node("https://osf.io/XXX/")
#upload all files in project
#overwrite existing (versioning)
osf_upload(
osf_proj,
path = c("data", "figures", "papers", "notebook.Rmd", "notebook.html", "sessions_info.txt"),
conflicts = "overwrite"
)
}