options(
digits = 3
)
library(pacman)
p_load(
kirkegaard,
rms,
mirt,
ggeffects
)
#nice theme
theme_set(theme_bw())
d = read_rds("data/parsed_data.rds")
variables = read_csv2("data/question_data.csv")
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
## Rows: 2620 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (9): question, text, option_1, option_2, option_3, option_4, Type, Order...
## dbl (1): N
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
iq_items_meta = read_csv("data/test_items.csv")
## Rows: 28 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): variable, text, option_1, option_2, option_3, option_4
## dbl (2): ID, option_correct
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
d %<>% mutate(
#basic
age = d_age,
sex2 = case_when(d_gender %in% c("Man", "Cis Man") ~ "Man",
d_gender %in% c("Woman", "Cis Woman") ~ "Woman",
TRUE ~ NA_character_)
)
#items
CA_items = intersect(iq_items_meta$variable, names(d))
#get scoring key
v_correct_answers_text = apply(iq_items_meta %>% filter(variable %in% CA_items), MARGIN = 1, function(item) {
item["option_" + 1:4][item["option_correct"] %>% as.numeric()]
})
#score items
CA_scored_items = score_items(d[CA_items], key = v_correct_answers_text)
#CA counts
CA_items_counts = miss_by_case(CA_scored_items, reverse = T)
CA_items_counts %>% table2()
#factor analysis / IRT
# irt_CA = irt.fa(CA_scored_items)
irt_CA = mirt(CA_scored_items, model = 1, technical = list(removeEmptyRows=TRUE))
## Warning: removeEmptyRows option has been deprecated. Complete NA response
## vectors now supported by using NA placeholders
## Warning: data contains response patterns with only NAs
##
Iteration: 1, Log-Lik: -188768.194, Max-Change: 5.04127
Iteration: 2, Log-Lik: -167586.674, Max-Change: 0.93778
Iteration: 3, Log-Lik: -163943.799, Max-Change: 0.34531
Iteration: 4, Log-Lik: -162360.344, Max-Change: 0.28657
Iteration: 5, Log-Lik: -161709.787, Max-Change: 0.22768
Iteration: 6, Log-Lik: -161353.642, Max-Change: 0.16053
Iteration: 7, Log-Lik: -161173.781, Max-Change: 0.09737
Iteration: 8, Log-Lik: -161064.493, Max-Change: 0.22971
Iteration: 9, Log-Lik: -160991.363, Max-Change: 0.12791
Iteration: 10, Log-Lik: -160959.392, Max-Change: 0.07594
Iteration: 11, Log-Lik: -160939.094, Max-Change: 0.04297
Iteration: 12, Log-Lik: -160925.798, Max-Change: 0.04225
Iteration: 13, Log-Lik: -160916.474, Max-Change: 0.02643
Iteration: 14, Log-Lik: -160913.117, Max-Change: 0.01116
Iteration: 15, Log-Lik: -160909.698, Max-Change: 0.00764
Iteration: 16, Log-Lik: -160907.840, Max-Change: 0.00427
Iteration: 17, Log-Lik: -160907.466, Max-Change: 0.00338
Iteration: 18, Log-Lik: -160907.281, Max-Change: 0.00242
Iteration: 19, Log-Lik: -160907.163, Max-Change: 0.00138
Iteration: 20, Log-Lik: -160907.136, Max-Change: 0.00128
Iteration: 21, Log-Lik: -160907.121, Max-Change: 0.00118
Iteration: 22, Log-Lik: -160907.111, Max-Change: 0.00045
Iteration: 23, Log-Lik: -160907.109, Max-Change: 0.00028
Iteration: 24, Log-Lik: -160907.108, Max-Change: 0.00025
Iteration: 25, Log-Lik: -160907.108, Max-Change: 0.00024
Iteration: 26, Log-Lik: -160907.107, Max-Change: 0.00008
#score
# irt_CA_scores = scoreIrt(irt_CA, CA_scored_items)
irt_CA_scores = fscores(irt_CA, full.scores = T, full.scores.SE = T)
#reliabilities
empirical_rxx(irt_CA_scores)
## F1
## 0.549
marginal_rxx(irt_CA)
## [1] 0.764
#save
d$CA_old = d$CA %>% standardize()
d$CA = NA
# d$CA[CA_items_counts != 0] = irt_CA_scores[, 1] %>% standardize(focal_group = d$race[CA_items_counts != 0] == "White")
d$CA = irt_CA_scores[, 1] %>% standardize(focal_group = d$race == "White")
## Warning in standardize(., focal_group = d$race == "White"): `focal_group`
## contains `NA` values. These were converted to `FALSE` following tidyverse
## convention.
reliability_by_item_count <- map_df(0:14, function(i) {
#subset
d_subset = CA_scored_items[CA_items_counts >= i, ]
#fit
irt_CA = mirt(d_subset, model = 1, technical = list(removeEmptyRows=TRUE), verbose = F)
#score
irt_CA_scores = fscores(irt_CA, full.scores = T, full.scores.SE = T)
#reliabilities
tibble(
minimal_item_count = i,
n = nrow(d_subset),
empirical = empirical_rxx(irt_CA_scores) %>% as.vector(),
marginal = marginal_rxx(irt_CA)
)
})
## Warning: removeEmptyRows option has been deprecated. Complete NA response
## vectors now supported by using NA placeholders
## Warning: data contains response patterns with only NAs
## Warning: removeEmptyRows option has been deprecated. Complete NA response
## vectors now supported by using NA placeholders
## Warning: removeEmptyRows option has been deprecated. Complete NA response
## vectors now supported by using NA placeholders
## Warning: removeEmptyRows option has been deprecated. Complete NA response
## vectors now supported by using NA placeholders
## Warning: removeEmptyRows option has been deprecated. Complete NA response
## vectors now supported by using NA placeholders
## Warning: removeEmptyRows option has been deprecated. Complete NA response
## vectors now supported by using NA placeholders
## Warning: removeEmptyRows option has been deprecated. Complete NA response
## vectors now supported by using NA placeholders
## Warning: removeEmptyRows option has been deprecated. Complete NA response
## vectors now supported by using NA placeholders
## Warning: removeEmptyRows option has been deprecated. Complete NA response
## vectors now supported by using NA placeholders
## Warning: removeEmptyRows option has been deprecated. Complete NA response
## vectors now supported by using NA placeholders
## Warning: removeEmptyRows option has been deprecated. Complete NA response
## vectors now supported by using NA placeholders
## Warning: removeEmptyRows option has been deprecated. Complete NA response
## vectors now supported by using NA placeholders
## Warning: removeEmptyRows option has been deprecated. Complete NA response
## vectors now supported by using NA placeholders
## Warning: removeEmptyRows option has been deprecated. Complete NA response
## vectors now supported by using NA placeholders
## Warning: removeEmptyRows option has been deprecated. Complete NA response
## vectors now supported by using NA placeholders
reliability_by_item_count
#plot version
reliability_by_item_count %>%
select(-n) %>%
pivot_longer(cols = -minimal_item_count) %>%
ggplot(aes(minimal_item_count, value, color = name)) +
geom_line() +
scale_color_discrete("Metric") +
scale_x_continuous("Minimum number of questions answered",
breaks = 0:14,
labels = str_glue("{0:14}\nn={reliability_by_item_count$n}"),
guide = guide_axis(n.dodge = 2))
GG_save("figs/minimum_item_count_reliability.png")
d %<>% mutate(
#desired children
desired_children = q979,
#has a child
has_children = q105 %>% factor(levels = c("No", "Yes"))
)
#at least 8 items for IQ
d_IQ8 = d[CA_items_counts >= 8, ]
#heteros
d_het = d %>% filter(str_detect(gender_orientation, "Hetero"))
#combo
d2 = d[CA_items_counts >= 8, ] %>% filter(str_detect(gender_orientation, "Hetero"))
nrow(d2)
## [1] 18219
d %>% select(age) %>% describe2()
#desired
GG_group_means(d2, "CA", groupvar = "desired_children", type = "violin") +
xlab("Desired number of children\n'How many children would you ideally like to have?'") +
ylab("Intelligence (z score)")
## Missing values were removed.
GG_save("figs/desired_kids_violin.png")
#current
GG_group_means(d2, "CA", groupvar = "has_children", type = "violin") +
xlab("Has at least one child\n'Do you have a child or children?'") +
ylab("Intelligence (z score)")
## Missing values were removed.
GG_save("figs/has_kids_violin.png")
#numbers
SMD_matrix(-d2$CA, group = d2$desired_children, extended_output = T)
## $d
## None 1-2 3-4 5 or more!
## None NA -0.113 -0.238 -0.410
## 1-2 -0.113 NA -0.125 -0.297
## 3-4 -0.238 -0.125 NA -0.172
## 5 or more! -0.410 -0.297 -0.172 NA
##
## $d_string
## None 1-2 3-4
## None NA "-0.11 [-0.15 -0.076]" "-0.24 [-0.29 -0.19]"
## 1-2 "-0.11 [-0.15 -0.076]" NA "-0.13 [-0.17 -0.081]"
## 3-4 "-0.24 [-0.29 -0.19]" "-0.13 [-0.17 -0.081]" NA
## 5 or more! "-0.41 [-0.54 -0.28]" "-0.30 [-0.42 -0.17]" "-0.17 [-0.30 -0.042]"
## 5 or more!
## None "-0.41 [-0.54 -0.28]"
## 1-2 "-0.30 [-0.42 -0.17]"
## 3-4 "-0.17 [-0.30 -0.042]"
## 5 or more! NA
##
## $CI_lower
## None 1-2 3-4 5 or more!
## None NA -0.150 -0.289 -0.538
## 1-2 -0.150 NA -0.169 -0.422
## 3-4 -0.289 -0.169 NA -0.302
## 5 or more! -0.538 -0.422 -0.302 NA
##
## $CI_upper
## None 1-2 3-4 5 or more!
## None NA -0.0757 -0.1870 -0.2826
## 1-2 -0.0757 NA -0.0809 -0.1724
## 3-4 -0.1870 -0.0809 NA -0.0424
## 5 or more! -0.2826 -0.1724 -0.0424 NA
##
## $se_z
## [1] 1.96
##
## $se
## None 1-2 3-4 5 or more!
## None NA 0.0191 0.0261 0.0651
## 1-2 0.0191 NA 0.0226 0.0637
## 3-4 0.0261 0.0226 NA 0.0661
## 5 or more! 0.0651 0.0637 0.0661 NA
##
## $p
## None 1-2 3-4 5 or more!
## None NA 3.09e-09 7.59e-20 2.98e-10
## 1-2 3.09e-09 NA 2.92e-08 3.06e-06
## 3-4 7.59e-20 2.92e-08 NA 9.27e-03
## 5 or more! 2.98e-10 3.06e-06 9.27e-03 NA
##
## $pairwise_n
## None 1-2 3-4 5 or more!
## None NA 14047 6183 4006
## 1-2 14047 NA 12724 10547
## 3-4 6183 12724 NA 2683
## 5 or more! 4006 10547 2683 NA
SMD_matrix(-d2$CA, group = d2$has_children, extended_output = T)
## $d
## No Yes
## No NA -0.39
## Yes -0.39 NA
##
## $d_string
## No Yes
## No NA "-0.39 [-0.43 -0.35]"
## Yes "-0.39 [-0.43 -0.35]" NA
##
## $CI_lower
## No Yes
## No NA -0.425
## Yes -0.425 NA
##
## $CI_upper
## No Yes
## No NA -0.354
## Yes -0.354 NA
##
## $se_z
## [1] 1.96
##
## $se
## No Yes
## No NA 0.0182
## Yes 0.0182 NA
##
## $p
## No Yes
## No NA 2.96e-101
## Yes 2.96e-101 NA
##
## $pairwise_n
## No Yes
## No NA 17554
## Yes 17554 NA
#pearson cors
d2 %>%
select(CA, desired_children, has_children) %>%
df_as_num() %>%
wtd.cors()
## CA desired_children has_children
## CA 1.0000 -0.0799 -0.160
## desired_children -0.0799 1.0000 0.149
## has_children -0.1602 0.1490 1.000
#latent cors
d2 %>%
select(CA, desired_children, has_children) %>%
df_as_num() %>%
mixedCor()
## Call: mixedCor(data = .)
## CA dsrd_ hs_ch
## CA 1.00
## desired_children -0.09 1.00
## has_children -0.22 0.22 1.00
#for 35+ year olds
d2 %>%
filter(age >= 35) %>%
select(CA, desired_children, has_children) %>%
df_as_num() %>%
mixedCor()
## Call: mixedCor(data = .)
## CA dsrd_ hs_ch
## CA 1.00
## desired_children -0.07 1.00
## has_children -0.22 0.28 1.00
d2 %>%
filter(age >= 35) %>%
nrow()
## [1] 8321
#desired numbers
desired_models = list(
basic = lrm(desired_children ~ CA, data = d2),
add_ASR = lrm(desired_children ~ CA + race + sex2 * rcs(age), data = d2),
add_sex_interaction = lrm(desired_children ~ CA*sex2 + race + sex2 * rcs(age), data = d2)
)
desired_models
## $basic
## Frequencies of Missing Values Due to Each Variable
## desired_children CA
## 1489 0
##
## Logistic Regression Model
##
## lrm(formula = desired_children ~ CA, data = d2)
##
##
## Frequencies of Responses
##
## None 1-2 3-4 5 or more!
## 3753 10294 2430 253
##
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 16730 LR chi2 102.67 R2 0.007 C 0.541
## max |deriv| 7e-13 d.f. 1 g 0.180 Dxy 0.082
## Pr(> chi2) <0.0001 gr 1.197 gamma 0.083
## gp 0.024 tau-a 0.045
## Brier 0.134
##
## Coef S.E. Wald Z Pr(>|Z|)
## y>=1-2 1.2632 0.0188 67.35 <0.0001
## y>=3-4 -1.6478 0.0211 -78.09 <0.0001
## y>=5 or more! -4.1728 0.0634 -65.86 <0.0001
## CA -0.1478 0.0146 -10.12 <0.0001
##
##
## $add_ASR
## Frequencies of Missing Values Due to Each Variable
## desired_children CA race sex2
## 1489 0 974 0
## age
## 0
##
## Logistic Regression Model
##
## lrm(formula = desired_children ~ CA + race + sex2 * rcs(age),
## data = d2)
##
##
## Frequencies of Responses
##
## None 1-2 3-4 5 or more!
## 3533 9787 2303 236
##
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 15859 LR chi2 474.08 R2 0.034 C 0.589
## max |deriv| 1e-09 d.f. 19 g 0.392 Dxy 0.178
## Pr(> chi2) <0.0001 gr 1.479 gamma 0.178
## gp 0.051 tau-a 0.097
## Brier 0.133
##
## Coef S.E. Wald Z Pr(>|Z|)
## y>=1-2 1.9175 0.4508 4.25 <0.0001
## y>=3-4 -1.0634 0.4506 -2.36 0.0183
## y>=5 or more! -3.6127 0.4547 -7.95 <0.0001
## CA -0.1392 0.0153 -9.09 <0.0001
## race=Mixed 0.2608 0.0552 4.73 <0.0001
## race=Asian 0.3863 0.0814 4.75 <0.0001
## race=Hispanic / Latin 0.4481 0.0852 5.26 <0.0001
## race=Black 0.4817 0.0873 5.52 <0.0001
## race=Other 0.1669 0.1078 1.55 0.1218
## race=Indian 0.0844 0.1677 0.50 0.6149
## race=Middle Eastern 0.5533 0.3025 1.83 0.0674
## race=Native American 0.3164 0.3223 0.98 0.3264
## race=Pacific Islander 0.6169 0.4374 1.41 0.1584
## sex2=Woman -0.4243 0.7750 -0.55 0.5841
## age -0.0235 0.0168 -1.40 0.1601
## age' 0.2149 0.1568 1.37 0.1706
## age'' -0.8834 0.5495 -1.61 0.1079
## age''' 1.0716 0.6590 1.63 0.1039
## sex2=Woman * age 0.0278 0.0291 0.96 0.3393
## sex2=Woman * age' -0.3636 0.2936 -1.24 0.2156
## sex2=Woman * age'' 0.6555 1.0871 0.60 0.5465
## sex2=Woman * age''' 0.0948 1.4063 0.07 0.9463
##
##
## $add_sex_interaction
## Frequencies of Missing Values Due to Each Variable
## desired_children CA sex2 race
## 1489 0 0 974
## age
## 0
##
## Logistic Regression Model
##
## lrm(formula = desired_children ~ CA * sex2 + race + sex2 * rcs(age),
## data = d2)
##
##
## Frequencies of Responses
##
## None 1-2 3-4 5 or more!
## 3533 9787 2303 236
##
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 15859 LR chi2 474.13 R2 0.034 C 0.589
## max |deriv| 1e-09 d.f. 20 g 0.392 Dxy 0.178
## Pr(> chi2) <0.0001 gr 1.480 gamma 0.178
## gp 0.051 tau-a 0.097
## Brier 0.133
##
## Coef S.E. Wald Z Pr(>|Z|)
## y>=1-2 1.9164 0.4508 4.25 <0.0001
## y>=3-4 -1.0644 0.4506 -2.36 0.0182
## y>=5 or more! -3.6137 0.4547 -7.95 <0.0001
## CA -0.1410 0.0174 -8.09 <0.0001
## sex2=Woman -0.4255 0.7751 -0.55 0.5830
## race=Mixed 0.2608 0.0552 4.73 <0.0001
## race=Asian 0.3863 0.0814 4.75 <0.0001
## race=Hispanic / Latin 0.4480 0.0852 5.26 <0.0001
## race=Black 0.4818 0.0873 5.52 <0.0001
## race=Other 0.1665 0.1078 1.54 0.1225
## race=Indian 0.0841 0.1677 0.50 0.6158
## race=Middle Eastern 0.5523 0.3025 1.83 0.0679
## race=Native American 0.3153 0.3224 0.98 0.3281
## race=Pacific Islander 0.6162 0.4374 1.41 0.1589
## age -0.0235 0.0168 -1.40 0.1610
## age' 0.2147 0.1568 1.37 0.1710
## age'' -0.8833 0.5495 -1.61 0.1080
## age''' 1.0718 0.6590 1.63 0.1038
## CA * sex2=Woman 0.0075 0.0359 0.21 0.8353
## sex2=Woman * age 0.0278 0.0291 0.96 0.3386
## sex2=Woman * age' -0.3642 0.2937 -1.24 0.2149
## sex2=Woman * age'' 0.6587 1.0873 0.61 0.5446
## sex2=Woman * age''' 0.0902 1.4065 0.06 0.9489
##
#plot predictions for CA
pred_data = ggeffects::new_data(desired_models$add_ASR, terms = c("CA", "sex2"), typical = c("race", "age"))
## Warning in if (fun == "median") myfun <- get("median", asNamespace("stats"))
## else if (fun == : the condition has length > 1 and only the first element will
## be used
## Warning in if (fun == "weighted.mean") myfun <- get("weighted.mean",
## asNamespace("stats")) else if (fun == : the condition has length > 1 and only
## the first element will be used
## Warning in if (fun == "mode") myfun <- get(".mode_value",
## asNamespace("ggeffects")) else if (fun == : the condition has length > 1 and
## only the first element will be used
## Warning in if (fun == "zero") return(0) else myfun <- get("mean",
## asNamespace("base")): the condition has length > 1 and only the first element
## will be used
## Warning in if (fun != "mode") out <- levels(x)[1] else out <- .mode_value(x):
## the condition has length > 1 and only the first element will be used
## Warning in if (fun == "median") myfun <- get("median", asNamespace("stats"))
## else if (fun == : the condition has length > 1 and only the first element will
## be used
## Warning in if (fun == "weighted.mean") myfun <- get("weighted.mean",
## asNamespace("stats")) else if (fun == : the condition has length > 1 and only
## the first element will be used
## Warning in if (fun == "mode") myfun <- get(".mode_value",
## asNamespace("ggeffects")) else if (fun == : the condition has length > 1 and
## only the first element will be used
## Warning in if (fun == "zero") return(0) else myfun <- get("mean",
## asNamespace("base")): the condition has length > 1 and only the first element
## will be used
pred_data = bind_cols(
pred_data,
rms:::predict.lrm(desired_models$add_ASR, newdata = pred_data, type = "fitted.ind") %>% as.data.frame()
) %>%
mutate(
id = 1:n()
) %>%
pivot_longer(cols = contains("desired_children")) %>%
mutate(
desired_children = str_match(name, "=(.+)")[, 2] %>% ordered(levels = levels(d$desired_children))
)
pred_data
#plot
pred_data %>%
#sample rows for speedup
# sample_frac(size = .01) %>%
ggplot(aes(CA, value, color = desired_children, linetype = sex2)) +
geom_line() +
scale_y_continuous("Probability", labels = scales::percent) +
scale_x_continuous("Intelligence (z-score)") +
scale_color_ordinal("Ideal number of children") +
scale_linetype_discrete("Sex")
GG_save("figs/desired_kids.png")
#predictions for race
pred_data = ggeffects::new_data(desired_models$add_ASR, terms = c("race"), typical = c("CA", "sex2", "age"))
## Warning in if (fun == "median") myfun <- get("median", asNamespace("stats"))
## else if (fun == : the condition has length > 1 and only the first element will
## be used
## Warning in if (fun == "weighted.mean") myfun <- get("weighted.mean",
## asNamespace("stats")) else if (fun == : the condition has length > 1 and only
## the first element will be used
## Warning in if (fun == "mode") myfun <- get(".mode_value",
## asNamespace("ggeffects")) else if (fun == : the condition has length > 1 and
## only the first element will be used
## Warning in if (fun == "zero") return(0) else myfun <- get("mean",
## asNamespace("base")): the condition has length > 1 and only the first element
## will be used
## Warning in if (fun == "weighted.mean") out <- do.call(myfun, args = list(x =
## x, : the condition has length > 1 and only the first element will be used
## Warning in if (fun == "median") myfun <- get("median", asNamespace("stats"))
## else if (fun == : the condition has length > 1 and only the first element will
## be used
## Warning in if (fun == "weighted.mean") myfun <- get("weighted.mean",
## asNamespace("stats")) else if (fun == : the condition has length > 1 and only
## the first element will be used
## Warning in if (fun == "mode") myfun <- get(".mode_value",
## asNamespace("ggeffects")) else if (fun == : the condition has length > 1 and
## only the first element will be used
## Warning in if (fun == "zero") return(0) else myfun <- get("mean",
## asNamespace("base")): the condition has length > 1 and only the first element
## will be used
## Warning in if (fun == "median") myfun <- get("median", asNamespace("stats"))
## else if (fun == : the condition has length > 1 and only the first element will
## be used
## Warning in if (fun == "weighted.mean") myfun <- get("weighted.mean",
## asNamespace("stats")) else if (fun == : the condition has length > 1 and only
## the first element will be used
## Warning in if (fun == "mode") myfun <- get(".mode_value",
## asNamespace("ggeffects")) else if (fun == : the condition has length > 1 and
## only the first element will be used
## Warning in if (fun == "zero") return(0) else myfun <- get("mean",
## asNamespace("base")): the condition has length > 1 and only the first element
## will be used
pred_data = bind_cols(
pred_data,
rms:::predict.lrm(desired_models$add_ASR, newdata = pred_data, type = "fitted.ind") %>% as.data.frame()
) %>%
mutate(
id = 1:n()
) %>%
pivot_longer(cols = contains("desired_children")) %>%
mutate(
desired_children = str_match(name, "=(.+)")[, 2] %>% ordered(levels = levels(d$desired_children))
)
pred_data
#plot
pred_data %>%
#sample rows for speedup
# sample_frac(size = .01) %>%
ggplot(aes(race, value, fill = desired_children)) +
geom_bar(stat = "identity") +
scale_y_continuous("Distribution", labels = scales::percent) +
scale_x_discrete(guide = guide_axis(n.dodge = 3)) +
scale_fill_ordinal("Ideal number of children")
GG_save("figs/desired_kids_race.png")
#desired numbers
has_models = list(
basic = lrm(has_children ~ CA, data = d2),
add_ASR = lrm(has_children ~ CA + race + sex2 * rcs(age), data = d2),
add_sex_interaction = lrm(has_children ~ CA*sex2 + race + sex2 * rcs(age), data = d2)
)
has_models
## $basic
## Frequencies of Missing Values Due to Each Variable
## has_children CA
## 665 0
##
## Logistic Regression Model
##
## lrm(formula = has_children ~ CA, data = d2)
##
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 17554 LR chi2 444.49 R2 0.038 C 0.611
## No 13635 d.f. 1 g 0.432 Dxy 0.222
## Yes 3919 Pr(> chi2) <0.0001 gr 1.540 gamma 0.223
## max |deriv| 2e-12 gp 0.075 tau-a 0.077
## Brier 0.169
##
## Coef S.E. Wald Z Pr(>|Z|)
## Intercept -1.2382 0.0184 -67.23 <0.0001
## CA -0.3551 0.0170 -20.93 <0.0001
##
##
## $add_ASR
## Frequencies of Missing Values Due to Each Variable
## has_children CA race sex2 age
## 665 0 974 0 0
##
## Logistic Regression Model
##
## lrm(formula = has_children ~ CA + race + sex2 * rcs(age), data = d2)
##
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 16637 LR chi2 2915.63 R2 0.245 C 0.776
## No 12892 d.f. 19 g 1.326 Dxy 0.552
## Yes 3745 Pr(> chi2) <0.0001 gr 3.766 gamma 0.552
## max |deriv| 1e-07 gp 0.191 tau-a 0.193
## Brier 0.144
##
## Coef S.E. Wald Z Pr(>|Z|)
## Intercept -10.5128 1.3214 -7.96 <0.0001
## CA -0.3589 0.0193 -18.59 <0.0001
## race=Mixed -0.0335 0.0707 -0.47 0.6353
## race=Asian -1.1707 0.1538 -7.61 <0.0001
## race=Hispanic / Latin -0.0273 0.1139 -0.24 0.8106
## race=Black 0.2118 0.1093 1.94 0.0526
## race=Other -0.0709 0.1343 -0.53 0.5977
## race=Indian -1.1109 0.3402 -3.27 0.0011
## race=Middle Eastern -0.8829 0.4524 -1.95 0.0510
## race=Native American -0.5326 0.4320 -1.23 0.2176
## race=Pacific Islander -0.8568 0.6192 -1.38 0.1664
## sex2=Woman 3.4758 1.8247 1.90 0.0568
## age 0.2831 0.0475 5.96 <0.0001
## age' -0.7981 0.3387 -2.36 0.0184
## age'' 2.5486 1.0473 2.43 0.0150
## age''' -3.0498 1.0825 -2.82 0.0048
## sex2=Woman * age -0.1037 0.0662 -1.57 0.1172
## sex2=Woman * age' 0.2866 0.5181 0.55 0.5802
## sex2=Woman * age'' -0.0474 1.6973 -0.03 0.9777
## sex2=Woman * age''' -1.2320 1.9061 -0.65 0.5180
##
##
## $add_sex_interaction
## Frequencies of Missing Values Due to Each Variable
## has_children CA sex2 race age
## 665 0 0 974 0
##
## Logistic Regression Model
##
## lrm(formula = has_children ~ CA * sex2 + race + sex2 * rcs(age),
## data = d2)
##
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 16637 LR chi2 2916.65 R2 0.245 C 0.776
## No 12892 d.f. 20 g 1.326 Dxy 0.552
## Yes 3745 Pr(> chi2) <0.0001 gr 3.767 gamma 0.552
## max |deriv| 1e-07 gp 0.191 tau-a 0.193
## Brier 0.144
##
## Coef S.E. Wald Z Pr(>|Z|)
## Intercept -10.4969 1.3210 -7.95 <0.0001
## CA -0.3484 0.0219 -15.89 <0.0001
## sex2=Woman 3.4401 1.8259 1.88 0.0596
## race=Mixed -0.0328 0.0707 -0.46 0.6423
## race=Asian -1.1714 0.1539 -7.61 <0.0001
## race=Hispanic / Latin -0.0268 0.1139 -0.24 0.8140
## race=Black 0.2123 0.1093 1.94 0.0521
## race=Other -0.0684 0.1343 -0.51 0.6103
## race=Indian -1.1102 0.3401 -3.26 0.0011
## race=Middle Eastern -0.8778 0.4523 -1.94 0.0523
## race=Native American -0.5250 0.4319 -1.22 0.2241
## race=Pacific Islander -0.8502 0.6189 -1.37 0.1695
## age 0.2825 0.0474 5.96 <0.0001
## age' -0.7958 0.3385 -2.35 0.0187
## age'' 2.5417 1.0467 2.43 0.0152
## age''' -3.0425 1.0819 -2.81 0.0049
## CA * sex2=Woman -0.0462 0.0457 -1.01 0.3124
## sex2=Woman * age -0.1029 0.0663 -1.55 0.1203
## sex2=Woman * age' 0.2884 0.5186 0.56 0.5782
## sex2=Woman * age'' -0.0584 1.6993 -0.03 0.9726
## sex2=Woman * age''' -1.2166 1.9089 -0.64 0.5239
##
#plot predictions
ggpredict(has_models$add_sex_interaction, terms = c("CA", "sex2")) %>% plot()
GG_save("figs/has_kids.png")
ggpredict(has_models$add_sex_interaction, terms = c("CA", "race")) %>%
#plot()
as.data.frame() %>%
ggplot(aes(x, predicted, color = group)) +
geom_line() +
scale_y_continuous("Probability", labels = scales::percent)
GG_save("figs/has_kids_race.png")
#versions
write_sessioninfo()
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 19.3
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_DK.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_DK.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_DK.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] ggeffects_1.1.1 mirt_1.34 rms_6.2-0
## [4] SparseM_1.81 kirkegaard_2021-09-16 rlang_0.4.11
## [7] metafor_3.0-2 Matrix_1.3-4 psych_2.1.6
## [10] magrittr_2.0.1 assertthat_0.2.1 weights_1.0.4
## [13] Hmisc_4.5-0 Formula_1.2-4 survival_3.2-13
## [16] lattice_0.20-44 forcats_0.5.1 stringr_1.4.0
## [19] dplyr_1.0.7 purrr_0.3.4 readr_2.0.1
## [22] tidyr_1.1.3 tibble_3.1.3 ggplot2_3.3.5
## [25] tidyverse_1.3.1 pacman_0.5.1
##
## loaded via a namespace (and not attached):
## [1] TH.data_1.0-10 minqa_1.2.4 colorspace_2.0-2
## [4] ellipsis_0.3.2 sjlabelled_1.1.8 htmlTable_2.2.1
## [7] base64enc_0.1-3 fs_1.5.0 rstudioapi_0.13
## [10] mice_3.13.0 farver_2.1.0 Deriv_4.1.3
## [13] MatrixModels_0.5-0 bit64_4.0.5 mvtnorm_1.1-2
## [16] fansi_0.5.0 lubridate_1.7.10 mathjaxr_1.4-0
## [19] xml2_1.3.2 codetools_0.2-18 splines_4.1.1
## [22] mnormt_2.0.2 knitr_1.33 jsonlite_1.7.2
## [25] nloptr_1.2.2.2 broom_0.7.9 cluster_2.1.2
## [28] dbplyr_2.1.1 png_0.1-7 compiler_4.1.1
## [31] httr_1.4.2 backports_1.2.1 cli_3.0.1
## [34] htmltools_0.5.1.1 quantreg_5.86 tools_4.1.1
## [37] gtable_0.3.0 glue_1.4.2 Rcpp_1.0.7
## [40] cellranger_1.1.0 jquerylib_0.1.4 vctrs_0.3.8
## [43] gdata_2.18.0 nlme_3.1-152 conquer_1.0.2
## [46] insight_0.14.3 xfun_0.25 lme4_1.1-27.1
## [49] rvest_1.0.1 lifecycle_1.0.0 dcurver_0.9.2
## [52] gtools_3.9.2 polspline_1.1.19 zoo_1.8-9
## [55] MASS_7.3-54 scales_1.1.1 vroom_1.5.4
## [58] hms_1.1.0 parallel_4.1.1 sandwich_3.0-1
## [61] RColorBrewer_1.1-2 yaml_2.2.1 gridExtra_2.3
## [64] sass_0.4.0 rpart_4.1-15 latticeExtra_0.6-29
## [67] stringi_1.7.3 highr_0.9 permute_0.9-5
## [70] checkmate_2.0.0 boot_1.3-28 pkgconfig_2.0.3
## [73] matrixStats_0.60.0 evaluate_0.14 labeling_0.4.2
## [76] htmlwidgets_1.5.3 bit_4.0.4 tidyselect_1.1.1
## [79] plyr_1.8.6 R6_2.5.1 generics_0.1.0
## [82] multcomp_1.4-17 DBI_1.1.1 mgcv_1.8-36
## [85] pillar_1.6.2 haven_2.4.3 foreign_0.8-81
## [88] withr_2.4.2 nnet_7.3-16 modelr_0.1.8
## [91] crayon_1.4.1 utf8_1.2.2 tmvnsim_1.0-2
## [94] tzdb_0.1.2 rmarkdown_2.10 jpeg_0.1-9
## [97] grid_4.1.1 readxl_1.3.1 data.table_1.14.0
## [100] vegan_2.5-7 reprex_2.0.1 digest_0.6.27
## [103] GPArotation_2014.11-1 munsell_0.5.0 viridisLite_0.4.0
## [106] bslib_0.2.5.1
#data out
write_rds(d, "data/data_for_reuse", compress = "xz")
#upload to OSF
if (F) {
library(osfr)
osf_auth(read_lines("~/.config/osf_token"))
osf_proj = osf_retrieve_node("https://osf.io/tw7cf/")
osf_upload(osf_proj, conflicts = "overwrite", path =
c(
"data/question_data.csv", "data/test_items.csv", "data/where are the data",
"figs",
"notebook.html", "notebook.Rmd",
"sessions_info.txt"
))
}