Packages and functions.
options(digits = 3)
options(contrasts=c("contr.treatment", "contr.treatment"))
library(pacman)
p_load(kirkegaard, dplyr, readr, polycor, effsize, rms)
Load data.
#okc dataset
okc = read_rds("data/parsed_data_public.rds") %>%
#we need a regular data frame bc we need rownames
as.data.frame()
#item info
questions = read_csv2("data/question_data.csv")
## Using ',' as decimal and '.' as grouping mark. Use read_delim() for more control.
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_character(),
## text = col_character(),
## option_1 = col_character(),
## option_2 = col_character(),
## option_3 = col_character(),
## option_4 = col_character(),
## N = col_integer(),
## Type = col_character(),
## Order = col_character(),
## Keywords = col_character()
## )
We need to subset the data based on the quality of the CA data. Then recode a bunch of variables for our use.
#quick item analyser
#ad hoc function to help recoding
item_info = function(item) {
#item text
question_text = questions[questions$X1 == item, "text"] %>% unlist()
#summary stats
stats = plyr::ddply(okc, item, function(x) {
data_frame(
n = nrow(x),
n_frac = n/nrow(okc),
mean_CA = wtd_mean(x$CA)
)
})
#rename
names(stats)[1] = question_text
stats
}
#recode some stuff
okc %<>% mutate(
#answered questions
questions_answered = okc %>% .[str_subset(questions$X1, "^q")] %>% is.na() %>% `!` %>% rowSums(),
questions_answered_z = standardize(questions_answered),
#rename sex
sex = gender,
#age
age = d_age,
age_z = age %>% standardize(),
#recode CAS variables
#we want them so that TRUE -> CAS, FALSE -> non-CAS
#recode to dichotomous for easy statistical treatment
arrested = (q252 == "Yes"),
prison = (q1138 == "Yes"),
punched_in_face = (q196 == "Yes"),
cheated_exam = (q400 == "Yes"),
would_tax_cheat = (q180 == "Yes"),
stole_glass_from_bar = (q59919 == "Yes."),
used_fake_id = (q22569 == "Yes"),
torture_animal_for_fun = (q19928 != "NO WAY!"),
steal_newspapers = (q39226 != "Pay for a paper and close the dispenser."),
litter = (q17017 != "Never"),
cigaret_littering = (q74381 == "No."),
hit_SO_in_anger = (q81783 == "Yes.")
)
#subset
d = okc %>% filter(
#at least 5 items for CA estimate
CA_items >= 5,
#only binary sex
#samples too small
gender %in% c("Man", "Woman")
) %>% mutate(
#restandardize CA
CA_orig = CA,
CA = CA %>% standardize()
)
#add id
d$id = 1:nrow(d)
#recode race to common combinations and independent dummies
d$d_ethnicity %>% table2()
## # A tibble: 148 x 3
## Group Count Percent
## <chr> <dbl> <dbl>
## 1 White 18261 70.738
## 2 <NA> 1741 6.744
## 3 Asian 958 3.711
## 4 Hispanic / Latin 779 3.018
## 5 Black 775 3.002
## 6 Other 606 2.347
## 7 Hispanic / Latin, White 454 1.759
## 8 White, Other 292 1.131
## 9 Native American, White 244 0.945
## 10 Indian 226 0.875
## # ... with 138 more rows
#lots!
#explicit NAs
d$ethnicities = plyr::mapvalues(d$d_ethnicity, NA, "NA")
#common combos
d$SIRE_CC = fct_lump(d$ethnicities,
#preserve groups with >= 50 persons
n = d$d_ethnicity %>% table2() %>% .$Count %>% `>=`(100) %>% sum(),
other_level = "Other_combos") %>%
#relevel in order of freq
fct_infreq()
d$SIRE_CC %>% table2() %>% write_clipboard()
## Group Count Percent
## 1 White 18261.00 70.74
## 2 1741.00 6.74
## 3 Other_combos 1197.00 4.64
## 4 Asian 958.00 3.71
## 5 Hispanic / Latin 779.00 3.02
## 6 Black 775.00 3.00
## 7 Other 606.00 2.35
## 8 Hispanic / Latin, White 454.00 1.76
## 9 White, Other 292.00 1.13
## 10 Native American, White 244.00 0.95
## 11 Indian 226.00 0.88
## 12 Asian, White 175.00 0.68
## 13 Black, White 107.00 0.41
## 14 0.00 0.00
#independent dummies
d$White = str_detect(d$d_ethnicity, "White")
d$Black = str_detect(d$d_ethnicity, "Black")
d$Asian = str_detect(d$d_ethnicity, "Asian")
d$Hispanic = str_detect(d$d_ethnicity, "Hispanic")
d$Native_American = str_detect(d$d_ethnicity, "Native American")
d$Indian = str_detect(d$d_ethnicity, "Indian")
d$Middle_Eastern = str_detect(d$d_ethnicity, "Middle Eastern")
d$Pacific_Islander = str_detect(d$d_ethnicity, "Pacific Islander")
d$Other = str_detect(d$d_ethnicity, "Other")
d$Multi_SIRE = str_detect(d$d_ethnicity, ",")
SIRE_dummies = c("White", "Black", "Asian", "Hispanic", "Native_American", "Indian", "Middle_Eastern", "Pacific_Islander", "Other", "Multi_SIRE")
###var lists
#all CAS vars
CAS_vars = c("arrested",
"prison",
"punched_in_face",
"cheated_exam",
"would_tax_cheat",
"stole_glass_from_bar",
"used_fake_id",
"torture_animal_for_fun",
"steal_newspapers",
"litter",
"cigaret_littering",
"hit_SO_in_anger")
#also as names
#so map functions will automatically name stuff in lists
names(CAS_vars) = CAS_vars
#data subsets for ease of use
#all vars of interest
vars = c("CA",
"gender",
"race",
CAS_vars)
#CAS vars
CAS_full = okc[CAS_vars]
CAS = d[CAS_vars]
# tables ------------------------------------------------------------------
#pairwise cases
d[vars] %>% count.pairwise() %>% MAT_half() %>% describe()
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 105 7737 5703 7394 7046 6426 734 25815 25081 1.08 1.17
## se
## X1 557
#tables
d$race %>% table2()
## # A tibble: 11 x 3
## Group Count Percent
## <chr> <dbl> <dbl>
## 1 White 18261 70.738
## 2 Mixed 2311 8.952
## 3 <NA> 1741 6.744
## 4 Asian 958 3.711
## 5 Hispanic / Latin 779 3.018
## 6 Black 775 3.002
## 7 Other 606 2.347
## 8 Indian 226 0.875
## 9 Middle Eastern 82 0.318
## 10 Native American 39 0.151
## 11 Pacific Islander 37 0.143
#sex/gender
d$sex %>% table2()
## # A tibble: 3 x 3
## Group Count Percent
## <chr> <dbl> <dbl>
## 1 Man 18129 70.2
## 2 Woman 7686 29.8
## 3 <NA> 0 0.0
okc$sex %>% table2()
## # A tibble: 4 x 3
## Group Count Percent
## <chr> <dbl> <dbl>
## 1 Man 40215 58.82
## 2 Woman 25952 37.96
## 3 <NA> 2006 2.93
## 4 Other 198 0.29
okc$d_gender %>% table2()
## # A tibble: 108 x 3
## Group Count Percent
## <chr> <dbl> <dbl>
## 1 Man 40215 58.81880
## 2 Woman 25952 37.95761
## 3 <NA> 2006 2.93399
## 4 Cis Man 19 0.02779
## 5 Cis Woman 18 0.02633
## 6 Genderqueer 7 0.01024
## 7 Trans Woman 7 0.01024
## 8 Woman, Cis Woman 7 0.01024
## 9 Genderfluid 6 0.00878
## 10 Man, Cis Man 6 0.00878
## # ... with 98 more rows
okc$d_gender %>% table2() %>% .$Percent %>% .[c(1:2, 4:5)] %>% sum()
## [1] 96.8
okc$d_gender %>% table2() %>% .$Percent %>% .[-c(1:5)] %>% sum()
## [1] 0.235
#sexual orientation x sex
d$gender_orientation %<>% plyr::mapvalues(c("Gay_male", "Gay_female", "Hetero_male", "Hetero_female"), c("Homosexual_male", "Homosexual_female", "Heterosexual_male", "Heterosexual_female"))
d$gender_orientation %<>% factor() %>% fct_relevel(c("Heterosexual_male", "Bisexual_male", "Homosexual_male", "Homosexual_female", "Bisexual_female", "Heterosexual_female"))
d$gender_orientation_ord = ordered(d$gender_orientation)
d$gender_orientation_num = as.numeric(d$gender_orientation)
d$gender_orientation %>% table2() %>% write_clipboard()
## Group Count Percent
## 1 Heterosexual_male 16249.00 62.94
## 2 Heterosexual_female 5859.00 22.70
## 3 Bisexual_female 1426.00 5.52
## 4 Homosexual_male 1236.00 4.79
## 5 Bisexual_male 548.00 2.12
## 6 Homosexual_female 301.00 1.17
## 7 196.00 0.76
#age
d$age %>% describe()
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 25815 33.7 7.59 33 33.1 7.41 18 100 82 0.94 2.02
## se
## X1 0.05
okc$age %>% describe()
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 66365 31.7 7.8 30 31 7.41 18 100 82 0.98 1.95
## se
## X1 0.03
#cognitive ability
d$CA %>% describe()
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 25815 0 1 0.2 0.06 1 -3.58 2.27 5.85 -0.54 -0.22
## se
## X1 0.01
d$CA_orig %>% describe()
## vars n mean sd median trimmed mad min max range skew
## X1 1 25815 0.56 0.61 0.68 0.6 0.61 -1.63 1.95 3.58 -0.54
## kurtosis se
## X1 -0.22 0
GG_denhist(d, "CA") +
scale_x_continuous("Cognitive ability")
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
GG_save("figures/CA_denhist.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#CAS
#sample sizes by item
d %>% plyr::ddply("sex", function(x) {
x %>% extract(CAS_vars) %>%
map_int(count_NA, reverse = T)
}) %>%
write_clipboard()
## Sex Arrested Prison Punched in face Cheated exam Would tax cheat
## 1 Man 6867 9056 9783 3057 8522
## 2 Woman 2786 3280 4001 1022 3206
## Stole glass from bar Used fake id Torture animal for fun
## 1 9122 7767 6523
## 2 3504 3044 2466
## Steal newspapers Litter Cigaret littering Hit SO in anger
## 1 1239 16579 10891 2196
## 2 488 6677 4014 680
#rate
map_df(okc[CAS_vars], function(x) {
x = na.omit(x) %>% as.numeric()
#2 options
data_frame(n = length(x),
proportion = mean(x)
)
}) %>% cbind(CAS_vars, .) %>%
write_clipboard()
## CAS vars N Proportion
## Arrested arrested 12456 0.24
## Prison prison 15236 0.02
## Punched in face punched_in_face 18128 0.34
## Cheated exam cheated_exam 5155 0.40
## Would tax cheat would_tax_cheat 15581 0.42
## Stole glass from bar stole_glass_from_bar 16747 0.44
## Used fake id used_fake_id 14067 0.28
## Torture animal for fun torture_animal_for_fun 10725 0.03
## Steal newspapers steal_newspapers 2111 0.43
## Litter litter 35221 0.31
## Cigaret littering cigaret_littering 18983 0.11
## Hit SO in anger hit_SO_in_anger 3405 0.03
#CA by sex
d %$% describeBy(CA, sex)
##
## Descriptive statistics by group
## group: Man
## vars n mean sd median trimmed mad min max range skew
## X1 1 18129 0.01 1.02 0.19 0.07 1.08 -3.58 2.27 5.85 -0.52
## kurtosis se
## X1 -0.31 0.01
## --------------------------------------------------------
## group: Woman
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 7686 -0.02 0.95 0.21 0.05 0.93 -3.57 2.2 5.77 -0.6 0
## se
## X1 0.01
#countries / State
okc$d_country %>% table2()
## # A tibble: 203 x 3
## Group Count Percent
## <chr> <dbl> <dbl>
## 1 UK 8438 12.34
## 2 NY 7989 11.68
## 3 CA 7145 10.45
## 4 TX 2767 4.05
## 5 <NA> 2006 2.93
## 6 FL 1760 2.57
## 7 Australia 1678 2.45
## 8 NJ 1662 2.43
## 9 IL 1559 2.28
## 10 PA 1533 2.24
## # ... with 193 more rows
#USA total
(okc$d_country %>% str_detect("^..$") %>% sum(na.rm=T)) / nrow(okc)
## [1] 0.772
#Canada
(okc$d_country %in% c("Ontario", "British Columbia", "Alberta", "Quebec", "Manitoba", "Nova Scotia", "Saskatchewan", "New Brunswick", "Newfoundland and Labrador", "Prince Edward Island") %>% sum(na.rm=T)) / nrow(okc)
## [1] 0.032
#overall missing
miss_amount(okc)
## cases with missing data vars with missing data cells with missing data
## 1.000 1.000 0.767
sum(is.na(okc))
## [1] 138556616
sum(!is.na(okc))
## [1] 42079566
sum(!is.na(okc)) / (sum(is.na(okc)) + sum(!is.na(okc)))
## [1] 0.233
Compute group means for each outcome.
#TODO: bug fix the below
group_means = map_df(CAS_vars, .f = function(x) {
#compute SMD for CAS vs. non-CAS
.d = SMD_matrix(d$CA, d[[x]], extended_output = T)
#return as df
data_frame(d = -.d$d[2, 1],
d_lower = -.d$CI_lower[2, 1],
d_upper = -.d$CI_upper[2, 1],
n = .d$pairwise_n[2, 1]
)
}) %>%
cbind(CAS_vars, .) %>%
write_clipboard()
## CAS vars D D lower D upper N
## Arrested arrested -0.24 -0.19 -0.28 9653
## Prison prison -0.34 -0.19 -0.48 12336
## Punched in face punched_in_face -0.26 -0.23 -0.30 13784
## Cheated exam cheated_exam -0.13 -0.07 -0.20 4079
## Would tax cheat would_tax_cheat -0.15 -0.11 -0.18 11728
## Stole glass from bar stole_glass_from_bar -0.03 0.01 -0.06 12626
## Used fake id used_fake_id 0.02 0.06 -0.02 10811
## Torture animal for fun torture_animal_for_fun -0.35 -0.23 -0.48 8989
## Steal newspapers steal_newspapers -0.26 -0.16 -0.35 1727
## Litter litter -0.38 -0.35 -0.41 23256
## Cigaret littering cigaret_littering -0.49 -0.44 -0.55 14905
## Hit SO in anger hit_SO_in_anger -0.47 -0.24 -0.70 2876
Not so straightforward. Multiple issues:
Solution: method variation and cross-validation. Do different methods agree?
First we just try looking at the item intercorrelations.
# correlations ------------------------------------------------------------
#correlation matrix
crime_cors = tetrachoric(CAS)$rho
crime_cors_full = tetrachoric(CAS_full)$rho
#output
crime_cors %>%
write_clipboard()
## Arrested Prison Punched in face Cheated exam
## Arrested 1.00 0.72 0.45 0.32
## Prison 0.72 1.00 0.19 0.27
## Punched in face 0.45 0.19 1.00 0.24
## Cheated exam 0.32 0.27 0.24 1.00
## Would tax cheat 0.11 0.05 0.17 0.44
## Stole glass from bar 0.33 0.19 0.25 0.35
## Used fake id 0.37 0.12 0.29 0.32
## Torture animal for fun 0.19 0.31 0.11 0.15
## Steal newspapers 0.24 0.17 0.19 0.57
## Litter 0.18 0.11 0.19 0.26
## Cigaret littering 0.24 0.11 0.19 0.34
## Hit SO in anger 0.07 0.17 0.31 0.11
## Would tax cheat Stole glass from bar Used fake id
## Arrested 0.11 0.33 0.37
## Prison 0.05 0.19 0.12
## Punched in face 0.17 0.25 0.29
## Cheated exam 0.44 0.35 0.32
## Would tax cheat 1.00 0.25 0.25
## Stole glass from bar 0.25 1.00 0.49
## Used fake id 0.25 0.49 1.00
## Torture animal for fun 0.22 0.03 0.04
## Steal newspapers 0.37 0.34 0.17
## Litter 0.21 0.07 0.02
## Cigaret littering 0.09 0.13 0.10
## Hit SO in anger 0.19 0.05 0.21
## Torture animal for fun Steal newspapers Litter
## Arrested 0.19 0.24 0.18
## Prison 0.31 0.17 0.11
## Punched in face 0.11 0.19 0.19
## Cheated exam 0.15 0.57 0.26
## Would tax cheat 0.22 0.37 0.21
## Stole glass from bar 0.03 0.34 0.07
## Used fake id 0.04 0.17 0.02
## Torture animal for fun 1.00 0.19 0.19
## Steal newspapers 0.19 1.00 0.33
## Litter 0.19 0.33 1.00
## Cigaret littering 0.09 0.27 0.40
## Hit SO in anger 0.28 0.03 0.20
## Cigaret littering Hit SO in anger
## Arrested 0.24 0.07
## Prison 0.11 0.17
## Punched in face 0.19 0.31
## Cheated exam 0.34 0.11
## Would tax cheat 0.09 0.19
## Stole glass from bar 0.13 0.05
## Used fake id 0.10 0.21
## Torture animal for fun 0.09 0.28
## Steal newspapers 0.27 0.03
## Litter 0.40 0.20
## Cigaret littering 1.00 0.18
## Hit SO in anger 0.18 1.00
crime_cors_full %>%
write_clipboard()
## Arrested Prison Punched in face Cheated exam
## Arrested 1.00 0.73 0.47 0.32
## Prison 0.73 1.00 0.21 0.30
## Punched in face 0.47 0.21 1.00 0.25
## Cheated exam 0.32 0.30 0.25 1.00
## Would tax cheat 0.14 0.10 0.16 0.44
## Stole glass from bar 0.35 0.19 0.25 0.36
## Used fake id 0.36 0.13 0.30 0.33
## Torture animal for fun 0.25 0.36 0.13 0.18
## Steal newspapers 0.25 0.14 0.19 0.57
## Litter 0.20 0.15 0.20 0.25
## Cigaret littering 0.28 0.15 0.23 0.30
## Hit SO in anger 0.10 0.12 0.30 0.10
## Would tax cheat Stole glass from bar Used fake id
## Arrested 0.14 0.35 0.36
## Prison 0.10 0.19 0.13
## Punched in face 0.16 0.25 0.30
## Cheated exam 0.44 0.36 0.33
## Would tax cheat 1.00 0.27 0.27
## Stole glass from bar 0.27 1.00 0.49
## Used fake id 0.27 0.49 1.00
## Torture animal for fun 0.23 0.03 0.06
## Steal newspapers 0.38 0.37 0.21
## Litter 0.20 0.07 0.03
## Cigaret littering 0.09 0.12 0.12
## Hit SO in anger 0.24 0.07 0.20
## Torture animal for fun Steal newspapers Litter
## Arrested 0.25 0.25 0.20
## Prison 0.36 0.14 0.15
## Punched in face 0.13 0.19 0.20
## Cheated exam 0.18 0.57 0.25
## Would tax cheat 0.23 0.38 0.20
## Stole glass from bar 0.03 0.37 0.07
## Used fake id 0.06 0.21 0.03
## Torture animal for fun 1.00 0.18 0.19
## Steal newspapers 0.18 1.00 0.33
## Litter 0.19 0.33 1.00
## Cigaret littering 0.14 0.29 0.42
## Hit SO in anger 0.34 0.10 0.20
## Cigaret littering Hit SO in anger
## Arrested 0.28 0.10
## Prison 0.15 0.12
## Punched in face 0.23 0.30
## Cheated exam 0.30 0.10
## Would tax cheat 0.09 0.24
## Stole glass from bar 0.12 0.07
## Used fake id 0.12 0.20
## Torture animal for fun 0.14 0.34
## Steal newspapers 0.29 0.10
## Litter 0.42 0.20
## Cigaret littering 1.00 0.16
## Hit SO in anger 0.16 1.00
#averages
crime_cors %>% MAT_half() %>% describe() %>% as.data.frame()
## vars n mean sd median trimmed mad min max range skew
## X1 1 66 0.224 0.133 0.194 0.213 0.122 0.023 0.723 0.7 1.06
## kurtosis se
## X1 1.82 0.0163
crime_cors_full %>% MAT_half() %>% describe() %>% as.data.frame()
## vars n mean sd median trimmed mad min max range skew
## X1 1 66 0.237 0.131 0.21 0.225 0.127 0.0269 0.731 0.704 1.08
## kurtosis se
## X1 1.84 0.0161
#compare full to subset data
#differences in correlations
(crime_cors - crime_cors_full) %>% MAT_half() %>% GG_denhist()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
(crime_cors - crime_cors_full) %>% MAT_half() %>% describe()
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 66 -0.01 0.02 -0.01 -0.01 0.02 -0.06 0.05 0.11 0.04 0.2
## se
## X1 0
Now we come to the question of how we should aggregate the CAS items into a single score. It’s not quite obvious the best way to do this. First we examine the pattern of missing data.
#missing data distribution
miss_plot(CAS)
miss_plot(CAS, reverse = T)
GG_save("figures/missing_data_tradeoff.png")
miss_plot(CAS, reverse = T, case = F)
#cases by var
n_by_var = map_int(CAS_vars, ~d[[.]] %>% count_NA(reverse = T)) %>% set_names(CAS_vars)
#complete cases by scale length
l_complete_data = map(seq_along(CAS), function(x) {
CAS[, names(n_by_var)[1:x], drop = F] %>% na.omit()
})
#cases number with at least x
l_atleast_data = map(seq_along(CAS), function(x) {
CAS %>% miss_filter(missing = x, reverse = T)
})
#output
data_frame(casewise_complete = map_int(l_complete_data, nrow) %>% set_names(seq_along(CAS)),
fractional_complete = map_int(l_atleast_data, nrow) %>% set_names(seq_along(CAS))
) %>%
write_clipboard()
## Casewise complete Fractional complete
## 1 9653 24514
## 2 6320 21672
## 3 5126 19090
## 4 2163 16779
## 5 1942 13830
## 6 1855 10783
## 7 1798 7882
## 8 1670 5066
## 9 864 3399
## 10 847 2148
## 11 831 1148
## 12 459 459
#standard error of correlation
se_r = data.frame(n = 3:40000)
se_r$se_r = map_dbl(se_r$n, function(n) {sqrt((1-.15^2)/(n-2))})
ggplot(se_r, aes(n, se_r)) +
geom_line() +
scale_y_log10(breaks = helper_breaks(10^(-5:0)), name = "Standard error of correlation") +
scale_x_continuous(breaks = seq(0, 40000, by = 5000)) +
theme_bw()
GG_save("figures/se_r.png")
#subset to desired samples re. missing data
CAS2 = l_atleast_data[[7]]
#type variants
#some functions want specific types
CAS2_num = CAS2 %>% df_colFunc(as.numeric)
CAS2_lgl = CAS2 %>% df_colFunc(as.logical)
CAS2_fct = CAS2 %>% df_colFunc(as.factor)
First, we create an imputed version of the data.
#impute
#change items to logical for VIM
#then change back for psych
#impute
CAS2_imp = CAS2_fct %>% miss_impute() %>% map(~as.numeric(.) - 1) %>% as.data.frame()
#assert no missing
assert_that(sum(is.na(CAS2_imp)) == 0)
## [1] TRUE
#compare correlations with unimputed data
CAS2_cors = psych::tetrachoric(CAS2_imp)$rho
CAS2_imp_cors = psych::tetrachoric(CAS2)$rho
#describe
CAS2_cors %>% MAT_half() %>% describe()
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 66 0.26 0.19 0.22 0.25 0.19 -0.04 0.99 1.03 1.08 1.72
## se
## X1 0.02
CAS2_imp_cors %>% MAT_half() %>% describe()
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 66 0.22 0.14 0.2 0.21 0.14 0 0.74 0.74 1 1.93 0.02
#differences
CAS2_imp_cors - CAS2_cors
## arrested prison punched_in_face cheated_exam
## arrested 0.00000 -0.00533 -0.06158 -0.2270
## prison -0.00533 0.00000 0.00984 -0.0527
## punched_in_face -0.06158 0.00984 0.00000 -0.1184
## cheated_exam -0.22696 -0.05268 -0.11841 0.0000
## would_tax_cheat -0.08937 -0.04049 -0.04813 -0.2041
## stole_glass_from_bar -0.04266 -0.00601 -0.04222 -0.0559
## used_fake_id -0.08093 -0.03093 -0.03196 -0.1163
## torture_animal_for_fun 0.03568 0.02939 0.00679 -0.0432
## steal_newspapers -0.27553 -0.13448 -0.18082 -0.4203
## litter -0.00475 0.00939 -0.00663 -0.1176
## cigaret_littering -0.01596 0.02737 -0.00969 -0.0873
## hit_SO_in_anger 0.01152 -0.02858 0.04572 0.0832
## would_tax_cheat stole_glass_from_bar used_fake_id
## arrested -0.0894 -0.04266 -0.080927
## prison -0.0405 -0.00601 -0.030933
## punched_in_face -0.0481 -0.04222 -0.031958
## cheated_exam -0.2041 -0.05586 -0.116269
## would_tax_cheat 0.0000 -0.04696 -0.018983
## stole_glass_from_bar -0.0470 0.00000 -0.042415
## used_fake_id -0.0190 -0.04242 0.000000
## torture_animal_for_fun 0.0240 0.00226 0.027054
## steal_newspapers -0.2384 -0.05607 -0.232859
## litter -0.0324 -0.00787 -0.000533
## cigaret_littering -0.0353 -0.01301 0.004859
## hit_SO_in_anger 0.0789 0.02829 0.097989
## torture_animal_for_fun steal_newspapers litter
## arrested 0.03568 -0.2755 -0.004750
## prison 0.02939 -0.1345 0.009388
## punched_in_face 0.00679 -0.1808 -0.006628
## cheated_exam -0.04318 -0.4203 -0.117557
## would_tax_cheat 0.02400 -0.2384 -0.032391
## stole_glass_from_bar 0.00226 -0.0561 -0.007875
## used_fake_id 0.02705 -0.2329 -0.000533
## torture_animal_for_fun 0.00000 0.0144 0.016487
## steal_newspapers 0.01443 0.0000 -0.079137
## litter 0.01649 -0.0791 0.000000
## cigaret_littering 0.04342 -0.1476 -0.001534
## hit_SO_in_anger 0.03578 0.0464 0.053359
## cigaret_littering hit_SO_in_anger
## arrested -0.01596 0.0115
## prison 0.02737 -0.0286
## punched_in_face -0.00969 0.0457
## cheated_exam -0.08730 0.0832
## would_tax_cheat -0.03529 0.0789
## stole_glass_from_bar -0.01301 0.0283
## used_fake_id 0.00486 0.0980
## torture_animal_for_fun 0.04342 0.0358
## steal_newspapers -0.14761 0.0464
## litter -0.00153 0.0534
## cigaret_littering 0.00000 0.1112
## hit_SO_in_anger 0.11120 0.0000
(CAS2_imp_cors - CAS2_cors) %>% MAT_half() %>% describe()
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 66 -0.04 0.09 -0.02 -0.03 0.06 -0.42 0.11 0.53 -1.52 3.06
## se
## X1 0.01
We try IRT scoring based on the dataset with missing data.
#factor analyze
(CAS2_irt = irt.fa(CAS2_num))
## Item Response Analysis using Factor Analysis
##
## Call: irt.fa(x = CAS2_num)
## Item Response Analysis using Factor Analysis
##
## Summary information by factor and item
## Factor = 1
## -3 -2 -1 0 1 2 3
## arrested 0.01 0.03 0.11 0.30 0.48 0.31 0.11
## prison 0.01 0.02 0.03 0.07 0.12 0.17 0.16
## punched_in_face 0.03 0.07 0.14 0.22 0.23 0.15 0.07
## cheated_exam 0.01 0.06 0.22 0.51 0.46 0.17 0.04
## would_tax_cheat 0.04 0.08 0.13 0.17 0.16 0.11 0.06
## stole_glass_from_bar 0.03 0.08 0.17 0.26 0.24 0.14 0.06
## used_fake_id 0.02 0.05 0.11 0.19 0.23 0.17 0.09
## torture_animal_for_fun 0.02 0.03 0.04 0.06 0.07 0.08 0.07
## steal_newspapers 0.03 0.08 0.20 0.32 0.28 0.14 0.05
## litter 0.03 0.06 0.10 0.13 0.13 0.10 0.07
## cigaret_littering 0.02 0.04 0.07 0.11 0.15 0.15 0.11
## hit_SO_in_anger 0.02 0.03 0.04 0.06 0.07 0.07 0.07
## Test Info 0.26 0.61 1.36 2.41 2.62 1.77 0.97
## SEM 1.96 1.28 0.86 0.64 0.62 0.75 1.01
## Reliability -2.83 -0.64 0.26 0.58 0.62 0.43 -0.03
##
## Factor analysis with Call: fa(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate,
## fm = fm)
##
## Test of the hypothesis that 1 factor is sufficient.
## The degrees of freedom for the model is 54 and the objective function was 1.81
## The number of observations was 7882 with Chi Square = 14218 with prob < 0
##
## The root mean square of the residuals (RMSA) is 0.12
## The df corrected root mean square of the residuals is 0.13
##
## Tucker Lewis Index of factoring reliability = 0.374
## RMSEA index = 0.182 and the 10 % confidence intervals are 0.18 0.185
## BIC = 13733
#loadings
fa_plot_loadings(CAS2_irt$fa) +
scale_y_continuous(breaks = seq(0, 1, .1))
GG_save("figures/irt_loadings.png")
#score
CAS2_scores = scoreIrt(CAS2_irt, items = CAS2_num)
CAS2_scores$crime_score = CAS2_scores$theta1 %>% standardize()
CAS2_scores$id = rownames(CAS2) %>% as.integer()
#distribution of crime scores
GG_denhist(CAS2_scores, "crime_score") +
scale_x_continuous(name = "General CAS score - irt")
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
GG_save("figures/crime_score_dist.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
What if we impute the missing data beforehand? Maybe the odd results are due to incorrect handling of missing data, so we impute the data beforehand.
#factor analyze again
(CAS2_imp_irt = irt.fa(CAS2_imp, sort = F))
## Item Response Analysis using Factor Analysis
##
## Call: irt.fa(x = CAS2_imp, sort = F)
## Item Response Analysis using Factor Analysis
##
## Summary information by factor and item
## Factor = 1
## -3 -2 -1 0 1 2 3
## arrested 0.00 0.02 0.08 0.30 0.62 0.42 0.12
## prison 0.01 0.02 0.03 0.06 0.10 0.14 0.14
## punched_in_face 0.03 0.06 0.13 0.22 0.23 0.15 0.08
## cheated_exam 0.00 0.00 0.00 0.08 2.85 0.89 0.02
## would_tax_cheat 0.03 0.07 0.18 0.30 0.28 0.15 0.06
## stole_glass_from_bar 0.03 0.07 0.15 0.23 0.23 0.14 0.07
## used_fake_id 0.02 0.05 0.11 0.21 0.26 0.20 0.10
## steal_newspapers 0.00 0.00 0.01 0.19 3.05 0.56 0.02
## litter 0.03 0.06 0.10 0.12 0.13 0.10 0.07
## cigaret_littering 0.02 0.04 0.07 0.11 0.14 0.14 0.11
## Test Info 0.16 0.39 0.86 1.82 7.88 2.90 0.77
## SEM 2.46 1.60 1.08 0.74 0.36 0.59 1.14
## Reliability -5.06 -1.57 -0.17 0.45 0.87 0.65 -0.29
##
## Factor analysis with Call: fa(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate,
## fm = fm)
##
## Test of the hypothesis that 1 factor is sufficient.
## The degrees of freedom for the model is 54 and the objective function was 4.26
## The number of observations was 7882 with Chi Square = 33566 with prob < 0
##
## The root mean square of the residuals (RMSA) is 0.12
## The df corrected root mean square of the residuals is 0.13
##
## Tucker Lewis Index of factoring reliability = 0.355
## RMSEA index = 0.281 and the 10 % confidence intervals are 0.278 0.283
## BIC = 33082
#loadings
fa_plot_loadings(CAS2_imp_irt$fa)
#compare loadings
CAS2_irts = list('without pre-imputation' = CAS2_irt$fa,
'with pre-imputation' = CAS2_imp_irt$fa)
fa_congruence_matrix(CAS2_irts)
## MR1 MR1
## MR1 1.00 0.97
## MR1 0.97 1.00
fa_plot_loadings(CAS2_irts)
#score
CAS2_imp_scores = scoreIrt(CAS2_imp_irt, items = CAS2_imp)
CAS2_imp_scores$crime_score_imp = CAS2_imp_scores$theta1 %>% standardize()
CAS2_imp_scores$id = rownames(CAS2) %>% as.integer()
#distribution of crime scores
GG_denhist(CAS2_imp_scores, "crime_score_imp") +
xlab("General CAS score (from imputed data)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Compare scoring methods.
#compare scores
score_comparison = data_frame(crime_sum_imp = rowSums(CAS2_imp),
crime_score = CAS2_scores$crime_score,
crime_score_imp = CAS2_imp_scores$crime_score,
CAS2_complete = miss_by_case(CAS2, reverse = T),
id = rownames(CAS2) %>% as.integer()
)
#assert all ids in subset in complete sample
assert_that(all(score_comparison$id %in% d$id))
## [1] TRUE
#join to main
d = full_join(score_comparison, d, by = "id")
#correlations
wtd.cors(d[c("crime_sum_imp", "crime_score", "crime_score_imp", "CA", "CAS2_complete")]) %>%
write_clipboard()
## Crime sum imp Crime score Crime score imp CA
## Crime sum imp 1.00 0.90 0.94 -0.18
## Crime score 0.90 1.00 0.82 -0.13
## Crime score imp 0.94 0.82 1.00 -0.16
## CA -0.18 -0.13 -0.16 1.00
## CAS2 complete 0.04 0.05 0.09 -0.03
## CAS2 complete
## Crime sum imp 0.04
## Crime score 0.05
## Crime score imp 0.09
## CA -0.03
## CAS2 complete 1.00
#CA and crime
GG_scatter(d, "CA", "crime_sum_imp")
d$crime_sum_imp_fct = d$crime_sum_imp %>% factor() %>% fct_relevel(0:11)
GG_group_means(d, "CA", "crime_sum_imp_fct") +
scale_x_discrete("Simple criminality and anti-social score (out of 12)") +
scale_y_continuous("Cognitive ability (z-score)")
## Missing values were removed.
## Warning in qt(1 - ((1 - CI)/2), df = as.numeric(x[4]) - 1): NaNs produced
## Warning in qt(1 - ((1 - CI)/2), df = as.numeric(x[4]) - 1): NaNs produced
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## Warning: Removed 2 rows containing missing values (geom_errorbar).
GG_save("figures/crime_sum_ca.png")
## Warning: Removed 2 rows containing missing values (geom_errorbar).
#table
table2(d$crime_sum_imp_fct)
## # A tibble: 13 x 3
## Group Count Percent
## <chr> <dbl> <dbl>
## 1 <NA> 17933 69.46736
## 2 1 1782 6.90296
## 3 0 1678 6.50010
## 4 2 1215 4.70657
## 5 3 833 3.22681
## 6 4 812 3.14546
## 7 5 625 2.42107
## 8 6 460 1.78191
## 9 7 304 1.17761
## 10 8 135 0.52295
## 11 9 36 0.13945
## 12 10 1 0.00387
## 13 11 1 0.00387
Now we try to see if the relation is due to some obvious confound.
#fit models
ols_fits = list(
primary = rms::ols(MOD_add_str_to_formula(crime_score_imp ~ CA + gender_orientation + rcs(age), SIRE_dummies), data = d),
SIRE_CC = rms::ols(crime_score_imp ~ CA + gender_orientation + rcs(age) + SIRE_CC, data = d),
whites = rms::ols(crime_score_imp ~ CA + gender_orientation + rcs(age), data = d %>% dplyr::filter(race == "White")),
ca_gender_interaction = rms::ols(MOD_add_str_to_formula(crime_score_imp ~ CA * gender_orientation + rcs(age), SIRE_dummies), data = d),
no_low_CA = rms::ols(MOD_add_str_to_formula(crime_score_imp ~ CA + gender_orientation + rcs(age), SIRE_dummies), data = d %>% dplyr::filter(CA > -2)),
no_low_CA_gender_interaction = rms::ols(MOD_add_str_to_formula(crime_score_imp ~ CA * gender_orientation + rcs(age), SIRE_dummies), data = d %>% dplyr::filter(CA > -2)),
nonlinear_ca = rms::ols(MOD_add_str_to_formula(crime_score_imp ~ rcs(CA) * gender_orientation + rcs(age), SIRE_dummies), data = d %>% dplyr::filter(CA > -2)),
#alternative GSO codings
ordinal_GSO = rms::ols(MOD_add_str_to_formula(crime_score_imp ~ CA * gender_orientation_ord + rcs(age), SIRE_dummies), data = d %>% dplyr::filter(CA > -2)),
numeric_GSO = rms::ols(MOD_add_str_to_formula(crime_score_imp ~ CA * gender_orientation_num + rcs(age), SIRE_dummies), data = d %>% dplyr::filter(CA > -2)),
#baselines
demo_only = rms::ols(MOD_add_str_to_formula(crime_score_imp ~ 1, SIRE_dummies), data = d),
demo_only2 = rms::ols(crime_score_imp ~ SIRE_CC, data = d),
gender_orientation = rms::ols(crime_score_imp ~ gender_orientation, data = d)
)
ols_fits
## $primary
## Frequencies of Missing Values Due to Each Variable
## crime_score_imp CA gender_orientation
## 17933 0 196
## age White Black
## 0 1741 1741
## Asian Hispanic Native_American
## 1741 1741 1741
## Indian Middle_Eastern Pacific_Islander
## 1741 1741 1741
## Other Multi_SIRE
## 1741 1741
##
## Linear Regression Model
##
## rms::ols(formula = MOD_add_str_to_formula(crime_score_imp ~ CA +
## gender_orientation + rcs(age), SIRE_dummies), data = d)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 7410 LR chi2 323.19 R2 0.043
## sigma0.9790 d.f. 20 R2 adj 0.040
## d.f. 7389 Pr(> chi2) 0.0000 g 0.232
##
## Residuals
##
## Min 1Q Median 3Q Max
## -1.7683 -0.8129 -0.3335 1.0633 2.2378
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 0.6253 0.3021 2.07 0.0385
## CA -0.1530 0.0107 -14.25 <0.0001
## gender_orientation=Bisexual_male 0.0565 0.0792 0.71 0.4756
## gender_orientation=Homosexual_male -0.0853 0.0557 -1.53 0.1257
## gender_orientation=Homosexual_female -0.0956 0.1076 -0.89 0.3745
## gender_orientation=Bisexual_female -0.1573 0.0575 -2.73 0.0063
## gender_orientation=Heterosexual_female -0.2645 0.0289 -9.15 <0.0001
## age -0.0192 0.0115 -1.66 0.0965
## age' 0.0098 0.0810 0.12 0.9041
## age'' 0.1584 0.3302 0.48 0.6315
## age''' -0.4100 0.4227 -0.97 0.3321
## White -0.0268 0.0493 -0.54 0.5859
## Black -0.0298 0.0628 -0.48 0.6348
## Asian -0.0532 0.0640 -0.83 0.4057
## Hispanic 0.0738 0.0591 1.25 0.2119
## Native_American -0.0358 0.0774 -0.46 0.6441
## Indian -0.0174 0.0966 -0.18 0.8572
## Middle_Eastern 0.1478 0.0993 1.49 0.1368
## Pacific_Islander 0.0487 0.1289 0.38 0.7054
## Other 0.2061 0.0625 3.30 0.0010
## Multi_SIRE 0.0380 0.0624 0.61 0.5423
##
##
## $SIRE_CC
## Frequencies of Missing Values Due to Each Variable
## crime_score_imp CA gender_orientation
## 17933 0 196
## age SIRE_CC
## 0 0
##
## Linear Regression Model
##
## rms::ols(formula = crime_score_imp ~ CA + gender_orientation +
## rcs(age) + SIRE_CC, data = d)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 7796 LR chi2 327.28 R2 0.041
## sigma0.9817 d.f. 22 R2 adj 0.038
## d.f. 7773 Pr(> chi2) 0.0000 g 0.229
##
## Residuals
##
## Min 1Q Median 3Q Max
## -1.7881 -0.8149 -0.3325 1.0743 2.2333
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 0.5554 0.2904 1.91 0.0558
## CA -0.1511 0.0105 -14.43 <0.0001
## gender_orientation=Bisexual_male 0.0157 0.0764 0.21 0.8372
## gender_orientation=Homosexual_male -0.0981 0.0546 -1.80 0.0724
## gender_orientation=Homosexual_female -0.1472 0.1043 -1.41 0.1582
## gender_orientation=Bisexual_female -0.1651 0.0555 -2.97 0.0030
## gender_orientation=Heterosexual_female -0.2623 0.0282 -9.31 <0.0001
## age -0.0174 0.0113 -1.54 0.1227
## age' -0.0059 0.0790 -0.08 0.9402
## age'' 0.2300 0.3221 0.71 0.4751
## age''' -0.5107 0.4121 -1.24 0.2153
## SIRE_CC=NA 0.0563 0.0517 1.09 0.2763
## SIRE_CC=Other_combos 0.1116 0.0517 2.16 0.0310
## SIRE_CC=Asian -0.0333 0.0621 -0.54 0.5925
## SIRE_CC=Hispanic / Latin 0.1176 0.0654 1.80 0.0724
## SIRE_CC=Black 0.0188 0.0653 0.29 0.7731
## SIRE_CC=Other 0.2535 0.0772 3.28 0.0010
## SIRE_CC=Hispanic / Latin, White 0.1084 0.0835 1.30 0.1945
## SIRE_CC=White, Other 0.2888 0.1107 2.61 0.0091
## SIRE_CC=Native American, White 0.1099 0.1076 1.02 0.3070
## SIRE_CC=Indian -0.0689 0.1157 -0.60 0.5516
## SIRE_CC=Asian, White 0.0423 0.1320 0.32 0.7487
## SIRE_CC=Black, White 0.0958 0.1691 0.57 0.5712
##
##
## $whites
## Frequencies of Missing Values Due to Each Variable
## crime_score_imp CA gender_orientation
## 12552 0 129
## age
## 0
##
## Linear Regression Model
##
## rms::ols(formula = crime_score_imp ~ CA + gender_orientation +
## rcs(age), data = d %>% dplyr::filter(race == "White"))
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 5647 LR chi2 224.39 R2 0.039
## sigma0.9788 d.f. 10 R2 adj 0.037
## d.f. 5636 Pr(> chi2) 0.0000 g 0.222
##
## Residuals
##
## Min 1Q Median 3Q Max
## -1.6592 -0.8138 -0.3425 1.0693 2.2335
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 0.7435 0.3389 2.19 0.0283
## CA -0.1628 0.0123 -13.25 <0.0001
## gender_orientation=Bisexual_male -0.0240 0.0929 -0.26 0.7963
## gender_orientation=Homosexual_male -0.0661 0.0673 -0.98 0.3255
## gender_orientation=Homosexual_female -0.0570 0.1279 -0.45 0.6561
## gender_orientation=Bisexual_female -0.1813 0.0666 -2.72 0.0065
## gender_orientation=Heterosexual_female -0.2461 0.0332 -7.41 <0.0001
## age -0.0257 0.0129 -1.99 0.0471
## age' 0.0897 0.1126 0.80 0.4258
## age'' -0.1171 0.3751 -0.31 0.7550
## age''' -0.0784 0.4001 -0.20 0.8446
##
##
## $ca_gender_interaction
## Frequencies of Missing Values Due to Each Variable
## crime_score_imp CA gender_orientation
## 17933 0 196
## age White Black
## 0 1741 1741
## Asian Hispanic Native_American
## 1741 1741 1741
## Indian Middle_Eastern Pacific_Islander
## 1741 1741 1741
## Other Multi_SIRE
## 1741 1741
##
## Linear Regression Model
##
## rms::ols(formula = MOD_add_str_to_formula(crime_score_imp ~ CA *
## gender_orientation + rcs(age), SIRE_dummies), data = d)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 7410 LR chi2 331.84 R2 0.044
## sigma0.9788 d.f. 25 R2 adj 0.041
## d.f. 7384 Pr(> chi2) 0.0000 g 0.234
##
## Residuals
##
## Min 1Q Median 3Q Max
## -1.8093 -0.8253 -0.3391 1.0645 2.1559
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 0.6080 0.3025 2.01 0.0445
## CA -0.1712 0.0129 -13.23 <0.0001
## gender_orientation=Bisexual_male 0.0607 0.0795 0.76 0.4449
## gender_orientation=Homosexual_male -0.0816 0.0571 -1.43 0.1528
## gender_orientation=Homosexual_female -0.0410 0.1144 -0.36 0.7199
## gender_orientation=Bisexual_female -0.1559 0.0576 -2.71 0.0068
## gender_orientation=Heterosexual_female -0.2510 0.0294 -8.53 <0.0001
## age -0.0187 0.0116 -1.62 0.1053
## age' 0.0102 0.0811 0.13 0.8999
## age'' 0.1533 0.3304 0.46 0.6427
## age''' -0.4016 0.4228 -0.95 0.3422
## White -0.0252 0.0493 -0.51 0.6090
## Black -0.0256 0.0629 -0.41 0.6835
## Asian -0.0505 0.0640 -0.79 0.4301
## Hispanic 0.0750 0.0591 1.27 0.2044
## Native_American -0.0357 0.0775 -0.46 0.6445
## Indian -0.0152 0.0966 -0.16 0.8752
## Middle_Eastern 0.1502 0.0994 1.51 0.1307
## Pacific_Islander 0.0461 0.1289 0.36 0.7209
## Other 0.2061 0.0625 3.30 0.0010
## Multi_SIRE 0.0353 0.0624 0.57 0.5714
## CA * gender_orientation=Bisexual_male -0.0042 0.0768 -0.05 0.9568
## CA * gender_orientation=Homosexual_male 0.0257 0.0528 0.49 0.6270
## CA * gender_orientation=Homosexual_female 0.1681 0.1102 1.53 0.1272
## CA * gender_orientation=Bisexual_female 0.0416 0.0583 0.71 0.4752
## CA * gender_orientation=Heterosexual_female 0.0670 0.0264 2.53 0.0113
##
##
## $no_low_CA
## Frequencies of Missing Values Due to Each Variable
## crime_score_imp CA gender_orientation
## 17381 0 192
## age White Black
## 0 1694 1694
## Asian Hispanic Native_American
## 1694 1694 1694
## Indian Middle_Eastern Pacific_Islander
## 1694 1694 1694
## Other Multi_SIRE
## 1694 1694
##
## Linear Regression Model
##
## rms::ols(formula = MOD_add_str_to_formula(crime_score_imp ~ CA +
## gender_orientation + rcs(age), SIRE_dummies), data = d %>%
## dplyr::filter(CA > -2))
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 7002 LR chi2 306.43 R2 0.043
## sigma0.9791 d.f. 20 R2 adj 0.040
## d.f. 6981 Pr(> chi2) 0.0000 g 0.234
##
## Residuals
##
## Min 1Q Median 3Q Max
## -1.7026 -0.8116 -0.3400 1.0646 2.2701
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 0.7206 0.3113 2.31 0.0207
## CA -0.1776 0.0127 -14.03 <0.0001
## gender_orientation=Bisexual_male 0.0635 0.0807 0.79 0.4314
## gender_orientation=Homosexual_male -0.0913 0.0571 -1.60 0.1098
## gender_orientation=Homosexual_female -0.1133 0.1103 -1.03 0.3044
## gender_orientation=Bisexual_female -0.1851 0.0589 -3.14 0.0017
## gender_orientation=Heterosexual_female -0.2593 0.0299 -8.67 <0.0001
## age -0.0226 0.0119 -1.89 0.0583
## age' 0.0238 0.0774 0.31 0.7584
## age'' 0.1013 0.3176 0.32 0.7497
## age''' -0.3430 0.4117 -0.83 0.4047
## White -0.0220 0.0507 -0.43 0.6648
## Black -0.0335 0.0651 -0.51 0.6076
## Asian -0.0525 0.0658 -0.80 0.4249
## Hispanic 0.0738 0.0615 1.20 0.2304
## Native_American -0.0458 0.0811 -0.56 0.5724
## Indian 0.0173 0.0983 0.18 0.8600
## Middle_Eastern 0.1397 0.1026 1.36 0.1737
## Pacific_Islander 0.0752 0.1340 0.56 0.5746
## Other 0.2129 0.0644 3.31 0.0009
## Multi_SIRE 0.0365 0.0641 0.57 0.5694
##
##
## $no_low_CA_gender_interaction
## Frequencies of Missing Values Due to Each Variable
## crime_score_imp CA gender_orientation
## 17381 0 192
## age White Black
## 0 1694 1694
## Asian Hispanic Native_American
## 1694 1694 1694
## Indian Middle_Eastern Pacific_Islander
## 1694 1694 1694
## Other Multi_SIRE
## 1694 1694
##
## Linear Regression Model
##
## rms::ols(formula = MOD_add_str_to_formula(crime_score_imp ~ CA *
## gender_orientation + rcs(age), SIRE_dummies), data = d %>%
## dplyr::filter(CA > -2))
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 7002 LR chi2 318.21 R2 0.044
## sigma0.9786 d.f. 25 R2 adj 0.041
## d.f. 6976 Pr(> chi2) 0.0000 g 0.238
##
## Residuals
##
## Min 1Q Median 3Q Max
## -1.7306 -0.8172 -0.3433 1.0683 2.2034
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 0.7199 0.3114 2.31 0.0208
## CA -0.1995 0.0152 -13.08 <0.0001
## gender_orientation=Bisexual_male 0.0719 0.0824 0.87 0.3828
## gender_orientation=Homosexual_male -0.0913 0.0576 -1.58 0.1134
## gender_orientation=Homosexual_female -0.0601 0.1145 -0.52 0.5999
## gender_orientation=Bisexual_female -0.2066 0.0600 -3.44 0.0006
## gender_orientation=Heterosexual_female -0.2569 0.0299 -8.58 <0.0001
## age -0.0226 0.0119 -1.90 0.0581
## age' 0.0255 0.0774 0.33 0.7418
## age'' 0.0938 0.3176 0.30 0.7677
## age''' -0.3342 0.4116 -0.81 0.4168
## White -0.0201 0.0507 -0.40 0.6922
## Black -0.0305 0.0651 -0.47 0.6397
## Asian -0.0505 0.0658 -0.77 0.4424
## Hispanic 0.0792 0.0615 1.29 0.1980
## Native_American -0.0396 0.0812 -0.49 0.6252
## Indian 0.0200 0.0982 0.20 0.8389
## Middle_Eastern 0.1435 0.1027 1.40 0.1624
## Pacific_Islander 0.0708 0.1341 0.53 0.5972
## Other 0.2164 0.0644 3.36 0.0008
## Multi_SIRE 0.0299 0.0641 0.47 0.6414
## CA * gender_orientation=Bisexual_male -0.0307 0.0893 -0.34 0.7312
## CA * gender_orientation=Homosexual_male 0.0344 0.0595 0.58 0.5629
## CA * gender_orientation=Homosexual_female 0.2598 0.1339 1.94 0.0524
## CA * gender_orientation=Bisexual_female 0.1486 0.0691 2.15 0.0315
## CA * gender_orientation=Heterosexual_female 0.0656 0.0315 2.08 0.0373
##
##
## $nonlinear_ca
## Frequencies of Missing Values Due to Each Variable
## crime_score_imp CA gender_orientation
## 17381 0 192
## age White Black
## 0 1694 1694
## Asian Hispanic Native_American
## 1694 1694 1694
## Indian Middle_Eastern Pacific_Islander
## 1694 1694 1694
## Other Multi_SIRE
## 1694 1694
##
## Linear Regression Model
##
## rms::ols(formula = MOD_add_str_to_formula(crime_score_imp ~ rcs(CA) *
## gender_orientation + rcs(age), SIRE_dummies), data = d %>%
## dplyr::filter(CA > -2))
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 7002 LR chi2 340.82 R2 0.048
## sigma0.9783 d.f. 43 R2 adj 0.042
## d.f. 6958 Pr(> chi2) 0.0000 g 0.249
##
## Residuals
##
## Min 1Q Median 3Q Max
## -1.7057 -0.7924 -0.3386 1.0539 2.2156
##
##
## Coef S.E. t
## Intercept 0.8827 0.3281 2.69
## CA -0.0363 0.0762 -0.48
## CA' -0.3627 0.3094 -1.17
## CA'' 1.2594 1.6048 0.78
## CA''' -2.6788 3.4672 -0.77
## gender_orientation=Bisexual_male -0.2476 0.5602 -0.44
## gender_orientation=Homosexual_male -0.0502 0.3450 -0.15
## gender_orientation=Homosexual_female -0.5828 0.6496 -0.90
## gender_orientation=Bisexual_female -0.3161 0.3994 -0.79
## gender_orientation=Heterosexual_female -0.3382 0.1868 -1.81
## age -0.0223 0.0120 -1.86
## age' 0.0259 0.0776 0.33
## age'' 0.0895 0.3183 0.28
## age''' -0.3266 0.4122 -0.79
## White -0.0185 0.0508 -0.36
## Black -0.0302 0.0653 -0.46
## Asian -0.0499 0.0658 -0.76
## Hispanic 0.0778 0.0616 1.26
## Native_American -0.0357 0.0812 -0.44
## Indian 0.0290 0.0984 0.29
## Middle_Eastern 0.1492 0.1028 1.45
## Pacific_Islander 0.0717 0.1342 0.53
## Other 0.2175 0.0645 3.37
## Multi_SIRE 0.0260 0.0642 0.41
## CA * gender_orientation=Bisexual_male -0.2391 0.4490 -0.53
## CA' * gender_orientation=Bisexual_male 0.9892 1.7574 0.56
## CA'' * gender_orientation=Bisexual_male -3.9309 8.9611 -0.44
## CA''' * gender_orientation=Bisexual_male 3.3185 19.0625 0.17
## CA * gender_orientation=Homosexual_male 0.0402 0.2634 0.15
## CA' * gender_orientation=Homosexual_male -0.2715 1.1502 -0.24
## CA'' * gender_orientation=Homosexual_male 1.8147 6.3067 0.29
## CA''' * gender_orientation=Homosexual_male -3.8032 14.3955 -0.26
## CA * gender_orientation=Homosexual_female -0.1878 0.5257 -0.36
## CA' * gender_orientation=Homosexual_female 1.3717 2.2558 0.61
## CA'' * gender_orientation=Homosexual_female -5.0324 13.8470 -0.36
## CA''' * gender_orientation=Homosexual_female 5.6123 41.9161 0.13
## CA * gender_orientation=Bisexual_female 0.0339 0.3288 0.10
## CA' * gender_orientation=Bisexual_female 0.3932 1.2402 0.32
## CA'' * gender_orientation=Bisexual_female -3.2927 6.3190 -0.52
## CA''' * gender_orientation=Bisexual_female 12.6939 13.7328 0.92
## CA * gender_orientation=Heterosexual_female -0.0396 0.1454 -0.27
## CA' * gender_orientation=Heterosexual_female -0.0474 0.5900 -0.08
## CA'' * gender_orientation=Heterosexual_female 1.3642 3.1124 0.44
## CA''' * gender_orientation=Heterosexual_female -3.5118 7.0504 -0.50
## Pr(>|t|)
## Intercept 0.0071
## CA 0.6340
## CA' 0.2411
## CA'' 0.4326
## CA''' 0.4398
## gender_orientation=Bisexual_male 0.6585
## gender_orientation=Homosexual_male 0.8843
## gender_orientation=Homosexual_female 0.3697
## gender_orientation=Bisexual_female 0.4287
## gender_orientation=Heterosexual_female 0.0703
## age 0.0623
## age' 0.7391
## age'' 0.7785
## age''' 0.4283
## White 0.7161
## Black 0.6431
## Asian 0.4483
## Hispanic 0.2067
## Native_American 0.6601
## Indian 0.7682
## Middle_Eastern 0.1469
## Pacific_Islander 0.5931
## Other 0.0007
## Multi_SIRE 0.6851
## CA * gender_orientation=Bisexual_male 0.5945
## CA' * gender_orientation=Bisexual_male 0.5735
## CA'' * gender_orientation=Bisexual_male 0.6609
## CA''' * gender_orientation=Bisexual_male 0.8618
## CA * gender_orientation=Homosexual_male 0.8788
## CA' * gender_orientation=Homosexual_male 0.8134
## CA'' * gender_orientation=Homosexual_male 0.7736
## CA''' * gender_orientation=Homosexual_male 0.7916
## CA * gender_orientation=Homosexual_female 0.7210
## CA' * gender_orientation=Homosexual_female 0.5432
## CA'' * gender_orientation=Homosexual_female 0.7163
## CA''' * gender_orientation=Homosexual_female 0.8935
## CA * gender_orientation=Bisexual_female 0.9179
## CA' * gender_orientation=Bisexual_female 0.7512
## CA'' * gender_orientation=Bisexual_female 0.6023
## CA''' * gender_orientation=Bisexual_female 0.3553
## CA * gender_orientation=Heterosexual_female 0.7853
## CA' * gender_orientation=Heterosexual_female 0.9360
## CA'' * gender_orientation=Heterosexual_female 0.6612
## CA''' * gender_orientation=Heterosexual_female 0.6184
##
##
## $ordinal_GSO
## Frequencies of Missing Values Due to Each Variable
## crime_score_imp CA gender_orientation_ord
## 17381 0 192
## age White Black
## 0 1694 1694
## Asian Hispanic Native_American
## 1694 1694 1694
## Indian Middle_Eastern Pacific_Islander
## 1694 1694 1694
## Other Multi_SIRE
## 1694 1694
##
## Linear Regression Model
##
## rms::ols(formula = MOD_add_str_to_formula(crime_score_imp ~ CA *
## gender_orientation_ord + rcs(age), SIRE_dummies), data = d %>%
## dplyr::filter(CA > -2))
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 7002 LR chi2 318.21 R2 0.044
## sigma0.9786 d.f. 25 R2 adj 0.041
## d.f. 6976 Pr(> chi2) 0.0000 g 0.238
##
## Residuals
##
## Min 1Q Median 3Q Max
## -1.7306 -0.8172 -0.3433 1.0683 2.2034
##
##
## Coef S.E. t
## Intercept 0.7199 0.3114 2.31
## CA -0.1995 0.0152 -13.08
## gender_orientation_ord=Bisexual_male 0.0719 0.0824 0.87
## gender_orientation_ord=Homosexual_male -0.0913 0.0576 -1.58
## gender_orientation_ord=Homosexual_female -0.0601 0.1145 -0.52
## gender_orientation_ord=Bisexual_female -0.2066 0.0600 -3.44
## gender_orientation_ord=Heterosexual_female -0.2569 0.0299 -8.58
## age -0.0226 0.0119 -1.90
## age' 0.0255 0.0774 0.33
## age'' 0.0938 0.3176 0.30
## age''' -0.3342 0.4116 -0.81
## White -0.0201 0.0507 -0.40
## Black -0.0305 0.0651 -0.47
## Asian -0.0505 0.0658 -0.77
## Hispanic 0.0792 0.0615 1.29
## Native_American -0.0396 0.0812 -0.49
## Indian 0.0200 0.0982 0.20
## Middle_Eastern 0.1435 0.1027 1.40
## Pacific_Islander 0.0708 0.1341 0.53
## Other 0.2164 0.0644 3.36
## Multi_SIRE 0.0299 0.0641 0.47
## CA * gender_orientation_ord=Bisexual_male -0.0307 0.0893 -0.34
## CA * gender_orientation_ord=Homosexual_male 0.0344 0.0595 0.58
## CA * gender_orientation_ord=Homosexual_female 0.2598 0.1339 1.94
## CA * gender_orientation_ord=Bisexual_female 0.1486 0.0691 2.15
## CA * gender_orientation_ord=Heterosexual_female 0.0656 0.0315 2.08
## Pr(>|t|)
## Intercept 0.0208
## CA <0.0001
## gender_orientation_ord=Bisexual_male 0.3828
## gender_orientation_ord=Homosexual_male 0.1134
## gender_orientation_ord=Homosexual_female 0.5999
## gender_orientation_ord=Bisexual_female 0.0006
## gender_orientation_ord=Heterosexual_female <0.0001
## age 0.0581
## age' 0.7418
## age'' 0.7677
## age''' 0.4168
## White 0.6922
## Black 0.6397
## Asian 0.4424
## Hispanic 0.1980
## Native_American 0.6252
## Indian 0.8389
## Middle_Eastern 0.1624
## Pacific_Islander 0.5972
## Other 0.0008
## Multi_SIRE 0.6414
## CA * gender_orientation_ord=Bisexual_male 0.7312
## CA * gender_orientation_ord=Homosexual_male 0.5629
## CA * gender_orientation_ord=Homosexual_female 0.0524
## CA * gender_orientation_ord=Bisexual_female 0.0315
## CA * gender_orientation_ord=Heterosexual_female 0.0373
##
##
## $numeric_GSO
## Frequencies of Missing Values Due to Each Variable
## crime_score_imp CA gender_orientation_num
## 17381 0 192
## age White Black
## 0 1694 1694
## Asian Hispanic Native_American
## 1694 1694 1694
## Indian Middle_Eastern Pacific_Islander
## 1694 1694 1694
## Other Multi_SIRE
## 1694 1694
##
## Linear Regression Model
##
## rms::ols(formula = MOD_add_str_to_formula(crime_score_imp ~ CA *
## gender_orientation_num + rcs(age), SIRE_dummies), data = d %>%
## dplyr::filter(CA > -2))
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 7002 LR chi2 311.35 R2 0.043
## sigma0.9785 d.f. 17 R2 adj 0.041
## d.f. 6984 Pr(> chi2) 0.0000 g 0.236
##
## Residuals
##
## Min 1Q Median 3Q Max
## -1.7330 -0.8178 -0.3466 1.0706 2.1679
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 0.8060 0.3078 2.62 0.0089
## CA -0.2147 0.0188 -11.39 <0.0001
## gender_orientation_num -0.0509 0.0058 -8.84 <0.0001
## age -0.0236 0.0118 -2.00 0.0454
## age' 0.0259 0.0772 0.34 0.7373
## age'' 0.0996 0.3173 0.31 0.7537
## age''' -0.3478 0.4114 -0.85 0.3980
## White -0.0208 0.0507 -0.41 0.6812
## Black -0.0317 0.0651 -0.49 0.6261
## Asian -0.0529 0.0657 -0.81 0.4204
## Hispanic 0.0761 0.0615 1.24 0.2158
## Native_American -0.0417 0.0810 -0.51 0.6069
## Indian 0.0179 0.0982 0.18 0.8552
## Middle_Eastern 0.1435 0.1026 1.40 0.1618
## Pacific_Islander 0.0720 0.1339 0.54 0.5910
## Other 0.2132 0.0643 3.32 0.0009
## Multi_SIRE 0.0366 0.0640 0.57 0.5677
## CA * gender_orientation_num 0.0162 0.0060 2.68 0.0074
##
##
## $demo_only
## Frequencies of Missing Values Due to Each Variable
## crime_score_imp White Black Asian
## 17933 1741 1741 1741
## Hispanic Native_American Indian Middle_Eastern
## 1741 1741 1741 1741
## Pacific_Islander Other Multi_SIRE
## 1741 1741 1741
##
## Linear Regression Model
##
## rms::ols(formula = MOD_add_str_to_formula(crime_score_imp ~ 1,
## SIRE_dummies), data = d)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 7492 LR chi2 43.62 R2 0.006
## sigma0.9958 d.f. 10 R2 adj 0.004
## d.f. 7481 Pr(> chi2) 0.0000 g 0.057
##
## Residuals
##
## Min 1Q Median 3Q Max
## -1.5121 -0.7740 -0.3258 1.1421 1.9082
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 0.0482 0.0485 0.99 0.3198
## White -0.0769 0.0497 -1.55 0.1217
## Black -0.0307 0.0634 -0.48 0.6281
## Asian -0.1198 0.0644 -1.86 0.0630
## Hispanic 0.1155 0.0597 1.94 0.0529
## Native_American -0.0169 0.0778 -0.22 0.8282
## Indian -0.0553 0.0976 -0.57 0.5714
## Middle_Eastern 0.1128 0.1008 1.12 0.2631
## Pacific_Islander 0.1201 0.1276 0.94 0.3469
## Other 0.1685 0.0630 2.68 0.0075
## Multi_SIRE 0.0640 0.0628 1.02 0.3078
##
##
## $demo_only2
## Frequencies of Missing Values Due to Each Variable
## crime_score_imp SIRE_CC
## 17933 0
##
## Linear Regression Model
##
## rms::ols(formula = crime_score_imp ~ SIRE_CC, data = d)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 7882 LR chi2 39.62 R2 0.005
## sigma0.9983 d.f. 12 R2 adj 0.003
## d.f. 7869 Pr(> chi2) 0.0001 g 0.057
##
## Residuals
##
## Min 1Q Median 3Q Max
## -1.3684 -0.7734 -0.3252 1.1459 1.9088
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -0.0292 0.0132 -2.21 0.0269
## SIRE_CC=NA 0.0362 0.0522 0.69 0.4887
## SIRE_CC=Other_combos 0.1472 0.0519 2.84 0.0046
## SIRE_CC=Asian -0.0496 0.0627 -0.79 0.4294
## SIRE_CC=Hispanic / Latin 0.2180 0.0659 3.31 0.0009
## SIRE_CC=Black 0.0665 0.0659 1.01 0.3132
## SIRE_CC=Other 0.2544 0.0779 3.27 0.0011
## SIRE_CC=Hispanic / Latin, White 0.1566 0.0845 1.85 0.0640
## SIRE_CC=White, Other 0.2817 0.1110 2.54 0.0112
## SIRE_CC=Native American, White 0.1523 0.1078 1.41 0.1579
## SIRE_CC=Indian -0.0567 0.1168 -0.49 0.6271
## SIRE_CC=Asian, White 0.0054 0.1340 0.04 0.9679
## SIRE_CC=Black, White 0.1627 0.1717 0.95 0.3434
##
##
## $gender_orientation
## Frequencies of Missing Values Due to Each Variable
## crime_score_imp gender_orientation
## 17933 196
##
## Linear Regression Model
##
## rms::ols(formula = crime_score_imp ~ gender_orientation, data = d)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 7796 LR chi2 65.74 R2 0.008
## sigma0.9972 d.f. 5 R2 adj 0.008
## d.f. 7790 Pr(> chi2) 0.0000 g 0.085
##
## Residuals
##
## Min 1Q Median 3Q Max
## -1.2089 -0.8246 -0.3631 1.1557 2.0426
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 0.0562 0.0139 4.04 <0.0001
## gender_orientation=Bisexual_male 0.0368 0.0769 0.48 0.6319
## gender_orientation=Homosexual_male -0.0343 0.0548 -0.63 0.5320
## gender_orientation=Homosexual_female -0.0649 0.1055 -0.62 0.5383
## gender_orientation=Bisexual_female -0.1207 0.0548 -2.20 0.0277
## gender_orientation=Heterosexual_female -0.2215 0.0280 -7.92 <0.0001
##
#CA betas
map_dbl(ols_fits[1:6], function(x) x %>% .[["coefficients"]] %>% .[2])
## primary SIRE_CC
## -0.153 -0.151
## whites ca_gender_interaction
## -0.163 -0.171
## no_low_CA no_low_CA_gender_interaction
## -0.178 -0.200
#model n's
map_dbl(ols_fits, function(x) x$stats[1])
## primary SIRE_CC
## 7410 7796
## whites ca_gender_interaction
## 5647 7410
## no_low_CA no_low_CA_gender_interaction
## 7002 7002
## nonlinear_ca ordinal_GSO
## 7002 7002
## numeric_GSO demo_only
## 7002 7492
## demo_only2 gender_orientation
## 7882 7796
#model r's
map_dbl(ols_fits, function(x) x$stats[4] %>% sqrt())
## primary SIRE_CC
## 0.2066 0.2028
## whites ca_gender_interaction
## 0.1974 0.2093
## no_low_CA no_low_CA_gender_interaction
## 0.2069 0.2108
## nonlinear_ca ordinal_GSO
## 0.2180 0.2108
## numeric_GSO demo_only
## 0.2085 0.0762
## demo_only2 gender_orientation
## 0.0708 0.0916
#correlations by gender
d %>% dplyr::filter(gender == "Man") %$% cor.test(CA, crime_sum_imp)
##
## Pearson's product-moment correlation
##
## data: CA and crime_sum_imp
## t = -20, df = 6000, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.228 -0.178
## sample estimates:
## cor
## -0.203
d %>% dplyr::filter(gender == "Woman") %$% cor.test(CA, crime_sum_imp)
##
## Pearson's product-moment correlation
##
## data: CA and crime_sum_imp
## t = -6, df = 2000, p-value = 2e-09
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1695 -0.0871
## sample estimates:
## cor
## -0.129
#plot nonlinear CA
nonlinear_ca_df = data.frame(
CA = seq(-2, 2, by = .01),
gender_orientation = "Heterosexual_male",
age = wtd_mean(d$age),
White = T,
Black = F,
Asian = F,
Hispanic = F,
Native_American = F,
Indian = F,
Middle_Eastern = F,
Pacific_Islander = F,
Other = F,
Multi_SIRE = F
)
#model prediction CAS
nonlinear_ca_df$CAS_nonlinear = predict(ols_fits$nonlinear_ca, newdata = nonlinear_ca_df)
nonlinear_ca_df$CAS_linear = predict(ols_fits$no_low_CA_gender_interaction, newdata = nonlinear_ca_df)
#plot
ggplot(nonlinear_ca_df) +
geom_line(aes(CA, CAS_nonlinear), color = "blue") +
geom_line(aes(CA, CAS_linear), color = "red") +
theme_bw() +
xlab("Cognitive ability") +
ylab("Criminal and antisocial behavior score (model predicted)")
GG_save("figures/model_predictions.png")
It seems likely that people on dating sites would try to present themselves as better than they are to some degree. It can be noted that persons who did not answer crime items at all were often the lowest in estimated ability, even lower than those who answered positively (i.e. towards CAS). Here we control for overall number of items answered and look whether CA and CAS score based on other items predicts skipping CAS items.
#loop over each item, fit a model to examine if skippers are lower in CA and higher in CAS in the non-skipped items.
skippers_crime_score = map_df(CAS_vars, function(var_) {
#make missingness outcome
d_local = d
d_local$missing = is.na(d[[var_]])
#fit model
mod_ = rms::lrm(missing ~ questions_answered_z + CA + crime_score + gender_orientation + rcs(age), data = d_local)
data_frame(
CAS = var_,
questions_answered = coef(mod_)[2],
CA = coef(mod_)[3],
crime_score = coef(mod_)[4],
r2adj = mod_$stats[10]
)
})
skippers_crime_score %>% write_clipboard()
## CAS Questions answered CA Crime score R2adj
## 1 arrested -0.90 0.14 0.26 0.09
## 2 prison -1.13 -0.42 -0.54 0.22
## 3 punched_in_face -1.10 0.19 0.10 0.09
## 4 cheated_exam -2.17 0.02 0.07 0.40
## 5 would_tax_cheat -1.19 0.35 0.31 0.16
## 6 stole_glass_from_bar -1.10 0.19 -0.07 0.10
## 7 used_fake_id -1.10 0.20 0.18 0.11
## 8 torture_animal_for_fun -1.54 -0.37 -0.06 0.25
## 9 steal_newspapers -2.48 -0.06 -0.07 0.44
## 10 litter -0.29 -0.03 0.08 0.03
## 11 cigaret_littering -0.99 0.08 0.34 0.09
## 12 hit_SO_in_anger -0.83 -0.30 0.01 0.12
#output various formats
write_rds(d, "data/data_for_reuse.rds", compress = "gz")
write_csv(d, "data/data_for_reuse.csv", na = "")