Initialize

Packages and functions.

options(digits = 3)
options(contrasts=c("contr.treatment", "contr.treatment"))
library(pacman)
p_load(kirkegaard, dplyr, readr, polycor, effsize, rms)

Data

Load data.

#okc dataset
okc = read_rds("data/parsed_data_public.rds") %>% 
  #we need a regular data frame bc we need rownames
  as.data.frame()

#item info
questions = read_csv2("data/question_data.csv")

## Using ',' as decimal and '.' as grouping mark. Use read_delim() for more control.

## Warning: Missing column names filled in: 'X1' [1]

## Parsed with column specification:
## cols(
##   X1 = col_character(),
##   text = col_character(),
##   option_1 = col_character(),
##   option_2 = col_character(),
##   option_3 = col_character(),
##   option_4 = col_character(),
##   N = col_integer(),
##   Type = col_character(),
##   Order = col_character(),
##   Keywords = col_character()
## )

Recode data

We need to subset the data based on the quality of the CA data. Then recode a bunch of variables for our use.

#quick item analyser
#ad hoc function to help recoding
item_info = function(item) {
  #item text
  question_text = questions[questions$X1 == item, "text"] %>% unlist()
  
  #summary stats
  stats = plyr::ddply(okc, item, function(x) {
    data_frame(
      n = nrow(x),
      n_frac = n/nrow(okc),
      mean_CA = wtd_mean(x$CA)
    )
  })
  
  #rename
  names(stats)[1] = question_text
  
  stats
}

#recode some stuff
okc %<>% mutate(
  #answered questions
  questions_answered = okc %>% .[str_subset(questions$X1, "^q")] %>% is.na() %>% `!` %>% rowSums(),
  questions_answered_z = standardize(questions_answered),
  
  #rename sex
  sex = gender,
  
  #age
  age = d_age,
  age_z = age %>% standardize(),
  
  #recode CAS variables
  #we want them so that TRUE -> CAS, FALSE -> non-CAS
  #recode to dichotomous for easy statistical treatment
  arrested = (q252 == "Yes"),
  prison = (q1138 == "Yes"),
  punched_in_face = (q196 == "Yes"),
  cheated_exam = (q400 == "Yes"),
  would_tax_cheat = (q180 == "Yes"),
  stole_glass_from_bar = (q59919 == "Yes."),
  used_fake_id = (q22569 == "Yes"),
  torture_animal_for_fun = (q19928 != "NO WAY!"),
  steal_newspapers = (q39226 != "Pay for a paper and close the dispenser."),
  litter = (q17017 != "Never"),
  cigaret_littering = (q74381 == "No."),
  hit_SO_in_anger = (q81783 == "Yes.")
)

#subset
d = okc %>% filter(
  #at least 5 items for CA estimate
  CA_items >= 5,

  #only binary sex
  #samples too small
  gender %in% c("Man", "Woman")
) %>% mutate(
  #restandardize CA
  CA_orig = CA,
  CA = CA %>% standardize()
)

#add id
d$id = 1:nrow(d)

#recode race to common combinations and independent dummies
d$d_ethnicity %>% table2()

## # A tibble: 148 x 3
##                      Group Count Percent
##                      <chr> <dbl>   <dbl>
##  1                   White 18261  70.738
##  2                    <NA>  1741   6.744
##  3                   Asian   958   3.711
##  4        Hispanic / Latin   779   3.018
##  5                   Black   775   3.002
##  6                   Other   606   2.347
##  7 Hispanic / Latin, White   454   1.759
##  8            White, Other   292   1.131
##  9  Native American, White   244   0.945
## 10                  Indian   226   0.875
## # ... with 138 more rows

#lots!

#explicit NAs
d$ethnicities = plyr::mapvalues(d$d_ethnicity, NA, "NA")

#common combos
d$SIRE_CC = fct_lump(d$ethnicities,
                               #preserve groups with >= 50 persons
                               n = d$d_ethnicity %>% table2() %>% .$Count %>% `>=`(100) %>% sum(),
                               other_level = "Other_combos") %>% 
  #relevel in order of freq
  fct_infreq()

d$SIRE_CC %>% table2() %>% write_clipboard()

##                      Group    Count Percent
## 1                    White 18261.00   70.74
## 2                           1741.00    6.74
## 3             Other_combos  1197.00    4.64
## 4                    Asian   958.00    3.71
## 5         Hispanic / Latin   779.00    3.02
## 6                    Black   775.00    3.00
## 7                    Other   606.00    2.35
## 8  Hispanic / Latin, White   454.00    1.76
## 9             White, Other   292.00    1.13
## 10  Native American, White   244.00    0.95
## 11                  Indian   226.00    0.88
## 12            Asian, White   175.00    0.68
## 13            Black, White   107.00    0.41
## 14                             0.00    0.00

#independent dummies
d$White = str_detect(d$d_ethnicity, "White")
d$Black = str_detect(d$d_ethnicity, "Black")
d$Asian = str_detect(d$d_ethnicity, "Asian")
d$Hispanic = str_detect(d$d_ethnicity, "Hispanic")
d$Native_American = str_detect(d$d_ethnicity, "Native American")
d$Indian = str_detect(d$d_ethnicity, "Indian")
d$Middle_Eastern = str_detect(d$d_ethnicity, "Middle Eastern")
d$Pacific_Islander = str_detect(d$d_ethnicity, "Pacific Islander")
d$Other = str_detect(d$d_ethnicity, "Other")
d$Multi_SIRE = str_detect(d$d_ethnicity, ",")

SIRE_dummies = c("White", "Black", "Asian", "Hispanic", "Native_American", "Indian", "Middle_Eastern", "Pacific_Islander", "Other", "Multi_SIRE")

###var lists
#all CAS vars
CAS_vars = c("arrested",
             "prison",
             "punched_in_face",
             "cheated_exam",
             "would_tax_cheat",
             "stole_glass_from_bar",
             "used_fake_id",
             "torture_animal_for_fun",
             "steal_newspapers",
             "litter",
             "cigaret_littering",
             "hit_SO_in_anger")

#also as names
#so map functions will automatically name stuff in lists
names(CAS_vars) = CAS_vars

#data subsets for ease of use
#all vars of interest
vars = c("CA", 
         "gender", 
         "race",
         CAS_vars)

#CAS vars
CAS_full = okc[CAS_vars]
CAS = d[CAS_vars]

Descriptive statistics

# tables ------------------------------------------------------------------
#pairwise cases
d[vars] %>% count.pairwise() %>% MAT_half() %>% describe()

##    vars   n mean   sd median trimmed  mad min   max range skew kurtosis
## X1    1 105 7737 5703   7394    7046 6426 734 25815 25081 1.08     1.17
##     se
## X1 557

#tables
d$race %>% table2()

## # A tibble: 11 x 3
##               Group Count Percent
##               <chr> <dbl>   <dbl>
##  1            White 18261  70.738
##  2            Mixed  2311   8.952
##  3             <NA>  1741   6.744
##  4            Asian   958   3.711
##  5 Hispanic / Latin   779   3.018
##  6            Black   775   3.002
##  7            Other   606   2.347
##  8           Indian   226   0.875
##  9   Middle Eastern    82   0.318
## 10  Native American    39   0.151
## 11 Pacific Islander    37   0.143

#sex/gender
d$sex %>% table2()

## # A tibble: 3 x 3
##   Group Count Percent
##   <chr> <dbl>   <dbl>
## 1   Man 18129    70.2
## 2 Woman  7686    29.8
## 3  <NA>     0     0.0

okc$sex %>% table2()

## # A tibble: 4 x 3
##   Group Count Percent
##   <chr> <dbl>   <dbl>
## 1   Man 40215   58.82
## 2 Woman 25952   37.96
## 3  <NA>  2006    2.93
## 4 Other   198    0.29

okc$d_gender %>% table2()

## # A tibble: 108 x 3
##               Group Count  Percent
##               <chr> <dbl>    <dbl>
##  1              Man 40215 58.81880
##  2            Woman 25952 37.95761
##  3             <NA>  2006  2.93399
##  4          Cis Man    19  0.02779
##  5        Cis Woman    18  0.02633
##  6      Genderqueer     7  0.01024
##  7      Trans Woman     7  0.01024
##  8 Woman, Cis Woman     7  0.01024
##  9      Genderfluid     6  0.00878
## 10     Man, Cis Man     6  0.00878
## # ... with 98 more rows

okc$d_gender %>% table2() %>% .$Percent %>% .[c(1:2, 4:5)] %>% sum()

## [1] 96.8

okc$d_gender %>% table2() %>% .$Percent %>% .[-c(1:5)] %>% sum()

## [1] 0.235

#sexual orientation x sex
d$gender_orientation %<>% plyr::mapvalues(c("Gay_male", "Gay_female", "Hetero_male", "Hetero_female"), c("Homosexual_male", "Homosexual_female", "Heterosexual_male", "Heterosexual_female"))
d$gender_orientation %<>% factor() %>% fct_relevel(c("Heterosexual_male", "Bisexual_male", "Homosexual_male", "Homosexual_female", "Bisexual_female", "Heterosexual_female"))
d$gender_orientation_ord = ordered(d$gender_orientation)
d$gender_orientation_num = as.numeric(d$gender_orientation)
d$gender_orientation %>% table2() %>% write_clipboard()

##                 Group    Count Percent
## 1   Heterosexual_male 16249.00   62.94
## 2 Heterosexual_female  5859.00   22.70
## 3     Bisexual_female  1426.00    5.52
## 4     Homosexual_male  1236.00    4.79
## 5       Bisexual_male   548.00    2.12
## 6   Homosexual_female   301.00    1.17
## 7                       196.00    0.76

#age
d$age %>% describe()

##    vars     n mean   sd median trimmed  mad min max range skew kurtosis
## X1    1 25815 33.7 7.59     33    33.1 7.41  18 100    82 0.94     2.02
##      se
## X1 0.05

okc$age %>% describe()

##    vars     n mean  sd median trimmed  mad min max range skew kurtosis
## X1    1 66365 31.7 7.8     30      31 7.41  18 100    82 0.98     1.95
##      se
## X1 0.03

#cognitive ability
d$CA %>% describe()

##    vars     n mean sd median trimmed mad   min  max range  skew kurtosis
## X1    1 25815    0  1    0.2    0.06   1 -3.58 2.27  5.85 -0.54    -0.22
##      se
## X1 0.01

d$CA_orig %>% describe()

##    vars     n mean   sd median trimmed  mad   min  max range  skew
## X1    1 25815 0.56 0.61   0.68     0.6 0.61 -1.63 1.95  3.58 -0.54
##    kurtosis se
## X1    -0.22  0

GG_denhist(d, "CA") + 
  scale_x_continuous("Cognitive ability")

## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

GG_save("figures/CA_denhist.png")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#CAS
#sample sizes by item
d %>% plyr::ddply("sex", function(x) {
  x %>% extract(CAS_vars) %>% 
    map_int(count_NA, reverse = T)
}) %>% 
  write_clipboard()

##     Sex Arrested Prison Punched in face Cheated exam Would tax cheat
## 1   Man     6867   9056            9783         3057            8522
## 2 Woman     2786   3280            4001         1022            3206
##   Stole glass from bar Used fake id Torture animal for fun
## 1                 9122         7767                   6523
## 2                 3504         3044                   2466
##   Steal newspapers Litter Cigaret littering Hit SO in anger
## 1             1239  16579             10891            2196
## 2              488   6677              4014             680

#rate
map_df(okc[CAS_vars], function(x) {
  x = na.omit(x) %>% as.numeric()

  #2 options
  data_frame(n = length(x),
             proportion = mean(x)
            )
}) %>% cbind(CAS_vars, .) %>% 
  write_clipboard()

##                                      CAS vars     N Proportion
## Arrested                             arrested 12456       0.24
## Prison                                 prison 15236       0.02
## Punched in face               punched_in_face 18128       0.34
## Cheated exam                     cheated_exam  5155       0.40
## Would tax cheat               would_tax_cheat 15581       0.42
## Stole glass from bar     stole_glass_from_bar 16747       0.44
## Used fake id                     used_fake_id 14067       0.28
## Torture animal for fun torture_animal_for_fun 10725       0.03
## Steal newspapers             steal_newspapers  2111       0.43
## Litter                                 litter 35221       0.31
## Cigaret littering           cigaret_littering 18983       0.11
## Hit SO in anger               hit_SO_in_anger  3405       0.03

#CA by sex
d %$% describeBy(CA, sex)

## 
##  Descriptive statistics by group 
## group: Man
##    vars     n mean   sd median trimmed  mad   min  max range  skew
## X1    1 18129 0.01 1.02   0.19    0.07 1.08 -3.58 2.27  5.85 -0.52
##    kurtosis   se
## X1    -0.31 0.01
## -------------------------------------------------------- 
## group: Woman
##    vars    n  mean   sd median trimmed  mad   min max range skew kurtosis
## X1    1 7686 -0.02 0.95   0.21    0.05 0.93 -3.57 2.2  5.77 -0.6        0
##      se
## X1 0.01

#countries / State
okc$d_country %>% table2()

## # A tibble: 203 x 3
##        Group Count Percent
##        <chr> <dbl>   <dbl>
##  1        UK  8438   12.34
##  2        NY  7989   11.68
##  3        CA  7145   10.45
##  4        TX  2767    4.05
##  5      <NA>  2006    2.93
##  6        FL  1760    2.57
##  7 Australia  1678    2.45
##  8        NJ  1662    2.43
##  9        IL  1559    2.28
## 10        PA  1533    2.24
## # ... with 193 more rows

#USA total
(okc$d_country %>% str_detect("^..$") %>% sum(na.rm=T)) / nrow(okc)

## [1] 0.772

#Canada
(okc$d_country %in% c("Ontario", "British Columbia", "Alberta", "Quebec", "Manitoba", "Nova Scotia", "Saskatchewan", "New Brunswick", "Newfoundland and Labrador", "Prince Edward Island") %>% sum(na.rm=T)) / nrow(okc)

## [1] 0.032

#overall missing
miss_amount(okc)

## cases with missing data  vars with missing data cells with missing data 
##                   1.000                   1.000                   0.767

sum(is.na(okc))

## [1] 138556616

sum(!is.na(okc))

## [1] 42079566

sum(!is.na(okc)) / (sum(is.na(okc)) + sum(!is.na(okc)))

## [1] 0.233

Analysis 1 - Group means

Compute group means for each outcome.

#TODO: bug fix the below
group_means = map_df(CAS_vars, .f = function(x) {
  #compute SMD for CAS vs. non-CAS
  .d = SMD_matrix(d$CA, d[[x]], extended_output = T)
  
  #return as df
  data_frame(d = -.d$d[2, 1],
             d_lower = -.d$CI_lower[2, 1],
             d_upper = -.d$CI_upper[2, 1],
             n = .d$pairwise_n[2, 1]
             )
  
}) %>% 
  cbind(CAS_vars, .) %>% 
  write_clipboard()

##                                      CAS vars     D D lower D upper     N
## Arrested                             arrested -0.24   -0.19   -0.28  9653
## Prison                                 prison -0.34   -0.19   -0.48 12336
## Punched in face               punched_in_face -0.26   -0.23   -0.30 13784
## Cheated exam                     cheated_exam -0.13   -0.07   -0.20  4079
## Would tax cheat               would_tax_cheat -0.15   -0.11   -0.18 11728
## Stole glass from bar     stole_glass_from_bar -0.03    0.01   -0.06 12626
## Used fake id                     used_fake_id  0.02    0.06   -0.02 10811
## Torture animal for fun torture_animal_for_fun -0.35   -0.23   -0.48  8989
## Steal newspapers             steal_newspapers -0.26   -0.16   -0.35  1727
## Litter                                 litter -0.38   -0.35   -0.41 23256
## Cigaret littering           cigaret_littering -0.49   -0.44   -0.55 14905
## Hit SO in anger               hit_SO_in_anger -0.47   -0.24   -0.70  2876

Analysis 2 - Scoring of CAS behavior

Not so straightforward. Multiple issues:

Lots of missing data.
Likely non-random missing data: not random which users answer more questions; not random whether user answers CAS behavior question.
Questions of unequal importance, how do we weigh? Assume equal weight?
Non-random non-responding. How can we tell? Can we adjust?

Solution: method variation and cross-validation. Do different methods agree?

First we just try looking at the item intercorrelations.

# correlations ------------------------------------------------------------
#correlation matrix
crime_cors = tetrachoric(CAS)$rho

crime_cors_full = tetrachoric(CAS_full)$rho

#output
crime_cors %>% 
  write_clipboard()

##                        Arrested Prison Punched in face Cheated exam
## Arrested                   1.00   0.72            0.45         0.32
## Prison                     0.72   1.00            0.19         0.27
## Punched in face            0.45   0.19            1.00         0.24
## Cheated exam               0.32   0.27            0.24         1.00
## Would tax cheat            0.11   0.05            0.17         0.44
## Stole glass from bar       0.33   0.19            0.25         0.35
## Used fake id               0.37   0.12            0.29         0.32
## Torture animal for fun     0.19   0.31            0.11         0.15
## Steal newspapers           0.24   0.17            0.19         0.57
## Litter                     0.18   0.11            0.19         0.26
## Cigaret littering          0.24   0.11            0.19         0.34
## Hit SO in anger            0.07   0.17            0.31         0.11
##                        Would tax cheat Stole glass from bar Used fake id
## Arrested                          0.11                 0.33         0.37
## Prison                            0.05                 0.19         0.12
## Punched in face                   0.17                 0.25         0.29
## Cheated exam                      0.44                 0.35         0.32
## Would tax cheat                   1.00                 0.25         0.25
## Stole glass from bar              0.25                 1.00         0.49
## Used fake id                      0.25                 0.49         1.00
## Torture animal for fun            0.22                 0.03         0.04
## Steal newspapers                  0.37                 0.34         0.17
## Litter                            0.21                 0.07         0.02
## Cigaret littering                 0.09                 0.13         0.10
## Hit SO in anger                   0.19                 0.05         0.21
##                        Torture animal for fun Steal newspapers Litter
## Arrested                                 0.19             0.24   0.18
## Prison                                   0.31             0.17   0.11
## Punched in face                          0.11             0.19   0.19
## Cheated exam                             0.15             0.57   0.26
## Would tax cheat                          0.22             0.37   0.21
## Stole glass from bar                     0.03             0.34   0.07
## Used fake id                             0.04             0.17   0.02
## Torture animal for fun                   1.00             0.19   0.19
## Steal newspapers                         0.19             1.00   0.33
## Litter                                   0.19             0.33   1.00
## Cigaret littering                        0.09             0.27   0.40
## Hit SO in anger                          0.28             0.03   0.20
##                        Cigaret littering Hit SO in anger
## Arrested                            0.24            0.07
## Prison                              0.11            0.17
## Punched in face                     0.19            0.31
## Cheated exam                        0.34            0.11
## Would tax cheat                     0.09            0.19
## Stole glass from bar                0.13            0.05
## Used fake id                        0.10            0.21
## Torture animal for fun              0.09            0.28
## Steal newspapers                    0.27            0.03
## Litter                              0.40            0.20
## Cigaret littering                   1.00            0.18
## Hit SO in anger                     0.18            1.00

crime_cors_full %>% 
  write_clipboard()

##                        Arrested Prison Punched in face Cheated exam
## Arrested                   1.00   0.73            0.47         0.32
## Prison                     0.73   1.00            0.21         0.30
## Punched in face            0.47   0.21            1.00         0.25
## Cheated exam               0.32   0.30            0.25         1.00
## Would tax cheat            0.14   0.10            0.16         0.44
## Stole glass from bar       0.35   0.19            0.25         0.36
## Used fake id               0.36   0.13            0.30         0.33
## Torture animal for fun     0.25   0.36            0.13         0.18
## Steal newspapers           0.25   0.14            0.19         0.57
## Litter                     0.20   0.15            0.20         0.25
## Cigaret littering          0.28   0.15            0.23         0.30
## Hit SO in anger            0.10   0.12            0.30         0.10
##                        Would tax cheat Stole glass from bar Used fake id
## Arrested                          0.14                 0.35         0.36
## Prison                            0.10                 0.19         0.13
## Punched in face                   0.16                 0.25         0.30
## Cheated exam                      0.44                 0.36         0.33
## Would tax cheat                   1.00                 0.27         0.27
## Stole glass from bar              0.27                 1.00         0.49
## Used fake id                      0.27                 0.49         1.00
## Torture animal for fun            0.23                 0.03         0.06
## Steal newspapers                  0.38                 0.37         0.21
## Litter                            0.20                 0.07         0.03
## Cigaret littering                 0.09                 0.12         0.12
## Hit SO in anger                   0.24                 0.07         0.20
##                        Torture animal for fun Steal newspapers Litter
## Arrested                                 0.25             0.25   0.20
## Prison                                   0.36             0.14   0.15
## Punched in face                          0.13             0.19   0.20
## Cheated exam                             0.18             0.57   0.25
## Would tax cheat                          0.23             0.38   0.20
## Stole glass from bar                     0.03             0.37   0.07
## Used fake id                             0.06             0.21   0.03
## Torture animal for fun                   1.00             0.18   0.19
## Steal newspapers                         0.18             1.00   0.33
## Litter                                   0.19             0.33   1.00
## Cigaret littering                        0.14             0.29   0.42
## Hit SO in anger                          0.34             0.10   0.20
##                        Cigaret littering Hit SO in anger
## Arrested                            0.28            0.10
## Prison                              0.15            0.12
## Punched in face                     0.23            0.30
## Cheated exam                        0.30            0.10
## Would tax cheat                     0.09            0.24
## Stole glass from bar                0.12            0.07
## Used fake id                        0.12            0.20
## Torture animal for fun              0.14            0.34
## Steal newspapers                    0.29            0.10
## Litter                              0.42            0.20
## Cigaret littering                   1.00            0.16
## Hit SO in anger                     0.16            1.00

#averages
crime_cors %>% MAT_half() %>% describe() %>% as.data.frame()

##    vars  n  mean    sd median trimmed   mad   min   max range skew
## X1    1 66 0.224 0.133  0.194   0.213 0.122 0.023 0.723   0.7 1.06
##    kurtosis     se
## X1     1.82 0.0163

crime_cors_full %>% MAT_half() %>% describe() %>% as.data.frame()

##    vars  n  mean    sd median trimmed   mad    min   max range skew
## X1    1 66 0.237 0.131   0.21   0.225 0.127 0.0269 0.731 0.704 1.08
##    kurtosis     se
## X1     1.84 0.0161

#compare full to subset data
#differences in correlations
(crime_cors - crime_cors_full) %>% MAT_half() %>% GG_denhist()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

(crime_cors - crime_cors_full) %>% MAT_half() %>% describe()

##    vars  n  mean   sd median trimmed  mad   min  max range skew kurtosis
## X1    1 66 -0.01 0.02  -0.01   -0.01 0.02 -0.06 0.05  0.11 0.04      0.2
##    se
## X1  0

Analysis 2 - General CAS score

Now we come to the question of how we should aggregate the CAS items into a single score. It’s not quite obvious the best way to do this. First we examine the pattern of missing data.

#missing data distribution
miss_plot(CAS)

miss_plot(CAS, reverse = T)

GG_save("figures/missing_data_tradeoff.png")
miss_plot(CAS, reverse = T, case = F)

#cases by var
n_by_var = map_int(CAS_vars, ~d[[.]] %>% count_NA(reverse = T)) %>% set_names(CAS_vars)

#complete cases by scale length
l_complete_data = map(seq_along(CAS), function(x) {
  CAS[, names(n_by_var)[1:x], drop = F] %>% na.omit()
})

#cases number with at least x
l_atleast_data = map(seq_along(CAS), function(x) {
  CAS %>% miss_filter(missing = x, reverse = T)
})

#output
data_frame(casewise_complete = map_int(l_complete_data, nrow) %>% set_names(seq_along(CAS)),
           fractional_complete = map_int(l_atleast_data, nrow) %>% set_names(seq_along(CAS))
           ) %>% 
  write_clipboard()

##    Casewise complete Fractional complete
## 1               9653               24514
## 2               6320               21672
## 3               5126               19090
## 4               2163               16779
## 5               1942               13830
## 6               1855               10783
## 7               1798                7882
## 8               1670                5066
## 9                864                3399
## 10               847                2148
## 11               831                1148
## 12               459                 459

#standard error of correlation
se_r = data.frame(n = 3:40000)
se_r$se_r = map_dbl(se_r$n, function(n) {sqrt((1-.15^2)/(n-2))})

ggplot(se_r, aes(n, se_r)) +
  geom_line() +
  scale_y_log10(breaks = helper_breaks(10^(-5:0)), name = "Standard error of correlation") +
  scale_x_continuous(breaks = seq(0, 40000, by = 5000)) +
  theme_bw()

GG_save("figures/se_r.png")

#subset to desired samples re. missing data
CAS2 = l_atleast_data[[7]]

#type variants
#some functions want specific types
CAS2_num = CAS2 %>% df_colFunc(as.numeric)
CAS2_lgl = CAS2 %>% df_colFunc(as.logical)
CAS2_fct = CAS2 %>% df_colFunc(as.factor)

First, we create an imputed version of the data.

#impute
#change items to logical for VIM
#then change back for psych
#impute
CAS2_imp = CAS2_fct %>% miss_impute() %>% map(~as.numeric(.) - 1) %>% as.data.frame()

#assert no missing
assert_that(sum(is.na(CAS2_imp)) == 0)

## [1] TRUE

#compare correlations with unimputed data
CAS2_cors = psych::tetrachoric(CAS2_imp)$rho

CAS2_imp_cors = psych::tetrachoric(CAS2)$rho

#describe
CAS2_cors %>% MAT_half() %>% describe()

##    vars  n mean   sd median trimmed  mad   min  max range skew kurtosis
## X1    1 66 0.26 0.19   0.22    0.25 0.19 -0.04 0.99  1.03 1.08     1.72
##      se
## X1 0.02

CAS2_imp_cors %>% MAT_half() %>% describe()

##    vars  n mean   sd median trimmed  mad min  max range skew kurtosis   se
## X1    1 66 0.22 0.14    0.2    0.21 0.14   0 0.74  0.74    1     1.93 0.02

#differences
CAS2_imp_cors - CAS2_cors

##                        arrested   prison punched_in_face cheated_exam
## arrested                0.00000 -0.00533        -0.06158      -0.2270
## prison                 -0.00533  0.00000         0.00984      -0.0527
## punched_in_face        -0.06158  0.00984         0.00000      -0.1184
## cheated_exam           -0.22696 -0.05268        -0.11841       0.0000
## would_tax_cheat        -0.08937 -0.04049        -0.04813      -0.2041
## stole_glass_from_bar   -0.04266 -0.00601        -0.04222      -0.0559
## used_fake_id           -0.08093 -0.03093        -0.03196      -0.1163
## torture_animal_for_fun  0.03568  0.02939         0.00679      -0.0432
## steal_newspapers       -0.27553 -0.13448        -0.18082      -0.4203
## litter                 -0.00475  0.00939        -0.00663      -0.1176
## cigaret_littering      -0.01596  0.02737        -0.00969      -0.0873
## hit_SO_in_anger         0.01152 -0.02858         0.04572       0.0832
##                        would_tax_cheat stole_glass_from_bar used_fake_id
## arrested                       -0.0894             -0.04266    -0.080927
## prison                         -0.0405             -0.00601    -0.030933
## punched_in_face                -0.0481             -0.04222    -0.031958
## cheated_exam                   -0.2041             -0.05586    -0.116269
## would_tax_cheat                 0.0000             -0.04696    -0.018983
## stole_glass_from_bar           -0.0470              0.00000    -0.042415
## used_fake_id                   -0.0190             -0.04242     0.000000
## torture_animal_for_fun          0.0240              0.00226     0.027054
## steal_newspapers               -0.2384             -0.05607    -0.232859
## litter                         -0.0324             -0.00787    -0.000533
## cigaret_littering              -0.0353             -0.01301     0.004859
## hit_SO_in_anger                 0.0789              0.02829     0.097989
##                        torture_animal_for_fun steal_newspapers    litter
## arrested                              0.03568          -0.2755 -0.004750
## prison                                0.02939          -0.1345  0.009388
## punched_in_face                       0.00679          -0.1808 -0.006628
## cheated_exam                         -0.04318          -0.4203 -0.117557
## would_tax_cheat                       0.02400          -0.2384 -0.032391
## stole_glass_from_bar                  0.00226          -0.0561 -0.007875
## used_fake_id                          0.02705          -0.2329 -0.000533
## torture_animal_for_fun                0.00000           0.0144  0.016487
## steal_newspapers                      0.01443           0.0000 -0.079137
## litter                                0.01649          -0.0791  0.000000
## cigaret_littering                     0.04342          -0.1476 -0.001534
## hit_SO_in_anger                       0.03578           0.0464  0.053359
##                        cigaret_littering hit_SO_in_anger
## arrested                        -0.01596          0.0115
## prison                           0.02737         -0.0286
## punched_in_face                 -0.00969          0.0457
## cheated_exam                    -0.08730          0.0832
## would_tax_cheat                 -0.03529          0.0789
## stole_glass_from_bar            -0.01301          0.0283
## used_fake_id                     0.00486          0.0980
## torture_animal_for_fun           0.04342          0.0358
## steal_newspapers                -0.14761          0.0464
## litter                          -0.00153          0.0534
## cigaret_littering                0.00000          0.1112
## hit_SO_in_anger                  0.11120          0.0000

(CAS2_imp_cors - CAS2_cors) %>% MAT_half() %>% describe()

##    vars  n  mean   sd median trimmed  mad   min  max range  skew kurtosis
## X1    1 66 -0.04 0.09  -0.02   -0.03 0.06 -0.42 0.11  0.53 -1.52     3.06
##      se
## X1 0.01

We try IRT scoring based on the dataset with missing data.

#factor analyze
(CAS2_irt = irt.fa(CAS2_num))

## Item Response Analysis using Factor Analysis 
## 
## Call: irt.fa(x = CAS2_num)
## Item Response Analysis using Factor Analysis  
## 
##  Summary information by factor and item
##  Factor =  1 
##                           -3    -2   -1    0    1    2     3
## arrested                0.01  0.03 0.11 0.30 0.48 0.31  0.11
## prison                  0.01  0.02 0.03 0.07 0.12 0.17  0.16
## punched_in_face         0.03  0.07 0.14 0.22 0.23 0.15  0.07
## cheated_exam            0.01  0.06 0.22 0.51 0.46 0.17  0.04
## would_tax_cheat         0.04  0.08 0.13 0.17 0.16 0.11  0.06
## stole_glass_from_bar    0.03  0.08 0.17 0.26 0.24 0.14  0.06
## used_fake_id            0.02  0.05 0.11 0.19 0.23 0.17  0.09
## torture_animal_for_fun  0.02  0.03 0.04 0.06 0.07 0.08  0.07
## steal_newspapers        0.03  0.08 0.20 0.32 0.28 0.14  0.05
## litter                  0.03  0.06 0.10 0.13 0.13 0.10  0.07
## cigaret_littering       0.02  0.04 0.07 0.11 0.15 0.15  0.11
## hit_SO_in_anger         0.02  0.03 0.04 0.06 0.07 0.07  0.07
## Test Info               0.26  0.61 1.36 2.41 2.62 1.77  0.97
## SEM                     1.96  1.28 0.86 0.64 0.62 0.75  1.01
## Reliability            -2.83 -0.64 0.26 0.58 0.62 0.43 -0.03
## 
## Factor analysis with Call: fa(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, 
##     fm = fm)
## 
## Test of the hypothesis that 1 factor is sufficient.
## The degrees of freedom for the model is 54  and the objective function was  1.81 
## The number of observations was  7882  with Chi Square =  14218  with prob <  0 
## 
## The root mean square of the residuals (RMSA) is  0.12 
## The df corrected root mean square of the residuals is  0.13 
## 
## Tucker Lewis Index of factoring reliability =  0.374
## RMSEA index =  0.182  and the 10 % confidence intervals are  0.18 0.185
## BIC =  13733

#loadings
fa_plot_loadings(CAS2_irt$fa) +
  scale_y_continuous(breaks = seq(0, 1, .1))

GG_save("figures/irt_loadings.png")

#score
CAS2_scores = scoreIrt(CAS2_irt, items = CAS2_num)
CAS2_scores$crime_score = CAS2_scores$theta1 %>% standardize()
CAS2_scores$id = rownames(CAS2) %>% as.integer()

#distribution of crime scores
GG_denhist(CAS2_scores, "crime_score") +
  scale_x_continuous(name = "General CAS score - irt")

## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

GG_save("figures/crime_score_dist.png")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

What if we impute the missing data beforehand? Maybe the odd results are due to incorrect handling of missing data, so we impute the data beforehand.

#factor analyze again
(CAS2_imp_irt = irt.fa(CAS2_imp, sort = F))

## Item Response Analysis using Factor Analysis 
## 
## Call: irt.fa(x = CAS2_imp, sort = F)
## Item Response Analysis using Factor Analysis  
## 
##  Summary information by factor and item
##  Factor =  1 
##                         -3    -2    -1    0    1    2     3
## arrested              0.00  0.02  0.08 0.30 0.62 0.42  0.12
## prison                0.01  0.02  0.03 0.06 0.10 0.14  0.14
## punched_in_face       0.03  0.06  0.13 0.22 0.23 0.15  0.08
## cheated_exam          0.00  0.00  0.00 0.08 2.85 0.89  0.02
## would_tax_cheat       0.03  0.07  0.18 0.30 0.28 0.15  0.06
## stole_glass_from_bar  0.03  0.07  0.15 0.23 0.23 0.14  0.07
## used_fake_id          0.02  0.05  0.11 0.21 0.26 0.20  0.10
## steal_newspapers      0.00  0.00  0.01 0.19 3.05 0.56  0.02
## litter                0.03  0.06  0.10 0.12 0.13 0.10  0.07
## cigaret_littering     0.02  0.04  0.07 0.11 0.14 0.14  0.11
## Test Info             0.16  0.39  0.86 1.82 7.88 2.90  0.77
## SEM                   2.46  1.60  1.08 0.74 0.36 0.59  1.14
## Reliability          -5.06 -1.57 -0.17 0.45 0.87 0.65 -0.29
## 
## Factor analysis with Call: fa(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, 
##     fm = fm)
## 
## Test of the hypothesis that 1 factor is sufficient.
## The degrees of freedom for the model is 54  and the objective function was  4.26 
## The number of observations was  7882  with Chi Square =  33566  with prob <  0 
## 
## The root mean square of the residuals (RMSA) is  0.12 
## The df corrected root mean square of the residuals is  0.13 
## 
## Tucker Lewis Index of factoring reliability =  0.355
## RMSEA index =  0.281  and the 10 % confidence intervals are  0.278 0.283
## BIC =  33082

#loadings
fa_plot_loadings(CAS2_imp_irt$fa)

#compare loadings
CAS2_irts = list('without pre-imputation' = CAS2_irt$fa,
                          'with pre-imputation' = CAS2_imp_irt$fa)
fa_congruence_matrix(CAS2_irts)

##      MR1  MR1
## MR1 1.00 0.97
## MR1 0.97 1.00

fa_plot_loadings(CAS2_irts)

#score
CAS2_imp_scores = scoreIrt(CAS2_imp_irt, items = CAS2_imp)
CAS2_imp_scores$crime_score_imp = CAS2_imp_scores$theta1 %>% standardize()
CAS2_imp_scores$id = rownames(CAS2) %>% as.integer()

#distribution of crime scores
GG_denhist(CAS2_imp_scores, "crime_score_imp") +
  xlab("General CAS score (from imputed data)")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Compare scoring methods.

#compare scores
score_comparison = data_frame(crime_sum_imp = rowSums(CAS2_imp),
                              crime_score = CAS2_scores$crime_score,
                              crime_score_imp = CAS2_imp_scores$crime_score,
                              CAS2_complete = miss_by_case(CAS2, reverse = T),
                              id = rownames(CAS2) %>% as.integer()
                              )

#assert all ids in subset in complete sample
assert_that(all(score_comparison$id %in% d$id))

## [1] TRUE

#join to main
d = full_join(score_comparison, d, by = "id")

#correlations
wtd.cors(d[c("crime_sum_imp", "crime_score", "crime_score_imp", "CA", "CAS2_complete")]) %>% 
  write_clipboard()

##                 Crime sum imp Crime score Crime score imp    CA
## Crime sum imp            1.00        0.90            0.94 -0.18
## Crime score              0.90        1.00            0.82 -0.13
## Crime score imp          0.94        0.82            1.00 -0.16
## CA                      -0.18       -0.13           -0.16  1.00
## CAS2 complete            0.04        0.05            0.09 -0.03
##                 CAS2 complete
## Crime sum imp            0.04
## Crime score              0.05
## Crime score imp          0.09
## CA                      -0.03
## CAS2 complete            1.00

#CA and crime
GG_scatter(d, "CA", "crime_sum_imp")

d$crime_sum_imp_fct = d$crime_sum_imp %>% factor() %>% fct_relevel(0:11)
GG_group_means(d, "CA", "crime_sum_imp_fct") +
  scale_x_discrete("Simple criminality and anti-social score (out of 12)") +
  scale_y_continuous("Cognitive ability (z-score)")

## Missing values were removed.

## Warning in qt(1 - ((1 - CI)/2), df = as.numeric(x[4]) - 1): NaNs produced

## Warning in qt(1 - ((1 - CI)/2), df = as.numeric(x[4]) - 1): NaNs produced

## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.

## Warning: Removed 2 rows containing missing values (geom_errorbar).

GG_save("figures/crime_sum_ca.png")

## Warning: Removed 2 rows containing missing values (geom_errorbar).

#table
table2(d$crime_sum_imp_fct)

## # A tibble: 13 x 3
##    Group Count  Percent
##    <chr> <dbl>    <dbl>
##  1  <NA> 17933 69.46736
##  2     1  1782  6.90296
##  3     0  1678  6.50010
##  4     2  1215  4.70657
##  5     3   833  3.22681
##  6     4   812  3.14546
##  7     5   625  2.42107
##  8     6   460  1.78191
##  9     7   304  1.17761
## 10     8   135  0.52295
## 11     9    36  0.13945
## 12    10     1  0.00387
## 13    11     1  0.00387

Analysis 3 - Multivariate modeling

Now we try to see if the relation is due to some obvious confound.

#fit models
ols_fits = list(
  primary = rms::ols(MOD_add_str_to_formula(crime_score_imp ~ CA + gender_orientation + rcs(age), SIRE_dummies), data = d),
  SIRE_CC = rms::ols(crime_score_imp ~ CA + gender_orientation + rcs(age) + SIRE_CC, data = d),
  whites = rms::ols(crime_score_imp ~ CA + gender_orientation + rcs(age), data = d %>% dplyr::filter(race == "White")),
  ca_gender_interaction = rms::ols(MOD_add_str_to_formula(crime_score_imp ~ CA * gender_orientation + rcs(age), SIRE_dummies), data = d),
  no_low_CA = rms::ols(MOD_add_str_to_formula(crime_score_imp ~ CA + gender_orientation + rcs(age), SIRE_dummies), data = d %>% dplyr::filter(CA > -2)),
  no_low_CA_gender_interaction = rms::ols(MOD_add_str_to_formula(crime_score_imp ~ CA * gender_orientation + rcs(age), SIRE_dummies), data = d %>% dplyr::filter(CA > -2)),
  nonlinear_ca = rms::ols(MOD_add_str_to_formula(crime_score_imp ~ rcs(CA) * gender_orientation + rcs(age), SIRE_dummies), data = d %>% dplyr::filter(CA > -2)),
  
  #alternative GSO codings
  ordinal_GSO = rms::ols(MOD_add_str_to_formula(crime_score_imp ~ CA * gender_orientation_ord + rcs(age), SIRE_dummies), data = d %>% dplyr::filter(CA > -2)),
  numeric_GSO = rms::ols(MOD_add_str_to_formula(crime_score_imp ~ CA * gender_orientation_num + rcs(age), SIRE_dummies), data = d %>% dplyr::filter(CA > -2)),
  
  #baselines
  demo_only = rms::ols(MOD_add_str_to_formula(crime_score_imp ~ 1, SIRE_dummies), data = d),
  demo_only2 = rms::ols(crime_score_imp ~ SIRE_CC, data = d),
  gender_orientation = rms::ols(crime_score_imp ~ gender_orientation, data = d)
)

ols_fits

## $primary
## Frequencies of Missing Values Due to Each Variable
##    crime_score_imp                 CA gender_orientation 
##              17933                  0                196 
##                age              White              Black 
##                  0               1741               1741 
##              Asian           Hispanic    Native_American 
##               1741               1741               1741 
##             Indian     Middle_Eastern   Pacific_Islander 
##               1741               1741               1741 
##              Other         Multi_SIRE 
##               1741               1741 
## 
## Linear Regression Model
##  
##  rms::ols(formula = MOD_add_str_to_formula(crime_score_imp ~ CA + 
##      gender_orientation + rcs(age), SIRE_dummies), data = d)
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    7410    LR chi2    323.19    R2       0.043    
##  sigma0.9790    d.f.           20    R2 adj   0.040    
##  d.f.   7389    Pr(> chi2) 0.0000    g        0.232    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -1.7683 -0.8129 -0.3335  1.0633  2.2378 
##  
##  
##                                         Coef    S.E.   t      Pr(>|t|)
##  Intercept                               0.6253 0.3021   2.07 0.0385  
##  CA                                     -0.1530 0.0107 -14.25 <0.0001 
##  gender_orientation=Bisexual_male        0.0565 0.0792   0.71 0.4756  
##  gender_orientation=Homosexual_male     -0.0853 0.0557  -1.53 0.1257  
##  gender_orientation=Homosexual_female   -0.0956 0.1076  -0.89 0.3745  
##  gender_orientation=Bisexual_female     -0.1573 0.0575  -2.73 0.0063  
##  gender_orientation=Heterosexual_female -0.2645 0.0289  -9.15 <0.0001 
##  age                                    -0.0192 0.0115  -1.66 0.0965  
##  age'                                    0.0098 0.0810   0.12 0.9041  
##  age''                                   0.1584 0.3302   0.48 0.6315  
##  age'''                                 -0.4100 0.4227  -0.97 0.3321  
##  White                                  -0.0268 0.0493  -0.54 0.5859  
##  Black                                  -0.0298 0.0628  -0.48 0.6348  
##  Asian                                  -0.0532 0.0640  -0.83 0.4057  
##  Hispanic                                0.0738 0.0591   1.25 0.2119  
##  Native_American                        -0.0358 0.0774  -0.46 0.6441  
##  Indian                                 -0.0174 0.0966  -0.18 0.8572  
##  Middle_Eastern                          0.1478 0.0993   1.49 0.1368  
##  Pacific_Islander                        0.0487 0.1289   0.38 0.7054  
##  Other                                   0.2061 0.0625   3.30 0.0010  
##  Multi_SIRE                              0.0380 0.0624   0.61 0.5423  
##  
## 
## $SIRE_CC
## Frequencies of Missing Values Due to Each Variable
##    crime_score_imp                 CA gender_orientation 
##              17933                  0                196 
##                age            SIRE_CC 
##                  0                  0 
## 
## Linear Regression Model
##  
##  rms::ols(formula = crime_score_imp ~ CA + gender_orientation + 
##      rcs(age) + SIRE_CC, data = d)
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    7796    LR chi2    327.28    R2       0.041    
##  sigma0.9817    d.f.           22    R2 adj   0.038    
##  d.f.   7773    Pr(> chi2) 0.0000    g        0.229    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -1.7881 -0.8149 -0.3325  1.0743  2.2333 
##  
##  
##                                         Coef    S.E.   t      Pr(>|t|)
##  Intercept                               0.5554 0.2904   1.91 0.0558  
##  CA                                     -0.1511 0.0105 -14.43 <0.0001 
##  gender_orientation=Bisexual_male        0.0157 0.0764   0.21 0.8372  
##  gender_orientation=Homosexual_male     -0.0981 0.0546  -1.80 0.0724  
##  gender_orientation=Homosexual_female   -0.1472 0.1043  -1.41 0.1582  
##  gender_orientation=Bisexual_female     -0.1651 0.0555  -2.97 0.0030  
##  gender_orientation=Heterosexual_female -0.2623 0.0282  -9.31 <0.0001 
##  age                                    -0.0174 0.0113  -1.54 0.1227  
##  age'                                   -0.0059 0.0790  -0.08 0.9402  
##  age''                                   0.2300 0.3221   0.71 0.4751  
##  age'''                                 -0.5107 0.4121  -1.24 0.2153  
##  SIRE_CC=NA                              0.0563 0.0517   1.09 0.2763  
##  SIRE_CC=Other_combos                    0.1116 0.0517   2.16 0.0310  
##  SIRE_CC=Asian                          -0.0333 0.0621  -0.54 0.5925  
##  SIRE_CC=Hispanic / Latin                0.1176 0.0654   1.80 0.0724  
##  SIRE_CC=Black                           0.0188 0.0653   0.29 0.7731  
##  SIRE_CC=Other                           0.2535 0.0772   3.28 0.0010  
##  SIRE_CC=Hispanic / Latin, White         0.1084 0.0835   1.30 0.1945  
##  SIRE_CC=White, Other                    0.2888 0.1107   2.61 0.0091  
##  SIRE_CC=Native American, White          0.1099 0.1076   1.02 0.3070  
##  SIRE_CC=Indian                         -0.0689 0.1157  -0.60 0.5516  
##  SIRE_CC=Asian, White                    0.0423 0.1320   0.32 0.7487  
##  SIRE_CC=Black, White                    0.0958 0.1691   0.57 0.5712  
##  
## 
## $whites
## Frequencies of Missing Values Due to Each Variable
##    crime_score_imp                 CA gender_orientation 
##              12552                  0                129 
##                age 
##                  0 
## 
## Linear Regression Model
##  
##  rms::ols(formula = crime_score_imp ~ CA + gender_orientation + 
##      rcs(age), data = d %>% dplyr::filter(race == "White"))
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    5647    LR chi2    224.39    R2       0.039    
##  sigma0.9788    d.f.           10    R2 adj   0.037    
##  d.f.   5636    Pr(> chi2) 0.0000    g        0.222    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -1.6592 -0.8138 -0.3425  1.0693  2.2335 
##  
##  
##                                         Coef    S.E.   t      Pr(>|t|)
##  Intercept                               0.7435 0.3389   2.19 0.0283  
##  CA                                     -0.1628 0.0123 -13.25 <0.0001 
##  gender_orientation=Bisexual_male       -0.0240 0.0929  -0.26 0.7963  
##  gender_orientation=Homosexual_male     -0.0661 0.0673  -0.98 0.3255  
##  gender_orientation=Homosexual_female   -0.0570 0.1279  -0.45 0.6561  
##  gender_orientation=Bisexual_female     -0.1813 0.0666  -2.72 0.0065  
##  gender_orientation=Heterosexual_female -0.2461 0.0332  -7.41 <0.0001 
##  age                                    -0.0257 0.0129  -1.99 0.0471  
##  age'                                    0.0897 0.1126   0.80 0.4258  
##  age''                                  -0.1171 0.3751  -0.31 0.7550  
##  age'''                                 -0.0784 0.4001  -0.20 0.8446  
##  
## 
## $ca_gender_interaction
## Frequencies of Missing Values Due to Each Variable
##    crime_score_imp                 CA gender_orientation 
##              17933                  0                196 
##                age              White              Black 
##                  0               1741               1741 
##              Asian           Hispanic    Native_American 
##               1741               1741               1741 
##             Indian     Middle_Eastern   Pacific_Islander 
##               1741               1741               1741 
##              Other         Multi_SIRE 
##               1741               1741 
## 
## Linear Regression Model
##  
##  rms::ols(formula = MOD_add_str_to_formula(crime_score_imp ~ CA * 
##      gender_orientation + rcs(age), SIRE_dummies), data = d)
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    7410    LR chi2    331.84    R2       0.044    
##  sigma0.9788    d.f.           25    R2 adj   0.041    
##  d.f.   7384    Pr(> chi2) 0.0000    g        0.234    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -1.8093 -0.8253 -0.3391  1.0645  2.1559 
##  
##  
##                                              Coef    S.E.   t      Pr(>|t|)
##  Intercept                                    0.6080 0.3025   2.01 0.0445  
##  CA                                          -0.1712 0.0129 -13.23 <0.0001 
##  gender_orientation=Bisexual_male             0.0607 0.0795   0.76 0.4449  
##  gender_orientation=Homosexual_male          -0.0816 0.0571  -1.43 0.1528  
##  gender_orientation=Homosexual_female        -0.0410 0.1144  -0.36 0.7199  
##  gender_orientation=Bisexual_female          -0.1559 0.0576  -2.71 0.0068  
##  gender_orientation=Heterosexual_female      -0.2510 0.0294  -8.53 <0.0001 
##  age                                         -0.0187 0.0116  -1.62 0.1053  
##  age'                                         0.0102 0.0811   0.13 0.8999  
##  age''                                        0.1533 0.3304   0.46 0.6427  
##  age'''                                      -0.4016 0.4228  -0.95 0.3422  
##  White                                       -0.0252 0.0493  -0.51 0.6090  
##  Black                                       -0.0256 0.0629  -0.41 0.6835  
##  Asian                                       -0.0505 0.0640  -0.79 0.4301  
##  Hispanic                                     0.0750 0.0591   1.27 0.2044  
##  Native_American                             -0.0357 0.0775  -0.46 0.6445  
##  Indian                                      -0.0152 0.0966  -0.16 0.8752  
##  Middle_Eastern                               0.1502 0.0994   1.51 0.1307  
##  Pacific_Islander                             0.0461 0.1289   0.36 0.7209  
##  Other                                        0.2061 0.0625   3.30 0.0010  
##  Multi_SIRE                                   0.0353 0.0624   0.57 0.5714  
##  CA * gender_orientation=Bisexual_male       -0.0042 0.0768  -0.05 0.9568  
##  CA * gender_orientation=Homosexual_male      0.0257 0.0528   0.49 0.6270  
##  CA * gender_orientation=Homosexual_female    0.1681 0.1102   1.53 0.1272  
##  CA * gender_orientation=Bisexual_female      0.0416 0.0583   0.71 0.4752  
##  CA * gender_orientation=Heterosexual_female  0.0670 0.0264   2.53 0.0113  
##  
## 
## $no_low_CA
## Frequencies of Missing Values Due to Each Variable
##    crime_score_imp                 CA gender_orientation 
##              17381                  0                192 
##                age              White              Black 
##                  0               1694               1694 
##              Asian           Hispanic    Native_American 
##               1694               1694               1694 
##             Indian     Middle_Eastern   Pacific_Islander 
##               1694               1694               1694 
##              Other         Multi_SIRE 
##               1694               1694 
## 
## Linear Regression Model
##  
##  rms::ols(formula = MOD_add_str_to_formula(crime_score_imp ~ CA + 
##      gender_orientation + rcs(age), SIRE_dummies), data = d %>% 
##      dplyr::filter(CA > -2))
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    7002    LR chi2    306.43    R2       0.043    
##  sigma0.9791    d.f.           20    R2 adj   0.040    
##  d.f.   6981    Pr(> chi2) 0.0000    g        0.234    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -1.7026 -0.8116 -0.3400  1.0646  2.2701 
##  
##  
##                                         Coef    S.E.   t      Pr(>|t|)
##  Intercept                               0.7206 0.3113   2.31 0.0207  
##  CA                                     -0.1776 0.0127 -14.03 <0.0001 
##  gender_orientation=Bisexual_male        0.0635 0.0807   0.79 0.4314  
##  gender_orientation=Homosexual_male     -0.0913 0.0571  -1.60 0.1098  
##  gender_orientation=Homosexual_female   -0.1133 0.1103  -1.03 0.3044  
##  gender_orientation=Bisexual_female     -0.1851 0.0589  -3.14 0.0017  
##  gender_orientation=Heterosexual_female -0.2593 0.0299  -8.67 <0.0001 
##  age                                    -0.0226 0.0119  -1.89 0.0583  
##  age'                                    0.0238 0.0774   0.31 0.7584  
##  age''                                   0.1013 0.3176   0.32 0.7497  
##  age'''                                 -0.3430 0.4117  -0.83 0.4047  
##  White                                  -0.0220 0.0507  -0.43 0.6648  
##  Black                                  -0.0335 0.0651  -0.51 0.6076  
##  Asian                                  -0.0525 0.0658  -0.80 0.4249  
##  Hispanic                                0.0738 0.0615   1.20 0.2304  
##  Native_American                        -0.0458 0.0811  -0.56 0.5724  
##  Indian                                  0.0173 0.0983   0.18 0.8600  
##  Middle_Eastern                          0.1397 0.1026   1.36 0.1737  
##  Pacific_Islander                        0.0752 0.1340   0.56 0.5746  
##  Other                                   0.2129 0.0644   3.31 0.0009  
##  Multi_SIRE                              0.0365 0.0641   0.57 0.5694  
##  
## 
## $no_low_CA_gender_interaction
## Frequencies of Missing Values Due to Each Variable
##    crime_score_imp                 CA gender_orientation 
##              17381                  0                192 
##                age              White              Black 
##                  0               1694               1694 
##              Asian           Hispanic    Native_American 
##               1694               1694               1694 
##             Indian     Middle_Eastern   Pacific_Islander 
##               1694               1694               1694 
##              Other         Multi_SIRE 
##               1694               1694 
## 
## Linear Regression Model
##  
##  rms::ols(formula = MOD_add_str_to_formula(crime_score_imp ~ CA * 
##      gender_orientation + rcs(age), SIRE_dummies), data = d %>% 
##      dplyr::filter(CA > -2))
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    7002    LR chi2    318.21    R2       0.044    
##  sigma0.9786    d.f.           25    R2 adj   0.041    
##  d.f.   6976    Pr(> chi2) 0.0000    g        0.238    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -1.7306 -0.8172 -0.3433  1.0683  2.2034 
##  
##  
##                                              Coef    S.E.   t      Pr(>|t|)
##  Intercept                                    0.7199 0.3114   2.31 0.0208  
##  CA                                          -0.1995 0.0152 -13.08 <0.0001 
##  gender_orientation=Bisexual_male             0.0719 0.0824   0.87 0.3828  
##  gender_orientation=Homosexual_male          -0.0913 0.0576  -1.58 0.1134  
##  gender_orientation=Homosexual_female        -0.0601 0.1145  -0.52 0.5999  
##  gender_orientation=Bisexual_female          -0.2066 0.0600  -3.44 0.0006  
##  gender_orientation=Heterosexual_female      -0.2569 0.0299  -8.58 <0.0001 
##  age                                         -0.0226 0.0119  -1.90 0.0581  
##  age'                                         0.0255 0.0774   0.33 0.7418  
##  age''                                        0.0938 0.3176   0.30 0.7677  
##  age'''                                      -0.3342 0.4116  -0.81 0.4168  
##  White                                       -0.0201 0.0507  -0.40 0.6922  
##  Black                                       -0.0305 0.0651  -0.47 0.6397  
##  Asian                                       -0.0505 0.0658  -0.77 0.4424  
##  Hispanic                                     0.0792 0.0615   1.29 0.1980  
##  Native_American                             -0.0396 0.0812  -0.49 0.6252  
##  Indian                                       0.0200 0.0982   0.20 0.8389  
##  Middle_Eastern                               0.1435 0.1027   1.40 0.1624  
##  Pacific_Islander                             0.0708 0.1341   0.53 0.5972  
##  Other                                        0.2164 0.0644   3.36 0.0008  
##  Multi_SIRE                                   0.0299 0.0641   0.47 0.6414  
##  CA * gender_orientation=Bisexual_male       -0.0307 0.0893  -0.34 0.7312  
##  CA * gender_orientation=Homosexual_male      0.0344 0.0595   0.58 0.5629  
##  CA * gender_orientation=Homosexual_female    0.2598 0.1339   1.94 0.0524  
##  CA * gender_orientation=Bisexual_female      0.1486 0.0691   2.15 0.0315  
##  CA * gender_orientation=Heterosexual_female  0.0656 0.0315   2.08 0.0373  
##  
## 
## $nonlinear_ca
## Frequencies of Missing Values Due to Each Variable
##    crime_score_imp                 CA gender_orientation 
##              17381                  0                192 
##                age              White              Black 
##                  0               1694               1694 
##              Asian           Hispanic    Native_American 
##               1694               1694               1694 
##             Indian     Middle_Eastern   Pacific_Islander 
##               1694               1694               1694 
##              Other         Multi_SIRE 
##               1694               1694 
## 
## Linear Regression Model
##  
##  rms::ols(formula = MOD_add_str_to_formula(crime_score_imp ~ rcs(CA) * 
##      gender_orientation + rcs(age), SIRE_dummies), data = d %>% 
##      dplyr::filter(CA > -2))
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    7002    LR chi2    340.82    R2       0.048    
##  sigma0.9783    d.f.           43    R2 adj   0.042    
##  d.f.   6958    Pr(> chi2) 0.0000    g        0.249    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -1.7057 -0.7924 -0.3386  1.0539  2.2156 
##  
##  
##                                                 Coef    S.E.    t    
##  Intercept                                       0.8827  0.3281  2.69
##  CA                                             -0.0363  0.0762 -0.48
##  CA'                                            -0.3627  0.3094 -1.17
##  CA''                                            1.2594  1.6048  0.78
##  CA'''                                          -2.6788  3.4672 -0.77
##  gender_orientation=Bisexual_male               -0.2476  0.5602 -0.44
##  gender_orientation=Homosexual_male             -0.0502  0.3450 -0.15
##  gender_orientation=Homosexual_female           -0.5828  0.6496 -0.90
##  gender_orientation=Bisexual_female             -0.3161  0.3994 -0.79
##  gender_orientation=Heterosexual_female         -0.3382  0.1868 -1.81
##  age                                            -0.0223  0.0120 -1.86
##  age'                                            0.0259  0.0776  0.33
##  age''                                           0.0895  0.3183  0.28
##  age'''                                         -0.3266  0.4122 -0.79
##  White                                          -0.0185  0.0508 -0.36
##  Black                                          -0.0302  0.0653 -0.46
##  Asian                                          -0.0499  0.0658 -0.76
##  Hispanic                                        0.0778  0.0616  1.26
##  Native_American                                -0.0357  0.0812 -0.44
##  Indian                                          0.0290  0.0984  0.29
##  Middle_Eastern                                  0.1492  0.1028  1.45
##  Pacific_Islander                                0.0717  0.1342  0.53
##  Other                                           0.2175  0.0645  3.37
##  Multi_SIRE                                      0.0260  0.0642  0.41
##  CA * gender_orientation=Bisexual_male          -0.2391  0.4490 -0.53
##  CA' * gender_orientation=Bisexual_male          0.9892  1.7574  0.56
##  CA'' * gender_orientation=Bisexual_male        -3.9309  8.9611 -0.44
##  CA''' * gender_orientation=Bisexual_male        3.3185 19.0625  0.17
##  CA * gender_orientation=Homosexual_male         0.0402  0.2634  0.15
##  CA' * gender_orientation=Homosexual_male       -0.2715  1.1502 -0.24
##  CA'' * gender_orientation=Homosexual_male       1.8147  6.3067  0.29
##  CA''' * gender_orientation=Homosexual_male     -3.8032 14.3955 -0.26
##  CA * gender_orientation=Homosexual_female      -0.1878  0.5257 -0.36
##  CA' * gender_orientation=Homosexual_female      1.3717  2.2558  0.61
##  CA'' * gender_orientation=Homosexual_female    -5.0324 13.8470 -0.36
##  CA''' * gender_orientation=Homosexual_female    5.6123 41.9161  0.13
##  CA * gender_orientation=Bisexual_female         0.0339  0.3288  0.10
##  CA' * gender_orientation=Bisexual_female        0.3932  1.2402  0.32
##  CA'' * gender_orientation=Bisexual_female      -3.2927  6.3190 -0.52
##  CA''' * gender_orientation=Bisexual_female     12.6939 13.7328  0.92
##  CA * gender_orientation=Heterosexual_female    -0.0396  0.1454 -0.27
##  CA' * gender_orientation=Heterosexual_female   -0.0474  0.5900 -0.08
##  CA'' * gender_orientation=Heterosexual_female   1.3642  3.1124  0.44
##  CA''' * gender_orientation=Heterosexual_female -3.5118  7.0504 -0.50
##                                                 Pr(>|t|)
##  Intercept                                      0.0071  
##  CA                                             0.6340  
##  CA'                                            0.2411  
##  CA''                                           0.4326  
##  CA'''                                          0.4398  
##  gender_orientation=Bisexual_male               0.6585  
##  gender_orientation=Homosexual_male             0.8843  
##  gender_orientation=Homosexual_female           0.3697  
##  gender_orientation=Bisexual_female             0.4287  
##  gender_orientation=Heterosexual_female         0.0703  
##  age                                            0.0623  
##  age'                                           0.7391  
##  age''                                          0.7785  
##  age'''                                         0.4283  
##  White                                          0.7161  
##  Black                                          0.6431  
##  Asian                                          0.4483  
##  Hispanic                                       0.2067  
##  Native_American                                0.6601  
##  Indian                                         0.7682  
##  Middle_Eastern                                 0.1469  
##  Pacific_Islander                               0.5931  
##  Other                                          0.0007  
##  Multi_SIRE                                     0.6851  
##  CA * gender_orientation=Bisexual_male          0.5945  
##  CA' * gender_orientation=Bisexual_male         0.5735  
##  CA'' * gender_orientation=Bisexual_male        0.6609  
##  CA''' * gender_orientation=Bisexual_male       0.8618  
##  CA * gender_orientation=Homosexual_male        0.8788  
##  CA' * gender_orientation=Homosexual_male       0.8134  
##  CA'' * gender_orientation=Homosexual_male      0.7736  
##  CA''' * gender_orientation=Homosexual_male     0.7916  
##  CA * gender_orientation=Homosexual_female      0.7210  
##  CA' * gender_orientation=Homosexual_female     0.5432  
##  CA'' * gender_orientation=Homosexual_female    0.7163  
##  CA''' * gender_orientation=Homosexual_female   0.8935  
##  CA * gender_orientation=Bisexual_female        0.9179  
##  CA' * gender_orientation=Bisexual_female       0.7512  
##  CA'' * gender_orientation=Bisexual_female      0.6023  
##  CA''' * gender_orientation=Bisexual_female     0.3553  
##  CA * gender_orientation=Heterosexual_female    0.7853  
##  CA' * gender_orientation=Heterosexual_female   0.9360  
##  CA'' * gender_orientation=Heterosexual_female  0.6612  
##  CA''' * gender_orientation=Heterosexual_female 0.6184  
##  
## 
## $ordinal_GSO
## Frequencies of Missing Values Due to Each Variable
##        crime_score_imp                     CA gender_orientation_ord 
##                  17381                      0                    192 
##                    age                  White                  Black 
##                      0                   1694                   1694 
##                  Asian               Hispanic        Native_American 
##                   1694                   1694                   1694 
##                 Indian         Middle_Eastern       Pacific_Islander 
##                   1694                   1694                   1694 
##                  Other             Multi_SIRE 
##                   1694                   1694 
## 
## Linear Regression Model
##  
##  rms::ols(formula = MOD_add_str_to_formula(crime_score_imp ~ CA * 
##      gender_orientation_ord + rcs(age), SIRE_dummies), data = d %>% 
##      dplyr::filter(CA > -2))
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    7002    LR chi2    318.21    R2       0.044    
##  sigma0.9786    d.f.           25    R2 adj   0.041    
##  d.f.   6976    Pr(> chi2) 0.0000    g        0.238    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -1.7306 -0.8172 -0.3433  1.0683  2.2034 
##  
##  
##                                                  Coef    S.E.   t     
##  Intercept                                        0.7199 0.3114   2.31
##  CA                                              -0.1995 0.0152 -13.08
##  gender_orientation_ord=Bisexual_male             0.0719 0.0824   0.87
##  gender_orientation_ord=Homosexual_male          -0.0913 0.0576  -1.58
##  gender_orientation_ord=Homosexual_female        -0.0601 0.1145  -0.52
##  gender_orientation_ord=Bisexual_female          -0.2066 0.0600  -3.44
##  gender_orientation_ord=Heterosexual_female      -0.2569 0.0299  -8.58
##  age                                             -0.0226 0.0119  -1.90
##  age'                                             0.0255 0.0774   0.33
##  age''                                            0.0938 0.3176   0.30
##  age'''                                          -0.3342 0.4116  -0.81
##  White                                           -0.0201 0.0507  -0.40
##  Black                                           -0.0305 0.0651  -0.47
##  Asian                                           -0.0505 0.0658  -0.77
##  Hispanic                                         0.0792 0.0615   1.29
##  Native_American                                 -0.0396 0.0812  -0.49
##  Indian                                           0.0200 0.0982   0.20
##  Middle_Eastern                                   0.1435 0.1027   1.40
##  Pacific_Islander                                 0.0708 0.1341   0.53
##  Other                                            0.2164 0.0644   3.36
##  Multi_SIRE                                       0.0299 0.0641   0.47
##  CA * gender_orientation_ord=Bisexual_male       -0.0307 0.0893  -0.34
##  CA * gender_orientation_ord=Homosexual_male      0.0344 0.0595   0.58
##  CA * gender_orientation_ord=Homosexual_female    0.2598 0.1339   1.94
##  CA * gender_orientation_ord=Bisexual_female      0.1486 0.0691   2.15
##  CA * gender_orientation_ord=Heterosexual_female  0.0656 0.0315   2.08
##                                                  Pr(>|t|)
##  Intercept                                       0.0208  
##  CA                                              <0.0001 
##  gender_orientation_ord=Bisexual_male            0.3828  
##  gender_orientation_ord=Homosexual_male          0.1134  
##  gender_orientation_ord=Homosexual_female        0.5999  
##  gender_orientation_ord=Bisexual_female          0.0006  
##  gender_orientation_ord=Heterosexual_female      <0.0001 
##  age                                             0.0581  
##  age'                                            0.7418  
##  age''                                           0.7677  
##  age'''                                          0.4168  
##  White                                           0.6922  
##  Black                                           0.6397  
##  Asian                                           0.4424  
##  Hispanic                                        0.1980  
##  Native_American                                 0.6252  
##  Indian                                          0.8389  
##  Middle_Eastern                                  0.1624  
##  Pacific_Islander                                0.5972  
##  Other                                           0.0008  
##  Multi_SIRE                                      0.6414  
##  CA * gender_orientation_ord=Bisexual_male       0.7312  
##  CA * gender_orientation_ord=Homosexual_male     0.5629  
##  CA * gender_orientation_ord=Homosexual_female   0.0524  
##  CA * gender_orientation_ord=Bisexual_female     0.0315  
##  CA * gender_orientation_ord=Heterosexual_female 0.0373  
##  
## 
## $numeric_GSO
## Frequencies of Missing Values Due to Each Variable
##        crime_score_imp                     CA gender_orientation_num 
##                  17381                      0                    192 
##                    age                  White                  Black 
##                      0                   1694                   1694 
##                  Asian               Hispanic        Native_American 
##                   1694                   1694                   1694 
##                 Indian         Middle_Eastern       Pacific_Islander 
##                   1694                   1694                   1694 
##                  Other             Multi_SIRE 
##                   1694                   1694 
## 
## Linear Regression Model
##  
##  rms::ols(formula = MOD_add_str_to_formula(crime_score_imp ~ CA * 
##      gender_orientation_num + rcs(age), SIRE_dummies), data = d %>% 
##      dplyr::filter(CA > -2))
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    7002    LR chi2    311.35    R2       0.043    
##  sigma0.9785    d.f.           17    R2 adj   0.041    
##  d.f.   6984    Pr(> chi2) 0.0000    g        0.236    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -1.7330 -0.8178 -0.3466  1.0706  2.1679 
##  
##  
##                              Coef    S.E.   t      Pr(>|t|)
##  Intercept                    0.8060 0.3078   2.62 0.0089  
##  CA                          -0.2147 0.0188 -11.39 <0.0001 
##  gender_orientation_num      -0.0509 0.0058  -8.84 <0.0001 
##  age                         -0.0236 0.0118  -2.00 0.0454  
##  age'                         0.0259 0.0772   0.34 0.7373  
##  age''                        0.0996 0.3173   0.31 0.7537  
##  age'''                      -0.3478 0.4114  -0.85 0.3980  
##  White                       -0.0208 0.0507  -0.41 0.6812  
##  Black                       -0.0317 0.0651  -0.49 0.6261  
##  Asian                       -0.0529 0.0657  -0.81 0.4204  
##  Hispanic                     0.0761 0.0615   1.24 0.2158  
##  Native_American             -0.0417 0.0810  -0.51 0.6069  
##  Indian                       0.0179 0.0982   0.18 0.8552  
##  Middle_Eastern               0.1435 0.1026   1.40 0.1618  
##  Pacific_Islander             0.0720 0.1339   0.54 0.5910  
##  Other                        0.2132 0.0643   3.32 0.0009  
##  Multi_SIRE                   0.0366 0.0640   0.57 0.5677  
##  CA * gender_orientation_num  0.0162 0.0060   2.68 0.0074  
##  
## 
## $demo_only
## Frequencies of Missing Values Due to Each Variable
##  crime_score_imp            White            Black            Asian 
##            17933             1741             1741             1741 
##         Hispanic  Native_American           Indian   Middle_Eastern 
##             1741             1741             1741             1741 
## Pacific_Islander            Other       Multi_SIRE 
##             1741             1741             1741 
## 
## Linear Regression Model
##  
##  rms::ols(formula = MOD_add_str_to_formula(crime_score_imp ~ 1, 
##      SIRE_dummies), data = d)
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    7492    LR chi2     43.62    R2       0.006    
##  sigma0.9958    d.f.           10    R2 adj   0.004    
##  d.f.   7481    Pr(> chi2) 0.0000    g        0.057    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -1.5121 -0.7740 -0.3258  1.1421  1.9082 
##  
##  
##                   Coef    S.E.   t     Pr(>|t|)
##  Intercept         0.0482 0.0485  0.99 0.3198  
##  White            -0.0769 0.0497 -1.55 0.1217  
##  Black            -0.0307 0.0634 -0.48 0.6281  
##  Asian            -0.1198 0.0644 -1.86 0.0630  
##  Hispanic          0.1155 0.0597  1.94 0.0529  
##  Native_American  -0.0169 0.0778 -0.22 0.8282  
##  Indian           -0.0553 0.0976 -0.57 0.5714  
##  Middle_Eastern    0.1128 0.1008  1.12 0.2631  
##  Pacific_Islander  0.1201 0.1276  0.94 0.3469  
##  Other             0.1685 0.0630  2.68 0.0075  
##  Multi_SIRE        0.0640 0.0628  1.02 0.3078  
##  
## 
## $demo_only2
## Frequencies of Missing Values Due to Each Variable
## crime_score_imp         SIRE_CC 
##           17933               0 
## 
## Linear Regression Model
##  
##  rms::ols(formula = crime_score_imp ~ SIRE_CC, data = d)
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    7882    LR chi2     39.62    R2       0.005    
##  sigma0.9983    d.f.           12    R2 adj   0.003    
##  d.f.   7869    Pr(> chi2) 0.0001    g        0.057    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -1.3684 -0.7734 -0.3252  1.1459  1.9088 
##  
##  
##                                  Coef    S.E.   t     Pr(>|t|)
##  Intercept                       -0.0292 0.0132 -2.21 0.0269  
##  SIRE_CC=NA                       0.0362 0.0522  0.69 0.4887  
##  SIRE_CC=Other_combos             0.1472 0.0519  2.84 0.0046  
##  SIRE_CC=Asian                   -0.0496 0.0627 -0.79 0.4294  
##  SIRE_CC=Hispanic / Latin         0.2180 0.0659  3.31 0.0009  
##  SIRE_CC=Black                    0.0665 0.0659  1.01 0.3132  
##  SIRE_CC=Other                    0.2544 0.0779  3.27 0.0011  
##  SIRE_CC=Hispanic / Latin, White  0.1566 0.0845  1.85 0.0640  
##  SIRE_CC=White, Other             0.2817 0.1110  2.54 0.0112  
##  SIRE_CC=Native American, White   0.1523 0.1078  1.41 0.1579  
##  SIRE_CC=Indian                  -0.0567 0.1168 -0.49 0.6271  
##  SIRE_CC=Asian, White             0.0054 0.1340  0.04 0.9679  
##  SIRE_CC=Black, White             0.1627 0.1717  0.95 0.3434  
##  
## 
## $gender_orientation
## Frequencies of Missing Values Due to Each Variable
##    crime_score_imp gender_orientation 
##              17933                196 
## 
## Linear Regression Model
##  
##  rms::ols(formula = crime_score_imp ~ gender_orientation, data = d)
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    7796    LR chi2     65.74    R2       0.008    
##  sigma0.9972    d.f.            5    R2 adj   0.008    
##  d.f.   7790    Pr(> chi2) 0.0000    g        0.085    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -1.2089 -0.8246 -0.3631  1.1557  2.0426 
##  
##  
##                                         Coef    S.E.   t     Pr(>|t|)
##  Intercept                               0.0562 0.0139  4.04 <0.0001 
##  gender_orientation=Bisexual_male        0.0368 0.0769  0.48 0.6319  
##  gender_orientation=Homosexual_male     -0.0343 0.0548 -0.63 0.5320  
##  gender_orientation=Homosexual_female   -0.0649 0.1055 -0.62 0.5383  
##  gender_orientation=Bisexual_female     -0.1207 0.0548 -2.20 0.0277  
##  gender_orientation=Heterosexual_female -0.2215 0.0280 -7.92 <0.0001 
##

#CA betas
map_dbl(ols_fits[1:6], function(x) x %>% .[["coefficients"]] %>% .[2])

##                      primary                      SIRE_CC 
##                       -0.153                       -0.151 
##                       whites        ca_gender_interaction 
##                       -0.163                       -0.171 
##                    no_low_CA no_low_CA_gender_interaction 
##                       -0.178                       -0.200

#model n's
map_dbl(ols_fits, function(x) x$stats[1])

##                      primary                      SIRE_CC 
##                         7410                         7796 
##                       whites        ca_gender_interaction 
##                         5647                         7410 
##                    no_low_CA no_low_CA_gender_interaction 
##                         7002                         7002 
##                 nonlinear_ca                  ordinal_GSO 
##                         7002                         7002 
##                  numeric_GSO                    demo_only 
##                         7002                         7492 
##                   demo_only2           gender_orientation 
##                         7882                         7796

#model r's
map_dbl(ols_fits, function(x) x$stats[4] %>% sqrt())

##                      primary                      SIRE_CC 
##                       0.2066                       0.2028 
##                       whites        ca_gender_interaction 
##                       0.1974                       0.2093 
##                    no_low_CA no_low_CA_gender_interaction 
##                       0.2069                       0.2108 
##                 nonlinear_ca                  ordinal_GSO 
##                       0.2180                       0.2108 
##                  numeric_GSO                    demo_only 
##                       0.2085                       0.0762 
##                   demo_only2           gender_orientation 
##                       0.0708                       0.0916

#correlations by gender
d %>% dplyr::filter(gender == "Man") %$% cor.test(CA, crime_sum_imp)

## 
##  Pearson's product-moment correlation
## 
## data:  CA and crime_sum_imp
## t = -20, df = 6000, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.228 -0.178
## sample estimates:
##    cor 
## -0.203

d %>% dplyr::filter(gender == "Woman") %$% cor.test(CA, crime_sum_imp)

## 
##  Pearson's product-moment correlation
## 
## data:  CA and crime_sum_imp
## t = -6, df = 2000, p-value = 2e-09
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1695 -0.0871
## sample estimates:
##    cor 
## -0.129

#plot nonlinear CA
nonlinear_ca_df = data.frame(
            CA = seq(-2, 2, by = .01),
            gender_orientation = "Heterosexual_male",
            age = wtd_mean(d$age),
            White = T,
            Black = F,
            Asian = F,
            Hispanic = F,
            Native_American = F,
            Indian = F,
            Middle_Eastern = F,
            Pacific_Islander = F,
            Other = F,
            Multi_SIRE = F
          )

#model prediction CAS
nonlinear_ca_df$CAS_nonlinear = predict(ols_fits$nonlinear_ca, newdata = nonlinear_ca_df)
nonlinear_ca_df$CAS_linear = predict(ols_fits$no_low_CA_gender_interaction, newdata = nonlinear_ca_df)

#plot
ggplot(nonlinear_ca_df) +
  geom_line(aes(CA, CAS_nonlinear), color = "blue") + 
  geom_line(aes(CA, CAS_linear), color = "red") +
  theme_bw() +
  xlab("Cognitive ability") +
  ylab("Criminal and antisocial behavior score (model predicted)")

GG_save("figures/model_predictions.png")

Analysis 4 - Do people systematically skip crime items?

It seems likely that people on dating sites would try to present themselves as better than they are to some degree. It can be noted that persons who did not answer crime items at all were often the lowest in estimated ability, even lower than those who answered positively (i.e. towards CAS). Here we control for overall number of items answered and look whether CA and CAS score based on other items predicts skipping CAS items.

#loop over each item, fit a model to examine if skippers are lower in CA and higher in CAS in the non-skipped items.
skippers_crime_score = map_df(CAS_vars, function(var_) {
  #make missingness outcome
  d_local = d
  d_local$missing = is.na(d[[var_]])

  #fit model
  mod_ = rms::lrm(missing ~ questions_answered_z + CA + crime_score + gender_orientation + rcs(age), data = d_local)

  data_frame(
    CAS = var_,
    questions_answered = coef(mod_)[2],
    CA = coef(mod_)[3],
    crime_score = coef(mod_)[4],
    r2adj = mod_$stats[10]
  )
})

skippers_crime_score %>% write_clipboard()

##                       CAS Questions answered    CA Crime score R2adj
## 1                arrested              -0.90  0.14        0.26  0.09
## 2                  prison              -1.13 -0.42       -0.54  0.22
## 3         punched_in_face              -1.10  0.19        0.10  0.09
## 4            cheated_exam              -2.17  0.02        0.07  0.40
## 5         would_tax_cheat              -1.19  0.35        0.31  0.16
## 6    stole_glass_from_bar              -1.10  0.19       -0.07  0.10
## 7            used_fake_id              -1.10  0.20        0.18  0.11
## 8  torture_animal_for_fun              -1.54 -0.37       -0.06  0.25
## 9        steal_newspapers              -2.48 -0.06       -0.07  0.44
## 10                 litter              -0.29 -0.03        0.08  0.03
## 11      cigaret_littering              -0.99  0.08        0.34  0.09
## 12        hit_SO_in_anger              -0.83 -0.30        0.01  0.12

Save data for reuse

#output various formats
write_rds(d, "data/data_for_reuse.rds", compress = "gz")
write_csv(d, "data/data_for_reuse.csv", na = "")

Self-reported criminal and anti-social behavior on a dating site

The importance of cognitive ability