Miron Zuckerman is updating his meta-analysis of the religion and intelligence relationship, and asked me to compute some extra analyses of the OKCupid dataset.
options(digits = 2, width = 600)
library(pacman)
p_load(kirkegaard, readr, dplyr, rms)
#load
vars = readr::read_csv2("data/question_data.csv")
## Using ',' as decimal and '.' as grouping mark. Use read_delim() for more control.
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_character(),
## text = col_character(),
## option_1 = col_character(),
## option_2 = col_character(),
## option_3 = col_character(),
## option_4 = col_character(),
## N = col_integer(),
## Type = col_character(),
## Order = col_character(),
## Keywords = col_character()
## )
d = readr::read_rds("data/parsed_data.rds")
#test items
iq_items_meta = read_csv("data/test_items.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_character(),
## ID = col_integer(),
## text = col_character(),
## option_1 = col_character(),
## option_2 = col_character(),
## option_3 = col_character(),
## option_4 = col_character(),
## option_correct = col_integer()
## )
#limited to persons with at last 5 IQ items
#included IQ items, some were removed due to ~0 datapoints
CA_items = intersect(iq_items_meta$X1, names(d))
d$CA_items = d[CA_items] %>% miss_by_case(reverse = T)
d = filter(d, CA_items >= 5)
#Sex
d$gender %>% table2()
#get scoring key
v_correct_answers_text = apply(iq_items_meta %>% filter(X1 %in% CA_items), MARGIN = 1, function(item) {
item["option_" + 1:4][item["option_correct"] %>% as.numeric()]
})
#score items
CA_scored_items = score_items(d[CA_items], key = v_correct_answers_text)
#factor analysis / IRT
irt_CA = irt.fa(CA_scored_items)
irt_CA
## Item Response Analysis using Factor Analysis
##
## Call: irt.fa(x = CA_scored_items)
## Item Response Analysis using Factor Analysis
##
## Summary information by factor and item
## Factor = 1
## -3 -2 -1 0 1 2 3
## q178 0.40 0.39 0.18 0.06 0.02 0.00 0.00
## q255 0.11 0.35 0.56 0.32 0.10 0.02 0.01
## q1201 0.15 0.58 0.74 0.25 0.05 0.01 0.00
## q14835 0.07 0.11 0.12 0.11 0.08 0.05 0.03
## q8672 0.03 0.12 0.45 0.67 0.30 0.07 0.01
## q18154 0.19 0.63 0.62 0.19 0.04 0.01 0.00
## q12625 0.14 0.25 0.28 0.18 0.08 0.03 0.01
## q477 0.15 0.34 0.40 0.23 0.08 0.02 0.01
## q256 0.07 0.11 0.13 0.12 0.09 0.05 0.03
## q43639 0.23 0.77 0.63 0.15 0.03 0.00 0.00
## q267 0.05 0.09 0.15 0.18 0.15 0.09 0.05
## q18698 0.04 0.19 0.66 0.66 0.19 0.04 0.01
## q511 0.01 0.03 0.12 0.34 0.51 0.30 0.09
## q57844 0.16 0.46 0.54 0.23 0.06 0.01 0.00
## Test Info 1.80 4.42 5.58 3.69 1.77 0.72 0.25
## SEM 0.75 0.48 0.42 0.52 0.75 1.18 1.99
## Reliability 0.44 0.77 0.82 0.73 0.44 -0.39 -2.96
##
## Factor analysis with Call: fa(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate,
## fm = fm)
##
## Test of the hypothesis that 1 factor is sufficient.
## The degrees of freedom for the model is 77 and the objective function was 1.1
## The number of observations was 37714 with Chi Square = 42643 with prob < 0
##
## The root mean square of the residuals (RMSA) is 0.06
## The df corrected root mean square of the residuals is 0.07
##
## Tucker Lewis Index of factoring reliability = 0.77
## RMSEA index = 0.12 and the 10 % confidence intervals are 0.12 0.12
## BIC = 41831
irt_CA$fa
## Factor Analysis using method = minres
## Call: fa(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate,
## fm = fm)
## Standardized loadings (pattern matrix) based upon correlation matrix
## MR1 h2 u2 com
## q178 0.62 0.38 0.62 1
## q255 0.66 0.43 0.57 1
## q1201 0.73 0.53 0.47 1
## q14835 0.38 0.15 0.85 1
## q8672 0.70 0.49 0.51 1
## q18154 0.71 0.51 0.49 1
## q12625 0.53 0.29 0.71 1
## q477 0.60 0.37 0.63 1
## q256 0.39 0.15 0.85 1
## q43639 0.74 0.55 0.45 1
## q267 0.44 0.20 0.80 1
## q18698 0.72 0.52 0.48 1
## q511 0.64 0.42 0.58 1
## q57844 0.67 0.44 0.56 1
##
## MR1
## SS loadings 5.42
## Proportion Var 0.39
##
## Mean item complexity = 1
## Test of the hypothesis that 1 factor is sufficient.
##
## The degrees of freedom for the null model are 91 and the objective function was 5.9 with Chi Square of 223082
## The degrees of freedom for the model are 77 and the objective function was 1.1
##
## The root mean square of the residuals (RMSR) is 0.06
## The df corrected root mean square of the residuals is 0.07
##
## The harmonic number of observations is 37714 with the empirical chi square 25978 with prob < 0
## The total number of observations was 37714 with Likelihood Chi Square = 42643 with prob < 0
##
## Tucker Lewis Index of factoring reliability = 0.77
## RMSEA index = 0.12 and the 90 % confidence intervals are 0.12 0.12
## BIC = 41831
## Fit based upon off diagonal values = 0.98
## Measures of factor score adequacy
## MR1
## Correlation of (regression) scores with factors 0.95
## Multiple R square of scores with factors 0.91
## Minimum correlation of possible factor scores 0.82
#score
irt_CA_scores = scoreIrt(irt_CA, CA_scored_items)
#save
d$CA_old = d$CA %>% standardize()
d$CA = irt_CA_scores$theta1 %>% standardize()
#plot scores
GG_scatter(d, "CA", "CA_old")
#recode order
d$religion_seriousness = ordered(d$d_religion_seriosity,
levels = c("and laughing about it", "but not too serious about it", "and somewhat serious about it", "and very serious about it"))
d$religion = d$d_religion_type %>% plyr::mapvalues("-", NA) %>% factor()
d %>%
# filter(CA_items >= 5) %>%
GG_group_means("CA", groupvar = "religion", subgroupvar = "religion_seriousness") +
theme(axis.text.x = element_text(angle = -30, hjust = 0))
## Missing values were removed.
## Warning in qt(1 - ((1 - CI)/2), df = as.numeric(x[4]) - 1): NaNs produced
## Warning: Removed 1 rows containing missing values (geom_errorbar).
Latent factor approach.
#subset items
d_religion = d[c("q41", "q42", "q61786", "q210")]
#convert to integer for psych
d_religion_int = map_df(d_religion, as.integer)
#factor analyze / IRT
irt_religion = irt.fa(d_religion_int)
## Warning in matpLower(x, nvar, gminx, gmaxx, gminy, gmaxy): 6 cells were adjusted for 0 values using the correction for continuity. Examine your data carefully.
irt_religion
## Item Response Analysis using Factor Analysis
##
## Call: irt.fa(x = d_religion_int)
## Item Response Analysis using Factor Analysis
##
## Summary information by factor and item
## Factor = 1
## -3 -2 -1 0 1 2 3
## q41 0.75 1.28 1.42 1.77 0.56 0.05 0.00
## q42 0.00 0.00 0.00 0.00 0.00 0.00 0.00
## q61786 0.08 0.17 0.26 0.25 0.15 0.06 0.02
## q210 0.01 0.10 0.61 1.02 0.29 0.04 0.01
## Test Info 0.84 1.55 2.29 3.04 0.99 0.15 0.03
## SEM 1.09 0.80 0.66 0.57 1.00 2.54 5.47
## Reliability -0.19 0.35 0.56 0.67 -0.01 -5.46 -28.91
##
## Factor analysis with Call: fa(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate,
## fm = fm)
##
## Test of the hypothesis that 1 factor is sufficient.
## The degrees of freedom for the model is 2 and the objective function was 0.3
## The number of observations was 37714 with Chi Square = 11410 with prob < 0
##
## The root mean square of the residuals (RMSA) is 0.04
## The df corrected root mean square of the residuals is 0.07
##
## Tucker Lewis Index of factoring reliability = 0.78
## RMSEA index = 0.39 and the 10 % confidence intervals are 0.38 0.4
## BIC = 11389
irt_religion$fa
## Factor Analysis using method = minres
## Call: fa(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate,
## fm = fm)
## Standardized loadings (pattern matrix) based upon correlation matrix
## MR1 h2 u2 com
## q41 0.93 0.87 0.129 1
## q42 0.97 0.94 0.057 1
## q61786 0.72 0.52 0.477 1
## q210 0.90 0.81 0.188 1
##
## MR1
## SS loadings 3.15
## Proportion Var 0.79
##
## Mean item complexity = 1
## Test of the hypothesis that 1 factor is sufficient.
##
## The degrees of freedom for the null model are 6 and the objective function was 4.1 with Chi Square of 154449
## The degrees of freedom for the model are 2 and the objective function was 0.3
##
## The root mean square of the residuals (RMSR) is 0.04
## The df corrected root mean square of the residuals is 0.07
##
## The harmonic number of observations is 37714 with the empirical chi square 741 with prob < 1e-161
## The total number of observations was 37714 with Likelihood Chi Square = 11410 with prob < 0
##
## Tucker Lewis Index of factoring reliability = 0.78
## RMSEA index = 0.39 and the 90 % confidence intervals are 0.38 0.4
## BIC = 11389
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy
## MR1
## Correlation of (regression) scores with factors 0.99
## Multiple R square of scores with factors 0.97
## Minimum correlation of possible factor scores 0.95
#score subjects
irt_religion_scores = scoreIrt(irt_religion, items = d_religion_int)
#standardize
d$latent_religion = irt_religion_scores$theta1 %>% standardize() %>% multiply_by(-1)
#plot
GG_scatter(d, "CA", "latent_religion") +
scale_y_continuous(limits = c(NA, 2))
#correct for measurement error
#religion
(religion_reliability = irt_religion$plot$sumInfo[[1]] %>% colSums() %>% max() %>% (function(x) 1-(1/sqrt(x)^2)))
## [1] 0.67
#CA
(CA_reliability = irt_CA$plot$sumInfo[[1]] %>% colSums() %>% max() %>% (function(x) 1-(1/sqrt(x)^2)))
## [1] 0.82
#correct
cor_matrix(d[c("CA", "latent_religion")], reliabilities = c(CA_reliability, religion_reliability))
## CA latent_religion
## CA 0.82 -0.34
## latent_religion -0.34 0.67
#recode
d$education_phase = d$d_education_phase %>% ordered(levels = c("Dropped out of", "Working on", "Graduated from"))
d$education_type = d$d_education_type %>% factor()
#plot
GG_group_means(d, "CA", "education_type", "education_phase")
## Missing values were removed.
#by sex x orientation
GG_group_means(d, "latent_religion", "gender_orientation")
## Missing values were removed.
#regression models
ols(latent_religion ~ CA + rcs(d_age) + gender_orientation, data = d)
## Frequencies of Missing Values Due to Each Variable
## latent_religion CA d_age gender_orientation
## 636 0 739 1114
##
## Linear Regression Model
##
## ols(formula = latent_religion ~ CA + rcs(d_age) + gender_orientation,
## data = d)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 35987 LR chi2 3271.80 R2 0.087
## sigma0.9558 d.f. 10 R2 adj 0.087
## d.f. 35976 Pr(> chi2) 0.0000 g 0.332
##
## Residuals
##
## Min 1Q Median 3Q Max
## -2.2947 -0.9316 0.1702 0.7772 2.3094
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -0.7120 0.1094 -6.51 <0.0001
## CA -0.2603 0.0051 -50.68 <0.0001
## d_age 0.0268 0.0045 6.00 <0.0001
## d_age' -0.0109 0.0324 -0.34 0.7364
## d_age'' -0.0590 0.1304 -0.45 0.6511
## d_age''' 0.1565 0.1543 1.01 0.3103
## gender_orientation=Bisexual_female -0.2533 0.0239 -10.58 <0.0001
## gender_orientation=Gay_female -0.0886 0.0452 -1.96 0.0497
## gender_orientation=Gay_male -0.0177 0.0256 -0.69 0.4883
## gender_orientation=Bisexual_male -0.1866 0.0385 -4.84 <0.0001
## gender_orientation=Hetero_male -0.1392 0.0122 -11.44 <0.0001
##
#add race too
ols(latent_religion ~ CA + rcs(d_age) + gender_orientation + race, data = d)
## Frequencies of Missing Values Due to Each Variable
## latent_religion CA d_age gender_orientation race
## 636 0 739 1114 3783
##
## Linear Regression Model
##
## ols(formula = latent_religion ~ CA + rcs(d_age) + gender_orientation +
## race, data = d)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 33089 LR chi2 4127.38 R2 0.117
## sigma0.9383 d.f. 19 R2 adj 0.117
## d.f. 33069 Pr(> chi2) 0.0000 g 0.382
##
## Residuals
##
## Min 1Q Median 3Q Max
## -2.3160 -0.8899 0.1581 0.7455 2.4053
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -0.9269 0.1143 -8.11 <0.0001
## CA -0.2316 0.0054 -42.95 <0.0001
## d_age 0.0313 0.0047 6.72 <0.0001
## d_age' -0.0090 0.0336 -0.27 0.7875
## d_age'' -0.0949 0.1346 -0.71 0.4805
## d_age''' 0.2258 0.1588 1.42 0.1550
## gender_orientation=Bisexual_female -0.2465 0.0249 -9.90 <0.0001
## gender_orientation=Gay_female -0.1266 0.0473 -2.68 0.0074
## gender_orientation=Gay_male -0.0647 0.0259 -2.50 0.0124
## gender_orientation=Bisexual_male -0.1953 0.0401 -4.87 <0.0001
## gender_orientation=Hetero_male -0.1618 0.0126 -12.86 <0.0001
## race=Mixed 0.3052 0.0178 17.11 <0.0001
## race=Asian 0.2739 0.0248 11.03 <0.0001
## race=Hispanic / Latin 0.4362 0.0261 16.70 <0.0001
## race=Black 0.6760 0.0262 25.85 <0.0001
## race=Other 0.1036 0.0316 3.28 0.0011
## race=Indian 0.3444 0.0510 6.75 <0.0001
## race=Middle Eastern 0.2975 0.0774 3.84 0.0001
## race=Native American 0.3561 0.1124 3.17 0.0015
## race=Pacific Islander 0.4298 0.1157 3.71 0.0002
##