What

Miron Zuckerman is updating his meta-analysis of the religion and intelligence relationship, and asked me to compute some extra analyses of the OKCupid dataset.

Start

options(digits = 2, width = 600)
library(pacman)
p_load(kirkegaard, readr, dplyr, rms)

Data

#load
vars = readr::read_csv2("data/question_data.csv")
## Using ',' as decimal and '.' as grouping mark. Use read_delim() for more control.
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_character(),
##   text = col_character(),
##   option_1 = col_character(),
##   option_2 = col_character(),
##   option_3 = col_character(),
##   option_4 = col_character(),
##   N = col_integer(),
##   Type = col_character(),
##   Order = col_character(),
##   Keywords = col_character()
## )
d = readr::read_rds("data/parsed_data.rds")

#test items
iq_items_meta = read_csv("data/test_items.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_character(),
##   ID = col_integer(),
##   text = col_character(),
##   option_1 = col_character(),
##   option_2 = col_character(),
##   option_3 = col_character(),
##   option_4 = col_character(),
##   option_correct = col_integer()
## )
#limited to persons with at last 5 IQ items
#included IQ items, some were removed due to ~0 datapoints
CA_items = intersect(iq_items_meta$X1, names(d))
d$CA_items = d[CA_items] %>% miss_by_case(reverse = T)
d = filter(d, CA_items >= 5)

Descriptive

#Sex
d$gender %>% table2()

Extract intelligence

#get scoring key
v_correct_answers_text = apply(iq_items_meta %>% filter(X1 %in% CA_items), MARGIN = 1, function(item) {
  item["option_" + 1:4][item["option_correct"] %>% as.numeric()]
})

#score items
CA_scored_items = score_items(d[CA_items], key = v_correct_answers_text)

#factor analysis / IRT
irt_CA = irt.fa(CA_scored_items)

irt_CA
## Item Response Analysis using Factor Analysis 
## 
## Call: irt.fa(x = CA_scored_items)
## Item Response Analysis using Factor Analysis  
## 
##  Summary information by factor and item
##  Factor =  1 
##               -3   -2   -1    0    1     2     3
## q178        0.40 0.39 0.18 0.06 0.02  0.00  0.00
## q255        0.11 0.35 0.56 0.32 0.10  0.02  0.01
## q1201       0.15 0.58 0.74 0.25 0.05  0.01  0.00
## q14835      0.07 0.11 0.12 0.11 0.08  0.05  0.03
## q8672       0.03 0.12 0.45 0.67 0.30  0.07  0.01
## q18154      0.19 0.63 0.62 0.19 0.04  0.01  0.00
## q12625      0.14 0.25 0.28 0.18 0.08  0.03  0.01
## q477        0.15 0.34 0.40 0.23 0.08  0.02  0.01
## q256        0.07 0.11 0.13 0.12 0.09  0.05  0.03
## q43639      0.23 0.77 0.63 0.15 0.03  0.00  0.00
## q267        0.05 0.09 0.15 0.18 0.15  0.09  0.05
## q18698      0.04 0.19 0.66 0.66 0.19  0.04  0.01
## q511        0.01 0.03 0.12 0.34 0.51  0.30  0.09
## q57844      0.16 0.46 0.54 0.23 0.06  0.01  0.00
## Test Info   1.80 4.42 5.58 3.69 1.77  0.72  0.25
## SEM         0.75 0.48 0.42 0.52 0.75  1.18  1.99
## Reliability 0.44 0.77 0.82 0.73 0.44 -0.39 -2.96
## 
## Factor analysis with Call: fa(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, 
##     fm = fm)
## 
## Test of the hypothesis that 1 factor is sufficient.
## The degrees of freedom for the model is 77  and the objective function was  1.1 
## The number of observations was  37714  with Chi Square =  42643  with prob <  0 
## 
## The root mean square of the residuals (RMSA) is  0.06 
## The df corrected root mean square of the residuals is  0.07 
## 
## Tucker Lewis Index of factoring reliability =  0.77
## RMSEA index =  0.12  and the 10 % confidence intervals are  0.12 0.12
## BIC =  41831
irt_CA$fa
## Factor Analysis using method =  minres
## Call: fa(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, 
##     fm = fm)
## Standardized loadings (pattern matrix) based upon correlation matrix
##         MR1   h2   u2 com
## q178   0.62 0.38 0.62   1
## q255   0.66 0.43 0.57   1
## q1201  0.73 0.53 0.47   1
## q14835 0.38 0.15 0.85   1
## q8672  0.70 0.49 0.51   1
## q18154 0.71 0.51 0.49   1
## q12625 0.53 0.29 0.71   1
## q477   0.60 0.37 0.63   1
## q256   0.39 0.15 0.85   1
## q43639 0.74 0.55 0.45   1
## q267   0.44 0.20 0.80   1
## q18698 0.72 0.52 0.48   1
## q511   0.64 0.42 0.58   1
## q57844 0.67 0.44 0.56   1
## 
##                 MR1
## SS loadings    5.42
## Proportion Var 0.39
## 
## Mean item complexity =  1
## Test of the hypothesis that 1 factor is sufficient.
## 
## The degrees of freedom for the null model are  91  and the objective function was  5.9 with Chi Square of  223082
## The degrees of freedom for the model are 77  and the objective function was  1.1 
## 
## The root mean square of the residuals (RMSR) is  0.06 
## The df corrected root mean square of the residuals is  0.07 
## 
## The harmonic number of observations is  37714 with the empirical chi square  25978  with prob <  0 
## The total number of observations was  37714  with Likelihood Chi Square =  42643  with prob <  0 
## 
## Tucker Lewis Index of factoring reliability =  0.77
## RMSEA index =  0.12  and the 90 % confidence intervals are  0.12 0.12
## BIC =  41831
## Fit based upon off diagonal values = 0.98
## Measures of factor score adequacy             
##                                                    MR1
## Correlation of (regression) scores with factors   0.95
## Multiple R square of scores with factors          0.91
## Minimum correlation of possible factor scores     0.82
#score
irt_CA_scores = scoreIrt(irt_CA, CA_scored_items)

#save
d$CA_old = d$CA %>% standardize()
d$CA = irt_CA_scores$theta1 %>% standardize()

#plot scores
GG_scatter(d, "CA", "CA_old")

Religious stance and seriousness of the belief

#recode order
d$religion_seriousness = ordered(d$d_religion_seriosity,
                                 levels = c("and laughing about it", "but not too serious about it", "and somewhat serious about it", "and very serious about it"))
d$religion = d$d_religion_type %>% plyr::mapvalues("-", NA) %>% factor()

d %>% 
  # filter(CA_items >= 5) %>% 
  GG_group_means("CA", groupvar = "religion", subgroupvar = "religion_seriousness") +
  theme(axis.text.x = element_text(angle = -30, hjust = 0))
## Missing values were removed.
## Warning in qt(1 - ((1 - CI)/2), df = as.numeric(x[4]) - 1): NaNs produced
## Warning: Removed 1 rows containing missing values (geom_errorbar).

General factor

Latent factor approach.

#subset items
d_religion = d[c("q41", "q42", "q61786", "q210")]

#convert to integer for psych
d_religion_int = map_df(d_religion, as.integer)

#factor analyze / IRT
irt_religion = irt.fa(d_religion_int)
## Warning in matpLower(x, nvar, gminx, gmaxx, gminy, gmaxy): 6 cells were adjusted for 0 values using the correction for continuity. Examine your data carefully.

irt_religion
## Item Response Analysis using Factor Analysis  
## 
## Call: irt.fa(x = d_religion_int)
## Item Response Analysis using Factor Analysis  
## 
##  Summary information by factor and item
##  Factor =  1 
##                -3   -2   -1    0     1     2      3
## q41          0.75 1.28 1.42 1.77  0.56  0.05   0.00
## q42          0.00 0.00 0.00 0.00  0.00  0.00   0.00
## q61786       0.08 0.17 0.26 0.25  0.15  0.06   0.02
## q210         0.01 0.10 0.61 1.02  0.29  0.04   0.01
## Test Info    0.84 1.55 2.29 3.04  0.99  0.15   0.03
## SEM          1.09 0.80 0.66 0.57  1.00  2.54   5.47
## Reliability -0.19 0.35 0.56 0.67 -0.01 -5.46 -28.91
## 
## Factor analysis with Call: fa(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, 
##     fm = fm)
## 
## Test of the hypothesis that 1 factor is sufficient.
## The degrees of freedom for the model is 2  and the objective function was  0.3 
## The number of observations was  37714  with Chi Square =  11410  with prob <  0 
## 
## The root mean square of the residuals (RMSA) is  0.04 
## The df corrected root mean square of the residuals is  0.07 
## 
## Tucker Lewis Index of factoring reliability =  0.78
## RMSEA index =  0.39  and the 10 % confidence intervals are  0.38 0.4
## BIC =  11389
irt_religion$fa
## Factor Analysis using method =  minres
## Call: fa(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, 
##     fm = fm)
## Standardized loadings (pattern matrix) based upon correlation matrix
##         MR1   h2    u2 com
## q41    0.93 0.87 0.129   1
## q42    0.97 0.94 0.057   1
## q61786 0.72 0.52 0.477   1
## q210   0.90 0.81 0.188   1
## 
##                 MR1
## SS loadings    3.15
## Proportion Var 0.79
## 
## Mean item complexity =  1
## Test of the hypothesis that 1 factor is sufficient.
## 
## The degrees of freedom for the null model are  6  and the objective function was  4.1 with Chi Square of  154449
## The degrees of freedom for the model are 2  and the objective function was  0.3 
## 
## The root mean square of the residuals (RMSR) is  0.04 
## The df corrected root mean square of the residuals is  0.07 
## 
## The harmonic number of observations is  37714 with the empirical chi square  741  with prob <  1e-161 
## The total number of observations was  37714  with Likelihood Chi Square =  11410  with prob <  0 
## 
## Tucker Lewis Index of factoring reliability =  0.78
## RMSEA index =  0.39  and the 90 % confidence intervals are  0.38 0.4
## BIC =  11389
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy             
##                                                    MR1
## Correlation of (regression) scores with factors   0.99
## Multiple R square of scores with factors          0.97
## Minimum correlation of possible factor scores     0.95
#score subjects
irt_religion_scores = scoreIrt(irt_religion, items = d_religion_int)

#standardize
d$latent_religion = irt_religion_scores$theta1 %>% standardize() %>% multiply_by(-1)

#plot
GG_scatter(d, "CA", "latent_religion") +
  scale_y_continuous(limits = c(NA, 2))

#correct for measurement error
#religion
(religion_reliability = irt_religion$plot$sumInfo[[1]] %>% colSums() %>% max() %>% (function(x) 1-(1/sqrt(x)^2)))
## [1] 0.67
#CA
(CA_reliability = irt_CA$plot$sumInfo[[1]] %>% colSums() %>% max() %>% (function(x) 1-(1/sqrt(x)^2)))
## [1] 0.82
#correct
cor_matrix(d[c("CA", "latent_religion")], reliabilities = c(CA_reliability, religion_reliability))
##                    CA latent_religion
## CA               0.82           -0.34
## latent_religion -0.34            0.67

Education

#recode
d$education_phase = d$d_education_phase %>% ordered(levels = c("Dropped out of", "Working on", "Graduated from"))
d$education_type = d$d_education_type %>% factor()

#plot
GG_group_means(d, "CA", "education_type", "education_phase")
## Missing values were removed.

Sex and orientation

#by sex x orientation
GG_group_means(d, "latent_religion", "gender_orientation")
## Missing values were removed.

#regression models
ols(latent_religion ~ CA + rcs(d_age) + gender_orientation, data = d)
## Frequencies of Missing Values Due to Each Variable
##    latent_religion                 CA              d_age gender_orientation 
##                636                  0                739               1114 
## 
## Linear Regression Model
##  
##  ols(formula = latent_religion ~ CA + rcs(d_age) + gender_orientation, 
##      data = d)
##  
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs   35987    LR chi2    3271.80    R2       0.087    
##  sigma0.9558    d.f.            10    R2 adj   0.087    
##  d.f.  35976    Pr(> chi2)  0.0000    g        0.332    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -2.2947 -0.9316  0.1702  0.7772  2.3094 
##  
##  
##                                     Coef    S.E.   t      Pr(>|t|)
##  Intercept                          -0.7120 0.1094  -6.51 <0.0001 
##  CA                                 -0.2603 0.0051 -50.68 <0.0001 
##  d_age                               0.0268 0.0045   6.00 <0.0001 
##  d_age'                             -0.0109 0.0324  -0.34 0.7364  
##  d_age''                            -0.0590 0.1304  -0.45 0.6511  
##  d_age'''                            0.1565 0.1543   1.01 0.3103  
##  gender_orientation=Bisexual_female -0.2533 0.0239 -10.58 <0.0001 
##  gender_orientation=Gay_female      -0.0886 0.0452  -1.96 0.0497  
##  gender_orientation=Gay_male        -0.0177 0.0256  -0.69 0.4883  
##  gender_orientation=Bisexual_male   -0.1866 0.0385  -4.84 <0.0001 
##  gender_orientation=Hetero_male     -0.1392 0.0122 -11.44 <0.0001 
## 
#add race too
ols(latent_religion ~ CA + rcs(d_age) + gender_orientation + race, data = d)
## Frequencies of Missing Values Due to Each Variable
##    latent_religion                 CA              d_age gender_orientation               race 
##                636                  0                739               1114               3783 
## 
## Linear Regression Model
##  
##  ols(formula = latent_religion ~ CA + rcs(d_age) + gender_orientation + 
##      race, data = d)
##  
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs   33089    LR chi2    4127.38    R2       0.117    
##  sigma0.9383    d.f.            19    R2 adj   0.117    
##  d.f.  33069    Pr(> chi2)  0.0000    g        0.382    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -2.3160 -0.8899  0.1581  0.7455  2.4053 
##  
##  
##                                     Coef    S.E.   t      Pr(>|t|)
##  Intercept                          -0.9269 0.1143  -8.11 <0.0001 
##  CA                                 -0.2316 0.0054 -42.95 <0.0001 
##  d_age                               0.0313 0.0047   6.72 <0.0001 
##  d_age'                             -0.0090 0.0336  -0.27 0.7875  
##  d_age''                            -0.0949 0.1346  -0.71 0.4805  
##  d_age'''                            0.2258 0.1588   1.42 0.1550  
##  gender_orientation=Bisexual_female -0.2465 0.0249  -9.90 <0.0001 
##  gender_orientation=Gay_female      -0.1266 0.0473  -2.68 0.0074  
##  gender_orientation=Gay_male        -0.0647 0.0259  -2.50 0.0124  
##  gender_orientation=Bisexual_male   -0.1953 0.0401  -4.87 <0.0001 
##  gender_orientation=Hetero_male     -0.1618 0.0126 -12.86 <0.0001 
##  race=Mixed                          0.3052 0.0178  17.11 <0.0001 
##  race=Asian                          0.2739 0.0248  11.03 <0.0001 
##  race=Hispanic / Latin               0.4362 0.0261  16.70 <0.0001 
##  race=Black                          0.6760 0.0262  25.85 <0.0001 
##  race=Other                          0.1036 0.0316   3.28 0.0011  
##  race=Indian                         0.3444 0.0510   6.75 <0.0001 
##  race=Middle Eastern                 0.2975 0.0774   3.84 0.0001  
##  race=Native American                0.3561 0.1124   3.17 0.0015  
##  race=Pacific Islander               0.4298 0.1157   3.71 0.0002  
##