library(tidyverse)
library(janitor)
library(psych)
library(dplyr)
library(lavaan)
library(ggplot2)
library(semPlot)

Descriptives

# ---Data prep -----------------------------------------------------------
db <- read_csv("vpq_all_data.csv")
db <- clean_names(db)
# Sample description ------------------------------------------------------
### 1. Frequencies for categorical-ish variables -----------------------------

##### term
term_tab <- db %>%
  tabyl(term) %>%
  adorn_totals("row") %>%
  adorn_pct_formatting(digits = 1)

##### sex
sex_tab <- db %>%
  tabyl(sex) %>%
  adorn_totals("row") %>%
  adorn_pct_formatting(digits = 1)

##### genderid
genderid_tab <- db %>%
  tabyl(genderid) %>%
  adorn_totals("row") %>%
  adorn_pct_formatting(digits = 1)

##### genderid_other (if it is filled in for some people)
genderid_other_tab <- db %>%
  tabyl(genderid_other) %>%
  adorn_totals("row")

term_tab
##        term     n percent
##    FALL2018  2052   12.0%
##    FALL2019  2071   12.1%
##    FALL2020  2351   13.7%
##    FALL2022  5141   30.0%
##  WINTER2019  1817   10.6%
##  WINTER2020  1724   10.1%
##  WINTER2021  1975   11.5%
##       Total 17131  100.0%
sex_tab
##    sex     n percent valid_percent
##      1  6057   35.4%         35.4%
##      2 10976   64.1%         64.2%
##      3     4    0.0%          0.0%
##      4    62    0.4%          0.4%
##      5     2    0.0%          0.0%
##      6     2    0.0%          0.0%
##   <NA>    28    0.2%             -
##  Total 17131  100.0%        100.0%
genderid_tab
##  genderid     n percent valid_percent
##         1  8484   49.5%         49.6%
##         2  8285   48.4%         48.5%
##         3   142    0.8%          0.8%
##         4   181    1.1%          1.1%
##      <NA>    39    0.2%             -
##     Total 17131  100.0%        100.0%
genderid_other_tab
##               genderid_other     n      percent valid_percent
##                    Masculine     1 5.837371e-05   0.005681818
##                      agender     7 4.086160e-04   0.039772727
##                     bisexual     2 1.167474e-04   0.011363636
##                        blend     1 5.837371e-05   0.005681818
##                          boy     1 5.837371e-05   0.005681818
##                   cassgender     1 5.837371e-05   0.005681818
##                    demi-girl     4 2.334948e-04   0.022727273
##                     flexible     1 5.837371e-05   0.005681818
##                          gay     2 1.167474e-04   0.011363636
##                 gender-fluid    16 9.339793e-04   0.090909091
##      gender-fluid_non-binary     2 1.167474e-04   0.011363636
##                 gender-queer     6 3.502423e-04   0.034090909
##        gender_not_conforming     1 5.837371e-05   0.005681818
##                         girl     1 5.837371e-05   0.005681818
##                     girlflux     1 5.837371e-05   0.005681818
##                      he/they     1 5.837371e-05   0.005681818
##                        human     2 1.167474e-04   0.011363636
##                      lesbian     1 5.837371e-05   0.005681818
##                   non-binary   103 6.012492e-03   0.585227273
##    non-binary_ woman-shifted     1 5.837371e-05   0.005681818
##            non-binary_female     5 2.918685e-04   0.028409091
##                    pansexual     2 1.167474e-04   0.011363636
##                     she-they     3 1.751211e-04   0.017045455
##             transgender_male     2 1.167474e-04   0.011363636
##       transgender_non-binary     1 5.837371e-05   0.005681818
##  transgender_non-binary_male     1 5.837371e-05   0.005681818
##                   two-spirit     4 2.334948e-04   0.022727273
##                    unlabeled     2 1.167474e-04   0.011363636
##                        woman     1 5.837371e-05   0.005681818
##                         <NA> 16955 9.897262e-01            NA
##                        Total 17131 1.000000e+00   1.000000000
# Descriptives for age --------------------------------------------------

### 1. Make a clean numeric age variable -------------------
db <- db %>%
  mutate(
    age_num = suppressWarnings(as.numeric(age))
  )

### 2. Descriptives for age_num ----------------------------
age_desc <- db %>%
  summarise(
    n      = sum(!is.na(age_num)),
    mean   = mean(age_num, na.rm = TRUE),
    sd     = sd(age_num, na.rm = TRUE),
    min    = min(age_num, na.rm = TRUE),
    q1     = quantile(age_num, 0.25, na.rm = TRUE),
    median = median(age_num, na.rm = TRUE),
    q3     = quantile(age_num, 0.75, na.rm = TRUE),
    max    = max(age_num, na.rm = TRUE)
  )

age_desc
## # A tibble: 1 × 8
##       n  mean    sd   min    q1 median    q3   max
##   <int> <dbl> <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1 17078  19.2  2.84     1    18     18    20   100

Data was obtained from 17131 students from the University of Alberta. The mean age of the partcipants was M = 19.1821642, SD = 2.8421852. Of the total, 35.4% identified as men, 64.1% idenified as women, 0.4% identified as other and 0.2% preferred not to identify.

Missing values

### clean_vpq ---------------------------------
vpq_vars <- c("oe_2", "oe_3", "ob_1", "ob_4")

# Keep only rows with *all* VPQ items present
n_before <- nrow(db)
db_clean <- db[complete.cases(db[, vpq_vars]), ]
n_after  <- nrow(db_clean)

# Standardise items
db_clean <- db_clean %>%
  mutate(
    oe_2_z = as.numeric(scale(oe_2)),
    oe_3_z = as.numeric(scale(oe_3)),
    ob_1_z = as.numeric(scale(ob_1)),
    ob_4_z = as.numeric(scale(ob_4))
  )

# Recompute OE and OB scores in the cleaned data
db_clean <- db_clean %>%
  mutate(
    OE_score = rowMeans(across(c(oe_2, oe_3)), na.rm = TRUE),
    OB_score = rowMeans(across(c(ob_1, ob_4)), na.rm = TRUE)
  )

vpq_items <- db_clean[, vpq_vars]

Out of the total, 537 were removed due to missing values in the VPQ responses. A new total of 16594 participants were included in the analyses.

Factor Analyses

We explored how the data structure aligns with the different theoretical frameworks.

set.seed(123)

N <- nrow(db_clean)
index_A <- sample(1:N, size = round(N/2), replace = FALSE)

db_A <- db_clean[index_A, ]   # Sample for EFA
db_B <- db_clean[-index_A, ]  # Sample for CFA

vpq_A <- db_A[, vpq_vars] #efa
vpq_B <- db_B[, vpq_vars] #cfa

To allow independent cross-validation of the factor structure, the full dataset was randomly split into two equal subsamples. Consequently, EFAn = 8297, and CFAn = 8297.

Exploratory Factor Analysis

# 1) Parallel analysis  ----------------------------------------
fa.parallel(
  vpq_A,
  fa = "fa",
  fm = "ml",
  n.iter = 100,
  main = "Parallel Analysis (EFA sample)"
)

## Parallel analysis suggests that the number of factors =  2  and the number of components =  NA

A parallel analysis using maximum likelihood extraction indicated that a two-factor solution was optimal for the Visual Perspective Questionnaire (VPQ), with the observed-data eigenvalues exceeding those from random data for two factors, suggesting no additional components. Despite this, we stilld explored a one-factor solution.

One-Factor EFA

# 1-factor
efa_1f <- fa(
  vpq_A,
  nfactors = 1,
  fm = "ml",
  rotate = "oblimin"
)
print(efa_1f)
## Factor Analysis using method =  ml
## Call: fa(r = vpq_A, nfactors = 1, rotate = "oblimin", fm = "ml")
## Standardized loadings (pattern matrix) based upon correlation matrix
##        ML1    h2    u2 com
## oe_2 -0.54 0.295 0.705   1
## oe_3  0.35 0.120 0.880   1
## ob_1  0.17 0.029 0.971   1
## ob_4  1.00 0.995 0.005   1
## 
##                 ML1
## SS loadings    1.44
## Proportion Var 0.36
## 
## Mean item complexity =  1
## Test of the hypothesis that 1 factor is sufficient.
## 
## df null model =  6  with the objective function =  0.76 with Chi Square =  6316.77
## df of  the model are 2  and the objective function was  0.26 
## 
## The root mean square of the residuals (RMSR) is  0.18 
## The df corrected root mean square of the residuals is  0.32 
## 
## The harmonic n.obs is  8297 with the empirical chi square  3320.63  with prob <  0 
## The total n.obs was  8297  with Likelihood Chi Square =  2134.4  with prob <  0 
## 
## Tucker Lewis Index of factoring reliability =  -0.014
## RMSEA index =  0.358  and the 90 % confidence intervals are  0.346 0.371
## BIC =  2116.35
## Fit based upon off diagonal values = 0.67
## Measures of factor score adequacy             
##                                                    ML1
## Correlation of (regression) scores with factors   1.00
## Multiple R square of scores with factors          1.00
## Minimum correlation of possible factor scores     0.99

Two-Factor EFA

# 2-factor
efa_2f <- fa(
  vpq_A,
  nfactors = 2,
  fm = "ml",
  rotate = "oblimin"
)
print(efa_2f)
## Factor Analysis using method =  ml
## Call: fa(r = vpq_A, nfactors = 2, rotate = "oblimin", fm = "ml")
## Standardized loadings (pattern matrix) based upon correlation matrix
##        ML1   ML2   h2    u2 com
## oe_2 -0.60  0.15 0.34 0.661 1.1
## oe_3  0.10  0.84 0.76 0.237 1.0
## ob_1  0.33 -0.52 0.30 0.701 1.7
## ob_4  0.96  0.07 0.96 0.036 1.0
## 
##                        ML1  ML2
## SS loadings           1.38 0.99
## Proportion Var        0.34 0.25
## Cumulative Var        0.34 0.59
## Proportion Explained  0.58 0.42
## Cumulative Proportion 0.58 1.00
## 
##  With factor correlations of 
##      ML1  ML2
## ML1 1.00 0.23
## ML2 0.23 1.00
## 
## Mean item complexity =  1.2
## Test of the hypothesis that 2 factors are sufficient.
## 
## df null model =  6  with the objective function =  0.76 with Chi Square =  6316.77
## df of  the model are -1  and the objective function was  0 
## 
## The root mean square of the residuals (RMSR) is  0 
## The df corrected root mean square of the residuals is  NA 
## 
## The harmonic n.obs is  8297 with the empirical chi square  0  with prob <  NA 
## The total n.obs was  8297  with Likelihood Chi Square =  0  with prob <  NA 
## 
## Tucker Lewis Index of factoring reliability =  1.001
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy             
##                                                    ML1  ML2
## Correlation of (regression) scores with factors   0.98 0.88
## Multiple R square of scores with factors          0.96 0.78
## Minimum correlation of possible factor scores     0.93 0.56

Loading comparisons

# Compare loadings in a tidy way. Cutoff = 0 to see all the loadings

## One-factor
print(efa_1f$loadings, cutoff = 0)
## 
## Loadings:
##      ML1   
## oe_2 -0.543
## oe_3  0.347
## ob_1  0.170
## ob_4  0.997
## 
##                  ML1
## SS loadings    1.439
## Proportion Var 0.360
## Two-factor
print(efa_2f$loadings, cutoff = 0)
## 
## Loadings:
##      ML1    ML2   
## oe_2 -0.599  0.152
## oe_3  0.099  0.845
## ob_1  0.330 -0.520
## ob_4  0.963  0.070
## 
##                  ML1   ML2
## SS loadings    1.406 1.012
## Proportion Var 0.351 0.253
## Cumulative Var 0.351 0.604

Confirmatory Factor Analysis

# Determine scale range to reverse OB items (for bipolar CFA)
min_scale <- min(as.matrix(vpq_items), na.rm = TRUE)
max_scale <- max(as.matrix(vpq_items), na.rm = TRUE)

db_B <- db_B %>%
  mutate(
    ob_1_rev = (max_scale + min_scale) - ob_1,
    ob_4_rev = (max_scale + min_scale) - ob_4
  )

# Subsets for models
## Mutually Exclusive (ME)
vpq_ME  <- db_B %>% dplyr::select(oe_2, oe_3, ob_1_rev, ob_4_rev) %>% na.omit()
## Continuum or Complementary (CC)
vpq_CC  <- db_B %>% dplyr::select(oe_2, oe_3, ob_1, ob_4)         %>% na.omit()
### Independent (IND)
vpq_IND <- db_B %>% dplyr::select(oe_2, oe_3, ob_1, ob_4)         %>% na.omit()

Mutually exclusive: single bipolar factor

This model tested the mutually exclusive framework by specifying a single bipolar factor underlying all four VPQ items

model_ME <- '
  VP_bipolar =~ oe_2 + oe_3 + ob_1_rev + ob_4_rev
'

fit_ME <- cfa(
  model_ME,
  data = vpq_ME,
  std.lv = TRUE
)
summary(fit_ME)
## lavaan 0.6.17 ended normally after 7271 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                         8
## 
##   Number of observations                          8297
## 
## Model Test User Model:
##                                                       
##   Test statistic                              1461.835
##   Degrees of freedom                                 2
##   P-value (Chi-square)                           0.000
## 
## Parameter Estimates:
## 
##   Standard errors                             Standard
##   Information                                 Expected
##   Information saturated (h1) model          Structured
## 
## Latent Variables:
##                    Estimate  Std.Err  z-value  P(>|z|)
##   VP_bipolar =~                                       
##     oe_2              0.018       NA                  
##     oe_3             -0.016       NA                  
##     ob_1_rev          0.009       NA                  
##     ob_4_rev         28.558       NA                  
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)
##    .oe_2              1.084       NA                  
##    .oe_3              1.234       NA                  
##    .ob_1_rev          1.512       NA                  
##    .ob_4_rev       -814.436       NA                  
##     VP_bipolar        1.000
semPaths(fit_ME,   whatLabels = "std", layout = "tree", edge.label.cex = 0.8)

Continuum / complementary: single factor, all items original direction

This model tested the continuum/complementary framework by specifying a single latent continuum factor, with items coded so that higher scores reflected greater engagement in an observer-like perspective.

model_CC <- '
  VP_compl =~ oe_2 + oe_3 + ob_1 + ob_4
'

fit_CC <- cfa(
  model_CC,
  data = vpq_CC,
  std.lv = TRUE
)
summary(fit_CC)
## lavaan 0.6.17 ended normally after 7271 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                         8
## 
##   Number of observations                          8297
## 
## Model Test User Model:
##                                                       
##   Test statistic                              1461.835
##   Degrees of freedom                                 2
##   P-value (Chi-square)                           0.000
## 
## Parameter Estimates:
## 
##   Standard errors                             Standard
##   Information                                 Expected
##   Information saturated (h1) model          Structured
## 
## Latent Variables:
##                    Estimate  Std.Err  z-value  P(>|z|)
##   VP_compl =~                                         
##     oe_2              0.018       NA                  
##     oe_3             -0.016       NA                  
##     ob_1             -0.009       NA                  
##     ob_4            -28.558       NA                  
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)
##    .oe_2              1.084       NA                  
##    .oe_3              1.234       NA                  
##    .ob_1              1.512       NA                  
##    .ob_4           -814.436       NA                  
##     VP_compl          1.000

Continuum / complementary: two positively correlated factors

This model tested the continuum/complementary framework with two latent factors (OE and OB). The correlation between OE and OB was freely estimated.

model_CC_2 <- '
  OE =~ oe_2 + oe_3
  OB =~ ob_1 + ob_4
  OE ~~ OB    # freely estimated correlation
'

fit_CC_2 <- cfa(
  model_CC_2,
  data   = vpq_CC,  
  std.lv = TRUE
)
summary(fit_CC_2)
## lavaan 0.6.17 ended normally after 2754 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                         9
## 
##   Number of observations                          8297
## 
## Model Test User Model:
##                                                       
##   Test statistic                              1451.930
##   Degrees of freedom                                 1
##   P-value (Chi-square)                           0.000
## 
## Parameter Estimates:
## 
##   Standard errors                             Standard
##   Information                                 Expected
##   Information saturated (h1) model          Structured
## 
## Latent Variables:
##                    Estimate  Std.Err  z-value  P(>|z|)
##   OE =~                                               
##     oe_2              0.211       NA                  
##     oe_3             -0.192       NA                  
##   OB =~                                               
##     ob_1              0.012       NA                  
##     ob_4             22.617       NA                  
## 
## Covariances:
##                    Estimate  Std.Err  z-value  P(>|z|)
##   OE ~~                                               
##     OB               -0.113       NA                  
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)
##    .oe_2              1.040       NA                  
##    .oe_3              1.197       NA                  
##    .ob_1              1.511       NA                  
##    .ob_4           -510.411       NA                  
##     OE                1.000                           
##     OB                1.000

Continuum / complementary: two negatively correlated factors

This model tested the continuum/complementary framework with two latent factors (OE and OB). The correlation between OE and OB was constrained to be negative.

model_CC_neg <- '
  # latent factors
  OE =~ oe_2 + oe_3
  OB =~ ob_1 + ob_4

  # force correlation to be negative but estimated freely
  theta =~ 1*OE        # dummy: create a free parameter
  OE ~~ -exp(theta)*OB
'


fit_CC_neg <- cfa(
  model_CC_neg,
  data   = vpq_CC, 
  std.lv = TRUE
)

summary(fit_CC_neg)
## lavaan 0.6.17 did NOT end normally after 10000 iterations
## ** WARNING ** Estimates below are most likely unreliable
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                        10
## 
##   Number of observations                          8297
## 
## 
## Parameter Estimates:
## 
##   Standard errors                             Standard
##   Information                                 Expected
##   Information saturated (h1) model          Structured
## 
## Latent Variables:
##                    Estimate   Std.Err  z-value  P(>|z|)
##   OE =~                                                
##     oe_2               0.211       NA                  
##     oe_3              -0.192       NA                  
##   OB =~                                                
##     ob_1               0.006       NA                  
##     ob_4              44.131       NA                  
##   theta =~                                             
##     OE                 1.000                           
## 
## Covariances:
##                    Estimate   Std.Err  z-value  P(>|z|)
##  .OE ~~                                                
##     OB      (thet)    -0.029       NA                  
##   OB ~~                                                
##     theta             -0.029       NA                  
## 
## Variances:
##                    Estimate   Std.Err  z-value  P(>|z|)
##    .oe_2               1.040       NA                  
##    .oe_3               1.197       NA                  
##    .ob_1               1.512       NA                  
##    .ob_4           -1946.443       NA                  
##    .OE                 0.000                           
##     OB                 1.000                           
##     theta              1.000

Independent: two uncorrelated factors

This model tested the independent framework by specifying two orthogonal factors, with the correlation between OE and OB fixed to zero

# forced to treat the two latent factors as completely unrelated
model_IND <- '
  OE =~ oe_2 + oe_3
  OB =~ ob_1 + ob_4
  OE ~~ 0*OB        # orthogonal factors, correlation fixed to zero
'

fit_IND <- cfa(
  model_IND,
  data   = vpq_IND,
  std.lv = TRUE,
  control = list(iter.max = 50000)
)

summary(fit_IND)
## lavaan 0.6.17 ended normally after 12 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                         8
## 
##   Number of observations                          8297
## 
## Model Test User Model:
##                                                       
##   Test statistic                              5879.609
##   Degrees of freedom                                 2
##   P-value (Chi-square)                           0.000
## 
## Parameter Estimates:
## 
##   Standard errors                             Standard
##   Information                                 Expected
##   Information saturated (h1) model          Structured
## 
## Latent Variables:
##                    Estimate  Std.Err  z-value  P(>|z|)
##   OE =~                                               
##     oe_2              0.805       NA                  
##     oe_3             -0.050       NA                  
##   OB =~                                               
##     ob_1              0.910       NA                  
##     ob_4              0.243       NA                  
## 
## Covariances:
##                    Estimate  Std.Err  z-value  P(>|z|)
##   OE ~~                                               
##     OB                0.000                           
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)
##    .oe_2              0.437       NA                  
##    .oe_3              1.232       NA                  
##    .ob_1              0.683       NA                  
##    .ob_4              1.056       NA                  
##     OE                1.000                           
##     OB                1.000

Model checks

Convergence checks

lavInspect(fit_ME,  "converged")
## [1] TRUE
lavInspect(fit_CC,  "converged")
## [1] TRUE
lavInspect(fit_CC_2,"converged")
## [1] TRUE
lavInspect(fit_CC_neg,"converged")
## [1] FALSE
lavInspect(fit_IND, "converged")
## [1] TRUE

Latent Covariances

We examined latent covariances of two models. In the Continuum / complementary: two positively correlated factors model we explored the question: “If we allow OE and OB to relate, how strongly do they actually do so?”. In the Independent: two uncorrelated factors we ask “What does the model assume about the relationship between OE and OB?”.

This is relevant because the observed-scale correlations between OE and OB items could be contaminated by measurement error and/or influenced by item wording and response styles. Latent covariances analysis remove item-specific noise and could reflect underlying psychological relationship between perspectives.

lavInspect(fit_CC_2, "cov.lv")
##        OE     OB
## OE  1.000       
## OB -0.113  1.000
lavInspect(fit_IND, "cor.lv")
##    OE OB
## OE  1   
## OB  0  1

When allowed to covary, OE and OB showed a negligible latent association (r = -0.1132342), indicating that assuming independence between perspectives does not conflict with the data. Moreover, when OE and OB are assumed independent, the data do not contradict that assumption.

Simulation of the Theoretical Frameworks VS Real Data

We used a simulation-based approach to evaluate how well different theoretical frameworks captured the joint distribution of OE and OB scores in the current data.

Data Distribution

library(tidyverse)
library(MASS)     # for mvrnorm
library(cowplot)  # for arranging plots

# Data handling --------------------------------------------
### vpq ---------------------------------------------------------------------
vpq_db <- db_clean
# compute OE and OB scores in the cleaned data
vpq_db$OE_score <- rowMeans(vpq_db[, c("oe_2", "oe_3")], na.rm = TRUE)
vpq_db$OB_score <- rowMeans(vpq_db[, c("ob_1", "ob_4")], na.rm = TRUE)

vpq_db <- vpq_db %>%
  mutate(
    OE_z = as.numeric(scale(OE_score)),
    OB_z = as.numeric(scale(OB_score))
  )



# 1. Load data
# Compute OE and OB scores from items
vpq_db <- vpq_db %>%
  mutate(
    OE_score = rowMeans(across(c(oe_2, oe_3)), na.rm = TRUE),
    OB_score = rowMeans(across(c(ob_1, ob_4)), na.rm = TRUE)
  ) %>%
  mutate(
    OE_z = as.numeric(scale(OE_score)),
    OB_z = as.numeric(scale(OB_score))
  )



# real data -----------------------------------------------------------
real_df <- vpq_db %>%
  dplyr::mutate(
    OE = OE_z,
    OB = OB_z,
    framework = "Real data"
  ) %>%
  dplyr::select(OE, OB, framework)

N <- nrow(real_df)

# simulations -----------------------------------------------------------
simulate_framework <- function(N, rho, label){
  Sigma <- matrix(c(1, rho, rho, 1), 2, 2)
  raw <- MASS::mvrnorm(N, mu = c(0,0), Sigma = Sigma)
  tibble(
    OE = raw[,1],
    OB = raw[,2],
    framework = label
  )
}

sim_independent <- simulate_framework(N, -0.30, "Independent")
sim_continuum   <- simulate_framework(N, -0.60, "Continuum")
sim_mutual      <- simulate_framework(N, -0.90, "Mutually exclusive")

# Combine *all* frameworks
all_df <- dplyr::bind_rows(real_df, sim_independent, sim_continuum, sim_mutual)

set.seed(123)

all_df_plot <- all_df %>%
  mutate(
    OE_plot = if_else(
      framework == "Real data",
      OE + rnorm(n(), 0, 0.35),   # jitter only real
      OE
    ),
    OB_plot = if_else(
      framework == "Real data",
      OB + rnorm(n(), 0, 0.35),
      OB
    )
  )

x_limits <- range(all_df_plot$OE_plot, na.rm = TRUE)
y_limits <- range(all_df_plot$OB_plot, na.rm = TRUE)

Quadrants

We compared the empirical association observed in the VPQ data against formalized simulations derived from the three hypothesized frameworks. Each framework makes distinct predictions about the strength and structure of the relationship between OE and OB, allowing the observed data to be evaluated relative to theoretically constrained benchmarks rather than in isolation.

Three frameworks were operationalized using bivariate normal simulations with unit variances and varying correlations between OE and OB. The independent framework was modelled with a weak negative correlation (ρ = −.30), reflecting the assumption that OE and OB are largely separable dimensions that can co-occur with limited constraint. The continuum framework was modelled with a moderate negative correlation (ρ = −.60), consistent with the view that OE and OB reflect opposing tendencies along a single dimension, such that increases in one are typically accompanied by decreases in the other. Finally, the mutually exclusive framework was modelled with a strong negative correlation (ρ = −.90), representing the theoretical position that adopting one perspective largely precludes the other.

# 1. Real data: OE / OB scores and z-scores-----------------------------------------
vpq_db <- vpq_db %>%
  dplyr::mutate(
    OE_score = rowMeans(dplyr::across(c(oe_2, oe_3)), na.rm = TRUE),
    OB_score = rowMeans(dplyr::across(c(ob_1, ob_4)), na.rm = TRUE)
  ) %>%
  dplyr::mutate(
    OE_z = as.numeric(scale(OE_score)),
    OB_z = as.numeric(scale(OB_score))
  )

# Real data in z-space for later binding
real_df <- vpq_db %>%
  dplyr::transmute(
    OE = OE_z,
    OB = OB_z,
    framework = "Real data"
  )

N <- nrow(real_df)

# 2. Simulate the three theoretical frameworks -------------------------------------------------------------------------

simulate_framework <- function(N, rho, label){
  Sigma <- matrix(c(1, rho, rho, 1), 2, 2)
  raw   <- MASS::mvrnorm(N, mu = c(0, 0), Sigma = Sigma)
  tibble(
    OE = raw[, 1],
    OB = raw[, 2],
    framework = label
  )
}

sim_independent <- simulate_framework(N, -0.30, "Independent")
sim_continuum   <- simulate_framework(N, -0.60, "Continuum")
sim_mutual      <- simulate_framework(N, -0.90, "Mutually exclusive")

# 3. Combine real + simulated-------------------------------------------------------------------------

quad_all <- dplyr::bind_rows(
  real_df,
  sim_independent,
  sim_continuum,
  sim_mutual
)
# EXTREME QUADRANTS
# 4. Extreme quadrant thresholds (±1 SD) in z-space -----------------------

# Since OE and OB in quad_all are z-scores, ±1 SD corresponds to ±1.
thr_OE_hi <-  1
thr_OE_lo <- -1
thr_OB_hi <-  1
thr_OB_lo <- -1


# 5. Apply extreme quadrant rules to ALL frameworks -----------------------

quad_all <- quad_all %>%
  dplyr::mutate(
    group = dplyr::case_when(
      OE >= thr_OE_hi & OB >= thr_OB_hi ~ "High_OE_High_OB",
      OE <= thr_OE_lo & OB <= thr_OB_lo ~ "Low_OE_Low_OB",
      OE >= thr_OE_hi & OB <= thr_OB_lo ~ "High_OE_Low_OB",
      OE <= thr_OE_lo & OB >= thr_OB_hi ~ "Low_OE_High_OB",
      TRUE                              ~ "Medium_or_Mixed"
    )
  )


# 6. Summary table: counts and proportion-----------------------------------------

quad_summary <- quad_all %>%
  dplyr::count(framework, group) %>%
  dplyr::group_by(framework) %>%
  dplyr::mutate(prop = n / sum(n)) %>%
  dplyr::ungroup()

quad_summary
## # A tibble: 18 × 4
##    framework          group               n    prop
##    <chr>              <chr>           <int>   <dbl>
##  1 Continuum          High_OE_High_OB    43 0.00259
##  2 Continuum          High_OE_Low_OB   1199 0.0723 
##  3 Continuum          Low_OE_High_OB   1265 0.0762 
##  4 Continuum          Low_OE_Low_OB      24 0.00145
##  5 Continuum          Medium_or_Mixed 14063 0.847  
##  6 Independent        High_OE_High_OB   151 0.00910
##  7 Independent        High_OE_Low_OB    806 0.0486 
##  8 Independent        Low_OE_High_OB    827 0.0498 
##  9 Independent        Low_OE_Low_OB     191 0.0115 
## 10 Independent        Medium_or_Mixed 14619 0.881  
## 11 Mutually exclusive High_OE_Low_OB   1860 0.112  
## 12 Mutually exclusive Low_OE_High_OB   1921 0.116  
## 13 Mutually exclusive Medium_or_Mixed 12813 0.772  
## 14 Real data          High_OE_High_OB   161 0.00970
## 15 Real data          High_OE_Low_OB    805 0.0485 
## 16 Real data          Low_OE_High_OB    903 0.0544 
## 17 Real data          Low_OE_Low_OB     198 0.0119 
## 18 Real data          Medium_or_Mixed 14527 0.875
# 7. Jittered version for plotting (real data only) ------------------------

set.seed(123)

quad_all_plot <- quad_all %>%
  dplyr::mutate(
    OE_plot = dplyr::if_else(
      framework == "Real data",
      OE + rnorm(dplyr::n(), 0, 0.35),
      OE
    ),
    OB_plot = dplyr::if_else(
      framework == "Real data",
      OB + rnorm(dplyr::n(), 0, 0.35),
      OB
    )
  )

Odd Ratios

We tested whether the joint distribution of OE and OB scores was better characterized by statistical independence or by a complementary, quasi-bipolar relation by comparing quadrant frequencies from the real data with those generated under each theoretical framework.

# Odds ratios for extreme quadrants only ----------------------------------

compute_or_extreme <- function(counts_df, quadrant_name, sim_label,
                               real_label = "Real data") {
  
  real_df <- counts_df %>% dplyr::filter(framework == real_label)
  sim_df  <- counts_df %>% dplyr::filter(framework == sim_label)
  
  # counts for the target quadrant
  a <- real_df$n[real_df$quadrant == quadrant_name]
  c <- sim_df$n[sim_df$quadrant == quadrant_name]
  
  # handle absent quadrants
  if (length(a) == 0) a <- 0
  if (length(c) == 0) c <- 0
  
  b <- sum(real_df$n) - a
  d <- sum(sim_df$n)  - c
  
  # Haldane correction
  a <- a + 0.5; b <- b + 0.5
  c <- c + 0.5; d <- d + 0.5
  
  OR <- (a / b) / (c / d)
  
  tibble::tibble(
    quadrant   = quadrant_name,
    comparison = paste(real_label, "vs", sim_label),
    OR         = OR
  )
}

quad_profiles <- quad_summary %>%
  dplyr::rename(quadrant = group)

extreme_quadrants <- c("High_OE_High_OB", "Low_OE_Low_OB")
sim_frameworks <- c("Independent", "Continuum", "Mutually exclusive")

odds_ratio_results <- purrr::map_dfr(sim_frameworks, function(sim_lab) {
  purrr::map_dfr(extreme_quadrants, function(q) {
    compute_or_extreme(quad_profiles, q, sim_lab)
  })
})

odds_ratio_results
## # A tibble: 6 × 3
##   quadrant        comparison                          OR
##   <chr>           <chr>                            <dbl>
## 1 High_OE_High_OB Real data vs Independent          1.07
## 2 Low_OE_Low_OB   Real data vs Independent          1.04
## 3 High_OE_High_OB Real data vs Continuum            3.74
## 4 Low_OE_Low_OB   Real data vs Continuum            8.19
## 5 High_OE_High_OB Real data vs Mutually exclusive 326.  
## 6 Low_OE_Low_OB   Real data vs Mutually exclusive 402.

Each odds ratio compares how often an extreme quadrant appears in the real data relative to how often it would appear if a given theoretical framework were true.

Convergent Validity (Vividness of Visual Imagery) in MassTesting_2026

Data Distribution

db_original <- read_csv("MT_2026.csv", show_col_types = FALSE) %>%
  clean_names() %>%
  mutate(
    OE = rowMeans(across(c(oe_2, oe_3)), na.rm = TRUE),
    OB = rowMeans(across(c(ob_1, ob_4)), na.rm = TRUE)
  )
total_db_original <- nrow(db_original)
# Missing -----------------------------------------------------------------

db_complete <- na.omit(db_original)
missing_cases <- nrow(db_original) - nrow(db_complete)

# VVIQ descriptive before vviq < 32 removed ---------------------------------------------------------
ggplot(db_complete, aes(x = vviq_total)) +
  geom_density(linewidth = 1) +
  geom_vline(xintercept = 32, colour = "red", linewidth = 1) +
  labs(
    x = "VVIQ total score",
    y = "Density",
    title = "Density distribution of VVIQ scores"
  ) +
  theme_minimal()

# Aphantasia cases ---------------------------------------------------------
db_aph <- db_complete %>% filter(vviq_total >= 32)
aphantasia_cases <- nrow(db_complete) - nrow(db_aph)
db <- db_aph

# Total participants after filtering
total <- nrow(db)

We explored the relationship between the VVIQ and VPQ quartiles distribution. A total of 1928 participants completed online versions of the VPQ and the VVIQ. Of the total, 294 were excluded due to missing data. Another 73 were excluded due to be identified as possible aphantasia cases (VVIQ scores below 32). The final sample size was 1561

Quadrants

Using the same approach as before, we calculated the quadrants for the new sample.

# Quadrants (z-score OE/OB, ±0.5 SD cutoffs) -------------------------------

quad_all <- db %>%
  mutate(
    OE_z = as.numeric(scale(OE)),
    OB_z = as.numeric(scale(OB))
  )

# --- FIXED THRESHOLD (used in BOTH datasets) ---
thr_OE_hi <-  0.5
thr_OE_lo <- -0.5
thr_OB_hi <-  0.5
thr_OB_lo <- -0.5

quad_all <- quad_all %>%
  mutate(
    group = case_when(
      OE_z >= thr_OE_hi & OB_z >= thr_OB_hi ~ "High_OE_High_OB",
      OE_z <= thr_OE_lo & OB_z <= thr_OB_lo ~ "Low_OE_Low_OB",
      OE_z >= thr_OE_hi & OB_z <= thr_OB_lo ~ "High_OE_Low_OB",
      OE_z <= thr_OE_lo & OB_z >= thr_OB_hi ~ "Low_OE_High_OB",
      TRUE                                  ~ "Medium_or_Mixed"
    )
  )

quad_counts_MT_2026 <- quad_all %>% count(group, name = "n")
quad_counts_MT_2026
## # A tibble: 5 × 2
##   group               n
##   <chr>           <int>
## 1 High_OE_High_OB    59
## 2 High_OE_Low_OB    335
## 3 Low_OE_High_OB    117
## 4 Low_OE_Low_OB      54
## 5 Medium_or_Mixed   996

VVIQ as Potentiator of Extreme Quadrants

# --- Prep: VVIQ z + keep 4 quadrants -----------------------------------------

quad_levels <- c(
  "High_OE_High_OB",
  "Low_OE_Low_OB",
  "High_OE_Low_OB",
  "Low_OE_High_OB"
)

db_quad4 <- quad_all %>%
  mutate(vviq_z = as.numeric(scale(vviq_total))) %>%
  filter(group %in% quad_levels) %>%
  mutate(
    extreme_consistent = as.integer(
      group %in% c("High_OE_High_OB", "Low_OE_Low_OB")
    ),
    quad4 = relevel(factor(group, levels = quad_levels),
                    ref = "High_OE_Low_OB")
  )

Binary Logistic Regression: consistent (HH + LL) vs inconsistent (HL + LH)

The binary logistic regression examines whether imagery vividness is associated with the alignment of own-eyes and observer perspectives when participants occupy extreme regions of the OE–OB space. By collapsing extreme quadrants into consistent profiles (High OE–High OB and Low OE–Low OB) versus inconsistent profiles (High OE–Low OB and Low OE–High OB), the model tests whether higher vividness predicts a greater likelihood that both perspectives are jointly high or jointly low rather than mismatched. Thus, the regression does not assess perspective strength per se, but rather whether imagery vividness is related to the coherence between own-eyes and observer perspectives under a fixed quadrant definition.

logit_mod <- glm(
  extreme_consistent ~ vviq_z,
  data   = db_quad4,
  family = binomial(link = "logit")
)

summary(logit_mod)
## 
## Call:
## glm(formula = extreme_consistent ~ vviq_z, family = binomial(link = "logit"), 
##     data = db_quad4)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.3866     0.1057 -13.119   <2e-16 ***
## vviq_z        0.2032     0.1098   1.851   0.0641 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 565.45  on 564  degrees of freedom
## Residual deviance: 561.98  on 563  degrees of freedom
## AIC: 565.98
## 
## Number of Fisher Scoring iterations: 4
exp(cbind(OR = coef(logit_mod), confint(logit_mod)))
##                    OR     2.5 %    97.5 %
## (Intercept) 0.2499327 0.2022433 0.3061936
## vviq_z      1.2253445 0.9897204 1.5230866

Multinomial Logistic Regression

The multinomial logistic regression extends the binary analysis by examining how imagery vividness relates to specific extreme perspective configurations rather than overall consistency. Using the High OE–Low OB quadrant as the reference category, the model estimates whether increases in vividness differentially shift the likelihood of belonging to High OE–High OB, Low OE–Low OB, or Low OE–High OB profiles. This analysis decomposes the binary consistency effect, allowing identification of which particular quadrant transitions contribute to the overall tendency toward aligned versus misaligned perspective profiles, and clarifies whether imagery vividness selectively favors certain configurations or acts more uniformly across the extreme OE–OB space.

library(nnet)

multi_mod <- multinom(
  quad4 ~ vviq_z,
  data  = db_quad4,
  trace = FALSE
)

summary(multi_mod)
## Call:
## multinom(formula = quad4 ~ vviq_z, data = db_quad4, trace = FALSE)
## 
## Coefficients:
##                 (Intercept)    vviq_z
## High_OE_High_OB   -1.726333 0.2001968
## Low_OE_Low_OB     -1.827197 0.3241902
## Low_OE_High_OB    -1.042255 0.2139621
## 
## Std. Errors:
##                 (Intercept)    vviq_z
## High_OE_High_OB   0.1415455 0.1467977
## Low_OE_Low_OB     0.1485536 0.1547399
## Low_OE_High_OB    0.1078279 0.1118537
## 
## Residual Deviance: 1231.256 
## AIC: 1243.256
exp(coef(multi_mod))
##                 (Intercept)   vviq_z
## High_OE_High_OB   0.1779357 1.221643
## Low_OE_Low_OB     0.1608638 1.382910
## Low_OE_High_OB    0.3526585 1.238576

Interpretation: Estimate (β) : The change in the log-odds of the outcome for a one-unit increase in the predictor.

Odds Ratio (OR = exp(β)) : Converts log-odds into an interpretable scale.

  • OR > 1: odds increase.

  • OR = 1: no change.

  • OR < 1: odds decrease.

Intercept: The log-odds (or OR) of the outcome when the predictor is 0.

Standard Error (SE): Reflects uncertainty in the coefficient estimate. Larger SEs indicate less precise estimates.

z value: Ratio of Estimate to SE. Used to compute the p-value.

p-value: Probability of observing an effect at least as extreme if the true effect were zero. Used to assess statistical evidence for an effect.

Confidence Interval (CI): Range of plausible OR values. If the CI includes 1, the effect is not statistically distinguishable from no effect.

(OPAM) Convergent Validity (Vividness of Visual Imagery)

(OPAM) Data Distribution

# Read raw data 

db <- read_csv("opam.csv") %>%
  clean_names()

db <- db %>%
  mutate(
    # VPQ scores
    OE = rowMeans(across(c(oe_2, oe_3)), na.rm = TRUE),
    OB = rowMeans(across(c(ob_1, ob_4)), na.rm = TRUE)
  )

# vviq transformations
vviq_min <- min(db$vviq_total, na.rm = TRUE)
vviq_max <- max(db$vviq_total, na.rm = TRUE)

db$vviq_inv <- (vviq_max + vviq_min) - db$vviq_total
#check
summary(db$vviq_total)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   16.00   50.00   57.00   56.31   64.00   80.00
summary(db$vviq_inv)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   16.00   32.00   39.00   39.69   46.00   80.00
cor(db$vviq_total, db$vviq_inv, use = "complete.obs") # Should be -1
## [1] -1
# # vviq_descriptive --------------------------------------------------------
# 
ggplot(db, aes(x = vviq_inv)) +
  geom_density(linewidth = 1) +
  geom_vline(xintercept = 32, colour = "red", linewidth = 1) +
  labs(
    x = "VVIQ total score",
    y = "Density",
    title = "Density distribution of VVIQ scores"
  ) +
  theme_minimal()

## Aphantasia cases
#
db <- db %>%
filter(vviq_inv >= 32)

(OPAM) Quadrants

# 1) Compute cuts from the CURRENT db (raw OE/OB)
mu_OE <- mean(db$OE, na.rm = TRUE)
sd_OE <- sd(db$OE,  na.rm = TRUE)

mu_OB <- mean(db$OB, na.rm = TRUE)
sd_OB <- sd(db$OB,  na.rm = TRUE)

k_sd <- 0.5   # keep as before to increase representation

high_OE_cut <- mu_OE + k_sd * sd_OE
low_OE_cut  <- mu_OE - k_sd * sd_OE

high_OB_cut <- mu_OB + k_sd * sd_OB
low_OB_cut  <- mu_OB - k_sd * sd_OB

# 2) Assign quadrants + mixed using those cuts
db_quartiles <- db %>%
  mutate(
    VP_quartile = case_when(
      OE >= high_OE_cut & OB >= high_OB_cut ~ "High_OE_High_OB",
      OE <= low_OE_cut  & OB <= low_OB_cut  ~ "Low_OE_Low_OB",
      OE >= high_OE_cut & OB <= low_OB_cut  ~ "High_OE_Low_OB",
      OE <= low_OE_cut  & OB >= high_OB_cut ~ "Low_OE_High_OB",
      TRUE                                  ~ "Mixed"
    ),
    VP_quartile = factor(
      VP_quartile,
      levels = c(
        "High_OE_High_OB",
        "Low_OE_Low_OB",
        "High_OE_Low_OB",
        "Low_OE_High_OB",
        "Mixed"
      )
    )
  )

# 3) Counts + proportions
vp_counts <- db_quartiles %>%
  count(VP_quartile, name = "n") %>%
  mutate(prop = n / sum(n))

vp_counts
## # A tibble: 5 × 3
##   VP_quartile         n   prop
##   <fct>           <int>  <dbl>
## 1 High_OE_High_OB    11 0.0373
## 2 Low_OE_Low_OB       3 0.0102
## 3 High_OE_Low_OB     62 0.210 
## 4 Low_OE_High_OB     63 0.214 
## 5 Mixed             156 0.529
# keep the cut values handy for plotting
cuts <- tibble(
  k_sd = k_sd,
  low_OE_cut = low_OE_cut,
  high_OE_cut = high_OE_cut,
  low_OB_cut = low_OB_cut,
  high_OB_cut = high_OB_cut
)
cuts
## # A tibble: 1 × 5
##    k_sd low_OE_cut high_OE_cut low_OB_cut high_OB_cut
##   <dbl>      <dbl>       <dbl>      <dbl>       <dbl>
## 1   0.5       2.93        3.85       1.90        2.93

(OPAM) VVIQ as Potentiator of Extreme Quadrants

quad_levels <- c(
  "High_OE_High_OB",
  "Low_OE_Low_OB",
  "High_OE_Low_OB",
  "Low_OE_High_OB"
)

db_quad4 <- db_quartiles %>%
  mutate(vviq_z = as.numeric(scale(vviq_inv))) %>%
  filter(VP_quartile %in% quad_levels) %>%
  mutate(
    extreme_consistent = as.integer(VP_quartile %in% c("High_OE_High_OB", "Low_OE_Low_OB")),
    quad4 = relevel(factor(VP_quartile, levels = quad_levels), ref = "High_OE_Low_OB")
  )

(OPAM) Binary Logistic Regression: consistent (HH + LL) vs inconsistent (HL + LH)

logit_mod <- glm(
  extreme_consistent ~ vviq_z,
  data   = db_quad4,
  family = binomial(link = "logit")
)

summary(logit_mod)
## 
## Call:
## glm(formula = extreme_consistent ~ vviq_z, family = binomial(link = "logit"), 
##     data = db_quad4)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.3344     0.3112  -7.502 6.31e-14 ***
## vviq_z        0.4730     0.2235   2.116   0.0343 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 90.812  on 138  degrees of freedom
## Residual deviance: 86.612  on 137  degrees of freedom
## AIC: 90.612
## 
## Number of Fisher Scoring iterations: 5
# odds ratios and CIs
exp(cbind(OR = coef(logit_mod), confint(logit_mod)))
##                     OR      2.5 %    97.5 %
## (Intercept) 0.09687267 0.04950845 0.1697893
## vviq_z      1.60486922 1.02194654 2.5106769

(OPAM) Multinomial Logistic Regression

multi_mod <- nnet::multinom(
  quad4 ~ vviq_z,
  data = db_quad4
)
## # weights:  12 (6 variable)
## initial  value 192.694916 
## iter  10 value 135.544047
## final  value 135.486234 
## converged
summary(multi_mod)
## Call:
## nnet::multinom(formula = quad4 ~ vviq_z, data = db_quad4)
## 
## Coefficients:
##                 (Intercept)      vviq_z
## High_OE_High_OB -2.00200529  0.61997136
## Low_OE_Low_OB   -3.17841144 -0.65103326
## Low_OE_High_OB   0.01642012 -0.03949422
## 
## Std. Errors:
##                 (Intercept)    vviq_z
## High_OE_High_OB   0.3845583 0.2572050
## Low_OE_Low_OB     0.6875981 0.8111332
## Low_OE_High_OB    0.1789328 0.1816985
## 
## Residual Deviance: 270.9725 
## AIC: 282.9725
# odds ratios for multinomial model
exp(coef(multi_mod))
##                 (Intercept)    vviq_z
## High_OE_High_OB  0.13506417 1.8588748
## Low_OE_Low_OB    0.04165177 0.5215066
## Low_OE_High_OB   1.01655567 0.9612755