This script fits the models and produces the figures reported in the Results section of the hybrid-drawing paper. Models test whether each hybrid drawing contains recognizable content from its two constituent base concepts, using CLIP-based classification ranks against a set of 14 base categories.

1 Data

data_raw <- read.csv('../data_intermediates/analysis/prolific_k_way_classification_all_children_anonymized.csv',
                     stringsAsFactors = FALSE)

# coerce string booleans to logical
str_bool_cols <- c("is_adult", "concept1_available", "concept2_available",
                   "constituent1_in_top_1", "constituent2_in_top_1",
                   "constituent1_in_top_5", "constituent2_in_top_5",
                   "both_constituents_in_top_1", "both_constituents_in_top_5")
data_raw[str_bool_cols] <- lapply(data_raw[str_bool_cols],
                                  function(x) as.logical(as.character(x)))

cat("rows in raw dataset:", nrow(data_raw), "\n")

## rows in raw dataset: 2938

cat("unique participants:", length(unique(data_raw$participant_id)), "\n")

## unique participants: 739

cat("hybrid categories:", length(unique(data_raw$hybrid_category)), "\n")

## hybrid categories: 8

cat("is_adult breakdown in raw dataset:\n")

## is_adult breakdown in raw dataset:

print(table(data_raw$is_adult, useNA = "always"))

## 
## FALSE  TRUE  <NA> 
##  2723   215     0

# child analyses use ages 3-10; adults are handled separately
data <- data_raw %>%
  filter(is_adult == FALSE | is.na(is_adult)) %>%
  filter(age_numeric >= 3)

cat("after excluding adults and under-3s:", nrow(data), "drawings\n")

## after excluding adults and under-3s: 2513 drawings

cat("sessions:", length(unique(data$participant_id)), "\n")

## sessions: 613

cat("age range:", range(data$age_numeric), "\n")

## age range: 3 10

cat("mean age:", round(mean(data$age_numeric), 2),
    "SD:", round(sd(data$age_numeric), 2), "\n")

## mean age: 5.82 SD: 2.06

print(table(data$age_numeric))

## 
##   3   4   5   6   7   8   9  10 
## 338 451 415 484 321 166 135 203

# center age; coerce boolean columns
data$age_centered <- scale(data$age_numeric, center = TRUE, scale = FALSE)[, 1]

# restrict to drawings whose constituents both have a base-category centroid
data <- data %>%
  filter(concept1_available == TRUE & concept2_available == TRUE)

cat("after filtering for available centroids:", nrow(data), "drawings\n")

## after filtering for available centroids: 2191 drawings

cat("sessions:", length(unique(data$participant_id)), "\n")

## sessions: 600

# exact chance under a uniformly random permutation of 14 base categories
n_concepts        <- 14
individual_chance <- 5 / n_concepts                          # single constituent in top 5
both_chance       <- (5 * 4) / (n_concepts * (n_concepts - 1)) # both specific constituents in top 5

cat("single-constituent top-5 chance:", round(individual_chance, 3), "\n")

## single-constituent top-5 chance: 0.357

cat("joint top-5 chance:", round(both_chance, 3), "\n")

## joint top-5 chance: 0.11

data$individual_chance <- individual_chance
data$both_chance       <- both_chance

2 Descriptive statistics

overall <- data %>%
  summarise(
    n              = n(),
    prop_c1_top5   = mean(constituent1_in_top_5),
    prop_c2_top5   = mean(constituent2_in_top_5),
    prop_both_top5 = mean(both_constituents_in_top_5)
  )
print(overall)

##      n prop_c1_top5 prop_c2_top5 prop_both_top5
## 1 2191     0.587403    0.4997718      0.2409859

per_age <- data %>%
  group_by(age_numeric) %>%
  summarise(
    n              = n(),
    prop_c1_top5   = round(mean(constituent1_in_top_5), 3),
    prop_c2_top5   = round(mean(constituent2_in_top_5), 3),
    prop_both_top5 = round(mean(both_constituents_in_top_5), 3),
    .groups = 'drop'
  )
print(per_age)

## # A tibble: 8 × 5
##   age_numeric     n prop_c1_top5 prop_c2_top5 prop_both_top5
##         <dbl> <int>        <dbl>        <dbl>          <dbl>
## 1           3   292        0.462        0.497          0.216
## 2           4   397        0.501        0.516          0.207
## 3           5   360        0.614        0.464          0.233
## 4           6   417        0.638        0.484          0.249
## 5           7   279        0.663        0.48           0.254
## 6           8   147        0.673        0.463          0.218
## 7           9   120        0.642        0.508          0.283
## 8          10   179        0.587        0.631          0.324

per_category <- data %>%
  group_by(hybrid_category) %>%
  summarise(
    n              = n(),
    prop_c1_top5   = round(mean(constituent1_in_top_5), 3),
    prop_c2_top5   = round(mean(constituent2_in_top_5), 3),
    prop_both_top5 = round(mean(both_constituents_in_top_5), 3),
    .groups = 'drop'
  ) %>%
  arrange(desc(prop_both_top5))
print(per_category)

## # A tibble: 7 × 5
##   hybrid_category       n prop_c1_top5 prop_c2_top5 prop_both_top5
##   <chr>             <int>        <dbl>        <dbl>          <dbl>
## 1 a tiger frog        269        0.651        0.636          0.45 
## 2 a mushroom house    349        0.691        0.47           0.404
## 3 a bike bee          318        0.566        0.553          0.299
## 4 a sheep fish        303        0.759        0.465          0.254
## 5 a train cat         343        0.531        0.551          0.181
## 6 an elephant snail   296        0.476        0.483          0.095
## 7 a rabbit boat       313        0.441        0.355          0.013

3 Primary models

# constituent 1 above chance + age effect
model1 <- glmer(constituent1_in_top_5 ~ age_centered +
                  offset(qlogis(individual_chance)) +
                  (age_centered | hybrid_category) +
                  (1 | participant_id),
                data = data, family = binomial,
                control = glmerControl(optimizer = "bobyqa"))
summary(model1)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## constituent1_in_top_5 ~ age_centered + offset(qlogis(individual_chance)) +  
##     (age_centered | hybrid_category) + (1 | participant_id)
##    Data: data
## Control: glmerControl(optimizer = "bobyqa")
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##    2869.8    2904.0   -1428.9    2857.8      2185 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.1293 -0.9862  0.5860  0.8020  1.5150 
## 
## Random effects:
##  Groups          Name         Variance Std.Dev. Corr 
##  participant_id  (Intercept)  0.156013 0.39498       
##  hybrid_category (Intercept)  0.221825 0.47098       
##                  age_centered 0.001329 0.03645  -0.50
## Number of obs: 2191, groups:  participant_id, 600; hybrid_category, 7
## 
## Fixed effects:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.96628    0.18480   5.229 1.71e-07 ***
## age_centered  0.11150    0.02805   3.975 7.03e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## age_centerd -0.226

# constituent 2 above chance + age effect
model2 <- glmer(constituent2_in_top_5 ~ age_centered +
                  offset(qlogis(individual_chance)) +
                  (age_centered | hybrid_category) +
                  (1 | participant_id),
                data = data, family = binomial,
                control = glmerControl(optimizer = "bobyqa"))
summary(model2)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## constituent2_in_top_5 ~ age_centered + offset(qlogis(individual_chance)) +  
##     (age_centered | hybrid_category) + (1 | participant_id)
##    Data: data
## Control: glmerControl(optimizer = "bobyqa")
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##    3003.0    3037.2   -1495.5    2991.0      2185 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.6950 -0.9434 -0.6961  0.9933  1.4147 
## 
## Random effects:
##  Groups          Name         Variance Std.Dev. Corr
##  participant_id  (Intercept)  0.021861 0.14786      
##  hybrid_category (Intercept)  0.101631 0.31880      
##                  age_centered 0.005331 0.07302  0.83
## Number of obs: 2191, groups:  participant_id, 600; hybrid_category, 7
## 
## Fixed effects:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.59691    0.12838   4.649 3.33e-06 ***
## age_centered  0.04104    0.03508   1.170    0.242    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## age_centerd 0.616

# formal test of the constituent x age interaction
# two rows per drawing, one per constituent
data_long <- data %>%
  select(drawing_file, participant_id, age_centered, hybrid_category,
         individual_chance,
         constituent1_in_top_5, constituent2_in_top_5) %>%
  pivot_longer(
    cols      = c(constituent1_in_top_5, constituent2_in_top_5),
    names_to  = "constituent",
    values_to = "in_top_5"
  ) %>%
  mutate(constituent = factor(constituent,
                              levels = c("constituent1_in_top_5",
                                         "constituent2_in_top_5"),
                              labels = c("first", "second")))

combined_model <- glmer(in_top_5 ~ age_centered * constituent +
                          offset(qlogis(individual_chance)) +
                          (age_centered | hybrid_category) +
                          (1 | participant_id) +
                          (1 | drawing_file),
                        data = data_long, family = binomial,
                        control = glmerControl(optimizer = "bobyqa"))
summary(combined_model)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## in_top_5 ~ age_centered * constituent + offset(qlogis(individual_chance)) +  
##     (age_centered | hybrid_category) + (1 | participant_id) +  
##     (1 | drawing_file)
##    Data: data_long
## Control: glmerControl(optimizer = "bobyqa")
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##    5917.1    5974.6   -2949.5    5899.1      4373 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.0245 -1.0434  0.6745  0.8874  1.4044 
## 
## Random effects:
##  Groups          Name         Variance Std.Dev. Corr 
##  drawing_file    (Intercept)  0.000000 0.00000       
##  participant_id  (Intercept)  0.000000 0.00000       
##  hybrid_category (Intercept)  0.095888 0.30966       
##                  age_centered 0.002279 0.04774  -0.04
## Number of obs: 4382, groups:  
## drawing_file, 2191; participant_id, 600; hybrid_category, 7
## 
## Fixed effects:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                     0.95786    0.12517   7.652 1.97e-14 ***
## age_centered                    0.11051    0.02841   3.890   0.0001 ***
## constituentsecond              -0.36669    0.06189  -5.925 3.12e-09 ***
## age_centered:constituentsecond -0.07124    0.03030  -2.351   0.0187 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) ag_cnt cnsttn
## age_centerd -0.010              
## cnsttntscnd -0.253 -0.022       
## ag_cntrd:cn -0.010 -0.554  0.017
## optimizer (bobyqa) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')

# joint recognition: both constituents in top 5
both_in_top5 <- glmer(both_constituents_in_top_5 ~ age_centered +
                        offset(qlogis(both_chance)) +
                        (age_centered | hybrid_category) +
                        (1 | participant_id),
                      data = data, family = binomial,
                      control = glmerControl(optimizer = "bobyqa"))
summary(both_in_top5)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## both_constituents_in_top_5 ~ age_centered + offset(qlogis(both_chance)) +  
##     (age_centered | hybrid_category) + (1 | participant_id)
##    Data: data
## Control: glmerControl(optimizer = "bobyqa")
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##    2154.4    2188.5   -1071.2    2142.4      2185 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.1045 -0.6184 -0.3607 -0.0830 10.9464 
## 
## Random effects:
##  Groups          Name         Variance Std.Dev. Corr 
##  participant_id  (Intercept)  0.12531  0.3540        
##  hybrid_category (Intercept)  1.74568  1.3212        
##                  age_centered 0.01173  0.1083   -0.85
## Number of obs: 2191, groups:  participant_id, 600; hybrid_category, 7
## 
## Fixed effects:
##              Estimate Std. Error z value Pr(>|z|)  
## (Intercept)    0.5253     0.5074   1.035   0.3006  
## age_centered   0.1262     0.0534   2.363   0.0181 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## age_centerd -0.675

# analysis (c): per-pair BLUPs from the joint model
joint_blups <- ranef(both_in_top5)$hybrid_category %>%
  as_tibble(rownames = "hybrid_category") %>%
  rename(intercept_dev = `(Intercept)`,
         age_slope_dev = age_centered) %>%
  mutate(
    intercept_total = intercept_dev + fixef(both_in_top5)["(Intercept)"],
    age_slope_total = age_slope_dev + fixef(both_in_top5)["age_centered"]
  ) %>%
  arrange(desc(intercept_total))
print(joint_blups)

## # A tibble: 7 × 5
##   hybrid_category   intercept_dev age_slope_dev intercept_total age_slope_total
##   <chr>                     <dbl>         <dbl>           <dbl>           <dbl>
## 1 a tiger frog             1.34         -0.0606           1.87           0.0656
## 2 a mushroom house         1.17         -0.105            1.70           0.0208
## 3 a bike bee               0.704        -0.0744           1.23           0.0518
## 4 a sheep fish             0.496        -0.0822           1.02           0.0439
## 5 a train cat             -0.0133        0.0488           0.512          0.175 
## 6 an elephant snail       -0.787         0.0911          -0.262          0.217 
## 7 a rabbit boat           -2.59          0.159           -2.06           0.285

4 Robustness and exploratory analyses

# robustness: z-scored rank as continuous outcome for constituent 1
rank_model1 <- lmer(scale(concept1_rank) ~ age_centered +
                      (age_centered | hybrid_category) +
                      (1 | participant_id),
                    data = data)
summary(rank_model1)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: 
## scale(concept1_rank) ~ age_centered + (age_centered | hybrid_category) +  
##     (1 | participant_id)
##    Data: data
## 
## REML criterion at convergence: 6013.4
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.0454 -0.8004 -0.1998  0.8099  2.3462 
## 
## Random effects:
##  Groups          Name         Variance  Std.Dev. Corr 
##  participant_id  (Intercept)  0.0564222 0.23753       
##  hybrid_category (Intercept)  0.0844257 0.29056       
##                  age_centered 0.0006883 0.02624  -1.00
##  Residual                     0.8475031 0.92060       
## Number of obs: 2191, groups:  participant_id, 600; hybrid_category, 7
## 
## Fixed effects:
##              Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept)   0.01795    0.11213  6.11735   0.160 0.877976    
## age_centered -0.07136    0.01473 10.11918  -4.845 0.000654 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## age_centerd -0.658
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')

# robustness: z-scored rank as continuous outcome for constituent 2
rank_model2 <- lmer(scale(concept2_rank) ~ age_centered +
                      (age_centered | hybrid_category) +
                      (1 | participant_id),
                    data = data)
summary(rank_model2)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: 
## scale(concept2_rank) ~ age_centered + (age_centered | hybrid_category) +  
##     (1 | participant_id)
##    Data: data
## 
## REML criterion at convergence: 6077.3
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.86941 -0.91005  0.03783  0.81875  2.11901 
## 
## Random effects:
##  Groups          Name         Variance Std.Dev. Corr
##  participant_id  (Intercept)  0.009920 0.09960      
##  hybrid_category (Intercept)  0.083219 0.28848      
##                  age_centered 0.001599 0.03999  0.20
##  Residual                     0.911840 0.95490      
## Number of obs: 2191, groups:  participant_id, 600; hybrid_category, 7
## 
## Fixed effects:
##              Estimate Std. Error       df t value Pr(>|t|)
## (Intercept)  -0.01056    0.11104  6.01439  -0.095    0.927
## age_centered -0.02582    0.01822  6.33809  -1.417    0.204
## 
## Correlation of Fixed Effects:
##             (Intr)
## age_centerd 0.162

# analysis (b): continuous-rank robustness check for joint recognition
# max of the two z-scored ranks per drawing; lower = both constituents better recognized
data <- data %>%
  mutate(
    concept1_rank_z = as.numeric(scale(concept1_rank)),
    concept2_rank_z = as.numeric(scale(concept2_rank)),
    max_rank_z      = pmax(concept1_rank_z, concept2_rank_z)
  )

joint_rank_model <- lmer(max_rank_z ~ age_centered +
                           (age_centered | hybrid_category) +
                           (1 | participant_id),
                         data = data)
summary(joint_rank_model)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: max_rank_z ~ age_centered + (age_centered | hybrid_category) +  
##     (1 | participant_id)
##    Data: data
## 
## REML criterion at convergence: 4868.7
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.2513 -0.7342  0.0210  0.6588  2.7644 
## 
## Random effects:
##  Groups          Name         Variance  Std.Dev. Corr
##  participant_id  (Intercept)  0.0135836 0.11655      
##  hybrid_category (Intercept)  0.2096745 0.45790      
##                  age_centered 0.0007266 0.02696  0.18
##  Residual                     0.5157427 0.71815      
## Number of obs: 2191, groups:  participant_id, 600; hybrid_category, 7
## 
## Fixed effects:
##              Estimate Std. Error       df t value Pr(>|t|)   
## (Intercept)   0.68802    0.17384  6.01251   3.958  0.00744 **
## age_centered -0.05723    0.01290  6.46340  -4.437  0.00367 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## age_centerd 0.145

# exploratory: joint model restricted to ages 6 and up
data_6plus <- data %>%
  filter(age_numeric >= 6) %>%
  mutate(age_centered = scale(age_numeric, center = TRUE, scale = FALSE)[, 1])

cat("6+ sample:", nrow(data_6plus), "drawings,",
    length(unique(data_6plus$participant_id)), "sessions\n")

## 6+ sample: 1142 drawings, 288 sessions

both_6plus <- glmer(both_constituents_in_top_5 ~ age_centered +
                      offset(qlogis(both_chance)) +
                      (age_centered | hybrid_category) +
                      (1 | participant_id),
                    data = data_6plus, family = binomial,
                    control = glmerControl(optimizer = "bobyqa"))
summary(both_6plus)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## both_constituents_in_top_5 ~ age_centered + offset(qlogis(both_chance)) +  
##     (age_centered | hybrid_category) + (1 | participant_id)
##    Data: data_6plus
## Control: glmerControl(optimizer = "bobyqa")
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##    1203.4    1233.6    -595.7    1191.4      1136 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.0374 -0.6450 -0.4547  0.9769  7.5452 
## 
## Random effects:
##  Groups          Name         Variance Std.Dev. Corr 
##  participant_id  (Intercept)  0.06994  0.2645        
##  hybrid_category (Intercept)  1.60821  1.2682        
##                  age_centered 0.02092  0.1447   -1.00
## Number of obs: 1142, groups:  participant_id, 288; hybrid_category, 7
## 
## Fixed effects:
##              Estimate Std. Error z value Pr(>|z|)  
## (Intercept)    0.7157     0.4932   1.451   0.1467  
## age_centered   0.1535     0.0819   1.874   0.0609 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## age_centerd -0.702
## optimizer (bobyqa) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')

5 Adult benchmark (analysis d)

# analysis (d): adult benchmark on the 215 adult drawings (~46 sessions)
# same outcomes and offsets as children; random slope dropped because
# age does not vary meaningfully within adults

data_adult <- data_raw %>%
  filter(is_adult == TRUE,
         concept1_available == TRUE & concept2_available == TRUE)

cat("adult sample:", nrow(data_adult), "drawings,",
    length(unique(data_adult$participant_id)), "sessions\n")

## adult sample: 190 drawings, 46 sessions

# attach chance columns and per-pair / overall proportions
data_adult$individual_chance <- individual_chance
data_adult$both_chance        <- both_chance

adult_overall <- data_adult %>%
  summarise(
    n              = n(),
    prop_c1_top5   = mean(constituent1_in_top_5),
    prop_c2_top5   = mean(constituent2_in_top_5),
    prop_both_top5 = mean(both_constituents_in_top_5)
  )
print(adult_overall)

##     n prop_c1_top5 prop_c2_top5 prop_both_top5
## 1 190    0.5947368    0.5789474      0.3157895

adult_by_category <- data_adult %>%
  group_by(hybrid_category) %>%
  summarise(
    n              = n(),
    prop_c1_top5   = round(mean(constituent1_in_top_5), 3),
    prop_c2_top5   = round(mean(constituent2_in_top_5), 3),
    prop_both_top5 = round(mean(both_constituents_in_top_5), 3),
    .groups        = "drop"
  ) %>%
  arrange(desc(prop_both_top5))
print(adult_by_category)

## # A tibble: 7 × 5
##   hybrid_category       n prop_c1_top5 prop_c2_top5 prop_both_top5
##   <chr>             <int>        <dbl>        <dbl>          <dbl>
## 1 a tiger frog         25        0.72         0.68           0.52 
## 2 a bike bee           26        0.423        0.808          0.423
## 3 a sheep fish         25        0.8          0.56           0.36 
## 4 a train cat          26        0.654        0.654          0.346
## 5 a mushroom house     33        0.515        0.364          0.333
## 6 an elephant snail    27        0.37         0.741          0.185
## 7 a rabbit boat        28        0.714        0.321          0.071

# constituent 1
adult_model1 <- glmer(constituent1_in_top_5 ~ 1 +
                        offset(qlogis(individual_chance)) +
                        (1 | hybrid_category) +
                        (1 | participant_id),
                      data = data_adult, family = binomial,
                      control = glmerControl(optimizer = "bobyqa"))
summary(adult_model1)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: constituent1_in_top_5 ~ 1 + offset(qlogis(individual_chance)) +  
##     (1 | hybrid_category) + (1 | participant_id)
##    Data: data_adult
## Control: glmerControl(optimizer = "bobyqa")
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##     258.2     268.0    -126.1     252.2       187 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.6043 -0.9807  0.6233  0.7588  1.0944 
## 
## Random effects:
##  Groups          Name        Variance Std.Dev.
##  participant_id  (Intercept) 0.000    0.0000  
##  hybrid_category (Intercept) 0.263    0.5129  
## Number of obs: 190, groups:  participant_id, 46; hybrid_category, 7
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   1.0083     0.2477   4.071 4.68e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## optimizer (bobyqa) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')

# constituent 2
adult_model2 <- glmer(constituent2_in_top_5 ~ 1 +
                        offset(qlogis(individual_chance)) +
                        (1 | hybrid_category) +
                        (1 | participant_id),
                      data = data_adult, family = binomial,
                      control = glmerControl(optimizer = "bobyqa"))
summary(adult_model2)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: constituent2_in_top_5 ~ 1 + offset(qlogis(individual_chance)) +  
##     (1 | hybrid_category) + (1 | participant_id)
##    Data: data_adult
## Control: glmerControl(optimizer = "bobyqa")
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##     255.3     265.0    -124.6     249.3       187 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.6212 -0.8449  0.5801  0.7332  1.3006 
## 
## Random effects:
##  Groups          Name        Variance Std.Dev.
##  participant_id  (Intercept) 0.1014   0.3184  
##  hybrid_category (Intercept) 0.4265   0.6531  
## Number of obs: 190, groups:  participant_id, 46; hybrid_category, 7
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)   0.9795     0.2985   3.282  0.00103 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# joint
adult_both <- glmer(both_constituents_in_top_5 ~ 1 +
                      offset(qlogis(both_chance)) +
                      (1 | hybrid_category) +
                      (1 | participant_id),
                    data = data_adult, family = binomial,
                    control = glmerControl(optimizer = "bobyqa"))
summary(adult_both)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: both_constituents_in_top_5 ~ 1 + offset(qlogis(both_chance)) +  
##     (1 | hybrid_category) + (1 | participant_id)
##    Data: data_adult
## Control: glmerControl(optimizer = "bobyqa")
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##     239.0     248.8    -116.5     233.0       187 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -0.9202 -0.6968 -0.5391  1.1491  2.1728 
## 
## Random effects:
##  Groups          Name        Variance Std.Dev.
##  participant_id  (Intercept) 0.08406  0.2899  
##  hybrid_category (Intercept) 0.31628  0.5624  
## Number of obs: 190, groups:  participant_id, 46; hybrid_category, 7
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   1.2539     0.2815   4.455 8.39e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# constituent asymmetry in adults: long format with constituent factor
data_adult_long <- data_adult %>%
  select(drawing_file, participant_id, hybrid_category, individual_chance,
         constituent1_in_top_5, constituent2_in_top_5) %>%
  pivot_longer(
    cols      = c(constituent1_in_top_5, constituent2_in_top_5),
    names_to  = "constituent",
    values_to = "in_top_5"
  ) %>%
  mutate(constituent = factor(constituent,
                              levels = c("constituent1_in_top_5",
                                         "constituent2_in_top_5"),
                              labels = c("first", "second")))

adult_combined <- glmer(in_top_5 ~ constituent +
                          offset(qlogis(individual_chance)) +
                          (1 | hybrid_category) +
                          (1 | participant_id) +
                          (1 | drawing_file),
                        data = data_adult_long, family = binomial,
                        control = glmerControl(optimizer = "bobyqa"))
summary(adult_combined)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: in_top_5 ~ constituent + offset(qlogis(individual_chance)) +  
##     (1 | hybrid_category) + (1 | participant_id) + (1 | drawing_file)
##    Data: data_adult_long
## Control: glmerControl(optimizer = "bobyqa")
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##     523.4     543.1    -256.7     513.4       375 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.3529 -1.1400  0.7530  0.8257  0.9899 
## 
## Random effects:
##  Groups          Name        Variance  Std.Dev. 
##  drawing_file    (Intercept) 1.110e-14 1.054e-07
##  participant_id  (Intercept) 1.321e-14 1.149e-07
##  hybrid_category (Intercept) 6.536e-02 2.557e-01
## Number of obs: 380, groups:  
## drawing_file, 190; participant_id, 46; hybrid_category, 7
## 
## Fixed effects:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        0.99311    0.17835   5.568 2.57e-08 ***
## constituentsecond -0.06619    0.21006  -0.315    0.753    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## cnsttntscnd -0.593
## optimizer (bobyqa) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')

6 Figures

clean_category <- function(x) {
  x %>%
    str_remove("^an? ") %>%
    str_replace_all(" ", "-")
}

# per-pair, per-age proportions for both constituents
panel_a_data <- data %>%
  mutate(hybrid_category = clean_category(hybrid_category)) %>%
  select(participant_id, age_numeric, hybrid_category,
         constituent1_in_top_5, constituent2_in_top_5) %>%
  pivot_longer(
    cols      = c(constituent1_in_top_5, constituent2_in_top_5),
    names_to  = "constituent",
    values_to = "top_5"
  ) %>%
  mutate(constituent = factor(constituent,
                              levels = c("constituent1_in_top_5",
                                         "constituent2_in_top_5"),
                              labels = c("Constituent 1", "Constituent 2"))) %>%
  group_by(age_numeric, hybrid_category, constituent) %>%
  summarise(
    prop_top_5  = mean(top_5),
    se          = sd(top_5) / sqrt(n()),
    n_drawings  = n(),
    .groups     = "drop"
  )

fig_constituents <- ggplot(panel_a_data,
                           aes(x = age_numeric, y = prop_top_5, color = constituent)) +
  geom_hline(yintercept = individual_chance, linetype = "dashed", color = "grey50") +
  geom_point(aes(size = n_drawings), alpha = 0.8) +
  geom_linerange(aes(ymin = prop_top_5 - se, ymax = prop_top_5 + se), alpha = 0.3) +
  geom_smooth(method = "lm", aes(fill = constituent), alpha = 0.2, se = TRUE) +
  facet_wrap(~hybrid_category, nrow = 1) +
  scale_color_manual(values = c("Constituent 1" = "#E74C3C",
                                "Constituent 2" = "#3498DB")) +
  scale_fill_manual(values  = c("Constituent 1" = "#E74C3C",
                                "Constituent 2" = "#3498DB")) +
  scale_size_continuous(range = c(0.5, 2)) +
  ylim(0, 1) +
  labs(x = "Age (years)", y = "Proportion with constituent in top 5",
       color = "Constituent", fill = "Constituent",
       size = "Number of\ndrawings") +
  theme_few() +
  theme(legend.position = "right", aspect.ratio = 1,
        strip.text = element_text(size = 8),
        axis.title = element_text(size = 9),
        axis.text  = element_text(size = 9),
        legend.title = element_text(size = 8),
        legend.text  = element_text(size = 7))
fig_constituents

# ggsave("../writing/fig_constituent_age.svg", fig_constituents, width = 10, height = 4)

# per-category and overall age trajectories for joint recognition
category_data <- data %>%
  mutate(hybrid_category = clean_category(hybrid_category)) %>%
  group_by(age_numeric, hybrid_category) %>%
  summarise(prop_both_top_5 = mean(both_constituents_in_top_5), .groups = "drop")

collapsed_data <- data %>%
  group_by(age_numeric) %>%
  summarise(
    prop_both_top_5 = mean(both_constituents_in_top_5),
    se              = sd(both_constituents_in_top_5) / sqrt(n()),
    n_drawings      = n(),
    .groups         = "drop"
  )

x_limits <- range(collapsed_data$age_numeric)

fig_joint <- ggplot() +
  geom_hline(yintercept = both_chance, linetype = "dashed", color = "grey50") +
  geom_smooth(data = category_data,
              aes(x = age_numeric, y = prop_both_top_5,
                  color = hybrid_category, fill = hybrid_category),
              method = "lm", se = TRUE, alpha = 0.1, linewidth = 0.7) +
  geom_smooth(data = collapsed_data,
              aes(x = age_numeric, y = prop_both_top_5),
              method = "lm", color = "black", fill = "grey30",
              alpha = 0.3, linewidth = 1.2, se = TRUE) +
  geom_point(data = collapsed_data,
             aes(x = age_numeric, y = prop_both_top_5, size = n_drawings),
             alpha = 0.8) +
  scale_size_continuous(range = c(2, 5)) +
  scale_color_viridis_d(name = "Hybrid category") +
  scale_fill_viridis_d(name  = "Hybrid category") +
  scale_x_continuous(limits = x_limits, expand = c(0.01, 0.01)) +
  ylim(0, 1) +
  labs(x = "Age (years)", y = "Proportion with both constituents in top 5",
       size = "Number of\ndrawings") +
  theme_few() +
  theme(legend.position = "right",
        axis.title = element_text(size = 12),
        axis.text  = element_text(size = 11)) +
  guides(fill = guide_legend(), color = guide_legend())
fig_joint

# ggsave("../writing/fig_joint_age.svg", fig_joint, width = 7, height = 5)

fig_ranks <- data %>%
  mutate(hybrid_category_clean = clean_category(hybrid_category)) %>%
  ggplot(aes(x = concept1_rank, y = concept2_rank, color = hybrid_category_clean)) +
  geom_point(alpha = 0.6, size = 2) +
  geom_hline(yintercept = 5, color = "black", linewidth = 0.5) +
  geom_vline(xintercept = 5, color = "black", linewidth = 0.5) +
  facet_wrap(~hybrid_category_clean, nrow = 1) +
  coord_fixed(ratio = 1) +
  scale_color_viridis_d() +
  labs(x = "Constituent 1 classification rank",
       y = "Constituent 2 classification rank") +
  theme_few() +
  theme(legend.position = "none",
        strip.text = element_text(size = 8),
        axis.title = element_text(size = 9),
        axis.text  = element_text(size = 9))
fig_ranks

# ggsave("../writing/fig_rank_scatter.svg", fig_ranks, width = 10, height = 3)

Hybrid drawing classification rank models

Haoyu Du

2026-05-20