hybrid-draw-cds-abstract-analysis-1

1 classification models
2 against chance
3 effect size
4 dev effects - how to interpret for each constituent?

This analysis tests whether children’s hybrid drawings contain recognizable elements from both constituent concepts using CLIP embeddings and k-way classification. We examine whether hybrid drawings rank their constituent concepts above chance levels when compared to all base concepts in our stimulus set.

# load anonymized hybrid drawing analysis data
data <- read.csv(here::here('cds_k_way_classification_all_children_anonymized.csv'))

cat("total hybrid drawings:", nrow(data), "\n")

## total hybrid drawings: 1506

cat("unique participants:", length(unique(data$participant_id)), "\n")

## unique participants: 392

cat("hybrid categories:", length(unique(data$hybrid_category)), "\n")

## hybrid categories: 8

cat("age range:", range(data$age_numeric), "\n")

## age range: 2 10

data_original <- data  # save original if needed
data <- data %>%
  filter(age_numeric >= 3)
n_participants <- length(unique(data$participant_id))
age_mean <- round(mean(data$age_numeric), 2)
age_sd <- round(sd(data$age_numeric), 2)
age_min <- min(data$age_numeric)
age_max <- max(data$age_numeric)
n_drawings <- nrow(data)
cat("Sample descriptives after filtering:\n")

## Sample descriptives after filtering:

cat("Children (N =", n_participants, ", ages", age_min, "-", age_max, "years, M =", age_mean, ", SD =", age_sd, ")\n")

## Children (N = 337 , ages 3 - 10 years, M = 5.61 , SD = 2.09 )

cat("Total drawings:", n_drawings, "\n")

## Total drawings: 1367

cat("Drawings per age:\n")

## Drawings per age:

print(table(data$age_numeric))

## 
##   3   4   5   6   7   8   9  10 
## 218 310 191 252 146  73  71 106

# center age for interpretable intercept
data$age_centered <- scale(data$age_numeric, center = TRUE, scale = FALSE)[,1]

# analysis parameters
n_concepts <- 14  # dinosaur not included in base concepts, but truck can be used for comparison?
top1_chance <- 1/n_concepts
top5_chance <- 5/n_concepts

cat("number of base concepts:", n_concepts, "\n")

## number of base concepts: 14

cat("top-1 chance level:", round(top1_chance, 3), "\n")

## top-1 chance level: 0.071

cat("top-5 chance level:", round(top5_chance, 3), "\n")

## top-5 chance level: 0.357

boolean_cols <- c("constituent1_in_top_1", "constituent2_in_top_1", 
                  "constituent1_in_top_5", "constituent2_in_top_5",
                  "both_constituents_in_top_1", "both_constituents_in_top_5")
data[boolean_cols] <- lapply(data[boolean_cols], as.logical)

summary(data)

##  hybrid_category      concept1           concept2         participant_id    
##  Length:1367        Length:1367        Length:1367        Length:1367       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   age_numeric     centroid_source    concept1_available concept2_available
##  Min.   : 3.000   Length:1367        Length:1367        Length:1367       
##  1st Qu.: 4.000   Class :character   Class :character   Class :character  
##  Median : 5.000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 5.608                                                           
##  3rd Qu.: 7.000                                                           
##  Max.   :10.000                                                           
##                                                                           
##  concept1_similarity concept1_rank    constituent1_in_top_1
##  Min.   :0.7705      Min.   : 1.000   Mode :logical        
##  1st Qu.:0.9072      1st Qu.: 2.000   FALSE:936            
##  Median :0.9282      Median : 5.000   TRUE :255            
##  Mean   :0.9231      Mean   : 5.673   NA's :176            
##  3rd Qu.:0.9429      3rd Qu.: 9.000                        
##  Max.   :0.9771      Max.   :14.000                        
##  NA's   :176         NA's   :176                           
##  constituent1_in_top_5 concept2_similarity concept2_rank   
##  Mode :logical         Min.   :0.8008      Min.   : 1.000  
##  FALSE:536             1st Qu.:0.9014      1st Qu.: 2.000  
##  TRUE :655             Median :0.9243      Median : 5.000  
##  NA's :176             Mean   :0.9204      Mean   : 6.024  
##                        3rd Qu.:0.9434      3rd Qu.: 9.000  
##                        Max.   :0.9756      Max.   :14.000  
##                        NA's   :176         NA's   :176     
##  constituent2_in_top_1 constituent2_in_top_5 both_constituents_in_top_1
##  Mode :logical         Mode :logical         Mode :logical             
##  FALSE:973             FALSE:594             FALSE:1191                
##  TRUE :218             TRUE :597             NA's :176                 
##  NA's :176             NA's :176                                       
##                                                                        
##                                                                        
##                                                                        
##  both_constituents_in_top_5 drawing_file        age_centered    
##  Mode :logical              Length:1367        Min.   :-2.6079  
##  FALSE:923                  Class :character   1st Qu.:-1.6079  
##  TRUE :268                  Mode  :character   Median :-0.6079  
##  NA's :176                                     Mean   : 0.0000  
##                                                3rd Qu.: 1.3921  
##                                                Max.   : 4.3921  
##

age_descriptives <- data %>%
  group_by(age_numeric) %>%
  summarise(
    n_drawings = n(),
    n_both_top5 = sum(both_constituents_in_top_5, na.rm = TRUE),
    prop_both_top5 = round(mean(both_constituents_in_top_5, na.rm = TRUE), 3),
    .groups = 'drop'
  )
print(age_descriptives)

## # A tibble: 8 × 4
##   age_numeric n_drawings n_both_top5 prop_both_top5
##         <dbl>      <int>       <int>          <dbl>
## 1           3        218          33          0.176
## 2           4        310          52          0.188
## 3           5        191          27          0.167
## 4           6        252          55          0.255
## 5           7        146          32          0.252
## 6           8         73          15          0.227
## 7           9         71          17          0.27 
## 8          10        106          37          0.398

overall_percent <- round(mean(data$both_constituents_in_top_5, na.rm = TRUE) * 100, 1)
cat(overall_percent, "% of drawings had both constituents in top 5\n")

## 22.5 % of drawings had both constituents in top 5

data$concept1_available <- as.logical(data$concept1_available)
data$concept2_available <- as.logical(data$concept2_available)

cat("After conversion:\n")

## After conversion:

print(table(data$concept1_available, useNA = "always"))

## 
## FALSE  TRUE  <NA> 
##   176  1191     0

print(table(data$concept2_available, useNA = "always"))

## 
## FALSE  TRUE  <NA> 
##   176  1191     0

data_filtered <- data %>%
  filter(concept1_available == TRUE & concept2_available == TRUE)

cat("original data:", nrow(data), "drawings\n")

## original data: 1367 drawings

cat("after filtering for available centroids:", nrow(data_filtered), "drawings\n")

## after filtering for available centroids: 1191 drawings

cat("remaining concept1 values:\n")

## remaining concept1 values:

print(table(data_filtered$concept1))

## 
##      a bike  a mushroom    a rabbit     a sheep     a tiger     a train 
##         188         174         168         171         146         184 
## an elephant 
##         160

# use filtered data for all analyses
data <- data_filtered

# filter for rabbit-boat hybrids
rabbit_boat <- data %>% 
  filter(hybrid_category == "a rabbit boat")

# look at similarity rankings
rabbit_boat_similarities <- rabbit_boat %>%
  select(starts_with("concept"), contains("similarity"), contains("rank")) %>%
  arrange(concept1_rank)

# see what ranks highly that shouldn't
cat("Top non-constituent similarities for rabbit-boat:\n")

## Top non-constituent similarities for rabbit-boat:

print(summary(rabbit_boat_similarities))

##    concept1           concept2         concept1_available concept2_available
##  Length:168         Length:168         Mode:logical       Mode:logical      
##  Class :character   Class :character   TRUE:168           TRUE:168          
##  Mode  :character   Mode  :character                                        
##                                                                             
##                                                                             
##                                                                             
##  concept1_similarity concept1_rank    concept2_similarity concept2_rank  
##  Min.   :0.7705      Min.   : 1.000   Min.   :0.8164      Min.   : 1.00  
##  1st Qu.:0.8947      1st Qu.: 2.000   1st Qu.:0.8923      1st Qu.: 2.75  
##  Median :0.9268      Median : 9.000   Median :0.9138      Median :10.00  
##  Mean   :0.9167      Mean   : 7.673   Mean   :0.9126      Mean   : 8.19  
##  3rd Qu.:0.9434      3rd Qu.:13.000   3rd Qu.:0.9385      3rd Qu.:13.00  
##  Max.   :0.9673      Max.   :14.000   Max.   :0.9731      Max.   :14.00

Make some plots by age for each hybrid concepts

age_summary <- data %>%
  select(participant_id, age_numeric, hybrid_category, concept1, concept2, concept1_similarity, concept2_similarity) %>% 
  pivot_longer(cols=c('concept1_similarity','concept2_similarity'), names_to = 'concept_comparison', values_to='similarity') %>%
  group_by(age_numeric, hybrid_category, concept1, concept2, concept_comparison) %>%  # grouping by hybrid_category
  summarize(mean_sim = mean(similarity), sd_sim = sd(similarity), num_drawings = n())

Average similarity to each base concept across age

ggplot(age_summary, aes(x=age_numeric, y=mean_sim, col=concept_comparison)) +
  geom_point(alpha=.8, aes(size=num_drawings)) +
  geom_linerange(aes(ymax = (mean_sim + sd_sim), ymin = (mean_sim - sd_sim),col=concept_comparison), alpha=.2) +
  facet_wrap(~hybrid_category, nrow=2) +  # changed from ~concept1 to ~hybrid_category
  theme_few()

age_summary_top_5 <- data %>%
  select(participant_id, age_numeric, concept1, concept2, constituent1_in_top_5, constituent2_in_top_5) %>%
  mutate(hybrid_category = paste0(str_split_fixed(concept1,' ',2)[,2],'-',str_split_fixed(concept2,' ',2)[,2])) %>%
  pivot_longer(cols=c('constituent1_in_top_5','constituent2_in_top_5'), names_to = 'concept_comparison', values_to='top_5_accuracy') %>%
  group_by(age_numeric, hybrid_category, concept1, concept2, concept_comparison) %>%
  summarize(mean_acc_top_5 = mean(top_5_accuracy), se_acc_top_5 = sd(top_5_accuracy) / n(), num_drawings = n())

age_summary_both_top_5 <- data %>%
  select(participant_id, age_numeric, hybrid_category, concept1, concept2, both_constituents_in_top_5) %>%
  group_by(age_numeric, hybrid_category, concept1, concept2) %>%
  summarize(mean_acc_top_5 = mean(both_constituents_in_top_5), se_acc_top_5 = sd(both_constituents_in_top_5) / n(), num_drawings = n())

age_summary_top_1 <- data %>%
  select(participant_id, age_numeric, concept1, concept2, constituent1_in_top_1, constituent2_in_top_1) %>%
  mutate(hybrid_category = paste0(str_split_fixed(concept1,' ',2)[,2],'-',str_split_fixed(concept2,' ',2)[,2])) %>%
  pivot_longer(cols=c('constituent1_in_top_1','constituent2_in_top_1'), names_to = 'concept_comparison', values_to='top_1_accuracy') %>%
  group_by(age_numeric, hybrid_category, concept1, concept2, concept_comparison) %>%
  summarize(mean_acc_top_1 = mean(top_1_accuracy), se_acc_top_1 = sd(top_1_accuracy) / n(), num_drawings = n())

Average similarity (scaled) to each base concept across age

ggplot(age_summary_top_5, aes(x=age_numeric, y=mean_acc_top_5, col=concept_comparison)) +
  geom_point(alpha=.8, aes(size=num_drawings)) +
  geom_linerange(aes(ymax = (mean_acc_top_5 + se_acc_top_5), ymin = (mean_acc_top_5 - se_acc_top_5),col=concept_comparison), alpha=.2) +
  facet_wrap(~hybrid_category, nrow=2) +
  theme_few() +
  ylim(0,1) +
  ylab('Mean acc top 5') +
  xlab('Age in years') +
  geom_smooth(method='lm', aes(fill=concept_comparison), alpha=.2) +
  geom_hline(yintercept=top5_chance, linetype='dashed', color='grey')

Look at top 1 – funkier!

ggplot(age_summary_top_1, aes(x=age_numeric, y=mean_acc_top_1, col=concept_comparison)) +
  geom_point(alpha=.8, aes(size=num_drawings)) +
  geom_linerange(aes(ymax = (mean_acc_top_1 + se_acc_top_1), ymin = (mean_acc_top_1 - se_acc_top_1),col=concept_comparison), alpha=.2) +
  facet_wrap(~hybrid_category, nrow=2) +
  theme_few() +
  ylim(0,1) +
  ylab('Mean acc top 1') +
  xlab('Age in years')

ggplot(age_summary_top_5, aes(x=age_numeric, y=mean_acc_top_5, col=hybrid_category)) +
  geom_point(alpha=.8, aes(size=num_drawings)) +
  geom_linerange(aes(ymax = (mean_acc_top_5 + se_acc_top_5), ymin = (mean_acc_top_5 - se_acc_top_5),col=hybrid_category), alpha=.2) +
  facet_wrap(~concept_comparison, nrow=1) +
  theme_few() +
  ylim(0,1) +
  ylab('Mean acc top 5') +
  xlab('Age in years') +
  geom_smooth(method='lm', aes(fill=hybrid_category), alpha=.2) +
  geom_hline(yintercept=top5_chance, linetype='dashed', color='grey')

ggplot(age_summary_both_top_5, aes(x=age_numeric, y=mean_acc_top_5, col=hybrid_category)) +
  geom_point(alpha=.8, aes(size=num_drawings)) +
  geom_linerange(aes(ymax = (mean_acc_top_5 + se_acc_top_5), ymin = (mean_acc_top_5 - se_acc_top_5),col=hybrid_category), alpha=.2) +
  facet_wrap(~hybrid_category, nrow=2) +
  theme_few() +
  ylim(0,1) +
  ylab('Mean acc both concepts in top 5') +
  xlab('Age in years') +
  geom_smooth(method='lm', aes(fill=hybrid_category), alpha=.2)

1 classification models

We use general linear mixed-effects models to test whether each constituent ranks in the top-5 above chance levels, controlling for age and accounting for clustering by participant and hybrid category.

#add chance level
individual_chance = 5/14  # top-5 chance level
data$individual_chance <- individual_chance
# fit logistic mixed-effects models
model1 <- glmer(constituent1_in_top_5 ~ age_centered + 
                  offset(qlogis(individual_chance))+
                  (age_centered | hybrid_category) + 
                  (1 | participant_id), 
                  data = data, family = binomial,
                  control = glmerControl(optimizer = "bobyqa"))

summary(model1)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## constituent1_in_top_5 ~ age_centered + offset(qlogis(individual_chance)) +  
##     (age_centered | hybrid_category) + (1 | participant_id)
##    Data: data
## Control: glmerControl(optimizer = "bobyqa")
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##    1577.7    1608.2    -782.8    1565.7      1185 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.5381 -0.9174  0.5529  0.8343  1.5906 
## 
## Random effects:
##  Groups          Name         Variance Std.Dev. Corr
##  participant_id  (Intercept)  0.208168 0.45625      
##  hybrid_category (Intercept)  0.179086 0.42319      
##                  age_centered 0.003473 0.05894  0.05
## Number of obs: 1191, groups:  participant_id, 332; hybrid_category, 7
## 
## Fixed effects:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.79690    0.17432   4.572 4.84e-06 ***
## age_centered  0.19690    0.04097   4.806 1.54e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## age_centerd 0.049

model2 <- glmer(constituent2_in_top_5 ~ age_centered + 
                  offset(qlogis(individual_chance))+
                  (age_centered | hybrid_category) + 
                  (1 | participant_id), 
                  data = data, family = binomial,
                  control = glmerControl(optimizer = "bobyqa"))
summary(model2)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## constituent2_in_top_5 ~ age_centered + offset(qlogis(individual_chance)) +  
##     (age_centered | hybrid_category) + (1 | participant_id)
##    Data: data
## Control: glmerControl(optimizer = "bobyqa")
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##    1633.8    1664.3    -810.9    1621.8      1185 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.6502 -0.9563  0.6060  0.9773  1.3626 
## 
## Random effects:
##  Groups          Name         Variance Std.Dev. Corr
##  participant_id  (Intercept)  0.000000 0.00000      
##  hybrid_category (Intercept)  0.116545 0.34139      
##                  age_centered 0.009012 0.09493  0.36
## Number of obs: 1191, groups:  participant_id, 332; hybrid_category, 7
## 
## Fixed effects:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.59669    0.14204   4.201 2.66e-05 ***
## age_centered  0.06252    0.04598   1.359    0.174    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## age_centerd 0.262 
## optimizer (bobyqa) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')

cat("Model 1 convergence:", model1@optinfo$conv$opt, "\n")

## Model 1 convergence: 0

cat("Model 2 convergence:", model2@optinfo$conv$opt, "\n")

## Model 2 convergence: 0

both_chance <- (5/n_concepts)^2
data$both_chance <- both_chance
both_in_top5 <- glmer(both_constituents_in_top_5 ~ 
                      offset(qlogis(both_chance)) +  # This adjusts intercept to chance level
                      age_centered + 
                      (age_centered | hybrid_category) + 
                      (1 | participant_id), 
                      data = data, family = binomial,
                      control = glmerControl(optimizer = "bobyqa"))

summary(both_in_top5)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## both_constituents_in_top_5 ~ offset(qlogis(both_chance)) + age_centered +  
##     (age_centered | hybrid_category) + (1 | participant_id)
##    Data: data
## Control: glmerControl(optimizer = "bobyqa")
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##    1125.5    1156.0    -556.7    1113.5      1185 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.0434 -0.6258 -0.3237 -0.0386  5.3843 
## 
## Random effects:
##  Groups          Name         Variance  Std.Dev.  Corr 
##  participant_id  (Intercept)  1.927e-10 1.388e-05      
##  hybrid_category (Intercept)  2.834e+00 1.683e+00      
##                  age_centered 4.233e-02 2.057e-01 -0.97
## Number of obs: 1191, groups:  participant_id, 332; hybrid_category, 7
## 
## Fixed effects:
##              Estimate Std. Error z value Pr(>|z|)   
## (Intercept)   0.07838    0.66266   0.118  0.90585   
## age_centered  0.25397    0.09598   2.646  0.00814 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## age_centerd -0.864
## optimizer (bobyqa) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')

data_6plus <- data %>%
  filter(age_numeric >= 6)

cat("6 and up sample:\n")

## 6 and up sample:

cat("N children:", length(unique(data_6plus$participant_id)), "\n")

## N children: 141

cat("N drawings:", nrow(data_6plus), "\n")

## N drawings: 565

cat("Age range:", range(data_6plus$age_numeric), "\n")

## Age range: 6 10

cat("Mean age:", round(mean(data_6plus$age_numeric), 2), "\n")

## Mean age: 7.45

age_desc_6plus <- data_6plus %>%
  group_by(age_numeric) %>%
  summarise(
    n_drawings = n(),
    prop_both_top5 = round(mean(both_constituents_in_top_5, na.rm = TRUE), 3),
    .groups = 'drop'
  )
print(age_desc_6plus)

## # A tibble: 5 × 3
##   age_numeric n_drawings prop_both_top5
##         <dbl>      <int>          <dbl>
## 1           6        216          0.255
## 2           7        127          0.252
## 3           8         66          0.227
## 4           9         63          0.27 
## 5          10         93          0.398

overall_6plus <- round(mean(data_6plus$both_constituents_in_top_5, na.rm = TRUE), 3)
cat("Overall proportion for 6 and up:", overall_6plus, "\n")

## Overall proportion for 6 and up: 0.276

# center age for 6+ sample
data_6plus$age_centered <- scale(data_6plus$age_numeric, center = TRUE, scale = FALSE)[,1]

# both constituents model with offset - ages 6+
both_6plus_offset <- glmer(both_constituents_in_top_5 ~ 
                          offset(qlogis(both_chance)) +  
                          age_centered + 
                          (age_centered | hybrid_category) + 
                          (1 | participant_id), 
                          data = data_6plus, family = binomial,
                          control = glmerControl(optimizer = "bobyqa"))

summary(both_6plus_offset)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## both_constituents_in_top_5 ~ offset(qlogis(both_chance)) + age_centered +  
##     (age_centered | hybrid_category) + (1 | participant_id)
##    Data: data_6plus
## Control: glmerControl(optimizer = "bobyqa")
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##     619.8     645.8    -303.9     607.8       559 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.0464 -0.7357 -0.4252  1.0678  3.2988 
## 
## Random effects:
##  Groups          Name         Variance Std.Dev. Corr 
##  participant_id  (Intercept)  0.00000  0.0000        
##  hybrid_category (Intercept)  1.48125  1.2171        
##                  age_centered 0.02875  0.1696   -0.91
## Number of obs: 565, groups:  participant_id, 141; hybrid_category, 7
## 
## Fixed effects:
##              Estimate Std. Error z value Pr(>|z|)  
## (Intercept)    0.6322     0.4848   1.304   0.1923  
## age_centered   0.2120     0.1057   2.006   0.0449 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## age_centerd -0.624
## optimizer (bobyqa) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')

intercept_6plus <- fixef(both_6plus_offset)["(Intercept)"]
intercept_se_6plus <- sqrt(diag(vcov(both_6plus_offset)))["(Intercept)"]
age_effect_6plus <- fixef(both_6plus_offset)["age_centered"]
cat("Intercept (above-chance performance):", round(intercept_6plus, 3), "\n")

## Intercept (above-chance performance): 0.632

cat("Age effect:", round(age_effect_6plus, 3), "\n")

## Age effect: 0.212

Lmers on normalized rank values give same results, as per graphs, for individual concepts

rank_model2 <- lmer(scale(concept2_rank) ~ age_centered + 
                  (age_centered | hybrid_category) + 
                  (1 | participant_id), 
                  data = data)

summary(rank_model2)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: 
## scale(concept2_rank) ~ age_centered + (age_centered | hybrid_category) +  
##     (1 | participant_id)
##    Data: data
## 
## REML criterion at convergence: 3288.1
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.97058 -0.88732  0.03511  0.83748  2.11372 
## 
## Random effects:
##  Groups          Name         Variance  Std.Dev.  Corr 
##  participant_id  (Intercept)  3.499e-09 5.915e-05      
##  hybrid_category (Intercept)  9.935e-02 3.152e-01      
##                  age_centered 2.495e-03 4.995e-02 -0.40
##  Residual                     8.997e-01 9.485e-01      
## Number of obs: 1191, groups:  participant_id, 332; hybrid_category, 7
## 
## Fixed effects:
##               Estimate Std. Error        df t value Pr(>|t|)
## (Intercept)  -0.002877   0.122283  5.990453  -0.024    0.982
## age_centered -0.041864   0.023021  6.061584  -1.819    0.118
## 
## Correlation of Fixed Effects:
##             (Intr)
## age_centerd -0.322
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')

rank_model1 <- lmer(scale(concept1_rank) ~ age_centered + 
                  (age_centered | hybrid_category) + 
                  (1 | participant_id), 
                  data = data)

summary(rank_model1)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: 
## scale(concept1_rank) ~ age_centered + (age_centered | hybrid_category) +  
##     (1 | participant_id)
##    Data: data
## 
## REML criterion at convergence: 3251.5
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.0568 -0.8047 -0.1803  0.8326  2.2571 
## 
## Random effects:
##  Groups          Name         Variance Std.Dev. Corr 
##  participant_id  (Intercept)  0.070772 0.26603       
##  hybrid_category (Intercept)  0.080576 0.28386       
##                  age_centered 0.001007 0.03173  -0.81
##  Residual                     0.815123 0.90284       
## Number of obs: 1191, groups:  participant_id, 332; hybrid_category, 7
## 
## Fixed effects:
##              Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept)   0.02319    0.11171  6.26019   0.208 0.842124    
## age_centered -0.09939    0.01907  8.23816  -5.211 0.000739 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## age_centerd -0.487

m1_coef <- summary(model1)$coefficients["age_centered", ]
m2_coef <- summary(model2)$coefficients["age_centered", ]

category_stats <- data %>%
  group_by(hybrid_category) %>%
  summarise(
    n = n(),
    prop_both_top5 = mean(both_constituents_in_top_5),
    prop_c1_top5 = mean(constituent1_in_top_5),
    prop_c2_top5 = mean(constituent2_in_top_5),
    .groups = 'drop'
  )

(Constituent 1: B = 0.2, SE = 0.04, Z = 4.81, p = 1.5e-06; Constituent 2: B = 0.06, SE = 0.05, Z = 1.36, p = 0.17)

(e.g., tiger-frog: 42% of drawings had both constituents in top 5) while others showed trade-offs between constituents (e.g., rabbit-boat: 42% for rabbit, 35% for boat, but only 1% jointly)

library(ggthemes)

clean_category <- function(x) {
  x %>%
    str_remove("^a ") %>%
    str_replace(" ", "-")
}

panel_a_data <- data %>%
  select(participant_id, age_numeric, hybrid_category, 
         constituent1_in_top_5, constituent2_in_top_5) %>%
  mutate(hybrid_category = clean_category(hybrid_category)) %>%
  pivot_longer(cols = c(constituent1_in_top_5, constituent2_in_top_5),
               names_to = "constituent",
               values_to = "in_top_5") %>%
  mutate(constituent = recode(constituent,
                              "constituent1_in_top_5" = "Constituent 1",
                              "constituent2_in_top_5" = "Constituent 2")) %>%
  group_by(age_numeric, hybrid_category, constituent) %>%
  summarise(
    prop_top_5 = mean(in_top_5),
    se = sd(in_top_5) / sqrt(n()),
    n_drawings = n(),
    .groups = "drop"
  )

ggplot(panel_a_data, 
       aes(x = age_numeric, y = prop_top_5, color = constituent)) +
  geom_hline(yintercept = 5/14, linetype = "dashed", color = "grey50") +
  geom_point(aes(size = n_drawings), alpha = 0.8) +
  geom_linerange(aes(ymin = prop_top_5 - se, ymax = prop_top_5 + se), alpha = 0.3) +
  geom_smooth(method = "lm", aes(fill = constituent), alpha = 0.2, se = TRUE) +
  facet_wrap(~hybrid_category, nrow = 2) +
  scale_color_manual(values = c("Constituent 1" = "#E74C3C", "Constituent 2" = "#3498DB")) +
  scale_fill_manual(values = c("Constituent 1" = "#E74C3C", "Constituent 2" = "#3498DB")) +
  scale_size_continuous(range = c(1, 4)) +
  ylim(0, 1) +
  labs(
    x = "Age (years)",
    y = "Proportion with Constituent in Top 5",
    color = "Constituent",
    fill = "Constituent",
    size = "Number of\nDrawings"
  ) +
  theme_few() +
  theme(legend.position = "right")

panel_b_collapsed <- data %>%
  group_by(age_numeric) %>%
  summarise(
    prop_both_top_5 = mean(both_constituents_in_top_5),
    se = sd(both_constituents_in_top_5) / sqrt(n()),
    n_drawings = n(),
    .groups = "drop"
  )

ggplot(panel_b_collapsed,
       aes(x = age_numeric, y = prop_both_top_5)) +
  geom_hline(yintercept = (5/14)^2, linetype = "dashed", color = "grey50") +
  geom_point(aes(size = n_drawings), alpha = 0.8) +
  geom_linerange(aes(ymin = prop_both_top_5 - se, ymax = prop_both_top_5 + se), alpha = 0.3) +
  geom_smooth(method = "lm", color = "#2C3E50", fill = "#2C3E50", alpha = 0.2, se = TRUE) +
  scale_size_continuous(range = c(2, 5)) +
  ylim(0, 1) +
  labs(
    x = "Age (years)",
    y = "Proportion with Both Constituents in Top 5",
    size = "Number of\nDrawings"
  ) +
  theme_few() +
  theme(legend.position = "right")

(Bria not touching below here)

2 against chance

chance_logit_top5 <- qlogis(top5_chance)

# model params
intercept1 <- fixef(model1)["(Intercept)"]
intercept1_se <- sqrt(diag(vcov(model1)))["(Intercept)"]
intercept2 <- fixef(model2)["(Intercept)"]
intercept2_se <- sqrt(diag(vcov(model2)))["(Intercept)"]

# test stats
t_stat1 <- (intercept1 - chance_logit_top5) / intercept1_se
t_stat2 <- (intercept2 - chance_logit_top5) / intercept2_se
p_value1 <- 2 * (1 - pnorm(abs(t_stat1)))
p_value2 <- 2 * (1 - pnorm(abs(t_stat2)))

# bonferroni correction for multiple testing
alpha_corrected <- 0.05 / 2

# results summary
statistical_results <- data.frame(
  Constituent = c("Concept 1", "Concept 2"),
  Intercept = c(intercept1, intercept2),
  SE = c(intercept1_se, intercept2_se),
  t_statistic = c(t_stat1, t_stat2),
  p_value = c(p_value1, p_value2),
  Significant = c(p_value1 < alpha_corrected, p_value2 < alpha_corrected)
)

print(statistical_results, digits = 4)

##   Constituent Intercept     SE t_statistic   p_value Significant
## 1   Concept 1    0.7969 0.1743       7.943 1.998e-15        TRUE
## 2   Concept 2    0.5967 0.1420       8.339 0.000e+00        TRUE

3 effect size

prob1_predicted <- plogis(intercept1)
prob2_predicted <- plogis(intercept2)

effect_sizes <- data.frame(
  Constituent = c("Concept 1", "Concept 2"),
  Predicted_Probability = c(prob1_predicted, prob2_predicted),
  Chance_Probability = c(top5_chance, top5_chance),
  Difference = c(prob1_predicted - top5_chance, prob2_predicted - top5_chance),
  Ratio = c(prob1_predicted / top5_chance, prob2_predicted / top5_chance)
)

print(effect_sizes)

##   Constituent Predicted_Probability Chance_Probability Difference    Ratio
## 1   Concept 1             0.6893116          0.3571429  0.3321687 1.930072
## 2   Concept 2             0.6448996          0.3571429  0.2877567 1.805719

summary(model1)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## constituent1_in_top_5 ~ age_centered + offset(qlogis(individual_chance)) +  
##     (age_centered | hybrid_category) + (1 | participant_id)
##    Data: data
## Control: glmerControl(optimizer = "bobyqa")
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##    1577.7    1608.2    -782.8    1565.7      1185 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.5381 -0.9174  0.5529  0.8343  1.5906 
## 
## Random effects:
##  Groups          Name         Variance Std.Dev. Corr
##  participant_id  (Intercept)  0.208168 0.45625      
##  hybrid_category (Intercept)  0.179086 0.42319      
##                  age_centered 0.003473 0.05894  0.05
## Number of obs: 1191, groups:  participant_id, 332; hybrid_category, 7
## 
## Fixed effects:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.79690    0.17432   4.572 4.84e-06 ***
## age_centered  0.19690    0.04097   4.806 1.54e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## age_centerd 0.049

summary(model2)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## constituent2_in_top_5 ~ age_centered + offset(qlogis(individual_chance)) +  
##     (age_centered | hybrid_category) + (1 | participant_id)
##    Data: data
## Control: glmerControl(optimizer = "bobyqa")
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##    1633.8    1664.3    -810.9    1621.8      1185 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.6502 -0.9563  0.6060  0.9773  1.3626 
## 
## Random effects:
##  Groups          Name         Variance Std.Dev. Corr
##  participant_id  (Intercept)  0.000000 0.00000      
##  hybrid_category (Intercept)  0.116545 0.34139      
##                  age_centered 0.009012 0.09493  0.36
## Number of obs: 1191, groups:  participant_id, 332; hybrid_category, 7
## 
## Fixed effects:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.59669    0.14204   4.201 2.66e-05 ***
## age_centered  0.06252    0.04598   1.359    0.174    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## age_centerd 0.262 
## optimizer (bobyqa) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')

model2_simplified <- glmer(constituent2_in_top_5 ~ age_centered + 
                          (age_centered | hybrid_category), 
                          data = data, family = binomial,
                          control = glmerControl(optimizer = "bobyqa"))

intercept2_alt <- fixef(model2_simplified)["(Intercept)"]
intercept2_alt_se <- sqrt(diag(vcov(model2_simplified)))["(Intercept)"]
t_stat2_alt <- (intercept2_alt - chance_logit_top5) / intercept2_alt_se
p_value2_alt <- 2 * (1 - pnorm(abs(t_stat2_alt)))

cat("original model2 results:\n")

## original model2 results:

cat("Intercept:", intercept2, "SE:", intercept2_se, "p-value:", p_value2, "\n\n")

## Intercept: 0.596694 SE: 0.1420408 p-value: 0

cat("simplified model results:\n")

## simplified model results:

cat("Intercept:", intercept2_alt, "SE:", intercept2_alt_se, "p-value:", p_value2_alt, "\n")

## Intercept: 0.008906734 SE: 0.1420405 p-value: 2.658948e-05

summary(model2_simplified)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## constituent2_in_top_5 ~ age_centered + (age_centered | hybrid_category)
##    Data: data
## Control: glmerControl(optimizer = "bobyqa")
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##    1631.8    1657.3    -810.9    1621.8      1186 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.6502 -0.9563  0.6060  0.9773  1.3626 
## 
## Random effects:
##  Groups          Name         Variance Std.Dev. Corr
##  hybrid_category (Intercept)  0.116544 0.34139      
##                  age_centered 0.009012 0.09493  0.36
## Number of obs: 1191, groups:  hybrid_category, 7
## 
## Fixed effects:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept)  0.008907   0.142041   0.063    0.950
## age_centered 0.062515   0.045984   1.359    0.174
## 
## Correlation of Fixed Effects:
##             (Intr)
## age_centerd 0.262

participant variance was zero in model 2 after accounting for category and age effects?

4 dev effects - how to interpret for each constituent?

age_effect1 <- fixef(model1)["age_centered"] 
age_effect2 <- fixef(model2)["age_centered"]

cat("age effects (per year):\n")

## age effects (per year):

cat("Constituent 1:", round(age_effect1, 3), "\n")

## Constituent 1: 0.197

cat("Constituent 2:", round(age_effect2, 3), "\n")

## Constituent 2: 0.063

hybrid-draw-cds-abstract-analysis-1

Haoyu Du

2026-01-03

1 classification models

2 against chance

3 effect size

4 dev effects - how to interpret for each constituent?