hybrid-draw-classification-rank-models

1 classification models

This analysis tests whether children’s hybrid drawings contain recognizable elements from both constituent concepts using CLIP embeddings and k-way classification. We examine whether hybrid drawings rank their constituent concepts above chance levels when compared to all base concepts in our stimulus set.

# load anonymized hybrid drawing analysis data
data <- read.csv('../data_intermediates/analysis/prolific_k_way_classification_all_children_anonymized.csv')

cat("total hybrid drawings:", nrow(data), "\n")

## total hybrid drawings: 2938

cat("unique participants:", length(unique(data$participant_id)), "\n")

## unique participants: 739

cat("hybrid categories:", length(unique(data$hybrid_category)), "\n")

## hybrid categories: 8

cat("age range:", range(data$age_numeric, na.rm = TRUE), "\n")

## age range: 2 10

data_original <- data  # save original if needed
data <- data %>%
  filter(age_numeric >= 3)
n_participants <- length(unique(data$participant_id))
age_mean <- round(mean(data$age_numeric), 2)
age_sd <- round(sd(data$age_numeric), 2)
age_min <- min(data$age_numeric)
age_max <- max(data$age_numeric)
n_drawings <- nrow(data)
cat("Sample descriptives after filtering:\n")

## Sample descriptives after filtering:

cat("Children (N =", n_participants, ", ages", age_min, "-", age_max, "years, M =", age_mean, ", SD =", age_sd, ")\n")

## Children (N = 613 , ages 3 - 10 years, M = 5.82 , SD = 2.06 )

cat("Total drawings:", n_drawings, "\n")

## Total drawings: 2513

cat("Drawings per age:\n")

## Drawings per age:

print(table(data$age_numeric))

## 
##   3   4   5   6   7   8   9  10 
## 338 451 415 484 321 166 135 203

# center age for interpretable intercept
data$age_centered <- scale(data$age_numeric, center = TRUE, scale = FALSE)[,1]

# analysis parameters
n_concepts <- 14  # dinosaur truck not included in base concepts
top1_chance <- 1/n_concepts
top5_chance <- 5/n_concepts

cat("number of base concepts:", n_concepts, "\n")

## number of base concepts: 14

cat("top-1 chance level:", round(top1_chance, 3), "\n")

## top-1 chance level: 0.071

cat("top-5 chance level:", round(top5_chance, 3), "\n")

## top-5 chance level: 0.357

boolean_cols <- c("constituent1_in_top_1", "constituent2_in_top_1", 
                  "constituent1_in_top_5", "constituent2_in_top_5",
                  "both_constituents_in_top_1", "both_constituents_in_top_5")
data[boolean_cols] <- lapply(data[boolean_cols], as.logical)

summary(data)

##  hybrid_category      concept1           concept2         participant_id    
##  Length:2513        Length:2513        Length:2513        Length:2513       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##      age             age_numeric       is_adult         centroid_source   
##  Length:2513        Min.   : 3.000   Length:2513        Length:2513       
##  Class :character   1st Qu.: 4.000   Class :character   Class :character  
##  Mode  :character   Median : 6.000   Mode  :character   Mode  :character  
##                     Mean   : 5.817                                        
##                     3rd Qu.: 7.000                                        
##                     Max.   :10.000                                        
##                                                                           
##  concept1_available concept2_available concept1_similarity concept1_rank   
##  Length:2513        Length:2513        Min.   :0.7705      Min.   : 1.000  
##  Class :character   Class :character   1st Qu.:0.9080      1st Qu.: 2.000  
##  Mode  :character   Mode  :character   Median :0.9282      Median : 4.000  
##                                        Mean   :0.9234      Mean   : 5.328  
##                                        3rd Qu.:0.9429      3rd Qu.: 8.000  
##                                        Max.   :0.9770      Max.   :14.000  
##                                        NA's   :322         NA's   :322     
##  constituent1_in_top_1 constituent1_in_top_5 concept2_similarity
##  Mode :logical         Mode :logical         Min.   :0.7803     
##  FALSE:1665            FALSE:904             1st Qu.:0.9009     
##  TRUE :526             TRUE :1287            Median :0.9233     
##  NA's :322             NA's :322             Mean   :0.9192     
##                                              3rd Qu.:0.9419     
##                                              Max.   :0.9790     
##                                              NA's   :322        
##  concept2_rank    constituent2_in_top_1 constituent2_in_top_5
##  Min.   : 1.000   Mode :logical         Mode :logical        
##  1st Qu.: 2.000   FALSE:1765            FALSE:1096           
##  Median : 6.000   TRUE :426             TRUE :1095           
##  Mean   : 5.979   NA's :322             NA's :322            
##  3rd Qu.: 9.000                                              
##  Max.   :14.000                                              
##  NA's   :322                                                 
##  both_constituents_in_top_1 both_constituents_in_top_5 drawing_file      
##  Mode :logical              Mode :logical              Length:2513       
##  FALSE:2191                 FALSE:1663                 Class :character  
##  NA's :322                  TRUE :528                  Mode  :character  
##                             NA's :322                                    
##                                                                          
##                                                                          
##                                                                          
##   age_centered    
##  Min.   :-2.8166  
##  1st Qu.:-1.8166  
##  Median : 0.1834  
##  Mean   : 0.0000  
##  3rd Qu.: 1.1834  
##  Max.   : 4.1834  
##

age_descriptives <- data %>%
  group_by(age_numeric) %>%
  summarise(
    n_drawings = n(),
    n_both_top5 = sum(both_constituents_in_top_5, na.rm = TRUE),
    prop_both_top5 = round(mean(both_constituents_in_top_5, na.rm = TRUE), 3),
    .groups = 'drop'
  )
print(age_descriptives)

## # A tibble: 8 × 4
##   age_numeric n_drawings n_both_top5 prop_both_top5
##         <dbl>      <int>       <int>          <dbl>
## 1           3        338          63          0.216
## 2           4        451          82          0.207
## 3           5        415          84          0.233
## 4           6        484         104          0.249
## 5           7        321          71          0.254
## 6           8        166          32          0.218
## 7           9        135          34          0.283
## 8          10        203          58          0.324

overall_percent <- round(mean(data$both_constituents_in_top_5, na.rm = TRUE) * 100, 1)
cat(overall_percent, "% of drawings had both constituents in top 5\n")

## 24.1 % of drawings had both constituents in top 5

data$concept1_available <- as.logical(data$concept1_available)
data$concept2_available <- as.logical(data$concept2_available)

cat("After conversion:\n")

## After conversion:

print(table(data$concept1_available, useNA = "always"))

## 
## FALSE  TRUE  <NA> 
##   322  2191     0

print(table(data$concept2_available, useNA = "always"))

## 
## FALSE  TRUE  <NA> 
##   322  2191     0

data_filtered <- data %>%
  filter(concept1_available == TRUE & concept2_available == TRUE)

cat("original data:", nrow(data), "drawings\n")

## original data: 2513 drawings

cat("after filtering for available centroids:", nrow(data_filtered), "drawings\n")

## after filtering for available centroids: 2191 drawings

cat("remaining concept1 values:\n")

## remaining concept1 values:

print(table(data_filtered$concept1))

## 
##      a bike  a mushroom    a rabbit     a sheep     a tiger     a train 
##         318         349         313         303         269         343 
## an elephant 
##         296

# use filtered data for all analyses
data <- data_filtered

0.1 age plot for each hybrid

age_summary <- data %>%
  select(participant_id, age_numeric, hybrid_category, concept1, concept2, concept1_similarity, concept2_similarity) %>% 
  pivot_longer(cols=c('concept1_similarity','concept2_similarity'), names_to = 'concept_comparison', values_to='similarity') %>%
  group_by(age_numeric, hybrid_category, concept1, concept2, concept_comparison) %>%  # grouping by hybrid_category
  summarize(mean_sim = mean(similarity), sd_sim = sd(similarity), num_drawings = n())

average similarity to each base concept across age

ggplot(age_summary, aes(x=age_numeric, y=mean_sim, col=concept_comparison)) +
  geom_point(alpha=.8, aes(size=num_drawings)) +
  geom_linerange(aes(ymax = (mean_sim + sd_sim), ymin = (mean_sim - sd_sim),col=concept_comparison), alpha=.2) +
  facet_wrap(~hybrid_category, nrow=2) +  # changed from ~concept1 to ~hybrid_category
  theme_few()

age_summary_top_5 <- data %>%
  select(participant_id, age_numeric, concept1, concept2, constituent1_in_top_5, constituent2_in_top_5) %>%
  mutate(hybrid_category = paste0(str_split_fixed(concept1,' ',2)[,2],'-',str_split_fixed(concept2,' ',2)[,2])) %>%
  pivot_longer(cols=c('constituent1_in_top_5','constituent2_in_top_5'), names_to = 'concept_comparison', values_to='top_5_accuracy') %>%
  group_by(age_numeric, hybrid_category, concept1, concept2, concept_comparison) %>%
  summarize(mean_acc_top_5 = mean(top_5_accuracy), se_acc_top_5 = sd(top_5_accuracy) / n(), num_drawings = n())

age_summary_both_top_5 <- data %>%
  select(participant_id, age_numeric, hybrid_category, concept1, concept2, both_constituents_in_top_5) %>%
  group_by(age_numeric, hybrid_category, concept1, concept2) %>%
  summarize(mean_acc_top_5 = mean(both_constituents_in_top_5), se_acc_top_5 = sd(both_constituents_in_top_5) / n(), num_drawings = n())

age_summary_top_1 <- data %>%
  select(participant_id, age_numeric, concept1, concept2, constituent1_in_top_1, constituent2_in_top_1) %>%
  mutate(hybrid_category = paste0(str_split_fixed(concept1,' ',2)[,2],'-',str_split_fixed(concept2,' ',2)[,2])) %>%
  pivot_longer(cols=c('constituent1_in_top_1','constituent2_in_top_1'), names_to = 'concept_comparison', values_to='top_1_accuracy') %>%
  group_by(age_numeric, hybrid_category, concept1, concept2, concept_comparison) %>%
  summarize(mean_acc_top_1 = mean(top_1_accuracy), se_acc_top_1 = sd(top_1_accuracy) / n(), num_drawings = n())

average similarity (scaled) to each base concept across age

ggplot(age_summary_top_5, aes(x=age_numeric, y=mean_acc_top_5, col=concept_comparison)) +
  geom_point(alpha=.8, aes(size=num_drawings)) +
  geom_linerange(aes(ymax = (mean_acc_top_5 + se_acc_top_5), ymin = (mean_acc_top_5 - se_acc_top_5),col=concept_comparison), alpha=.2) +
  facet_wrap(~hybrid_category, nrow=2) +
  theme_few() +
  ylim(0,1) +
  ylab('Mean acc top 5') +
  xlab('Age in years') +
  geom_smooth(method='lm', aes(fill=concept_comparison), alpha=.2) +
  geom_hline(yintercept=top5_chance, linetype='dashed', color='grey')

Look at top 1 – funkier!

ggplot(age_summary_top_1, aes(x=age_numeric, y=mean_acc_top_1, col=concept_comparison)) +
  geom_point(alpha=.8, aes(size=num_drawings)) +
  geom_linerange(aes(ymax = (mean_acc_top_1 + se_acc_top_1), ymin = (mean_acc_top_1 - se_acc_top_1),col=concept_comparison), alpha=.2) +
  facet_wrap(~hybrid_category, nrow=2) +
  theme_few() +
  ylim(0,1) +
  ylab('Mean acc top 1') +
  xlab('Age in years')

ggplot(age_summary_top_5, aes(x=age_numeric, y=mean_acc_top_5, col=hybrid_category)) +
  geom_point(alpha=.8, aes(size=num_drawings)) +
  geom_linerange(aes(ymax = (mean_acc_top_5 + se_acc_top_5), ymin = (mean_acc_top_5 - se_acc_top_5),col=hybrid_category), alpha=.2) +
  facet_wrap(~concept_comparison, nrow=1) +
  theme_few() +
  ylim(0,1) +
  ylab('Mean acc top 5') +
  xlab('Age in years') +
  geom_smooth(method='lm', aes(fill=hybrid_category), alpha=.2) +
  geom_hline(yintercept=top5_chance, linetype='dashed', color='grey')

ggplot(age_summary_both_top_5, aes(x=age_numeric, y=mean_acc_top_5, col=hybrid_category)) +
  geom_point(alpha=.8, aes(size=num_drawings)) +
  geom_linerange(aes(ymax = (mean_acc_top_5 + se_acc_top_5), ymin = (mean_acc_top_5 - se_acc_top_5),col=hybrid_category), alpha=.2) +
  facet_wrap(~hybrid_category, nrow=2) +
  theme_few() +
  ylim(0,1) +
  ylab('Mean acc both concepts in top 5') +
  xlab('Age in years') +
  geom_smooth(method='lm', aes(fill=hybrid_category), alpha=.2)

1 classification models

We use general linear mixed-effects models to test whether each constituent ranks in the top-5 above chance levels, controlling for age and accounting for clustering by participant and hybrid category.

#add chance level
individual_chance = 5/14  # top-5 chance level
data$individual_chance <- individual_chance
# fit logistic mixed-effects models
model1 <- glmer(constituent1_in_top_5 ~ age_centered + 
                  offset(qlogis(individual_chance))+
                  (age_centered | hybrid_category) + 
                  (1 | participant_id), 
                  data = data, family = binomial,
                  control = glmerControl(optimizer = "bobyqa"))

summary(model1)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## constituent1_in_top_5 ~ age_centered + offset(qlogis(individual_chance)) +  
##     (age_centered | hybrid_category) + (1 | participant_id)
##    Data: data
## Control: glmerControl(optimizer = "bobyqa")
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##    2869.8    2904.0   -1428.9    2857.8      2185 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.1293 -0.9862  0.5860  0.8020  1.5150 
## 
## Random effects:
##  Groups          Name         Variance Std.Dev. Corr 
##  participant_id  (Intercept)  0.156013 0.39498       
##  hybrid_category (Intercept)  0.221825 0.47098       
##                  age_centered 0.001329 0.03645  -0.50
## Number of obs: 2191, groups:  participant_id, 600; hybrid_category, 7
## 
## Fixed effects:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.96628    0.18480   5.229 1.71e-07 ***
## age_centered  0.11150    0.02805   3.975 7.03e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## age_centerd -0.226

model2 <- glmer(constituent2_in_top_5 ~ age_centered + 
                  offset(qlogis(individual_chance))+
                  (age_centered | hybrid_category) + 
                  (1 | participant_id), 
                  data = data, family = binomial,
                  control = glmerControl(optimizer = "bobyqa"))
summary(model2)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## constituent2_in_top_5 ~ age_centered + offset(qlogis(individual_chance)) +  
##     (age_centered | hybrid_category) + (1 | participant_id)
##    Data: data
## Control: glmerControl(optimizer = "bobyqa")
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##    3003.0    3037.2   -1495.5    2991.0      2185 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.6950 -0.9434 -0.6961  0.9933  1.4147 
## 
## Random effects:
##  Groups          Name         Variance Std.Dev. Corr
##  participant_id  (Intercept)  0.021861 0.14786      
##  hybrid_category (Intercept)  0.101631 0.31880      
##                  age_centered 0.005331 0.07302  0.83
## Number of obs: 2191, groups:  participant_id, 600; hybrid_category, 7
## 
## Fixed effects:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.59691    0.12838   4.649 3.33e-06 ***
## age_centered  0.04104    0.03508   1.170    0.242    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## age_centerd 0.616

cat("Model 1 convergence:", model1@optinfo$conv$opt, "\n")

## Model 1 convergence: 0

cat("Model 2 convergence:", model2@optinfo$conv$opt, "\n")

## Model 2 convergence: 0

both_chance <- (5*4) / (14*13)
data$both_chance <- both_chance
both_in_top5 <- glmer(both_constituents_in_top_5 ~ 
                      offset(qlogis(both_chance)) +  # This adjusts intercept to chance level
                      age_centered + 
                      (age_centered | hybrid_category) + 
                      (1 | participant_id), 
                      data = data, family = binomial,
                      control = glmerControl(optimizer = "bobyqa"))

summary(both_in_top5)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## both_constituents_in_top_5 ~ offset(qlogis(both_chance)) + age_centered +  
##     (age_centered | hybrid_category) + (1 | participant_id)
##    Data: data
## Control: glmerControl(optimizer = "bobyqa")
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##    2154.4    2188.5   -1071.2    2142.4      2185 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.1045 -0.6184 -0.3607 -0.0830 10.9464 
## 
## Random effects:
##  Groups          Name         Variance Std.Dev. Corr 
##  participant_id  (Intercept)  0.12531  0.3540        
##  hybrid_category (Intercept)  1.74568  1.3212        
##                  age_centered 0.01173  0.1083   -0.85
## Number of obs: 2191, groups:  participant_id, 600; hybrid_category, 7
## 
## Fixed effects:
##              Estimate Std. Error z value Pr(>|z|)  
## (Intercept)    0.5253     0.5074   1.035   0.3006  
## age_centered   0.1262     0.0534   2.363   0.0181 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## age_centerd -0.675

data_6plus <- data %>%
  filter(age_numeric >= 6)

cat("6 and up sample:\n")

## 6 and up sample:

cat("N children:", length(unique(data_6plus$participant_id)), "\n")

## N children: 288

cat("N drawings:", nrow(data_6plus), "\n")

## N drawings: 1142

cat("Age range:", range(data_6plus$age_numeric), "\n")

## Age range: 6 10

cat("Mean age:", round(mean(data_6plus$age_numeric), 2), "\n")

## Mean age: 7.44

age_desc_6plus <- data_6plus %>%
  group_by(age_numeric) %>%
  summarise(
    n_drawings = n(),
    prop_both_top5 = round(mean(both_constituents_in_top_5, na.rm = TRUE), 3),
    .groups = 'drop'
  )
print(age_desc_6plus)

## # A tibble: 5 × 3
##   age_numeric n_drawings prop_both_top5
##         <dbl>      <int>          <dbl>
## 1           6        417          0.249
## 2           7        279          0.254
## 3           8        147          0.218
## 4           9        120          0.283
## 5          10        179          0.324

overall_6plus <- round(mean(data_6plus$both_constituents_in_top_5, na.rm = TRUE), 3)
cat("Overall proportion for 6 and up:", overall_6plus, "\n")

## Overall proportion for 6 and up: 0.262

# center age for 6+ sample
data_6plus$age_centered <- scale(data_6plus$age_numeric, center = TRUE, scale = FALSE)[,1]

# both constituents model with offset - ages 6+
both_6plus_offset <- glmer(both_constituents_in_top_5 ~ 
                          offset(qlogis(both_chance)) +  
                          age_centered + 
                          (age_centered | hybrid_category) + 
                          (1 | participant_id), 
                          data = data_6plus, family = binomial,
                          control = glmerControl(optimizer = "bobyqa"))

summary(both_6plus_offset)

## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## both_constituents_in_top_5 ~ offset(qlogis(both_chance)) + age_centered +  
##     (age_centered | hybrid_category) + (1 | participant_id)
##    Data: data_6plus
## Control: glmerControl(optimizer = "bobyqa")
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##    1203.4    1233.6    -595.7    1191.4      1136 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.0374 -0.6450 -0.4547  0.9769  7.5452 
## 
## Random effects:
##  Groups          Name         Variance Std.Dev. Corr 
##  participant_id  (Intercept)  0.06994  0.2645        
##  hybrid_category (Intercept)  1.60821  1.2682        
##                  age_centered 0.02092  0.1447   -1.00
## Number of obs: 1142, groups:  participant_id, 288; hybrid_category, 7
## 
## Fixed effects:
##              Estimate Std. Error z value Pr(>|z|)  
## (Intercept)    0.7157     0.4932   1.451   0.1467  
## age_centered   0.1535     0.0819   1.874   0.0609 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## age_centerd -0.702
## optimizer (bobyqa) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')

intercept_6plus <- fixef(both_6plus_offset)["(Intercept)"]
intercept_se_6plus <- sqrt(diag(vcov(both_6plus_offset)))["(Intercept)"]
age_effect_6plus <- fixef(both_6plus_offset)["age_centered"]
cat("Intercept (above-chance performance):", round(intercept_6plus, 3), "\n")

## Intercept (above-chance performance): 0.716

cat("Age effect:", round(age_effect_6plus, 3), "\n")

## Age effect: 0.153

Lmers on normalized rank values give same results, as per graphs, for individual concepts

rank_model2 <- lmer(scale(concept2_rank) ~ age_centered + 
                  (age_centered | hybrid_category) + 
                  (1 | participant_id), 
                  data = data)

summary(rank_model2)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: 
## scale(concept2_rank) ~ age_centered + (age_centered | hybrid_category) +  
##     (1 | participant_id)
##    Data: data
## 
## REML criterion at convergence: 6077.3
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.86941 -0.91005  0.03783  0.81875  2.11901 
## 
## Random effects:
##  Groups          Name         Variance Std.Dev. Corr
##  participant_id  (Intercept)  0.009920 0.09960      
##  hybrid_category (Intercept)  0.083219 0.28848      
##                  age_centered 0.001599 0.03999  0.20
##  Residual                     0.911840 0.95490      
## Number of obs: 2191, groups:  participant_id, 600; hybrid_category, 7
## 
## Fixed effects:
##              Estimate Std. Error       df t value Pr(>|t|)
## (Intercept)  -0.01056    0.11104  6.01439  -0.095    0.927
## age_centered -0.02582    0.01822  6.33809  -1.417    0.204
## 
## Correlation of Fixed Effects:
##             (Intr)
## age_centerd 0.162

rank_model1 <- lmer(scale(concept1_rank) ~ age_centered + 
                  (age_centered | hybrid_category) + 
                  (1 | participant_id), 
                  data = data)

summary(rank_model1)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: 
## scale(concept1_rank) ~ age_centered + (age_centered | hybrid_category) +  
##     (1 | participant_id)
##    Data: data
## 
## REML criterion at convergence: 6013.4
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.0454 -0.8004 -0.1998  0.8099  2.3462 
## 
## Random effects:
##  Groups          Name         Variance  Std.Dev. Corr 
##  participant_id  (Intercept)  0.0564222 0.23753       
##  hybrid_category (Intercept)  0.0844257 0.29056       
##                  age_centered 0.0006883 0.02624  -1.00
##  Residual                     0.8475031 0.92060       
## Number of obs: 2191, groups:  participant_id, 600; hybrid_category, 7
## 
## Fixed effects:
##              Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept)   0.01795    0.11213  6.11735   0.160 0.877976    
## age_centered -0.07136    0.01473 10.11918  -4.845 0.000654 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## age_centerd -0.658
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')

m1_coef <- summary(model1)$coefficients["age_centered", ]
m2_coef <- summary(model2)$coefficients["age_centered", ]

category_stats <- data %>%
  group_by(hybrid_category) %>%
  summarise(
    n = n(),
    prop_both_top5 = mean(both_constituents_in_top_5),
    prop_c1_top5 = mean(constituent1_in_top_5),
    prop_c2_top5 = mean(constituent2_in_top_5),
    .groups = 'drop'
  )

(Constituent 1: B = 0.11, SE = 0.03, Z = 3.98, p = 7e-05; Constituent 2: B = 0.04, SE = 0.04, Z = 1.17, p = 0.24)

(e.g., tiger-frog: 45% of drawings had both constituents in top 5) while others showed trade-offs between constituents (e.g., rabbit-boat: 44% for rabbit, 35% for boat, but only 1% jointly)

library(ggthemes)

clean_category <- function(x) {
  x %>%
    str_remove("^an? ") %>%
    str_replace_all(" ", "-")
}

# base_plot <- ggplot(panel_a_data, 
#        aes(x = age_numeric, y = prop_top_5)) +
#   geom_hline(yintercept = 5/14, linetype = "dashed", color = "grey50") +
#   facet_wrap(~hybrid_category, nrow = 1) +
#   scale_color_manual(values = c("Constituent 1" = "#E74C3C", "Constituent 2" = "#3498DB")) +
#   scale_fill_manual(values = c("Constituent 1" = "#E74C3C", "Constituent 2" = "#3498DB")) +
#   scale_size_continuous(range = c(0.5, 2)) +
#   scale_x_continuous(limits = range(panel_a_data$age_numeric)) +
#   ylim(0, 1) +
#   labs(
#     x = "Age (years)",
#     y = "Proportion with Constituent in Top 5",
#     color = "Constituent",
#     fill = "Constituent",
#     size = "Number of\nDrawings"
#   ) +
#   theme_few() +
#   theme(
#     legend.position = "right",
#     aspect.ratio = 1,
#     strip.text = element_text(size = 8),
#     axis.title = element_text(size = 9),
#     axis.text = element_text(size = 11),
#     legend.title = element_text(size = 8),
#     legend.text = element_text(size = 7)
#   )
# 
# # frame 1: axes and chance line only
# frame1 <- base_plot
# ggsave("~/Documents/GitHub/exploratory-drawing-analysis/writing/constituent_age_trend_frame1.svg",
#        frame1, width = 10, height = 6)
# 
# # frame 2: add constituent 1
# c1_data <- panel_a_data %>% filter(constituent == "Constituent 1")
# 
# frame2 <- base_plot +
#   geom_point(data = c1_data, aes(color = constituent, size = n_drawings), alpha = 0.8) +
#   geom_linerange(data = c1_data, 
#                  aes(ymin = prop_top_5 - se, ymax = prop_top_5 + se, color = constituent), 
#                  alpha = 0.3) +
#   geom_smooth(data = c1_data, 
#               aes(color = constituent, fill = constituent), 
#               method = "lm", alpha = 0.2, se = TRUE)
# ggsave("~/Documents/GitHub/exploratory-drawing-analysis/writing/constituent_age_trend_frame2.svg",
#        frame2, width = 10, height = 6)
# 
# # frame 3: add both constituents
# frame3 <- base_plot +
#   geom_point(aes(color = constituent, size = n_drawings), alpha = 0.8) +
#   geom_linerange(aes(ymin = prop_top_5 - se, ymax = prop_top_5 + se, color = constituent), 
#                  alpha = 0.3) +
#   geom_smooth(aes(color = constituent, fill = constituent), 
#               method = "lm", alpha = 0.2, se = TRUE)
# ggsave("~/Documents/GitHub/exploratory-drawing-analysis/writing/constituent_age_trend_frame3.svg",
#        frame3, width = 10, height = 6)

category_data <- data %>%
  mutate(hybrid_category = clean_category(hybrid_category)) %>%
  group_by(age_numeric, hybrid_category) %>%
  summarise(
    prop_both_top_5 = mean(both_constituents_in_top_5),
    .groups = "drop"
  )

collapsed_data <- data %>%
  group_by(age_numeric) %>%
  summarise(
    prop_both_top_5 = mean(both_constituents_in_top_5),
    se = sd(both_constituents_in_top_5) / sqrt(n()),
    n_drawings = n(),
    .groups = "drop"
  )

x_limits <- range(collapsed_data$age_numeric)

# frame 1: axes and chance line only
both_frame1 <- ggplot() +
  geom_hline(yintercept = (5/14)^2, linetype = "dashed", color = "grey50") +
  scale_x_continuous(limits = x_limits, expand = c(0.01, 0.01)) +
  ylim(0, 1) +
  labs(
    x = "Age (years)",
    y = "Proportion with Both Constituents in Top 5",
    size = "Number of\nDrawings"
  ) +
  theme_few() +
  theme(
    legend.position = "right",
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 11)
  )
ggsave("~/Documents/GitHub/exploratory-drawing-analysis/writing/overall_age_trend_frame1.svg",
       both_frame1, width = 6, height = 6)

# frame 2: full plot
both_frame2 <- ggplot() +
  geom_hline(yintercept = (5/14)^2, linetype = "dashed", color = "grey50") +
  geom_smooth(data = category_data,
              aes(x = age_numeric, y = prop_both_top_5, color = hybrid_category,
                  fill = hybrid_category),
              method = "lm", se = TRUE, alpha = 0.1, linewidth = 0.7) +
  geom_smooth(data = collapsed_data,
              aes(x = age_numeric, y = prop_both_top_5),
              method = "lm", color = "black", fill = "grey30", alpha = 0.3,
              linewidth = 1.2, se = TRUE) +
  geom_point(data = collapsed_data,
             aes(x = age_numeric, y = prop_both_top_5, size = n_drawings),
             alpha = 0.8) +
  scale_size_continuous(range = c(2, 5)) +
  scale_color_viridis_d(name = "Hybrid Category") +
  scale_fill_viridis_d(name = "Hybrid Category") +
  scale_x_continuous(limits = x_limits, expand = c(0.01, 0.01)) +
  ylim(0, 1) +
  labs(
    x = "Age (years)",
    y = "Proportion with Both Constituents in Top 5",
    size = "Number of\nDrawings"
  ) +
  theme_few() +
  theme(
    legend.position = "right",
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 11)
  ) +
  guides(fill = guide_legend(), color = guide_legend())
ggsave("~/Documents/GitHub/exploratory-drawing-analysis/writing/overall_age_trend_frame2.svg",
       both_frame2, width = 8, height = 6)

data %>%
  mutate(hybrid_category_clean = clean_category(hybrid_category)) %>%
  ggplot(aes(x = concept1_rank, y = concept2_rank, color = hybrid_category_clean)) +
  geom_point(alpha = 0.6, size = 2) +
  geom_hline(yintercept = 5, color = "black", linewidth = 0.5) +
  geom_vline(xintercept = 5, color = "black", linewidth = 0.5) +
  facet_wrap(~hybrid_category_clean, nrow = 1) +
  coord_fixed(ratio = 1) +
  scale_color_viridis_d() +
  labs(
    x = "Constituent 1 Classification Rank",
    y = "Constituent 2 Classification Rank"
  ) +
  theme_few() +
  theme(
    legend.position = "none",
    strip.text = element_text(size = 8),
    axis.title = element_text(size = 9),
    axis.text = element_text(size = 9)
  )

ggsave("~/Documents/GitHub/exploratory-drawing-analysis/writing/classification_rank.svg")

# library(patchwork)
# p1 <- ggplot(panel_a_data, 
#        aes(x = age_numeric, y = prop_top_5, color = constituent)) +
#   geom_hline(yintercept = 5/14, linetype = "dashed", color = "grey50") +
#   geom_point(aes(size = n_drawings), alpha = 0.8) +
#   geom_linerange(aes(ymin = prop_top_5 - se, ymax = prop_top_5 + se), alpha = 0.3) +
#   geom_smooth(method = "lm", aes(fill = constituent), alpha = 0.2, se = TRUE) +
#   facet_wrap(~hybrid_category, nrow = 1) +
#   scale_color_manual(values = c("Constituent 1" = "#E74C3C", "Constituent 2" = "#3498DB")) +
#   scale_fill_manual(values = c("Constituent 1" = "#E74C3C", "Constituent 2" = "#3498DB")) +
#   scale_size_continuous(range = c(1, 3)) +
#   ylim(0, 1) +
#   labs(
#     x = "Age (years)",
#     y = "Proportion with Constituent in Top 5",
#     color = "Constituent",
#     fill = "Constituent",
#     size = "Number of\nDrawings"
#   ) +
#   theme_few() +
#   theme(
#     legend.position = "right",
#     aspect.ratio = 1,
#     strip.text = element_text(size = 9),
#     axis.title = element_text(size = 8),
#     axis.text = element_text(size = 6),
#     legend.title = element_text(size = 8),
#     legend.text = element_text(size = 7)
#   )
# 
# p2 <- data %>%
#   mutate(hybrid_category_clean = clean_category(hybrid_category)) %>%
#   ggplot(aes(x = concept1_rank, y = concept2_rank, color = hybrid_category_clean)) +
#   geom_point(alpha = 0.6, size = 2.8) +
#   geom_hline(yintercept = 5, color = "black", linewidth = 0.5) +
#   geom_vline(xintercept = 5, color = "black", linewidth = 0.5) +
#   facet_wrap(~hybrid_category_clean, nrow = 1) +
#   coord_fixed(ratio = 1) +
#   scale_color_viridis_d() +
#   labs(
#     x = "Constituent 1 Classification Rank",
#     y = "Constituent 2 Classification Rank"
#   ) +
#   theme_few() +
#   theme(
#     legend.position = "none",
#     strip.text = element_text(size = 9),
#     axis.title = element_text(size = 8),
#     axis.text = element_text(size = 6)
#   )
# 
# p1 / p2 + plot_annotation(tag_levels = "A")

hybrid-draw-classification-rank-models

Haoyu Du

2026-05-19

0.1 age plot for each hybrid

1 classification models