This analysis tests whether children’s hybrid drawings contain recognizable elements from both constituent concepts using CLIP embeddings and k-way classification. We examine whether hybrid drawings rank their constituent concepts above chance levels when compared to all base concepts in our stimulus set.
# load anonymized hybrid drawing analysis data
data <- read.csv(here::here('cds_k_way_classification_all_children_anonymized.csv'))
cat("total hybrid drawings:", nrow(data), "\n")
## total hybrid drawings: 1506
cat("unique participants:", length(unique(data$participant_id)), "\n")
## unique participants: 392
cat("hybrid categories:", length(unique(data$hybrid_category)), "\n")
## hybrid categories: 8
cat("age range:", range(data$age_numeric), "\n")
## age range: 2 10
data_original <- data # save original if needed
data <- data %>%
filter(age_numeric >= 3)
n_participants <- length(unique(data$participant_id))
age_mean <- round(mean(data$age_numeric), 2)
age_sd <- round(sd(data$age_numeric), 2)
age_min <- min(data$age_numeric)
age_max <- max(data$age_numeric)
n_drawings <- nrow(data)
cat("Sample descriptives after filtering:\n")
## Sample descriptives after filtering:
cat("Children (N =", n_participants, ", ages", age_min, "-", age_max, "years, M =", age_mean, ", SD =", age_sd, ")\n")
## Children (N = 337 , ages 3 - 10 years, M = 5.61 , SD = 2.09 )
cat("Total drawings:", n_drawings, "\n")
## Total drawings: 1367
cat("Drawings per age:\n")
## Drawings per age:
print(table(data$age_numeric))
##
## 3 4 5 6 7 8 9 10
## 218 310 191 252 146 73 71 106
# center age for interpretable intercept
data$age_centered <- scale(data$age_numeric, center = TRUE, scale = FALSE)[,1]
# analysis parameters
n_concepts <- 14 # dinosaur not included in base concepts, but truck can be used for comparison?
top1_chance <- 1/n_concepts
top5_chance <- 5/n_concepts
cat("number of base concepts:", n_concepts, "\n")
## number of base concepts: 14
cat("top-1 chance level:", round(top1_chance, 3), "\n")
## top-1 chance level: 0.071
cat("top-5 chance level:", round(top5_chance, 3), "\n")
## top-5 chance level: 0.357
boolean_cols <- c("constituent1_in_top_1", "constituent2_in_top_1",
"constituent1_in_top_5", "constituent2_in_top_5",
"both_constituents_in_top_1", "both_constituents_in_top_5")
data[boolean_cols] <- lapply(data[boolean_cols], as.logical)
summary(data)
## hybrid_category concept1 concept2 participant_id
## Length:1367 Length:1367 Length:1367 Length:1367
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## age_numeric centroid_source concept1_available concept2_available
## Min. : 3.000 Length:1367 Length:1367 Length:1367
## 1st Qu.: 4.000 Class :character Class :character Class :character
## Median : 5.000 Mode :character Mode :character Mode :character
## Mean : 5.608
## 3rd Qu.: 7.000
## Max. :10.000
##
## concept1_similarity concept1_rank constituent1_in_top_1
## Min. :0.7705 Min. : 1.000 Mode :logical
## 1st Qu.:0.9072 1st Qu.: 2.000 FALSE:936
## Median :0.9282 Median : 5.000 TRUE :255
## Mean :0.9231 Mean : 5.673 NA's :176
## 3rd Qu.:0.9429 3rd Qu.: 9.000
## Max. :0.9771 Max. :14.000
## NA's :176 NA's :176
## constituent1_in_top_5 concept2_similarity concept2_rank
## Mode :logical Min. :0.8008 Min. : 1.000
## FALSE:536 1st Qu.:0.9014 1st Qu.: 2.000
## TRUE :655 Median :0.9243 Median : 5.000
## NA's :176 Mean :0.9204 Mean : 6.024
## 3rd Qu.:0.9434 3rd Qu.: 9.000
## Max. :0.9756 Max. :14.000
## NA's :176 NA's :176
## constituent2_in_top_1 constituent2_in_top_5 both_constituents_in_top_1
## Mode :logical Mode :logical Mode :logical
## FALSE:973 FALSE:594 FALSE:1191
## TRUE :218 TRUE :597 NA's :176
## NA's :176 NA's :176
##
##
##
## both_constituents_in_top_5 drawing_file age_centered
## Mode :logical Length:1367 Min. :-2.6079
## FALSE:923 Class :character 1st Qu.:-1.6079
## TRUE :268 Mode :character Median :-0.6079
## NA's :176 Mean : 0.0000
## 3rd Qu.: 1.3921
## Max. : 4.3921
##
age_descriptives <- data %>%
group_by(age_numeric) %>%
summarise(
n_drawings = n(),
n_both_top5 = sum(both_constituents_in_top_5, na.rm = TRUE),
prop_both_top5 = round(mean(both_constituents_in_top_5, na.rm = TRUE), 3),
.groups = 'drop'
)
print(age_descriptives)
## # A tibble: 8 × 4
## age_numeric n_drawings n_both_top5 prop_both_top5
## <dbl> <int> <int> <dbl>
## 1 3 218 33 0.176
## 2 4 310 52 0.188
## 3 5 191 27 0.167
## 4 6 252 55 0.255
## 5 7 146 32 0.252
## 6 8 73 15 0.227
## 7 9 71 17 0.27
## 8 10 106 37 0.398
overall_percent <- round(mean(data$both_constituents_in_top_5, na.rm = TRUE) * 100, 1)
cat(overall_percent, "% of drawings had both constituents in top 5\n")
## 22.5 % of drawings had both constituents in top 5
data$concept1_available <- as.logical(data$concept1_available)
data$concept2_available <- as.logical(data$concept2_available)
cat("After conversion:\n")
## After conversion:
print(table(data$concept1_available, useNA = "always"))
##
## FALSE TRUE <NA>
## 176 1191 0
print(table(data$concept2_available, useNA = "always"))
##
## FALSE TRUE <NA>
## 176 1191 0
data_filtered <- data %>%
filter(concept1_available == TRUE & concept2_available == TRUE)
cat("original data:", nrow(data), "drawings\n")
## original data: 1367 drawings
cat("after filtering for available centroids:", nrow(data_filtered), "drawings\n")
## after filtering for available centroids: 1191 drawings
cat("remaining concept1 values:\n")
## remaining concept1 values:
print(table(data_filtered$concept1))
##
## a bike a mushroom a rabbit a sheep a tiger a train
## 188 174 168 171 146 184
## an elephant
## 160
# use filtered data for all analyses
data <- data_filtered
# filter for rabbit-boat hybrids
rabbit_boat <- data %>%
filter(hybrid_category == "a rabbit boat")
# look at similarity rankings
rabbit_boat_similarities <- rabbit_boat %>%
select(starts_with("concept"), contains("similarity"), contains("rank")) %>%
arrange(concept1_rank)
# see what ranks highly that shouldn't
cat("Top non-constituent similarities for rabbit-boat:\n")
## Top non-constituent similarities for rabbit-boat:
print(summary(rabbit_boat_similarities))
## concept1 concept2 concept1_available concept2_available
## Length:168 Length:168 Mode:logical Mode:logical
## Class :character Class :character TRUE:168 TRUE:168
## Mode :character Mode :character
##
##
##
## concept1_similarity concept1_rank concept2_similarity concept2_rank
## Min. :0.7705 Min. : 1.000 Min. :0.8164 Min. : 1.00
## 1st Qu.:0.8947 1st Qu.: 2.000 1st Qu.:0.8923 1st Qu.: 2.75
## Median :0.9268 Median : 9.000 Median :0.9138 Median :10.00
## Mean :0.9167 Mean : 7.673 Mean :0.9126 Mean : 8.19
## 3rd Qu.:0.9434 3rd Qu.:13.000 3rd Qu.:0.9385 3rd Qu.:13.00
## Max. :0.9673 Max. :14.000 Max. :0.9731 Max. :14.00
Make some plots by age for each hybrid concepts
age_summary <- data %>%
select(participant_id, age_numeric, hybrid_category, concept1, concept2, concept1_similarity, concept2_similarity) %>%
pivot_longer(cols=c('concept1_similarity','concept2_similarity'), names_to = 'concept_comparison', values_to='similarity') %>%
group_by(age_numeric, hybrid_category, concept1, concept2, concept_comparison) %>% # grouping by hybrid_category
summarize(mean_sim = mean(similarity), sd_sim = sd(similarity), num_drawings = n())
Average similarity to each base concept across age
ggplot(age_summary, aes(x=age_numeric, y=mean_sim, col=concept_comparison)) +
geom_point(alpha=.8, aes(size=num_drawings)) +
geom_linerange(aes(ymax = (mean_sim + sd_sim), ymin = (mean_sim - sd_sim),col=concept_comparison), alpha=.2) +
facet_wrap(~hybrid_category, nrow=2) + # changed from ~concept1 to ~hybrid_category
theme_few()
age_summary_top_5 <- data %>%
select(participant_id, age_numeric, concept1, concept2, constituent1_in_top_5, constituent2_in_top_5) %>%
mutate(hybrid_category = paste0(str_split_fixed(concept1,' ',2)[,2],'-',str_split_fixed(concept2,' ',2)[,2])) %>%
pivot_longer(cols=c('constituent1_in_top_5','constituent2_in_top_5'), names_to = 'concept_comparison', values_to='top_5_accuracy') %>%
group_by(age_numeric, hybrid_category, concept1, concept2, concept_comparison) %>%
summarize(mean_acc_top_5 = mean(top_5_accuracy), se_acc_top_5 = sd(top_5_accuracy) / n(), num_drawings = n())
age_summary_both_top_5 <- data %>%
select(participant_id, age_numeric, hybrid_category, concept1, concept2, both_constituents_in_top_5) %>%
group_by(age_numeric, hybrid_category, concept1, concept2) %>%
summarize(mean_acc_top_5 = mean(both_constituents_in_top_5), se_acc_top_5 = sd(both_constituents_in_top_5) / n(), num_drawings = n())
age_summary_top_1 <- data %>%
select(participant_id, age_numeric, concept1, concept2, constituent1_in_top_1, constituent2_in_top_1) %>%
mutate(hybrid_category = paste0(str_split_fixed(concept1,' ',2)[,2],'-',str_split_fixed(concept2,' ',2)[,2])) %>%
pivot_longer(cols=c('constituent1_in_top_1','constituent2_in_top_1'), names_to = 'concept_comparison', values_to='top_1_accuracy') %>%
group_by(age_numeric, hybrid_category, concept1, concept2, concept_comparison) %>%
summarize(mean_acc_top_1 = mean(top_1_accuracy), se_acc_top_1 = sd(top_1_accuracy) / n(), num_drawings = n())
Average similarity (scaled) to each base concept across age
ggplot(age_summary_top_5, aes(x=age_numeric, y=mean_acc_top_5, col=concept_comparison)) +
geom_point(alpha=.8, aes(size=num_drawings)) +
geom_linerange(aes(ymax = (mean_acc_top_5 + se_acc_top_5), ymin = (mean_acc_top_5 - se_acc_top_5),col=concept_comparison), alpha=.2) +
facet_wrap(~hybrid_category, nrow=2) +
theme_few() +
ylim(0,1) +
ylab('Mean acc top 5') +
xlab('Age in years') +
geom_smooth(method='lm', aes(fill=concept_comparison), alpha=.2) +
geom_hline(yintercept=top5_chance, linetype='dashed', color='grey')
Look at top 1 – funkier!
ggplot(age_summary_top_1, aes(x=age_numeric, y=mean_acc_top_1, col=concept_comparison)) +
geom_point(alpha=.8, aes(size=num_drawings)) +
geom_linerange(aes(ymax = (mean_acc_top_1 + se_acc_top_1), ymin = (mean_acc_top_1 - se_acc_top_1),col=concept_comparison), alpha=.2) +
facet_wrap(~hybrid_category, nrow=2) +
theme_few() +
ylim(0,1) +
ylab('Mean acc top 1') +
xlab('Age in years')
ggplot(age_summary_top_5, aes(x=age_numeric, y=mean_acc_top_5, col=hybrid_category)) +
geom_point(alpha=.8, aes(size=num_drawings)) +
geom_linerange(aes(ymax = (mean_acc_top_5 + se_acc_top_5), ymin = (mean_acc_top_5 - se_acc_top_5),col=hybrid_category), alpha=.2) +
facet_wrap(~concept_comparison, nrow=1) +
theme_few() +
ylim(0,1) +
ylab('Mean acc top 5') +
xlab('Age in years') +
geom_smooth(method='lm', aes(fill=hybrid_category), alpha=.2) +
geom_hline(yintercept=top5_chance, linetype='dashed', color='grey')
ggplot(age_summary_both_top_5, aes(x=age_numeric, y=mean_acc_top_5, col=hybrid_category)) +
geom_point(alpha=.8, aes(size=num_drawings)) +
geom_linerange(aes(ymax = (mean_acc_top_5 + se_acc_top_5), ymin = (mean_acc_top_5 - se_acc_top_5),col=hybrid_category), alpha=.2) +
facet_wrap(~hybrid_category, nrow=2) +
theme_few() +
ylim(0,1) +
ylab('Mean acc both concepts in top 5') +
xlab('Age in years') +
geom_smooth(method='lm', aes(fill=hybrid_category), alpha=.2)
We use general linear mixed-effects models to test whether each constituent ranks in the top-5 above chance levels, controlling for age and accounting for clustering by participant and hybrid category.
#add chance level
individual_chance = 5/14 # top-5 chance level
data$individual_chance <- individual_chance
# fit logistic mixed-effects models
model1 <- glmer(constituent1_in_top_5 ~ age_centered +
offset(qlogis(individual_chance))+
(age_centered | hybrid_category) +
(1 | participant_id),
data = data, family = binomial,
control = glmerControl(optimizer = "bobyqa"))
summary(model1)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## constituent1_in_top_5 ~ age_centered + offset(qlogis(individual_chance)) +
## (age_centered | hybrid_category) + (1 | participant_id)
## Data: data
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik -2*log(L) df.resid
## 1577.7 1608.2 -782.8 1565.7 1185
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.5381 -0.9174 0.5529 0.8343 1.5906
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## participant_id (Intercept) 0.208168 0.45625
## hybrid_category (Intercept) 0.179086 0.42319
## age_centered 0.003473 0.05894 0.05
## Number of obs: 1191, groups: participant_id, 332; hybrid_category, 7
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.79690 0.17432 4.572 4.84e-06 ***
## age_centered 0.19690 0.04097 4.806 1.54e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## age_centerd 0.049
model2 <- glmer(constituent2_in_top_5 ~ age_centered +
offset(qlogis(individual_chance))+
(age_centered | hybrid_category) +
(1 | participant_id),
data = data, family = binomial,
control = glmerControl(optimizer = "bobyqa"))
summary(model2)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## constituent2_in_top_5 ~ age_centered + offset(qlogis(individual_chance)) +
## (age_centered | hybrid_category) + (1 | participant_id)
## Data: data
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik -2*log(L) df.resid
## 1633.8 1664.3 -810.9 1621.8 1185
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.6502 -0.9563 0.6060 0.9773 1.3626
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## participant_id (Intercept) 0.000000 0.00000
## hybrid_category (Intercept) 0.116545 0.34139
## age_centered 0.009012 0.09493 0.36
## Number of obs: 1191, groups: participant_id, 332; hybrid_category, 7
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.59669 0.14204 4.201 2.66e-05 ***
## age_centered 0.06252 0.04598 1.359 0.174
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## age_centerd 0.262
## optimizer (bobyqa) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')
cat("Model 1 convergence:", model1@optinfo$conv$opt, "\n")
## Model 1 convergence: 0
cat("Model 2 convergence:", model2@optinfo$conv$opt, "\n")
## Model 2 convergence: 0
both_chance <- (5/n_concepts)^2
data$both_chance <- both_chance
both_in_top5 <- glmer(both_constituents_in_top_5 ~
offset(qlogis(both_chance)) + # This adjusts intercept to chance level
age_centered +
(age_centered | hybrid_category) +
(1 | participant_id),
data = data, family = binomial,
control = glmerControl(optimizer = "bobyqa"))
summary(both_in_top5)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## both_constituents_in_top_5 ~ offset(qlogis(both_chance)) + age_centered +
## (age_centered | hybrid_category) + (1 | participant_id)
## Data: data
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik -2*log(L) df.resid
## 1125.5 1156.0 -556.7 1113.5 1185
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.0434 -0.6258 -0.3237 -0.0386 5.3843
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## participant_id (Intercept) 1.927e-10 1.388e-05
## hybrid_category (Intercept) 2.834e+00 1.683e+00
## age_centered 4.233e-02 2.057e-01 -0.97
## Number of obs: 1191, groups: participant_id, 332; hybrid_category, 7
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.07838 0.66266 0.118 0.90585
## age_centered 0.25397 0.09598 2.646 0.00814 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## age_centerd -0.864
## optimizer (bobyqa) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')
data_6plus <- data %>%
filter(age_numeric >= 6)
cat("6 and up sample:\n")
## 6 and up sample:
cat("N children:", length(unique(data_6plus$participant_id)), "\n")
## N children: 141
cat("N drawings:", nrow(data_6plus), "\n")
## N drawings: 565
cat("Age range:", range(data_6plus$age_numeric), "\n")
## Age range: 6 10
cat("Mean age:", round(mean(data_6plus$age_numeric), 2), "\n")
## Mean age: 7.45
age_desc_6plus <- data_6plus %>%
group_by(age_numeric) %>%
summarise(
n_drawings = n(),
prop_both_top5 = round(mean(both_constituents_in_top_5, na.rm = TRUE), 3),
.groups = 'drop'
)
print(age_desc_6plus)
## # A tibble: 5 × 3
## age_numeric n_drawings prop_both_top5
## <dbl> <int> <dbl>
## 1 6 216 0.255
## 2 7 127 0.252
## 3 8 66 0.227
## 4 9 63 0.27
## 5 10 93 0.398
overall_6plus <- round(mean(data_6plus$both_constituents_in_top_5, na.rm = TRUE), 3)
cat("Overall proportion for 6 and up:", overall_6plus, "\n")
## Overall proportion for 6 and up: 0.276
# center age for 6+ sample
data_6plus$age_centered <- scale(data_6plus$age_numeric, center = TRUE, scale = FALSE)[,1]
# both constituents model with offset - ages 6+
both_6plus_offset <- glmer(both_constituents_in_top_5 ~
offset(qlogis(both_chance)) +
age_centered +
(age_centered | hybrid_category) +
(1 | participant_id),
data = data_6plus, family = binomial,
control = glmerControl(optimizer = "bobyqa"))
summary(both_6plus_offset)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## both_constituents_in_top_5 ~ offset(qlogis(both_chance)) + age_centered +
## (age_centered | hybrid_category) + (1 | participant_id)
## Data: data_6plus
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik -2*log(L) df.resid
## 619.8 645.8 -303.9 607.8 559
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.0464 -0.7357 -0.4252 1.0678 3.2988
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## participant_id (Intercept) 0.00000 0.0000
## hybrid_category (Intercept) 1.48125 1.2171
## age_centered 0.02875 0.1696 -0.91
## Number of obs: 565, groups: participant_id, 141; hybrid_category, 7
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.6322 0.4848 1.304 0.1923
## age_centered 0.2120 0.1057 2.006 0.0449 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## age_centerd -0.624
## optimizer (bobyqa) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')
intercept_6plus <- fixef(both_6plus_offset)["(Intercept)"]
intercept_se_6plus <- sqrt(diag(vcov(both_6plus_offset)))["(Intercept)"]
age_effect_6plus <- fixef(both_6plus_offset)["age_centered"]
cat("Intercept (above-chance performance):", round(intercept_6plus, 3), "\n")
## Intercept (above-chance performance): 0.632
cat("Age effect:", round(age_effect_6plus, 3), "\n")
## Age effect: 0.212
Lmers on normalized rank values give same results, as per graphs, for individual concepts
rank_model2 <- lmer(scale(concept2_rank) ~ age_centered +
(age_centered | hybrid_category) +
(1 | participant_id),
data = data)
summary(rank_model2)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula:
## scale(concept2_rank) ~ age_centered + (age_centered | hybrid_category) +
## (1 | participant_id)
## Data: data
##
## REML criterion at convergence: 3288.1
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.97058 -0.88732 0.03511 0.83748 2.11372
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## participant_id (Intercept) 3.499e-09 5.915e-05
## hybrid_category (Intercept) 9.935e-02 3.152e-01
## age_centered 2.495e-03 4.995e-02 -0.40
## Residual 8.997e-01 9.485e-01
## Number of obs: 1191, groups: participant_id, 332; hybrid_category, 7
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -0.002877 0.122283 5.990453 -0.024 0.982
## age_centered -0.041864 0.023021 6.061584 -1.819 0.118
##
## Correlation of Fixed Effects:
## (Intr)
## age_centerd -0.322
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')
rank_model1 <- lmer(scale(concept1_rank) ~ age_centered +
(age_centered | hybrid_category) +
(1 | participant_id),
data = data)
summary(rank_model1)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula:
## scale(concept1_rank) ~ age_centered + (age_centered | hybrid_category) +
## (1 | participant_id)
## Data: data
##
## REML criterion at convergence: 3251.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.0568 -0.8047 -0.1803 0.8326 2.2571
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## participant_id (Intercept) 0.070772 0.26603
## hybrid_category (Intercept) 0.080576 0.28386
## age_centered 0.001007 0.03173 -0.81
## Residual 0.815123 0.90284
## Number of obs: 1191, groups: participant_id, 332; hybrid_category, 7
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 0.02319 0.11171 6.26019 0.208 0.842124
## age_centered -0.09939 0.01907 8.23816 -5.211 0.000739 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## age_centerd -0.487
m1_coef <- summary(model1)$coefficients["age_centered", ]
m2_coef <- summary(model2)$coefficients["age_centered", ]
category_stats <- data %>%
group_by(hybrid_category) %>%
summarise(
n = n(),
prop_both_top5 = mean(both_constituents_in_top_5),
prop_c1_top5 = mean(constituent1_in_top_5),
prop_c2_top5 = mean(constituent2_in_top_5),
.groups = 'drop'
)
(Constituent 1: B = 0.2, SE = 0.04, Z = 4.81, p = 1.5e-06; Constituent 2: B = 0.06, SE = 0.05, Z = 1.36, p = 0.17)
(e.g., tiger-frog: 42% of drawings had both constituents in top 5) while others showed trade-offs between constituents (e.g., rabbit-boat: 42% for rabbit, 35% for boat, but only 1% jointly)
library(ggthemes)
clean_category <- function(x) {
x %>%
str_remove("^a ") %>%
str_replace(" ", "-")
}
panel_a_data <- data %>%
select(participant_id, age_numeric, hybrid_category,
constituent1_in_top_5, constituent2_in_top_5) %>%
mutate(hybrid_category = clean_category(hybrid_category)) %>%
pivot_longer(cols = c(constituent1_in_top_5, constituent2_in_top_5),
names_to = "constituent",
values_to = "in_top_5") %>%
mutate(constituent = recode(constituent,
"constituent1_in_top_5" = "Constituent 1",
"constituent2_in_top_5" = "Constituent 2")) %>%
group_by(age_numeric, hybrid_category, constituent) %>%
summarise(
prop_top_5 = mean(in_top_5),
se = sd(in_top_5) / sqrt(n()),
n_drawings = n(),
.groups = "drop"
)
ggplot(panel_a_data,
aes(x = age_numeric, y = prop_top_5, color = constituent)) +
geom_hline(yintercept = 5/14, linetype = "dashed", color = "grey50") +
geom_point(aes(size = n_drawings), alpha = 0.8) +
geom_linerange(aes(ymin = prop_top_5 - se, ymax = prop_top_5 + se), alpha = 0.3) +
geom_smooth(method = "lm", aes(fill = constituent), alpha = 0.2, se = TRUE) +
facet_wrap(~hybrid_category, nrow = 2) +
scale_color_manual(values = c("Constituent 1" = "#E74C3C", "Constituent 2" = "#3498DB")) +
scale_fill_manual(values = c("Constituent 1" = "#E74C3C", "Constituent 2" = "#3498DB")) +
scale_size_continuous(range = c(1, 4)) +
ylim(0, 1) +
labs(
x = "Age (years)",
y = "Proportion with Constituent in Top 5",
color = "Constituent",
fill = "Constituent",
size = "Number of\nDrawings"
) +
theme_few() +
theme(legend.position = "right")
panel_b_collapsed <- data %>%
group_by(age_numeric) %>%
summarise(
prop_both_top_5 = mean(both_constituents_in_top_5),
se = sd(both_constituents_in_top_5) / sqrt(n()),
n_drawings = n(),
.groups = "drop"
)
ggplot(panel_b_collapsed,
aes(x = age_numeric, y = prop_both_top_5)) +
geom_hline(yintercept = (5/14)^2, linetype = "dashed", color = "grey50") +
geom_point(aes(size = n_drawings), alpha = 0.8) +
geom_linerange(aes(ymin = prop_both_top_5 - se, ymax = prop_both_top_5 + se), alpha = 0.3) +
geom_smooth(method = "lm", color = "#2C3E50", fill = "#2C3E50", alpha = 0.2, se = TRUE) +
scale_size_continuous(range = c(2, 5)) +
ylim(0, 1) +
labs(
x = "Age (years)",
y = "Proportion with Both Constituents in Top 5",
size = "Number of\nDrawings"
) +
theme_few() +
theme(legend.position = "right")
(Bria not touching below here)
chance_logit_top5 <- qlogis(top5_chance)
# model params
intercept1 <- fixef(model1)["(Intercept)"]
intercept1_se <- sqrt(diag(vcov(model1)))["(Intercept)"]
intercept2 <- fixef(model2)["(Intercept)"]
intercept2_se <- sqrt(diag(vcov(model2)))["(Intercept)"]
# test stats
t_stat1 <- (intercept1 - chance_logit_top5) / intercept1_se
t_stat2 <- (intercept2 - chance_logit_top5) / intercept2_se
p_value1 <- 2 * (1 - pnorm(abs(t_stat1)))
p_value2 <- 2 * (1 - pnorm(abs(t_stat2)))
# bonferroni correction for multiple testing
alpha_corrected <- 0.05 / 2
# results summary
statistical_results <- data.frame(
Constituent = c("Concept 1", "Concept 2"),
Intercept = c(intercept1, intercept2),
SE = c(intercept1_se, intercept2_se),
t_statistic = c(t_stat1, t_stat2),
p_value = c(p_value1, p_value2),
Significant = c(p_value1 < alpha_corrected, p_value2 < alpha_corrected)
)
print(statistical_results, digits = 4)
## Constituent Intercept SE t_statistic p_value Significant
## 1 Concept 1 0.7969 0.1743 7.943 1.998e-15 TRUE
## 2 Concept 2 0.5967 0.1420 8.339 0.000e+00 TRUE
prob1_predicted <- plogis(intercept1)
prob2_predicted <- plogis(intercept2)
effect_sizes <- data.frame(
Constituent = c("Concept 1", "Concept 2"),
Predicted_Probability = c(prob1_predicted, prob2_predicted),
Chance_Probability = c(top5_chance, top5_chance),
Difference = c(prob1_predicted - top5_chance, prob2_predicted - top5_chance),
Ratio = c(prob1_predicted / top5_chance, prob2_predicted / top5_chance)
)
print(effect_sizes)
## Constituent Predicted_Probability Chance_Probability Difference Ratio
## 1 Concept 1 0.6893116 0.3571429 0.3321687 1.930072
## 2 Concept 2 0.6448996 0.3571429 0.2877567 1.805719
summary(model1)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## constituent1_in_top_5 ~ age_centered + offset(qlogis(individual_chance)) +
## (age_centered | hybrid_category) + (1 | participant_id)
## Data: data
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik -2*log(L) df.resid
## 1577.7 1608.2 -782.8 1565.7 1185
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.5381 -0.9174 0.5529 0.8343 1.5906
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## participant_id (Intercept) 0.208168 0.45625
## hybrid_category (Intercept) 0.179086 0.42319
## age_centered 0.003473 0.05894 0.05
## Number of obs: 1191, groups: participant_id, 332; hybrid_category, 7
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.79690 0.17432 4.572 4.84e-06 ***
## age_centered 0.19690 0.04097 4.806 1.54e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## age_centerd 0.049
summary(model2)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## constituent2_in_top_5 ~ age_centered + offset(qlogis(individual_chance)) +
## (age_centered | hybrid_category) + (1 | participant_id)
## Data: data
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik -2*log(L) df.resid
## 1633.8 1664.3 -810.9 1621.8 1185
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.6502 -0.9563 0.6060 0.9773 1.3626
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## participant_id (Intercept) 0.000000 0.00000
## hybrid_category (Intercept) 0.116545 0.34139
## age_centered 0.009012 0.09493 0.36
## Number of obs: 1191, groups: participant_id, 332; hybrid_category, 7
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.59669 0.14204 4.201 2.66e-05 ***
## age_centered 0.06252 0.04598 1.359 0.174
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## age_centerd 0.262
## optimizer (bobyqa) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')
model2_simplified <- glmer(constituent2_in_top_5 ~ age_centered +
(age_centered | hybrid_category),
data = data, family = binomial,
control = glmerControl(optimizer = "bobyqa"))
intercept2_alt <- fixef(model2_simplified)["(Intercept)"]
intercept2_alt_se <- sqrt(diag(vcov(model2_simplified)))["(Intercept)"]
t_stat2_alt <- (intercept2_alt - chance_logit_top5) / intercept2_alt_se
p_value2_alt <- 2 * (1 - pnorm(abs(t_stat2_alt)))
cat("original model2 results:\n")
## original model2 results:
cat("Intercept:", intercept2, "SE:", intercept2_se, "p-value:", p_value2, "\n\n")
## Intercept: 0.596694 SE: 0.1420408 p-value: 0
cat("simplified model results:\n")
## simplified model results:
cat("Intercept:", intercept2_alt, "SE:", intercept2_alt_se, "p-value:", p_value2_alt, "\n")
## Intercept: 0.008906734 SE: 0.1420405 p-value: 2.658948e-05
summary(model2_simplified)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## constituent2_in_top_5 ~ age_centered + (age_centered | hybrid_category)
## Data: data
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik -2*log(L) df.resid
## 1631.8 1657.3 -810.9 1621.8 1186
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.6502 -0.9563 0.6060 0.9773 1.3626
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## hybrid_category (Intercept) 0.116544 0.34139
## age_centered 0.009012 0.09493 0.36
## Number of obs: 1191, groups: hybrid_category, 7
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.008907 0.142041 0.063 0.950
## age_centered 0.062515 0.045984 1.359 0.174
##
## Correlation of Fixed Effects:
## (Intr)
## age_centerd 0.262
participant variance was zero in model 2 after accounting for category and age effects?
age_effect1 <- fixef(model1)["age_centered"]
age_effect2 <- fixef(model2)["age_centered"]
cat("age effects (per year):\n")
## age effects (per year):
cat("Constituent 1:", round(age_effect1, 3), "\n")
## Constituent 1: 0.197
cat("Constituent 2:", round(age_effect2, 3), "\n")
## Constituent 2: 0.063