This analysis tests whether children’s hybrid drawings contain recognizable elements from both constituent concepts using CLIP embeddings and k-way classification. We examine whether hybrid drawings rank their constituent concepts above chance levels when compared to all base concepts in our stimulus set.
# load anonymized hybrid drawing analysis data
data <- read.csv('../data_intermediates/analysis/prolific_k_way_classification_all_children_anonymized.csv')
cat("total hybrid drawings:", nrow(data), "\n")
## total hybrid drawings: 2938
cat("unique participants:", length(unique(data$participant_id)), "\n")
## unique participants: 739
cat("hybrid categories:", length(unique(data$hybrid_category)), "\n")
## hybrid categories: 8
cat("age range:", range(data$age_numeric, na.rm = TRUE), "\n")
## age range: 2 10
data_original <- data # save original if needed
data <- data %>%
filter(age_numeric >= 3)
n_participants <- length(unique(data$participant_id))
age_mean <- round(mean(data$age_numeric), 2)
age_sd <- round(sd(data$age_numeric), 2)
age_min <- min(data$age_numeric)
age_max <- max(data$age_numeric)
n_drawings <- nrow(data)
cat("Sample descriptives after filtering:\n")
## Sample descriptives after filtering:
cat("Children (N =", n_participants, ", ages", age_min, "-", age_max, "years, M =", age_mean, ", SD =", age_sd, ")\n")
## Children (N = 613 , ages 3 - 10 years, M = 5.82 , SD = 2.06 )
cat("Total drawings:", n_drawings, "\n")
## Total drawings: 2513
cat("Drawings per age:\n")
## Drawings per age:
print(table(data$age_numeric))
##
## 3 4 5 6 7 8 9 10
## 338 451 415 484 321 166 135 203
# center age for interpretable intercept
data$age_centered <- scale(data$age_numeric, center = TRUE, scale = FALSE)[,1]
# analysis parameters
n_concepts <- 14 # dinosaur truck not included in base concepts
top1_chance <- 1/n_concepts
top5_chance <- 5/n_concepts
cat("number of base concepts:", n_concepts, "\n")
## number of base concepts: 14
cat("top-1 chance level:", round(top1_chance, 3), "\n")
## top-1 chance level: 0.071
cat("top-5 chance level:", round(top5_chance, 3), "\n")
## top-5 chance level: 0.357
boolean_cols <- c("constituent1_in_top_1", "constituent2_in_top_1",
"constituent1_in_top_5", "constituent2_in_top_5",
"both_constituents_in_top_1", "both_constituents_in_top_5")
data[boolean_cols] <- lapply(data[boolean_cols], as.logical)
summary(data)
## hybrid_category concept1 concept2 participant_id
## Length:2513 Length:2513 Length:2513 Length:2513
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## age age_numeric is_adult centroid_source
## Length:2513 Min. : 3.000 Length:2513 Length:2513
## Class :character 1st Qu.: 4.000 Class :character Class :character
## Mode :character Median : 6.000 Mode :character Mode :character
## Mean : 5.817
## 3rd Qu.: 7.000
## Max. :10.000
##
## concept1_available concept2_available concept1_similarity concept1_rank
## Length:2513 Length:2513 Min. :0.7705 Min. : 1.000
## Class :character Class :character 1st Qu.:0.9080 1st Qu.: 2.000
## Mode :character Mode :character Median :0.9282 Median : 4.000
## Mean :0.9234 Mean : 5.328
## 3rd Qu.:0.9429 3rd Qu.: 8.000
## Max. :0.9770 Max. :14.000
## NA's :322 NA's :322
## constituent1_in_top_1 constituent1_in_top_5 concept2_similarity
## Mode :logical Mode :logical Min. :0.7803
## FALSE:1665 FALSE:904 1st Qu.:0.9009
## TRUE :526 TRUE :1287 Median :0.9233
## NA's :322 NA's :322 Mean :0.9192
## 3rd Qu.:0.9419
## Max. :0.9790
## NA's :322
## concept2_rank constituent2_in_top_1 constituent2_in_top_5
## Min. : 1.000 Mode :logical Mode :logical
## 1st Qu.: 2.000 FALSE:1765 FALSE:1096
## Median : 6.000 TRUE :426 TRUE :1095
## Mean : 5.979 NA's :322 NA's :322
## 3rd Qu.: 9.000
## Max. :14.000
## NA's :322
## both_constituents_in_top_1 both_constituents_in_top_5 drawing_file
## Mode :logical Mode :logical Length:2513
## FALSE:2191 FALSE:1663 Class :character
## NA's :322 TRUE :528 Mode :character
## NA's :322
##
##
##
## age_centered
## Min. :-2.8166
## 1st Qu.:-1.8166
## Median : 0.1834
## Mean : 0.0000
## 3rd Qu.: 1.1834
## Max. : 4.1834
##
age_descriptives <- data %>%
group_by(age_numeric) %>%
summarise(
n_drawings = n(),
n_both_top5 = sum(both_constituents_in_top_5, na.rm = TRUE),
prop_both_top5 = round(mean(both_constituents_in_top_5, na.rm = TRUE), 3),
.groups = 'drop'
)
print(age_descriptives)
## # A tibble: 8 × 4
## age_numeric n_drawings n_both_top5 prop_both_top5
## <dbl> <int> <int> <dbl>
## 1 3 338 63 0.216
## 2 4 451 82 0.207
## 3 5 415 84 0.233
## 4 6 484 104 0.249
## 5 7 321 71 0.254
## 6 8 166 32 0.218
## 7 9 135 34 0.283
## 8 10 203 58 0.324
overall_percent <- round(mean(data$both_constituents_in_top_5, na.rm = TRUE) * 100, 1)
cat(overall_percent, "% of drawings had both constituents in top 5\n")
## 24.1 % of drawings had both constituents in top 5
data$concept1_available <- as.logical(data$concept1_available)
data$concept2_available <- as.logical(data$concept2_available)
cat("After conversion:\n")
## After conversion:
print(table(data$concept1_available, useNA = "always"))
##
## FALSE TRUE <NA>
## 322 2191 0
print(table(data$concept2_available, useNA = "always"))
##
## FALSE TRUE <NA>
## 322 2191 0
data_filtered <- data %>%
filter(concept1_available == TRUE & concept2_available == TRUE)
cat("original data:", nrow(data), "drawings\n")
## original data: 2513 drawings
cat("after filtering for available centroids:", nrow(data_filtered), "drawings\n")
## after filtering for available centroids: 2191 drawings
cat("remaining concept1 values:\n")
## remaining concept1 values:
print(table(data_filtered$concept1))
##
## a bike a mushroom a rabbit a sheep a tiger a train
## 318 349 313 303 269 343
## an elephant
## 296
# use filtered data for all analyses
data <- data_filtered
age_summary <- data %>%
select(participant_id, age_numeric, hybrid_category, concept1, concept2, concept1_similarity, concept2_similarity) %>%
pivot_longer(cols=c('concept1_similarity','concept2_similarity'), names_to = 'concept_comparison', values_to='similarity') %>%
group_by(age_numeric, hybrid_category, concept1, concept2, concept_comparison) %>% # grouping by hybrid_category
summarize(mean_sim = mean(similarity), sd_sim = sd(similarity), num_drawings = n())
average similarity to each base concept across age
ggplot(age_summary, aes(x=age_numeric, y=mean_sim, col=concept_comparison)) +
geom_point(alpha=.8, aes(size=num_drawings)) +
geom_linerange(aes(ymax = (mean_sim + sd_sim), ymin = (mean_sim - sd_sim),col=concept_comparison), alpha=.2) +
facet_wrap(~hybrid_category, nrow=2) + # changed from ~concept1 to ~hybrid_category
theme_few()
age_summary_top_5 <- data %>%
select(participant_id, age_numeric, concept1, concept2, constituent1_in_top_5, constituent2_in_top_5) %>%
mutate(hybrid_category = paste0(str_split_fixed(concept1,' ',2)[,2],'-',str_split_fixed(concept2,' ',2)[,2])) %>%
pivot_longer(cols=c('constituent1_in_top_5','constituent2_in_top_5'), names_to = 'concept_comparison', values_to='top_5_accuracy') %>%
group_by(age_numeric, hybrid_category, concept1, concept2, concept_comparison) %>%
summarize(mean_acc_top_5 = mean(top_5_accuracy), se_acc_top_5 = sd(top_5_accuracy) / n(), num_drawings = n())
age_summary_both_top_5 <- data %>%
select(participant_id, age_numeric, hybrid_category, concept1, concept2, both_constituents_in_top_5) %>%
group_by(age_numeric, hybrid_category, concept1, concept2) %>%
summarize(mean_acc_top_5 = mean(both_constituents_in_top_5), se_acc_top_5 = sd(both_constituents_in_top_5) / n(), num_drawings = n())
age_summary_top_1 <- data %>%
select(participant_id, age_numeric, concept1, concept2, constituent1_in_top_1, constituent2_in_top_1) %>%
mutate(hybrid_category = paste0(str_split_fixed(concept1,' ',2)[,2],'-',str_split_fixed(concept2,' ',2)[,2])) %>%
pivot_longer(cols=c('constituent1_in_top_1','constituent2_in_top_1'), names_to = 'concept_comparison', values_to='top_1_accuracy') %>%
group_by(age_numeric, hybrid_category, concept1, concept2, concept_comparison) %>%
summarize(mean_acc_top_1 = mean(top_1_accuracy), se_acc_top_1 = sd(top_1_accuracy) / n(), num_drawings = n())
average similarity (scaled) to each base concept across age
ggplot(age_summary_top_5, aes(x=age_numeric, y=mean_acc_top_5, col=concept_comparison)) +
geom_point(alpha=.8, aes(size=num_drawings)) +
geom_linerange(aes(ymax = (mean_acc_top_5 + se_acc_top_5), ymin = (mean_acc_top_5 - se_acc_top_5),col=concept_comparison), alpha=.2) +
facet_wrap(~hybrid_category, nrow=2) +
theme_few() +
ylim(0,1) +
ylab('Mean acc top 5') +
xlab('Age in years') +
geom_smooth(method='lm', aes(fill=concept_comparison), alpha=.2) +
geom_hline(yintercept=top5_chance, linetype='dashed', color='grey')
Look at top 1 – funkier!
ggplot(age_summary_top_1, aes(x=age_numeric, y=mean_acc_top_1, col=concept_comparison)) +
geom_point(alpha=.8, aes(size=num_drawings)) +
geom_linerange(aes(ymax = (mean_acc_top_1 + se_acc_top_1), ymin = (mean_acc_top_1 - se_acc_top_1),col=concept_comparison), alpha=.2) +
facet_wrap(~hybrid_category, nrow=2) +
theme_few() +
ylim(0,1) +
ylab('Mean acc top 1') +
xlab('Age in years')
ggplot(age_summary_top_5, aes(x=age_numeric, y=mean_acc_top_5, col=hybrid_category)) +
geom_point(alpha=.8, aes(size=num_drawings)) +
geom_linerange(aes(ymax = (mean_acc_top_5 + se_acc_top_5), ymin = (mean_acc_top_5 - se_acc_top_5),col=hybrid_category), alpha=.2) +
facet_wrap(~concept_comparison, nrow=1) +
theme_few() +
ylim(0,1) +
ylab('Mean acc top 5') +
xlab('Age in years') +
geom_smooth(method='lm', aes(fill=hybrid_category), alpha=.2) +
geom_hline(yintercept=top5_chance, linetype='dashed', color='grey')
ggplot(age_summary_both_top_5, aes(x=age_numeric, y=mean_acc_top_5, col=hybrid_category)) +
geom_point(alpha=.8, aes(size=num_drawings)) +
geom_linerange(aes(ymax = (mean_acc_top_5 + se_acc_top_5), ymin = (mean_acc_top_5 - se_acc_top_5),col=hybrid_category), alpha=.2) +
facet_wrap(~hybrid_category, nrow=2) +
theme_few() +
ylim(0,1) +
ylab('Mean acc both concepts in top 5') +
xlab('Age in years') +
geom_smooth(method='lm', aes(fill=hybrid_category), alpha=.2)
We use general linear mixed-effects models to test whether each constituent ranks in the top-5 above chance levels, controlling for age and accounting for clustering by participant and hybrid category.
#add chance level
individual_chance = 5/14 # top-5 chance level
data$individual_chance <- individual_chance
# fit logistic mixed-effects models
model1 <- glmer(constituent1_in_top_5 ~ age_centered +
offset(qlogis(individual_chance))+
(age_centered | hybrid_category) +
(1 | participant_id),
data = data, family = binomial,
control = glmerControl(optimizer = "bobyqa"))
summary(model1)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## constituent1_in_top_5 ~ age_centered + offset(qlogis(individual_chance)) +
## (age_centered | hybrid_category) + (1 | participant_id)
## Data: data
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik -2*log(L) df.resid
## 2869.8 2904.0 -1428.9 2857.8 2185
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.1293 -0.9862 0.5860 0.8020 1.5150
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## participant_id (Intercept) 0.156013 0.39498
## hybrid_category (Intercept) 0.221825 0.47098
## age_centered 0.001329 0.03645 -0.50
## Number of obs: 2191, groups: participant_id, 600; hybrid_category, 7
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.96628 0.18480 5.229 1.71e-07 ***
## age_centered 0.11150 0.02805 3.975 7.03e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## age_centerd -0.226
model2 <- glmer(constituent2_in_top_5 ~ age_centered +
offset(qlogis(individual_chance))+
(age_centered | hybrid_category) +
(1 | participant_id),
data = data, family = binomial,
control = glmerControl(optimizer = "bobyqa"))
summary(model2)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## constituent2_in_top_5 ~ age_centered + offset(qlogis(individual_chance)) +
## (age_centered | hybrid_category) + (1 | participant_id)
## Data: data
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik -2*log(L) df.resid
## 3003.0 3037.2 -1495.5 2991.0 2185
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.6950 -0.9434 -0.6961 0.9933 1.4147
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## participant_id (Intercept) 0.021861 0.14786
## hybrid_category (Intercept) 0.101631 0.31880
## age_centered 0.005331 0.07302 0.83
## Number of obs: 2191, groups: participant_id, 600; hybrid_category, 7
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.59691 0.12838 4.649 3.33e-06 ***
## age_centered 0.04104 0.03508 1.170 0.242
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## age_centerd 0.616
cat("Model 1 convergence:", model1@optinfo$conv$opt, "\n")
## Model 1 convergence: 0
cat("Model 2 convergence:", model2@optinfo$conv$opt, "\n")
## Model 2 convergence: 0
both_chance <- (5*4) / (14*13)
data$both_chance <- both_chance
both_in_top5 <- glmer(both_constituents_in_top_5 ~
offset(qlogis(both_chance)) + # This adjusts intercept to chance level
age_centered +
(age_centered | hybrid_category) +
(1 | participant_id),
data = data, family = binomial,
control = glmerControl(optimizer = "bobyqa"))
summary(both_in_top5)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## both_constituents_in_top_5 ~ offset(qlogis(both_chance)) + age_centered +
## (age_centered | hybrid_category) + (1 | participant_id)
## Data: data
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik -2*log(L) df.resid
## 2154.4 2188.5 -1071.2 2142.4 2185
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.1045 -0.6184 -0.3607 -0.0830 10.9464
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## participant_id (Intercept) 0.12531 0.3540
## hybrid_category (Intercept) 1.74568 1.3212
## age_centered 0.01173 0.1083 -0.85
## Number of obs: 2191, groups: participant_id, 600; hybrid_category, 7
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.5253 0.5074 1.035 0.3006
## age_centered 0.1262 0.0534 2.363 0.0181 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## age_centerd -0.675
data_6plus <- data %>%
filter(age_numeric >= 6)
cat("6 and up sample:\n")
## 6 and up sample:
cat("N children:", length(unique(data_6plus$participant_id)), "\n")
## N children: 288
cat("N drawings:", nrow(data_6plus), "\n")
## N drawings: 1142
cat("Age range:", range(data_6plus$age_numeric), "\n")
## Age range: 6 10
cat("Mean age:", round(mean(data_6plus$age_numeric), 2), "\n")
## Mean age: 7.44
age_desc_6plus <- data_6plus %>%
group_by(age_numeric) %>%
summarise(
n_drawings = n(),
prop_both_top5 = round(mean(both_constituents_in_top_5, na.rm = TRUE), 3),
.groups = 'drop'
)
print(age_desc_6plus)
## # A tibble: 5 × 3
## age_numeric n_drawings prop_both_top5
## <dbl> <int> <dbl>
## 1 6 417 0.249
## 2 7 279 0.254
## 3 8 147 0.218
## 4 9 120 0.283
## 5 10 179 0.324
overall_6plus <- round(mean(data_6plus$both_constituents_in_top_5, na.rm = TRUE), 3)
cat("Overall proportion for 6 and up:", overall_6plus, "\n")
## Overall proportion for 6 and up: 0.262
# center age for 6+ sample
data_6plus$age_centered <- scale(data_6plus$age_numeric, center = TRUE, scale = FALSE)[,1]
# both constituents model with offset - ages 6+
both_6plus_offset <- glmer(both_constituents_in_top_5 ~
offset(qlogis(both_chance)) +
age_centered +
(age_centered | hybrid_category) +
(1 | participant_id),
data = data_6plus, family = binomial,
control = glmerControl(optimizer = "bobyqa"))
summary(both_6plus_offset)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## both_constituents_in_top_5 ~ offset(qlogis(both_chance)) + age_centered +
## (age_centered | hybrid_category) + (1 | participant_id)
## Data: data_6plus
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik -2*log(L) df.resid
## 1203.4 1233.6 -595.7 1191.4 1136
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.0374 -0.6450 -0.4547 0.9769 7.5452
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## participant_id (Intercept) 0.06994 0.2645
## hybrid_category (Intercept) 1.60821 1.2682
## age_centered 0.02092 0.1447 -1.00
## Number of obs: 1142, groups: participant_id, 288; hybrid_category, 7
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.7157 0.4932 1.451 0.1467
## age_centered 0.1535 0.0819 1.874 0.0609 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## age_centerd -0.702
## optimizer (bobyqa) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')
intercept_6plus <- fixef(both_6plus_offset)["(Intercept)"]
intercept_se_6plus <- sqrt(diag(vcov(both_6plus_offset)))["(Intercept)"]
age_effect_6plus <- fixef(both_6plus_offset)["age_centered"]
cat("Intercept (above-chance performance):", round(intercept_6plus, 3), "\n")
## Intercept (above-chance performance): 0.716
cat("Age effect:", round(age_effect_6plus, 3), "\n")
## Age effect: 0.153
Lmers on normalized rank values give same results, as per graphs, for individual concepts
rank_model2 <- lmer(scale(concept2_rank) ~ age_centered +
(age_centered | hybrid_category) +
(1 | participant_id),
data = data)
summary(rank_model2)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula:
## scale(concept2_rank) ~ age_centered + (age_centered | hybrid_category) +
## (1 | participant_id)
## Data: data
##
## REML criterion at convergence: 6077.3
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.86941 -0.91005 0.03783 0.81875 2.11901
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## participant_id (Intercept) 0.009920 0.09960
## hybrid_category (Intercept) 0.083219 0.28848
## age_centered 0.001599 0.03999 0.20
## Residual 0.911840 0.95490
## Number of obs: 2191, groups: participant_id, 600; hybrid_category, 7
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -0.01056 0.11104 6.01439 -0.095 0.927
## age_centered -0.02582 0.01822 6.33809 -1.417 0.204
##
## Correlation of Fixed Effects:
## (Intr)
## age_centerd 0.162
rank_model1 <- lmer(scale(concept1_rank) ~ age_centered +
(age_centered | hybrid_category) +
(1 | participant_id),
data = data)
summary(rank_model1)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula:
## scale(concept1_rank) ~ age_centered + (age_centered | hybrid_category) +
## (1 | participant_id)
## Data: data
##
## REML criterion at convergence: 6013.4
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.0454 -0.8004 -0.1998 0.8099 2.3462
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## participant_id (Intercept) 0.0564222 0.23753
## hybrid_category (Intercept) 0.0844257 0.29056
## age_centered 0.0006883 0.02624 -1.00
## Residual 0.8475031 0.92060
## Number of obs: 2191, groups: participant_id, 600; hybrid_category, 7
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 0.01795 0.11213 6.11735 0.160 0.877976
## age_centered -0.07136 0.01473 10.11918 -4.845 0.000654 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## age_centerd -0.658
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')
m1_coef <- summary(model1)$coefficients["age_centered", ]
m2_coef <- summary(model2)$coefficients["age_centered", ]
category_stats <- data %>%
group_by(hybrid_category) %>%
summarise(
n = n(),
prop_both_top5 = mean(both_constituents_in_top_5),
prop_c1_top5 = mean(constituent1_in_top_5),
prop_c2_top5 = mean(constituent2_in_top_5),
.groups = 'drop'
)
(Constituent 1: B = 0.11, SE = 0.03, Z = 3.98, p = 7e-05; Constituent 2: B = 0.04, SE = 0.04, Z = 1.17, p = 0.24)
(e.g., tiger-frog: 45% of drawings had both constituents in top 5) while others showed trade-offs between constituents (e.g., rabbit-boat: 44% for rabbit, 35% for boat, but only 1% jointly)
library(ggthemes)
clean_category <- function(x) {
x %>%
str_remove("^an? ") %>%
str_replace_all(" ", "-")
}
# base_plot <- ggplot(panel_a_data,
# aes(x = age_numeric, y = prop_top_5)) +
# geom_hline(yintercept = 5/14, linetype = "dashed", color = "grey50") +
# facet_wrap(~hybrid_category, nrow = 1) +
# scale_color_manual(values = c("Constituent 1" = "#E74C3C", "Constituent 2" = "#3498DB")) +
# scale_fill_manual(values = c("Constituent 1" = "#E74C3C", "Constituent 2" = "#3498DB")) +
# scale_size_continuous(range = c(0.5, 2)) +
# scale_x_continuous(limits = range(panel_a_data$age_numeric)) +
# ylim(0, 1) +
# labs(
# x = "Age (years)",
# y = "Proportion with Constituent in Top 5",
# color = "Constituent",
# fill = "Constituent",
# size = "Number of\nDrawings"
# ) +
# theme_few() +
# theme(
# legend.position = "right",
# aspect.ratio = 1,
# strip.text = element_text(size = 8),
# axis.title = element_text(size = 9),
# axis.text = element_text(size = 11),
# legend.title = element_text(size = 8),
# legend.text = element_text(size = 7)
# )
#
# # frame 1: axes and chance line only
# frame1 <- base_plot
# ggsave("~/Documents/GitHub/exploratory-drawing-analysis/writing/constituent_age_trend_frame1.svg",
# frame1, width = 10, height = 6)
#
# # frame 2: add constituent 1
# c1_data <- panel_a_data %>% filter(constituent == "Constituent 1")
#
# frame2 <- base_plot +
# geom_point(data = c1_data, aes(color = constituent, size = n_drawings), alpha = 0.8) +
# geom_linerange(data = c1_data,
# aes(ymin = prop_top_5 - se, ymax = prop_top_5 + se, color = constituent),
# alpha = 0.3) +
# geom_smooth(data = c1_data,
# aes(color = constituent, fill = constituent),
# method = "lm", alpha = 0.2, se = TRUE)
# ggsave("~/Documents/GitHub/exploratory-drawing-analysis/writing/constituent_age_trend_frame2.svg",
# frame2, width = 10, height = 6)
#
# # frame 3: add both constituents
# frame3 <- base_plot +
# geom_point(aes(color = constituent, size = n_drawings), alpha = 0.8) +
# geom_linerange(aes(ymin = prop_top_5 - se, ymax = prop_top_5 + se, color = constituent),
# alpha = 0.3) +
# geom_smooth(aes(color = constituent, fill = constituent),
# method = "lm", alpha = 0.2, se = TRUE)
# ggsave("~/Documents/GitHub/exploratory-drawing-analysis/writing/constituent_age_trend_frame3.svg",
# frame3, width = 10, height = 6)
category_data <- data %>%
mutate(hybrid_category = clean_category(hybrid_category)) %>%
group_by(age_numeric, hybrid_category) %>%
summarise(
prop_both_top_5 = mean(both_constituents_in_top_5),
.groups = "drop"
)
collapsed_data <- data %>%
group_by(age_numeric) %>%
summarise(
prop_both_top_5 = mean(both_constituents_in_top_5),
se = sd(both_constituents_in_top_5) / sqrt(n()),
n_drawings = n(),
.groups = "drop"
)
x_limits <- range(collapsed_data$age_numeric)
# frame 1: axes and chance line only
both_frame1 <- ggplot() +
geom_hline(yintercept = (5/14)^2, linetype = "dashed", color = "grey50") +
scale_x_continuous(limits = x_limits, expand = c(0.01, 0.01)) +
ylim(0, 1) +
labs(
x = "Age (years)",
y = "Proportion with Both Constituents in Top 5",
size = "Number of\nDrawings"
) +
theme_few() +
theme(
legend.position = "right",
axis.title = element_text(size = 12),
axis.text = element_text(size = 11)
)
ggsave("~/Documents/GitHub/exploratory-drawing-analysis/writing/overall_age_trend_frame1.svg",
both_frame1, width = 6, height = 6)
# frame 2: full plot
both_frame2 <- ggplot() +
geom_hline(yintercept = (5/14)^2, linetype = "dashed", color = "grey50") +
geom_smooth(data = category_data,
aes(x = age_numeric, y = prop_both_top_5, color = hybrid_category,
fill = hybrid_category),
method = "lm", se = TRUE, alpha = 0.1, linewidth = 0.7) +
geom_smooth(data = collapsed_data,
aes(x = age_numeric, y = prop_both_top_5),
method = "lm", color = "black", fill = "grey30", alpha = 0.3,
linewidth = 1.2, se = TRUE) +
geom_point(data = collapsed_data,
aes(x = age_numeric, y = prop_both_top_5, size = n_drawings),
alpha = 0.8) +
scale_size_continuous(range = c(2, 5)) +
scale_color_viridis_d(name = "Hybrid Category") +
scale_fill_viridis_d(name = "Hybrid Category") +
scale_x_continuous(limits = x_limits, expand = c(0.01, 0.01)) +
ylim(0, 1) +
labs(
x = "Age (years)",
y = "Proportion with Both Constituents in Top 5",
size = "Number of\nDrawings"
) +
theme_few() +
theme(
legend.position = "right",
axis.title = element_text(size = 12),
axis.text = element_text(size = 11)
) +
guides(fill = guide_legend(), color = guide_legend())
ggsave("~/Documents/GitHub/exploratory-drawing-analysis/writing/overall_age_trend_frame2.svg",
both_frame2, width = 8, height = 6)
data %>%
mutate(hybrid_category_clean = clean_category(hybrid_category)) %>%
ggplot(aes(x = concept1_rank, y = concept2_rank, color = hybrid_category_clean)) +
geom_point(alpha = 0.6, size = 2) +
geom_hline(yintercept = 5, color = "black", linewidth = 0.5) +
geom_vline(xintercept = 5, color = "black", linewidth = 0.5) +
facet_wrap(~hybrid_category_clean, nrow = 1) +
coord_fixed(ratio = 1) +
scale_color_viridis_d() +
labs(
x = "Constituent 1 Classification Rank",
y = "Constituent 2 Classification Rank"
) +
theme_few() +
theme(
legend.position = "none",
strip.text = element_text(size = 8),
axis.title = element_text(size = 9),
axis.text = element_text(size = 9)
)
ggsave("~/Documents/GitHub/exploratory-drawing-analysis/writing/classification_rank.svg")
# library(patchwork)
# p1 <- ggplot(panel_a_data,
# aes(x = age_numeric, y = prop_top_5, color = constituent)) +
# geom_hline(yintercept = 5/14, linetype = "dashed", color = "grey50") +
# geom_point(aes(size = n_drawings), alpha = 0.8) +
# geom_linerange(aes(ymin = prop_top_5 - se, ymax = prop_top_5 + se), alpha = 0.3) +
# geom_smooth(method = "lm", aes(fill = constituent), alpha = 0.2, se = TRUE) +
# facet_wrap(~hybrid_category, nrow = 1) +
# scale_color_manual(values = c("Constituent 1" = "#E74C3C", "Constituent 2" = "#3498DB")) +
# scale_fill_manual(values = c("Constituent 1" = "#E74C3C", "Constituent 2" = "#3498DB")) +
# scale_size_continuous(range = c(1, 3)) +
# ylim(0, 1) +
# labs(
# x = "Age (years)",
# y = "Proportion with Constituent in Top 5",
# color = "Constituent",
# fill = "Constituent",
# size = "Number of\nDrawings"
# ) +
# theme_few() +
# theme(
# legend.position = "right",
# aspect.ratio = 1,
# strip.text = element_text(size = 9),
# axis.title = element_text(size = 8),
# axis.text = element_text(size = 6),
# legend.title = element_text(size = 8),
# legend.text = element_text(size = 7)
# )
#
# p2 <- data %>%
# mutate(hybrid_category_clean = clean_category(hybrid_category)) %>%
# ggplot(aes(x = concept1_rank, y = concept2_rank, color = hybrid_category_clean)) +
# geom_point(alpha = 0.6, size = 2.8) +
# geom_hline(yintercept = 5, color = "black", linewidth = 0.5) +
# geom_vline(xintercept = 5, color = "black", linewidth = 0.5) +
# facet_wrap(~hybrid_category_clean, nrow = 1) +
# coord_fixed(ratio = 1) +
# scale_color_viridis_d() +
# labs(
# x = "Constituent 1 Classification Rank",
# y = "Constituent 2 Classification Rank"
# ) +
# theme_few() +
# theme(
# legend.position = "none",
# strip.text = element_text(size = 9),
# axis.title = element_text(size = 8),
# axis.text = element_text(size = 6)
# )
#
# p1 / p2 + plot_annotation(tag_levels = "A")