Setup

library(here)

## here() starts at /Users/brialong/Documents/GitHub/birch-draw

library(tidyverse)

## Warning: package 'ggplot2' was built under R version 4.3.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Import data

drawings = read_csv(file=here::here('data/preprocessed/metadata_with_ages_cossim_images.csv'))

## Rows: 636 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (6): row_id, session_id, text1, participant_id, bounding_box, particip...
## dbl  (10): trial_num, submit_time, start_time, trial_duration, num_strokes, ...
## dttm  (2): submit_date, submit_date_readable
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

descriptions = read_csv(file=here::here('data/preprocessed/metadata_with_ages_cossim_transcripts.csv'))

## Rows: 554 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): SubID, target_category, utterance_ns, utterance_masked, interference
## dbl (8): version, age_months, age, age_approximated, age_discrepency, exclud...
## lgl (2): Unnamed: 5, Unnamed: 6
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Import and clean data

clean up descriptions

descriptions_clean <- descriptions %>%
  select(SubID, age_months, txt_cossim_to_adults, utterance_ns, target_category) %>%
  rename(category = target_category, sub_id = SubID, similarity = txt_cossim_to_adults) %>%
  mutate(mode = 'description')  %>%
  mutate(length = str_count(utterance_ns, "\\S+")) %>%
  select(-utterance_ns) 

descriptions_clean$word_length_scaled = scale(descriptions_clean$length)[,1]

clean up drawings preprocessed data

drawings_clean <- drawings %>%
  select(participant_id_clean, age_months, img_cossim_to_adults, text1, trial_duration) %>%
  rename(category = text1, sub_id = participant_id_clean, similarity = img_cossim_to_adults, length = trial_duration) %>%
  mutate(category = str_split_fixed(category,'_',2)[,2]) %>%
  mutate(mode = 'drawing') 
  
drawings_clean$drawing_length_scaled = scale(drawings_clean$length)[,1]

merge

all_data <- descriptions_clean %>%
  full_join(drawings_clean)

## Joining with `by = join_by(sub_id, age_months, similarity, category, mode,
## length)`

write_csv(all_data,file=here::here('data/preprocessed/merged_cleaned_data.csv'))

library(assertthat)

## 
## Attaching package: 'assertthat'

## The following object is masked from 'package:tibble':
## 
##     has_name

assert_that(sum(is.na(all_data$similarity))==0)

## [1] TRUE

Wide form data

all_data_wide <- drawings_clean %>%
 select(-length,-mode) %>%
  rename(drawing_similarity = similarity) %>%
  right_join(descriptions_clean %>% rename(description_similarity = similarity) %>% select(-length,-mode), by=c('sub_id','age_months','category'))

Descriptive plots

library(ggthemes)
ggplot(all_data %>% filter(age_months<200), aes(x=age_months, y=similarity, col=mode))+
  geom_point(alpha=.2) +
  geom_smooth(method='lm') +
  facet_wrap(~category) +
  ylab('Similarity to adults') +
  xlab('Age in months') +
  theme_few()

## `geom_smooth()` using formula = 'y ~ x'

Drawing vs. description similarity by category

library(viridis)

## Loading required package: viridisLite

ggplot(all_data_wide %>% filter(age_months<200), aes(x=drawing_similarity, y=description_similarity, col=age_months))+
  geom_point() +
  # scale_color_viridis() +
  geom_smooth(method='lm', aes(group=category)) +
  facet_wrap(~as.factor(category)) +
  ylab('Similarity to adults') +
  xlab('Age in months') +
  theme_few()

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 8 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_point()`).

ggplot(all_data_wide %>% filter(age_months<200), aes(x=drawing_length_scaled, y=drawing_similarity, col=category))+
  geom_point() +
  geom_smooth(method='lm') +
  facet_wrap(~category) +
  ylab('Similarity to adults') +
  xlab('Drawing length (scaled)') +
  theme_few()

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 8 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_point()`).

Word length vs description similarity

ggplot(all_data_wide %>% filter(age_months<200), aes(x=word_length_scaled, y=description_similarity, col=category))+
  geom_point() +
  geom_smooth(method='lm') +
  facet_wrap(~category) +
  ylab('Similarity to adults') +
  xlab('Utterance length (scaled word count)') +
  theme_few()

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 9 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 9 rows containing missing values or values outside the scale range
## (`geom_point()`).

Subject-level effects

sub_data <- all_data_wide %>%
  group_by(sub_id) %>%
  filter(age_months<200) %>%
  mutate(avg_drawing_similarity = mean(drawing_similarity), avg_description_similarity = mean(description_similarity), age_months = age_months[1])

ggplot(sub_data, aes(x=avg_drawing_similarity, y=avg_description_similarity, col=age_months)) +
  geom_point() +
  geom_smooth(method='lm') +
  theme_few() +
  ylab('Description similarity') +
  xlab('Drawing simialrity')

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 24 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

## Warning: Removed 24 rows containing missing values or values outside the scale range
## (`geom_point()`).

Explore whether subject-level differences relate to age?

sub_data <- sub_data %>%
  mutate(mode_similarity = (drawing_similarity-description_similarity))

ggplot(sub_data, aes(x=age_months, y=mode_similarity, col=age_months)) +
  geom_point() +
  geom_smooth(method='lm') +
  theme_few() + 
  ylab('Drawing - language similarity by subject')

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 8 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_point()`).

Run inferential stats

library(lme4)

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

library(lmerTest)

## 
## Attaching package: 'lmerTest'

## The following object is masked from 'package:lme4':
## 
##     lmer

## The following object is masked from 'package:stats':
## 
##     step

lmer_mode = lmer(data=all_data_wide, drawing_similarity ~ description_similarity*age_months + word_length_scaled + drawing_length_scaled +  (1|sub_id) + (1|category))

summary(lmer_mode)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: 
## drawing_similarity ~ description_similarity * age_months + word_length_scaled +  
##     drawing_length_scaled + (1 | sub_id) + (1 | category)
##    Data: all_data_wide
## 
## REML criterion at convergence: -2.6
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.4796 -0.6385  0.0510  0.6994  2.1320 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  sub_id   (Intercept) 0.005295 0.07276 
##  category (Intercept) 0.006218 0.07886 
##  Residual             0.047868 0.21879 
## Number of obs: 538, groups:  sub_id, 100; category, 6
## 
## Fixed effects:
##                                     Estimate Std. Error         df t value
## (Intercept)                       -3.148e-02  5.096e-02  2.547e+01  -0.618
## description_similarity             3.272e-01  9.964e-02  4.522e+02   3.284
## age_months                         2.284e-03  3.884e-04  3.993e+02   5.882
## word_length_scaled                 1.702e-03  1.232e-02  2.575e+02   0.138
## drawing_length_scaled             -1.238e-02  1.242e-02  3.258e+02  -0.997
## description_similarity:age_months -1.891e-03  7.768e-04  4.889e+02  -2.435
##                                   Pr(>|t|)    
## (Intercept)                         0.5422    
## description_similarity              0.0011 ** 
## age_months                        8.59e-09 ***
## word_length_scaled                  0.8902    
## drawing_length_scaled               0.3194    
## description_similarity:age_months   0.0153 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) dscrp_ ag_mnt wrd_l_ drwn__
## dscrptn_sml -0.512                            
## age_months  -0.685  0.544                     
## wrd_lngth_s  0.260 -0.190 -0.255              
## drwng_lngt_ -0.204  0.006  0.203 -0.195       
## dscrptn_s:_  0.563 -0.815 -0.831  0.121  0.004

Get out descirptives for rmarkdown

kid_data = all_data %>% filter(age_months<200)
num_kids = length(unique(kid_data$sub_id))

mean_age = kid_data %>%
  distinct(sub_id, age_months) %>%
  summarize(mean_age = mean(age_months)/12) %>%
  pull(mean_age)

sd_age = kid_data %>%
  distinct(sub_id, age_months) %>%
  summarize(sd_age = sd(age_months)/12) %>%
  pull(sd_age)

Children’s conceptual representations are inherently multimodal, integrating information from perception, language, and interaction into a coherent, flexible representation. To what extent to these modalities overlap? Here, we investigate the expression of two of these modalities, vision and language, by quantifying the semantic overlap across children’s drawings and descriptions of fine-grained visual concepts. Using a tablet interface with a researcher present at a local aquarium, we asked children (N = 91, ages 2 to 11 years, M = mean_age, SD = 2.3982566) to draw and describe six related concepts (i.e. sea animals) by interacting with a novice agent (i.e. “Zorpie”, a robot alien who just landed on earth).

Using CLIP embeddings, we tested two hypotheses about the overlap between children’s drawings and descriptions. First, we examined whether children’s drawings and descriptions both reflect gradually accurate expressions across development. Replicating prior work exclusively in drawings, we find both children’s drawings and descriptions reflect gradual accuracy as children slowly accumulate increasingly fine grained representations of related visual concepts (stats here idk ). Second, as an initial assessment of cross‐modal semantic alignment, we examine whether, within individual children, average drawing accuracy is correlated with average description accuracy. We find XX These findings suggest that distinct partial knowledge is expressed through each modality independently. We seek to quantify these differences further in future analyses to examine modality-dependent idiosyncratic and time-based analyses of children’s multimodal representations.

bria-analyses

Bria Long

2025-12-16

Setup

Import and clean data

Descriptive plots

Run inferential stats