library(here)
## here() starts at /Users/brialong/Documents/GitHub/birch-draw
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Import data
drawings = read_csv(file=here::here('data/preprocessed/metadata_with_ages_cossim_images.csv'))
## Rows: 636 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): row_id, session_id, text1, participant_id, bounding_box, particip...
## dbl (10): trial_num, submit_time, start_time, trial_duration, num_strokes, ...
## dttm (2): submit_date, submit_date_readable
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
descriptions = read_csv(file=here::here('data/preprocessed/metadata_with_ages_cossim_transcripts.csv'))
## Rows: 554 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): SubID, target_category, utterance_ns, utterance_masked, interference
## dbl (8): version, age_months, age, age_approximated, age_discrepency, exclud...
## lgl (2): Unnamed: 5, Unnamed: 6
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
clean up descriptions
descriptions_clean <- descriptions %>%
select(SubID, age_months, txt_cossim_to_adults, utterance_ns, target_category) %>%
rename(category = target_category, sub_id = SubID, similarity = txt_cossim_to_adults) %>%
mutate(mode = 'description') %>%
mutate(length = str_count(utterance_ns, "\\S+")) %>%
select(-utterance_ns)
descriptions_clean$word_length_scaled = scale(descriptions_clean$length)[,1]
clean up drawings preprocessed data
drawings_clean <- drawings %>%
select(participant_id_clean, age_months, img_cossim_to_adults, text1, trial_duration) %>%
rename(category = text1, sub_id = participant_id_clean, similarity = img_cossim_to_adults, length = trial_duration) %>%
mutate(category = str_split_fixed(category,'_',2)[,2]) %>%
mutate(mode = 'drawing')
drawings_clean$drawing_length_scaled = scale(drawings_clean$length)[,1]
merge
all_data <- descriptions_clean %>%
full_join(drawings_clean)
## Joining with `by = join_by(sub_id, age_months, similarity, category, mode,
## length)`
write_csv(all_data,file=here::here('data/preprocessed/merged_cleaned_data.csv'))
library(assertthat)
##
## Attaching package: 'assertthat'
## The following object is masked from 'package:tibble':
##
## has_name
assert_that(sum(is.na(all_data$similarity))==0)
## [1] TRUE
Wide form data
all_data_wide <- drawings_clean %>%
select(-length,-mode) %>%
rename(drawing_similarity = similarity) %>%
right_join(descriptions_clean %>% rename(description_similarity = similarity) %>% select(-length,-mode), by=c('sub_id','age_months','category'))
library(ggthemes)
ggplot(all_data %>% filter(age_months<200), aes(x=age_months, y=similarity, col=mode))+
geom_point(alpha=.2) +
geom_smooth(method='lm') +
facet_wrap(~category) +
ylab('Similarity to adults') +
xlab('Age in months') +
theme_few()
## `geom_smooth()` using formula = 'y ~ x'
Drawing vs. description similarity by category
library(viridis)
## Loading required package: viridisLite
ggplot(all_data_wide %>% filter(age_months<200), aes(x=drawing_similarity, y=description_similarity, col=age_months))+
geom_point() +
# scale_color_viridis() +
geom_smooth(method='lm', aes(group=category)) +
facet_wrap(~as.factor(category)) +
ylab('Similarity to adults') +
xlab('Age in months') +
theme_few()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 8 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_point()`).
ggplot(all_data_wide %>% filter(age_months<200), aes(x=drawing_length_scaled, y=drawing_similarity, col=category))+
geom_point() +
geom_smooth(method='lm') +
facet_wrap(~category) +
ylab('Similarity to adults') +
xlab('Drawing length (scaled)') +
theme_few()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 8 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_point()`).
Word length vs description similarity
ggplot(all_data_wide %>% filter(age_months<200), aes(x=word_length_scaled, y=description_similarity, col=category))+
geom_point() +
geom_smooth(method='lm') +
facet_wrap(~category) +
ylab('Similarity to adults') +
xlab('Utterance length (scaled word count)') +
theme_few()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 9 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 9 rows containing missing values or values outside the scale range
## (`geom_point()`).
Subject-level effects
sub_data <- all_data_wide %>%
group_by(sub_id) %>%
filter(age_months<200) %>%
mutate(avg_drawing_similarity = mean(drawing_similarity), avg_description_similarity = mean(description_similarity), age_months = age_months[1])
ggplot(sub_data, aes(x=avg_drawing_similarity, y=avg_description_similarity, col=age_months)) +
geom_point() +
geom_smooth(method='lm') +
theme_few() +
ylab('Description similarity') +
xlab('Drawing simialrity')
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 24 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning: Removed 24 rows containing missing values or values outside the scale range
## (`geom_point()`).
Explore whether subject-level differences relate to age?
sub_data <- sub_data %>%
mutate(mode_similarity = (drawing_similarity-description_similarity))
ggplot(sub_data, aes(x=age_months, y=mode_similarity, col=age_months)) +
geom_point() +
geom_smooth(method='lm') +
theme_few() +
ylab('Drawing - language similarity by subject')
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 8 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_point()`).
library(lme4)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
library(lmerTest)
##
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
##
## lmer
## The following object is masked from 'package:stats':
##
## step
lmer_mode = lmer(data=all_data_wide, drawing_similarity ~ description_similarity*age_months + word_length_scaled + drawing_length_scaled + (1|sub_id) + (1|category))
summary(lmer_mode)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula:
## drawing_similarity ~ description_similarity * age_months + word_length_scaled +
## drawing_length_scaled + (1 | sub_id) + (1 | category)
## Data: all_data_wide
##
## REML criterion at convergence: -2.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.4796 -0.6385 0.0510 0.6994 2.1320
##
## Random effects:
## Groups Name Variance Std.Dev.
## sub_id (Intercept) 0.005295 0.07276
## category (Intercept) 0.006218 0.07886
## Residual 0.047868 0.21879
## Number of obs: 538, groups: sub_id, 100; category, 6
##
## Fixed effects:
## Estimate Std. Error df t value
## (Intercept) -3.148e-02 5.096e-02 2.547e+01 -0.618
## description_similarity 3.272e-01 9.964e-02 4.522e+02 3.284
## age_months 2.284e-03 3.884e-04 3.993e+02 5.882
## word_length_scaled 1.702e-03 1.232e-02 2.575e+02 0.138
## drawing_length_scaled -1.238e-02 1.242e-02 3.258e+02 -0.997
## description_similarity:age_months -1.891e-03 7.768e-04 4.889e+02 -2.435
## Pr(>|t|)
## (Intercept) 0.5422
## description_similarity 0.0011 **
## age_months 8.59e-09 ***
## word_length_scaled 0.8902
## drawing_length_scaled 0.3194
## description_similarity:age_months 0.0153 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) dscrp_ ag_mnt wrd_l_ drwn__
## dscrptn_sml -0.512
## age_months -0.685 0.544
## wrd_lngth_s 0.260 -0.190 -0.255
## drwng_lngt_ -0.204 0.006 0.203 -0.195
## dscrptn_s:_ 0.563 -0.815 -0.831 0.121 0.004
Get out descirptives for rmarkdown
kid_data = all_data %>% filter(age_months<200)
num_kids = length(unique(kid_data$sub_id))
mean_age = kid_data %>%
distinct(sub_id, age_months) %>%
summarize(mean_age = mean(age_months)/12) %>%
pull(mean_age)
sd_age = kid_data %>%
distinct(sub_id, age_months) %>%
summarize(sd_age = sd(age_months)/12) %>%
pull(sd_age)
Children’s conceptual representations are inherently multimodal,
integrating information from perception, language, and interaction into
a coherent, flexible representation. To what extent to these modalities
overlap? Here, we investigate the expression of two of these modalities,
vision and language, by quantifying the semantic overlap across
children’s drawings and descriptions of fine-grained visual concepts.
Using a tablet interface with a researcher present at a local aquarium,
we asked children (N = 91, ages 2 to 11 years, M =
mean_age, SD = 2.3982566) to draw and describe six related
concepts (i.e. sea animals) by interacting with a novice agent
(i.e. “Zorpie”, a robot alien who just landed on earth).
Using CLIP embeddings, we tested two hypotheses about the overlap between children’s drawings and descriptions. First, we examined whether children’s drawings and descriptions both reflect gradually accurate expressions across development. Replicating prior work exclusively in drawings, we find both children’s drawings and descriptions reflect gradual accuracy as children slowly accumulate increasingly fine grained representations of related visual concepts (stats here idk ). Second, as an initial assessment of cross‐modal semantic alignment, we examine whether, within individual children, average drawing accuracy is correlated with average description accuracy. We find XX These findings suggest that distinct partial knowledge is expressed through each modality independently. We seek to quantify these differences further in future analyses to examine modality-dependent idiosyncratic and time-based analyses of children’s multimodal representations.