Setup

library(here)

## here() starts at /Users/brialong/Documents/GitHub/birch-draw

library(tidyverse)

## Warning: package 'ggplot2' was built under R version 4.3.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# for models
library(lme4)

## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

library(lmerTest)

## 
## Attaching package: 'lmerTest'
## 
## The following object is masked from 'package:lme4':
## 
##     lmer
## 
## The following object is masked from 'package:stats':
## 
##     step

library(ggthemes)

Import data

drawings = read_csv(file=here::here('data/preprocessed/metadata_with_ages_cossim_images.csv'))

## Rows: 636 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (6): row_id, session_id, text1, participant_id, bounding_box, particip...
## dbl  (10): trial_num, submit_time, start_time, trial_duration, num_strokes, ...
## dttm  (2): submit_date, submit_date_readable
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

descriptions = read_csv(file=here::here('data/preprocessed/metadata_with_ages_cossim_transcripts.csv'))

## Rows: 554 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): SubID, target_category, utterance_ns, utterance_masked, interference
## dbl (8): version, age_months, age, age_approximated, age_discrepency, exclud...
## lgl (2): Unnamed: 5, Unnamed: 6
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Import and clean data

clean up descriptions

descriptions_clean <- descriptions %>%
  select(SubID, age_months, txt_cossim_to_adults, utterance_ns, target_category) %>%
  rename(category = target_category, sub_id = SubID, similarity = txt_cossim_to_adults) %>%
  mutate(mode = 'description')  %>%
  mutate(length = str_count(utterance_ns, "\\S+")) %>%
  select(-utterance_ns) 

descriptions_clean$word_length_scaled = scale(descriptions_clean$length)[,1]

clean up drawings preprocessed data

drawings_clean <- drawings %>%
  select(participant_id_clean, age_months, img_cossim_to_adults, text1, trial_duration) %>%
  rename(category = text1, sub_id = participant_id_clean, similarity = img_cossim_to_adults, length = trial_duration) %>%
  mutate(category = str_split_fixed(category,'_',2)[,2]) %>%
  mutate(mode = 'drawing') 
  
drawings_clean$drawing_length_scaled = scale(drawings_clean$length)[,1]

merge

all_data <- descriptions_clean %>%
  full_join(drawings_clean) %>%
  filter(age_months<200)

## Joining with `by = join_by(sub_id, age_months, similarity, category, mode,
## length)`

write_csv(all_data,file=here::here('data/preprocessed/merged_cleaned_data.csv'))

library(assertthat)

## 
## Attaching package: 'assertthat'

## The following object is masked from 'package:tibble':
## 
##     has_name

assert_that(sum(is.na(all_data$similarity))==0)

## [1] TRUE

Wide form data

all_data_wide <- drawings_clean %>%
 select(-length,-mode) %>%
  rename(drawing_similarity = similarity) %>%
  right_join(descriptions_clean %>% rename(description_similarity = similarity) %>% select(-length,-mode), by=c('sub_id','age_months','category'))

all_data_wide_kids <- all_data_wide %>%
  filter(age_months<200)

Descriptive plots

library(ggthemes)
ggplot(all_data %>% filter(age_months<200), aes(x=age_months, y=similarity, col=mode))+
  geom_point(alpha=.2) +
  geom_smooth(method='lm') +
  facet_wrap(~category) +
  ylab('Similarity to adults') +
  xlab('Age in months') +
  theme_few()

## `geom_smooth()` using formula = 'y ~ x'

Drawing vs. description similarity by category

library(viridis)

## Loading required package: viridisLite

ggplot(all_data_wide %>% filter(age_months<200), aes(x=drawing_similarity, y=description_similarity, col=age_months))+
  geom_point() +
  # scale_color_viridis() +
  geom_smooth(method='lm', aes(group=category)) +
  facet_wrap(~as.factor(category)) +
  ylab('Description similarity') +
  xlab('Drawing similarity') +
  theme_few()

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 8 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_point()`).

crab = all_data_wide_kids %>% filter(category=='crab')
cor.test(crab$drawing_similarity, crab$description_similarity)

## 
##  Pearson's product-moment correlation
## 
## data:  crab$drawing_similarity and crab$description_similarity
## t = 3.9488, df = 70, p-value = 0.0001849
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2165628 0.5992337
## sample estimates:
##       cor 
## 0.4268175

whale = all_data_wide_kids %>% filter(category=='whale')
cor.test(whale$drawing_similarity, whale$description_similarity)

## 
##  Pearson's product-moment correlation
## 
## data:  whale$drawing_similarity and whale$description_similarity
## t = 3.5093, df = 71, p-value = 0.0007844
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1693805 0.5645929
## sample estimates:
##       cor 
## 0.3844656

shark = all_data_wide_kids %>% filter(category=='shark')
cor.test(shark$drawing_similarity, shark$description_similarity)

## 
##  Pearson's product-moment correlation
## 
## data:  shark$drawing_similarity and shark$description_similarity
## t = 1.6603, df = 68, p-value = 0.1015
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03942517  0.41318828
## sample estimates:
##       cor 
## 0.1973773

seahorse = all_data_wide_kids %>% filter(category=='seahorse')
cor.test(seahorse$drawing_similarity, seahorse$description_similarity)

## 
##  Pearson's product-moment correlation
## 
## data:  seahorse$drawing_similarity and seahorse$description_similarity
## t = 0.24633, df = 67, p-value = 0.8062
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2080821  0.2648749
## sample estimates:
##        cor 
## 0.03007998

ggplot(all_data_wide %>% filter(age_months<200), aes(x=drawing_length_scaled, y=drawing_similarity, col=category))+
  geom_point() +
  geom_smooth(method='lm') +
  facet_wrap(~category) +
  ylab('Similarity to adults') +
  xlab('Drawing length (scaled)') +
  theme_few()

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 8 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_point()`).

Word length vs description similarity

ggplot(all_data_wide %>% filter(age_months<200), aes(x=word_length_scaled, y=description_similarity, col=category))+
  geom_point() +
  geom_smooth(method='lm') +
  facet_wrap(~category) +
  ylab('Similarity to adults') +
  xlab('Utterance length (scaled word count)') +
  theme_few()

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 9 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 9 rows containing missing values or values outside the scale range
## (`geom_point()`).

Subject-level effects

sub_data <- all_data_wide %>%
  group_by(sub_id) %>%
  filter(age_months<200) %>%
  mutate(avg_drawing_similarity = mean(drawing_similarity), avg_description_similarity = mean(description_similarity), age_months = age_months[1])

ggplot(sub_data, aes(x=avg_drawing_similarity, y=avg_description_similarity, col=age_months)) +
  geom_point() +
  geom_smooth(method='lm') +
  theme_few() +
  ylab('Description similarity') +
  xlab('Drawing simialrity')

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 24 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

## Warning: Removed 24 rows containing missing values or values outside the scale range
## (`geom_point()`).

Subject-level estimates are qutie correlated

cor.test(sub_data$avg_drawing_similarity, sub_data$avg_description_similarity)

## 
##  Pearson's product-moment correlation
## 
## data:  sub_data$avg_drawing_similarity and sub_data$avg_description_similarity
## t = 15.221, df = 412, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.5344753 0.6582646
## sample estimates:
##       cor 
## 0.5999488

Explore whether subject-level differences relate to age?

sub_data <- sub_data %>%
  mutate(mode_similarity = (drawing_similarity-description_similarity))

ggplot(sub_data, aes(x=age_months, y=mode_similarity, col=age_months)) +
  geom_point() +
  geom_smooth(method='lm') +
  theme_few() + 
  ylab('Drawing - language similarity by subject')

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 8 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_point()`).

Run inferential stats

Here is a basic lmer, with random intercepts for category/subjects

One of the challenges here is that we only have 6 categories. That means fitting models that actually deal with that uncertainty is going to be tougher than not

lmer_mode = lmer(data=all_data_wide_kids, drawing_similarity ~ description_similarity + age_months + (1 |sub_id) + (1 |category))

summary(lmer_mode)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: drawing_similarity ~ description_similarity + age_months + (1 |  
##     sub_id) + (1 | category)
##    Data: all_data_wide_kids
## 
## REML criterion at convergence: 8.2
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.3780 -0.6243  0.0304  0.6581  2.1334 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  sub_id   (Intercept) 0.005757 0.07587 
##  category (Intercept) 0.006183 0.07863 
##  Residual             0.050714 0.22520 
## Number of obs: 430, groups:  sub_id, 85; category, 6
## 
## Fixed effects:
##                          Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)             -0.111518   0.056075  30.476767  -1.989   0.0558 .  
## description_similarity   0.127841   0.061232 415.946702   2.088   0.0374 *  
## age_months               0.003276   0.000547  84.628852   5.989 4.96e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) dscrp_
## dscrptn_sml  0.050       
## age_months  -0.755 -0.319

lmer_mode = lmer(data=all_data_wide_kids, drawing_similarity ~ description_similarity + age_months + word_length_scaled + drawing_length_scaled +  (1  |sub_id) + (1 |category))

summary(lmer_mode)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: 
## drawing_similarity ~ description_similarity + age_months + word_length_scaled +  
##     drawing_length_scaled + (1 | sub_id) + (1 | category)
##    Data: all_data_wide_kids
## 
## REML criterion at convergence: 21.9
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.4272 -0.6437  0.0524  0.6517  2.2568 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  sub_id   (Intercept) 0.005608 0.07489 
##  category (Intercept) 0.006270 0.07919 
##  Residual             0.051097 0.22605 
## Number of obs: 422, groups:  sub_id, 80; category, 6
## 
## Fixed effects:
##                          Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)            -1.179e-01  6.014e-02  3.638e+01  -1.960   0.0577 .  
## description_similarity  1.435e-01  6.494e-02  4.103e+02   2.209   0.0277 *  
## age_months              3.343e-03  5.708e-04  8.018e+01   5.856 9.97e-08 ***
## word_length_scaled     -5.079e-03  1.466e-02  2.134e+02  -0.346   0.7294    
## drawing_length_scaled  -1.486e-02  1.383e-02  2.852e+02  -1.074   0.2836    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) dscrp_ ag_mnt wrd_l_
## dscrptn_sml -0.049                     
## age_months  -0.774 -0.223              
## wrd_lngth_s  0.290 -0.193 -0.249       
## drwng_lngt_ -0.152 -0.013  0.108 -0.150

Get out descirptives for rmarkdown – example below

kid_data = all_data %>% filter(age_months<200)
num_kids = length(unique(kid_data$sub_id))

mean_age = kid_data %>%
  distinct(sub_id, age_months) %>%
  summarize(mean_age = mean(age_months)/12) %>%
  pull(mean_age)

sd_age = kid_data %>%
  distinct(sub_id, age_months) %>%
  summarize(sd_age = sd(age_months)/12) %>%
  pull(sd_age)

adults = all_data_wide %>% filter(age_months>200)

Using a tablet interface with a researcher present at a local aquarium, we asked children (N = 91, ages 2 to 11 years, M = rmean_age`, SD = 2.3982566) to draw and describe six related concepts (i.e. sea animals) by interacting with a novice agent (i.e. “Zorpie”, a robot alien who just landed on earth).

bria-analyses

Bria Long

2025-12-16

Setup

Import and clean data

Descriptive plots

Run inferential stats