library(here)
## here() starts at /Users/brialong/Documents/GitHub/birch-draw
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# for models
library(lme4)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
library(lmerTest)
##
## Attaching package: 'lmerTest'
##
## The following object is masked from 'package:lme4':
##
## lmer
##
## The following object is masked from 'package:stats':
##
## step
library(ggthemes)
Import data
drawings = read_csv(file=here::here('data/preprocessed/metadata_with_ages_cossim_images.csv'))
## Rows: 636 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): row_id, session_id, text1, participant_id, bounding_box, particip...
## dbl (10): trial_num, submit_time, start_time, trial_duration, num_strokes, ...
## dttm (2): submit_date, submit_date_readable
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
descriptions = read_csv(file=here::here('data/preprocessed/metadata_with_ages_cossim_transcripts.csv'))
## Rows: 554 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): SubID, target_category, utterance_ns, utterance_masked, interference
## dbl (8): version, age_months, age, age_approximated, age_discrepency, exclud...
## lgl (2): Unnamed: 5, Unnamed: 6
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
clean up descriptions
descriptions_clean <- descriptions %>%
select(SubID, age_months, txt_cossim_to_adults, utterance_ns, target_category) %>%
rename(category = target_category, sub_id = SubID, similarity = txt_cossim_to_adults) %>%
mutate(mode = 'description') %>%
mutate(length = str_count(utterance_ns, "\\S+")) %>%
select(-utterance_ns)
descriptions_clean$word_length_scaled = scale(descriptions_clean$length)[,1]
clean up drawings preprocessed data
drawings_clean <- drawings %>%
select(participant_id_clean, age_months, img_cossim_to_adults, text1, trial_duration) %>%
rename(category = text1, sub_id = participant_id_clean, similarity = img_cossim_to_adults, length = trial_duration) %>%
mutate(category = str_split_fixed(category,'_',2)[,2]) %>%
mutate(mode = 'drawing')
drawings_clean$drawing_length_scaled = scale(drawings_clean$length)[,1]
merge
all_data <- descriptions_clean %>%
full_join(drawings_clean) %>%
filter(age_months<200)
## Joining with `by = join_by(sub_id, age_months, similarity, category, mode,
## length)`
write_csv(all_data,file=here::here('data/preprocessed/merged_cleaned_data.csv'))
library(assertthat)
##
## Attaching package: 'assertthat'
## The following object is masked from 'package:tibble':
##
## has_name
assert_that(sum(is.na(all_data$similarity))==0)
## [1] TRUE
Wide form data
all_data_wide <- drawings_clean %>%
select(-length,-mode) %>%
rename(drawing_similarity = similarity) %>%
right_join(descriptions_clean %>% rename(description_similarity = similarity) %>% select(-length,-mode), by=c('sub_id','age_months','category'))
all_data_wide_kids <- all_data_wide %>%
filter(age_months<200)
library(ggthemes)
ggplot(all_data %>% filter(age_months<200), aes(x=age_months, y=similarity, col=mode))+
geom_point(alpha=.2) +
geom_smooth(method='lm') +
facet_wrap(~category) +
ylab('Similarity to adults') +
xlab('Age in months') +
theme_few()
## `geom_smooth()` using formula = 'y ~ x'
Drawing vs. description similarity by category
library(viridis)
## Loading required package: viridisLite
ggplot(all_data_wide %>% filter(age_months<200), aes(x=drawing_similarity, y=description_similarity, col=age_months))+
geom_point() +
# scale_color_viridis() +
geom_smooth(method='lm', aes(group=category)) +
facet_wrap(~as.factor(category)) +
ylab('Description similarity') +
xlab('Drawing similarity') +
theme_few()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 8 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_point()`).
crab = all_data_wide_kids %>% filter(category=='crab')
cor.test(crab$drawing_similarity, crab$description_similarity)
##
## Pearson's product-moment correlation
##
## data: crab$drawing_similarity and crab$description_similarity
## t = 3.9488, df = 70, p-value = 0.0001849
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2165628 0.5992337
## sample estimates:
## cor
## 0.4268175
whale = all_data_wide_kids %>% filter(category=='whale')
cor.test(whale$drawing_similarity, whale$description_similarity)
##
## Pearson's product-moment correlation
##
## data: whale$drawing_similarity and whale$description_similarity
## t = 3.5093, df = 71, p-value = 0.0007844
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1693805 0.5645929
## sample estimates:
## cor
## 0.3844656
shark = all_data_wide_kids %>% filter(category=='shark')
cor.test(shark$drawing_similarity, shark$description_similarity)
##
## Pearson's product-moment correlation
##
## data: shark$drawing_similarity and shark$description_similarity
## t = 1.6603, df = 68, p-value = 0.1015
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03942517 0.41318828
## sample estimates:
## cor
## 0.1973773
seahorse = all_data_wide_kids %>% filter(category=='seahorse')
cor.test(seahorse$drawing_similarity, seahorse$description_similarity)
##
## Pearson's product-moment correlation
##
## data: seahorse$drawing_similarity and seahorse$description_similarity
## t = 0.24633, df = 67, p-value = 0.8062
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.2080821 0.2648749
## sample estimates:
## cor
## 0.03007998
ggplot(all_data_wide %>% filter(age_months<200), aes(x=drawing_length_scaled, y=drawing_similarity, col=category))+
geom_point() +
geom_smooth(method='lm') +
facet_wrap(~category) +
ylab('Similarity to adults') +
xlab('Drawing length (scaled)') +
theme_few()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 8 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_point()`).
Word length vs description similarity
ggplot(all_data_wide %>% filter(age_months<200), aes(x=word_length_scaled, y=description_similarity, col=category))+
geom_point() +
geom_smooth(method='lm') +
facet_wrap(~category) +
ylab('Similarity to adults') +
xlab('Utterance length (scaled word count)') +
theme_few()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 9 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 9 rows containing missing values or values outside the scale range
## (`geom_point()`).
Subject-level effects
sub_data <- all_data_wide %>%
group_by(sub_id) %>%
filter(age_months<200) %>%
mutate(avg_drawing_similarity = mean(drawing_similarity), avg_description_similarity = mean(description_similarity), age_months = age_months[1])
ggplot(sub_data, aes(x=avg_drawing_similarity, y=avg_description_similarity, col=age_months)) +
geom_point() +
geom_smooth(method='lm') +
theme_few() +
ylab('Description similarity') +
xlab('Drawing simialrity')
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 24 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning: Removed 24 rows containing missing values or values outside the scale range
## (`geom_point()`).
Subject-level estimates are qutie correlated
cor.test(sub_data$avg_drawing_similarity, sub_data$avg_description_similarity)
##
## Pearson's product-moment correlation
##
## data: sub_data$avg_drawing_similarity and sub_data$avg_description_similarity
## t = 15.221, df = 412, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5344753 0.6582646
## sample estimates:
## cor
## 0.5999488
Explore whether subject-level differences relate to age?
sub_data <- sub_data %>%
mutate(mode_similarity = (drawing_similarity-description_similarity))
ggplot(sub_data, aes(x=age_months, y=mode_similarity, col=age_months)) +
geom_point() +
geom_smooth(method='lm') +
theme_few() +
ylab('Drawing - language similarity by subject')
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 8 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_point()`).
Here is a basic lmer, with random intercepts for category/subjects
One of the challenges here is that we only have 6 categories. That means fitting models that actually deal with that uncertainty is going to be tougher than not
lmer_mode = lmer(data=all_data_wide_kids, drawing_similarity ~ description_similarity + age_months + (1 |sub_id) + (1 |category))
summary(lmer_mode)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: drawing_similarity ~ description_similarity + age_months + (1 |
## sub_id) + (1 | category)
## Data: all_data_wide_kids
##
## REML criterion at convergence: 8.2
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.3780 -0.6243 0.0304 0.6581 2.1334
##
## Random effects:
## Groups Name Variance Std.Dev.
## sub_id (Intercept) 0.005757 0.07587
## category (Intercept) 0.006183 0.07863
## Residual 0.050714 0.22520
## Number of obs: 430, groups: sub_id, 85; category, 6
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -0.111518 0.056075 30.476767 -1.989 0.0558 .
## description_similarity 0.127841 0.061232 415.946702 2.088 0.0374 *
## age_months 0.003276 0.000547 84.628852 5.989 4.96e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) dscrp_
## dscrptn_sml 0.050
## age_months -0.755 -0.319
lmer_mode = lmer(data=all_data_wide_kids, drawing_similarity ~ description_similarity + age_months + word_length_scaled + drawing_length_scaled + (1 |sub_id) + (1 |category))
summary(lmer_mode)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula:
## drawing_similarity ~ description_similarity + age_months + word_length_scaled +
## drawing_length_scaled + (1 | sub_id) + (1 | category)
## Data: all_data_wide_kids
##
## REML criterion at convergence: 21.9
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.4272 -0.6437 0.0524 0.6517 2.2568
##
## Random effects:
## Groups Name Variance Std.Dev.
## sub_id (Intercept) 0.005608 0.07489
## category (Intercept) 0.006270 0.07919
## Residual 0.051097 0.22605
## Number of obs: 422, groups: sub_id, 80; category, 6
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -1.179e-01 6.014e-02 3.638e+01 -1.960 0.0577 .
## description_similarity 1.435e-01 6.494e-02 4.103e+02 2.209 0.0277 *
## age_months 3.343e-03 5.708e-04 8.018e+01 5.856 9.97e-08 ***
## word_length_scaled -5.079e-03 1.466e-02 2.134e+02 -0.346 0.7294
## drawing_length_scaled -1.486e-02 1.383e-02 2.852e+02 -1.074 0.2836
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) dscrp_ ag_mnt wrd_l_
## dscrptn_sml -0.049
## age_months -0.774 -0.223
## wrd_lngth_s 0.290 -0.193 -0.249
## drwng_lngt_ -0.152 -0.013 0.108 -0.150
Get out descirptives for rmarkdown – example below
kid_data = all_data %>% filter(age_months<200)
num_kids = length(unique(kid_data$sub_id))
mean_age = kid_data %>%
distinct(sub_id, age_months) %>%
summarize(mean_age = mean(age_months)/12) %>%
pull(mean_age)
sd_age = kid_data %>%
distinct(sub_id, age_months) %>%
summarize(sd_age = sd(age_months)/12) %>%
pull(sd_age)
adults = all_data_wide %>% filter(age_months>200)
Using a tablet interface with a researcher present at a local
aquarium, we asked children (N = 91, ages 2 to 11 years, M =
rmean_age`, SD = 2.3982566) to draw and describe six
related concepts (i.e. sea animals) by interacting with a novice agent
(i.e. “Zorpie”, a robot alien who just landed on earth).