Load data and libraries

libraries

library(tidyverse)
library(lme4)
library(lmerTest)

library(ggthemes)
library(stringr)
library(knitr) # for html rendering
library(langcog) # for CIs, install via githubtools

library(viridis) # for colors
library(philentropy) # for distance functions (jsd)

Read preprocessed file

load(here::here('data/preprocessed_data/part_emphasis.RData'))

sub_summary <- part_emphasis_all %>%
  group_by(sub_id) %>%
  summarize(num_drawings = length(unique(filename_short))) %>%
  arrange(num_drawings)

Write out some basic descriptives

We collected 2064 drawings from runique(part_emphasis_all$sub_id)` children who were 3.28, 5.66 years old (range 7, 95 per child).

Calculate JSD for each sketch

First step is to create inclusion/emphasis data structure Desired structure is 1 row per drawing Values reflect inclusion/emphasis on part Get so there are 0’s for parts that had nothing

Get data structure

wide_emphasis_all_categories <- part_emphasis_all %>%
  ungroup() %>%
  select(filename_short, roi_labelName, emphasis, years_old, sub_id, category) %>%
  pivot_wider(names_from = roi_labelName, values_from = emphasis, values_fill = 0, names_prefix="Emph_")  %>%
  mutate(rounded_age =  round(years_old,4)) %>%
  rownames_to_column(var = 'unique_id') %>%
  mutate(sketch_id = paste0(sub_id, '_',rounded_age, '_', unique_id))

Create shuffled ids and put them in the data structure – Shuffle filenames within categories – the category labels are so different by categogry the full shuffling is really weird.

shuffled_ids <- wide_emphasis_all_categories %>%
  distinct(category, filename_short, sketch_id) %>%
  group_by(category) %>%
  mutate(sketch_id_shuffled = sketch_id[sample(1:length(unique(sketch_id)))])

wide_emphasis_all_categories <- wide_emphasis_all_categories %>%
  left_join(shuffled_ids)

## Joining with `by = join_by(filename_short, category, sketch_id)`

Grab meta data for each of these sketches for joining back later.

meta_data <- wide_emphasis_all_categories %>%
  select(sketch_id, category, years_old, sub_id, filename_short)

Make this into a data matrix we can feed to JSD

jsd_by_sketch_matrix <-  wide_emphasis_all_categories %>%
  select(-filename_short, -category) %>%
  select(starts_with("Emph_")) %>%
  data.matrix()

Calculate JSD for each sketch

jsd_by_sketch <- JSD(jsd_by_sketch_matrix, est.prob='empirical')

## Metric: 'jensen-shannon' using unit: 'log2'; comparing: 2064 vectors.

# jsd_by_sketch <- JSD(jsd_by_sketch_matrix, est.prob=NULL)

Make shuffled and unshuffled versinos

jsd_by_sketch_shuffled <- jsd_by_sketch
colnames(jsd_by_sketch_shuffled) = wide_emphasis_all_categories$sketch_id_shuffled
rownames(jsd_by_sketch_shuffled) = wide_emphasis_all_categories$sketch_id_shuffled

colnames(jsd_by_sketch) = wide_emphasis_all_categories$sketch_id
rownames(jsd_by_sketch) = wide_emphasis_all_categories$sketch_id

Now convert back to long form for anlayses

jsd_by_sketch_long <- jsd_by_sketch %>%
  as_tibble() %>%
  add_column(sketch_id = wide_emphasis_all_categories$sketch_id) %>%
  pivot_longer(cols = starts_with('sub'), names_to = "sketch_2", values_to = "jsd") %>%
  right_join(meta_data) %>%
  rename(years_old_sketch_1 = years_old, sub_id_sketch_1 = sub_id, sketch_1 = sketch_id, category_sketch_1 = category) %>%
  right_join(meta_data, by=c('sketch_2' = 'sketch_id')) %>%
  rename(years_old_sketch_2 = years_old, sub_id_sketch_2 = sub_id, category_sketch_2 = category)

## Joining with `by = join_by(sketch_id)`

jsd_by_sketch_long_shuffled <- jsd_by_sketch_shuffled %>%
  as_tibble() %>%
  add_column(sketch_id_shuffled = wide_emphasis_all_categories$sketch_id_shuffled) %>%
  pivot_longer(cols = starts_with('sub'), names_to = "sketch_2", values_to = "jsd") %>%
  right_join(meta_data, by = c('sketch_id_shuffled' = 'sketch_id')) %>%
  rename(years_old_sketch_1 = years_old, sub_id_sketch_1 = sub_id, sketch_1 = sketch_id_shuffled, category_sketch_1 = category) %>%
  right_join(meta_data, by=c('sketch_2' = 'sketch_id')) %>%
  rename(years_old_sketch_2 = years_old, sub_id_sketch_2 = sub_id, category_sketch_2 = category)

Main analysis questions

Are children’s drawings OWN drawings more similar to other drawings they’ve made than to other kids (of the same category)?

Caculate average JSD within each category for each participant with every other participant (including themselves) Then average across categories Do this for shuffled and then unshuffled drawings

jsd_by_sub <- jsd_by_sketch_long %>%
  filter(!sketch_1==sketch_2) %>%
  mutate(same_sub = (sub_id_sketch_1 == sub_id_sketch_2)) %>%
  filter(category_sketch_1 == category_sketch_2) %>%
  # for each combination of subs, within categories
  group_by(sub_id_sketch_1, sub_id_sketch_2, category_sketch_1, same_sub) %>%
  summarize(mean_jsd = mean(jsd), count_drawings_sub_1 = length(unique(sketch_1)), count_drawings_sub_2 = length(unique(sketch_2))) %>%
  ungroup() %>%
  # don't consider pairings where there were less than 2 drawing per category
  filter(count_drawings_sub_1 > 2) %>%
  filter(count_drawings_sub_2 > 2)  %>%
  group_by(sub_id_sketch_1, same_sub) %>%
  summarize(mean_jsd = mean(mean_jsd))

## `summarise()` has grouped output by 'sub_id_sketch_1', 'sub_id_sketch_2',
## 'category_sketch_1'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'sub_id_sketch_1'. You can override using
## the `.groups` argument.

t.test(jsd_by_sub$mean_jsd[jsd_by_sub$same_sub==FALSE], jsd_by_sub$mean_jsd[jsd_by_sub$same_sub==TRUE])

## 
##  Welch Two Sample t-test
## 
## data:  jsd_by_sub$mean_jsd[jsd_by_sub$same_sub == FALSE] and jsd_by_sub$mean_jsd[jsd_by_sub$same_sub == TRUE]
## t = 3.945, df = 27.039, p-value = 0.0005108
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.0591551 0.1873656
## sample estimates:
## mean of x mean of y 
## 0.5713785 0.4481182

Now do the same thing with shuffled drawings

jsd_by_sub_shuffled <- jsd_by_sketch_long_shuffled %>%
  filter(!sketch_1==sketch_2) %>% # only between on-identifical sketches
  mutate(same_sub = (sub_id_sketch_1 == sub_id_sketch_2)) %>% 
  filter(category_sketch_1 == category_sketch_2) %>% # make sure we're looking within category
  # for each combination of subs, within categories
  group_by(sub_id_sketch_1, sub_id_sketch_2, category_sketch_1, same_sub) %>%
  summarize(mean_jsd = mean(jsd), count_drawings_sub_1 = length(unique(sketch_1)), count_drawings_sub_2 = length(unique(sketch_2))) %>%
  ungroup() %>%
  # don't consider pairings where there was only 1 drawing per category
  filter(count_drawings_sub_1 > 2) %>%
  filter(count_drawings_sub_2 > 2)  %>%
  group_by(sub_id_sketch_1, same_sub) %>%
  summarize(mean_jsd = mean(mean_jsd))

## `summarise()` has grouped output by 'sub_id_sketch_1', 'sub_id_sketch_2',
## 'category_sketch_1'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'sub_id_sketch_1'. You can override using
## the `.groups` argument.

t.test(jsd_by_sub_shuffled$mean_jsd[jsd_by_sub_shuffled$same_sub==FALSE], jsd_by_sub_shuffled$mean_jsd[jsd_by_sub_shuffled$same_sub==TRUE])

## 
##  Welch Two Sample t-test
## 
## data:  jsd_by_sub_shuffled$mean_jsd[jsd_by_sub_shuffled$same_sub == FALSE] and jsd_by_sub_shuffled$mean_jsd[jsd_by_sub_shuffled$same_sub == TRUE]
## t = 1.9812, df = 29.779, p-value = 0.05687
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.0009231182  0.0601643504
## sample estimates:
## mean of x mean of y 
## 0.5572850 0.5276644

Make data structure to test this within a linear regression to be sure it’s significant.

full_data_jsd <- jsd_by_sub %>%
  mutate(condition = 'unshuffled') %>%
  full_join(jsd_by_sub_shuffled %>% mutate(condition='shuffled'))

## Joining with `by = join_by(sub_id_sketch_1, same_sub, mean_jsd, condition)`

Pans out in an interaction term.

summary(lm(data = full_data_jsd, mean_jsd ~ condition*same_sub ))

## 
## Call:
## lm(formula = mean_jsd ~ condition * same_sub, data = full_data_jsd)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.44812 -0.02887  0.00437  0.03192  0.23069 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       0.55729    0.01732  32.178  < 2e-16 ***
## conditionunshuffled               0.01409    0.02449   0.575  0.56630    
## same_subTRUE                     -0.02962    0.02449  -1.209  0.22938    
## conditionunshuffled:same_subTRUE -0.09364    0.03464  -2.703  0.00807 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08831 on 100 degrees of freedom
## Multiple R-squared:  0.2329, Adjusted R-squared:  0.2099 
## F-statistic: 10.12 on 3 and 100 DF,  p-value: 6.999e-06

ggplot(data = full_data_jsd, aes(x=same_sub, y=mean_jsd, col=same_sub)) +
  geom_point() +
  # geom_smooth(method='lm')   +
  geom_line(aes(group=sub_id_sketch_1), color='grey') +
  facet_wrap(~condition)

# Examine this effect by categroy

jsd_by_sub_by_category <- jsd_by_sketch_long %>%
  filter(!sketch_1==sketch_2) %>%
  mutate(same_sub = (sub_id_sketch_1 == sub_id_sketch_2)) %>%
  filter(category_sketch_1 == category_sketch_2) %>%
  # for each combination of subs, within categories
  group_by(sub_id_sketch_1, sub_id_sketch_2, category_sketch_1, same_sub) %>%
  summarize(mean_jsd = mean(jsd), count_drawings_sub_1 = length(unique(sketch_1)), count_drawings_sub_2 = length(unique(sketch_2))) %>%
  ungroup() %>%
  # don't consider pairings where there were less than 2 drawing per category
  filter(count_drawings_sub_1 > 2) %>%
  filter(count_drawings_sub_2 > 2)  %>%
  group_by(sub_id_sketch_1, category_sketch_1, same_sub) %>%
  summarize(mean_jsd = mean(mean_jsd))

## `summarise()` has grouped output by 'sub_id_sketch_1', 'sub_id_sketch_2',
## 'category_sketch_1'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'sub_id_sketch_1', 'category_sketch_1'. You
## can override using the `.groups` argument.

ggplot(data = jsd_by_sub_by_category, aes(x=same_sub, y=mean_jsd, col=same_sub)) +
  geom_point() +
  geom_line(aes(group=sub_id_sketch_1), color='grey') +
  facet_wrap(~category_sketch_1)

Examine JSD y category and age

Here, each dot is going to be an individual sketch of a category, with the y-axis is it’s similarity to other drawings made either by other children (of that same category). Each drawing it tagged with the age of the drawer at the time.

xd <- jsd_by_sketch_long %>%
  filter(!sketch_1==sketch_2) %>%
  mutate(same_sub = (sub_id_sketch_1 == sub_id_sketch_2)) %>%
  filter(category_sketch_1 == category_sketch_2) %>%
  # for each combination of subs, within categories
  group_by(sub_id_sketch_1, sub_id_sketch_2, same_sub, category_sketch_1, years_old_sketch_1) %>%
  summarize(mean_jsd = mean(jsd), count_drawings_sub_1 = length(unique(sketch_1)), count_drawings_sub_2 = length(unique(sketch_2)), years_old_sketch_1 = years_old_sketch_1[1]) %>%
  ungroup() %>%
  filter(same_sub==FALSE) %>% # only look between subjects here because 1 drawings per age per child by definiton
  group_by(sub_id_sketch_1, category_sketch_1, years_old_sketch_1) %>%
  summarize(mean_jsd = mean(mean_jsd))

## `summarise()` has grouped output by 'sub_id_sketch_1', 'sub_id_sketch_2',
## 'same_sub', 'category_sketch_1'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'sub_id_sketch_1', 'category_sketch_1'. You
## can override using the `.groups` argument.

ggplot(data=xd, aes(x=years_old_sketch_1, y=mean_jsd, col=category_sketch_1)) +
  geom_point(alpha=.2) +
  geom_smooth(method='lm') +
  ylab('Dissimialarity to other childrens drawings(JSD)') +
  facet_wrap(~category_sketch_1)

## `geom_smooth()` using formula = 'y ~ x'

step4_cds_abstract

Bria Long

2024-01-04

Load data and libraries

libraries

Read preprocessed file

Write out some basic descriptives

Calculate JSD for each sketch

Get data structure

Main analysis questions

Are children’s drawings OWN drawings more similar to other drawings they’ve made than to other kids (of the same category)?

Examine JSD y category and age