Embeddings

These are the embeddings for three different subdivisions of participants: (1) young-middle-old-adult (3-4/5-6/7-8/adult), i.e., categories that the next experiment adaptively sampled on (2) a post-hoc split based on median age (excluding adults), and (3) a post-hoc split based on median vocabulary as reported by parents.

young-middle-old

young = read.csv("model_34.csv", header = FALSE) %>%
  mutate(group = "young")
middle = read.csv("model_56.csv", header = FALSE) %>%
  mutate(group = "middle")
old = read.csv("model_78.csv", header = FALSE) %>%
  mutate(group = "old")
adult = read.csv("model_adults.csv", header = FALSE) %>%
  mutate(group = "adult")

labs =read.table("labels.txt", header = FALSE) %>%
      rename(item = V1) %>%
      mutate(item = lapply(str_split(item, ".jpg"), 
                                   function(x) {x[1]}))
d = cbind(labs, young) %>%
  rbind(cbind(labs,middle)) %>%
  rbind(cbind(labs,old)) %>%
  rbind(cbind(labs,adult)) %>%
  rename(x = V1,
         y = V2) %>%
  mutate(item = as.factor(unlist(item))) %>%
  mutate(group = fct_relevel(group, "young", "middle", "old")) 
dict = read.csv("../image_dict.csv") %>%
  select(images, type, theme) %>%
  rename(item = images)

d  = left_join(d, dict)

Points

ggplot(d, aes(x = x, y = y, color = type, shape = theme)) +
   geom_point(size = 3) +
   facet_grid(~group, scales = "free") +
   theme_bw() +
   theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        strip.text.x = element_text(size = 14),
        strip.background = element_rect(colour="grey", fill="grey"),
        axis.title = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        legend.position = "bottom")

Labels

ggplot(d, aes(x = x, y = y)) +
   geom_text(aes(label=item), size = 3) +
   facet_grid(~group, scales = "free") +
   theme_bw() +
   theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        strip.text.x = element_text(size = 14),
        strip.background = element_rect(colour="grey", fill="grey"),
        axis.title = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank())

post-hoc median age split

read in embeddings

low = read.csv("model_lowage.csv", header = FALSE) %>%
  mutate(group = "low")
high = read.csv("model_highage.csv", header = FALSE) %>%
  mutate(group = "high")


d.m.age= cbind(labs, low) %>%
  rbind(cbind(labs,high)) %>%
  rename(x = V1,
         y = V2) %>%
  mutate(item = as.factor(unlist(item))) %>%
  mutate(group = fct_relevel(group, "low", "high")) %>%
  left_join(dict)

Points

ggplot(d.m.age, aes(x = x, y = y, color = type, shape = theme)) +
   geom_point(size = 3) +
   facet_wrap(~group, scales = "free") +
   theme_bw() +
   theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        strip.text.x = element_text(size = 14),
        strip.background = element_rect(colour="grey", fill="grey"),
        axis.title = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        legend.position = "bottom")

Labels

ggplot(d.m.age, aes(x = x, y = y)) +
   geom_text(aes(label=item), size = 3) +
   facet_wrap(~group, scales = "free") +
   theme_bw() +
   theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        strip.text.x = element_text(size = 14),
        strip.background = element_rect(colour="grey", fill="grey"),
        axis.title = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank())

post-hoc median vocab split

read in embeddings

low = read.csv("model_lowvocab.csv", header = FALSE) %>%
  mutate(group = "low")
high = read.csv("model_highvocab.csv", header = FALSE) %>%
  mutate(group = "high")

d.m.vocab = cbind(labs, low) %>%
  rbind(cbind(labs,high)) %>%
  rename(x = V1,
         y = V2) %>%
  mutate(item = as.factor(unlist(item))) %>%
  mutate(group = fct_relevel(group, "low", "high")) %>%
  left_join(dict)

Points

ggplot(d.m.vocab, aes(x = x, y = y, color = type, shape = theme)) +
   geom_point(size = 3) +
   facet_wrap(~group, scales = "free") +
   theme_bw() +
   theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        strip.text.x = element_text(size = 14),
        strip.background = element_rect(colour="grey", fill="grey"),
        axis.title = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        legend.position = "bottom")

Labels

ggplot(d.m.vocab, aes(x = x, y = y)) +
   geom_text(aes(label=item), size = 3) +
   facet_wrap(~group, scales = "free") +
   theme_bw() +
   theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        strip.text.x = element_text(size = 14),
        strip.background = element_rect(colour="grey",
                                        fill="grey"),
        axis.title = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank())

Pairwise distance

For each dimension (theme vs. type), I calculated the distance of each item to every other item in the same group vs. across group. I then calculated a relative distance for each item as the within distance divided by then across distance, and then averaged across items in the same group. If our a priori categories are getting more distinct across development, then this measure should decrease with age.

young-middle-old split

# all pairs of items with their feature info
all.pairs2 = expand.grid(unlist(labs$item),unlist(labs$item)) %>%
  rename(item1 = Var1,
         item2 = Var2)

getMeanDist2 <- function(age, d){
  # coordinates for this age group
  coords = d %>%
    filter(group == age) %>%
    select(item,x,y)
  
  # merge in item coordinates for this age group
  these.pairs.with.coords = all.pairs2 %>%
    select(item1, item2) %>%
    left_join(coords, by=c("item1" = "item")) %>%
    rename(x1 = x,
           y1 = y) %>%
    left_join(coords, by=c("item2" = "item")) %>%
    rename(x2 = x,
           y2 = y)
  
  # get dists
  mean.dists = these.pairs.with.coords %>%
       rowwise() %>%
       mutate(dist = sqrt((x2-x1)^2 + (y2-y1)^2)) %>%
       mutate(group = age)

  mean.dists
}

all.dists <- map(c("young", "middle", "old", "adult"), getMeanDist2, d) %>%
          bind_rows() %>%
  left_join(dict, by=c("item1" = "item"))%>%
  rename(type1 = type,
         theme1 = theme) %>%
  left_join(dict, by=c("item2" = "item")) %>%
  rename(type2 = type,
         theme2 = theme)

theme

all.dists.item.ms = all.dists %>%
    group_by(item1, theme2, group) %>%
    summarize(mean = mean(dist)) 
    
all.dists.item.ms.gp = all.dists.item.ms %>%
  left_join(dict, by=c("item1" = "item")) %>%
  rename(type1 = type,
         theme1 = theme) %>%
  select(group, item1, theme1, theme2, mean) %>%
  mutate(category = ifelse(theme1 == theme2, "within", "between")) %>%
  ungroup() %>%
  select(-theme1, -theme2) %>%
  group_by(group, item1) %>%
  spread(category, mean) %>%
  mutate(relative.theme.dist = within/between)

ms = all.dists.item.ms.gp %>%
  left_join(select(dict, -type), by=c("item1" = "item")) %>%
  rename(theme1 = theme) %>%
  group_by(theme1, group) %>%
  multi_boot_standard(column = "relative.theme.dist") %>%
  mutate(group = fct_relevel(group, "young", "middle", "old", "adult")) 

ggplot(ms, aes(x = group, y = mean)) +
  geom_bar(stat ="identity") +
  geom_linerange(aes(ymax = ci_upper, ymin = ci_lower)) +
  facet_wrap(~theme1) +
  xlab("age group") +
  ylab("relative mean pairwise distance") +
  theme_bw()

type

all.dists.item.ms = all.dists %>%
    group_by(item1, type2, group) %>%
    summarize(mean = mean(dist)) 
    
all.dists.item.ms.gp = all.dists.item.ms %>%
  left_join(dict, by=c("item1" = "item")) %>%
  rename(type1 = type,
         theme1 = theme) %>%
  select(group, item1, type1, type2, mean) %>%
  mutate(category = ifelse(type1 == type2, "within", "between")) %>%
  ungroup() %>%
  select(-type1, -type2) %>%
  group_by(group, item1) %>%
  spread(category, mean) %>%
  mutate(relative.type.dist = within/between)

ms.crit = all.dists.item.ms.gp %>%
  left_join(select(dict, -theme), by=c("item1" = "item")) %>%
  rename(type1 = type) %>%
  group_by(type1, group) %>%
  multi_boot_standard(column = "relative.type.dist") %>%
  mutate(group = fct_relevel(group, "young", "middle", "old", "adult")) 

ggplot(ms.crit, aes(x = group, y = mean)) +
  geom_bar(stat ="identity") +
  geom_linerange(aes(ymax = ci_upper, ymin = ci_lower)) +
  facet_wrap(~type1) +
  xlab("age group") +
  ylab("relative mean pairwise distance") +
  theme_bw()

post-hoc median age split

all.dists <- map(c("low","high"), getMeanDist2, d.m.age) %>%
          bind_rows() %>%
  left_join(dict, by=c("item1" = "item"))%>%
  rename(type1 = type,
         theme1 = theme) %>%
  left_join(dict, by=c("item2" = "item")) %>%
  rename(type2 = type,
         theme2 = theme)

theme

all.dists.item.ms = all.dists %>%
    group_by(item1, theme2, group) %>%
    summarize(mean = mean(dist)) 
    
all.dists.item.ms.gp = all.dists.item.ms %>%
  left_join(dict, by=c("item1" = "item")) %>%
  rename(type1 = type,
         theme1 = theme) %>%
  select(group, item1, theme1, theme2, mean) %>%
  mutate(category = ifelse(theme1 == theme2, "within", "between")) %>%
  ungroup() %>%
  select(-theme1, -theme2) %>%
  group_by(group, item1) %>%
  spread(category, mean) %>%
  mutate(relative.theme.dist = within/between)

ms = all.dists.item.ms.gp %>%
  left_join(select(dict, -type), by=c("item1" = "item")) %>%
  rename(theme1 = theme) %>%
  group_by(theme1, group) %>%
  multi_boot_standard(column = "relative.theme.dist") %>%
  mutate(group = fct_relevel(group, c("low","high"))) 

ggplot(ms, aes(x = group, y = mean)) +
  geom_bar(stat ="identity") +
  geom_linerange(aes(ymax = ci_upper, ymin = ci_lower)) +
  facet_wrap(~theme1) +
  xlab("age group") +
  ylab("relative mean pairwise distance") +
  theme_bw()

type

all.dists.item.ms = all.dists %>%
    group_by(item1, type2, group) %>%
    summarize(mean = mean(dist)) 
    
all.dists.item.ms.gp = all.dists.item.ms %>%
  left_join(dict, by=c("item1" = "item")) %>%
  rename(type1 = type,
         theme1 = theme) %>%
  select(group, item1, type1, type2, mean) %>%
  mutate(category = ifelse(type1 == type2, "within", "between")) %>%
  ungroup() %>%
  select(-type1, -type2) %>%
  group_by(group, item1) %>%
  spread(category, mean) %>%
  mutate(relative.type.dist = within/between)

ms = all.dists.item.ms.gp %>%
  left_join(select(dict, -theme), by=c("item1" = "item")) %>%
  rename(type1 = type) %>%
  group_by(type1, group) %>%
  multi_boot_standard(column = "relative.type.dist") %>%
  mutate(group = fct_relevel(group, c("low","high"))) 

ggplot(ms, aes(x = group, y = mean)) +
  geom_bar(stat ="identity") +
  geom_linerange(aes(ymax = ci_upper, ymin = ci_lower)) +
  facet_wrap(~type1) +
  xlab("age group") +
  ylab("relative mean pairwise distance") +
  theme_bw()

post-hoc median vocab split

all.dists <- map(c("low","high"), getMeanDist2, d.m.vocab) %>%
          bind_rows() %>%
  left_join(dict, by=c("item1" = "item"))%>%
  rename(type1 = type,
         theme1 = theme) %>%
  left_join(dict, by=c("item2" = "item")) %>%
  rename(type2 = type,
         theme2 = theme)

theme

all.dists.item.ms = all.dists %>%
    group_by(item1, theme2, group) %>%
    summarize(mean = mean(dist)) 
    
all.dists.item.ms.gp = all.dists.item.ms %>%
  left_join(dict, by=c("item1" = "item")) %>%
  rename(type1 = type,
         theme1 = theme) %>%
  select(group, item1, theme1, theme2, mean) %>%
  mutate(category = ifelse(theme1 == theme2, "within", "between")) %>%
  ungroup() %>%
  select(-theme1, -theme2) %>%
  group_by(group, item1) %>%
  spread(category, mean) %>%
  mutate(relative.theme.dist = within/between)

ms = all.dists.item.ms.gp %>%
  left_join(select(dict, -type), by=c("item1" = "item")) %>%
  rename(theme1 = theme) %>%
  group_by(theme1, group) %>%
  multi_boot_standard(column = "relative.theme.dist") %>%
  mutate(group = fct_relevel(group, c("low","high"))) 

ggplot(ms, aes(x = group, y = mean)) +
  geom_bar(stat ="identity") +
  geom_linerange(aes(ymax = ci_upper, ymin = ci_lower)) +
  facet_wrap(~theme1) +
  xlab("vocab group") +
  ylab("relative mean pairwise distance") +
  theme_bw()

type

all.dists.item.ms = all.dists %>%
    group_by(item1, type2, group) %>%
    summarize(mean = mean(dist)) 
    
all.dists.item.ms.gp = all.dists.item.ms %>%
  left_join(dict, by=c("item1" = "item")) %>%
  rename(type1 = type,
         theme1 = theme) %>%
  select(group, item1, type1, type2, mean) %>%
  mutate(category = ifelse(type1 == type2, "within", "between")) %>%
  ungroup() %>%
  select(-type1, -type2) %>%
  group_by(group, item1) %>%
  spread(category, mean) %>%
  mutate(relative.type.dist = within/between)

ms = all.dists.item.ms.gp %>%
  left_join(select(dict, -theme), by=c("item1" = "item")) %>%
  rename(type1 = type) %>%
  group_by(type1, group) %>%
  multi_boot_standard(column = "relative.type.dist") %>%
  mutate(group = fct_relevel(group, c("low","high"))) 

ggplot(ms, aes(x = group, y = mean)) +
  geom_bar(stat ="identity") +
  geom_linerange(aes(ymax = ci_upper, ymin = ci_lower)) +
  facet_wrap(~type1) +
  xlab("vocab group") +
  ylab("relative mean pairwise distance") +
  theme_bw()

Grant plots

For the grant, I was planning on only showing the young and middle (3-4 and 5-6 yo) embeddings. This simplifies things a bit, and is nice because we have the same sample size in both. I was also going to collapse across the wild-farm theme distinction since the results are not very clear there.

# Procrustes transformation
young = filter(d, group == "young") %>%
  select(x, y) %>%
  as.matrix()

middle = filter(d, group == "middle") %>%
  select(x, y) %>%
  as.matrix()

ym = procOPA(young, middle)
m = rbind(ym$Ahat,ym$Bhat) %>%
  as.data.frame() %>%
  mutate(group = c(rep("young",16), rep("middle", 16)),
         item = c(unlist(labs$item), unlist(labs$item))) %>%
  left_join(dict)

ym = procrustes(young, middle)
m = rbind(ym$X,ym$Yrot) %>%
  as.data.frame() %>%
  mutate(group = c(rep("young",16), rep("middle", 16)),
         item = c(unlist(labs$item), unlist(labs$item))) %>%
  left_join(dict) %>%
  mutate(group = fct_relevel(group, c("young","middle"))) 
age_names <- c(`young` = "3-4 year olds", `middle` = "5-6 year olds") 

ggplot(m, aes(x = x, y = y)) +
   geom_text(aes(label=item, color = type), size = 3) +
   facet_grid(~group, labeller = labeller(group = as_labeller(age_names))) +
   theme_bw() +
   theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        strip.text.x = element_text(size = 14),
        strip.background = element_rect(colour="grey", fill="grey"),
        axis.title = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        legend.position = "none")

Pairwise-relative distance for type (bird vs. mammal), including adult control group.

cats <- c(`bird` = "bird category", `mammal` = "mammal category") 

ggplot(filter(ms.crit, group != "old"), aes(x = group, y = mean)) +
  geom_bar(stat ="identity", fill = "darkgrey") +
  scale_x_discrete("age group", labels = c("young" = "3-4yo", 
               "middle" = "5-6yo", 
                "adult" = "adults"))+ 
  facet_grid(~type1, labeller = labeller(type1 = as_labeller(cats))) +
  geom_linerange(aes(ymax = ci_upper, ymin = ci_lower)) +
  ylab("relative mean pairwise distance") +
  theme_bw()+
theme(strip.text.x = element_text(size = 14))

The take away here is that both the bird and mammal category get more distinct with age, but the mammal category takes longer to develop (and is overall less distinct?).