Embeddings

These are the embeddings for three different subdivisions of participants: (1) young-middle-old-adult (3-4/5-6/7-8/adult), i.e., the categories that the next experiment adaptively sampled on; (2) a post-hoc split based on median age (excluding adults); and (3) a post-hoc split based on median vocabulary as reported by parents.

young-middle-old

# Read the embedding coordinates fitted for each age group. The files have no
# header row, so read.csv() names the coordinate columns V1 and V2. Each table
# is tagged with the group it belongs to.
young <- read.csv("model_34.csv", header = FALSE) %>%
  mutate(group = "young")
middle <- read.csv("model_56.csv", header = FALSE) %>%
  mutate(group = "middle")
old <- read.csv("model_78.csv", header = FALSE) %>%
  mutate(group = "old")
adult <- read.csv("model_adults.csv", header = FALSE) %>%
  mutate(group = "adult")

# Item labels: keep the part of each entry that precedes the first ".jpg".
# fixed() makes the pattern a literal string; as a regex, the "." in ".jpg"
# would match any character. vapply() returns a plain character vector rather
# than a list column.
labs <- read.table("labels.txt", header = FALSE) %>%
  rename(item = V1) %>%
  mutate(item = vapply(str_split(as.character(item), fixed(".jpg")),
                       function(parts) parts[1],
                       character(1)))

# Stack the four groups into one long table, attaching the labels to each.
# NOTE(review): cbind() pairs labels and coordinates by row position, so this
# assumes every model_*.csv lists items in the same order as labels.txt.
d <- cbind(labs, young) %>%
  rbind(cbind(labs, middle)) %>%
  rbind(cbind(labs, old)) %>%
  rbind(cbind(labs, adult)) %>%
  rename(x = V1,
         y = V2) %>%
  mutate(item = as.factor(item)) %>%
  mutate(group = fct_relevel(group, "young", "middle", "old"))

# Item metadata (type and theme of each image).
dict <- read.csv("../image_dict.csv") %>%
  select(images, type, theme) %>%
  rename(item = images)

# Join on the item name explicitly instead of relying on whichever columns
# happen to share a name.
d <- left_join(d, dict, by = "item")

Points

# Embedding coordinates as points, one panel per group; colour shows `type`
# and point shape shows `theme`. Grid lines and all axis annotation are hidden.
d %>%
  ggplot(aes(x, y, colour = type, shape = theme)) +
  geom_point(size = 3) +
  facet_grid(. ~ group, scales = "free") +
  theme_bw() +
  theme(
    axis.title = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    strip.background = element_rect(fill = "grey", colour = "grey"),
    strip.text.x = element_text(size = 14),
    legend.position = "bottom"
  )

Labels

# Same embedding, but each item is drawn as its text label instead of a point.
d %>%
  ggplot(aes(x, y, label = item)) +
  geom_text(size = 3) +
  facet_grid(. ~ group, scales = "free") +
  theme_bw() +
  theme(
    axis.title = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    strip.background = element_rect(fill = "grey", colour = "grey"),
    strip.text.x = element_text(size = 14)
  )

post-hoc median age split

read in embeddings

# Embedding coordinates for the two halves of the median age split. The files
# have no header row, so the coordinate columns arrive as V1 and V2.
low <- read.csv("model_lowage.csv", header = FALSE) %>%
  mutate(group = "low")
high <- read.csv("model_highage.csv", header = FALSE) %>%
  mutate(group = "high")

# Stack both halves, attach item labels and item metadata.
# NOTE(review): cbind() pairs labels and coordinates by row position, so this
# assumes both files list items in the same order as `labs`.
d.m.age <- cbind(labs, low) %>%
  rbind(cbind(labs, high)) %>%
  rename(x = V1,
         y = V2) %>%
  mutate(item = as.factor(unlist(item))) %>%
  mutate(group = fct_relevel(group, "low", "high")) %>%
  # explicit key: do not rely on whichever columns happen to share a name
  left_join(dict, by = "item")

Points

# Median age split: embedding coordinates as points, one panel per half;
# colour shows `type` and point shape shows `theme`.
d.m.age %>%
  ggplot(aes(x, y, colour = type, shape = theme)) +
  geom_point(size = 3) +
  facet_wrap(~ group, scales = "free") +
  theme_bw() +
  theme(
    axis.title = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    strip.background = element_rect(fill = "grey", colour = "grey"),
    strip.text.x = element_text(size = 14),
    legend.position = "bottom"
  )

Labels

# Median age split: each item drawn as its text label, one panel per half.
d.m.age %>%
  ggplot(aes(x, y, label = item)) +
  geom_text(size = 3) +
  facet_wrap(~ group, scales = "free") +
  theme_bw() +
  theme(
    axis.title = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    strip.background = element_rect(fill = "grey", colour = "grey"),
    strip.text.x = element_text(size = 14)
  )

post-hoc median vocab split

read in embeddings

# Embedding coordinates for the two halves of the median vocabulary split.
# The files have no header row, so the coordinate columns arrive as V1 and V2.
# Note that `low` and `high` are reused here and overwrite any earlier values.
low <- read.csv("model_lowvocab.csv", header = FALSE) %>%
  mutate(group = "low")
high <- read.csv("model_highvocab.csv", header = FALSE) %>%
  mutate(group = "high")

# Stack both halves, attach item labels and item metadata.
# NOTE(review): cbind() pairs labels and coordinates by row position, so this
# assumes both files list items in the same order as `labs`.
d.m.vocab <- cbind(labs, low) %>%
  rbind(cbind(labs, high)) %>%
  rename(x = V1,
         y = V2) %>%
  mutate(item = as.factor(unlist(item))) %>%
  mutate(group = fct_relevel(group, "low", "high")) %>%
  # explicit key: do not rely on whichever columns happen to share a name
  left_join(dict, by = "item")

Points

# Median vocabulary split: embedding coordinates as points, one panel per
# half; colour shows `type` and point shape shows `theme`.
d.m.vocab %>%
  ggplot(aes(x, y, colour = type, shape = theme)) +
  geom_point(size = 3) +
  facet_wrap(~ group, scales = "free") +
  theme_bw() +
  theme(
    axis.title = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    strip.background = element_rect(fill = "grey", colour = "grey"),
    strip.text.x = element_text(size = 14),
    legend.position = "bottom"
  )

Labels

# Median vocabulary split: each item drawn as its text label, one panel per
# half.
d.m.vocab %>%
  ggplot(aes(x, y, label = item)) +
  geom_text(size = 3) +
  facet_wrap(~ group, scales = "free") +
  theme_bw() +
  theme(
    axis.title = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    strip.background = element_rect(fill = "grey", colour = "grey"),
    strip.text.x = element_text(size = 14)
  )

Pairwise distance

For each dimension (theme vs. type), I calculated the distance of each item to every other item in the same category (within) vs. in the other category (across). I then calculated a relative distance for each item as the within distance divided by the across distance, and then averaged across items in the same category, separately for each participant group. If our a priori categories are getting more distinct across development, then this measure should decrease with age.

young-middle-old split

# All ordered pairs of *distinct* items (item1, item2).
# Self-pairs are dropped: an item's distance to itself is always 0, and
# keeping those rows would pull every "within" mean towards zero, although
# the measure is defined over each item's distance to every *other* item.
# stringsAsFactors = FALSE keeps the names as plain character so they compare
# and join without factor-level coercion.
all.pairs2 <- expand.grid(unlist(labs$item), unlist(labs$item),
                          stringsAsFactors = FALSE) %>%
  rename(item1 = Var1,
         item2 = Var2) %>%
  filter(item1 != item2)

# Euclidean distance between the two items of every pair, using the
# embedding coordinates of a single participant group.
#
# Despite the name, no averaging happens here: the result has one row per
# item pair.
#
# Arguments:
#   age    value of `group` whose coordinates should be used
#   d      data frame with columns item, x, y and group
#   pairs  data frame with columns item1 and item2; defaults to the global
#          `all.pairs2`, which is what the function always used before
#
# Returns `pairs` (item1, item2) with the columns x1, y1, x2, y2, dist and
# group added.
getMeanDist2 <- function(age, d, pairs = all.pairs2) {
  # coordinates for this group only
  coords <- d %>%
    filter(group == age) %>%
    select(item, x, y)

  # attach the coordinates of both members of each pair, then take the
  # distance; the arithmetic is vectorised, so no rowwise() is needed
  pairs %>%
    select(item1, item2) %>%
    left_join(coords, by = c("item1" = "item")) %>%
    rename(x1 = x,
           y1 = y) %>%
    left_join(coords, by = c("item2" = "item")) %>%
    rename(x2 = x,
           y2 = y) %>%
    mutate(dist = sqrt((x2 - x1)^2 + (y2 - y1)^2),
           group = age)
}

# Pairwise distances for every age group, stacked into one table, with the
# type and theme of both pair members attached (suffix 1 = item1, 2 = item2).
all.dists <- c("young", "middle", "old", "adult") %>%
  map(getMeanDist2, d) %>%
  bind_rows() %>%
  left_join(rename(dict, type1 = type, theme1 = theme),
            by = c("item1" = "item")) %>%
  left_join(rename(dict, type2 = type, theme2 = theme),
            by = c("item2" = "item"))

theme

# Mean distance from each item to the items of each theme, per age group.
all.dists.item.ms <- all.dists %>%
  group_by(item1, theme2, group) %>%
  summarise(mean = mean(dist))

# Mark each mean as "within" (same theme as the item) or "between", put the
# two means side by side, and take their ratio per item and age group.
all.dists.item.ms.gp <- all.dists.item.ms %>%
  left_join(dict, by = c("item1" = "item")) %>%
  select(group, item1, theme1 = theme, theme2, mean) %>%
  ungroup() %>%
  mutate(category = ifelse(theme1 == theme2, "within", "between")) %>%
  select(-theme1, -theme2) %>%
  group_by(group, item1) %>%
  spread(category, mean) %>%
  mutate(relative.theme.dist = within / between)

# Bootstrapped summary of the ratio for every theme x age group cell.
ms <- all.dists.item.ms.gp %>%
  left_join(rename(select(dict, -type), theme1 = theme),
            by = c("item1" = "item")) %>%
  group_by(theme1, group) %>%
  multi_boot_standard(column = "relative.theme.dist") %>%
  mutate(group = fct_relevel(group, "young", "middle", "old", "adult"))

# Bars show the mean ratio; line ranges span ci_lower to ci_upper.
ms %>%
  ggplot(aes(y = mean, x = group)) +
  geom_bar(stat = "identity") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  facet_wrap(~ theme1) +
  theme_bw() +
  xlab("age group") +
  ylab("relative mean pairwise distance")

type

# Mean distance from each item to the items of each type, per age group.
all.dists.item.ms <- all.dists %>%
  group_by(item1, type2, group) %>%
  summarise(mean = mean(dist))

# Mark each mean as "within" (same type as the item) or "between", put the
# two means side by side, and take their ratio per item and age group.
all.dists.item.ms.gp <- all.dists.item.ms %>%
  left_join(dict, by = c("item1" = "item")) %>%
  select(group, item1, type1 = type, type2, mean) %>%
  ungroup() %>%
  mutate(category = ifelse(type1 == type2, "within", "between")) %>%
  select(-type1, -type2) %>%
  group_by(group, item1) %>%
  spread(category, mean) %>%
  mutate(relative.type.dist = within / between)

# Bootstrapped summary of the ratio for every type x age group cell. Stored
# under its own name (ms.crit) because the grant plot reuses it later.
ms.crit <- all.dists.item.ms.gp %>%
  left_join(rename(select(dict, -theme), type1 = type),
            by = c("item1" = "item")) %>%
  group_by(type1, group) %>%
  multi_boot_standard(column = "relative.type.dist") %>%
  mutate(group = fct_relevel(group, "young", "middle", "old", "adult"))

# Bars show the mean ratio; line ranges span ci_lower to ci_upper.
ms.crit %>%
  ggplot(aes(y = mean, x = group)) +
  geom_bar(stat = "identity") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  facet_wrap(~ type1) +
  theme_bw() +
  xlab("age group") +
  ylab("relative mean pairwise distance")

post-hoc median age split

# Pairwise distances for both halves of the median age split, with the type
# and theme of both pair members attached (suffix 1 = item1, 2 = item2).
# This overwrites the `all.dists` computed for the previous split.
all.dists <- c("low", "high") %>%
  map(getMeanDist2, d.m.age) %>%
  bind_rows() %>%
  left_join(rename(dict, type1 = type, theme1 = theme),
            by = c("item1" = "item")) %>%
  left_join(rename(dict, type2 = type, theme2 = theme),
            by = c("item2" = "item"))

theme

# Median age split: mean distance from each item to the items of each theme,
# per half.
all.dists.item.ms <- all.dists %>%
  group_by(item1, theme2, group) %>%
  summarise(mean = mean(dist))

# Mark each mean as "within" (same theme as the item) or "between", put the
# two means side by side, and take their ratio per item and half.
all.dists.item.ms.gp <- all.dists.item.ms %>%
  left_join(dict, by = c("item1" = "item")) %>%
  select(group, item1, theme1 = theme, theme2, mean) %>%
  ungroup() %>%
  mutate(category = ifelse(theme1 == theme2, "within", "between")) %>%
  select(-theme1, -theme2) %>%
  group_by(group, item1) %>%
  spread(category, mean) %>%
  mutate(relative.theme.dist = within / between)

# Bootstrapped summary of the ratio for every theme x half cell.
ms <- all.dists.item.ms.gp %>%
  left_join(rename(select(dict, -type), theme1 = theme),
            by = c("item1" = "item")) %>%
  group_by(theme1, group) %>%
  multi_boot_standard(column = "relative.theme.dist") %>%
  mutate(group = fct_relevel(group, c("low", "high")))

# Bars show the mean ratio; line ranges span ci_lower to ci_upper.
ms %>%
  ggplot(aes(y = mean, x = group)) +
  geom_bar(stat = "identity") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  facet_wrap(~ theme1) +
  theme_bw() +
  xlab("age group") +
  ylab("relative mean pairwise distance")

type

# Median age split: mean distance from each item to the items of each type,
# per half.
all.dists.item.ms <- all.dists %>%
  group_by(item1, type2, group) %>%
  summarise(mean = mean(dist))

# Mark each mean as "within" (same type as the item) or "between", put the
# two means side by side, and take their ratio per item and half.
all.dists.item.ms.gp <- all.dists.item.ms %>%
  left_join(dict, by = c("item1" = "item")) %>%
  select(group, item1, type1 = type, type2, mean) %>%
  ungroup() %>%
  mutate(category = ifelse(type1 == type2, "within", "between")) %>%
  select(-type1, -type2) %>%
  group_by(group, item1) %>%
  spread(category, mean) %>%
  mutate(relative.type.dist = within / between)

# Bootstrapped summary of the ratio for every type x half cell.
ms <- all.dists.item.ms.gp %>%
  left_join(rename(select(dict, -theme), type1 = type),
            by = c("item1" = "item")) %>%
  group_by(type1, group) %>%
  multi_boot_standard(column = "relative.type.dist") %>%
  mutate(group = fct_relevel(group, c("low", "high")))

# Bars show the mean ratio; line ranges span ci_lower to ci_upper.
ms %>%
  ggplot(aes(y = mean, x = group)) +
  geom_bar(stat = "identity") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  facet_wrap(~ type1) +
  theme_bw() +
  xlab("age group") +
  ylab("relative mean pairwise distance")

post-hoc median vocab split

# Pairwise distances for both halves of the median vocabulary split, with the
# type and theme of both pair members attached (suffix 1 = item1, 2 = item2).
# This overwrites the `all.dists` computed for the previous split.
all.dists <- c("low", "high") %>%
  map(getMeanDist2, d.m.vocab) %>%
  bind_rows() %>%
  left_join(rename(dict, type1 = type, theme1 = theme),
            by = c("item1" = "item")) %>%
  left_join(rename(dict, type2 = type, theme2 = theme),
            by = c("item2" = "item"))

theme

# Median vocabulary split: mean distance from each item to the items of each
# theme, per half.
all.dists.item.ms <- all.dists %>%
  group_by(item1, theme2, group) %>%
  summarise(mean = mean(dist))

# Mark each mean as "within" (same theme as the item) or "between", put the
# two means side by side, and take their ratio per item and half.
all.dists.item.ms.gp <- all.dists.item.ms %>%
  left_join(dict, by = c("item1" = "item")) %>%
  select(group, item1, theme1 = theme, theme2, mean) %>%
  ungroup() %>%
  mutate(category = ifelse(theme1 == theme2, "within", "between")) %>%
  select(-theme1, -theme2) %>%
  group_by(group, item1) %>%
  spread(category, mean) %>%
  mutate(relative.theme.dist = within / between)

# Bootstrapped summary of the ratio for every theme x half cell.
ms <- all.dists.item.ms.gp %>%
  left_join(rename(select(dict, -type), theme1 = theme),
            by = c("item1" = "item")) %>%
  group_by(theme1, group) %>%
  multi_boot_standard(column = "relative.theme.dist") %>%
  mutate(group = fct_relevel(group, c("low", "high")))

# Bars show the mean ratio; line ranges span ci_lower to ci_upper.
ms %>%
  ggplot(aes(y = mean, x = group)) +
  geom_bar(stat = "identity") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  facet_wrap(~ theme1) +
  theme_bw() +
  xlab("vocab group") +
  ylab("relative mean pairwise distance")

type

# Median vocabulary split: mean distance from each item to the items of each
# type, per half.
all.dists.item.ms <- all.dists %>%
  group_by(item1, type2, group) %>%
  summarise(mean = mean(dist))

# Mark each mean as "within" (same type as the item) or "between", put the
# two means side by side, and take their ratio per item and half.
all.dists.item.ms.gp <- all.dists.item.ms %>%
  left_join(dict, by = c("item1" = "item")) %>%
  select(group, item1, type1 = type, type2, mean) %>%
  ungroup() %>%
  mutate(category = ifelse(type1 == type2, "within", "between")) %>%
  select(-type1, -type2) %>%
  group_by(group, item1) %>%
  spread(category, mean) %>%
  mutate(relative.type.dist = within / between)

# Bootstrapped summary of the ratio for every type x half cell.
ms <- all.dists.item.ms.gp %>%
  left_join(rename(select(dict, -theme), type1 = type),
            by = c("item1" = "item")) %>%
  group_by(type1, group) %>%
  multi_boot_standard(column = "relative.type.dist") %>%
  mutate(group = fct_relevel(group, c("low", "high")))

# Bars show the mean ratio; line ranges span ci_lower to ci_upper.
ms %>%
  ggplot(aes(y = mean, x = group)) +
  geom_bar(stat = "identity") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  facet_wrap(~ type1) +
  theme_bw() +
  xlab("vocab group") +
  ylab("relative mean pairwise distance")

Grant plots

For the grant, I was planning on only showing the young and middle (3-4 and 5-6 yo) embeddings. This simplifies things a bit, and is nice because we have the same sample size in both. I was also going to collapse across the wild-farm theme distinction since the results are not very clear there.

# Procrustes transformation
# Pull the x/y coordinates of the two youngest groups out of `d` as numeric
# matrices. Note that this overwrites the `young` and `middle` data frames
# read from the model files.
young <- d %>%
  filter(group == "young") %>%
  select(x, y) %>%
  as.matrix()

middle <- d %>%
  filter(group == "middle") %>%
  select(x, y) %>%
  as.matrix()

# procrustes(young, middle): `$X` and `$Yrot` are stacked below as the young
# and the middle coordinates respectively.
# (A procOPA() alignment used to be computed first, but its result was
# overwritten before being used, so only the procrustes() version is kept.)
ym <- procrustes(young, middle)

# Stack both sets of coordinates and re-attach group, item name and item
# metadata. The number of rows per group is taken from the data rather than
# hard-coded.
# NOTE(review): item names are assigned by position, which assumes the rows
# of `d` within each group are still in the order of `labs` -- confirm.
m <- rbind(ym$X, ym$Yrot) %>%
  as.data.frame() %>%
  mutate(group = rep(c("young", "middle"),
                     times = c(nrow(young), nrow(middle))),
         item = rep(unlist(labs$item), times = 2)) %>%
  left_join(dict, by = "item") %>%
  mutate(group = fct_relevel(group, c("young", "middle")))

# Panel titles shown instead of the raw group codes.
age_names <- c(young = "3-4 year olds", middle = "5-6 year olds")

# Aligned embeddings of the two youngest groups: items drawn as text labels
# coloured by type, with no legend and no axis annotation.
m %>%
  ggplot(aes(x, y, label = item, colour = type)) +
  geom_text(size = 3) +
  facet_grid(. ~ group, labeller = labeller(group = as_labeller(age_names))) +
  theme_bw() +
  theme(
    axis.title = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    strip.background = element_rect(fill = "grey", colour = "grey"),
    strip.text.x = element_text(size = 14),
    legend.position = "none"
  )

Pairwise-relative distance for type (bird vs. mammal), including adult control group.

# Panel titles shown instead of the raw type codes.
cats <- c(bird = "bird category", mammal = "mammal category")

# Relative type distance per age group, leaving out the "old" group.
# Bars show the mean; line ranges span ci_lower to ci_upper.
ms.crit %>%
  filter(group != "old") %>%
  ggplot(aes(y = mean, x = group)) +
  geom_bar(stat = "identity", fill = "darkgrey") +
  scale_x_discrete(
    "age group",
    labels = c("young" = "3-4yo", "middle" = "5-6yo", "adult" = "adults")
  ) +
  facet_grid(. ~ type1, labeller = labeller(type1 = as_labeller(cats))) +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  ylab("relative mean pairwise distance") +
  theme_bw() +
  theme(strip.text.x = element_text(size = 14))

The takeaway here is that both the bird and mammal categories get more distinct with age, but the mammal category takes longer to develop (and is overall less distinct?).

NextKids pilot embeddings

Molly Lewis

2017-04-05

Embeddings

young-middle-old

Points

Labels

post-hoc median age split

Points

Labels

post-hoc median vocab split

Points

Labels

Pairwise distance

young-middle-old split

theme

type

post-hoc median age split

theme

type

post-hoc median vocab split

theme

type

Grant plots