Looking at the longitudinal development of vocab for a single child, what does the trajectory of complexity bias look like?

# Load the word-level data and the complexity norms, then keep only the
# words that appear in both tables (joined on `word`).
roy <- read.csv("roy_et_al_data.csv")
complexity <- read.csv("cdi_complexity_norms.csv")

d <- roy %>%
  inner_join(complexity, by = "word")

# Word length against mean complexity rating for every word in `d`,
# with a linear fit overlaid on the points.
d %>%
  ggplot(aes(x = mean, y = wordLength)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    x = "Mean complexity rating",
    y = "Length",
    title = "Complexity bias in wordbirth data"
  ) +
  theme_bw(base_size = 18)

# Correlation test between word length and mean complexity rating,
# reported as a table of estimate, test statistic and p-value.
# cor.test() has no `na.rm` argument (the one previously passed was silently
# absorbed by `...`); it already restricts itself to complete pairs.
cor.test(d$wordLength, d$mean) %>%
  tidy() %>%
  select(estimate, statistic, p.value) %>%
  kable()
estimate statistic p.value
0.3375865 6.680736 0

Overall, word length and mean complexity rating are correlated at r = .34 across the vocabulary.

Now, let’s look at how that correlation changes over time.

# Number of words per rounded age of acquisition (aoa).
m <- d %>%
  group_by(aoa.round = round(aoa)) %>%
  summarise(n = n())

4-month chunks (the bin width used in the code below). Non-cumulative.

# Keep words whose rounded aoa is below 23, bin the rounded aoa into
# intervals of width 4, and retain only the columns needed for plotting.
d.f <- d %>%
  mutate(aoa.round = round(aoa)) %>%
  filter(aoa.round < 23) %>%
  transmute(
    word,
    aoa.cut = cut_width(aoa.round, width = 4),
    mean,
    wordLength
  )

# Same length-vs-complexity scatter as above, one panel per aoa bin.
ggplot(d.f, aes(x = mean, y = wordLength)) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_grid(. ~ aoa.cut) +
  labs(
    x = "Mean complexity rating",
    y = "Length",
    title = "Age"
  ) +
  theme_bw(base_size = 18)

Cumulative

# Split d.f by aoa bin. The strings must match the aoa.cut labels exactly;
# a label that matches nothing yields an empty data frame, not an error.
c1 <- d.f %>% filter(aoa.cut == "[10,14]")
c2 <- d.f %>% filter(aoa.cut == "(14,18]")
c3 <- d.f %>% filter(aoa.cut == "(18,22]")

# Cumulative snapshots: each `age` step holds its own bin plus all earlier bins.
cc1 <- c1 %>% mutate(age = 1)
cc2 <- mutate(rbind(c1, c2), age = 2)
cc3 <- mutate(rbind(c1, c2, c3), age = 3)

# Stack the three snapshots so they can be faceted by `age`.
d.devo.cumulative <- rbind(cc1, cc2, cc3)

# Length-vs-complexity scatter with a linear fit, one panel per cumulative
# `age` step.
d.devo.cumulative %>%
  ggplot(aes(x = mean, y = wordLength)) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_grid(. ~ age) +
  labs(
    x = "Mean complexity rating",
    y = "Length",
    title = "Age"
  ) +
  theme_bw(base_size = 18)

Are the entropy measures correlated with complexity?

# Row-wise average of the three KL columns; a row with a missing value in
# any of them gets NA (no na.rm is applied).
d$mean.KL <- with(d, rowMeans(cbind(srl.topic.KL, srl.temp.KL, srl.sp.KL)))

    # Labelled correlation matrix over the selected columns of `d`.
    ggcorr(
      select(
        d,
        aoa, s.cmu.phon, sln.freq.pre, s.uttlen.pre,
        s.mrc.conc, s.mrc.imag, s.mrc.fam,
        mean.KL, mean, wordLength
      ),
      label = TRUE
    )

    # Correlation test between srl.temp.KL and mean complexity rating,
    # reported as a table of estimate, test statistic and p-value.
    # cor.test() has no `na.rm` argument (the one previously passed was
    # silently absorbed by `...`); it already restricts itself to complete pairs.
    cor.test(d$srl.temp.KL, d$mean) %>%
      tidy() %>%
      select(estimate, statistic, p.value) %>%
      kable()
estimate statistic p.value
-0.0315192 -0.5874291 0.5572976

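Only spatial entropy is correlated with complexity, and the correlation is negative (the test shown above is for srl.temp.KL, which is not reliably correlated with complexity: r = -.03, p = .56). Somewhat puzzling – objects with higher spatial entropy are less conceptually complex.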