library(knitr)

opts_chunk$set(echo = T, message = F, warning = F, 
               error = F, cache = F, tidy = F)

library(tidyverse)
library(langcog)
library(tidyr)
library(broom)
theme_set(theme_classic(base_size = 10))

Words at t1

by mtld delta group

t1_words <- target_types %>%
  left_join(groups_info %>% 
              mutate(target_child_id = as.numeric(target_child_id))) %>%
  filter(tbin == "t1") %>%
  distinct(tbin, delta_resid_group, gloss)

# get all t1 low/high words
t1_words_low <- filter(t1_words, delta_resid_group == "low")
t1_words_high <- filter(t1_words, delta_resid_group == "high")
prop_from_each_group_said_word_t1 <- target_types %>%
  filter(tbin == "t1") %>%
  count(gloss, delta_resid_group) %>%
  group_by(delta_resid_group) %>%
  mutate(total = case_when(delta_resid_group == "low" ~ 
                              nrow(filter(groups_info, delta_resid_group == "low")),
                           delta_resid_group == "high"~ 
                             nrow(filter(groups_info, delta_resid_group == "high")))) %>%
  mutate(prop = n / total)  %>%
  select(-total, -n) %>%
  group_by(delta_resid_group, gloss) %>%
  arrange(delta_resid_group, -prop) 

diff_in_props_between_groups <- prop_from_each_group_said_word_t1 %>%
  spread(delta_resid_group, prop) %>%
  mutate(high = ifelse(is.na(high), 0, high),
         low = ifelse(is.na(low), 0, low),
         diff = high - low) %>%
  arrange(-diff) %>%
  ungroup() 

DT::datatable(diff_in_props_between_groups, 
              caption = "Proportion of kids in each group that said each word, and the difference in proportions.")

Diff = prop high - prop low that know a word

Diff score correlation with other predictors

concreteness <- read_csv("data/control_variables/brysbaert_corpus.csv")
freq <- read_tsv("data/control_variables/SUBTLEXus_corpus.txt")
babbiness <- read_csv("data/control_variables/babiness_iconicity.csv")

diff_with_predictors <- diff_in_props_between_groups %>%
  left_join(concreteness %>% select(Word, Conc.M) %>% distinct(), by = c("gloss" = "Word")) %>%
  left_join(freq %>% select(Word, Lg10WF) %>% distinct(), by = c("gloss" = "Word")) %>%
  left_join(babbiness %>% select(word, iconicity, babiness)  %>% distinct(), by = c("gloss" = "word")) %>%
    rename(concreteness = Conc.M,
          log_frequency = Lg10WF,
          prop_group_diff = diff)

diff_with_predictors_long <- diff_with_predictors %>%
  select(-high, -low) %>%
  gather(measure, value, -gloss, -prop_group_diff) 

ggplot(diff_with_predictors_long, aes(x = value, y = prop_group_diff)) +
  geom_point(aes(color = measure), size = .6) +
  ggtitle("Diff score correlations with other measures") +
  geom_smooth(method = "lm") +
  facet_wrap(~measure, scales = "free") +
  theme_classic() +
  theme(legend.position = "none")

diff_with_predictors_long %>%
  group_by(measure) %>%
  do(tidy(cor.test(.$prop_group_diff, .$value))) %>%
  select(measure, estimate, statistic, p.value) %>%
  kable()
measure estimate statistic p.value
babiness -0.0718031 -1.271577 0.2044705
concreteness -0.0647499 -2.454545 0.0142247
iconicity -0.0291726 -0.515510 0.6065618
log_frequency 0.2952014 12.172032 0.0000000

Kid level analyses

categorical MTLD groups

The units here are invidiual kid’s vocabulary sets at t1.

kid_measures_long <- target_types %>%
  filter(tbin == "t1") %>%
  left_join(diff_in_props_between_groups %>% select(gloss, diff)) %>%
  left_join(concreteness %>% select(Word, Conc.M) %>% distinct(), by = c("gloss" = "Word")) %>%
  left_join(freq %>% select(Word, Lg10WF) %>% distinct(), by = c("gloss" = "Word")) %>%
  left_join(babbiness %>% select(word, iconicity, babiness)  %>% distinct(), by = c("gloss" = "word"))  %>%
  rename(concreteness = Conc.M,
         log_frequency = Lg10WF,
         prop_group_diff = diff) %>%
  gather("measure", "value", c(-1:-10)) %>%
  mutate(measure = as.factor(measure))  

kid_ms <- kid_measures_long %>%
  group_by(target_child_id, measure, delta_resid_group) %>%
  summarize(mean_value = mean(value, na.rm = T))

kid_ms %>%
  group_by(delta_resid_group, measure) %>%
  multi_boot_standard(col = "mean_value", na.rm = T)  %>% 
  ggplot(aes(x = delta_resid_group, y = mean, 
              group = delta_resid_group, fill = delta_resid_group)) +
  facet_wrap(~measure, scales = "free_y", ncol = 5) +
  ylab("predictor") +
  xlab("vocab group") +
  geom_bar(position = "dodge", stat = "identity") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper), 
                 position=position_dodge(width=0.9)) +
  theme_classic() +
  theme(legend.position = "none")

continuous MTLD

Here’s the same analysis as above but now i’m using the continous MTLD score.

kid_ms_continuous_long <- kid_ms %>%
  left_join(groups_info %>% select(target_child_id, delta_resid))  %>%
  ungroup()

ggplot(kid_ms_continuous_long, aes(x = mean_value, y = delta_resid)) +
  geom_point(aes(color = measure), size = .6) +
  ggtitle("MTLD scores predicted by other measures") +
  geom_smooth(method = "lm") +
  facet_wrap(~measure, scales = "free") +
  theme_classic() +
  theme(legend.position = "none")

kid_ms_continuous_long %>%
  group_by(measure) %>%
  do(tidy(cor.test(.$delta_resid, .$mean_value))) %>%
  select(measure, estimate, statistic, p.value) %>%
  kable()
measure estimate statistic p.value
babiness 0.0304915 0.2828981 0.7779348
concreteness 0.1457034 1.3736903 0.1730672
iconicity -0.1819339 -1.7158213 0.0897953
log_frequency -0.2197489 -2.1250509 0.0363538
prop_group_diff 0.0818177 0.7788019 0.4381393