library(knitr)

# Global knitr chunk options: echo the code, hide messages and warnings,
# and disable error preservation, caching and code tidying.
# Spelled with TRUE/FALSE because T and F are ordinary, reassignable variables.
opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE,
               error = FALSE, cache = FALSE, tidy = FALSE)

library(tidyverse)
library(langcog)
library(data.table)
library(feather)
library(broom)

# Make theme_classic (base_size = 10) the default ggplot2 theme for every
# plot drawn after this point.
theme_set(theme_classic(base_size = 10))

Do kids who know more words at T2 (as measured by MTLD) know words at T1 that are more “useful”? In particular, we define useful words as words that allow you to form more of the trigrams found in adult speech, using the COCA spoken corpus (https://corpus.byu.edu/coca/).

FINDINGS:

Group level

# Get all unique words for two groups at t1.
# Step 1: read the group assignments and the target word types. In the types
# table the time-bin labels are recoded ("low" -> t1, "high" -> t2) and the
# glosses are lower-cased.
groups_info <- read_csv("groups_info.csv")
target_types <- mutate(
  read_csv("target_types_delta_450_1150.csv"),
  tbin = fct_recode(tbin, t1 = "low", t2 = "high"),
  gloss = tolower(gloss)
)

# Minimum number of distinct kids a word must be attested for to be kept.
MIN_KIDS_PRODUCED_WORD <- 5

# Words attested for at least MIN_KIDS_PRODUCED_WORD distinct kids.
# summarise() replaces the former count() %>% count() chain: the name of the
# column produced by the second count() differs across dplyr versions
# (`nn` in older releases, `n` in newer ones), so filtering on `nn` is not
# portable. The `nn` column name is kept for any code that relies on it.
good_words <- target_types %>%
  group_by(gloss) %>%
  summarise(nn = n_distinct(target_child_id)) %>% # kids per word
  filter(nn >= MIN_KIDS_PRODUCED_WORD) # at least n kids

# Kids with at least one retained word in more than one time bin.
good_kids <- target_types %>%
  filter(gloss %in% good_words$gloss) %>%
  group_by(target_child_id) %>%
  summarise(nn = n_distinct(tbin)) %>% # time bins per kid
  filter(nn > 1) # two time points?

# Restrict the types table to retained words produced by retained kids.
good_types <- target_types %>%
  filter(gloss %in% good_words$gloss) %>%
  filter(target_child_id %in% good_kids$target_child_id)

# Distinct t1 words per delta_resid_group. Group info is joined on the shared
# columns; its child id is coerced to numeric first (presumably to match the
# id type in good_types -- confirm against the CSVs).
t1_words <- left_join(
  good_types,
  mutate(groups_info, target_child_id = as.numeric(target_child_id))
) %>%
  filter(tbin == "t1") %>%
  distinct(tbin, delta_resid_group, gloss)
# Trigram table (three words plus the prob3 and prob2_1 columns), converted to
# a data.table so rows can be filtered with bare column names inside `[`.
coca_trigrams <- read_csv("results_freq_prob_coca.csv") %>%
  select(w1, w2, w3, prob3, prob2_1) %>%
  data.table()

# t1 vocabulary of each group as a plain vector of glosses.
t1_words_high <- t1_words %>%
  filter(delta_resid_group == "high") %>%
  pull(gloss)

t1_words_low <- t1_words %>%
  filter(delta_resid_group == "low") %>%
  pull(gloss)

# Keep the trigrams whose three words all belong to the group's t1 vocabulary.
high_trigrams <- coca_trigrams[w1 %in% t1_words_high &
                                 w2 %in% t1_words_high &
                                 w3 %in% t1_words_high] %>%
  mutate(vocab_group = "high")

low_trigrams <- coca_trigrams[w1 %in% t1_words_low &
                                w2 %in% t1_words_low &
                                w3 %in% t1_words_low] %>%
  mutate(vocab_group = "low")

# Stack both groups; vocab_group records which vocabulary a row came from.
all_trigrams <- bind_rows(high_trigrams, low_trigrams)

# Histogram of trigram prob3 values, filled by vocabulary group.
all_trigrams %>%
  ggplot(aes(x = prob3, fill = vocab_group, group = vocab_group)) +
  geom_histogram()

# Mean prob3 per vocabulary group with lower/upper confidence bounds from
# langcog's multi_boot_standard(); NAs in prob3 are dropped.
all_trigrams %>%
  group_by(vocab_group) %>%
  multi_boot_standard(col = "prob3", na.rm = TRUE)
## # A tibble: 2 x 4
## # Groups:   vocab_group [?]
##   vocab_group ci_lower ci_upper  mean
##   <chr>          <dbl>    <dbl> <dbl>
## 1 high           -1.90    -1.89 -1.90
## 2 low            -1.91    -1.89 -1.90

Individual Kids

# All retained kid-word rows from the t1 bin, with group info attached.
# The child id in groups_info is coerced to numeric before the join.
good_types_t1 <- left_join(
  good_types,
  mutate(groups_info, target_child_id = as.numeric(target_child_id))
) %>%
  filter(tbin == "t1")

# Mean prob3 over the trigrams that can be built from one kid's vocabulary.
#
# df:           data frame with a `gloss` column holding the kid's words.
# all_trigrams: table with columns w1, w2, w3 and prob3 (a data.table, tibble
#               or plain data.frame).
#
# Returns a single number: the mean of prob3 (NAs dropped) over the rows of
# all_trigrams whose w1, w2 and w3 are all found in df$gloss. The result is
# NaN when no trigram qualifies.
get_trigrams_by_kid <- function(df, all_trigrams) {
  vocab <- df$gloss

  # Columns are indexed explicitly instead of relying on data.table's scoping
  # of bare column names inside `[`, so any data frame type is accepted.
  in_vocab <- all_trigrams$w1 %in% vocab &
    all_trigrams$w2 %in% vocab &
    all_trigrams$w3 %in% vocab

  mean(all_trigrams$prob3[in_vocab], na.rm = TRUE)
}

# Lookup table: one row per distinct (kid, delta_resid_group) pair seen at t1.
group_ids <- distinct(good_types_t1, target_child_id, delta_resid_group)

# Per-kid mean trigram prob3 at t1, with each kid's group label attached.
# group_by() + nest() and map_dbl() replace the deprecated nest(-col) and
# bare unnest() forms; the join key is spelled out instead of being guessed
# from the shared column names.
trigram_freqs_by_kid <- good_types_t1 %>%
  group_by(target_child_id, tbin) %>%
  nest() %>% # one row per kid/time bin; other columns go into `data`
  ungroup() %>%
  mutate(mean_trigram_frequency =
           map_dbl(data, get_trigrams_by_kid, coca_trigrams)) %>%
  select(-data) %>%
  left_join(group_ids, by = "target_child_id")


# Density of the per-kid means, one semi-transparent curve per group.
trigram_freqs_by_kid %>%
  ggplot(aes(x = mean_trigram_frequency,
             fill = delta_resid_group,
             group = delta_resid_group)) +
  geom_density(alpha = 0.4)

# Mean of the per-kid values in each delta_resid_group, with lower/upper
# confidence bounds from langcog's multi_boot_standard(); NA/NaN kid means
# (kids with no matching trigram) are dropped.
trigram_freqs_by_kid %>%
  group_by(delta_resid_group) %>%
  multi_boot_standard(col = "mean_trigram_frequency",
                      na.rm = TRUE)
## # A tibble: 2 x 4
## # Groups:   delta_resid_group [?]
##   delta_resid_group ci_lower ci_upper  mean
##   <chr>                <dbl>    <dbl> <dbl>
## 1 high                 -1.98    -1.73 -1.87
## 2 low                  -1.91    -1.68 -1.80