library(knitr)

# Global knitr chunk options: echo the code, hide messages and warnings, stop
# on errors (error = FALSE), and disable caching and code tidying.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable names.
opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE,
               error = FALSE, cache = FALSE, tidy = FALSE)

library(tidyverse)
library(langcog)
library(data.table)
library(feather)
library(broom)

# Default ggplot2 theme for every plot in this document: classic theme with a
# 10pt base font size.
theme_set(theme_classic(base_size = 10))

Do kids who know words at T2 (using the MTLD measure) know words that are more “useful” at T1? In particular, we define useful words as words that allow you to make more trigrams in the Switchboard corpus (http://www.anc.org/data/oanc/contents/).

Group level

# Get the most common words at t1 for each of the two groups.
#
# groups_info:  one row per child, including the child's delta_resid_group.
# target_types: the glosses recorded for each child in each time bin; the
#               "low"/"high" levels of tbin are relabelled t1/t2 and glosses
#               are lower-cased so they can be matched against the trigram
#               corpus.
groups_info <- read_csv("../1_exploration/groups_info.csv")
target_types <- read_csv("../1_exploration/target_types_delta_450_1150.csv") %>%
  mutate(tbin = fct_recode(tbin,
                           t1 = "low",
                           t2 = "high")) %>%
  mutate(gloss = tolower(gloss))

# Maximum number of glosses kept per group.
TOPNWORDS_PER_GROUP <- 500

# t1_words: for each group, the TOPNWORDS_PER_GROUP glosses with the highest
# row counts at t1 (columns: delta_resid_group, gloss, n).
#
# The rows are sorted by count BEFORE truncating. Previously top_n() kept all
# ties and slice() then cut the (alphabetically ordered) rows to the first N,
# which could drop frequent words that happen to sort late alphabetically.
# arrange() is stable, so ties in n are broken alphabetically by gloss.
t1_words <- target_types %>%
  filter(tbin == "t1") %>% # t1 only
  # No `by` is given, so the join uses every column the two tables share.
  left_join(groups_info %>%
              mutate(target_child_id = as.numeric(target_child_id))) %>%
  group_by(delta_resid_group, gloss) %>% # one row per group x gloss
  summarize(n = n()) %>% # number of rows for this gloss within the group
  ungroup() %>%
  arrange(delta_resid_group, desc(n)) %>%
  group_by(delta_resid_group) %>%
  slice(seq_len(TOPNWORDS_PER_GROUP)) %>% # top N per group, no ties overflow
  # Drop the grouping so later select()/pull() calls do not silently carry
  # delta_resid_group along.
  ungroup()
# Switchboard trigram table (columns used below: w1, w2, w3, freq), held as a
# data.table so rows can be subset with column names directly inside `[`.
sw_trigrams <- read_csv("switchboard_trigrams.csv") %>%
  data.table()

# Plain character vectors of each group's t1 vocabulary.
# ungroup() + pull() is deliberate: if t1_words is still grouped by
# delta_resid_group, select(gloss) re-adds the grouping column and unlist()
# would then mix the literal labels "high"/"low" into the word list.
t1_words_high <- t1_words %>%
  ungroup() %>%
  filter(delta_resid_group == "high") %>%
  pull(gloss)

t1_words_low <- t1_words %>%
  ungroup() %>%
  filter(delta_resid_group == "low") %>%
  pull(gloss)

# Keep only trigrams whose three words are ALL in the group's vocabulary.
high_trigrams <- sw_trigrams[w1 %in% t1_words_high &
                               w2 %in% t1_words_high &
                               w3 %in% t1_words_high] %>%
  mutate(vocab_group = "high")

low_trigrams <- sw_trigrams[w1 %in% t1_words_low &
                              w2 %in% t1_words_low &
                              w3 %in% t1_words_low] %>%
  mutate(vocab_group = "low")

# Stack both groups' trigrams; vocab_group records which vocabulary each row
# came from (a trigram can appear once per group).
all_trigrams <- bind_rows(high_trigrams, low_trigrams)

 # Bar plot of the mean log trigram frequency for each vocabulary group.
 # multi_boot_standard() summarises log_freq per group into the columns
 # mean, ci_lower and ci_upper, which are drawn as bars with line ranges.
 # NOTE(review): the trailing theme_classic() replaces the document-wide
 # theme for this plot, including its base font size - confirm that is wanted.
 all_trigrams %>%
  mutate(log_freq = log(freq)) %>%
  group_by(vocab_group) %>%
  multi_boot_standard(col = "log_freq", na.rm = TRUE) %>%
  ggplot(aes(x = vocab_group, y = mean,
             group = vocab_group, fill = vocab_group)) +
  geom_bar(position = "dodge", stat = "identity") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper),
                 position = position_dodge(width = 0.9)) +
  theme_classic()

  #summarize(mean_freq= mean(log_freq),
  #          ci_lower_freq = mean(log_freq) - (1.96 * sd(log_freq)),
  #          ci_upper_freq = mean(log_freq) + (1.96 * sd(log_freq)))
  

 # Table of the mean raw (untransformed) trigram frequency per vocabulary
 # group, with the confidence bounds returned by multi_boot_standard().
 all_trigrams %>%
  group_by(vocab_group) %>%
  multi_boot_standard(col = "freq", na.rm = TRUE) %>%
  kable()
vocab_group ci_lower ci_upper mean
high 4.069088 4.361839 4.208633
low 2.027058 2.460856 2.219358

Individual Kids

# Per-child t1 word types, restricted to glosses that appear in the group-level
# t1 vocabulary (t1_words), with each child's group information attached.
good_types_t1 <- left_join(
  target_types,
  mutate(groups_info, target_child_id = as.numeric(target_child_id))
) %>%
  filter(tbin == "t1",
         gloss %in% t1_words[["gloss"]])

# Mean log frequency of the trigrams a child can form from their vocabulary.
#
# df:           data frame with a `gloss` column holding the child's words.
# all_trigrams: trigram table with columns w1, w2, w3 and freq. Any data
#               frame works (data.table, tibble or base data.frame); the rows
#               are selected with an explicit logical mask rather than
#               data.table's non-standard `[` evaluation.
#
# Returns a single number: mean(log(freq)) over the trigrams whose three words
# are all in df$gloss, or NA_real_ when no trigram qualifies.
get_trigrams_by_kid <- function(df, all_trigrams) {
  kid_words <- df$gloss

  # A trigram counts only if every one of its words is known to the child.
  all_known <- all_trigrams$w1 %in% kid_words &
    all_trigrams$w2 %in% kid_words &
    all_trigrams$w3 %in% kid_words

  matched_freqs <- all_trigrams$freq[all_known]

  # mean() of an empty vector is NaN; return an explicit missing value instead
  # so "no usable trigrams" is unambiguous (still dropped by na.rm = TRUE).
  if (length(matched_freqs) == 0) {
    return(NA_real_)
  }

  mean(log(matched_freqs))
}


# One row per child (and time bin) with the mean log Switchboard frequency of
# the trigrams that child's t1 words can form, plus the child's group label.
trigram_freqs_by_kid <- good_types_t1 %>%
  nest(-target_child_id, -tbin) %>% # one nested data frame of words per child
  mutate(log_mean_trigram_frequency =
           # get_trigrams_by_kid() returns a single number per child, so
           # map_dbl() yields a plain numeric column (no list column to unnest).
           map_dbl(data, get_trigrams_by_kid, sw_trigrams)) %>%
  select(-data) %>%
  left_join(groups_info %>%
              # Same id cast as the other joins against groups_info, so the
              # key types match.
              mutate(target_child_id = as.numeric(target_child_id)) %>%
              select(target_child_id, delta_resid_group))


# Overlaid, semi-transparent density curves of the per-child mean log trigram
# frequency, one curve per delta_resid_group.
trigram_freqs_by_kid %>%
  ggplot(aes(x = log_mean_trigram_frequency,
             group = delta_resid_group,
             fill = delta_resid_group)) +
  geom_density(alpha = .4) +
  theme_classic()

# Bar plot of the group means of the per-child mean log trigram frequency.
# multi_boot_standard() summarises the column per group into mean, ci_lower
# and ci_upper; children without a usable value are dropped via na.rm = TRUE.
trigram_freqs_by_kid %>%
  group_by(delta_resid_group) %>%
  multi_boot_standard(col = "log_mean_trigram_frequency", na.rm = TRUE) %>%
  ggplot(aes(x = delta_resid_group, y = mean,
             group = delta_resid_group, fill = delta_resid_group)) +
  geom_bar(position = "dodge", stat = "identity") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper),
                 position = position_dodge(width = 0.9)) +
  theme_classic()