library(knitr)
# Global knitr chunk options: show the code, suppress messages and warnings,
# stop rendering on errors, and turn off caching and code tidying.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable
# variables.
opts_chunk$set(
  echo = TRUE, message = FALSE, warning = FALSE,
  error = FALSE, cache = FALSE, tidy = FALSE
)
library(tidyverse)
library(langcog)
library(data.table)
library(feather)
library(broom)
theme_set(theme_classic(base_size = 10))

Do kids that know more words at T2 (using the MTLD measure) know words that are more “useful” at T1? In particular, we define useful words to be words that allow you to make more trigrams in the Switchboard corpus (http://www.anc.org/data/oanc/contents/).
# Get all unique words for two groups at t1.

# Group assignment for each child.
groups_info <- read_csv("../1_exploration/groups_info.csv")

# Word types per child and time bin. The time-bin labels are recoded
# ("low" becomes t1, "high" becomes t2) and glosses are lower-cased.
target_types <- read_csv("../1_exploration/target_types_delta_450_1150.csv") %>%
  mutate(
    tbin = fct_recode(tbin, t1 = "low", t2 = "high"),
    gloss = tolower(gloss)
  )
# Maximum number of words kept for each group.
TOPNWORDS_PER_GROUP <- 500

# For each group, count the t1 rows carrying each gloss and keep the
# TOPNWORDS_PER_GROUP glosses with the highest counts.
t1_words <- target_types %>%
  filter(tbin == "t1") %>% # t1 only
  left_join(
    groups_info %>%
      mutate(target_child_id = as.numeric(target_child_id))
  ) %>% # merge in group info
  group_by(delta_resid_group, gloss) %>% # get glosses by group
  summarize(n = n()) %>% # get gloss counts by group
  # Sort by count BEFORE truncating. The previous top_n() + slice(1:N) kept
  # every tie at the cut-off and then cut the list in alphabetical gloss
  # order, which could drop high-count words late in the alphabet.
  arrange(delta_resid_group, desc(n)) %>%
  group_by(delta_resid_group) %>%
  filter(row_number() <= TOPNWORDS_PER_GROUP) %>%
  # Return an ungrouped table so that later select() calls do not silently
  # re-attach the grouping column.
  ungroup()

# Switchboard trigram table, converted to a data.table so it can be
# row-filtered with `[` on its w1 / w2 / w3 columns.
sw_trigrams <- read_csv("switchboard_trigrams.csv") %>%
  data.table()
# Character vectors holding the top t1 words of each group.
# pull() extracts exactly the gloss column. select(gloss) on a grouped tibble
# also keeps the grouping column, so the former select() %>% unlist() could
# leak the group labels ("high" / "low") into the word vectors.
t1_words_high <- t1_words %>%
  filter(delta_resid_group == "high") %>%
  pull(gloss)
t1_words_low <- t1_words %>%
  filter(delta_resid_group == "low") %>%
  pull(gloss)
# Trigrams each group can build purely from its own t1 vocabulary: all three
# positions (w1, w2, w3) must be words of that group.
high_trigrams <- sw_trigrams[
  w1 %in% t1_words_high & w2 %in% t1_words_high & w3 %in% t1_words_high
] %>%
  mutate(vocab_group = "high")

low_trigrams <- sw_trigrams[
  w1 %in% t1_words_low & w2 %in% t1_words_low & w3 %in% t1_words_low
] %>%
  mutate(vocab_group = "low")

# Stack both sets; vocab_group records which group each row belongs to.
all_trigrams <- bind_rows(high_trigrams, low_trigrams)
# Bar plot of the mean log trigram frequency per vocabulary group, with the
# ci_lower / ci_upper interval from multi_boot_standard() drawn as a line range.
all_trigrams %>%
  mutate(log_freq = log(freq)) %>%
  group_by(vocab_group) %>%
  multi_boot_standard(col = "log_freq", na.rm = TRUE) %>%
  ggplot(
    aes(
      x = vocab_group,
      y = mean,
      fill = vocab_group,
      group = vocab_group
    )
  ) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_linerange(
    aes(ymin = ci_lower, ymax = ci_upper),
    position = position_dodge(width = 0.9)
  ) +
  theme_classic()
# Earlier hand-rolled summary (mean +/- 1.96 * sd), kept for reference:
# summarize(mean_freq= mean(log_freq),
# ci_lower_freq = mean(log_freq) - (1.96 * sd(log_freq)),
# ci_upper_freq = mean(log_freq) + (1.96 * sd(log_freq)))
# Mean raw trigram frequency per vocabulary group, with the ci_lower /
# ci_upper bounds from multi_boot_standard(), rendered as a table.
all_trigrams %>%
  group_by(vocab_group) %>%
  multi_boot_standard(col = "freq", na.rm = TRUE) %>%
  kable()

| vocab_group | ci_lower | ci_upper | mean |
|---|---|---|---|
| high | 4.069088 | 4.361839 | 4.208633 |
| low | 2.027058 | 2.460856 | 2.219358 |
# All t1 rows whose gloss is among the retained top words (of either group),
# with the child's group information joined on.
good_types_t1 <- target_types %>%
  filter(tbin == "t1", gloss %in% t1_words$gloss) %>%
  left_join(
    groups_info %>%
      mutate(target_child_id = as.numeric(target_child_id))
  )
# Mean log frequency of the trigrams that can be formed from one child's words.
#
# df:           data frame with a `gloss` column (the child's words).
# all_trigrams: data.table of trigrams with columns w1, w2, w3 and freq.
#
# Returns mean(log(freq)) over the trigrams whose three words all occur in
# df$gloss. Note that this is the mean of the logs, not the log of the mean.
get_trigrams_by_kid <- function(df, all_trigrams) {
  kid_vocab <- df$gloss
  kid_trigrams <- all_trigrams[
    w1 %in% kid_vocab & w2 %in% kid_vocab & w3 %in% kid_vocab
  ]
  mean(log(kid_trigrams$freq))
}
# One row per child and time bin: the mean log frequency of the Switchboard
# trigrams the child can form from their t1 words, plus the child's group.
trigram_freqs_by_kid <- good_types_t1 %>%
  nest(-target_child_id, -tbin) %>%
  mutate(
    # map_dbl() yields a plain numeric column directly, so the list column
    # and the argument-less unnest() it required are no longer needed.
    log_mean_trigram_frequency = map_dbl(
      data, get_trigrams_by_kid, sw_trigrams
    )
  ) %>%
  select(-data) %>%
  left_join(
    groups_info %>%
      # Same id coercion as the other joins against groups_info, so the key
      # types match.
      mutate(target_child_id = as.numeric(target_child_id)) %>%
      select(target_child_id, delta_resid_group),
    by = "target_child_id"
  )
# Density of the per-child mean log trigram frequency, one curve per group.
ggplot(
  trigram_freqs_by_kid,
  aes(
    x = log_mean_trigram_frequency,
    fill = delta_resid_group,
    group = delta_resid_group
  )
) +
  geom_density(alpha = .4) +
  theme_classic()

# Group-level mean of the per-child values, with the ci_lower / ci_upper
# interval from multi_boot_standard() drawn as a line range.
trigram_freqs_by_kid %>%
  group_by(delta_resid_group) %>%
  multi_boot_standard(col = "log_mean_trigram_frequency", na.rm = TRUE) %>%
  ggplot(
    aes(
      x = delta_resid_group,
      y = mean,
      group = delta_resid_group,
      fill = delta_resid_group
    )
  ) +
  geom_bar(position = "dodge", stat = "identity") +
  geom_linerange(
    aes(ymin = ci_lower, ymax = ci_upper),
    position = position_dodge(width = 0.9)
  ) +
  theme_classic()