# Document setup: knitr chunk defaults, package loading, default plot theme.
library(knitr)
# Spell out TRUE/FALSE: T and F are ordinary variables that can be reassigned.
opts_chunk$set(
  echo = TRUE, message = FALSE, warning = FALSE,
  error = FALSE, cache = FALSE, tidy = FALSE
)
# Load order is kept as-is because it determines which package masks which.
library(tidyverse)
library(langcog)
library(data.table)
library(feather)
library(broom)
# Default ggplot theme for every plot that does not set its own.
theme_set(theme_classic(base_size = 10))
Do kids who know more words at T2 (as measured by MTLD) know words that are more “useful” at T1? In particular, we define useful words as words that allow you to form more of the trigrams found in adult speech, using the COCA spoken corpus (https://corpus.byu.edu/coca/).
FINDINGS:
# Get all unique words for two groups at t1.
# Per-child group assignments.
groups_info <- read_csv("../1_exploration/groups_info.csv")

# Word types per child and time bin: relabel tbin ("low" -> t1, "high" -> t2)
# and lower-case the glosses.
target_types <- mutate(
  read_csv("../1_exploration/target_types_delta_450_1150.csv"),
  tbin = fct_recode(tbin, t1 = "low", t2 = "high"),
  gloss = tolower(gloss)
)
# Maximum number of words kept per delta_resid_group.
TOPNWORDS_PER_GROUP <- 500

# The most frequent t1 glosses within each delta_resid_group.
# n is the number of rows of target_types (at t1) carrying that gloss
# within the group. The result is ungrouped, sorted by group and then by
# descending n.
t1_words <- target_types %>%
  filter(tbin == "t1") %>% # t1 only
  left_join(groups_info %>%
              mutate(target_child_id = as.numeric(target_child_id))) %>% # merge in group info
  group_by(delta_resid_group, gloss) %>% # one row per group x gloss
  summarize(n = n()) %>% # gloss counts by group
  ungroup() %>%
  # Sort BEFORE cutting so the cut keeps the most frequent words; gloss is a
  # deterministic tie-break for words with equal counts at the cutoff.
  arrange(delta_resid_group, desc(n), gloss) %>%
  group_by(delta_resid_group) %>%
  slice(seq_len(TOPNWORDS_PER_GROUP)) %>%
  # Drop the grouping so later select()/pull() calls do not drag
  # delta_resid_group along with the selected column.
  ungroup()
# COCA trigram table (three words plus prob3 and freq3), held as a data.table
# so it can be filtered with data.table's `[` syntax below.
coca_trigrams <- data.table(
  select(read_csv("results_freq_prob_coca.csv"), w1, w2, w3, prob3, freq3)
)
# Character vectors of the t1 glosses for the "high" and "low" groups.
# t1_words is grouped by delta_resid_group; select() on a grouped table keeps
# the grouping column, which would put the group labels themselves into the
# word vectors. Ungroup and pull the gloss column alone.
t1_words_high <- t1_words %>%
  ungroup() %>%
  filter(delta_resid_group == "high") %>%
  pull(gloss)

t1_words_low <- t1_words %>%
  ungroup() %>%
  filter(delta_resid_group == "low") %>%
  pull(gloss)
# Trigrams whose three words all belong to the "high" group's t1 word list.
high_trigrams <- mutate(
  coca_trigrams[w1 %in% t1_words_high &
                  w2 %in% t1_words_high &
                  w3 %in% t1_words_high],
  vocab_group = "high"
)

# Same for the "low" group's t1 word list.
low_trigrams <- mutate(
  coca_trigrams[w1 %in% t1_words_low &
                  w2 %in% t1_words_low &
                  w3 %in% t1_words_low],
  vocab_group = "low"
)

# Both groups stacked, with the log of the trigram frequency added.
all_trigrams <- mutate(
  bind_rows(high_trigrams, low_trigrams),
  log_freq = log(freq3)
)
# Density of trigram probability (prob3), one curve per vocabulary group.
ggplot(all_trigrams,
       aes(x = prob3, fill = vocab_group, group = vocab_group)) +
  geom_density(alpha = .4)

# Mean prob3 per vocabulary group with an interval from
# multi_boot_standard() (supplies the mean, ci_lower and ci_upper columns).
all_trigrams %>%
  group_by(vocab_group) %>%
  multi_boot_standard(col = "prob3", na.rm = TRUE) %>%
  ggplot(aes(x = vocab_group, y = mean,
             group = vocab_group, fill = vocab_group)) +
  geom_col(position = "dodge") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper),
                 position = position_dodge(width = 0.9)) +
  # NOTE(review): this replaces the theme set via theme_set() (base_size = 10)
  # with the default-sized theme_classic() - confirm that is intended.
  theme_classic()
# Density of log trigram frequency (log_freq), one curve per vocabulary group.
ggplot(all_trigrams,
       aes(x = log_freq, fill = vocab_group, group = vocab_group)) +
  geom_density(alpha = .4)

# Mean log_freq per vocabulary group with an interval from
# multi_boot_standard() (supplies the mean, ci_lower and ci_upper columns).
all_trigrams %>%
  group_by(vocab_group) %>%
  multi_boot_standard(col = "log_freq", na.rm = TRUE) %>%
  ggplot(aes(x = vocab_group, y = mean,
             group = vocab_group, fill = vocab_group)) +
  geom_col(position = "dodge") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper),
                 position = position_dodge(width = 0.9)) +
  # NOTE(review): this replaces the theme set via theme_set() (base_size = 10)
  # with the default-sized theme_classic() - confirm that is intended.
  theme_classic()
# t1 word types with each child's group information merged in.
good_types_t1 <- target_types %>%
  filter(tbin == "t1") %>%
  left_join(mutate(groups_info,
                   target_child_id = as.numeric(target_child_id)))
# Mean trigram probability over the trigrams one child could produce.
#
# df:           data frame with a `gloss` column (the child's word types).
# all_trigrams: trigram table with columns w1, w2, w3 and prob3; may be a
#               data.table or any other data frame.
# Returns one number: the mean of prob3 (NAs dropped) over the trigrams whose
# three words all occur in df$gloss. NaN when no trigram qualifies.
get_trigrams_by_kid <- function(df, all_trigrams) {
  known_words <- df$gloss
  # Plain vector indexing instead of data.table's `[` so the function does not
  # depend on the class of the trigram table.
  producible <- all_trigrams$w1 %in% known_words &
    all_trigrams$w2 %in% known_words &
    all_trigrams$w3 %in% known_words
  mean(all_trigrams$prob3[producible], na.rm = TRUE)
}
# One row per child / delta_resid_group combination.
group_ids <- good_types_t1 %>%
  select(target_child_id, delta_resid_group) %>%
  distinct()
# Per-child mean trigram probability at t1, with the child's group attached.
trigram_freqs_by_kid <- good_types_t1 %>%
  # One row per child x tbin; the child's remaining columns go into `data`.
  # NOTE(review): legacy nest() call form kept - confirm the tidyr version in
  # use before modernising it.
  nest(-target_child_id, -tbin) %>%
  # get_trigrams_by_kid() returns a single number, so build a numeric column
  # directly rather than a list column that needs unnesting.
  mutate(mean_trigram_probability =
           map_dbl(data, get_trigrams_by_kid, coca_trigrams)) %>%
  select(-data) %>%
  # Coerce the key as the other joins against groups_info do, so both sides
  # have the same type.
  left_join(groups_info %>%
              mutate(target_child_id = as.numeric(target_child_id)) %>%
              select(target_child_id, delta_resid_group),
            by = "target_child_id")
# Density of per-child mean trigram probability, one curve per group.
ggplot(trigram_freqs_by_kid,
       aes(x = mean_trigram_probability,
           fill = delta_resid_group,
           group = delta_resid_group)) +
  geom_density(alpha = .4)

# Group mean of the per-child values with an interval from
# multi_boot_standard() (supplies the mean, ci_lower and ci_upper columns).
trigram_freqs_by_kid %>%
  group_by(delta_resid_group) %>%
  multi_boot_standard(col = "mean_trigram_probability",
                      na.rm = TRUE) %>%
  ggplot(aes(x = delta_resid_group, y = mean,
             group = delta_resid_group, fill = delta_resid_group)) +
  geom_col(position = "dodge") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper),
                 position = position_dodge(width = 0.9)) +
  # NOTE(review): this replaces the theme set via theme_set() (base_size = 10)
  # with the default-sized theme_classic() - confirm that is intended.
  theme_classic()
# t1 word types with each child's group information merged in
# (rebuilt here for the log-frequency analysis).
good_types_t1 <- target_types %>%
  filter(tbin == "t1") %>%
  left_join(mutate(groups_info,
                   target_child_id = as.numeric(target_child_id)))
# Mean log trigram frequency over the trigrams one child could produce.
# This redefines get_trigrams_by_kid() to use freq3 instead of prob3.
#
# df:           data frame with a `gloss` column (the child's word types).
# all_trigrams: trigram table with columns w1, w2, w3 and freq3; may be a
#               data.table or any other data frame.
# Returns one number: the mean of log(freq3) (NAs dropped) over the trigrams
# whose three words all occur in df$gloss, i.e. a mean of logs rather than the
# log of a mean. NaN when no trigram qualifies.
get_trigrams_by_kid <- function(df, all_trigrams) {
  known_words <- df$gloss
  # Plain vector indexing instead of data.table's `[` so the function does not
  # depend on the class of the trigram table.
  producible <- all_trigrams$w1 %in% known_words &
    all_trigrams$w2 %in% known_words &
    all_trigrams$w3 %in% known_words
  mean(log(all_trigrams$freq3[producible]), na.rm = TRUE)
}
# One row per child / delta_resid_group combination.
group_ids <- good_types_t1 %>%
  select(target_child_id, delta_resid_group) %>%
  distinct()
# Per-child mean of log trigram frequency at t1, with the child's group
# attached.
trigram_freqs_by_kid <- good_types_t1 %>%
  # One row per child x tbin; the child's remaining columns go into `data`.
  # NOTE(review): legacy nest() call form kept - confirm the tidyr version in
  # use before modernising it.
  nest(-target_child_id, -tbin) %>%
  # get_trigrams_by_kid() returns a single number, so build a numeric column
  # directly rather than a list column that needs unnesting.
  mutate(log_mean_trigram_freq =
           map_dbl(data, get_trigrams_by_kid, coca_trigrams)) %>%
  select(-data) %>%
  # Coerce the key as the other joins against groups_info do, so both sides
  # have the same type.
  left_join(groups_info %>%
              mutate(target_child_id = as.numeric(target_child_id)) %>%
              select(target_child_id, delta_resid_group),
            by = "target_child_id")
# Density of per-child mean log trigram frequency, one curve per group.
ggplot(trigram_freqs_by_kid,
       aes(x = log_mean_trigram_freq,
           fill = delta_resid_group,
           group = delta_resid_group)) +
  geom_density(alpha = .4)

# Group mean of the per-child values with an interval from
# multi_boot_standard() (supplies the mean, ci_lower and ci_upper columns).
trigram_freqs_by_kid %>%
  group_by(delta_resid_group) %>%
  multi_boot_standard(col = "log_mean_trigram_freq",
                      na.rm = TRUE) %>%
  ggplot(aes(x = delta_resid_group, y = mean,
             group = delta_resid_group, fill = delta_resid_group)) +
  geom_col(position = "dodge") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper),
                 position = position_dodge(width = 0.9)) +
  # NOTE(review): this replaces the theme set via theme_set() (base_size = 10)
  # with the default-sized theme_classic() - confirm that is intended.
  theme_classic()