library(knitr)
# knitr chunk defaults for the whole report. Spelled with TRUE/FALSE rather
# than T/F, because T and F are ordinary variables that can be reassigned.
opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE,
               error = FALSE, cache = FALSE, tidy = FALSE)
library(tidyverse)
library(langcog)
library(data.table)
library(feather)
library(broom)
theme_set(theme_classic(base_size = 10))
Do kids that know more words at T2 (using the MTLD measure) know words that are more “useful” at T1? In particular, we define useful words to be words that allow you to make more of the trigrams found in adult (child-directed) speech from CHILDES. Because kids in the high group have bigger vocabularies, we sample n words per group (here, 500).
It turns out that the pattern of effects depends a lot on the threshold you set for knowing a word. The results shown here are with the threshold set at 5 occurrences (but they look different if it’s set to 1).
FINDINGS:
Min occurrences of word in vocab = 1:
Min occurrences of word in vocab = 5:
# Minimum number of occurrences (per child and time bin) for a word to count
# as known by that child.
MINCOUNT <- 5
# Number of words kept per group.
TOPNWORDS_PER_GROUP <- 500

# Get all unique words for two groups at t1.
groups_info <- read_csv("../1_mtld_measure/data/groups_info_600_900.csv")

# Word counts per child, time bin and gloss, keeping only words that reach
# MINCOUNT. Glosses are lower-cased BEFORE aggregating so that case variants
# of a word are pooled: this keeps (target_child_id, tbin, gloss) unique and
# applies the MINCOUNT threshold to the pooled count.
target_types <- read_csv("../1_mtld_measure/data/target_types_for_MTLD_kids_600_900.csv") %>%
  mutate(gloss = tolower(gloss)) %>%
  group_by(target_child_id, tbin, gloss) %>%
  summarize(count = sum(count)) %>%
  ungroup() %>%
  filter(count >= MINCOUNT)

# For each group, the TOPNWORDS_PER_GROUP glosses at t1 with the highest n,
# where n is the number of target_types rows (one per child) for that gloss.
t1_words <- target_types %>%
  filter(tbin == "t1") %>% # t1 only
  left_join(groups_info %>%
              mutate(target_child_id = as.numeric(target_child_id))) %>% # merge in group info
  group_by(delta_resid_group, gloss) %>% # get glosses by group
  summarize(n = n()) %>% # get gloss counts by group
  group_by(delta_resid_group) %>%
  # Sort by count BEFORE slicing. Slicing the unsorted rows (which are in
  # gloss order after summarize) would keep the alphabetically-first words
  # among everything tied at or above the cutoff instead of the top-n words.
  arrange(delta_resid_group, desc(n)) %>%
  slice(seq_len(TOPNWORDS_PER_GROUP))

# Trigram table (read from the adult CHILDES trigram file), converted to a
# data.table so it can be row-filtered with DT[i] syntax further down.
childes_trigrams <- read_csv("data/trigrams/adult_childes_trigrams_turns.csv") %>%
  data.table()
# Plain vectors of the glosses selected for each group at t1.
t1_words_high <- ungroup(t1_words) %>%
  filter(delta_resid_group == "high") %>%
  pull(gloss)
t1_words_low <- ungroup(t1_words) %>%
  filter(delta_resid_group == "low") %>%
  pull(gloss)

# Trigrams whose three words (w1, w2, w3) all belong to a group's word list,
# tagged with the group they were built from.
high_trigrams <- mutate(
  childes_trigrams[w1 %in% t1_words_high &
                     w2 %in% t1_words_high &
                     w3 %in% t1_words_high],
  vocab_group = "high"
)
low_trigrams <- mutate(
  childes_trigrams[w1 %in% t1_words_low &
                     w2 %in% t1_words_low &
                     w3 %in% t1_words_low],
  vocab_group = "low"
)

# Stack both groups and tabulate how many trigrams each group can form.
all_trigrams <- bind_rows(high_trigrams, low_trigrams)
all_trigrams %>%
  count(vocab_group) %>%
  kable()
| vocab_group | n |
|---|---|
| high | 100231 |
| low | 25762 |
# Bootstrapped mean and CI of log trigram frequency for each vocab group.
log_freq_by_group <- all_trigrams %>%
  mutate(log_freq = log(freq)) %>%
  group_by(vocab_group) %>%
  multi_boot_standard(col = "log_freq", na.rm = TRUE)

# Bar = group mean, line range = CI bounds returned by multi_boot_standard.
ggplot(log_freq_by_group,
       aes(x = vocab_group, y = mean, group = vocab_group, fill = vocab_group)) +
  geom_bar(position = "dodge", stat = "identity") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper),
                 position = position_dodge(width = 0.9)) +
  theme_classic()
#summarize(mean_freq= mean(log_freq),
# ci_lower_freq = mean(log_freq) - (1.96 * sd(log_freq)),
# ci_upper_freq = mean(log_freq) + (1.96 * sd(log_freq)))

# Same summary on the raw (un-logged) trigram frequency, shown as a table.
all_trigrams %>%
  group_by(vocab_group) %>%
  multi_boot_standard(col = "freq", na.rm = TRUE) %>%
  kable()
| vocab_group | ci_lower | ci_upper | mean |
|---|---|---|---|
| high | 4.023067 | 4.362018 | 4.16881 |
| low | 3.726431 | 4.612536 | 4.09417 |
# Word-level log10 frequency (Lg10WF) from the SUBTLEXus file.
freq <- read_tsv("../1_mtld_measure/data/control_variables/SUBTLEXus_corpus.txt") %>%
  select(Word, Lg10WF)

# Three copies of the lower-cased lookup (one per trigram position), so each
# of w1/w2/w3 can be joined to its own log-frequency column.
freq_tidy <- freq %>%
  transmute(w1 = tolower(Word),
            log_freq1 = Lg10WF,
            w2 = w1,
            log_freq2 = log_freq1,
            w3 = w1,
            log_freq3 = log_freq1)

# Attach the three word frequencies to every trigram and summarise them per
# row. Words missing from the frequency table contribute NA and are ignored
# by the row-wise mean/max (na.rm = TRUE).
all_trigrams_with_freq <- all_trigrams %>%
  left_join(select(freq_tidy, w1, log_freq1)) %>%
  left_join(select(freq_tidy, w2, log_freq2)) %>%
  left_join(select(freq_tidy, w3, log_freq3)) %>%
  rowwise() %>%
  mutate(log_trigram_freq = log(freq),
         mean_word_freq = mean(c(log_freq1, log_freq2, log_freq3),
                               na.rm = TRUE),
         max_word_freq = max(c(log_freq1, log_freq2, log_freq3),
                             na.rm = TRUE)) %>%
  select(vocab_group, log_trigram_freq, max_word_freq, mean_word_freq)

# Group difference in log trigram frequency, controlling for mean word frequency.
summary(lm(log_trigram_freq ~ vocab_group + mean_word_freq,
           data = all_trigrams_with_freq))
##
## Call:
## lm(formula = log_trigram_freq ~ vocab_group + mean_word_freq,
## data = all_trigrams_with_freq)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.5250 -0.5693 -0.2550 0.3161 7.3738
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.941802 0.035122 -112.23 < 2e-16 ***
## vocab_grouplow -0.039697 0.005960 -6.66 2.74e-11 ***
## mean_word_freq 0.866465 0.006744 128.48 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.853 on 125950 degrees of freedom
## (40 observations deleted due to missingness)
## Multiple R-squared: 0.116, Adjusted R-squared: 0.116
## F-statistic: 8266 on 2 and 125950 DF, p-value: < 2.2e-16
# Same model with the maximum word frequency as the control; rows whose
# max_word_freq is not finite are dropped before fitting.
summary(lm(log_trigram_freq ~ vocab_group + max_word_freq,
           data = all_trigrams_with_freq %>% filter(is.finite(max_word_freq))))
##
## Call:
## lm(formula = log_trigram_freq ~ vocab_group + max_word_freq,
## data = all_trigrams_with_freq %>% filter(is.finite(max_word_freq)))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8122 -0.5972 -0.3792 0.3139 7.7071
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.043505 0.037827 -54.023 < 2e-16 ***
## vocab_grouplow -0.026020 0.006222 -4.182 2.89e-05 ***
## max_word_freq 0.452620 0.006565 68.949 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8905 on 125950 degrees of freedom
## Multiple R-squared: 0.03654, Adjusted R-squared: 0.03652
## F-statistic: 2388 on 2 and 125950 DF, p-value: < 2.2e-16
Kids in the high group produce higher-frequency trigrams, controlling for the mean frequency (or the max frequency) of the words in the trigrams.
# t1 rows of target_types with group info attached, restricted to glosses
# that appear in t1_words (for either group).
good_types_t1 <- target_types %>%
  left_join(mutate(groups_info,
                   target_child_id = as.numeric(target_child_id))) %>%
  filter(tbin == "t1",
         gloss %in% t1_words$gloss)
# Summarise one child's vocabulary by the trigrams it can form.
#
# Args:
#   df:           one child's words; must contain a `gloss` column.
#   all_trigrams: data.table of trigrams with columns w1, w2, w3 and freq.
#   measure:      "num"  -> log of the number of trigrams whose three words are
#                           all in the child's vocabulary;
#                 "freq" -> mean of log(freq) over those trigrams.
#   freq:         word-frequency table with columns Word and Lg10WF.
#
# Returns:
#   A list of four numbers: the trigram measure (named log_trigram_num or
#   mean_trigram_freq) followed by mean_word_freq, median_word_freq and
#   max_word_freq of the child's words. Stops on an unknown `measure`.
get_trigrams_by_kid <- function(df, all_trigrams, measure, freq){
  # Keep trigrams whose three words are all in this child's vocabulary.
  current_trigrams <- all_trigrams[w1 %in% df$gloss &
                                     w2 %in% df$gloss &
                                     w3 %in% df$gloss]

  # NOTE(review): the callers below pass `freq` with Word in its original
  # case while glosses are lower-cased, so capitalised entries would not
  # match -- confirm this is intended.
  vocab_freqs <- df %>%
    left_join(freq, by = c("gloss" = "Word"))
  word_freqs <- vocab_freqs$Lg10WF

  word_freq_stats <- list(
    mean_word_freq = mean(word_freqs, na.rm = TRUE),
    median_word_freq = median(word_freqs, na.rm = TRUE),
    # This used to be computed with median(), which made max_word_freq a
    # copy of median_word_freq. Return NA (not -Inf plus a warning) when no
    # word has a frequency.
    max_word_freq = if (all(is.na(word_freqs))) {
      NA_real_
    } else {
      max(word_freqs, na.rm = TRUE)
    }
  )

  if (measure == "num") {
    c(list(log_trigram_num = log(nrow(current_trigrams))),
      word_freq_stats)
  } else if (measure == "freq") {
    c(list(mean_trigram_freq = mean(log(current_trigrams$freq))),
      word_freq_stats)
  } else {
    stop("`measure` must be \"num\" or \"freq\".", call. = FALSE)
  }
}

# One row per child: log number of trigrams plus word-frequency summaries.
# The names in kid_measure_names are matched to the values returned by
# get_trigrams_by_kid() by position, so they must stay in the same order as
# the list built in that function.
trigram_num_by_kid <- good_types_t1 %>%
  nest(-target_child_id, -tbin) %>%
  mutate(kid_measures =
           map(data, get_trigrams_by_kid,
               childes_trigrams, "num", freq),
         kid_measure_names = list(c("log_num_trigrams", # this is bad
                                    "mean_word_freq",
                                    "median_word_freq",
                                    "max_word_freq"))) %>%
  select(-data) %>%
  unnest() %>%
  mutate(kid_measures = unlist(kid_measures)) %>%
  left_join(groups_info %>% select(target_child_id, delta_resid_group)) %>%
  spread(kid_measure_names, kid_measures)
# Distribution of the per-child log trigram count, by group (density).
trigram_num_by_kid %>%
  ggplot(aes(x = log_num_trigrams,
             fill = delta_resid_group,
             group = delta_resid_group)) +
  geom_density(alpha = .4) +
  theme_classic()

# Same distribution as a histogram.
trigram_num_by_kid %>%
  ggplot(aes(x = log_num_trigrams,
             fill = delta_resid_group)) +
  geom_histogram() +
  theme_classic()

# Bootstrapped group means with CIs.
log_num_trigrams_by_group <- trigram_num_by_kid %>%
  group_by(delta_resid_group) %>%
  multi_boot_standard(col = "log_num_trigrams", na.rm = TRUE)
ggplot(log_num_trigrams_by_group,
       aes(x = delta_resid_group, y = mean,
           group = delta_resid_group, fill = delta_resid_group)) +
  geom_bar(position = "dodge", stat = "identity") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper),
                 position = position_dodge(width = 0.9)) +
  theme_classic()

# Group difference in log trigram count, controlling for mean word frequency;
# children with a non-finite log count are dropped before fitting.
summary(lm(log_num_trigrams ~ delta_resid_group + mean_word_freq,
           data = trigram_num_by_kid %>% filter(is.finite(log_num_trigrams))))
##
## Call:
## lm(formula = log_num_trigrams ~ delta_resid_group + mean_word_freq,
## data = trigram_num_by_kid %>% filter(is.finite(log_num_trigrams)))
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.5340 -1.7205 0.6778 2.1102 5.7103
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.1982 2.6749 6.056 5.34e-08 ***
## delta_resid_grouplow -2.6669 0.6923 -3.852 0.000247 ***
## mean_word_freq -2.0165 0.5970 -3.378 0.001168 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.979 on 74 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.3053, Adjusted R-squared: 0.2865
## F-statistic: 16.26 on 2 and 74 DF, p-value: 1.403e-06
# Same model with the median word frequency as the control.
summary(lm(log_num_trigrams ~ delta_resid_group + median_word_freq,
           data = trigram_num_by_kid %>% filter(is.finite(log_num_trigrams))))
##
## Call:
## lm(formula = log_num_trigrams ~ delta_resid_group + median_word_freq,
## data = trigram_num_by_kid %>% filter(is.finite(log_num_trigrams)))
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.5658 -2.0368 0.4806 2.0022 5.5137
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.3034 2.2911 6.243 2.44e-08 ***
## delta_resid_grouplow -2.6411 0.7028 -3.758 0.000339 ***
## median_word_freq -1.5834 0.5070 -3.123 0.002552 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.009 on 74 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.2916, Adjusted R-squared: 0.2724
## F-statistic: 15.23 on 2 and 74 DF, p-value: 2.892e-06
# Column names for the four values returned by get_trigrams_by_kid(); they
# are paired with those values by position.
freq_measure_names <- c("log_mean_trigram_frequency", # this is bad
                        "mean_word_freq",
                        "median_word_freq",
                        "max_word_freq")

# One row per child: trigram-frequency measure plus word-frequency summaries.
trigram_freqs_by_kid <- good_types_t1 %>%
  nest(-target_child_id, -tbin) %>%
  mutate(kid_measures = map(data, get_trigrams_by_kid,
                            all_trigrams = childes_trigrams,
                            measure = "freq",
                            freq = freq),
         kid_measure_names = list(freq_measure_names)) %>%
  select(-data) %>%
  unnest() %>%
  mutate(kid_measures = unlist(kid_measures)) %>%
  left_join(groups_info %>% select(target_child_id, delta_resid_group)) %>%
  spread(kid_measure_names, kid_measures)

# Distribution of the per-child trigram-frequency measure, by group.
trigram_freqs_by_kid %>%
  ggplot(aes(x = log_mean_trigram_frequency,
             fill = delta_resid_group,
             group = delta_resid_group)) +
  geom_density(alpha = .4) +
  theme_classic()

# Bootstrapped group means with CIs.
trigram_freq_by_group <- trigram_freqs_by_kid %>%
  group_by(delta_resid_group) %>%
  multi_boot_standard(col = "log_mean_trigram_frequency", na.rm = TRUE)
ggplot(trigram_freq_by_group,
       aes(x = delta_resid_group, y = mean,
           group = delta_resid_group, fill = delta_resid_group)) +
  geom_bar(position = "dodge", stat = "identity") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper),
                 position = position_dodge(width = 0.9)) +
  theme_classic()

# Group difference in the trigram-frequency measure, controlling for mean
# word frequency.
summary(lm(log_mean_trigram_frequency ~ delta_resid_group + mean_word_freq,
           data = trigram_freqs_by_kid))
##
## Call:
## lm(formula = log_mean_trigram_frequency ~ delta_resid_group +
## mean_word_freq, data = trigram_freqs_by_kid)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.3618 -0.4136 -0.1613 0.1153 3.1254
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.4181 0.7778 -3.109 0.00267 **
## delta_resid_grouplow 0.5240 0.2013 2.603 0.01116 *
## mean_word_freq 0.7703 0.1736 4.437 3.12e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8664 on 74 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.3029, Adjusted R-squared: 0.284
## F-statistic: 16.08 on 2 and 74 DF, p-value: 1.593e-06
# Same model with the median word frequency as the control.
summary(lm(log_mean_trigram_frequency ~ delta_resid_group + median_word_freq,
           data = trigram_freqs_by_kid))
##
## Call:
## lm(formula = log_mean_trigram_frequency ~ delta_resid_group +
## median_word_freq, data = trigram_freqs_by_kid)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.3375 -0.4262 -0.1738 0.1008 3.1990
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.4487 0.6840 -2.118 0.03752 *
## delta_resid_grouplow 0.5309 0.2098 2.531 0.01351 *
## median_word_freq 0.5493 0.1513 3.629 0.00052 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8982 on 74 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.2508, Adjusted R-squared: 0.2305
## F-statistic: 12.38 on 2 and 74 DF, p-value: 2.296e-05