library(knitr)
opts_chunk$set(echo = T, message = F, warning = F,
error = F, cache = F, tidy = F)
library(tidyverse)
library(langcog)
library(tidyr)
library(broom)
theme_set(theme_classic(base_size = 10))
t1_words <- target_types %>%
left_join(groups_info %>%
mutate(target_child_id = as.numeric(target_child_id))) %>%
filter(tbin == "t1") %>%
distinct(tbin, delta_resid_group, gloss)
# get all t1 low/high words
t1_words_low <- filter(t1_words, delta_resid_group == "low")
t1_words_high <- filter(t1_words, delta_resid_group == "high")
prop_from_each_group_said_word_t1 <- target_types %>%
filter(tbin == "t1") %>%
count(gloss, delta_resid_group) %>%
group_by(delta_resid_group) %>%
mutate(total = case_when(delta_resid_group == "low" ~
nrow(filter(groups_info, delta_resid_group == "low")),
delta_resid_group == "high"~
nrow(filter(groups_info, delta_resid_group == "high")))) %>%
mutate(prop = n / total) %>%
select(-total, -n) %>%
group_by(delta_resid_group, gloss) %>%
arrange(delta_resid_group, -prop)
diff_in_props_between_groups <- prop_from_each_group_said_word_t1 %>%
spread(delta_resid_group, prop) %>%
mutate(high = ifelse(is.na(high), 0, high),
low = ifelse(is.na(low), 0, low),
diff = high - low) %>%
arrange(-diff) %>%
ungroup()
DT::datatable(diff_in_props_between_groups,
caption = "Proportion of kids in each group that said each word, and the difference in proportions.")
concreteness <- read_csv("data/control_variables/brysbaert_corpus.csv")
freq <- read_tsv("data/control_variables/SUBTLEXus_corpus.txt")
babbiness <- read_csv("data/control_variables/babiness_iconicity.csv")
diff_with_predictors <- diff_in_props_between_groups %>%
left_join(concreteness %>% select(Word, Conc.M) %>% distinct(), by = c("gloss" = "Word")) %>%
left_join(freq %>% select(Word, Lg10WF) %>% distinct(), by = c("gloss" = "Word")) %>%
left_join(babbiness %>% select(word, iconicity, babiness) %>% distinct(), by = c("gloss" = "word")) %>%
rename(concreteness = Conc.M,
log_frequency = Lg10WF,
prop_group_diff = diff)
diff_with_predictors_long <- diff_with_predictors %>%
select(-high, -low) %>%
gather(measure, value, -gloss, -prop_group_diff)
ggplot(diff_with_predictors_long, aes(x = value, y = prop_group_diff)) +
geom_point(aes(color = measure), size = .6) +
ggtitle("Diff score correlations with other measures") +
geom_smooth(method = "lm") +
facet_wrap(~measure, scales = "free") +
theme_classic() +
theme(legend.position = "none")
diff_with_predictors_long %>%
group_by(measure) %>%
do(tidy(cor.test(.$prop_group_diff, .$value))) %>%
select(measure, estimate, statistic, p.value) %>%
kable()
measure | estimate | statistic | p.value |
---|---|---|---|
babiness | -0.0718031 | -1.271577 | 0.2044705 |
concreteness | -0.0647499 | -2.454545 | 0.0142247 |
iconicity | -0.0291726 | -0.515510 | 0.6065618 |
log_frequency | 0.2952014 | 12.172032 | 0.0000000 |
The units here are invidiual kid’s vocabulary sets at t1.
kid_measures_long <- target_types %>%
filter(tbin == "t1") %>%
left_join(diff_in_props_between_groups %>% select(gloss, diff)) %>%
left_join(concreteness %>% select(Word, Conc.M) %>% distinct(), by = c("gloss" = "Word")) %>%
left_join(freq %>% select(Word, Lg10WF) %>% distinct(), by = c("gloss" = "Word")) %>%
left_join(babbiness %>% select(word, iconicity, babiness) %>% distinct(), by = c("gloss" = "word")) %>%
rename(concreteness = Conc.M,
log_frequency = Lg10WF,
prop_group_diff = diff) %>%
gather("measure", "value", c(-1:-10)) %>%
mutate(measure = as.factor(measure))
kid_ms <- kid_measures_long %>%
group_by(target_child_id, measure, delta_resid_group) %>%
summarize(mean_value = mean(value, na.rm = T))
kid_ms %>%
group_by(delta_resid_group, measure) %>%
multi_boot_standard(col = "mean_value", na.rm = T) %>%
ggplot(aes(x = delta_resid_group, y = mean,
group = delta_resid_group, fill = delta_resid_group)) +
facet_wrap(~measure, scales = "free_y", ncol = 5) +
ylab("predictor") +
xlab("vocab group") +
geom_bar(position = "dodge", stat = "identity") +
geom_linerange(aes(ymin = ci_lower, ymax = ci_upper),
position=position_dodge(width=0.9)) +
theme_classic() +
theme(legend.position = "none")
Here’s the same analysis as above but now i’m using the continous MTLD score.
kid_ms_continuous_long <- kid_ms %>%
left_join(groups_info %>% select(target_child_id, delta_resid)) %>%
ungroup()
ggplot(kid_ms_continuous_long, aes(x = mean_value, y = delta_resid)) +
geom_point(aes(color = measure), size = .6) +
ggtitle("MTLD scores predicted by other measures") +
geom_smooth(method = "lm") +
facet_wrap(~measure, scales = "free") +
theme_classic() +
theme(legend.position = "none")
kid_ms_continuous_long %>%
group_by(measure) %>%
do(tidy(cor.test(.$delta_resid, .$mean_value))) %>%
select(measure, estimate, statistic, p.value) %>%
kable()
measure | estimate | statistic | p.value |
---|---|---|---|
babiness | 0.0304915 | 0.2828981 | 0.7779348 |
concreteness | 0.1457034 | 1.3736903 | 0.1730672 |
iconicity | -0.1819339 | -1.7158213 | 0.0897953 |
log_frequency | -0.2197489 | -2.1250509 | 0.0363538 |
prop_group_diff | 0.0818177 | 0.7788019 | 0.4381393 |