library(knitr)
opts_chunk$set(echo = T, message = F, warning = F,
error = F, cache = F, tidy = F)
library(tidyverse)
library(langcog)
library(tidyr)
library(broom)
theme_set(theme_classic(base_size = 10))t1_words <- target_types_clean %>%
left_join(groups_info %>%
mutate(target_child_id = as.numeric(target_child_id))) %>%
filter(tbin == "t1") %>%
distinct(tbin, delta_resid_group, gloss)
# get all t1 low/high words
t1_words_low <- filter(t1_words, delta_resid_group == "low")
t1_words_high <- filter(t1_words, delta_resid_group == "high")prop_from_each_group_said_word_t1 <- target_types_clean %>%
filter(tbin == "t1") %>%
count(gloss, delta_resid_group) %>%
group_by(delta_resid_group) %>%
mutate(total = case_when(delta_resid_group == "low" ~
nrow(filter(groups_info, delta_resid_group == "low")),
delta_resid_group == "high"~
nrow(filter(groups_info, delta_resid_group == "high")))) %>%
mutate(prop = n / total) %>%
select(-total, -n) %>%
group_by(delta_resid_group, gloss) %>%
arrange(delta_resid_group, -prop)
diff_in_props_between_groups <- prop_from_each_group_said_word_t1 %>%
spread(delta_resid_group, prop) %>%
mutate(high = ifelse(is.na(high), 0, high),
low = ifelse(is.na(low), 0, low),
diff = high - low) %>%
arrange(-diff) %>%
ungroup()
DT::datatable(diff_in_props_between_groups,
caption = "Proportion of kids in each group that said each word, and the difference in proportions.")concreteness <- read_csv("data/control_variables/brysbaert_corpus.csv")
freq <- read_tsv("data/control_variables/SUBTLEXus_corpus.txt")
babbiness <- read_csv("data/control_variables/babiness_iconicity.csv")
diff_with_predictors <- diff_in_props_between_groups %>%
left_join(concreteness %>% select(Word, Conc.M) %>% distinct(), by = c("gloss" = "Word")) %>%
left_join(freq %>% select(Word, Lg10WF) %>% distinct(), by = c("gloss" = "Word")) %>%
left_join(babbiness %>% select(word, iconicity, babiness) %>% distinct(), by = c("gloss" = "word")) %>%
rename(concreteness = Conc.M,
log_frequency = Lg10WF,
prop_group_diff = diff)
diff_with_predictors_long <- diff_with_predictors %>%
select(-high, -low) %>%
gather(measure, value, -gloss, -prop_group_diff)
ggplot(diff_with_predictors_long, aes(x = value, y = prop_group_diff)) +
geom_point(aes(color = measure), size = .6) +
ggtitle("Diff score correlations with other measures") +
geom_smooth(method = "lm") +
facet_wrap(~measure, scales = "free") +
theme_classic() +
theme(legend.position = "none")diff_with_predictors_long %>%
group_by(measure) %>%
do(tidy(cor.test(.$prop_group_diff, .$value))) %>%
select(measure, estimate, statistic, p.value) %>%
kable()| measure | estimate | statistic | p.value |
|---|---|---|---|
| babiness | -0.0451859 | -0.8364878 | 0.4034646 |
| concreteness | -0.0211532 | -1.1759294 | 0.2397137 |
| iconicity | 0.0490565 | 0.9083079 | 0.3643551 |
| log_frequency | 0.5068416 | 36.0719844 | 0.0000000 |
The units here are invidiual kid’s vocabulary sets at t1.
kid_measures_long <- target_types_clean %>%
filter(tbin == "t1") %>%
left_join(diff_in_props_between_groups %>% select(gloss, diff)) %>%
left_join(concreteness %>% select(Word, Conc.M) %>% distinct(), by = c("gloss" = "Word")) %>%
left_join(freq %>% select(Word, Lg10WF) %>% distinct(), by = c("gloss" = "Word")) %>%
left_join(babbiness %>% select(word, iconicity, babiness) %>% distinct(), by = c("gloss" = "word")) %>%
rename(concreteness = Conc.M,
log_frequency = Lg10WF,
prop_group_diff = diff) %>%
gather("measure", "value", c(-1:-9)) %>%
mutate(measure = as.factor(measure))
kid_ms <- kid_measures_long %>%
group_by(target_child_id, measure, delta_resid_group) %>%
summarize(mean_value = mean(value, na.rm = T))
kid_ms %>%
group_by(delta_resid_group, measure) %>%
multi_boot_standard(col = "mean_value", na.rm = T) %>%
ggplot(aes(x = delta_resid_group, y = mean,
group = delta_resid_group, fill = delta_resid_group)) +
facet_wrap(~measure, scales = "free_y", ncol = 5) +
ylab("predictor") +
xlab("vocab group") +
geom_bar(position = "dodge", stat = "identity") +
geom_linerange(aes(ymin = ci_lower, ymax = ci_upper),
position=position_dodge(width=0.9)) +
theme_classic() +
theme(legend.position = "none")Here’s the same analysis as above but now i’m using the continous MTLD score.
kid_ms_continuous_long <- kid_ms %>%
left_join(groups_info %>% select(target_child_id, delta_resid)) %>%
ungroup()
ggplot(kid_ms_continuous_long, aes(x = mean_value, y = delta_resid)) +
geom_point(aes(color = measure), size = .6) +
ggtitle("MTLD scores predicted by other measures") +
geom_smooth(method = "lm") +
facet_wrap(~measure, scales = "free") +
theme_classic() +
theme(legend.position = "none")kid_ms_continuous_long %>%
group_by(measure) %>%
do(tidy(cor.test(.$delta_resid, .$mean_value))) %>%
select(measure, estimate, statistic, p.value) %>%
kable()| measure | estimate | statistic | p.value |
|---|---|---|---|
| babiness | 0.0589108 | 0.5871744 | 0.5584233 |
| concreteness | 0.1954155 | 1.9825833 | 0.0501853 |
| iconicity | -0.0284973 | -0.2836595 | 0.7772637 |
| log_frequency | -0.1574152 | -1.5860357 | 0.1159203 |
| prop_group_diff | 0.3394595 | 3.5907987 | 0.0005151 |