library(knitr)
opts_chunk$set(echo = T, message = F, warning = F,
error = F, cache = F, tidy = F)
library(tidyverse)
library(langcog)
library(feather)
theme_set(theme_classic(base_size = 20))The goal of this analysis is to test the hypothesis that children learn words that are subordinates to known words at the previous timepoint. To do this, I analyzed the vocabulary for each kid at each timepoint using word embeddings trained on English Wikipedia.
Here the baseline is sample of all words that kid could have known at current hypernym level (previously: sample of all other words that kids actually knows).
“Inclusive” means the x-axis refers to words the hypernym level of the anchor words known at the previous timepoint and all smaller hypernym values.
“Exclusive” means means the x-axis refers to the exact hypernym level of the anchor words at the previous timestamp.
ITEM_KEY <- "data/item_key.csv"
item_key <- read_csv(ITEM_KEY)
HYP <- "data/wordbank_hypernyms.csv"
hypernyms <- read_csv(HYP) %>% # foot
filter(uni_lemma != "feet") %>%
left_join(item_key %>% select(uni_lemma, num_item_id),
by = "uni_lemma")
hyp_counts <- count(hypernyms, hypernyms)These plots show Actual - Random as a function of hypernym level at previous timepoint.
mean_all_incl <- read_csv("data/mean_dists_all_words_incl.csv") %>%
mutate(type = "mean_all_incl")
mean_all_excl <- read_csv("data/mean_dists_all_words_excl.csv") %>%
mutate(type = "mean_all_excl")
all_measures <- bind_rows(list(mean_all_incl, mean_all_excl))group_means <- all_measures %>%
mutate(diff = actual - random) %>%
group_by(type, num_hyper, child_id) %>%
summarize(diff = mean(diff, na.rm = T),
actual = mean(actual, na.rm = T),
random = mean(random, na.rm = T)) %>%
gather("measure", "value", -1:-3) %>%
group_by(type, num_hyper, measure) %>%
multi_boot_standard(col = "value", na.rm = T)
group_means %>%
filter(measure == "diff") %>%
filter(num_hyper %in% 1:14) %>%
left_join(hyp_counts, by = c("num_hyper" = "hypernyms")) %>%
ggplot(aes(x = num_hyper, y = mean)) +
geom_line(color = "black") +
xlab("Hypernym Level") +
facet_wrap(.~ type) +
geom_hline(yintercept = 0, linetype = 2) +
ylab("Actual - Random") +
geom_linerange(aes(ymin = ci_lower, ymax = ci_upper), size= .4) +
geom_point(size = 2, aes(color = n)) +
scale_color_continuous(low = "grey", high = "red") +
scale_x_continuous(breaks = 1:14) +
theme_classic()group_means %>%
filter(measure != "diff") %>%
filter(num_hyper %in% 1:14) %>%
left_join(hyp_counts, by = c("num_hyper" = "hypernyms")) %>%
ggplot(aes(x = num_hyper, y = mean, group = measure)) +
geom_line(color = "black", aes(linetype = measure)) +
xlab("Hypernym Level") +
facet_wrap(.~ type) +
geom_hline(yintercept = 0, linetype = 2) +
ylab("cosine") +
geom_linerange(aes(ymin = ci_lower, ymax = ci_upper), size= .4) +
geom_point(size = 2, aes(color = n)) +
scale_color_continuous(low = "grey", high = "red") +
scale_x_continuous(breaks = 1:14) +
theme_classic()group_means %>%
filter(measure != "diff" & type == "mean_all_incl") %>%
filter(num_hyper %in% 1:14) %>%
#filter(num_hyper == 7) %>%
#data.frame()
left_join(hyp_counts, by = c("num_hyper" = "hypernyms")) %>%
group_by(measure) %>%
multi_boot_standard(col = "mean") %>%
ggplot(aes(x = measure, y = mean)) +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
ylab("Cosine similarity of newly learned words to \npreviously-known words with lower hypernym values")This analysis asks how many words a kid learned that are close to the hypernyms at the previous timepoint. These plots actual - random, averaged across all the anchor hypernyms.
n_know <- read_csv("data/n_know_close_10_incl.csv") %>%
mutate(diff = actual - random,
type = "inclusive") %>%
select(type, child_id, session_num, num_hyper, random, actual, diff)
n_know2 <- read_csv("data/n_know_close_10_excl.csv") %>%
mutate(diff = actual - random,
type = "exclusive") %>%
select(type, child_id, session_num, num_hyper, random, actual, diff)
mean_n_know <- n_know %>%
bind_rows(n_know2) %>%
group_by(num_hyper, child_id, type) %>%
summarize(diff = mean(diff)) %>%
group_by(num_hyper, type) %>%
multi_boot_standard(col = "diff", na.rm = T)
mean_n_know %>%
left_join(hyp_counts, by = c("num_hyper" = "hypernyms")) %>%
ggplot(aes(x = num_hyper, y = mean)) +
geom_line() +
facet_wrap(.~ type) +
xlab("Hypernym Level") +
geom_hline(yintercept = 0, linetype = 2) +
ylab("Actual - Random") +
geom_linerange(aes(ymin = ci_lower, ymax = ci_upper), size= .4) +
geom_point(size = 2, aes(color = n)) +
scale_color_continuous(low = "grey", high = "red") +
scale_x_continuous(breaks = 1:14) +
theme_classic()