Adult data from: http://www.helsinki.fi/varieng/CoRD/corpora/BROWN/
Sampled adult data represent one sample of each document in the Brown corpus (N = 126) of equal length in words to the Montag corpus.
# Effect sizes for the "adults_full" model output: every non-random word type
# is compared against the "random" baseline fitted with the same parameters.
PATH <- "data/overlap_gensim_sg_bats_adults_full.csv"

# Column names are supplied explicitly, so read_csv() treats the first line of
# the file as data rather than as a header.
d <- read_csv(PATH,
              col_names = c("word_type", "ci_low", "ci_high",
                            "mean", "n_random", "n_closestwords",
                            "n_vectors", "n_threads", "n_iter",
                            "windowsize", "min_count", "model_type", "n"))

# Attach the matching "random" baseline row to each non-random row. The join
# keys are spelled out (they are exactly the columns the two tables share) so
# the pairing is explicit instead of relying on an implicit natural join.
d_raw_es <- d %>%
  filter(word_type != "random") %>%
  left_join(d %>%
              filter(word_type == "random") %>%
              rename(ci_low_random = ci_low,
                     ci_high_random = ci_high,
                     mean_random = mean) %>%
              select(-word_type, -n),
            by = c("n_random", "n_closestwords", "n_vectors", "n_threads",
                   "n_iter", "windowsize", "min_count", "model_type")) %>%
  # Back out standard deviations from the CI half-widths; this assumes the CI
  # columns are 95% normal-approximation intervals (hence the 1.96).
  mutate(sd = ((ci_high - ci_low) / 2) * sqrt(n) / 1.96,
         sd_random = ((ci_high_random - ci_low_random) / 2) *
           sqrt(n_random) / 1.96)

# Row by row, compute the standardized mean difference (d), its p-value and
# its confidence bounds, then re-attach the input columns and flag p < .05.
d_es <- d_raw_es %>%
  rowwise() %>%
  do(compute.es::mes(.$mean, .$mean_random,
                     .$sd, .$sd_random,
                     .$n, .$n_random,
                     verbose = FALSE) %>%  # FALSE, not the reassignable `F`
       select(d, pval.d, l.d, u.d)) %>%
  bind_cols(d_raw_es) %>%
  mutate(sig = ifelse(pval.d < .05, "*", ""),
         source = "adults_full")
# Effect sizes for the "adults_sampled" model output: every non-random word
# type is compared against the "random" baseline fitted with the same
# parameters.
PATH <- "data/overlap_gensim_sg_bats_adults.csv"

# Column names are supplied explicitly, so read_csv() treats the first line of
# the file as data rather than as a header.
d <- read_csv(PATH,
              col_names = c("word_type", "ci_low", "ci_high",
                            "mean", "n_random", "n_closestwords",
                            "n_vectors", "n_threads", "n_iter",
                            "windowsize", "min_count", "model_type", "n"))

# Attach the matching "random" baseline row to each non-random row. The join
# keys are spelled out (they are exactly the columns the two tables share) so
# the pairing is explicit instead of relying on an implicit natural join.
d_raw_es <- d %>%
  filter(word_type != "random") %>%
  left_join(d %>%
              filter(word_type == "random") %>%
              rename(ci_low_random = ci_low,
                     ci_high_random = ci_high,
                     mean_random = mean) %>%
              select(-word_type, -n),
            by = c("n_random", "n_closestwords", "n_vectors", "n_threads",
                   "n_iter", "windowsize", "min_count", "model_type")) %>%
  # Back out standard deviations from the CI half-widths; this assumes the CI
  # columns are 95% normal-approximation intervals (hence the 1.96).
  mutate(sd = ((ci_high - ci_low) / 2) * sqrt(n) / 1.96,
         sd_random = ((ci_high_random - ci_low_random) / 2) *
           sqrt(n_random) / 1.96)

# Row by row, compute the standardized mean difference (d), its p-value and
# its confidence bounds, then re-attach the input columns and flag p < .05.
d_es2 <- d_raw_es %>%
  rowwise() %>%
  do(compute.es::mes(.$mean, .$mean_random,
                     .$sd, .$sd_random,
                     .$n, .$n_random,
                     verbose = FALSE) %>%  # FALSE, not the reassignable `F`
       select(d, pval.d, l.d, u.d)) %>%
  bind_cols(d_raw_es) %>%
  mutate(sig = ifelse(pval.d < .05, "*", ""),
         source = "adults_sampled")
# Effect sizes for the "kids" model output: every non-random word type is
# compared against the "random" baseline fitted with the same parameters.
PATH <- "data/overlap_gensim_sg_bats_kid.csv"

# Column names are supplied explicitly, so read_csv() treats the first line of
# the file as data rather than as a header.
d <- read_csv(PATH,
              col_names = c("word_type", "ci_low", "ci_high",
                            "mean", "n_random", "n_closestwords",
                            "n_vectors", "n_threads", "n_iter",
                            "windowsize", "min_count", "model_type", "n"))

# Attach the matching "random" baseline row to each non-random row. The join
# keys are spelled out (they are exactly the columns the two tables share) so
# the pairing is explicit instead of relying on an implicit natural join.
d_raw_es <- d %>%
  filter(word_type != "random") %>%
  left_join(d %>%
              filter(word_type == "random") %>%
              rename(ci_low_random = ci_low,
                     ci_high_random = ci_high,
                     mean_random = mean) %>%
              select(-word_type, -n),
            by = c("n_random", "n_closestwords", "n_vectors", "n_threads",
                   "n_iter", "windowsize", "min_count", "model_type")) %>%
  # Back out standard deviations from the CI half-widths; this assumes the CI
  # columns are 95% normal-approximation intervals (hence the 1.96).
  mutate(sd = ((ci_high - ci_low) / 2) * sqrt(n) / 1.96,
         sd_random = ((ci_high_random - ci_low_random) / 2) *
           sqrt(n_random) / 1.96)

# Row by row, compute the standardized mean difference (d), its p-value and
# its confidence bounds, then re-attach the input columns and flag p < .05.
d_es3 <- d_raw_es %>%
  rowwise() %>%
  do(compute.es::mes(.$mean, .$mean_random,
                     .$sd, .$sd_random,
                     .$n, .$n_random,
                     verbose = FALSE) %>%  # FALSE, not the reassignable `F`
       select(d, pval.d, l.d, u.d)) %>%
  bind_cols(d_raw_es) %>%
  mutate(sig = ifelse(pval.d < .05, "*", ""),
         source = "kids")
Here are the results from kids full (1 and 2), adults full brown, and adults sampled.
# Effect size (d) against window size, one line per word type; rows of the
# grid are the data sources and columns are the n_closestwords settings.
# Point shape marks significance and the dashed red line marks d = 0.
bind_rows(d_es, d_es3, d_es2) %>%
  # filter(word_type != "synonyms") %>%
  ggplot(mapping = aes(x = windowsize, y = d, group = word_type)) +
  # geom_linerange(aes(ymin = l.d, ymax = u.d, color = word_type), size = .2) +
  geom_line() +
  geom_point(mapping = aes(color = word_type, shape = sig), size = 2) +
  geom_hline(mapping = aes(yintercept = 0), linetype = 2, color = "red") +
  facet_grid(rows = vars(source), cols = vars(n_closestwords)) +
  theme_classic()
Here’s what it looks like when you only look at antonyms and gender pronouns, for adults full and kids:
# Same plot restricted to antonyms and gender pronouns, with the sampled
# adult data dropped. Line type distinguishes the two remaining sources and
# each source-by-word-type combination gets its own line.
bind_rows(d_es, d_es3, d_es2) %>%
  filter(word_type %in% c("gender_pronoun", "antonyms"),
         source != "adults_sampled") %>%
  ggplot(mapping = aes(x = windowsize, y = d,
                       group = interaction(source, word_type))) +
  # geom_linerange(aes(ymin = l.d, ymax = u.d, color = word_type), size = .2) +
  geom_line(mapping = aes(linetype = source)) +
  geom_point(mapping = aes(color = word_type, shape = sig), size = 2) +
  geom_hline(mapping = aes(yintercept = 0), linetype = 2, color = "red") +
  facet_grid(cols = vars(n_closestwords)) +
  theme_classic()
Takeaways: