Children’s book corpus

Adult data from: http://www.helsinki.fi/varieng/CoRD/corpora/BROWN/

Sampled adult data represents one sample from each document in the Brown corpus (N = 126), equal in length (in words) to the Montag corpus.

# Effect sizes for the full adult corpus ("adults_full").
# Reads the overlap summary file, pairs each non-random word type with the
# "random" baseline row that shares the same model settings, and computes a
# standardized mean difference (d) against that baseline for every row.
PATH  <- "data/overlap_gensim_sg_bats_adults_full.csv"
# Column names are supplied explicitly, so read_csv treats the first line of
# the file as data rather than as a header.
d <- read_csv(PATH,
              col_names = c("word_type", "ci_low", "ci_high",
                            "mean", "n_random", "n_closestwords",
                            "n_vectors", "n_threads", "n_iter",
                            "windowsize", "min_count", "model_type", "n"))
d_raw_es <- d %>%
  filter(word_type != "random") %>%
  left_join(d %>%
              filter(word_type == "random") %>%
              rename(ci_low_random = ci_low,
                     ci_high_random = ci_high,
                     mean_random = mean) %>%
              select(-word_type, -n),
            # Keys spelled out: these are exactly the columns the two sides
            # share, i.e. what the implicit natural join matched on.
            by = c("n_random", "n_closestwords", "n_vectors", "n_threads",
                   "n_iter", "windowsize", "min_count", "model_type")) %>%
  # Back out a standard deviation from the CI half-width, i.e. invert
  # half_width = 1.96 * sd / sqrt(n).
  # NOTE(review): presumes ci_low/ci_high are 95% normal-theory intervals and
  # that n_random is the sample size behind the random row -- confirm.
  mutate(sd = ((ci_high - ci_low) / 2) * sqrt(n) / 1.96,
         sd_random = ((ci_high_random - ci_low_random) / 2) *
           sqrt(n_random) / 1.96)

# One mes() call per row; keep only d, its p-value and its interval bounds,
# then re-attach the input columns (row order is unchanged, so bind_cols
# lines up with d_raw_es).
d_es <- d_raw_es %>%
  rowwise() %>%
  do(compute.es::mes(.$mean, .$mean_random,
                     .$sd, .$sd_random,
                     .$n, .$n_random,
                     verbose = FALSE) %>%
       select(d, pval.d, l.d, u.d)) %>%
  bind_cols(d_raw_es) %>%
  mutate(sig = ifelse(pval.d < .05, "*", ""),  # flag p < .05
         source = "adults_full")

# Effect sizes for the sampled adult corpus ("adults_sampled").
# Reads the overlap summary file, pairs each non-random word type with the
# "random" baseline row that shares the same model settings, and computes a
# standardized mean difference (d) against that baseline for every row.
PATH  <- "data/overlap_gensim_sg_bats_adults.csv"
# Column names are supplied explicitly, so read_csv treats the first line of
# the file as data rather than as a header.
d <- read_csv(PATH,
              col_names = c("word_type", "ci_low", "ci_high",
                            "mean", "n_random", "n_closestwords",
                            "n_vectors", "n_threads", "n_iter",
                            "windowsize", "min_count", "model_type", "n"))
d_raw_es <- d %>%
  filter(word_type != "random") %>%
  left_join(d %>%
              filter(word_type == "random") %>%
              rename(ci_low_random = ci_low,
                     ci_high_random = ci_high,
                     mean_random = mean) %>%
              select(-word_type, -n),
            # Keys spelled out: these are exactly the columns the two sides
            # share, i.e. what the implicit natural join matched on.
            by = c("n_random", "n_closestwords", "n_vectors", "n_threads",
                   "n_iter", "windowsize", "min_count", "model_type")) %>%
  # Back out a standard deviation from the CI half-width, i.e. invert
  # half_width = 1.96 * sd / sqrt(n).
  # NOTE(review): presumes ci_low/ci_high are 95% normal-theory intervals and
  # that n_random is the sample size behind the random row -- confirm.
  mutate(sd = ((ci_high - ci_low) / 2) * sqrt(n) / 1.96,
         sd_random = ((ci_high_random - ci_low_random) / 2) *
           sqrt(n_random) / 1.96)

# One mes() call per row; keep only d, its p-value and its interval bounds,
# then re-attach the input columns (row order is unchanged, so bind_cols
# lines up with d_raw_es).
d_es2 <- d_raw_es %>%
  rowwise() %>%
  do(compute.es::mes(.$mean, .$mean_random,
                     .$sd, .$sd_random,
                     .$n, .$n_random,
                     verbose = FALSE) %>%
       select(d, pval.d, l.d, u.d)) %>%
  bind_cols(d_raw_es) %>%
  mutate(sig = ifelse(pval.d < .05, "*", ""),  # flag p < .05
         source = "adults_sampled")

# Effect sizes for the children's book corpus ("kids").
# Reads the overlap summary file, pairs each non-random word type with the
# "random" baseline row that shares the same model settings, and computes a
# standardized mean difference (d) against that baseline for every row.
PATH  <- "data/overlap_gensim_sg_bats_kid.csv"
# Column names are supplied explicitly, so read_csv treats the first line of
# the file as data rather than as a header.
d <- read_csv(PATH,
              col_names = c("word_type", "ci_low", "ci_high",
                            "mean", "n_random", "n_closestwords",
                            "n_vectors", "n_threads", "n_iter",
                            "windowsize", "min_count", "model_type", "n"))
d_raw_es <- d %>%
  filter(word_type != "random") %>%
  left_join(d %>%
              filter(word_type == "random") %>%
              rename(ci_low_random = ci_low,
                     ci_high_random = ci_high,
                     mean_random = mean) %>%
              select(-word_type, -n),
            # Keys spelled out: these are exactly the columns the two sides
            # share, i.e. what the implicit natural join matched on.
            by = c("n_random", "n_closestwords", "n_vectors", "n_threads",
                   "n_iter", "windowsize", "min_count", "model_type")) %>%
  # Back out a standard deviation from the CI half-width, i.e. invert
  # half_width = 1.96 * sd / sqrt(n).
  # NOTE(review): presumes ci_low/ci_high are 95% normal-theory intervals and
  # that n_random is the sample size behind the random row -- confirm.
  mutate(sd = ((ci_high - ci_low) / 2) * sqrt(n) / 1.96,
         sd_random = ((ci_high_random - ci_low_random) / 2) *
           sqrt(n_random) / 1.96)

# One mes() call per row; keep only d, its p-value and its interval bounds,
# then re-attach the input columns (row order is unchanged, so bind_cols
# lines up with d_raw_es).
d_es3 <- d_raw_es %>%
  rowwise() %>%
  do(compute.es::mes(.$mean, .$mean_random,
                     .$sd, .$sd_random,
                     .$n, .$n_random,
                     verbose = FALSE) %>%
       select(d, pval.d, l.d, u.d)) %>%
  bind_cols(d_raw_es) %>%
  mutate(sig = ifelse(pval.d < .05, "*", ""),  # flag p < .05
         source = "kids")

Here are the results from kids full (1 and 2), adults full (Brown), and adults sampled.

  # Effect size (d) by window size for every word type: one facet row per
  # corpus source, one facet column per n_closestwords value.
  bind_rows(d_es, d_es3, d_es2) %>%
    #filter(word_type != "synonyms") %>%
    ggplot(aes(x = windowsize, y = d, group = word_type)) +
    #geom_linerange(aes(ymin = l.d, ymax = u.d, color = word_type), size = .2) +
    geom_line() +
    # Point shape encodes the significance flag ("*" vs "")
    geom_point(aes(shape = sig, color = word_type), size = 2) +
    facet_grid(source ~ n_closestwords) +
    # Zero-effect reference line. Passed as a fixed parameter rather than
    # through aes(), so a single line is drawn per panel instead of one
    # overplotted line per data row.
    geom_hline(yintercept = 0, color = "red", linetype = 2) +
    theme_classic()

Here’s what it looks like when you only look at antonyms and pronouns, for adults full and kids:

  # Effect size (d) by window size, restricted to gender pronouns and
  # antonyms and excluding the sampled adult corpus. Line type distinguishes
  # the remaining sources; one facet column per n_closestwords value.
  bind_rows(d_es, d_es3, d_es2) %>%
    filter(word_type %in% c("gender_pronoun", "antonyms"),
           source != "adults_sampled") %>%
    ggplot(aes(x = windowsize, y = d,
               group = interaction(source, word_type))) +
    #geom_linerange(aes(ymin = l.d, ymax = u.d, color = word_type), size = .2) +
    geom_line(aes(linetype = source)) +
    # Point shape encodes the significance flag ("*" vs "")
    geom_point(aes(shape = sig, color = word_type), size = 2) +
    facet_grid(. ~ n_closestwords) +
    # Zero-effect reference line. Passed as a fixed parameter rather than
    # through aes(), so a single line is drawn per panel instead of one
    # overplotted line per data row.
    geom_hline(yintercept = 0, color = "red", linetype = 2) +
    theme_classic()

Take aways:

adult sampled looks weird, probably need to take more samples and average
need better list of synonyms (they aren’t consistent across corpora)
in general this is a problem - not all corpora have the exact same set of words
error bars on effect sizes overestimate uncertainty

Children’s book corpus

Effect size word overlap analysis (gensim)

Molly Lewis

2018-06-27