library(knitr)

# Global knitr chunk options. Spelled with TRUE/FALSE rather than T/F,
# because T and F are ordinary variables that can be reassigned.
opts_chunk$set(
  echo = TRUE, message = FALSE, warning = FALSE,
  error = FALSE, cache = FALSE, tidy = FALSE
)

library(tidyverse)
library(janitor)

# Default ggplot2 theme for the document (individual plots may override it).
theme_set(theme_classic(base_size = 20))

We want a set of words that:

Note that CDI only goes up to 30 months.

Using childes words

Of the words that appear in CHILDES, here are the frequent words that do not appear in Wordbank and have low hypernym values.

# Wordbank table; its `uni_lemma` column is used later as the word key.
wordbank_hypernyms <- read_csv("data/wordbank_hypernyms.csv")

# CHILDES table (provides the `word` and `hypernyms` columns used below).
# The CSV has an unnamed leading column (presumably saved row names --
# TODO confirm). readr calls it `X1` before version 2.0 and `...1` from 2.0
# on, so drop whichever is present instead of hard-coding one name.
childes_hypernyms <- read_csv("data/childes_hypernyms.csv") %>%
  select(-any_of(c("X1", "...1")))


# Word-frequency table read from the SUBTLEXus corpus file. After the column
# names are cleaned, only `word` and `lg10wf` are kept, the latter under the
# name `log_word_freq`.
# NOTE(review): absolute, machine-specific path -- will not resolve on other
# machines.
word_freq <- clean_names(
  read_tsv("/Users/mollylewis/Documents/research/Projects/2_published/iterated_RC/corpus/pop_size/data/SUBTLEXus_corpus.txt")
) %>%
  select(word, log_word_freq = lg10wf)


# Candidate pool: CHILDES words that are not Wordbank lemmas and that have a
# frequency entry in `word_freq`.
all_words <- childes_hypernyms %>%
  # Exclude anything already covered by Wordbank.
  filter(!(word %in% wordbank_hypernyms$uni_lemma)) %>%
  # Join on the word explicitly rather than on whatever column names the two
  # tables happen to share.
  left_join(word_freq, by = "word") %>%
  # Drop words with no frequency information.
  filter(!is.na(log_word_freq))


# Selection thresholds: keep words whose log frequency is above FREQ_CUTOFF
# and whose hypernym value is below HYPER_CUTOFF.
FREQ_CUTOFF <- 3
HYPER_CUTOFF <- 8

# Target words: frequent, low-hypernym candidates, sorted by hypernym value.
targ_words <- all_words %>%
  filter(
    log_word_freq > FREQ_CUTOFF,
    hypernyms < HYPER_CUTOFF
  ) %>%
  data.frame() %>%
  arrange(hypernyms)

# Interactive table of the selected words.
DT::datatable(targ_words)

Cluster

# t-SNE coordinates and cluster assignments, read from a precomputed file.
# NOTE(review): "temp.csv" is not created anywhere in the visible code --
# confirm where it comes from.
tsne_dimsF <- read_csv("temp.csv")

# Draw each word as a text label at its t-SNE position, coloured by cluster;
# axes and legend are hidden.
ggplot(tsne_dimsF) +
  geom_text(
    aes(
      x = tsne_X,
      y = tsne_Y,
      color = as.factor(cluster_wiki),
      label = word
    ),
    size = 1.5
  ) +
  theme_void() +
  theme(legend.position = "none")

Using adult AoA norms

According to the Kuperman AoA norms, the mean AoA of words in Wordbank is about 4. Below are words that have an AoA rating between 4.5 and 5.5 and are relatively frequent. Note that we don’t have hypernym scores for these words, so they represent all hypernym levels (whereas the ones above are only low-hypernym).

# Kuperman et al. age-of-acquisition ratings; only the word and its mean
# rating are kept.
# The directory is "Mobile Documents" with a plain space: a shell-style
# backslash escape ("\ ") is not a valid escape inside an R string and makes
# the file fail to parse.
# NOTE(review): absolute, machine-specific path -- will not resolve on other
# machines.
aoa_norms <- read_csv(
  file.path(
    "/Users/mollylewis/Library/Mobile Documents/com~apple~CloudDocs",
    "Documents/research/Projects/next_kids/stimuli_selection",
    "AoA_ratings_Kuperman_et_al_BRM.csv"
  )
) %>%
  clean_names() %>%
  select(word, rating_mean)

# Density of mean AoA ratings for the Wordbank words. The right join keeps
# every Wordbank row, matching `uni_lemma` against the norms' `word`.
ggplot(
  right_join(aoa_norms, wordbank_hypernyms, by = c("word" = "uni_lemma")),
  aes(x = rating_mean)
) +
  geom_density() +
  theme_minimal()

# Words with a mean AoA rating strictly between 4.5 and 5.5 and a log word
# frequency above 3.
adult_aoa_targ_words <- aoa_norms %>%
  filter(rating_mean > 4.5 & rating_mean < 5.5) %>%
  # `word` is the only column the two tables share; name it explicitly so the
  # join does not depend on implicit column matching.
  left_join(word_freq, by = "word") %>%
  # Rows with no frequency entry (NA) are dropped by this comparison as well.
  filter(log_word_freq > 3)

# Interactive table of the selected words.
DT::datatable(adult_aoa_targ_words)

Cluster

# t-SNE coordinates and cluster assignments, read from a precomputed file;
# this reassigns `tsne_dimsF`.
# NOTE(review): "temp2.csv" is not created anywhere in the visible code --
# confirm where it comes from.
tsne_dimsF <- read_csv("temp2.csv")

# Draw each word as a text label at its t-SNE position, coloured by cluster;
# axes and legend are hidden.
ggplot(tsne_dimsF) +
  geom_text(
    aes(
      x = tsne_X,
      y = tsne_Y,
      color = as.factor(cluster_wiki),
      label = word
    ),
    size = 1.5
  ) +
  theme_void() +
  theme(legend.position = "none")

```