library(knitr)
# Global knitr chunk defaults for the report.
# Spelled with TRUE/FALSE rather than T/F: T and F are ordinary variables
# that can be reassigned, whereas TRUE and FALSE are reserved words.
opts_chunk$set(
  echo = TRUE, message = FALSE, warning = FALSE,
  error = FALSE, cache = TRUE, tidy = FALSE
)
library(tidyverse)
library(langcog)
library(feather)
# Make the classic ggplot2 theme (base font size 10) the default for plots.
theme_set(theme_classic(base_size = 10))
# Item key table read from disk; `num_item_id` is stored as character so it
# can be used as a join key against other tables that hold character ids.
item_key <- mutate(
  read_csv("data/item_key.csv"),
  num_item_id = as.character(num_item_id)
)
# Item metadata reduced to the item id and its `category` column.
# Columns are picked by name rather than by position, so a change in the
# CSV's column order cannot break the selection or pick the wrong columns.
# The id is converted to character so it can serve as a join key.
item_data <- read_csv("data/item_data.csv") %>%
  select(num_item_id, category) %>%
  mutate(num_item_id = as.character(num_item_id))
# `hypernyms` value for each uni_lemma, annotated with item id and category.
# The "feet" row is dropped; lemmas are matched to ids via `item_key`, and
# ids to categories via `item_data`. Character columns end up as factors.
word_bank_hyper <- read_csv("data/wordbank_hypernyms.csv") %>%
  select(uni_lemma, hypernyms) %>%
  filter(uni_lemma != "feet") %>%
  left_join(select(item_key, uni_lemma, num_item_id), by = "uni_lemma") %>%
  left_join(item_data, by = "num_item_id") %>%
  select(num_item_id, uni_lemma, category, hypernyms) %>%
  mutate_if(is.character, as.factor) %>%
  ungroup()
# `hypernyms` standardised (centred and scaled) within each category.
# scale() returns a one-column matrix carrying centring/scaling attributes;
# as.numeric() flattens it so `hypernyms_scaled_cat` is an ordinary numeric
# vector instead of a matrix column.
word_bank_hyper_norm <- word_bank_hyper %>%
  group_by(category) %>%
  mutate(hypernyms_scaled_cat = as.numeric(scale(hypernyms))) %>%
  mutate_if(is.character, as.factor) %>%
  ungroup()
# Path to the SUBTLEX-US frequency list with part-of-speech information.
# NOTE(review): this is an absolute, machine-specific path, unlike the
# relative "data/..." paths used elsewhere in this file — confirm whether it
# should be relative. The `\ ` escapes appear intended as plain spaces.
POS <- "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/SUBTLEX-US\ frequency\ list\ with\ PoS\ information\ text\ version.txt"

# Word -> dominant part of speech, with the two columns renamed on selection
# to `word` and `pos_dom`.
pos_data <- read_tsv(POS) %>%
  select(word = Word, pos_dom = Dom_PoS_SUBTLEX)
# `hypernyms` standardised within coarse part-of-speech classes.
#
# Steps:
#   1. `item_clean`: the lemma text before the first "(", trimmed, used as
#      the lookup key into `pos_data`.
#   2. `pos_cat`: "n" for Noun, "v" for Verb, "o" for everything else
#      (including lemmas with no match in `pos_data`).
#   3. `hypernyms_scaled_pos`: `hypernyms` centred and scaled per `pos_cat`.
#
# The lemma is split with the vectorised str_split_fixed() instead of a
# rowwise() + str_split() combination: rowwise grouping would otherwise
# persist through the join and the case_when()/as.factor() step, building the
# factor one row at a time. as.factor() on the whole column gives `pos_cat`
# the sorted set of levels present in the data.
# scale() returns a one-column matrix; as.numeric() flattens it to a plain
# numeric vector.
word_bank_hyper_norm_pos <- word_bank_hyper %>%
  mutate(
    item_clean = str_trim(
      str_split_fixed(as.character(uni_lemma), "\\(", 2)[, 1]
    )
  ) %>%
  left_join(pos_data, by = c("item_clean" = "word")) %>%
  mutate(
    pos_cat = case_when(
      pos_dom == "Noun" ~ "n",
      pos_dom == "Verb" ~ "v",
      TRUE ~ "o"
    ),
    pos_cat = as.factor(pos_cat)
  ) %>%
  group_by(pos_cat) %>%
  mutate(hypernyms_scaled_pos = as.numeric(scale(hypernyms))) %>%
  mutate_if(is.character, as.factor) %>%
  ungroup()
# Combined table: raw `hypernyms` plus the within-category and
# within-part-of-speech standardised values, one row per joined item,
# sorted by lemma with numeric columns rounded to 2 decimals.
# The join key is stated explicitly (`num_item_id` is the only column the two
# sides share) so the join does not depend on natural-join column matching.
# NOTE(review): rows whose num_item_id is NA would match each other in this
# join — confirm every lemma has an id in item_key.
all_hypers <- word_bank_hyper_norm %>%
  left_join(
    word_bank_hyper_norm_pos %>%
      as.data.frame() %>%
      select(num_item_id, hypernyms_scaled_pos),
    by = "num_item_id"
  ) %>%
  select(
    uni_lemma, category, hypernyms,
    hypernyms_scaled_cat, hypernyms_scaled_pos
  ) %>%
  arrange(uni_lemma) %>%
  mutate_if(is.numeric, round, 2)

# Render the combined table as an interactive HTML widget.
DT::datatable(all_hypers)