library(knitr)

# Global knitr chunk defaults for the whole document: echo code, suppress
# messages and warnings in the output, do not continue past errors, cache
# chunk results, and leave code formatting untouched.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned, whereas TRUE and FALSE are reserved words.
opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE,
               error = FALSE, cache = TRUE, tidy = FALSE)

library(tidyverse)
library(langcog)
library(feather)

# Make the classic ggplot2 theme (base font size 10) the default for all plots.
ggplot2::theme_set(ggplot2::theme_classic(base_size = 10))
# Item key lookup table. num_item_id is converted to character so that it can
# be joined against the identically converted id column of item_data.
item_key <- mutate(
  read_csv("data/item_key.csv"),
  num_item_id = as.character(num_item_id)
)

# Item-level metadata: keep only the item id and its category.
# Columns are picked by name alone; a preceding positional selection of
# columns 1 and 4 would be redundant and would make the load depend on the
# column order of the CSV.
item_data <- read_csv("data/item_data.csv") %>%
  select(num_item_id, category) %>%
  mutate(num_item_id = as.character(num_item_id))

# Hypernym value per uni_lemma, annotated with the item id (via item_key) and
# the category (via item_data). Character columns end up as factors.
word_bank_hyper <- local({
  # Raw hypernym values, without the "feet" entry.
  hypernym_values <- read_csv("data/wordbank_hypernyms.csv") %>%
    filter(uni_lemma != "feet") %>%
    select(uni_lemma, hypernyms)

  # uni_lemma -> num_item_id lookup.
  id_lookup <- select(item_key, uni_lemma, num_item_id)

  hypernym_values %>%
    left_join(id_lookup, by = "uni_lemma") %>%
    left_join(item_data, by = "num_item_id") %>%
    select(num_item_id, uni_lemma, category, hypernyms) %>%
    mutate_if(is.character, as.factor)
})
# Standardize (center and scale) the hypernym values within each category.
# scale() returns a one-column matrix, which mutate() would store as a matrix
# column; as.numeric() keeps the same values as a plain numeric vector so the
# column behaves like any other numeric column downstream.
word_bank_hyper_norm <- word_bank_hyper %>%
  group_by(category) %>%
  mutate(hypernyms_scaled_cat = as.numeric(scale(hypernyms))) %>%
  mutate_if(is.character, as.factor) %>%
  ungroup()
# Location of the SUBTLEX-US frequency list with part-of-speech information.
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs where the file exists at exactly this location.
POS <- "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/3_kid_vocabs/data/SUBTLEX-US\ frequency\ list\ with\ PoS\ information\ text\ version.txt"

# Keep the word and its part-of-speech column (presumably the dominant PoS,
# judging by the column name), renaming both while selecting.
pos_data <- select(read_tsv(POS), word = Word, pos_dom = Dom_PoS_SUBTLEX)

# Standardize the hypernym values within coarse part-of-speech classes.
#
# Steps:
#   1. item_clean: the uni_lemma text before the first "(", trimmed, so that
#      entries with a parenthesized suffix can be matched against pos_data.
#      Done with one vectorized str_extract() instead of splitting row by row
#      under rowwise().
#   2. Look up the part of speech for item_clean in pos_data.
#   3. Collapse it to "n" (Noun), "v" (Verb) or "o" (anything else, including
#      words with no match). as.factor() on the whole column yields levels in
#      sorted order.
#   4. Center and scale hypernyms within each pos_cat; as.numeric() turns the
#      one-column matrix returned by scale() into a plain numeric vector.
word_bank_hyper_norm_pos <- word_bank_hyper %>%
  mutate(
    item_clean = str_trim(str_extract(as.character(uni_lemma), "^[^(]*"))
  ) %>%
  left_join(pos_data, by = c("item_clean" = "word")) %>%
  mutate(pos_cat = case_when(pos_dom == "Noun" ~ "n",
                             pos_dom == "Verb" ~ "v",
                             TRUE ~ "o"),
         pos_cat = as.factor(pos_cat)) %>%
  group_by(pos_cat) %>%
  mutate(hypernyms_scaled_pos = as.numeric(scale(hypernyms))) %>%
  mutate_if(is.character, as.factor) %>%
  ungroup()
# Combine the category-scaled and part-of-speech-scaled hypernym values into
# one table, sorted by uni_lemma, with numeric columns rounded to two decimals
# for display.
all_hypers <- word_bank_hyper_norm %>%
  left_join(
    word_bank_hyper_norm_pos %>%
      as.data.frame() %>%
      select(num_item_id, hypernyms_scaled_pos),
    # Explicit key rather than an implicit natural join on shared columns.
    by = "num_item_id"
  ) %>%
  select(uni_lemma, category, hypernyms,
         hypernyms_scaled_cat, hypernyms_scaled_pos) %>%
  arrange(uni_lemma) %>%
  mutate_if(is.numeric, round, 2)

# Show the combined table as an interactive DT table.
DT::datatable(all_hypers)