Corpora

# Relative paths to the three raw text corpora.
MONTAG_PATH <- "../2_w2vec/data/montag12_raw.txt"
BROWN_RANDOM_PATH <- "../2_w2vec/data/random_brown1.txt"
BROWN_FULL_PATH <- "../2_w2vec/data/full_brown_fiction.txt"

# Read a text file and return one row per word token.
#
# path:  file to read line by line.
# label: value written to the `corpus` column for every token.
#
# Returns a data frame with character columns `word` and `corpus`.
read_corpus_tokens <- function(path, label) {
  tokens <- read_lines(path) %>%
    str_split(boundary("word")) %>%
    unlist() %>%
    as.character() # an empty file gives NULL; keep a zero-row `word` column

  # stringsAsFactors = FALSE keeps `word` as character on R < 4.0 as well,
  # so binding the corpora and joining on `word` never goes through factors.
  data.frame(word = tokens, stringsAsFactors = FALSE) %>%
    mutate(corpus = label)
}

montag_corpus <- read_corpus_tokens(MONTAG_PATH, "montag")
brown_r_corpus <- read_corpus_tokens(BROWN_RANDOM_PATH, "brown_random")
brown_f_corpus <- read_corpus_tokens(BROWN_FULL_PATH, "brown_full")

# Long format: one row per token, tagged with the corpus it came from.
corpora <- bind_rows(list(montag_corpus, brown_r_corpus, brown_f_corpus))

Word counts by corpus

# Tally rows per (word, corpus) pair; the tally is stored in column `n`.
word_counts <- count(corpora, word, corpus)

Gender and concreteness norms

# Path to the Glasgow Norms file, hoisted to match the corpus *_PATH constants.
GLASGOW_NORMS_PATH <- "../../data/raw/norms/GlasgowNorms.csv"

# Keep each word together with its GEND_M value, exposed as `maleness`.
# NOTE(review): GEND_M is presumably the mean gender rating -- confirm
# against the norms documentation.
glasglow_norms <- read_csv(GLASGOW_NORMS_PATH) %>%
  select(word, maleness = GEND_M)

# NOTE(review): absolute, machine-specific path -- this only resolves on the
# original author's machine. Kept verbatim; point it at a project-relative
# copy of the file when one is available.
BRYSBAERT_NORMS_PATH <- "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/1_mtld_measure/data/control_variables/brysbaert_corpus.csv"

# Keep each word together with its Conc.M value, exposed as `conc`.
brysbaert_norms <- read_csv(BRYSBAERT_NORMS_PATH) %>%
  select(word = Word, conc = Conc.M)

# NOTE(review): absolute, machine-specific path -- this only resolves on the
# original author's machine. Kept verbatim; point it at a project-relative
# copy of the file when one is available.
SUBTLEX_PATH <- "/Users/mollylewis/Documents/research/Projects/2_published/iterated_RC/corpus/pop_size/data/SUBTLEXus_corpus.txt"

# Keep each word together with its Lg10WF value, exposed as `log_freq`.
freq_norms <- read_tsv(SUBTLEX_PATH) %>%
  select(word = Word, log_freq = Lg10WF)

Join corpora with norms

# Attach the maleness, concreteness and log-frequency norms to every token.
# The join key is spelled out so that a column later added to any norms table
# cannot silently become part of the key, and so the joins run without the
# "Joining, by = ..." message.
tidy_words <- corpora %>%
  left_join(glasglow_norms, by = "word") %>%
  left_join(brysbaert_norms, by = "word") %>%
  left_join(freq_norms, by = "word") %>%
  # Keep only tokens that have a maleness value; conc and log_freq may be NA.
  filter(!is.na(maleness))

Word distributions

Tokens

# Density of maleness over all tokens, one overlaid curve per corpus.
ggplot(tidy_words, aes(x = maleness, fill = corpus)) +
  geom_density(alpha = .3) +
  theme_classic()

# Per-corpus mean of maleness with lower/upper interval bounds.
multi_boot_standard(
  group_by(tidy_words, corpus),
  col = "maleness"
)
## # A tibble: 3 x 4
## # Groups:   corpus [?]
##   corpus       ci_lower ci_upper  mean
##   <chr>           <dbl>    <dbl> <dbl>
## 1 brown_full       3.97     3.98  3.98
## 2 brown_random     3.94     3.97  3.95
## 3 montag           3.82     3.84  3.83

Token Residuals

# One row per distinct word (type), pooled across corpora, with its norms.
# `corpus` is dropped because a type is no longer tied to a single corpus.
type_df <- tidy_words %>%
  distinct(word, .keep_all = TRUE) %>% # TRUE, not T: T can be reassigned
  select(-corpus)

# Linear model of maleness on log frequency and concreteness, fit on types.
mod <- lm(formula = maleness ~ log_freq + conc, data = type_df)

# Residual maleness for each type, keeping only the word and its residual.
type_df_resid <- select(
  modelr::add_residuals(type_df, mod),
  word,
  resid
)

# Copy each type's residual onto every token of that word. The key is given
# explicitly so the join cannot pick up additional shared columns.
tidy_words_with_resid <- tidy_words %>%
  left_join(type_df_resid, by = "word")

# Density of residual maleness over all tokens, one curve per corpus.
ggplot(tidy_words_with_resid, aes(x = resid, fill = corpus)) +
  geom_density(alpha = .3) +
  theme_classic()

Types

# Density of maleness over types: each word is counted once per corpus.
tidy_words %>%
  distinct(word, corpus, .keep_all = TRUE) %>% # TRUE, not T: T can be reassigned
  ggplot(aes(x = maleness, fill = corpus)) +
  geom_density(alpha = .3) +
  theme_classic()

Types with resids

# Density of residual maleness over types: each word is counted once per corpus.
tidy_words_with_resid %>%
  distinct(word, corpus, .keep_all = TRUE) %>% # TRUE, not T: T can be reassigned
  ggplot(aes(x = resid, fill = corpus)) +
  geom_density(alpha = .3) +
  theme_classic()

Overall, the distributions look the same, with kids' books being more feminine overall.