# Locations of the three plain-text corpora being compared.
MONTAG_PATH <- "../2_w2vec/data/montag12_raw.txt"
BROWN_RANDOM_PATH <- "../2_w2vec/data/random_brown1.txt"
BROWN_FULL_PATH <- "../2_w2vec/data/full_brown_fiction.txt"

# Read a text file and return one row per word token.
#
# path:  file to read with read_lines().
# label: value stored in the `corpus` column for every token.
#
# Returns a data frame with a `word` column (tokens produced by splitting
# each line on word boundaries) and a constant `corpus` column.
read_corpus_tokens <- function(path, label) {
  read_lines(path) %>%
    str_split(boundary("word")) %>%
    unlist() %>%
    # Keep `word` as character on every R version so the later joins against
    # the norms tables and bind_rows() never have to coerce factors.
    data.frame(word = ., stringsAsFactors = FALSE) %>%
    mutate(corpus = label)
}

montag_corpus <- read_corpus_tokens(MONTAG_PATH, "montag")
brown_r_corpus <- read_corpus_tokens(BROWN_RANDOM_PATH, "brown_random")
brown_f_corpus <- read_corpus_tokens(BROWN_FULL_PATH, "brown_full")

# Stack the three corpora into one long token table (columns: word, corpus).
corpora <- bind_rows(list(montag_corpus, brown_r_corpus, brown_f_corpus))
Word counts by corpus
# Tally how many times each word occurs within each corpus.
word_counts <- count(corpora, word, corpus)
# Locations of the word-norm tables, named like the corpus path constants.
# NOTE(review): the last two are machine-specific absolute paths, so this
# script only runs where they exist -- consider moving the files into the
# project and using relative paths.
GLASGOW_NORMS_PATH <- "../../data/raw/norms/GlasgowNorms.csv"
BRYSBAERT_NORMS_PATH <- "/Users/mollylewis/Documents/research/Projects/1_in_progress/VOCAB_SEEDS/analyses/1_mtld_measure/data/control_variables/brysbaert_corpus.csv"
SUBTLEX_PATH <- "/Users/mollylewis/Documents/research/Projects/2_published/iterated_RC/corpus/pop_size/data/SUBTLEXus_corpus.txt"

# Each table is reduced to `word` plus a single measure, so `word` is the
# only column shared with the corpus data when the tables are joined.

# Glasgow norms: keep GEND_M, exposed as `maleness`.
glasglow_norms <- read_csv(GLASGOW_NORMS_PATH) %>%
  select(word, maleness = GEND_M)

# Brysbaert norms: keep Conc.M, exposed as `conc`.
brysbaert_norms <- read_csv(BRYSBAERT_NORMS_PATH) %>%
  select(word = Word, conc = Conc.M)

# SUBTLEXus table (tab-separated): keep Lg10WF, exposed as `log_freq`.
freq_norms <- read_tsv(SUBTLEX_PATH) %>%
  select(word = Word, log_freq = Lg10WF)
Join the norms onto the corpus words
# Attach maleness, concreteness and log-frequency to every corpus token.
# The join key is spelled out so the joins cannot silently pick up an extra
# shared column (and so dplyr does not emit "Joining, by = ..." messages).
tidy_words <- corpora %>%
  left_join(glasglow_norms, by = "word") %>%
  left_join(brysbaert_norms, by = "word") %>%
  left_join(freq_norms, by = "word") %>%
  # Keep only tokens that have a maleness value; `conc` and `log_freq`
  # may still be NA for the remaining rows.
  filter(!is.na(maleness))
Tokens
# Token-level density of maleness, one translucent curve per corpus.
ggplot(tidy_words, aes(x = maleness, fill = corpus)) +
  geom_density(alpha = .3) +
  theme_classic()

# Per-corpus summary of maleness (mean with lower/upper confidence bounds)
# computed by multi_boot_standard() on the corpus-grouped tokens.
multi_boot_standard(group_by(tidy_words, corpus), col = "maleness")
## # A tibble: 3 x 4
## # Groups: corpus [?]
## corpus ci_lower ci_upper mean
## <chr> <dbl> <dbl> <dbl>
## 1 brown_full 3.97 3.98 3.98
## 2 brown_random 3.94 3.97 3.95
## 3 montag 3.82 3.84 3.83
Token Residuals
# One row per distinct word (first occurrence kept) with the corpus label
# dropped, so each word enters the regression exactly once.
type_df <- tidy_words %>%
  distinct(word, .keep_all = TRUE) %>%
  select(-corpus)

# Model maleness from log frequency and concreteness at the type level.
mod <- lm(maleness ~ log_freq + conc, data = type_df)

# Per-word residual of maleness after accounting for the two predictors.
type_df_resid <- type_df %>%
  modelr::add_residuals(mod) %>%
  select(word, resid)

# Copy each word's residual back onto every token of that word. `word` is
# unique in type_df_resid, so this join cannot duplicate token rows.
tidy_words_with_resid <- tidy_words %>%
  left_join(type_df_resid, by = "word")

# Token-level density of the residuals, one translucent curve per corpus.
tidy_words_with_resid %>%
  ggplot(aes(x = resid, fill = corpus)) +
  geom_density(alpha = .3) +
  theme_classic()
Types
# Type-level density of maleness: each word counts once per corpus.
tidy_words %>%
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  distinct(word, corpus, .keep_all = TRUE) %>%
  ggplot(aes(x = maleness, fill = corpus)) +
  geom_density(alpha = .3) +
  theme_classic()
Types with resids
# Type-level density of the residuals: each word counts once per corpus.
tidy_words_with_resid %>%
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  distinct(word, corpus, .keep_all = TRUE) %>%
  ggplot(aes(x = resid, fill = corpus)) +
  geom_density(alpha = .3) +
  theme_classic()
Overall, the distributions look similar in shape, with kids' books being more feminine.