# Gender norms ----
# Load the human gender ratings and reduce them to one rating per normalized
# word. The result, `gender_norms`, has the columns `word` and
# `human_gender_rating`.
GENDER_NORMS <- here("data/processed/words/gender_ratings_mean.csv")
gender_words <- read_csv(GENDER_NORMS)
gender_norms <- gender_words %>%
  mutate(
    # Keep only the text before the first space of each entry.
    word = map_chr(str_split(word, " "), 1L),
    word = tolower(word),
    word = str_remove_all(word, "[:punct:]")
  ) %>%
  # NOTE(review): distinct() keeps only the first row per normalized word, so
  # the mean() below is always taken over a single rating. If duplicate
  # entries are meant to be averaged, this step should go -- confirm intent.
  distinct(word, .keep_all = TRUE) %>%
  group_by(word) %>%
  summarize(human_gender_rating = mean(mean, na.rm = TRUE))
# Word-embedding models ----
# The path is written with a plain space: the backslash-space escape used
# previously is not one of the escapes documented in ?Quotes.
MODELPATH <- "/Volumes/My Passport/coca_kidbook_models/"
# NOTE(review): the two models are picked by their position in the
# alphabetically sorted list.files() result -- confirm that the first file is
# the kid-book model and the second one the COCA model.
model_files <- list.files(MODELPATH, full.names = TRUE)
kidbook_path <- model_files[1]
coca_path <- model_files[2]

# Read one embedding file. The first line of the file is skipped, the first
# column is named `word` and lower-cased, and the remaining 300 columns are
# named V2..V301.
read_embedding_model <- function(path) {
  fread(
    path,
    header = FALSE,
    skip = 1,
    quote = "",
    encoding = "UTF-8",
    data.table = TRUE,
    col.names = c("word", paste0("V", 2:301))
  ) %>%
    mutate(word = tolower(word))
}

kid_model <- read_embedding_model(kidbook_path)
coca_model <- read_embedding_model(coca_path)

# Words that occur in both models.
common_words <- intersect(kid_model$word, coca_model$word)
Below are the correlations between human ratings and each of the 300 principal components (only components whose correlation is significant at p < .05 are listed).
# PCA on the kid-book embeddings, restricted to the words shared with the
# COCA model.
kid_model_filtered <- kid_model %>%
  filter(word %in% common_words)
kids_pca <- prcomp(kid_model_filtered[, 2:301], center = TRUE, scale. = TRUE)

# Correlate each principal component's scores with the human gender ratings
# and list the components whose correlation has p < .05, largest estimate
# first.
# NOTE(review): this runs one test per component at an uncorrected .05
# threshold, so some hits are expected by chance -- consider a
# multiple-comparison correction.
data.frame(word = kid_model_filtered$word,
           pca = kids_pca$x) %>%
  # Explicit join key; words without a human rating are dropped next.
  left_join(gender_norms, by = "word") %>%
  filter(!is.na(human_gender_rating)) %>%
  gather("component", "value", -human_gender_rating, -word) %>%
  group_by(component) %>%
  # One row per component, holding that component's scores and the ratings.
  nest() %>%
  mutate(
    temp = map(data, ~ tidy(cor.test(.$value, .$human_gender_rating)))
  ) %>%
  select(-data) %>%
  unnest(temp) %>%
  ungroup() %>%
  arrange(-estimate) %>%
  mutate(sig = p.value < .05) %>%
  select(component, estimate, p.value, sig) %>%
  filter(sig) %>%
  data.frame()
## component estimate p.value sig
## 1 pca.PC5 0.19621051 2.129033e-07 TRUE
## 2 pca.PC19 0.12507364 1.010441e-03 TRUE
## 3 pca.PC24 0.12074491 1.508901e-03 TRUE
## 4 pca.PC8 0.11464544 2.598845e-03 TRUE
## 5 pca.PC26 0.09506935 1.260410e-02 TRUE
## 6 pca.PC133 0.09493517 1.273028e-02 TRUE
## 7 pca.PC109 0.08919504 1.928502e-02 TRUE
## 8 pca.PC80 0.08694083 2.257156e-02 TRUE
## 9 pca.PC254 0.08277561 2.993400e-02 TRUE
## 10 pca.PC15 0.08243384 3.062062e-02 TRUE
## 11 pca.PC258 0.08139840 3.278380e-02 TRUE
## 12 pca.PC93 0.08084269 3.399769e-02 TRUE
## 13 pca.PC16 0.07717430 4.301125e-02 TRUE
## 14 pca.PC59 -0.07633130 4.534594e-02 TRUE
## 15 pca.PC162 -0.07643436 4.505493e-02 TRUE
## 16 pca.PC47 -0.07669918 4.431432e-02 TRUE
## 17 pca.PC102 -0.07762900 4.179460e-02 TRUE
## 18 pca.PC69 -0.07910219 3.805059e-02 TRUE
## 19 pca.PC114 -0.08018154 3.549163e-02 TRUE
## 20 pca.PC32 -0.08041206 3.496453e-02 TRUE
## 21 pca.PC127 -0.08768732 2.143311e-02 TRUE
## 22 pca.PC175 -0.09936517 9.106187e-03 TRUE
## 23 pca.PC12 -0.10522508 5.732854e-03 TRUE
## 24 pca.PC4 -0.11457565 2.614685e-03 TRUE
## 25 pca.PC20 -0.12027392 1.574993e-03 TRUE
## 26 pca.PC68 -0.12668357 8.676397e-04 TRUE
## 27 pca.PC103 -0.16857075 8.773706e-06 TRUE
# PCA on the COCA embeddings, restricted to the words shared with the
# kid-book model.
coca_model_filtered <- coca_model %>%
  filter(word %in% common_words)
coca_pca <- prcomp(coca_model_filtered[, 2:301], center = TRUE, scale. = TRUE)

# Correlate each principal component's scores with the human gender ratings
# and list the components whose correlation has p < .05, largest estimate
# first.
# NOTE(review): this runs one test per component at an uncorrected .05
# threshold, so some hits are expected by chance -- consider a
# multiple-comparison correction.
data.frame(word = coca_model_filtered$word,
           pca = coca_pca$x) %>%
  # Explicit join key; words without a human rating are dropped next.
  left_join(gender_norms, by = "word") %>%
  filter(!is.na(human_gender_rating)) %>%
  gather("component", "value", -human_gender_rating, -word) %>%
  group_by(component) %>%
  # One row per component, holding that component's scores and the ratings.
  nest() %>%
  mutate(
    temp = map(data, ~ tidy(cor.test(.$value, .$human_gender_rating)))
  ) %>%
  select(-data) %>%
  unnest(temp) %>%
  ungroup() %>%
  arrange(-estimate) %>%
  mutate(sig = p.value < .05) %>%
  select(component, estimate, p.value, sig) %>%
  filter(sig) %>%
  data.frame()
## component estimate p.value sig
## 1 pca.PC159 0.13667230 3.240855e-04 TRUE
## 2 pca.PC17 0.13172252 5.324629e-04 TRUE
## 3 pca.PC47 0.12687928 8.516154e-04 TRUE
## 4 pca.PC3 0.11851930 1.845311e-03 TRUE
## 5 pca.PC188 0.09969201 8.879393e-03 TRUE
## 6 pca.PC216 0.09162190 1.622086e-02 TRUE
## 7 pca.PC5 0.09060714 1.744592e-02 TRUE
## 8 pca.PC91 0.09044326 1.765118e-02 TRUE
## 9 pca.PC95 0.08886294 1.974134e-02 TRUE
## 10 pca.PC165 0.08716606 2.222268e-02 TRUE
## 11 pca.PC136 0.08624531 2.367911e-02 TRUE
## 12 pca.PC80 0.08380350 2.794799e-02 TRUE
## 13 pca.PC195 0.08160136 3.234980e-02 TRUE
## 14 pca.PC169 0.07733680 4.257308e-02 TRUE
## 15 pca.PC15 -0.07676635 4.412810e-02 TRUE
## 16 pca.PC2 -0.07698951 4.351415e-02 TRUE
## 17 pca.PC34 -0.07715576 4.306150e-02 TRUE
## 18 pca.PC103 -0.07739398 4.241976e-02 TRUE
## 19 pca.PC99 -0.07806757 4.064878e-02 TRUE
## 20 pca.PC237 -0.07831487 4.001448e-02 TRUE
## 21 pca.PC7 -0.08612160 2.388096e-02 TRUE
## 22 pca.PC104 -0.13478591 3.923717e-04 TRUE
## 23 pca.PC25 -0.14748504 1.032970e-04 TRUE
## 24 pca.PC4 -0.17837762 2.500332e-06 TRUE