PCA model bias

Kid book model
Coca model

# gender norms
# CSV of human gender ratings; the code below uses its `word` and `mean`
# columns.
GENDER_NORMS <- here("data/processed/words/gender_ratings_mean.csv")

gender_words <- read_csv(GENDER_NORMS)

# Reduce every rated item to a single normalized key so it can be joined
# against the model vocabularies: keep only the text before the first space,
# lowercase it, and strip punctuation.
gender_norms <- gender_words %>%
  mutate(word = map_chr(word, ~unlist(str_split(.x, " "))[[1]]),
         word = tolower(word),
         word = str_remove_all(word, '[:punct:]')) %>%
  # NOTE(review): distinct() keeps only the first row for each normalized
  # word, so every group below has exactly one row and mean() returns that
  # row's value unchanged. If items that collapse to the same key were meant
  # to be averaged, drop this distinct() -- confirm intent before changing,
  # since it would alter the reported correlations.
  distinct(word, .keep_all = TRUE) %>%
  group_by(word) %>%
  summarize(human_gender_rating  = mean(mean, na.rm = TRUE))

# Directory containing the trained model files. Spaces in a path need no
# escaping inside an R string ("\ " is not a valid escape sequence and fails
# to parse).
MODELPATH <- "/Volumes/My Passport/coca_kidbook_models/"

# List the directory once and fail early with a clear message if the external
# volume is not mounted or the files are missing.
model_files <- list.files(MODELPATH, full.names = TRUE)
if (length(model_files) < 2) {
  stop("Expected at least two model files in '", MODELPATH,
       "' but found ", length(model_files), ".", call. = FALSE)
}

# list.files() returns paths in alphabetical order; the first entry is used as
# the kid book model and the second as the COCA model.
# NOTE(review): this depends on the file names sorting in that order and on no
# other files being present -- confirm against the directory contents.
kidbook_path <- model_files[1]
coca_path <- model_files[2]


# Read one model file into a data.table with a lowercased `word` column
# followed by the columns V2..V301.
#
# path: location of the model file to read.
# Returns the table produced by fread(), with `word` lowercased.
read_embedding_model <- function(path) {
  fread(
    path,
    header = FALSE,
    # Skip the first line of the file.
    # NOTE(review): presumably a "<n_words> <n_dims>" header line -- confirm.
    skip = 1,
    # Disable quote handling so words containing quote characters are read
    # literally.
    quote = "",
    encoding = "UTF-8",
    data.table = TRUE,
    col.names = c("word", paste0("V", 2:301))) %>%
    # NOTE(review): lowercasing can make two distinct entries (e.g. differing
    # only in case) share the same `word`; nothing here de-duplicates them.
    mutate(word = tolower(word))
}

kid_model <- read_embedding_model(kidbook_path)

coca_model <- read_embedding_model(coca_path)

# Vocabulary shared by both models; the later analyses are restricted to it.
common_words <- intersect(kid_model$word, coca_model$word)

Kid book model

Below are the correlations between human gender ratings and each of the 300 principal components; only components whose correlation is significant at p < .05 are listed.

# Restrict the kid book model to the vocabulary shared with the COCA model.
kid_model_filtered <- kid_model %>%
  filter(word %in% common_words)

# PCA over columns 2..301 (everything except `word`), centered and scaled.
kids_pca <- prcomp(kid_model_filtered[, 2:301], center = TRUE, scale. = TRUE)

# Component scores for every word that has a human gender rating.
# data.frame() expands the score matrix into columns named pca.PC1, pca.PC2, ...
kid_scores <- data.frame(word = kid_model_filtered$word,
                         pca = kids_pca$x) %>%
  left_join(gender_norms, by = "word") %>%
  filter(!is.na(human_gender_rating))

kid_components <- setdiff(names(kid_scores), c("word", "human_gender_rating"))

# Correlate each component with the human ratings, one cor.test() per
# component. This replaces the former group_by() %>% nest(-word) %>% unnest()
# chain, which depends on the deprecated pre-1.0 tidyr interface.
# NOTE(review): the p-values are not corrected for multiple comparisons across
# components.
map(kid_components, function(component) {
  test <- cor.test(kid_scores[[component]], kid_scores$human_gender_rating)
  data.frame(component = component,
             estimate = unname(test$estimate),
             p.value = test$p.value,
             stringsAsFactors = FALSE)
}) %>%
  bind_rows() %>%
  arrange(-estimate) %>%
  mutate(sig = p.value < .05) %>%
  select(component, estimate, p.value, sig) %>%
  filter(sig) %>%
  data.frame()

##    component    estimate      p.value  sig
## 1    pca.PC5  0.19621051 2.129033e-07 TRUE
## 2   pca.PC19  0.12507364 1.010441e-03 TRUE
## 3   pca.PC24  0.12074491 1.508901e-03 TRUE
## 4    pca.PC8  0.11464544 2.598845e-03 TRUE
## 5   pca.PC26  0.09506935 1.260410e-02 TRUE
## 6  pca.PC133  0.09493517 1.273028e-02 TRUE
## 7  pca.PC109  0.08919504 1.928502e-02 TRUE
## 8   pca.PC80  0.08694083 2.257156e-02 TRUE
## 9  pca.PC254  0.08277561 2.993400e-02 TRUE
## 10  pca.PC15  0.08243384 3.062062e-02 TRUE
## 11 pca.PC258  0.08139840 3.278380e-02 TRUE
## 12  pca.PC93  0.08084269 3.399769e-02 TRUE
## 13  pca.PC16  0.07717430 4.301125e-02 TRUE
## 14  pca.PC59 -0.07633130 4.534594e-02 TRUE
## 15 pca.PC162 -0.07643436 4.505493e-02 TRUE
## 16  pca.PC47 -0.07669918 4.431432e-02 TRUE
## 17 pca.PC102 -0.07762900 4.179460e-02 TRUE
## 18  pca.PC69 -0.07910219 3.805059e-02 TRUE
## 19 pca.PC114 -0.08018154 3.549163e-02 TRUE
## 20  pca.PC32 -0.08041206 3.496453e-02 TRUE
## 21 pca.PC127 -0.08768732 2.143311e-02 TRUE
## 22 pca.PC175 -0.09936517 9.106187e-03 TRUE
## 23  pca.PC12 -0.10522508 5.732854e-03 TRUE
## 24   pca.PC4 -0.11457565 2.614685e-03 TRUE
## 25  pca.PC20 -0.12027392 1.574993e-03 TRUE
## 26  pca.PC68 -0.12668357 8.676397e-04 TRUE
## 27 pca.PC103 -0.16857075 8.773706e-06 TRUE

Coca model

# Restrict the COCA model to the vocabulary shared with the kid book model.
coca_model_filtered <- coca_model %>%
  filter(word %in% common_words)

# PCA over columns 2..301 (everything except `word`), centered and scaled.
coca_pca <- prcomp(coca_model_filtered[, 2:301], center = TRUE, scale. = TRUE)

# Component scores for every word that has a human gender rating.
# data.frame() expands the score matrix into columns named pca.PC1, pca.PC2, ...
coca_scores <- data.frame(word = coca_model_filtered$word,
                          pca = coca_pca$x) %>%
  left_join(gender_norms, by = "word") %>%
  filter(!is.na(human_gender_rating))

coca_components <- setdiff(names(coca_scores), c("word", "human_gender_rating"))

# Correlate each component with the human ratings, one cor.test() per
# component. This replaces the former group_by() %>% nest(-word) %>% unnest()
# chain, which depends on the deprecated pre-1.0 tidyr interface.
# NOTE(review): the p-values are not corrected for multiple comparisons across
# components.
map(coca_components, function(component) {
  test <- cor.test(coca_scores[[component]], coca_scores$human_gender_rating)
  data.frame(component = component,
             estimate = unname(test$estimate),
             p.value = test$p.value,
             stringsAsFactors = FALSE)
}) %>%
  bind_rows() %>%
  arrange(-estimate) %>%
  mutate(sig = p.value < .05) %>%
  select(component, estimate, p.value, sig) %>%
  filter(sig) %>%
  data.frame()

##    component    estimate      p.value  sig
## 1  pca.PC159  0.13667230 3.240855e-04 TRUE
## 2   pca.PC17  0.13172252 5.324629e-04 TRUE
## 3   pca.PC47  0.12687928 8.516154e-04 TRUE
## 4    pca.PC3  0.11851930 1.845311e-03 TRUE
## 5  pca.PC188  0.09969201 8.879393e-03 TRUE
## 6  pca.PC216  0.09162190 1.622086e-02 TRUE
## 7    pca.PC5  0.09060714 1.744592e-02 TRUE
## 8   pca.PC91  0.09044326 1.765118e-02 TRUE
## 9   pca.PC95  0.08886294 1.974134e-02 TRUE
## 10 pca.PC165  0.08716606 2.222268e-02 TRUE
## 11 pca.PC136  0.08624531 2.367911e-02 TRUE
## 12  pca.PC80  0.08380350 2.794799e-02 TRUE
## 13 pca.PC195  0.08160136 3.234980e-02 TRUE
## 14 pca.PC169  0.07733680 4.257308e-02 TRUE
## 15  pca.PC15 -0.07676635 4.412810e-02 TRUE
## 16   pca.PC2 -0.07698951 4.351415e-02 TRUE
## 17  pca.PC34 -0.07715576 4.306150e-02 TRUE
## 18 pca.PC103 -0.07739398 4.241976e-02 TRUE
## 19  pca.PC99 -0.07806757 4.064878e-02 TRUE
## 20 pca.PC237 -0.07831487 4.001448e-02 TRUE
## 21   pca.PC7 -0.08612160 2.388096e-02 TRUE
## 22 pca.PC104 -0.13478591 3.923717e-04 TRUE
## 23  pca.PC25 -0.14748504 1.032970e-04 TRUE
## 24   pca.PC4 -0.17837762 2.500332e-06 TRUE

PCA model bias

Molly Lewis

2019-07-23

Kid book model

Coca model