# Column layout shared by both raters' coding sheets. "X" and "X2" are
# placeholder names for two columns that are dropped right after reading.
RATER_COL_NAMES <- c(
  "word", "X",
  "animacy_animate", "animacy_inanimate", "animacy_both", "animacy_dk",
  "X2",
  "humanness_human", "humanness_nonhuman", "humanness_both", "humanness_dk"
)

# Read one rater's coding sheet and tag every row with that rater's id.
#   path: location of the rater's csv file.
#   id:   numeric id stored in the `rater_id` column.
# The first two lines of the file are skipped (presumably header rows --
# confirm against the raw files) and columns are named via RATER_COL_NAMES.
read_rater_sheet <- function(path, id) {
  read_csv(path, skip = 2, col_names = RATER_COL_NAMES) %>%
    select(-X, -X2) %>%
    mutate(rater_id = id)
}

# Keep `word`, `rater_id` and the columns whose names contain `measure_name`,
# then stack those columns into long format (`measure` holds the former
# column name, `value` holds the cell content).
stack_measure <- function(rater, measure_name) {
  rater %>%
    select(word, rater_id, contains(measure_name)) %>%
    # Exclude the id columns by name rather than by position.
    gather("measure", "value", -word, -rater_id)
}

p1 <- here("data/raw/other_norms/animacy_rater1.csv")
rater1 <- read_rater_sheet(p1, 1)
rater1_animacy <- stack_measure(rater1, "animacy")
rater1_humanness <- stack_measure(rater1, "humanness")

p2 <- here("data/raw/other_norms/animacy_rater2.csv")
rater2 <- read_rater_sheet(p2, 2)
rater2_animacy <- stack_measure(rater2, "animacy")
rater2_humanness <- stack_measure(rater2, "humanness")

# Pool the long-format codes of both raters and both measures, keep only the
# cells equal to 1, and split names such as "animacy_animate" into the
# measure ("animacy") and the code itself ("animate").
animacy_data <- bind_rows(
  rater1_animacy,
  rater1_humanness,
  rater2_animacy,
  rater2_humanness
) %>%
  filter(value == 1) %>%
  # The 0/1 indicator is no longer needed; `value` is reused below for the
  # code label produced by separate().
  select(-value) %>%
  separate(measure, into = c("measure", "value"), sep = "_")


# For every word x measure pair, collect all codes into a list column
# (`values`) and record how many distinct codes there are (`n_vals`).
# summarize() removes only the last grouping variable, so the result is
# still grouped by `word`.
tidy_data <- animacy_data %>%
  # Strip literal double-quote characters from the words.
  mutate(word = str_replace_all(word, '\"', "")) %>%
  group_by(word, measure) %>%
  summarize(values = list(value)) %>%
  mutate(
    n_vals = map_dbl(values, function(codes) length(unique(codes)))
  )

The raters disagree (n_vals = 2) on a little under one-half of all words (about 42% for animacy and 48% for humanness):

# Number of words, per measure, with each count of distinct codes.
# Grouping is removed first so count() tabulates over the whole table.
count(ungroup(tidy_data), measure, n_vals) %>%
  kable()
measure n_vals n
animacy 1 1323
animacy 2 957
humanness 1 1182
humanness 2 1098

Let’s look at the words with only one value (n_vals = 1), i.e., the words on which the raters’ codes agree.

# Keep the word x measure pairs that have a single distinct code and unpack
# that code from the list column into a plain character column `value`.
data_agreed <- tidy_data %>%
  ungroup() %>%
  filter(n_vals == 1) %>%
  # All entries of `values` are the same here, so the first one is the code.
  mutate(value = map_chr(values, ~ .x[[1]])) %>%
  select(word, measure, value)

# How often each code occurs, per measure, among these words.
count(data_agreed, measure, value) %>%
  kable()
measure value n
animacy animate 762
animacy both 16
animacy inanimate 545
humanness both 286
humanness human 596
humanness nonhuman 300
RATING_DEMO_PATH <- here("data/processed/words/gender_ratings_demographics.csv")
rating_demos <- read_csv(RATING_DEMO_PATH)

# Ids of participants who answered 0 to either native-speaker question
# (presumably 0 codes "not a native English speaker" -- confirm against the
# survey coding).
non_native_participants <- rating_demos %>%
  filter(
    question_name %in% c("native_english", "native"),
    response_str == 0
  ) %>%
  pull(subj_id)

# Demographic responses with those participants excluded.
rating_demos_ex <- filter(
  rating_demos,
  !(subj_id %in% non_native_participants)
)

# Number of participants giving each response to the gender question
# (after participant exclusions).
gender_data <- rating_demos_ex %>%
  filter(question_name == "gender") %>%
  count(response_str)

# Median reported age. Responses that cannot be parsed as numbers become NA
# in as.numeric() and are ignored by median() via na.rm = TRUE.
age_median <- rating_demos_ex %>%
  filter(question_name == "age") %>%
  mutate(response_str = as.numeric(response_str)) %>%
  summarize(median_age = median(response_str, na.rm = TRUE))

## TH
# Responses to the education question, with the answer stored as a factor
# in `edu` and the now-constant `question_name` column removed.
edu_data <- rating_demos_ex %>%
  filter(question_name == "education") %>%
  select(-question_name) %>%
  rename(edu = response_str) %>%
  mutate(edu = as.factor(edu))

RATING_RATINGS_PATH <- here("data/processed/words/gender_ratings.csv")

# Read the ratings with explicit column types (character ids and words,
# numeric ratings) and drop the excluded non-native participants.
ratings <- read_csv(
  RATING_RATINGS_PATH,
  col_names = TRUE,
  col_types = cols(subj_id = "c", word = "c", rating = "n")
) %>%
  filter(!(subj_id %in% non_native_participants))


# Smallest and largest number of ratings contributed by a single participant.
N_ratings_per_subject <- ratings %>%
  count(subj_id) %>%
  summarize(min = min(n), max = max(n))

# Mean rating of each word across participants. mean() is called without
# na.rm, so a word with any missing rating gets an NA mean.
by_word_means <- ungroup(
  summarize(group_by(ratings, word), mean_rating = mean(rating))
)

# Attach each word's mean rating to the agreed-upon codes. The key is named
# explicitly so the join cannot silently pick up additional shared columns;
# words without any rating keep an NA `mean_rating` (left join).
words_with_ratings <- data_agreed %>%
  left_join(by_word_means, by = "word")

# Density of by-word mean ratings, one panel per code (rows) and measure
# (columns). The dashed vertical line marks a rating of 3 (presumably the
# scale midpoint -- confirm against the rating scale).
ggplot(words_with_ratings, aes(x = mean_rating)) +
  geom_density(aes(fill = value), alpha = .2) +
  facet_grid(value ~ measure, drop = TRUE, scales = "free_x") +
  geom_vline(aes(xintercept = 3), linetype = 2) +
  theme_classic()

# Summary of `mean_rating` for each measure x code cell, ignoring words with
# no rating. multi_boot_standard() is defined outside this chunk; the plot
# below reads the columns `mean`, `ci_lower` and `ci_upper` from its output.
gender_rating_by_word_type <- words_with_ratings %>%
  group_by(measure, value) %>%
  multi_boot_standard(col = "mean_rating", na.rm = TRUE)
  

# Mean rating with its interval for each code, one panel per measure. The
# dashed horizontal line marks a rating of 3 (presumably the scale midpoint
# -- confirm against the rating scale).
ggplot(gender_rating_by_word_type, aes(x = value, y = mean)) +
  facet_wrap(~ measure, drop = TRUE, scales = "free_x") +
  geom_hline(aes(yintercept = 3), linetype = 2) +
  geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
  theme_classic()