Distributional semantics as a predictor of human similarity judgments

Reproduction of 6b with language data

# Stack the human and language-model similarity tables row-wise, then pin the
# level order of the three factors used for plotting (texture, group, animal).
# `animal_order` is defined elsewhere in the file.
all_data <- human_data %>%
  bind_rows(language_data) %>%
  mutate(
    texture = fct_relevel(texture, "scales", "skin", "fur", "feathers"),
    group = fct_relevel(group, "S", "CB", "language"),
    animal = fct_relevel(animal, animal_order)
  )

# 
# Heatmap of the similarity values for every group except "language", one
# panel per group, with the fill scale fixed to [0, 1].
ggplot(
  data = filter(all_data, group != "language"),
  mapping = aes(x = texture, y = fct_rev(animal), fill = similarity)
) +
  geom_tile() +
  facet_wrap(~ group) +
  labs(y = "animal") +
  scale_fill_gradient(low = "white", high = "black", limits = c(0, 1)) +
  theme_classic() +
  theme(legend.position = "bottom")

# Same heatmap for the "language" rows, one panel per set_id. No limits are
# given to the fill scale here, so it spans the observed range of values.
ggplot(
  data = filter(all_data, group == "language"),
  mapping = aes(x = texture, y = fct_rev(animal), fill = similarity)
) +
  geom_tile() +
  facet_wrap(~ set_id) +
  labs(y = "animal") +
  scale_fill_gradient(low = "white", high = "black") +
  theme_classic() +
  theme(legend.position = "bottom")

#cowplot::plot_grid(p1, p2, ncol = 2,label_size = 16, rel_widths = c(2,1))

Correlations with human similarity

Raw similarity

All

# Language rows only. The similarity column is renamed so it can sit next to
# the human `similarity` column after the join below.
all_language_only <- filter(all_data, group == "language") %>%
  select(-group, -correct) %>%
  rename(language_similarity = similarity)

# Natural full join (on whatever columns the two tables share) of language and
# human similarities. Human similarities that are exactly 0 are replaced by
# .0001.
# NOTE(review): the clamp looks like preparation for a log transform, but no
# log is applied to `similarity` in this section -- confirm it is still wanted.
lang_raw_comparison_all <- all_language_only %>%
  full_join(human_data) %>%
  mutate(
    similarity = case_when(
      similarity == 0 ~ .0001,
      TRUE ~ similarity
    )
  )

# Human vs. language similarity with a linear fit per group, one panel per
# set_id.
lang_raw_comparison_all %>%
  ggplot(aes(x = language_similarity, y = similarity, colour = group)) +
  labs(y = "human similarity") +
  facet_wrap(~ set_id) +
  geom_point(alpha = .8) +
  geom_smooth(method = "lm") +
  theme_classic()

# Correlation between language and human similarity (cor.test() with its
# default settings), computed separately for every group x set_id combination
# and flattened into one row per combination via broom::tidy().
corrs2 <- lang_raw_comparison_all %>%
  group_by(group, set_id) %>%
  nest() %>%
  mutate(test = map(data, ~ tidy(cor.test(.x$language_similarity,
                                          .x$similarity)))) %>%
  select(-data) %>%
  # Name the list-column: a bare unnest() is deprecated since tidyr 1.0.0.
  unnest(test)

# Correlation estimate with its confidence interval against set_id.
# The jitter is drawn once and stored in a column: calling jitter() inside
# aes() re-draws the random offsets for every layer, so the line vertices and
# the point-ranges would not coincide.
corrs2 %>%
  ungroup() %>%  # jitter over all rows, as aes(jitter(set_id)) would
  mutate(set_id_jittered = jitter(set_id)) %>%
  ggplot(aes(x = set_id_jittered, y = estimate,
             group = group, color = group)) +
  geom_line() +
  geom_pointrange(aes(ymin = conf.low, ymax = conf.high)) +
  # A constant reference line is a parameter, not a per-row aesthetic.
  geom_hline(yintercept = 0, linetype = 2) +
  xlab("number of anchor words") +
  ylab("correlation estimate") +
  theme_classic()

Correct only

# Human rows flagged as correct (correct == 1); the flag itself is dropped.
correct_human_data <- select(filter(human_data, correct == 1), -correct)

# Language similarities restricted to the animal-texture pairs present in
# correct_human_data. The right join keeps every such pair even when the
# language data has no row for it.
correct_language_data <- language_data %>%
  select(-group) %>%
  right_join(distinct(correct_human_data, animal, texture)) %>%
  rename(language_similarity = similarity)

# Natural full join of language and human similarities for the correct pairs,
# with the language similarity log-transformed.
# NOTE(review): log() gives NaN for negative input and -Inf for 0 -- confirm
# the language similarities are strictly positive.
lang_raw_comparison <- correct_language_data %>%
  full_join(correct_human_data) %>%
  mutate(language_similarity = log(language_similarity))

# Human vs. (log) language similarity with a linear fit per group, one panel
# per set_id.
lang_raw_comparison %>%
  ggplot(aes(x = language_similarity, y = similarity, colour = group)) +
  labs(y = "human similarity") +
  facet_wrap(~ set_id) +
  geom_point(alpha = .8) +
  geom_smooth(method = "lm") +
  theme_classic()

# Correlation between (log) language similarity and human similarity for the
# correct pairs (cor.test() with its default settings), computed separately
# for every group x set_id combination and flattened via broom::tidy().
corrs1 <- lang_raw_comparison %>%
  group_by(group, set_id) %>%
  nest() %>%
  mutate(test = map(data, ~ tidy(cor.test(.x$language_similarity,
                                          .x$similarity)))) %>%
  select(-data) %>%
  # Name the list-column: a bare unnest() is deprecated since tidyr 1.0.0.
  unnest(test)

# Correlation estimate with its confidence interval against set_id.
# The jitter is drawn once and stored in a column: calling jitter() inside
# aes() re-draws the random offsets for every layer, so the line vertices and
# the point-ranges would not coincide.
corrs1 %>%
  ungroup() %>%  # jitter over all rows, as aes(jitter(set_id)) would
  mutate(set_id_jittered = jitter(set_id)) %>%
  ggplot(aes(x = set_id_jittered, y = estimate,
             group = group, color = group)) +
  geom_line() +
  geom_pointrange(aes(ymin = conf.low, ymax = conf.high)) +
  # A constant reference line is a parameter, not a per-row aesthetic.
  geom_hline(yintercept = 0, linetype = 2) +
  xlab("number of anchor words") +
  ylab("correlation estimate") +
  theme_classic()

Diff similarity

All

# Similarity of one texture relative to all the others.
#
# Args:
#   df: data frame with `texture` and `similarity` columns.
#   current_texture: the texture value treated as the target.
#
# Returns:
#   A data.frame with a single column `similarity`: the similarity of the
#   row(s) whose texture equals `current_texture` minus the mean similarity
#   of all rows with a different texture.
get_diff_sim <- function(df, current_texture) {
  # Similarity of the target texture. It is inserted into a one-row summary
  # below, so `df` is presumably expected to hold exactly one row per texture.
  target_similarity <- df %>%
    filter(texture == current_texture) %>%
    pull(similarity)

  diff_value <- df %>%
    filter(texture != current_texture) %>%
    summarize(incorrect_similarity = mean(similarity)) %>%
    mutate(
      correct_similarity = target_similarity,
      x = correct_similarity - incorrect_similarity
    ) %>%
    pull(x)

  data.frame(similarity = diff_value)
}
# Language data: for every set_id x animal x texture cell, the similarity of
# that texture minus the mean similarity of the animal's other textures
# (computed by get_diff_sim on the animal's nested table).
correct_language_data_diff <- language_data %>%
  select(-group) %>%
  group_by(set_id, animal) %>%
  nest() %>%
  # One row per texture, each carrying the full nested table of its animal.
  # The join keys are spelled out instead of relying on a natural join.
  right_join(
    distinct(language_data, set_id, animal, texture),
    by = c("set_id", "animal")
  ) %>%
  mutate(temp = map2(data, texture, get_diff_sim)) %>%
  select(-data) %>%
  # Name the list-column: a bare unnest() is deprecated since tidyr 1.0.0.
  unnest(temp) %>%
  rename(language_similarity = similarity) %>%
  arrange(set_id, animal, texture)

# Human data: the same difference score, per group x animal x texture cell.
correct_human_data_diff <- human_data %>%
  select(-correct) %>%
  group_by(group, animal) %>%
  nest() %>%
  right_join(
    distinct(human_data, group, animal, texture),
    by = c("group", "animal")
  ) %>%
  mutate(temp = map2(data, texture, get_diff_sim)) %>%
  select(-data) %>%
  arrange(group, animal, texture) %>%
  unnest(temp)

# Pair every language difference score with the human difference scores for
# the same animal-texture cell.
diff_comparison_both <- full_join(
  correct_language_data_diff,
  correct_human_data_diff,
  by = c("animal", "texture")
)

# Human vs. language difference scores with a linear fit per group, one panel
# per set_id.
diff_comparison_both %>%
  ggplot(aes(x = language_similarity, y = similarity, colour = group)) +
  labs(y = "human similarity") +
  facet_wrap(~ set_id) +
  geom_point(alpha = .8) +
  geom_smooth(method = "lm") +
  theme_classic()

# Correlation between language and human difference scores (cor.test() with
# its default settings), computed separately for every group x set_id
# combination and flattened via broom::tidy().
corrs <- diff_comparison_both %>%
  group_by(group, set_id) %>%
  nest() %>%
  mutate(test = map(data, ~ tidy(cor.test(.x$language_similarity,
                                          .x$similarity)))) %>%
  select(-data) %>%
  # Name the list-column: a bare unnest() is deprecated since tidyr 1.0.0.
  unnest(test)

# Correlation estimate with its confidence interval against set_id.
# The jitter is drawn once and stored in a column: calling jitter() inside
# aes() re-draws the random offsets for every layer, so the line vertices and
# the point-ranges would not coincide.
corrs %>%
  ungroup() %>%  # jitter over all rows, as aes(jitter(set_id)) would
  mutate(set_id_jittered = jitter(set_id)) %>%
  ggplot(aes(x = set_id_jittered, y = estimate,
             group = group, color = group)) +
  geom_line() +
  geom_pointrange(aes(ymin = conf.low, ymax = conf.high)) +
  # A constant reference line is a parameter, not a per-row aesthetic.
  geom_hline(yintercept = 0, linetype = 2) +
  xlab("number of anchor words") +
  ylab("correlation estimate") +
  theme_classic()

Correct Only

# Restrict the difference scores to the animal-texture pairs present in
# correct_human_data. The pairs are de-duplicated before joining:
# correct_human_data still carries its `group` column, so the same pair can
# occur once per group, and joining on the repeated pairs would duplicate
# every matching row (inflating n in the correlations below).
diff_comparison_both_correct <- diff_comparison_both %>%
  right_join(
    distinct(correct_human_data, animal, texture),
    by = c("animal", "texture")
  )

# Human vs. language difference scores for the correct pairs, with a linear
# fit per group, one panel per set_id.
ggplot(diff_comparison_both_correct, aes(x = language_similarity,
                                         y = similarity, color = group)) +
  ylab("human similarity") +
  facet_wrap(~set_id) +
  geom_point(alpha = .8) +
  geom_smooth(method = "lm") +
  theme_classic()

# Correlation between language and human difference scores for the correct
# pairs (cor.test() with its default settings), computed separately for every
# group x set_id combination and flattened via broom::tidy().
corrs3 <- diff_comparison_both_correct %>%
  group_by(group, set_id) %>%
  nest() %>%
  mutate(test = map(data, ~ tidy(cor.test(.x$language_similarity,
                                          .x$similarity)))) %>%
  select(-data) %>%
  # Name the list-column: a bare unnest() is deprecated since tidyr 1.0.0.
  unnest(test)

# Correlation estimate with its confidence interval against set_id.
# The jitter is drawn once and stored in a column: calling jitter() inside
# aes() re-draws the random offsets for every layer, so the line vertices and
# the point-ranges would not coincide.
corrs3 %>%
  ungroup() %>%  # jitter over all rows, as aes(jitter(set_id)) would
  mutate(set_id_jittered = jitter(set_id)) %>%
  ggplot(aes(x = set_id_jittered, y = estimate,
             group = group, color = group)) +
  geom_line() +
  geom_pointrange(aes(ymin = conf.low, ymax = conf.high)) +
  # A constant reference line is a parameter, not a per-row aesthetic.
  geom_hline(yintercept = 0, linetype = 2) +
  xlab("number of anchor words") +
  ylab("correlation estimate") +
  theme_classic()

```

Distributional semantics as a predictor of human similarity judgments

Texture

Molly Lewis

2019-06-03

Reproduction of 6b with language data

Correlations with human similarity

Raw similarity

All

Correct only

Diff similarity

All

Correct Only