Distributional semantics as a predictor of human similarity judgments

Language and human similarity data
Pairwise similarity
Item-pair plot
Item plot

Language and human similarity data

For each animal pair, we have human judgements of similarity along 5 dimensions (habitat, food, shape, skin, color) and language estimates of similarity. The human judgements of similarity come from a card sorting task and are at the group level (blind vs. sighted; I think there’s also subject level data if we want it).

The language estimates of similarity come from word embedding models. I identified all the unique color words listed by participants in describing the animals (N = 25; I removed a few that were related to visual properties at all, like “water”). Here they are:

"white" "black" "red" "light" "gold" "blue" "brown" "dark" "yellow" "neutral" "multi" "orange"

"grey" "gray" "medium" "pattern" "pink" "bright" "patterns" "spots" "tan" "amber" "stripes" "tawny" "ochre"

Then, for each animal, I created a 25-D vector corresponding to each of the color words. Each value corresponds to the cosine distance between the vector for that animal and the vector for that word. I then calculated the distance between each animal based on these color vectors. I’ve only done this for color information so far, but in principle we could do this for the other dimensions.

LANG_ANIMAL_DISTANCE <- here("data/animal_color_distances_language_wiki.csv")
TIDY_HUMAN_PATH <- here("data/tidy_human_data.csv") 
language_data <- read_csv(LANG_ANIMAL_DISTANCE) %>%
  select(-contains("PCA"))
 # mutate(language_similarity_simple_dist = -language_similarity_simple_dist)

human_data <- read_csv(TIDY_HUMAN_PATH) 
full_sim_data <- full_join(language_data, human_data, by = c("animal1", "animal2"))   

human_data_wide <- human_data %>%
  unite("measure", c("participant_type", "similarity_type")) %>%
  spread(measure, human_similarity)
full_sim_data_wide <- full_join(language_data, human_data_wide, 
                                by = c("animal1", "animal2"))   

full_sim_data_wide2 <-  full_join(language_data, human_data,
                                  by = c("animal1", "animal2"))    %>%
    spread(similarity_type, human_similarity)

Pairwise similarity

Each data point here is a animal pair

plot_data <- full_sim_data_wide %>%
  select_if(is.numeric) 

long_corr <- cor(plot_data, 
                 use = "pairwise.complete.obs") %>%
  as.data.frame() %>%
  rownames_to_column("v2") %>%
  gather("v1", "estimate", -v2)

long_p <- corrplot::cor.mtest(plot_data, 
                              use = "pairwise.complete.obs")$p %>%
  as.data.frame(row.names = names(plot_data)) %>%
  do(setNames(.,names(plot_data))) %>%
  rownames_to_column("v2") %>%
  gather("v1", "p", -v2)

corr_df <- full_join(long_corr, long_p) %>%
  mutate(estimate_char = case_when(v1 == v2 ~ "", 
                                   TRUE ~ as.character(round(estimate,2))),
         estimate = case_when(v1 == v2 ~ as.numeric(NA), 
                              TRUE ~ estimate),
         estimate_color = case_when(p < .05 ~ estimate, TRUE ~ 0 ))

ggplot(corr_df, aes(v1, fct_rev(v2), fill = estimate_color)) + 
  geom_tile() + #rectangles for each correlation
  #add actual correlation value in the rectangle
  geom_text(aes(label = estimate_char), size=3) + 
  scale_fill_gradient2(low ="blue", mid = "white", high = "red", 
                       midpoint = 0, space = "Lab", guide = "colourbar",
                       name = "Pearson's r") +
  ggtitle("Pairwise Correlation Coefficients") +
  theme_classic(base_size = 12) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), #, hjust = .95, vjust = .2), 
        axis.title.x=element_blank(), 
        axis.title.y=element_blank(),
        axis.ticks = element_blank(),
        legend.position = "none")

Item-pair plot

full_sim_data_wide %>%
  select(1:4, 9) %>%
  gather( "participant_type","human_similarity", 4:5) %>%
  ggplot(aes(x = language_similarity_simple_dist,
             y = human_similarity,
             color = participant_type)) +
  geom_point(alpha =.2) +
  geom_smooth(method = "lm") +
  theme_classic()

lm(human_similarity_color~ language_similarity_simple_dist*participant_type ,  data = full_sim_data_wide2 %>% mutate_if(is.numeric, scale)) %>%
  summary()

## 
## Call:
## lm(formula = human_similarity_color ~ language_similarity_simple_dist * 
##     participant_type, data = full_sim_data_wide2 %>% mutate_if(is.numeric, 
##     scale))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.3813 -0.6959 -0.2463  0.4022  4.2562 
## 
## Coefficients:
##                                                         Estimate
## (Intercept)                                              0.18756
## language_similarity_simple_dist                         -0.11401
## participant_typesighted                                 -0.37512
## language_similarity_simple_dist:participant_typesighted  0.02211
##                                                         Std. Error t value
## (Intercept)                                                0.04691   3.998
## language_similarity_simple_dist                            0.04694  -2.429
## participant_typesighted                                    0.06635  -5.654
## language_similarity_simple_dist:participant_typesighted    0.06638   0.333
##                                                         Pr(>|t|)    
## (Intercept)                                             6.93e-05 ***
## language_similarity_simple_dist                           0.0154 *  
## participant_typesighted                                 2.13e-08 ***
## language_similarity_simple_dist:participant_typesighted   0.7392    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9784 on 866 degrees of freedom
## Multiple R-squared:  0.04594,    Adjusted R-squared:  0.04264 
## F-statistic:  13.9 on 3 and 866 DF,  p-value: 7.384e-09

lm(human_similarity_color~ language_similarity_simple_dist*participant_type + 
     human_similarity_habitat +
     human_similarity_food + 
     human_similarity_shape,  data = full_sim_data_wide2 %>% mutate_if(is.numeric, scale)) %>%
  summary()

## 
## Call:
## lm(formula = human_similarity_color ~ language_similarity_simple_dist * 
##     participant_type + human_similarity_habitat + human_similarity_food + 
##     human_similarity_shape, data = full_sim_data_wide2 %>% mutate_if(is.numeric, 
##     scale))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.4050 -0.6388 -0.2404  0.3665  4.2425 
## 
## Coefficients:
##                                                         Estimate
## (Intercept)                                              0.17344
## language_similarity_simple_dist                         -0.10448
## participant_typesighted                                 -0.34688
## human_similarity_habitat                                 0.16647
## human_similarity_food                                    0.02447
## human_similarity_shape                                   0.08921
## language_similarity_simple_dist:participant_typesighted  0.02118
##                                                         Std. Error t value
## (Intercept)                                                0.04581   3.786
## language_similarity_simple_dist                            0.04564  -2.289
## participant_typesighted                                    0.06517  -5.322
## human_similarity_habitat                                   0.03956   4.208
## human_similarity_food                                      0.03908   0.626
## human_similarity_shape                                     0.04244   2.102
## language_similarity_simple_dist:participant_typesighted    0.06446   0.329
##                                                         Pr(>|t|)    
## (Intercept)                                             0.000164 ***
## language_similarity_simple_dist                         0.022321 *  
## participant_typesighted                                 1.31e-07 ***
## human_similarity_habitat                                2.84e-05 ***
## human_similarity_food                                   0.531355    
## human_similarity_shape                                  0.035834 *  
## language_similarity_simple_dist:participant_typesighted 0.742549    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9497 on 863 degrees of freedom
## Multiple R-squared:  0.1042, Adjusted R-squared:  0.098 
## F-statistic: 16.74 on 6 and 863 DF,  p-value: < 2.2e-16

Item plot

wide_df <- full_sim_data_wide %>%
  select(1:4, 9) %>%
  gather("participant_type","human_similarity", 4:5) 

get_item_means <- function(current_item, sim_df){
  sim_df %>%
    filter(animal1 == current_item | animal2 == current_item) %>%
    group_by(participant_type) %>%
    summarize(language_similarity_simple_dist = mean(language_similarity_simple_dist),
              human_similarity = mean(human_similarity)) %>%
    mutate(item = current_item)
}

by_item_means <- map_df(unique(c(wide_df$animal1,wide_df$animal2)), get_item_means, wide_df)

by_item_means %>%
  ggplot(aes(x = log(language_similarity_simple_dist),
             y = human_similarity,
             color = participant_type)) +
  facet_wrap(~participant_type) +
  geom_label(aes(label = item)) +
  geom_smooth(method = "lm") +
  theme_classic()

Distributional semantics as a predictor of human similarity judgments

Molly Lewis

2019-05-28

Language and human similarity data

Pairwise similarity

Item-pair plot

Item plot