Language and human similarity data

For each animal pair, we have human judgements of similarity along 5 dimensions ( shape, skin/texture, color) and language estimates of similarity. The human judgements of similarity come from a card sorting task and are at the group level (blind vs. sighted).

The language estimates of similarity come from word embedding models. I identified all the unique color/texture/skin words listed by participants in describing the animals, and removed a few that were obviously unrelated to teh dimension). E.g., here are the words for color:

"white" "black" "red" "light" "gold" "blue" "brown" "dark" "yellow" "neutral" "multi" "orange"

"grey" "gray" "medium" "pattern" "pink" "bright" "patterns" "spots" "tan" "amber" "stripes" "tawny" "ochre"

Then, for each animal, I created a 25-D vector, where each dimension corresponds to one of the color words. Each value corresponds to the cosine distance between the vector for that animal and the vector for that word. I then calculated the distance between each animal based on these color vectors. I did an analagous thing for the other dimensions.

TAXONOMIC_DATA <- here("data/processed/animal_distances_taxonomic.csv")

taxonomic_long <- read_csv(TAXONOMIC_DATA) %>%
  rename(taxo_similarity = similarity)

Wiki

LANG_ANIMAL_DISTANCE_COLOR <- here("data/processed/animal_color_distances_language_wiki.csv")
LANG_ANIMAL_DISTANCE_SHAPE<- here("data/processed/animal_shape_distances_language_wiki.csv")
LANG_ANIMAL_DISTANCE_TEXTURE <- here("data/processed/animal_texture_distances_language_wiki.csv")

TIDY_HUMAN_PATH <- here("data/processed/tidy_human_data.csv") 
language_data <- read_csv(LANG_ANIMAL_DISTANCE_COLOR) %>%
  left_join(read_csv(LANG_ANIMAL_DISTANCE_SHAPE), by  = c("animal1", "animal2")) %>%
  left_join(read_csv(LANG_ANIMAL_DISTANCE_TEXTURE),by  = c("animal1", "animal2")) %>%
  select(-contains("PCA"))
 # mutate(language_similarity_simple_dist = -language_similarity_simple_dist)

human_data <- read_csv(TIDY_HUMAN_PATH) 
full_sim_data <- full_join(language_data, human_data, by = c("animal1", "animal2"))   

human_data_wide <- human_data %>%
  unite("measure", c("participant_type", "similarity_type")) %>%
  spread(measure, human_similarity)

full_sim_data_wide <- full_join(language_data, human_data_wide, 
                                by = c("animal1", "animal2"))  %>% full_join(taxonomic_long) %>%
  filter(animal1 < animal2)

full_sim_data_wide2 <-  full_join(language_data, human_data,
                                  by = c("animal1", "animal2"))    %>%
    spread(similarity_type, human_similarity) %>%
  full_join(taxonomic_long) %>%
  filter(animal1 < animal2)

Each data point here is an animal pair

plot_data <- full_sim_data_wide %>%
  select_if(is.numeric) 

long_corr <- cor(plot_data, 
                 use = "pairwise.complete.obs") %>%
  as.data.frame() %>%
  rownames_to_column("v2") %>%
  gather("v1", "estimate", -v2)

long_p <- corrplot::cor.mtest(plot_data, 
                              use = "pairwise.complete.obs")$p %>%
  as.data.frame(row.names = names(plot_data)) %>%
  do(setNames(.,names(plot_data))) %>%
  rownames_to_column("v2") %>%
  gather("v1", "p", -v2)

corr_df <- full_join(long_corr, long_p) %>%
  mutate(estimate_char = case_when(v1 == v2 ~ "", 
                                   TRUE ~ as.character(round(estimate,2))),
         estimate = case_when(v1 == v2 ~ as.numeric(NA), 
                              TRUE ~ estimate),
         estimate_color = case_when(p < .05 ~ estimate, TRUE ~ 0 ))

ggplot(corr_df, aes(v1, fct_rev(v2), fill = estimate_color)) + 
  geom_tile() + #rectangles for each correlation
  #add actual correlation value in the rectangle
  geom_text(aes(label = estimate_char), size=3) + 
  scale_fill_gradient2(low ="blue", mid = "white", high = "red", 
                       midpoint = 0, space = "Lab", guide = "colourbar",
                       name = "Pearson's r") +
  ggtitle("Pairwise Correlation Coefficients") +
  theme_classic(base_size = 12) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), #, hjust = .95, vjust = .2), 
        axis.title.x=element_blank(), 
        axis.title.y=element_blank(),
        axis.ticks = element_blank(),
        legend.position = "none")

Regressions showing that both taxonomic and language similarity predict human similarity (similiar pattern for other dimensions).

lm(human_similarity_color~ language_similarity_simple_dist_color+  
    taxo_similarity  ,  data = full_sim_data_wide2 %>% mutate_if(is.numeric, scale) %>% filter(participant_type == "blind")) %>%
  summary()

## 
## Call:
## lm(formula = human_similarity_color ~ language_similarity_simple_dist_color + 
##     taxo_similarity, data = full_sim_data_wide2 %>% mutate_if(is.numeric, 
##     scale) %>% filter(participant_type == "blind"))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.19043 -0.48458 -0.06184  0.37166  3.15547 
## 
## Coefficients:
##                                       Estimate Std. Error t value Pr(>|t|)
## (Intercept)                            0.18756    0.03199   5.864 8.99e-09
## language_similarity_simple_dist_color -0.12045    0.03200  -3.764 0.000191
## taxo_similarity                       -0.19940    0.03200  -6.230 1.11e-09
##                                          
## (Intercept)                           ***
## language_similarity_simple_dist_color ***
## taxo_similarity                       ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6671 on 432 degrees of freedom
## Multiple R-squared:  0.1088, Adjusted R-squared:  0.1047 
## F-statistic: 26.38 on 2 and 432 DF,  p-value: 1.558e-11

lm(human_similarity_color~ language_similarity_simple_dist_color+
    taxo_similarity,  data = full_sim_data_wide2 %>% mutate_if(is.numeric, scale) %>% filter(participant_type == "sighted")) %>%
  summary()

## 
## Call:
## lm(formula = human_similarity_color ~ language_similarity_simple_dist_color + 
##     taxo_similarity, data = full_sim_data_wide2 %>% mutate_if(is.numeric, 
##     scale) %>% filter(participant_type == "sighted"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0470 -0.7732 -0.4904  0.1773  4.2224 
## 
## Coefficients:
##                                       Estimate Std. Error t value Pr(>|t|)
## (Intercept)                           -0.18756    0.05737  -3.269  0.00117
## language_similarity_simple_dist_color -0.09404    0.05741  -1.638  0.10212
## taxo_similarity                       -0.02250    0.05741  -0.392  0.69534
##                                         
## (Intercept)                           **
## language_similarity_simple_dist_color   
## taxo_similarity                         
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.197 on 432 degrees of freedom
## Multiple R-squared:  0.00651,    Adjusted R-squared:  0.001911 
## F-statistic: 1.415 on 2 and 432 DF,  p-value: 0.2439

Google News

LANG_ANIMAL_DISTANCE_COLOR <- here("data/processed/animal_color_distances_language_google.csv")
LANG_ANIMAL_DISTANCE_SHAPE<- here("data/processed/animal_shape_distances_language_google.csv")
LANG_ANIMAL_DISTANCE_TEXTURE <- here("data/processed/animal_texture_distances_language_google.csv")

TIDY_HUMAN_PATH <- here("data/processed/tidy_human_data.csv") 
language_data <- read_csv(LANG_ANIMAL_DISTANCE_COLOR) %>%
  left_join(read_csv(LANG_ANIMAL_DISTANCE_SHAPE), by  = c("animal1", "animal2")) %>%
  left_join(read_csv(LANG_ANIMAL_DISTANCE_TEXTURE),by  = c("animal1", "animal2")) %>%
  select(-contains("PCA"))
 # mutate(language_similarity_simple_dist = -language_similarity_simple_dist)

human_data <- read_csv(TIDY_HUMAN_PATH) 
full_sim_data <- full_join(language_data, human_data, by = c("animal1", "animal2"))   

human_data_wide <- human_data %>%
  unite("measure", c("participant_type", "similarity_type")) %>%
  spread(measure, human_similarity)
full_sim_data_wide <- full_join(language_data, human_data_wide, 
                                by = c("animal1", "animal2"))   %>%full_join(taxonomic_long)  %>%
  filter(animal1 < animal2)

full_sim_data_wide2 <-  full_join(language_data, human_data,
                                  by = c("animal1", "animal2"))    %>%
    spread(similarity_type, human_similarity) %>% 
  full_join(taxonomic_long) %>%
  filter(animal1 < animal2)

Replication on a second corpus.

plot_data <- full_sim_data_wide %>%
  select_if(is.numeric) 

long_corr <- cor(plot_data, 
                 use = "pairwise.complete.obs") %>%
  as.data.frame() %>%
  rownames_to_column("v2") %>%
  gather("v1", "estimate", -v2)

long_p <- corrplot::cor.mtest(plot_data, 
                              use = "pairwise.complete.obs")$p %>%
  as.data.frame(row.names = names(plot_data)) %>%
  do(setNames(.,names(plot_data))) %>%
  rownames_to_column("v2") %>%
  gather("v1", "p", -v2)

corr_df <- full_join(long_corr, long_p) %>%
  mutate(estimate_char = case_when(v1 == v2 ~ "", 
                                   TRUE ~ as.character(round(estimate,2))),
         estimate = case_when(v1 == v2 ~ as.numeric(NA), 
                              TRUE ~ estimate),
         estimate_color = case_when(p < .05 ~ estimate, TRUE ~ 0 ))

ggplot(corr_df, aes(v1, fct_rev(v2), fill = estimate_color)) + 
  geom_tile() + #rectangles for each correlation
  #add actual correlation value in the rectangle
  geom_text(aes(label = estimate_char), size=3) + 
  scale_fill_gradient2(low ="blue", mid = "white", high = "red", 
                       midpoint = 0, space = "Lab", guide = "colourbar",
                       name = "Pearson's r") +
  ggtitle("Pairwise Correlation Coefficients") +
  theme_classic(base_size = 12) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), #, hjust = .95, vjust = .2), 
        axis.title.x=element_blank(), 
        axis.title.y=element_blank(),
        axis.ticks = element_blank(),
        legend.position = "none")

Distributional semantics as a predictor of human similarity judgments

Molly Lewis

2019-06-06

Language and human similarity data

Wiki

Google News