Language and human similarity data

For each animal pair, we have human judgements of similarity along 5 dimensions (habitat, food, shape, skin, color) and language estimates of similarity. The human judgements of similarity come from a card sorting task and are at the group level (blind vs. sighted; I think there’s also subject level data if we want it).

The language estimates of similarity come from word embedding models. I identified all the unique color words listed by participants in describing the animals (N = 25; I removed a few that were not related to visual properties at all, like “water”). Here they are:

"white" "black" "red" "light" "gold" "blue" "brown" "dark" "yellow" "neutral" "multi" "orange"

"grey" "gray" "medium" "pattern" "pink" "bright" "patterns" "spots" "tan" "amber" "stripes" "tawny" "ochre"

Then, for each animal, I created a 25-D vector corresponding to each of the color words. Each value corresponds to the cosine distance between the vector for that animal and the vector for that word. I then calculated the distance between each pair of animals based on these color vectors. I started with color information only, since in principle the same procedure applies to the other dimensions; I have since done an analogous thing for the other dimensions.

# Location of the processed taxonomic table, built with here().
TAXONOMIC_DATA <- here("data/processed/animal_distances_taxonomic.csv")

# Load the table and relabel its `similarity` column as `taxo_similarity`
# so it cannot be confused with the other similarity measures once the
# tables are joined together.
taxonomic_long <- rename(
  read_csv(TAXONOMIC_DATA),
  taxo_similarity = similarity
)

Wiki

# Language-based distance tables for the Wiki section, one CSV per dimension.
LANG_ANIMAL_DISTANCE_COLOR <- here(
  "data/processed/animal_color_distances_language_wiki.csv"
)
LANG_ANIMAL_DISTANCE_SHAPE <- here(
  "data/processed/animal_shape_distances_language_wiki.csv"
)
LANG_ANIMAL_DISTANCE_TEXTURE <- here(
  "data/processed/animal_texture_distances_language_wiki.csv"
)

# Processed human data (presumably the card-sorting judgements; confirm
# against the file itself).
TIDY_HUMAN_PATH <- here("data/processed/tidy_human_data.csv")

# Read the three tables in color, shape, texture order and left-join them
# one after another on the animal pair, so the color table decides which
# rows survive. Any column matched by contains("PCA") is then dropped.
language_data <- list(
  LANG_ANIMAL_DISTANCE_COLOR,
  LANG_ANIMAL_DISTANCE_SHAPE,
  LANG_ANIMAL_DISTANCE_TEXTURE
) %>%
  lapply(read_csv) %>%
  Reduce(
    function(acc, nxt) left_join(acc, nxt, by = c("animal1", "animal2")),
    .
  ) %>%
  select(-contains("PCA"))
# Disabled sign flip, kept for reference:
# mutate(language_similarity_simple_dist = -language_similarity_simple_dist)

# Human data in long form.
human_data <- read_csv(TIDY_HUMAN_PATH)

# Long-form combination: language measures joined to every human row for
# the same animal pair.
full_sim_data <- language_data %>%
  full_join(human_data, by = c("animal1", "animal2"))

# Wide human data: participant_type and similarity_type are fused into one
# `measure` label (unite()'s default "_" separator), and each label becomes
# its own column of human_similarity values.
human_data_wide <- human_data %>%
  unite("measure", c("participant_type", "similarity_type")) %>%
  spread(key = measure, value = human_similarity)

# Fully wide table: language + human + taxonomic measures. The taxonomic
# join names no `by`, so it matches on whatever columns the tables share.
# The final filter keeps only rows where animal1 sorts before animal2
# (presumably so each unordered pair is counted once -- confirm).
full_sim_data_wide <- language_data %>%
  full_join(human_data_wide, by = c("animal1", "animal2")) %>%
  full_join(taxonomic_long) %>%
  filter(animal1 < animal2)

# Variant that keeps participant_type as a column: starts from the long
# language/human join above and spreads only similarity_type.
full_sim_data_wide2 <- full_sim_data %>%
  spread(key = similarity_type, value = human_similarity) %>%
  full_join(taxonomic_long) %>%
  filter(animal1 < animal2)

Each data point here is an animal pair

# Only the numeric columns take part in the correlation analysis.
plot_data <- select_if(full_sim_data_wide, is.numeric)

# Correlation matrix over pairwise-complete observations, reshaped so that
# each row is one (v2, v1) cell with its estimate.
long_corr <- plot_data %>%
  cor(use = "pairwise.complete.obs") %>%
  as.data.frame() %>%
  rownames_to_column(var = "v2") %>%
  gather(key = "v1", value = "estimate", -v2)

# Matching matrix of p-values from corrplot::cor.mtest(). Row and column
# names are set explicitly from plot_data before reshaping the same way.
long_p <- corrplot::cor.mtest(
  plot_data,
  use = "pairwise.complete.obs"
)$p %>%
  as.data.frame(row.names = names(plot_data)) %>%
  setNames(names(plot_data)) %>%
  rownames_to_column(var = "v2") %>%
  gather(key = "v1", value = "p", -v2)

# Combine estimates and p-values (natural join on the shared v2/v1 columns)
# and derive the plotting columns:
#   estimate_char  - label text; empty where a variable meets itself
#   estimate       - blanked to NA on those same diagonal cells
#   estimate_color - fill value; the (already blanked) estimate when
#                    p < .05, otherwise 0
corr_df <- long_corr %>%
  full_join(long_p) %>%
  mutate(
    estimate_char = if_else(v1 == v2, "", as.character(round(estimate, 2))),
    estimate = replace(estimate, v1 == v2, NA_real_),
    estimate_color = case_when(p < .05 ~ estimate, TRUE ~ 0)
  )

# Heat map of the pairwise correlations. v2 is reversed on the y axis, and
# the fill follows estimate_color on a blue-white-red scale centred on 0.
ggplot(
  data = corr_df,
  mapping = aes(x = v1, y = fct_rev(v2), fill = estimate_color)
) +
  # One tile per variable pair.
  geom_tile() +
  # Print the rounded coefficient on top of each tile.
  geom_text(mapping = aes(label = estimate_char), size = 3) +
  scale_fill_gradient2(
    name = "Pearson's r",
    low = "blue",
    mid = "white",
    high = "red",
    midpoint = 0,
    space = "Lab",
    guide = "colourbar"
  ) +
  ggtitle("Pairwise Correlation Coefficients") +
  theme_classic(base_size = 12) +
  # Strip axis titles, ticks and the legend; slant the x labels.
  theme(
    legend.position = "none",
    axis.ticks = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

# Linear model of human color similarity on the language-based color
# measure, participant type and taxonomic similarity. Every numeric column
# is passed through scale() first, so numeric predictors and the outcome
# are standardized; participant_type is left as is. The lm() call itself
# is kept token-for-token because summary() echoes it under "Call:".
summary(
  lm(
    human_similarity_color ~ language_similarity_simple_dist_color +
      participant_type + taxo_similarity,
    data = full_sim_data_wide2 %>% mutate_if(is.numeric, scale)
  )
)

## 
## Call:
## lm(formula = human_similarity_color ~ language_similarity_simple_dist_color + 
##     participant_type + taxo_similarity, data = full_sim_data_wide2 %>% 
##     mutate_if(is.numeric, scale))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.2494 -0.6622 -0.2665  0.3644  4.1476 
## 
## Coefficients:
##                                       Estimate Std. Error t value Pr(>|t|)
## (Intercept)                            0.18756    0.04659   4.026 6.18e-05
## language_similarity_simple_dist_color -0.10725    0.03297  -3.253 0.001185
## participant_typesighted               -0.37512    0.06589  -5.693 1.71e-08
## taxo_similarity                       -0.11095    0.03297  -3.366 0.000798
##                                          
## (Intercept)                           ***
## language_similarity_simple_dist_color ** 
## participant_typesighted               ***
## taxo_similarity                       ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9718 on 866 degrees of freedom
## Multiple R-squared:  0.05891,    Adjusted R-squared:  0.05565 
## F-statistic: 18.07 on 3 and 866 DF,  p-value: 2.217e-11

Google News

# Language-based distance tables for the Google News section, one CSV per
# dimension. These assignments replace the values set earlier in the file.
LANG_ANIMAL_DISTANCE_COLOR <- here(
  "data/processed/animal_color_distances_language_google.csv"
)
LANG_ANIMAL_DISTANCE_SHAPE <- here(
  "data/processed/animal_shape_distances_language_google.csv"
)
LANG_ANIMAL_DISTANCE_TEXTURE <- here(
  "data/processed/animal_texture_distances_language_google.csv"
)

# Processed human data (presumably the card-sorting judgements; confirm
# against the file itself).
TIDY_HUMAN_PATH <- here("data/processed/tidy_human_data.csv")

# Read the three tables in color, shape, texture order and left-join them
# one after another on the animal pair, so the color table decides which
# rows survive. Any column matched by contains("PCA") is then dropped.
language_data <- list(
  LANG_ANIMAL_DISTANCE_COLOR,
  LANG_ANIMAL_DISTANCE_SHAPE,
  LANG_ANIMAL_DISTANCE_TEXTURE
) %>%
  lapply(read_csv) %>%
  Reduce(
    function(acc, nxt) left_join(acc, nxt, by = c("animal1", "animal2")),
    .
  ) %>%
  select(-contains("PCA"))
# Disabled sign flip, kept for reference:
# mutate(language_similarity_simple_dist = -language_similarity_simple_dist)

# Human data in long form.
human_data <- read_csv(TIDY_HUMAN_PATH)

# Long-form combination: language measures joined to every human row for
# the same animal pair.
full_sim_data <- language_data %>%
  full_join(human_data, by = c("animal1", "animal2"))

# Wide human data: participant_type and similarity_type are fused into one
# `measure` label (unite()'s default "_" separator), and each label becomes
# its own column of human_similarity values.
human_data_wide <- human_data %>%
  unite("measure", c("participant_type", "similarity_type")) %>%
  spread(key = measure, value = human_similarity)

# Fully wide table: language + human + taxonomic measures. The taxonomic
# join names no `by`, so it matches on whatever columns the tables share.
# The final filter keeps only rows where animal1 sorts before animal2
# (presumably so each unordered pair is counted once -- confirm).
full_sim_data_wide <- language_data %>%
  full_join(human_data_wide, by = c("animal1", "animal2")) %>%
  full_join(taxonomic_long) %>%
  filter(animal1 < animal2)

# Variant that keeps participant_type as a column: starts from the long
# language/human join above and spreads only similarity_type.
full_sim_data_wide2 <- full_sim_data %>%
  spread(key = similarity_type, value = human_similarity) %>%
  full_join(taxonomic_long) %>%
  filter(animal1 < animal2)

Each data point here is an animal pair

# Only the numeric columns take part in the correlation analysis.
plot_data <- select_if(full_sim_data_wide, is.numeric)

# Correlation matrix over pairwise-complete observations, reshaped so that
# each row is one (v2, v1) cell with its estimate.
long_corr <- plot_data %>%
  cor(use = "pairwise.complete.obs") %>%
  as.data.frame() %>%
  rownames_to_column(var = "v2") %>%
  gather(key = "v1", value = "estimate", -v2)

# Matching matrix of p-values from corrplot::cor.mtest(). Row and column
# names are set explicitly from plot_data before reshaping the same way.
long_p <- corrplot::cor.mtest(
  plot_data,
  use = "pairwise.complete.obs"
)$p %>%
  as.data.frame(row.names = names(plot_data)) %>%
  setNames(names(plot_data)) %>%
  rownames_to_column(var = "v2") %>%
  gather(key = "v1", value = "p", -v2)

# Combine estimates and p-values (natural join on the shared v2/v1 columns)
# and derive the plotting columns:
#   estimate_char  - label text; empty where a variable meets itself
#   estimate       - blanked to NA on those same diagonal cells
#   estimate_color - fill value; the (already blanked) estimate when
#                    p < .05, otherwise 0
corr_df <- long_corr %>%
  full_join(long_p) %>%
  mutate(
    estimate_char = if_else(v1 == v2, "", as.character(round(estimate, 2))),
    estimate = replace(estimate, v1 == v2, NA_real_),
    estimate_color = case_when(p < .05 ~ estimate, TRUE ~ 0)
  )

# Heat map of the pairwise correlations. v2 is reversed on the y axis, and
# the fill follows estimate_color on a blue-white-red scale centred on 0.
ggplot(
  data = corr_df,
  mapping = aes(x = v1, y = fct_rev(v2), fill = estimate_color)
) +
  # One tile per variable pair.
  geom_tile() +
  # Print the rounded coefficient on top of each tile.
  geom_text(mapping = aes(label = estimate_char), size = 3) +
  scale_fill_gradient2(
    name = "Pearson's r",
    low = "blue",
    mid = "white",
    high = "red",
    midpoint = 0,
    space = "Lab",
    guide = "colourbar"
  ) +
  ggtitle("Pairwise Correlation Coefficients") +
  theme_classic(base_size = 12) +
  # Strip axis titles, ticks and the legend; slant the x labels.
  theme(
    legend.position = "none",
    axis.ticks = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

# Linear model of human color similarity on the language-based color
# measure, participant type and taxonomic similarity. Every numeric column
# is passed through scale() first, so numeric predictors and the outcome
# are standardized; participant_type is left as is. The lm() call itself
# is kept token-for-token because summary() echoes it under "Call:".
summary(
  lm(
    human_similarity_color ~ language_similarity_simple_dist_color +
      participant_type + taxo_similarity,
    data = full_sim_data_wide2 %>% mutate_if(is.numeric, scale)
  )
)

## 
## Call:
## lm(formula = human_similarity_color ~ language_similarity_simple_dist_color + 
##     participant_type + taxo_similarity, data = full_sim_data_wide2 %>% 
##     mutate_if(is.numeric, scale))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1824 -0.6535 -0.2698  0.3805  4.0856 
## 
## Coefficients:
##                                       Estimate Std. Error t value Pr(>|t|)
## (Intercept)                            0.18756    0.04662   4.023 6.24e-05
## language_similarity_simple_dist_color -0.10266    0.03298  -3.113  0.00192
## participant_typesighted               -0.37512    0.06593  -5.690 1.74e-08
## taxo_similarity                       -0.11098    0.03298  -3.365  0.00080
##                                          
## (Intercept)                           ***
## language_similarity_simple_dist_color ** 
## participant_typesighted               ***
## taxo_similarity                       ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9723 on 866 degrees of freedom
## Multiple R-squared:  0.05795,    Adjusted R-squared:  0.05469 
## F-statistic: 17.76 on 3 and 866 DF,  p-value: 3.424e-11

Distributional semantics as a predictor of human similarity judgments

Molly Lewis

2019-06-05

Language and human similarity data

Wiki

Google News