For each animal pair, we have human judgements of similarity along 5 dimensions ( shape, skin/texture, color) and language estimates of similarity. The human judgements of similarity come from a card sorting task and are at the group level (blind vs. sighted).
The language estimates of similarity come from word embedding models. I identified all the unique color/texture/skin words listed by participants in describing the animals, and removed a few that were obviously unrelated to teh dimension). E.g., here are the words for color:
"white" "black" "red" "light" "gold" "blue" "brown" "dark" "yellow" "neutral" "multi" "orange"
"grey" "gray" "medium" "pattern" "pink" "bright" "patterns" "spots" "tan" "amber" "stripes" "tawny" "ochre"
Then, for each animal, I created a 25-D vector, where each dimension corresponds to one of the color words. Each value corresponds to the cosine distance between the vector for that animal and the vector for that word. I then calculated the distance between each animal based on these color vectors. I did an analagous thing for the other dimensions.
TAXONOMIC_DATA <- here("data/processed/animal_distances_taxonomic.csv")
taxonomic_long <- read_csv(TAXONOMIC_DATA) %>%
rename(taxo_similarity = similarity)
LANG_ANIMAL_DISTANCE_COLOR <- here("data/processed/animal_color_distances_language_wiki.csv")
LANG_ANIMAL_DISTANCE_SHAPE<- here("data/processed/animal_shape_distances_language_wiki.csv")
LANG_ANIMAL_DISTANCE_TEXTURE <- here("data/processed/animal_texture_distances_language_wiki.csv")
TIDY_HUMAN_PATH <- here("data/processed/tidy_human_data.csv")
language_data <- read_csv(LANG_ANIMAL_DISTANCE_COLOR) %>%
left_join(read_csv(LANG_ANIMAL_DISTANCE_SHAPE), by = c("animal1", "animal2")) %>%
left_join(read_csv(LANG_ANIMAL_DISTANCE_TEXTURE),by = c("animal1", "animal2")) %>%
select(-contains("PCA"))
# mutate(language_similarity_simple_dist = -language_similarity_simple_dist)
human_data <- read_csv(TIDY_HUMAN_PATH)
full_sim_data <- full_join(language_data, human_data, by = c("animal1", "animal2"))
human_data_wide <- human_data %>%
unite("measure", c("participant_type", "similarity_type")) %>%
spread(measure, human_similarity)
full_sim_data_wide <- full_join(language_data, human_data_wide,
by = c("animal1", "animal2")) %>% full_join(taxonomic_long) %>%
filter(animal1 < animal2)
full_sim_data_wide2 <- full_join(language_data, human_data,
by = c("animal1", "animal2")) %>%
spread(similarity_type, human_similarity) %>%
full_join(taxonomic_long) %>%
filter(animal1 < animal2)
Each data point here is an animal pair
plot_data <- full_sim_data_wide %>%
select_if(is.numeric)
long_corr <- cor(plot_data,
use = "pairwise.complete.obs") %>%
as.data.frame() %>%
rownames_to_column("v2") %>%
gather("v1", "estimate", -v2)
long_p <- corrplot::cor.mtest(plot_data,
use = "pairwise.complete.obs")$p %>%
as.data.frame(row.names = names(plot_data)) %>%
do(setNames(.,names(plot_data))) %>%
rownames_to_column("v2") %>%
gather("v1", "p", -v2)
corr_df <- full_join(long_corr, long_p) %>%
mutate(estimate_char = case_when(v1 == v2 ~ "",
TRUE ~ as.character(round(estimate,2))),
estimate = case_when(v1 == v2 ~ as.numeric(NA),
TRUE ~ estimate),
estimate_color = case_when(p < .05 ~ estimate, TRUE ~ 0 ))
ggplot(corr_df, aes(v1, fct_rev(v2), fill = estimate_color)) +
geom_tile() + #rectangles for each correlation
#add actual correlation value in the rectangle
geom_text(aes(label = estimate_char), size=3) +
scale_fill_gradient2(low ="blue", mid = "white", high = "red",
midpoint = 0, space = "Lab", guide = "colourbar",
name = "Pearson's r") +
ggtitle("Pairwise Correlation Coefficients") +
theme_classic(base_size = 12) +
theme(axis.text.x = element_text(angle = 45, hjust = 1), #, hjust = .95, vjust = .2),
axis.title.x=element_blank(),
axis.title.y=element_blank(),
axis.ticks = element_blank(),
legend.position = "none")
Regressions showing that both taxonomic and language similarity predict human similarity (similiar pattern for other dimensions).
lm(human_similarity_color~ language_similarity_simple_dist_color+
taxo_similarity , data = full_sim_data_wide2 %>% mutate_if(is.numeric, scale) %>% filter(participant_type == "blind")) %>%
summary()
##
## Call:
## lm(formula = human_similarity_color ~ language_similarity_simple_dist_color +
## taxo_similarity, data = full_sim_data_wide2 %>% mutate_if(is.numeric,
## scale) %>% filter(participant_type == "blind"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.19043 -0.48458 -0.06184 0.37166 3.15547
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.18756 0.03199 5.864 8.99e-09
## language_similarity_simple_dist_color -0.12045 0.03200 -3.764 0.000191
## taxo_similarity -0.19940 0.03200 -6.230 1.11e-09
##
## (Intercept) ***
## language_similarity_simple_dist_color ***
## taxo_similarity ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6671 on 432 degrees of freedom
## Multiple R-squared: 0.1088, Adjusted R-squared: 0.1047
## F-statistic: 26.38 on 2 and 432 DF, p-value: 1.558e-11
lm(human_similarity_color~ language_similarity_simple_dist_color+
taxo_similarity, data = full_sim_data_wide2 %>% mutate_if(is.numeric, scale) %>% filter(participant_type == "sighted")) %>%
summary()
##
## Call:
## lm(formula = human_similarity_color ~ language_similarity_simple_dist_color +
## taxo_similarity, data = full_sim_data_wide2 %>% mutate_if(is.numeric,
## scale) %>% filter(participant_type == "sighted"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.0470 -0.7732 -0.4904 0.1773 4.2224
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.18756 0.05737 -3.269 0.00117
## language_similarity_simple_dist_color -0.09404 0.05741 -1.638 0.10212
## taxo_similarity -0.02250 0.05741 -0.392 0.69534
##
## (Intercept) **
## language_similarity_simple_dist_color
## taxo_similarity
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.197 on 432 degrees of freedom
## Multiple R-squared: 0.00651, Adjusted R-squared: 0.001911
## F-statistic: 1.415 on 2 and 432 DF, p-value: 0.2439
LANG_ANIMAL_DISTANCE_COLOR <- here("data/processed/animal_color_distances_language_google.csv")
LANG_ANIMAL_DISTANCE_SHAPE<- here("data/processed/animal_shape_distances_language_google.csv")
LANG_ANIMAL_DISTANCE_TEXTURE <- here("data/processed/animal_texture_distances_language_google.csv")
TIDY_HUMAN_PATH <- here("data/processed/tidy_human_data.csv")
language_data <- read_csv(LANG_ANIMAL_DISTANCE_COLOR) %>%
left_join(read_csv(LANG_ANIMAL_DISTANCE_SHAPE), by = c("animal1", "animal2")) %>%
left_join(read_csv(LANG_ANIMAL_DISTANCE_TEXTURE),by = c("animal1", "animal2")) %>%
select(-contains("PCA"))
# mutate(language_similarity_simple_dist = -language_similarity_simple_dist)
human_data <- read_csv(TIDY_HUMAN_PATH)
full_sim_data <- full_join(language_data, human_data, by = c("animal1", "animal2"))
human_data_wide <- human_data %>%
unite("measure", c("participant_type", "similarity_type")) %>%
spread(measure, human_similarity)
full_sim_data_wide <- full_join(language_data, human_data_wide,
by = c("animal1", "animal2")) %>%full_join(taxonomic_long) %>%
filter(animal1 < animal2)
full_sim_data_wide2 <- full_join(language_data, human_data,
by = c("animal1", "animal2")) %>%
spread(similarity_type, human_similarity) %>%
full_join(taxonomic_long) %>%
filter(animal1 < animal2)
Replication on a second corpus.
plot_data <- full_sim_data_wide %>%
select_if(is.numeric)
long_corr <- cor(plot_data,
use = "pairwise.complete.obs") %>%
as.data.frame() %>%
rownames_to_column("v2") %>%
gather("v1", "estimate", -v2)
long_p <- corrplot::cor.mtest(plot_data,
use = "pairwise.complete.obs")$p %>%
as.data.frame(row.names = names(plot_data)) %>%
do(setNames(.,names(plot_data))) %>%
rownames_to_column("v2") %>%
gather("v1", "p", -v2)
corr_df <- full_join(long_corr, long_p) %>%
mutate(estimate_char = case_when(v1 == v2 ~ "",
TRUE ~ as.character(round(estimate,2))),
estimate = case_when(v1 == v2 ~ as.numeric(NA),
TRUE ~ estimate),
estimate_color = case_when(p < .05 ~ estimate, TRUE ~ 0 ))
ggplot(corr_df, aes(v1, fct_rev(v2), fill = estimate_color)) +
geom_tile() + #rectangles for each correlation
#add actual correlation value in the rectangle
geom_text(aes(label = estimate_char), size=3) +
scale_fill_gradient2(low ="blue", mid = "white", high = "red",
midpoint = 0, space = "Lab", guide = "colourbar",
name = "Pearson's r") +
ggtitle("Pairwise Correlation Coefficients") +
theme_classic(base_size = 12) +
theme(axis.text.x = element_text(angle = 45, hjust = 1), #, hjust = .95, vjust = .2),
axis.title.x=element_blank(),
axis.title.y=element_blank(),
axis.ticks = element_blank(),
legend.position = "none")