I will make these a bit prettier, but basically they just convert the existing data frames with distance/ similarity info into “distance” matrices.
#### useful functions ####
convert_similarity_to_matrix <- function(wide_data) {
#work into symmetric matrix (messy)
#convert to matrix
m <- as.matrix(wide_data)
#add extra row
m <- rbind(m,c(rep(NA,ncol(m)-1),1.0))
row.names(m)[nrow(m)] <- colnames(m)[ncol(m)]
#add extra column
m <- cbind(c(1.0,rep(NA,nrow(m)-1)),m)
colnames(m)[1] <- row.names(m)[1]
#convert into symmetric matrix (using forceSymmetric in Matrix package)
diag(m) <- 1
m <- forceSymmetric(m)
#return
m
}
convert_similarity_to_distance <- function(wide_data, col_name,reverse_dist=T, human_data=T) {
#convert human similarity values based on a similarity column name
#extract subset of wide human data
temp <- as.data.frame(wide_data) %>%
select(animal1,animal2, !!col_name) %>%
spread(animal2,!!col_name,fill="", convert=T)
#change animal1 column to row name
row.names(temp) <- temp$animal1
temp <- temp %>%
select(-animal1)
if (human_data) {
#convert to symmetric matrix
temp <- convert_similarity_to_matrix(temp)
} else {
#convert to matrix
temp <- as.matrix(temp)
}
if (reverse_dist) {
#convert from similarity to "distance"
temp <- 1- temp
}
#return
temp
}
LANG_ANIMAL_DISTANCE_COLOR <- here("data/processed/animal_color_distances_language_wiki.csv")
LANG_ANIMAL_DISTANCE_COLOR
## [1] "/Users/martinzettersten/Documents/GitHub/keb_2019_reanalysis/data/processed/animal_color_distances_language_wiki.csv"
LANG_ANIMAL_DISTANCE_SHAPE<- here("data/processed/animal_shape_distances_language_wiki.csv")
LANG_ANIMAL_DISTANCE_TEXTURE <- here("data/processed/animal_texture_distances_language_wiki.csv")
TIDY_HUMAN_PATH <- here("data/processed/tidy_human_data.csv")
language_data <- read_csv(LANG_ANIMAL_DISTANCE_COLOR) %>%
left_join(read_csv(LANG_ANIMAL_DISTANCE_SHAPE), by = c("animal1", "animal2")) %>%
left_join(read_csv(LANG_ANIMAL_DISTANCE_TEXTURE),by = c("animal1", "animal2")) %>%
select(-contains("PCA"))
human_data <- read_csv(TIDY_HUMAN_PATH)
human_data_wide <- human_data %>%
unite("measure", c("participant_type", "similarity_type")) %>%
spread(measure, human_similarity)
sighted_human_color_clust <- human_data_wide %>%
convert_similarity_to_distance("sighted_human_similarity_color") %>%
as.dist() %>%
hclust()
ggdendro::ggdendrogram(sighted_human_color_clust, rotate = T) +
ggtitle("Sighted Human Similarity Color")
blind_human_color_clust <- human_data_wide %>%
convert_similarity_to_distance("blind_human_similarity_color") %>%
as.dist() %>%
hclust()
ggdendro::ggdendrogram(blind_human_color_clust, rotate = T) +
ggtitle("Blind Human Similarity Color")
language_color_clust <- language_data %>%
convert_similarity_to_distance("language_similarity_simple_dist_color", reverse_dist=F, human_data=F) %>%
as.dist() %>%
hclust()
ggdendro::ggdendrogram(language_color_clust, rotate = T) +
ggtitle("Language Distances Color")
Lower entanglement is better (0 is perfect alignment).
dends <- dendlist(as.dendrogram(blind_human_color_clust), as.dendrogram(sighted_human_color_clust))
x <- tanglegram(dends, common_subtrees_color_branches = TRUE, highlight_branches_lwd = F)
paste("entanglement = ",round(entanglement(x), 2))
## [1] "entanglement = 0.5"
dends <- dendlist(as.dendrogram(blind_human_color_clust), as.dendrogram(sighted_human_color_clust))
x <- dends %>%
untangle(method = "step2side") %>%
tanglegram( common_subtrees_color_branches = TRUE,highlight_branches_lwd = F)
paste("entanglement = ",round(entanglement(x), 2))
## [1] "entanglement = 0.22"
See https://cran.r-project.org/web/packages/dendextend/vignettes/introduction.html#the-fowlkes-mallows-index-and-the-bk-plot for details.
The FM-Index is a measure of similarity between two clusterings (higher means greater similarity). The Bk plot shows the FM-INdex for various values of k, where k is the number of clusters. The dashed black line shows the expected value assuming no connection between the clusterings, the red line shows the critical significance level. Dotted points above indicate significantly similar clusterings, i.e. better than one would expect from simply reshuffling labels.
Bk_plot(as.dendrogram(blind_human_color_clust), as.dendrogram(sighted_human_color_clust), main="FM-Index for different numbers of clusters.\nDots closer to 1 indicate more similar clusterings.\nDots above red line are significant.")
Choosing a smaller cluster number (k=5).
#five clusters
print("FIVE CLUSTERS")
## [1] "FIVE CLUSTERS"
adjustedRand(cutree(blind_human_color_clust, k=5), cutree(sighted_human_color_clust, k=5))
## Rand HA MA FM Jaccard
## 0.7057471 0.1913559 0.2689056 0.3846865 0.2380952
Lower entanglement is better (0 is perfect alignment).
dends <- dendlist(as.dendrogram(sighted_human_color_clust), as.dendrogram(language_color_clust))
x <- tanglegram(dends, common_subtrees_color_branches = TRUE, highlight_branches_lwd = F)
paste("entanglement = ",round(entanglement(x), 2))
## [1] "entanglement = 0.57"
x <- dends %>%
untangle(method = "step2side") %>%
tanglegram( common_subtrees_color_branches = TRUE,highlight_branches_lwd = F)
paste("entanglement = ",round(entanglement(x), 2))
## [1] "entanglement = 0.3"
Bk_plot(as.dendrogram(sighted_human_color_clust), as.dendrogram(language_color_clust), main="FM-Index for different numbers of clusters.\nDots closer to 1 indicate more similar clusterings.\nDots above red line are significant.")
Choosing a smaller cluster number.
#five clusters
print("FIVE CLUSTERS")
## [1] "FIVE CLUSTERS"
adjustedRand(cutree(sighted_human_color_clust, k=5), cutree(language_color_clust, k=5))
## Rand HA MA FM Jaccard
## 0.61379310 0.01160432 0.09502262 0.27130427 0.15577889
Lower entanglement is better (0 is perfect alignment).
dends <- dendlist(as.dendrogram(blind_human_color_clust), as.dendrogram(language_color_clust))
x <- tanglegram(dends, common_subtrees_color_branches = TRUE, highlight_branches_lwd = F)
paste("entanglement = ",round(entanglement(x), 2))
## [1] "entanglement = 0.56"
x <- dends %>%
untangle(method = "step2side") %>%
tanglegram( common_subtrees_color_branches = TRUE,highlight_branches_lwd = F)
paste("entanglement = ",round(entanglement(x), 2))
## [1] "entanglement = 0.38"
Bk_plot(as.dendrogram(blind_human_color_clust), as.dendrogram(language_color_clust), main="FM-Index for different numbers of clusters.\nDots closer to 1 indicate more similar clusterings.\nDots above red line are significant.")
Choosing a smaller cluster number (k=5).
#five clusters
print("FIVE CLUSTERS")
## [1] "FIVE CLUSTERS"
adjustedRand(cutree(blind_human_color_clust, k=5), cutree(language_color_clust, k=5))
## Rand HA MA FM Jaccard
## 0.609195402 0.009430171 0.091384184 0.274721128 0.158415842
sighted_human_shape_clust <- human_data_wide %>%
convert_similarity_to_distance("sighted_human_similarity_shape") %>%
as.dist() %>%
hclust()
ggdendro::ggdendrogram(sighted_human_shape_clust, rotate = T) +
ggtitle("Sighted Human Similarity shape")
blind_human_shape_clust <- human_data_wide %>%
convert_similarity_to_distance("blind_human_similarity_shape") %>%
as.dist() %>%
hclust()
ggdendro::ggdendrogram(blind_human_shape_clust, rotate = T) +
ggtitle("Blind Human Similarity shape")
language_shape_clust <- language_data %>%
convert_similarity_to_distance("language_similarity_simple_dist_shape", reverse_dist=F, human_data=F) %>%
as.dist() %>%
hclust()
ggdendro::ggdendrogram(language_shape_clust, rotate = T) +
ggtitle("Language Distances shape")
dends <- dendlist(as.dendrogram(blind_human_shape_clust), as.dendrogram(sighted_human_shape_clust))
x <- dends %>%
untangle(method = "step2side") %>%
tanglegram( common_subtrees_color_branches = TRUE,highlight_branches_lwd = F)
paste("entanglement = ",round(entanglement(x), 2))
## [1] "entanglement = 0.05"
Bk_plot(as.dendrogram(blind_human_shape_clust), as.dendrogram(sighted_human_shape_clust), main="FM-Index for different numbers of clusters.\nDots closer to 1 indicate more similar clusterings.\nDots above red line are significant.")
Choosing a small cluster number (k=5).
#five clusters
print("FIVE CLUSTERS")
## [1] "FIVE CLUSTERS"
adjustedRand(cutree(blind_human_shape_clust, k=5), cutree(sighted_human_shape_clust, k=5))
## Rand HA MA FM Jaccard
## 0.7471264 0.3708914 0.4209172 0.5461264 0.3750000
dends <- dendlist(as.dendrogram(sighted_human_shape_clust), as.dendrogram(language_shape_clust))
x <- dends %>%
untangle(method = "step2side") %>%
tanglegram( common_subtrees_color_branches = TRUE,highlight_branches_lwd = F)
paste("entanglement = ",round(entanglement(x), 2))
## [1] "entanglement = 0.14"
Bk_plot(as.dendrogram(sighted_human_shape_clust), as.dendrogram(language_shape_clust), main="FM-Index for different numbers of clusters.\nDots closer to 1 indicate more similar clusterings.\nDots above red line are significant.")
Choosing a small cluster number (k=5).
#five clusters
print("FIVE CLUSTERS")
## [1] "FIVE CLUSTERS"
adjustedRand(cutree(sighted_human_shape_clust, k=5), cutree(language_shape_clust, k=5))
## Rand HA MA FM Jaccard
## 0.7149425 0.2590863 0.3235708 0.4504870 0.2873563
dends <- dendlist(as.dendrogram(blind_human_shape_clust), as.dendrogram(language_shape_clust))
x <- dends %>%
untangle(method = "step2side") %>%
tanglegram( common_subtrees_color_branches = TRUE,highlight_branches_lwd = F)
paste("entanglement = ",round(entanglement(x), 2))
## [1] "entanglement = 0.34"
Bk_plot(as.dendrogram(blind_human_shape_clust), as.dendrogram(language_shape_clust), main="FM-Index for different numbers of clusters.\nDots closer to 1 indicate more similar clusterings.\nDots above red line are significant.")
Choosing a small cluster number (k=5)
#five clusters
print("FIVE CLUSTERS")
## [1] "FIVE CLUSTERS"
adjustedRand(cutree(blind_human_shape_clust, k=5), cutree(language_shape_clust, k=5))
## Rand HA MA FM Jaccard
## 0.7241379 0.2533257 0.3231386 0.4355350 0.2771084
sighted_human_skin_clust <- human_data_wide %>%
convert_similarity_to_distance("sighted_human_similarity_skin") %>%
as.dist() %>%
hclust()
ggdendro::ggdendrogram(sighted_human_skin_clust, rotate = T) +
ggtitle("Sighted Human Similarity skin")
blind_human_skin_clust <- human_data_wide %>%
convert_similarity_to_distance("blind_human_similarity_skin") %>%
as.dist() %>%
hclust()
ggdendro::ggdendrogram(blind_human_skin_clust, rotate = T) +
ggtitle("Blind Human Similarity skin")
language_skin_clust <- language_data %>%
convert_similarity_to_distance("language_similarity_simple_dist_texture", reverse_dist=F, human_data=F) %>%
as.dist() %>%
hclust()
ggdendro::ggdendrogram(language_skin_clust, rotate = T) +
ggtitle("Language Distances skin")
dends <- dendlist(as.dendrogram(blind_human_skin_clust), as.dendrogram(sighted_human_skin_clust))
x <- dends %>%
untangle(method = "step2side") %>%
tanglegram( common_subtrees_color_branches = TRUE,highlight_branches_lwd = F)
paste("entanglement = ",round(entanglement(x), 2))
## [1] "entanglement = 0.13"
Bk_plot(as.dendrogram(blind_human_skin_clust), as.dendrogram(sighted_human_skin_clust), main="FM-Index for different numbers of clusters.\nDots closer to 1 indicate more similar clusterings.\nDots above red line are significant.")
Choosing a small cluster number (k=5).
#five clusters
print("FIVE CLUSTERS")
## [1] "FIVE CLUSTERS"
adjustedRand(cutree(blind_human_skin_clust, k=5), cutree(sighted_human_skin_clust, k=5))
## Rand HA MA FM Jaccard
## 0.7333333 0.3609261 0.4085522 0.5489591 0.3661202
dends <- dendlist(as.dendrogram(sighted_human_skin_clust), as.dendrogram(language_skin_clust))
x <- dends %>%
untangle(method = "step2side") %>%
tanglegram( common_subtrees_color_branches = TRUE,highlight_branches_lwd = F)
paste("entanglement = ",round(entanglement(x), 2))
## [1] "entanglement = 0.28"
Bk_plot(as.dendrogram(sighted_human_skin_clust), as.dendrogram(language_skin_clust), main="FM-Index for different numbers of clusters.\nDots closer to 1 indicate more similar clusterings.\nDots above red line are significant.")
Choosing a small cluster number (k=5).
#five clusters
print("FIVE CLUSTERS")
## [1] "FIVE CLUSTERS"
adjustedRand(cutree(sighted_human_skin_clust, k=5), cutree(language_skin_clust, k=5))
## Rand HA MA FM Jaccard
## 0.57931034 0.03808741 0.10119840 0.34547576 0.20779221
dends <- dendlist(as.dendrogram(blind_human_skin_clust), as.dendrogram(language_skin_clust))
x <- dends %>%
untangle(method = "step2side") %>%
tanglegram( common_subtrees_color_branches = TRUE,highlight_branches_lwd = F)
paste("entanglement = ",round(entanglement(x), 2))
## [1] "entanglement = 0.24"
Bk_plot(as.dendrogram(blind_human_skin_clust), as.dendrogram(language_skin_clust), main="FM-Index for different numbers of clusters.\nDots closer to 1 indicate more similar clusterings.\nDots above red line are significant.")
Choosing a small cluster number (k=5)
#five clusters
print("FIVE CLUSTERS")
## [1] "FIVE CLUSTERS"
adjustedRand(cutree(blind_human_skin_clust, k=5), cutree(language_skin_clust, k=5))
## Rand HA MA FM Jaccard
## 0.65287356 0.09999589 0.17793801 0.33165521 0.19680851