Cluster Analysis Language And Human Similarity Data

Basic Clusterings
- SHAPE
- TEXTURE
- COLOR
Entanglement Comparisons
- SHAPE
- TEXTURE
- COLOR
Entanglement Values
Indices of Similarity between Clusterings
- FM Index (z-scores)
- Adjusted Rand Index

#### LOAD DATA AND COMPUTE CLUSTERS ####
# File locations: one language-based distance table per knowledge type
# (color, shape, texture) plus the tidy human similarity data.
LANG_ANIMAL_DISTANCE_COLOR <- here("data/processed/animal_color_distances_language_wiki.csv")
LANG_ANIMAL_DISTANCE_SHAPE <- here("data/processed/animal_shape_distances_language_wiki.csv")
LANG_ANIMAL_DISTANCE_TEXTURE <- here("data/processed/animal_texture_distances_language_wiki.csv")
TIDY_HUMAN_PATH <- here("data/processed/tidy_human_data.csv")

# Join the three language tables on the animal pair, then drop every column
# whose name contains "PCA".
animal_pair_keys <- c("animal1", "animal2")
language_data <- read_csv(LANG_ANIMAL_DISTANCE_COLOR) %>%
  left_join(read_csv(LANG_ANIMAL_DISTANCE_SHAPE), by = animal_pair_keys) %>%
  left_join(read_csv(LANG_ANIMAL_DISTANCE_TEXTURE), by = animal_pair_keys) %>%
  select(-contains("PCA"))

# Collapse participant_type and similarity_type into a single `measure` key and
# spread it so that each measure becomes its own column of human_similarity.
human_data <- read_csv(TIDY_HUMAN_PATH)
human_data_wide <- spread(
  unite(human_data, "measure", c("participant_type", "similarity_type")),
  key = measure,
  value = human_similarity
)

# The human "skin" measures are relabelled as "texture" so that their column
# names line up with the knowledge types used for the language data.
skin_columns <- c("blind_human_similarity_skin", "sighted_human_similarity_skin")
is_skin_column <- colnames(human_data_wide) %in% skin_columns
colnames(human_data_wide)[is_skin_column] <-
  sub("skin$", "texture", colnames(human_data_wide)[is_skin_column])

#### Re-compute clusters and organize into a list ####
# cluster_list[[knowledge_type]][[data_source]] holds the hclust() result for
# every combination of knowledge type and data source.
cluster_list <- list()
data_sources <- c("sighted", "blind", "language")
knowledge_types <- c("color", "shape", "texture")
for (knowledge_type in knowledge_types) {
  for (data_source in data_sources) {
    if (data_source == "language") {
      # Language columns are named "language_similarity_simple_dist_<type>".
      # NOTE(review): convert_similarity_to_distance() is defined elsewhere;
      # it is assumed to return something as.dist() accepts -- confirm.
      cluster_list[[knowledge_type]][[data_source]] <- language_data %>%
        convert_similarity_to_distance(
          paste0(data_source, "_similarity_simple_dist_", knowledge_type),
          reverse_dist = FALSE,
          human_data = FALSE
        ) %>%
        as.dist() %>%
        hclust()
    } else {
      # Human columns are named "<source>_human_similarity_<type>"; the helper
      # is called with its default arguments here.
      cluster_list[[knowledge_type]][[data_source]] <- human_data_wide %>%
        convert_similarity_to_distance(
          paste0(data_source, "_human_similarity_", knowledge_type)
        ) %>%
        as.dist() %>%
        hclust()
    }
  }
}

## Read in the data frame of pre-computed cluster similarity values
cluster_similarity_values <- read_csv(here("scripts/clustering/cluster_similarity_values.csv"))

# Condense to one row per knowledge type and unordered pair of data sources.
# The pair label is the two source names sorted alphabetically and joined with
# "_" (e.g. "blind_sighted"), so A-vs-B and B-vs-A rows share a label; every
# remaining column is then reduced to its minimum within the group.
cluster_similarity <- cluster_similarity_values %>%
  rowwise() %>%
  mutate(
    data_source = paste(
      sort(c(as.character(data_source_1), as.character(data_source_2))),
      collapse = "_"
    )
  ) %>%
  ungroup() %>%
  select(-data_source_1, -data_source_2) %>%
  group_by(knowledge_type, data_source) %>%
  summarise_all(min) %>%
  # summarise_all() only drops the last grouping level; ungroup explicitly so
  # the result is not left silently grouped by knowledge_type.
  ungroup()

Basic Clusterings

SHAPE

# Rotated dendrograms of the shape clusterings, one per data source.
# `rotate = TRUE` is spelled out because `T` is an ordinary, reassignable name.
ggdendro::ggdendrogram(cluster_list[["shape"]][["language"]], rotate = TRUE) +
  ggtitle("Language Similarity Shape")
ggdendro::ggdendrogram(cluster_list[["shape"]][["blind"]], rotate = TRUE) +
  ggtitle("Blind Similarity Shape")
ggdendro::ggdendrogram(cluster_list[["shape"]][["sighted"]], rotate = TRUE) +
  ggtitle("Sighted Similarity Shape")

TEXTURE

# Rotated dendrograms of the texture clusterings, one per data source.
# `rotate = TRUE` is spelled out because `T` is an ordinary, reassignable name.
ggdendro::ggdendrogram(cluster_list[["texture"]][["language"]], rotate = TRUE) +
  ggtitle("Language Similarity Texture")
ggdendro::ggdendrogram(cluster_list[["texture"]][["blind"]], rotate = TRUE) +
  ggtitle("Blind Similarity Texture")
ggdendro::ggdendrogram(cluster_list[["texture"]][["sighted"]], rotate = TRUE) +
  ggtitle("Sighted Similarity Texture")

COLOR

# Rotated dendrograms of the color clusterings, one per data source.
# `rotate = TRUE` is spelled out because `T` is an ordinary, reassignable name.
ggdendro::ggdendrogram(cluster_list[["color"]][["language"]], rotate = TRUE) +
  ggtitle("Language Similarity Color")
ggdendro::ggdendrogram(cluster_list[["color"]][["blind"]], rotate = TRUE) +
  ggtitle("Blind Similarity Color")
ggdendro::ggdendrogram(cluster_list[["color"]][["sighted"]], rotate = TRUE) +
  ggtitle("Sighted Similarity Color")

Entanglement Comparisons

SHAPE

# Shape: language vs. blind. The two trees are passed through
# untangle(method = "step2side") before being drawn as a tanglegram, and the
# entanglement of the result is reported rounded to two decimals.
dends <- dendlist(
  as.dendrogram(cluster_list[["shape"]][["language"]]),
  as.dendrogram(cluster_list[["shape"]][["blind"]])
)
x <- dends %>%
  untangle(method = "step2side") %>%
  tanglegram(
    common_subtrees_color_branches = TRUE,
    highlight_branches_lwd = FALSE,
    margin_inner = 5.7,
    main_left = "LANGUAGE",
    main_right = "BLIND"
  )

paste("Language to Blind Entanglement: ", round(entanglement(x), 2))

## [1] "Language to Blind Entanglement:  0.21"

# Shape: language vs. sighted. The two trees are passed through
# untangle(method = "step2side") before being drawn as a tanglegram, and the
# entanglement of the result is reported rounded to two decimals.
dends <- dendlist(
  as.dendrogram(cluster_list[["shape"]][["language"]]),
  as.dendrogram(cluster_list[["shape"]][["sighted"]])
)
x <- dends %>%
  untangle(method = "step2side") %>%
  tanglegram(
    common_subtrees_color_branches = TRUE,
    highlight_branches_lwd = FALSE,
    margin_inner = 5.7,
    main_left = "LANGUAGE",
    main_right = "SIGHTED"
  )
paste("Language to Sighted Entanglement: ", round(entanglement(x), 2))

## [1] "Language to Sighted Entanglement:  0.12"

TEXTURE

# Texture: language vs. blind. The two trees are passed through
# untangle(method = "step2side") before being drawn as a tanglegram, and the
# entanglement of the result is reported rounded to two decimals.
dends <- dendlist(
  as.dendrogram(cluster_list[["texture"]][["language"]]),
  as.dendrogram(cluster_list[["texture"]][["blind"]])
)
x <- dends %>%
  untangle(method = "step2side") %>%
  tanglegram(
    common_subtrees_color_branches = TRUE,
    highlight_branches_lwd = FALSE,
    margin_inner = 5.7,
    main_left = "LANGUAGE",
    main_right = "BLIND"
  )
paste("Language to Blind Entanglement: ", round(entanglement(x), 2))

## [1] "Language to Blind Entanglement:  0.14"

# Texture: language vs. sighted. The two trees are passed through
# untangle(method = "step2side") before being drawn as a tanglegram, and the
# entanglement of the result is reported rounded to two decimals.
dends <- dendlist(
  as.dendrogram(cluster_list[["texture"]][["language"]]),
  as.dendrogram(cluster_list[["texture"]][["sighted"]])
)
x <- dends %>%
  untangle(method = "step2side") %>%
  tanglegram(
    common_subtrees_color_branches = TRUE,
    highlight_branches_lwd = FALSE,
    margin_inner = 5.7,
    main_left = "LANGUAGE",
    main_right = "SIGHTED"
  )
paste("Language to Sighted Entanglement: ", round(entanglement(x), 2))

## [1] "Language to Sighted Entanglement:  0.17"

COLOR

# Color: language vs. blind. The two trees are passed through
# untangle(method = "step2side") before being drawn as a tanglegram, and the
# entanglement of the result is reported rounded to two decimals.
dends <- dendlist(
  as.dendrogram(cluster_list[["color"]][["language"]]),
  as.dendrogram(cluster_list[["color"]][["blind"]])
)
x <- dends %>%
  untangle(method = "step2side") %>%
  tanglegram(
    common_subtrees_color_branches = TRUE,
    highlight_branches_lwd = FALSE,
    margin_inner = 5.7,
    main_left = "LANGUAGE",
    main_right = "BLIND"
  )
paste("Language to Blind Entanglement: ", round(entanglement(x), 2))

## [1] "Language to Blind Entanglement:  0.36"

# Color: language vs. sighted. The two trees are passed through
# untangle(method = "step2side") before being drawn as a tanglegram, and the
# entanglement of the result is reported rounded to two decimals.
dends <- dendlist(
  as.dendrogram(cluster_list[["color"]][["language"]]),
  as.dendrogram(cluster_list[["color"]][["sighted"]])
)
x <- dends %>%
  untangle(method = "step2side") %>%
  tanglegram(
    common_subtrees_color_branches = TRUE,
    highlight_branches_lwd = FALSE,
    margin_inner = 5.7,
    main_left = "LANGUAGE",
    main_right = "SIGHTED"
  )
paste("Language to Sighted Entanglement: ", round(entanglement(x), 2))

## [1] "Language to Sighted Entanglement:  0.25"

Entanglement Values

# Bar chart of the step2side entanglement for each pair of data sources,
# with one facet per knowledge type and a fixed 0-1 y axis.
ggplot(
  cluster_similarity,
  aes(x = data_source, y = entangle_step2side, fill = data_source)
) +
  geom_col() +
  facet_wrap(~knowledge_type) +
  scale_x_discrete(
    name = "Clustering Comparison",
    limits = c("blind_sighted", "language_sighted", "blind_language"),
    labels = c("Blind \nto Sighted", "Language \nto Sighted", "Language \nto Blind")
  ) +
  scale_y_continuous(name = "Entanglement", limits = c(0, 1)) +
  scale_fill_brewer(palette = "Set1") +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 90, vjust = 0.5)
  )

Indices of Similarity between Clusterings

FM Index (z-scores)

# Long format: one row per knowledge type x comparison x number of clusters.
# cluster_num is the numeric suffix of the original Z_FM_<k> column name.
cluster_similarity_Z_FM <- cluster_similarity %>%
  select(knowledge_type, data_source, Z_FM_5, Z_FM_10, Z_FM_15, Z_FM_20) %>%
  gather(cluster_num, Z_FM, Z_FM_5:Z_FM_20) %>%
  mutate(cluster_num = as.numeric(str_remove(cluster_num, "Z_FM_")))

# Build the FM-index bar chart for one knowledge type, faceted by the number of
# clusters.
#
# knowledge: knowledge type to plot ("shape", "texture" or "color"); it is also
#   used as the plot title.
# x_title, y_title: axis titles. They default to "" so that, in the combined
#   figure, only one panel carries each axis title.
#
# The three panels share y limits of c(-1, 16) so they are directly comparable.
plot_fm_index <- function(knowledge, x_title = "", y_title = "") {
  ggplot(
    subset(cluster_similarity_Z_FM, knowledge_type == knowledge),
    aes(data_source, Z_FM, fill = data_source)
  ) +
    geom_col() +
    # Dashed reference line at z = 1.645.
    # NOTE(review): presumably the one-sided p < .05 threshold -- confirm.
    geom_hline(yintercept = 1.645, linetype = "dashed") +
    facet_wrap(~cluster_num) +
    scale_x_discrete(
      name = x_title,
      limits = c("blind_sighted", "language_sighted", "blind_language"),
      labels = c("Blind \nto Sighted", "Language \nto Sighted", "Language \nto Blind")
    ) +
    scale_y_continuous(name = y_title, limits = c(-1, 16)) +
    scale_fill_brewer(palette = "Set1") +
    theme(
      legend.position = "none",
      axis.text.x = element_text(angle = 90, vjust = 0.5, size = 8)
    ) +
    ggtitle(knowledge)
}

# The y title appears on the left panel only and the x title on the middle
# panel only, matching the previous figure (where an explicit scale name of ""
# overrode the xlab() calls on the outer panels).
p1 <- plot_fm_index("shape", y_title = "FM Index (Z-Scores)")
p2 <- plot_fm_index("texture", x_title = "Clustering Comparison")
p3 <- plot_fm_index("color")

plot_grid(p1, p2, p3, ncol = 3)

Adjusted Rand Index

# Long format: one row per knowledge type x comparison x number of clusters.
# cluster_num is the numeric suffix of the original adjustedRand_<k> column.
cluster_similarity_adjustedRand <- cluster_similarity %>%
  select(
    knowledge_type, data_source,
    adjustedRand_5, adjustedRand_10, adjustedRand_15, adjustedRand_20
  ) %>%
  gather(cluster_num, adjustedRand, adjustedRand_5:adjustedRand_20) %>%
  mutate(cluster_num = as.numeric(str_remove(cluster_num, "adjustedRand_")))

# Build the adjusted Rand index bar chart for one knowledge type, faceted by
# the number of clusters.
#
# knowledge: knowledge type to plot ("shape", "texture" or "color"); it is also
#   used as the plot title.
# x_title, y_title: axis titles. They default to "" so that, in the combined
#   figure, only one panel carries each axis title.
#
# The three panels share y limits of c(-0.1, 1) so they are directly
# comparable; a solid reference line is drawn at 0.
plot_adjusted_rand <- function(knowledge, x_title = "", y_title = "") {
  ggplot(
    subset(cluster_similarity_adjustedRand, knowledge_type == knowledge),
    aes(data_source, adjustedRand, fill = data_source)
  ) +
    geom_col() +
    geom_hline(yintercept = 0) +
    facet_wrap(~cluster_num) +
    scale_x_discrete(
      name = x_title,
      limits = c("blind_sighted", "language_sighted", "blind_language"),
      labels = c("Blind \nto Sighted", "Language \nto Sighted", "Language \nto Blind")
    ) +
    scale_y_continuous(name = y_title, limits = c(-0.1, 1)) +
    scale_fill_brewer(palette = "Set1") +
    theme(
      legend.position = "none",
      axis.text.x = element_text(angle = 90, vjust = 0.5, size = 8)
    ) +
    ggtitle(knowledge)
}

# The y title appears on the left panel only and the x title on the middle
# panel only, matching the previous figure (where an explicit scale name of ""
# overrode the xlab() calls on the outer panels).
p1 <- plot_adjusted_rand("shape", y_title = "Adjusted Rand Index")
p2 <- plot_adjusted_rand("texture", x_title = "Clustering Comparison")
p3 <- plot_adjusted_rand("color")

plot_grid(p1, p2, p3, ncol = 3)

Cluster Analysis Language And Human Similarity Data - Plots

Martin Zettersten

2019-06-07

Basic Clusterings

SHAPE

TEXTURE

COLOR

Entanglement Comparisons

SHAPE

TEXTURE

COLOR

Entanglement Values

Indices of Similarity between Clusterings

FM Index (z-scores)

Adjusted Rand Index