We are going to cluster records based on categorical (string-valued) variables; the ‘stringdist’ package is very helpful for calculating the distance between strings.
library(data.table)
library(knitr)
library(corrplot)
suppressPackageStartupMessages(library(igraph))
library(stringdist)
# Load the mock records and build one comparison string per record.
mock_data <- read.csv(
  "https://raw.githubusercontent.com/yesimiao/Dataset/master/MOCK_DATA.csv"
)
# Full name; also used below to label the distance matrix.
mock_data$name <- paste(mock_data$first_name, mock_data$last_name)
# IP address, e-mail and name joined by single spaces: this is the string
# that gets compared between records.
mock_data$category <- paste0(
  as.character(mock_data$ip_address), ' ',
  as.character(mock_data$email), ' ',
  as.character(mock_data$name)
)
models <- as.character(mock_data$category)
# All-pairs string distances using stringdist's "jw" method.
distancemodels <- stringdistmatrix(models, models, method = "jw")
# Label rows and columns with the record names in one step.
dimnames(distancemodels) <- list(mock_data$name, mock_data$name)
Next, run hierarchical clustering and plot the corresponding dendrogram, so that the groups of clusters can be seen clearly.
# Complete-linkage hierarchical clustering on the string distances.
cluster <- hclust(as.dist(distancemodels), method = "complete")
k <- 50 # number of clusters to cut the tree into; depends on the data
# Pair each comparison string with the cluster it was assigned to.
dfClust <- data.frame(models = models, clusternumber = cutree(cluster, k = k))
# Dendrogram labelled with the record names, with one rectangle drawn
# around each of the k clusters.
plot(cluster, labels = mock_data$name, hang = -1)
rect.hclust(cluster, k = k)
Rearrange our output from the cluster tree into a table:
# Record each row's cluster number next to the original data.
mock_data$clusternumber <- cutree(cluster, k = k)
# One row per cluster (X); NAME is a list column holding all member names.
df <- data.table(X = mock_data$clusternumber, NAME = mock_data$name)
DF <- df[, .(NAME = list(NAME)), by = X]
# cluster table
kable(DF)
| X | NAME |
|---|---|
| 1 | Billy Torres, Bill Torres/, Bill , William Torres |
| 2 | William Torres |
| 3 | Doug Romero, Douglas Rem e, Douglas Romo, Douglas Romero, Douglas Romero |
| 4 | Bobby Boyd, Bobby Boyd, Bob Boyd, Rob Boyd, Robert Boyd, Robert Boyd, Robert Boyd |
| 5 | Robert B |
| 6 | Joyce Hughes |
| 7 | Stephen Moreno, Amanda Moreno |
| 8 | William Ferguson |
| 9 | Fred Reyes |
| 10 | Billy Davis |
| 11 | Judy Ryan |
| 12 | Carol Nguyen |
| 13 | Willie Collins |
| 14 | Lisa Schmidt |
| 15 | Martha Berry |
| 16 | Jason Murphy |
| 17 | Steve Oliver |
| 18 | Nancy Murphy |
| 19 | James Evans |
| 20 | Gregory Ross |
| 21 | Pamela Bishop |
| 22 | Theresa Lewis |
| 23 | Antonio Cruz |
| 24 | Carolyn Fernandez |
| 25 | Henry Romero |
| 26 | Frances Pierce |
| 27 | Jeremy Gordon |
| 28 | Kathryn Day |
| 29 | Theresa Jordan |
| 30 | Cheryl Palmer |
| 31 | Edward Ramos |
| 32 | Emily Howard |
| 33 | Frances Ellis |
| 34 | Bobby Gilbert |
| 35 | Ann Flores |
| 36 | Laura Burke |
| 37 | Billy Perry |
| 38 | Samuel Wagner |
| 39 | Gregory Franklin |
| 40 | Tammy Moore |
| 41 | Billy Montgomery |
| 42 | Paul Parker |
| 43 | Walter Vasquez |
| 44 | Tammy Lopez |
| 45 | Marie Hamilton |
| 46 | Shawn Collins |
| 47 | Diana Hill |
| 48 | John Phillips |
| 49 | Justin Clark |
| 50 | Gloria Willis |
Next, draw a correlation plot with the corrplot package. The plotted values are similarities, computed as 1 minus the distances returned by the function stringdistmatrix.
# Cluster to inspect. Only `i` needs to change to look at another cluster.
# It is an integer so that it matches the integer ids returned by cutree().
i <- 1L
listofcluster <- mock_data[mock_data$clusternumber == i, ]
n <- which(mock_data$clusternumber == i)
# Similarity = 1 - distance, restricted to the members of cluster i.
# drop = FALSE keeps the result a matrix even for a single-member cluster,
# where plain `[n, n]` would collapse to a bare number.
correlation <- 1 - distancemodels[n, n, drop = FALSE]
# correlation plot
corrplot(correlation, method = "number", type = "lower")
To further understand the relationships between subjects within one specific cluster, a network graph from igraph is a good visualization tool.
# network plot
# Undirected weighted graph built from the upper triangle of the similarity
# matrix; the diagonal (self-similarity) is ignored. Uses the current igraph
# function names in place of the legacy graph.adjacency() and
# layout.fruchterman.reingold aliases.
graph <- graph_from_adjacency_matrix(
  correlation,
  mode = "upper", weighted = TRUE, diag = FALSE
)
V(graph)$color <- "grey"
V(graph)$label.cex <- 0.85
# Edge width and edge label both show the pairwise similarity.
plot(
  graph,
  vertex.label = V(graph)$name,
  layout = layout_with_fr,
  edge.color = "black",
  edge.width = E(graph)$weight,
  vertex.size = 50,
  vertex.shape = "circle",
  edge.label = round(E(graph)$weight, 3),
  edge.label.cex = 0.7
)
Applying exactly the same procedure to other clusters, we get the following:
# for cluster 3
# Only `i` changes; the rest is the same procedure as for the first cluster.
i <- 3L
listofcluster <- mock_data[mock_data$clusternumber == i, ]
n <- which(mock_data$clusternumber == i)
# Similarity = 1 - distance; drop = FALSE keeps a matrix even when the
# cluster has a single member.
correlation <- 1 - distancemodels[n, n, drop = FALSE]
# correlation plot
corrplot(correlation, method = "number", type = "lower")
# network plot
# Current igraph function names replace the legacy graph.adjacency() and
# layout.fruchterman.reingold aliases.
graph <- graph_from_adjacency_matrix(
  correlation,
  mode = "upper", weighted = TRUE, diag = FALSE
)
V(graph)$color <- "grey"
V(graph)$label.cex <- 0.85
plot(
  graph,
  vertex.label = V(graph)$name,
  layout = layout_with_fr,
  edge.color = "black",
  edge.width = E(graph)$weight,
  vertex.size = 50,
  vertex.shape = "circle",
  edge.label = round(E(graph)$weight, 3),
  edge.label.cex = 0.7
)
# for cluster 4
# Only `i` changes; the rest is the same procedure as for the first cluster.
i <- 4L
listofcluster <- mock_data[mock_data$clusternumber == i, ]
n <- which(mock_data$clusternumber == i)
# Similarity = 1 - distance; drop = FALSE keeps a matrix even when the
# cluster has a single member.
correlation <- 1 - distancemodels[n, n, drop = FALSE]
# correlation plot
corrplot(correlation, method = "number", type = "lower")
# network plot
# Current igraph function names replace the legacy graph.adjacency() and
# layout.fruchterman.reingold aliases.
graph <- graph_from_adjacency_matrix(
  correlation,
  mode = "upper", weighted = TRUE, diag = FALSE
)
V(graph)$color <- "grey"
V(graph)$label.cex <- 0.85
plot(
  graph,
  vertex.label = V(graph)$name,
  layout = layout_with_fr,
  edge.color = "black",
  edge.width = E(graph)$weight,
  vertex.size = 50,
  vertex.shape = "circle",
  edge.label = round(E(graph)$weight, 3),
  edge.label.cex = 0.7
)
The complete code:
library(data.table) #for table summary
library(knitr) #for table format
library(corrplot) #for correlation plot
library(igraph) #for networplot
library(stringdist) # for calculating distance
# Read the data from its public URL rather than a machine-specific local
# path, so the complete listing runs on any machine.
mock_data <- read.csv(
  "https://raw.githubusercontent.com/yesimiao/Dataset/master/MOCK_DATA.csv"
)
# Full name; also used below to label the distance matrix.
mock_data$name <- paste(mock_data$first_name, mock_data$last_name, sep = " ")
# IP address, e-mail and name joined by single spaces: this is the string
# that gets compared between records.
mock_data$category <- paste0(
  as.character(mock_data$ip_address), " ",
  as.character(mock_data$email), " ",
  as.character(mock_data$name)
)
models <- as.character(mock_data$category)
# All-pairs string distances using stringdist's "jw" method.
distancemodels <- stringdistmatrix(models, models, method = "jw")
rownames(distancemodels) <- mock_data$name # specify the rownames
colnames(distancemodels) <- mock_data$name # specify the colnames
# Hierarchical clustering on the string distances; "complete" is hclust()'s
# default linkage, spelled out here for clarity.
cluster <- hclust(as.dist(distancemodels), method = "complete")
k <- 50 # number of clusters to cut the tree into; depends on the data
# Pair each comparison string with the cluster it was assigned to.
dfClust <- data.frame(models = models, clusternumber = cutree(cluster, k = k))
# Dendrogram labelled with the record names.
plot(cluster, labels = mock_data$name, hang = -1)
rect.hclust(cluster, k = k) # create red rectangles in the plot
# Record each row's cluster number next to the original data.
mock_data$clusternumber <- cutree(cluster, k = k)
# One row per cluster (X); NAME is a list column holding all member names.
df <- data.table(X = mock_data$clusternumber, NAME = mock_data$name)
DF <- df[, .(NAME = list(NAME)), by = X]
# cluster table
kable(DF) # output a nicer table
# Draw the correlation plot and the network graph for each cluster of
# interest. The same steps were previously copy-pasted once per cluster;
# looping over the cluster ids keeps them in one place. To inspect other
# clusters, edit `clusters_to_plot` only.
clusters_to_plot <- c(1L, 3L, 4L)
for (i in clusters_to_plot) {
  listofcluster <- mock_data[mock_data$clusternumber == i, ]
  n <- which(mock_data$clusternumber == i)
  # Similarity = 1 - distance, restricted to the members of cluster i.
  # drop = FALSE keeps a matrix even when the cluster has a single member.
  correlation <- 1 - distancemodels[n, n, drop = FALSE]

  # correlation plot
  corrplot(correlation, method = "number", type = "lower")

  # network plot
  # Undirected weighted graph from the upper triangle of the similarity
  # matrix. Current igraph function names replace the legacy
  # graph.adjacency() and layout.fruchterman.reingold aliases.
  graph <- graph_from_adjacency_matrix(
    correlation,
    mode = "upper", weighted = TRUE, diag = FALSE
  )
  V(graph)$color <- "grey"
  V(graph)$label.cex <- 0.85 # define the text size of nodes
  # Edge width and edge label both show the pairwise similarity.
  plot(
    graph,
    vertex.label = V(graph)$name,
    layout = layout_with_fr,
    edge.color = "black",
    edge.width = E(graph)$weight,
    vertex.size = 50,
    vertex.shape = "circle",
    edge.label = round(E(graph)$weight, 3),
    edge.label.cex = 0.7
  )
}