We are going to cluster records based on categorical (string) variables. The ‘stringdist’ package is very helpful here because it calculates the distance between different strings.

library(data.table)
library(knitr)
library(corrplot)
suppressPackageStartupMessages(library(igraph))
library(stringdist)
# Load the mock records from the public GitHub copy of the dataset.
mock_data <- read.csv(
  "https://raw.githubusercontent.com/yesimiao/Dataset/master/MOCK_DATA.csv"
)

# Full name: first and last name joined by a single space (paste's default).
mock_data$name <- paste(mock_data$first_name, mock_data$last_name)

# One comparison string per record: "<ip_address> <email> <name>".
mock_data$category <- paste(
  as.character(mock_data$ip_address),
  as.character(mock_data$email),
  as.character(mock_data$name)
)
models <- as.character(mock_data$category)

# Full pairwise string-distance matrix (stringdist method "jw"), labelled
# with the person's name on both axes so later subsets stay readable.
distancemodels <- stringdistmatrix(models, models, method = "jw")
dimnames(distancemodels) <- list(mock_data$name, mock_data$name)

Run hierarchical clustering and draw the corresponding dendrogram, so that the groups of clusters can be seen clearly.

# Complete-linkage hierarchical clustering on the string distances.
cluster <- hclust(as.dist(distancemodels), method = "complete")

# Number of groups to cut the tree into; k depends on the data.
k <- 50

# Each comparison string next to the cluster id it was assigned.
dfClust <- setNames(
  data.frame(models, cutree(cluster, k = k)),
  c("models", "clusternumber")
)

# Dendrogram labelled by name (hang = -1 makes the labels hang down from 0),
# with a rectangle drawn around each of the k clusters.
plot(cluster, labels = mock_data$name, hang = -1)
rect.hclust(cluster, k = k)

Rearrange our output from the cluster tree into a table:

# Store each record's cluster id alongside the original data.
mock_data$clusternumber <- cutree(cluster, k)

# Collapse to one row per cluster id (X), gathering the member names of
# that cluster into a list column.
df <- data.table(X = mock_data$clusternumber, NAME = mock_data$name)
DF <- df[, .(NAME = list(NAME)), by = X]

# cluster table
kable(DF)
X NAME
1 Billy Torres, Bill Torres/, Bill , William Torres
2 William Torres
3 Doug Romero, Douglas Rem e, Douglas Romo, Douglas Romero, Douglas Romero
4 Bobby Boyd, Bobby Boyd, Bob Boyd, Rob Boyd, Robert Boyd, Robert Boyd, Robert Boyd
5 Robert B
6 Joyce Hughes
7 Stephen Moreno, Amanda Moreno
8 William Ferguson
9 Fred Reyes
10 Billy Davis
11 Judy Ryan
12 Carol Nguyen
13 Willie Collins
14 Lisa Schmidt
15 Martha Berry
16 Jason Murphy
17 Steve Oliver
18 Nancy Murphy
19 James Evans
20 Gregory Ross
21 Pamela Bishop
22 Theresa Lewis
23 Antonio Cruz
24 Carolyn Fernandez
25 Henry Romero
26 Frances Pierce
27 Jeremy Gordon
28 Kathryn Day
29 Theresa Jordan
30 Cheryl Palmer
31 Edward Ramos
32 Emily Howard
33 Frances Ellis
34 Bobby Gilbert
35 Ann Flores
36 Laura Burke
37 Billy Perry
38 Samuel Wagner
39 Gregory Franklin
40 Tammy Moore
41 Billy Montgomery
42 Paul Parker
43 Walter Vasquez
44 Tammy Lopez
45 Marie Hamilton
46 Shawn Collins
47 Diana Hill
48 John Phillips
49 Justin Clark
50 Gloria Willis

Draw a correlation-style plot with the corrplot package. The plotted values are similarities, computed as 1 minus the distances in the matrix returned by the function stringdistmatrix.

# Cluster to inspect. Everything below is driven by i, so looking at a
# different cluster only requires changing this one value.
i <- "1"

# Rows of mock_data that belong to cluster i, and their positions.
in_cluster <- mock_data$clusternumber == i
listofcluster <- mock_data[in_cluster, ]
n <- which(in_cluster)

# Turn distances into similarities (1 - distance) for the cluster members only.
correlation <- 1 - distancemodels[n, n]

# correlation plot: numeric values, lower triangle only
corrplot(correlation, type = "lower", method = "number")

To better understand the relationships between the subjects within one specific cluster, a network graph from igraph is a good visualization tool.

# network plot
# Build a weighted graph from the upper triangle of the similarity matrix,
# ignoring the diagonal (self-similarity).
graph <- graph.adjacency(correlation, mode = "upper", weighted = TRUE,
                         diag = FALSE)
V(graph)$color <- "grey"
V(graph)$label.cex <- 0.85

# Edge width and edge label both show the pairwise similarity.
plot.igraph(
  graph,
  layout = layout.fruchterman.reingold,
  vertex.label = V(graph)$name,
  vertex.shape = "circle",
  vertex.size = 50,
  edge.color = "black",
  edge.width = E(graph)$weight,
  edge.label = round(E(graph)$weight, 3),
  edge.label.cex = 0.7
)

Applying exactly the same procedure to other clusters, we get the following:

# for cluster 3
# Only i changes; the remaining steps are the same as for the first cluster.
i <- "3"

# Rows of mock_data that belong to cluster i, and their positions.
in_cluster <- mock_data$clusternumber == i
listofcluster <- mock_data[in_cluster, ]
n <- which(in_cluster)

# Similarity (1 - distance) restricted to the members of this cluster.
correlation <- 1 - distancemodels[n, n]

# correlation plot: numeric values, lower triangle only
corrplot(correlation, type = "lower", method = "number")

# network plot
# Weighted graph from the upper triangle of the similarity matrix,
# without self-loops.
graph <- graph.adjacency(correlation, mode = "upper", weighted = TRUE,
                         diag = FALSE)
V(graph)$color <- "grey"
V(graph)$label.cex <- 0.85

# Edge width and edge label both show the pairwise similarity.
plot.igraph(
  graph,
  layout = layout.fruchterman.reingold,
  vertex.label = V(graph)$name,
  vertex.shape = "circle",
  vertex.size = 50,
  edge.color = "black",
  edge.width = E(graph)$weight,
  edge.label = round(E(graph)$weight, 3),
  edge.label.cex = 0.7
)

# for cluster 4
# Only i changes; the remaining steps are the same as for the first cluster.
i <- "4"

# Rows of mock_data that belong to cluster i, and their positions.
in_cluster <- mock_data$clusternumber == i
listofcluster <- mock_data[in_cluster, ]
n <- which(in_cluster)

# Similarity (1 - distance) restricted to the members of this cluster.
correlation <- 1 - distancemodels[n, n]

# correlation plot: numeric values, lower triangle only
corrplot(correlation, type = "lower", method = "number")

# network plot
# Weighted graph from the upper triangle of the similarity matrix,
# without self-loops.
graph <- graph.adjacency(correlation, mode = "upper", weighted = TRUE,
                         diag = FALSE)
V(graph)$color <- "grey"
V(graph)$label.cex <- 0.85

# Edge width and edge label both show the pairwise similarity.
plot.igraph(
  graph,
  layout = layout.fruchterman.reingold,
  vertex.label = V(graph)$name,
  vertex.shape = "circle",
  vertex.size = 50,
  edge.color = "black",
  edge.width = E(graph)$weight,
  edge.label = round(E(graph)$weight, 3),
  edge.label.cex = 0.7
)

All the code:
library(data.table) #for table summary
library(knitr) #for table format
library(corrplot) #for correlation plot
library(igraph) #for networplot
library(stringdist) # for calculating distance
# ---- Load the data and build the comparison strings ----
# Read from the public copy of the dataset rather than a machine-specific
# local path, so the script runs unchanged on any computer.
mock_data <- read.csv(
  "https://raw.githubusercontent.com/yesimiao/Dataset/master/MOCK_DATA.csv"
)
mock_data$name <- paste(mock_data$first_name, mock_data$last_name, sep = " ")
# One comparison string per record: "<ip_address> <email> <name>".
mock_data$category <- paste0(as.character(mock_data$ip_address), " ",
                             as.character(mock_data$email), " ",
                             as.character(mock_data$name))
models <- as.character(mock_data$category)

# ---- Pairwise string distances ----
distancemodels <- stringdistmatrix(models, models, method = "jw")
rownames(distancemodels) <- mock_data$name # specify the rownames
colnames(distancemodels) <- mock_data$name # specify the colnames

# ---- Hierarchical clustering ----
cluster <- hclust(as.dist(distancemodels))
k <- 50 # k depends on the data
# Cut the tree once and reuse the assignment everywhere below.
mock_data$clusternumber <- cutree(cluster, k = k)
dfClust <- data.frame(models, mock_data$clusternumber)
names(dfClust) <- c("models", "clusternumber")
plot(cluster, hang = -1, labels = mock_data$name)
rect.hclust(cluster, k = k) # draw a rectangle around each of the k clusters

# ---- Cluster table ----
df <- data.table(X = mock_data$clusternumber, NAME = mock_data$name)
DF <- df[, list(NAME = list(NAME)), by = X]
# cluster table
kable(DF) # output a nicer table

# ---- Per-cluster plots ----
# Draw the similarity ("correlation") plot and the network plot for one
# cluster.
#
# i         Cluster id to inspect (compared against data$clusternumber).
# data      Data frame with a `clusternumber` column; defaults to mock_data.
# distances Square distance matrix whose rows/columns are in the same order
#           as the rows of `data`; defaults to distancemodels.
#
# Returns (invisibly) the similarity matrix, 1 - distance, for the members
# of cluster i. Stops with an error if the cluster has fewer than two
# members, because there is then no pair to compare.
plot_cluster <- function(i, data = mock_data, distances = distancemodels) {
  n <- which(data$clusternumber == i)
  if (length(n) < 2L) {
    stop("cluster ", i, " has fewer than two members; nothing to compare",
         call. = FALSE)
  }

  # drop = FALSE guarantees a matrix is handed to corrplot and igraph.
  correlation <- 1 - distances[n, n, drop = FALSE]

  # correlation plot
  corrplot(correlation, method = "number", type = "lower")

  # network plot: weighted graph from the upper triangle, no self-loops.
  # NOTE(review): recent igraph releases appear to deprecate graph.adjacency()
  # and layout.fruchterman.reingold in favour of
  # graph_from_adjacency_matrix() and layout_with_fr() - confirm against the
  # installed igraph version before switching.
  graph <- graph.adjacency(correlation, weighted = TRUE, mode = "upper",
                           diag = FALSE)
  V(graph)$color <- "grey"
  V(graph)$label.cex <- 0.85 # define the text size of nodes
  plot.igraph(graph, vertex.label = V(graph)$name,
              layout = layout.fruchterman.reingold,
              edge.color = "black", edge.width = E(graph)$weight,
              vertex.size = 50, vertex.shape = "circle",
              edge.label = round(E(graph)$weight, 3),
              edge.label.cex = 0.7)

  invisible(correlation)
}

# Same procedure for each cluster of interest; only the id changes.
plot_cluster(1)
plot_cluster(3)
plot_cluster(4)