Identity Resolution and Network Graphing

We are going to cluster records based on categorical (string) variables; the package ‘stringdist’ is very helpful for calculating the distance between strings.

library(data.table)
library(knitr)
library(corrplot)
suppressPackageStartupMessages(library(igraph))
library(stringdist)
# Read the mock identity records straight from the public repository.
mock_data <- read.csv(
  "https://raw.githubusercontent.com/yesimiao/Dataset/master/MOCK_DATA.csv"
)

# Full name of each record: first and last name separated by one space.
mock_data$name <- paste(mock_data$first_name, mock_data$last_name)

# One composite string per record: "<ip_address> <email> <name>".
mock_data$category <- paste0(
  as.character(mock_data$ip_address), " ",
  as.character(mock_data$email), " ",
  as.character(mock_data$name)
)
models <- as.character(mock_data$category)

# Full pairwise string-distance matrix (method "jw") between all composite
# strings, labelled on both axes with the record names.
distancemodels <- stringdistmatrix(models, models, method = "jw")
dimnames(distancemodels) <- list(mock_data$name, mock_data$name)

Run the hierarchical clustering and plot the corresponding dendrogram, so that the groups of clusters can be visualized clearly.

# Complete-linkage hierarchical clustering on the string distances.
cluster <- hclust(as.dist(distancemodels), method = "complete")

# Number of clusters to cut the tree into; k depends on the data.
k <- 50

# Pair every composite string with the cluster it was assigned to.
dfClust <- data.frame(
  models = models,
  clusternumber = cutree(cluster, k = k)
)

# Dendrogram with all labels aligned at the bottom (hang = -1), plus one
# rectangle around each of the k clusters.
plot(cluster, hang = -1, labels = mock_data$name)
rect.hclust(cluster, k = k)

Rearrange our output from the cluster tree into a table:

# Attach the cluster id of every record to the original data.
mock_data$clusternumber <- cutree(cluster, k = k)

# Long table: one row per record (cluster id, name).
df <- data.table(X = mock_data$clusternumber, NAME = mock_data$name)

# Collapse to one row per cluster, with the member names held in a list column.
DF <- df[, .(NAME = list(NAME)), by = "X"]

# cluster table
kable(DF)

X	NAME
1	Billy Torres, Bill Torres/, Bill , William Torres
2	William Torres
3	Doug Romero, Douglas Rem e, Douglas Romo, Douglas Romero, Douglas Romero
4	Bobby Boyd, Bobby Boyd, Bob Boyd, Rob Boyd, Robert Boyd, Robert Boyd, Robert Boyd
5	Robert B
6	Joyce Hughes
7	Stephen Moreno, Amanda Moreno
8	William Ferguson
9	Fred Reyes
10	Billy Davis
11	Judy Ryan
12	Carol Nguyen
13	Willie Collins
14	Lisa Schmidt
15	Martha Berry
16	Jason Murphy
17	Steve Oliver
18	Nancy Murphy
19	James Evans
20	Gregory Ross
21	Pamela Bishop
22	Theresa Lewis
23	Antonio Cruz
24	Carolyn Fernandez
25	Henry Romero
26	Frances Pierce
27	Jeremy Gordon
28	Kathryn Day
29	Theresa Jordan
30	Cheryl Palmer
31	Edward Ramos
32	Emily Howard
33	Frances Ellis
34	Bobby Gilbert
35	Ann Flores
36	Laura Burke
37	Billy Perry
38	Samuel Wagner
39	Gregory Franklin
40	Tammy Moore
41	Billy Montgomery
42	Paul Parker
43	Walter Vasquez
44	Tammy Lopez
45	Marie Hamilton
46	Shawn Collins
47	Diana Hill
48	John Phillips
49	Justin Clark
50	Gloria Willis

Draw a correlation plot with the package corrplot. The plotted values are similarities, computed as 1 minus the distances in the matrix returned by the function stringdistmatrix.

# Inspect one cluster at a time: change `i` to look at a different cluster.
# The cluster ids produced by cutree() are integers, so `i` is an integer too
# (the comparison no longer relies on implicit number-to-string coercion).
i <- 1L

# Rows of the original data that belong to cluster i.
listofcluster <- mock_data[mock_data$clusternumber == i, ]

# Positions of those rows, used to index the distance matrix.
n <- which(mock_data$clusternumber == i)

# Similarity = 1 - distance. drop = FALSE keeps the result a labelled matrix
# even when the cluster has a single member; without it the subset collapses
# to a bare number and loses its row/column names.
correlation <- 1 - distancemodels[n, n, drop = FALSE]

# correlation plot
corrplot(correlation, method = "number", type = "lower")

To further understand the relationships between subjects within one specific cluster, a network graph from igraph is a good visualization tool.

# network plot
# Build an undirected, weighted graph from the similarity matrix. Only the
# upper triangle is read (mode = "upper"), the diagonal is skipped
# (diag = FALSE) and each similarity is stored as the edge attribute `weight`.
# graph_from_adjacency_matrix() and layout_with_fr replace the deprecated
# aliases graph.adjacency() and layout.fruchterman.reingold.
graph <- graph_from_adjacency_matrix(correlation, mode = "upper",
                                     weighted = TRUE, diag = FALSE)
V(graph)$color <- "grey"
V(graph)$label.cex <- 0.85 # text size of the vertex labels

# Edge width and edge label both show the pairwise similarity.
plot(graph, vertex.label = V(graph)$name, layout = layout_with_fr,
     edge.color = "black", edge.width = E(graph)$weight, vertex.size = 50,
     vertex.shape = "circle", edge.label = round(E(graph)$weight, 3),
     edge.label.cex = 0.7)

Applying exactly the same procedure to other clusters, we get the following:

# for cluster 3
# Only `i` changes; the rest of the code is the same as for the first cluster.
# Cluster ids from cutree() are integers, so `i` is an integer as well.
i <- 3L

# Rows of the original data that belong to cluster i.
listofcluster <- mock_data[mock_data$clusternumber == i, ]

# Positions of those rows, used to index the distance matrix.
n <- which(mock_data$clusternumber == i)

# Similarity = 1 - distance. drop = FALSE keeps the subset a labelled matrix
# even if the chosen cluster has a single member.
correlation <- 1 - distancemodels[n, n, drop = FALSE]

# correlation plot
corrplot(correlation, method = "number", type = "lower")

# network plot
# Undirected weighted graph read from the upper triangle of the similarity
# matrix, without self-loops. graph_from_adjacency_matrix() and
# layout_with_fr replace the deprecated aliases graph.adjacency() and
# layout.fruchterman.reingold.
graph <- graph_from_adjacency_matrix(correlation, mode = "upper",
                                     weighted = TRUE, diag = FALSE)
V(graph)$color <- "grey"
V(graph)$label.cex <- 0.85 # text size of the vertex labels

# Edge width and edge label both show the pairwise similarity.
plot(graph, vertex.label = V(graph)$name, layout = layout_with_fr,
     edge.color = "black", edge.width = E(graph)$weight, vertex.size = 50,
     vertex.shape = "circle", edge.label = round(E(graph)$weight, 3),
     edge.label.cex = 0.7)

# for cluster 4
# Only `i` changes; the rest of the code is the same as for the first cluster.
# Cluster ids from cutree() are integers, so `i` is an integer as well.
i <- 4L

# Rows of the original data that belong to cluster i.
listofcluster <- mock_data[mock_data$clusternumber == i, ]

# Positions of those rows, used to index the distance matrix.
n <- which(mock_data$clusternumber == i)

# Similarity = 1 - distance. drop = FALSE keeps the subset a labelled matrix
# even if the chosen cluster has a single member.
correlation <- 1 - distancemodels[n, n, drop = FALSE]

# correlation plot
corrplot(correlation, method = "number", type = "lower")

# network plot
# Undirected weighted graph read from the upper triangle of the similarity
# matrix, without self-loops. graph_from_adjacency_matrix() and
# layout_with_fr replace the deprecated aliases graph.adjacency() and
# layout.fruchterman.reingold.
graph <- graph_from_adjacency_matrix(correlation, mode = "upper",
                                     weighted = TRUE, diag = FALSE)
V(graph)$color <- "grey"
V(graph)$label.cex <- 0.85 # text size of the vertex labels

# Edge width and edge label both show the pairwise similarity.
plot(graph, vertex.label = V(graph)$name, layout = layout_with_fr,
     edge.color = "black", edge.width = E(graph)$weight, vertex.size = 50,
     vertex.shape = "circle", edge.label = round(E(graph)$weight, 3),
     edge.label.cex = 0.7)

The complete code:
library(data.table) #for table summary
library(knitr) #for table format
library(corrplot) #for correlation plot
library(igraph) #for networplot
library(stringdist) # for calculating distance
# Load the mock identity records from the public copy of the data set, so the
# script runs on any machine instead of depending on a file on one desktop.
mock_data <- read.csv(
  "https://raw.githubusercontent.com/yesimiao/Dataset/master/MOCK_DATA.csv"
)

# Full name of each record: first and last name separated by one space.
mock_data$name <- paste(mock_data$first_name, mock_data$last_name, sep = " ")

# One composite string per record: "<ip_address> <email> <name>".
mock_data$category <- paste0(as.character(mock_data$ip_address), " ",
                             as.character(mock_data$email), " ",
                             as.character(mock_data$name))
models <- as.character(mock_data$category)

# Full pairwise string-distance matrix (method "jw") between all composite
# strings.
distancemodels <- stringdistmatrix(models, models, method = "jw")
rownames(distancemodels) <- mock_data$name # specify the rownames
colnames(distancemodels) <- mock_data$name # specify the colnames
# Hierarchical clustering on the string distances; "complete" is the linkage
# hclust() uses by default, spelled out here.
cluster <- hclust(as.dist(distancemodels), method = "complete")

# Number of clusters to cut the tree into; k depends on the data.
k <- 50

# Pair every composite string with the cluster it was assigned to.
dfClust <- data.frame(
  models = models,
  clusternumber = cutree(cluster, k = k)
)

# Dendrogram with all labels aligned at the bottom (hang = -1).
plot(cluster, hang = -1, labels = mock_data$name)
# Draw a rectangle around each of the k clusters in the plot.
rect.hclust(cluster, k = k)

# Attach the cluster id of every record to the original data.
mock_data$clusternumber <- cutree(cluster, k = k)

# Long table: one row per record (cluster id, name).
df <- data.table(X = mock_data$clusternumber, NAME = mock_data$name)

# Collapse to one row per cluster, with the member names held in a list column.
DF <- df[, .(NAME = list(NAME)), by = "X"]

# cluster table
kable(DF) # output a nicer table

# Draw the similarity ("correlation") plot and the network graph for each
# cluster of interest. The code is identical for every cluster, so it runs in
# a loop; edit the vector below to inspect other clusters. Cluster ids from
# cutree() are integers, so the ids are written as integers here.
for (i in c(1L, 3L, 4L)) {
  # Rows of the original data that belong to cluster i.
  listofcluster <- mock_data[mock_data$clusternumber == i, ]

  # Positions of those rows, used to index the distance matrix.
  n <- which(mock_data$clusternumber == i)

  # Similarity = 1 - distance. drop = FALSE keeps the subset a labelled
  # matrix even when the cluster has a single member.
  correlation <- 1 - distancemodels[n, n, drop = FALSE]

  # correlation plot
  corrplot(correlation, method = "number", type = "lower")

  # network plot
  # Undirected weighted graph read from the upper triangle of the similarity
  # matrix, without self-loops. graph_from_adjacency_matrix() and
  # layout_with_fr replace the deprecated aliases graph.adjacency() and
  # layout.fruchterman.reingold.
  graph <- graph_from_adjacency_matrix(correlation, mode = "upper",
                                       weighted = TRUE, diag = FALSE)
  V(graph)$color <- "grey"
  V(graph)$label.cex <- 0.85 # define the text size of nodes

  # Edge width and edge label both show the pairwise similarity.
  plot(graph, vertex.label = V(graph)$name, layout = layout_with_fr,
       edge.color = "black", edge.width = E(graph)$weight, vertex.size = 50,
       vertex.shape = "circle", edge.label = round(E(graph)$weight, 3),
       edge.label.cex = 0.7)
}

Identity Resolution and Network Graphing

Simiao Ye

09/08/2015