Clustering Using NMI & Jaccard Coefficient

Loading the Data Libraries

library(data.table)
library(QuantumClone)
library(vegan)
library(ggplot2)
library(clusteval)

Loading and reading the datafiles containing various clusters to be analyzed

# Directory holding the ground-truth partition and the five clustering files.
# Kept in one place so the script can be pointed at another location by
# editing a single line instead of six.
data_dir <- "C:/Documents/Data Science/Cluster Analysis in Data Mining/PAW4"

# Read one label file from `data_dir`.
# The files are read without a header row, so data.table names the columns
# V1, V2, ...; the rest of the script reads the labels from V2.
read_labels <- function(file_name) {
  fread(
    file.path(data_dir, file_name),
    header = FALSE,
    na.strings = "NA",
    stringsAsFactors = FALSE,
    skip = 0
  )
}

partitions <- read_labels("partitions.txt")
cluster1 <- read_labels("clustering_1.txt")
cluster2 <- read_labels("clustering_2.txt")
cluster3 <- read_labels("clustering_3.txt")
cluster4 <- read_labels("clustering_4.txt")
cluster5 <- read_labels("clustering_5.txt")

# Quick sanity check on the shape of one of the inputs.
dim(cluster4)

## [1] 300   2

Shifting the cluster labels up by one (0 becomes 1, 1 becomes 2, 2 becomes 3) to avoid NAs

# Pull the label column (V2) out of every table and add 1 to each label,
# so that label 0 becomes 1, label 1 becomes 2, and so on.
groundtruth <- partitions[["V2"]] + 1
c1 <- cluster1[["V2"]] + 1
c2 <- cluster2[["V2"]] + 1
c3 <- cluster3[["V2"]] + 1
c4 <- cluster4[["V2"]] + 1
c5 <- cluster5[["V2"]] + 1

Calculating NMI for each cluster wrt Ground Truth

# Score one label vector against the reference partition with NMI_cutree().
nmi_vs_truth <- function(labels) {
  NMI_cutree(labels, groundtruth)
}

# One score per clustering; the names a..e are read again further down when
# the score table is assembled.
a <- nmi_vs_truth(c1)
b <- nmi_vs_truth(c2)
c <- nmi_vs_truth(c3)
d <- nmi_vs_truth(c4)
e <- nmi_vs_truth(c5)

print(c(a, b, c, d, e))

## [1] 0.8896248 0.6456368 0.3915437 0.7642771 0.7336804

Computing the Jaccard Coefficients wrt Ground Truth

# Pair-counting Jaccard coefficient between two labelings of the same points.
#
# The previous version used length(intersect(labels, truth)), which intersects
# the *distinct label values* (e.g. {1, 2, 3}) rather than the groupings, and
# divided by the vector lengths. That yields roughly 3 / 597 for every
# clustering regardless of how well it matches the ground truth.
#
# Here every unordered pair of points is classified instead:
#   together_both   - pairs placed in the same cluster by both labelings
#   together_labels - pairs placed in the same cluster by `labels`
#   together_truth  - pairs placed in the same cluster by `truth`
# and the coefficient is
#   together_both / (together_labels + together_truth - together_both).
#
# labels, truth: label vectors of equal length (one entry per point).
# Returns a single number in [0, 1], or NA_real_ when no pair is grouped
# together by either labeling (the ratio is undefined in that case).
pair_jaccard <- function(labels, truth) {
  stopifnot(length(labels) == length(truth))
  counts <- table(labels, truth)
  together_both <- sum(choose(counts, 2))
  together_labels <- sum(choose(rowSums(counts), 2))
  together_truth <- sum(choose(colSums(counts), 2))
  denominator <- together_labels + together_truth - together_both
  if (denominator == 0) {
    return(NA_real_)
  }
  together_both / denominator
}

# NOTE: the output recorded below this chunk (0.005025126 ...) was produced by
# the old set-intersection formula and must be regenerated by re-knitting.
S1 <- pair_jaccard(c1, groundtruth)
S2 <- pair_jaccard(c2, groundtruth)
S3 <- pair_jaccard(c3, groundtruth)
S4 <- pair_jaccard(c4, groundtruth)
S5 <- pair_jaccard(c5, groundtruth)
print(c(S1, S2, S3, S4, S5))

## [1] 0.005025126 0.005025126 0.005025126 0.005025126 0.003344482

# vegdist() "jaccard" value between one label vector and the ground truth.
# The two vectors are stacked as the two rows of a matrix, which is the layout
# vegdist() compares row against row.
#
# NOTE(review): vegdist() returns a *dissimilarity* computed on the numeric
# label values themselves, so the result depends on which integer each cluster
# happens to be called. It is presumably not a clustering-agreement score;
# confirm whether these J values are still wanted alongside the
# cluster_similarity() results computed further down.
vegan_jaccard <- function(labels) {
  vegdist(
    rbind(
      unlist(labels, use.names = FALSE),
      unlist(groundtruth, use.names = FALSE)
    ),
    method = "jaccard"
  )
}

J1 <- vegan_jaccard(c1)
J2 <- vegan_jaccard(c2)
J3 <- vegan_jaccard(c3)
J4 <- vegan_jaccard(c4)
J5 <- vegan_jaccard(c5)
print(c(J1, J2, J3, J4, J5))

## [1] 0.01324503 0.06763285 0.12792512 0.09118541 0.33333333

# Jaccard similarity of one clustering against the ground truth, as computed
# by cluster_similarity() with the "independence" method.
jaccard_vs_truth <- function(labels) {
  cluster_similarity(
    labels,
    groundtruth,
    similarity = "jaccard",
    method = "independence"
  )
}

# The names JC1..JC5 are read again when the score table is assembled.
JC1 <- jaccard_vs_truth(c1)
JC2 <- jaccard_vs_truth(c2)
JC3 <- jaccard_vs_truth(c3)
JC4 <- jaccard_vs_truth(c4)
JC5 <- jaccard_vs_truth(c5)

print(c(JC1, JC2, JC3, JC4, JC5))

## [1] 0.9116890 0.6794843 0.4649305 0.8005979 0.5975855

# Collect the per-clustering scores into one table: one row per clustering,
# columns NMI and JC (Jaccard from cluster_similarity()).
JC <- c(JC1, JC2, JC3, JC4, JC5)
# `c` is both a numeric variable here and the base function; R resolves the
# call position to the function, so this still concatenates the five scores.
NMI <- c(a, b, c, d, e)
# Row index 1..5; not referenced by the visible code, kept for compatibility.
n <- 1:5
df <- data.frame(NMI, JC)

# Round every numeric column to 7 decimal places.
# vapply() pins the result to a logical vector, unlike sapply(), whose return
# type depends on its input.
is.num <- vapply(df, is.numeric, logical(1))
df[is.num] <- lapply(df[is.num], round, 7)
df

##         NMI        JC
## 1 0.8896248 0.9116890
## 2 0.6456368 0.6794843
## 3 0.3915437 0.4649305
## 4 0.7642771 0.8005979
## 5 0.7336804 0.5975855

Creating a text file that lists the NMI & Jaccard Coefficient scores for each clustering

# Write the score table to scores.txt in the working directory: one line per
# clustering, NMI then Jaccard, with no header, row names or quoting.
# The separator is spelled out; the earlier empty `sep = ,` argument only
# worked because R treats an empty argument as missing and falls back to the
# default single space.
write.table(
  df,
  file = "scores.txt",
  sep = " ",
  row.names = FALSE,
  col.names = FALSE,
  quote = FALSE
)

Clustering Using NMI & Jaccard Coefficient

Abhijit Jantre

9 August 2017

Loading the Data Libraries

Loading and reading the datafiles containing various clusters to be analyzed

Shifting the cluster labels up by one (0 becomes 1, 1 becomes 2, 2 becomes 3) to avoid NAs

Calculating NMI for each cluster wrt Ground Truth

Computing the Jaccard Coefficients wrt Ground Truth

Creating a text file that lists the NMI & Jaccard Coefficient scores for each clustering