Intro

The following work shows a step-by-step implementation of the Gustafson-Kessel algorithm. It is one of the fuzzy clustering algorithms, which measure the degree of membership of a point in each cluster instead of making a binary 0/1 decision (either belongs or does not belong). So in the end each point is not assigned to a single cluster but is instead given a vector of membership values; a point typically belongs to all clusters simultaneously, with a different ‘strength’ of membership for each. Once the clustering is complete we can convert the fuzzy clusters into crisp ones by simply assigning each point to the cluster with the highest value of the membership function. This is what’s called hard partitioning.

Load the data
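
The original loading step is not shown in this write-up, so the snippet below is only an illustrative sketch: it assumes we work with the built-in iris dataset and defines the objects referenced later (iris_X, cnames and n_features).

# Illustrative assumption: the four numeric iris measurements form the feature matrix.
data(iris)
iris_X = as.matrix(iris[, 1:4])   # feature matrix used throughout the post
cnames = colnames(iris_X)         # feature names, used later for axis labels
n_features = ncol(iris_X)         # number of features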

Pre-processing

First we’ll introduce two traditional pre-processing functions: normalization and encoding. The first one normalizes the data by subtracting the column means; the second one scales each feature to the [-1, 1] interval and then projects each observation onto the unit hypersphere.

normalize <- function(M) {
        #center data
        means = apply(M,2,mean)
        Xnorm = t(apply(M,1,function(x) {x-means}))
        Xnorm
}

encode <- function(M) {
        # scale each feature to [-1, 1], then put each observation on the unit hypersphere
        mins = apply(M,2,min)
        maxs = apply(M,2,max)
        ranges = maxs-mins
        Xnorm = t(apply(M,1,function(x) { 2*(x-mins)/ranges-1}))
        Xnorm = t(apply(Xnorm, 1, function(x) { x/norm(x,type="2")}))
        Xnorm
}

Normalize

Now we will normalize and encode the iris dataset.

iris_norm = normalize(iris_X)
iris_norm = encode(iris_norm)

Initialize centroids and transformation matrices

This procedure creates k centroids whose dimensionality equals the number of features. The centroids are initialized to random vectors in the [-1, 1] interval and then normalized and encoded like the data. Each centroid has a “companion” - a transformation matrix A which is initialized as the identity matrix.

initCentroidsAndA <- function(k,n_features) {
        set.seed(123)
        centroids = matrix(runif(k*n_features,-1,1), ncol = n_features)
        centroids = normalize(centroids)
        centroids = encode(centroids)
        A = vector(mode="list", length=k)
        for (i in seq(1,k)) {
                A[[i]] = diag(x=1, ncol = n_features, nrow = n_features)
        }
        list(ce = centroids, A = A)
}

Find distances

The following function calculates the squared Mahalanobis distance from each point to each centroid, using the corresponding cluster’s transformation matrix A.
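
For reference, in the standard Gustafson-Kessel formulation each cluster i has its own norm-inducing matrix A_i, so the squared distance of point x_k to centroid v_i is

d_{ik}^2 = (x_k - v_i)^T A_i (x_k - v_i)

where A_i is derived from the cluster’s fuzzy covariance matrix (see the update step below). With the identity matrices used at initialization this reduces to the ordinary squared Euclidean distance.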

calculate_Mahalanobis <- function(X, cAndA) {
        k = nrow(cAndA$ce)
        centroids = cAndA$ce
        A = cAndA$A
        distances = c()
        for (i in seq(1,k)) {
                # squared distance of every point to centroid i under that cluster's metric
                dsq = as.matrix(apply(X, 1, function(x) { ((t(x-centroids[i,]))%*%solve(A[[i]]))%*%(x-centroids[i,]) }))
                colnames(dsq) = paste("centroid",i)
                distances = cbind(distances,dsq)
        }
        distances
}

Calculate memberships

This function calculates memberships - to what extent each point belongs to each cluster.
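
With the fuzzifier fixed at m = 2, which is the value implicitly used throughout this post, the membership of point k in cluster i is the standard fuzzy update

u_{ik} = (1 / d_{ik}^2) / \sum_{j=1}^{c} (1 / d_{jk}^2)

where c is the number of clusters; this is exactly what the function below computes row by row.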

cAndA = initCentroidsAndA(3,n_features)
dst = calculate_Mahalanobis(iris_norm, cAndA) 
calculate_memberships <- function(distances) {
        # inverse-distance weights, normalized to sum to 1 across clusters (fuzzifier m = 2)
        mus = t(apply(distances, 1, function(x) { (1/x)/(sum(1/x)) }))
        mus
}
mships = calculate_memberships(dst)

Update centroids

The next important step is to update centroids and transformation matrices so that we can move on to the next iteration of the algorithm.
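
For reference, the function below implements the standard Gustafson-Kessel updates with m = 2: the new centroid is a membership-weighted mean, F_i is the fuzzy covariance matrix of cluster i, and A_i is the norm-inducing matrix with a unit-volume constraint,

v_i = \sum_k u_{ik}^2 x_k / \sum_k u_{ik}^2
F_i = \sum_k u_{ik}^2 (x_k - v_i)(x_k - v_i)^T / \sum_k u_{ik}^2
A_i = det(F_i)^{1/n} F_i^{-1}

where n is the number of features and the sums run over all data points.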

update_centroids <- function(X,memberships) {
        k = ncol(memberships)
        n_features = ncol(X)
        N = nrow(X)
        centroids = c()
        A = vector(mode="list", length=k)
        for (i in seq(1,k)) {
                # common denominator: sum of squared memberships for cluster i
                den = sum((memberships[,i])^2)
                num = 0
                Fnum = matrix(data=0, nrow = n_features, ncol = n_features)
                # centroid = membership-weighted mean of the points
                for (j in seq(1,N)) {
                        num = num + X[j,]*((memberships[j,i])^2)
                }
                cent = num/den
                centroids = rbind(centroids,cent)
                # fuzzy covariance matrix of cluster i
                for (j in seq(1,N)) {
                        Fnum = Fnum + (outer( ((memberships[j,i])^2)*(X[j,]-centroids[i,]) , X[j,]-centroids[i,]))
                }
                Fi = Fnum/den
                # norm-inducing matrix with unit volume constraint
                A[[i]] = ((det(Fi))^(1/n_features))*solve(Fi)
        }
        list(ce = centroids, A = A)
}

Put everything together

runGustafsonKessel <- function(X, k, epsilon) {
        n_features = ncol(X)
        centAndA = initCentroidsAndA(k, n_features)
        tolerance = 2   # start above epsilon so the loop runs at least once
        iters = 0
        while (tolerance > epsilon) {
                dst = calculate_Mahalanobis(X, centAndA)
                mships = calculate_memberships(dst)
                newCentAndA = update_centroids(X,mships)
                # stop once the centroids barely move between iterations
                tolerance = norm(centAndA$ce-newCentAndA$ce,type="2")
                centAndA = newCentAndA
                print("Current centroids")
                print(centAndA$ce)
                iters = iters+1
        }
        list(dst = dst, centAndA = centAndA, mships = mships)
}
z = runGustafsonKessel(iris_norm,3,0.05)
## [1] "Current centroids"
##            [,1]       [,2]        [,3]       [,4]
## cent -0.3759869  0.1449268 -0.50403834 -0.5290240
## cent  0.3258143 -0.1529541  0.54963406  0.5681777
## cent -0.3645772 -0.5906380 -0.05397593 -0.1659737
## [1] "Current centroids"
##            [,1]       [,2]        [,3]        [,4]
## cent -0.4383195  0.1159104 -0.59350289 -0.61976473
## cent  0.2963708 -0.2222011  0.54389983  0.55521261
## cent -0.4407734 -0.7467326  0.06861647 -0.07497507
## [1] "Current centroids"
##            [,1]       [,2]        [,3]        [,4]
## cent -0.4298221  0.1146965 -0.58237093 -0.60837306
## cent  0.3044524 -0.2177096  0.55296351  0.57096645
## cent -0.4175192 -0.7177399  0.06108012 -0.08144236

Closest hard partition

After we’ve done our clustering it is always interesting to have a look at the results and get some visualisation. To do that we need to get the closest hard partition of our data:

hard_partition <- function(memberships) {
        # assign each point to the cluster with the highest membership
        apply(memberships,1,which.max)
}
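
As a quick sanity check (assuming the built-in iris data frame with its Species column is still available), the hard labels can be cross-tabulated against the known species:

labels = hard_partition(z$mships)
# compare cluster assignments with the true species labels
table(labels, iris$Species)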

Visualization

Now we can finally observe the results and confirm that the algorithm works, since it still finds the true clusters. Our data set has 4 attributes and we can only plot in 3D, so we leave out the attribute with the lowest variance.

library(rgl)
library(car)
ind = sort(apply(iris_norm,2,var),index.return=TRUE, decreasing = TRUE)$ix[1:3]
labs = cnames[ind]
labels = hard_partition(z$mships)
sset = iris_norm[,ind]
x = sset[,1]
y = sset[,2]
z = sset[,3]
par3d("windowRect"= c(0,0,400,400))
scatter3d(x = x, y = y, z = z, 
          xlab=labs[1], ylab=labs[2], zlab = labs[3],
          labels = NULL,
          groups = as.factor(labels), 
          surface = FALSE, 
          grid = FALSE,
          ellipsoid=TRUE)
rgl.snapshot(filename = "plot_gk_iris.png")

Results

[3D scatter plot of the hard-partitioned iris clusters, saved as plot_gk_iris.png]