Implementation of Hierarchical Clustering (HCL) Algorithm in R

Jungsik Noh

I implement the Hierarchical Clustering (HCL) algorithm for 1-dimensional data from scratch using R. HCL can be applied in multiple ways, depending on how to calibrate distances between clusters. Two kinds of linkage methods are implemented below. In centroid linkage method, a distance between clusters is defined by the distance between centroids. In single linkage method, the distance between clusters is defined by the shortest distance between data points in each cluster.

Centroid linkage method

myHCL <- function(x){
  # Implementation of Hierarchical clustering of a 1-dim'l vector
  # Distance between clusters is defined by the distance between centroids 
  # (centroid linkage method)
  
  # Initialize
  x <- sort(x)
  n <- length(x)
  val1 <- val2 <- CLid1 <- CLid2 <- newCLid <- newCentroid <- NA
  mergeHistory <- data.frame(val1, val2, CLid1, CLid2, newCLid, newCentroid)
  mergeInd <- 1
  newCLid <- n+1                    # node IDs in the outcome dendrogram
  CLidvec <- c(1:n)                 # CLid's for original data    
  CLlabel <- c(1:n)                 # Initial cluster labels  
  K <- n                            # K: Num of clusters
  centroids <- x                    # Initial centroids = data vector
  centroidsLabel <- c(1:n)          # CLid's for centroids vector
  dataToCentroids <- x              # Map from data points to centroids at each step
  dataToCentroidsMat <- cbind(dataToCentroids)
  out <- vector("list", K-3)
  
  while(K > 1){
    cat(paste0("< K= ", K-1, " > \n"))
    pDist <- dist(centroids)
    d_min <- min(pDist)
    ind <- which((as.matrix(pDist) == d_min), arr.ind=TRUE)[1, ]
    CLid1 <- centroidsLabel[ind[1]]
    CLid2 <- centroidsLabel[ind[2]]
    
    val1 <- centroids[ind[1]]; val2 <- centroids[ind[2]]
    ind1 <- which(dataToCentroids ==  val1)
    ind2 <- which(dataToCentroids == val2)
    
    newCentroid <- 
      (val1 * length(ind1) + val2 * length(ind2))/(length(ind1) + length(ind2))
    
    # Merging information
    mergeInfo <- c(val1, val2, CLid1, CLid2, newCLid, newCentroid)
    mergeHistory[mergeInd, ] <- mergeInfo
    
    # Merge
    centroids <- centroids[-ind]
    centroids <- c(centroids, newCentroid)
    ord <- order(centroids)
    centroids <- centroids[ord]
    
    centroidsLabel <- centroidsLabel[-ind]
    centroidsLabel <- c(centroidsLabel, newCLid)
    centroidsLabel <- centroidsLabel[ord]
    
    tmp1 <- which(CLidvec == CLid1)
    CLidvec[tmp1] <- newCLid
    tmp2 <- which(CLidvec == CLid2)
    CLidvec[tmp2] <- newCLid
    
    for (i in c(1:(K-1))){
      tmp <- centroidsLabel[i]
      CLlabel[which(CLidvec == tmp)] <- i
    }
    
    for (i in c(1:(K-1))){
      tmp <- which(CLlabel == i)
      dataToCentroids[tmp] <- centroids[i]
    }
    
    # Update merging indexes
    newCLid <- newCLid + 1
    mergeInd <- mergeInd + 1
    
    # Output data.frame
    out1 <- list(data=x, CLlabel=CLlabel, centroids=centroids)
    print(out1)
    dataToCentroidsMat <- cbind(dataToCentroidsMat, dataToCentroids)
    out[[mergeInd-1]] <- out1
    K <- K-1
  }
  colnames(dataToCentroidsMat) <- NULL
  out3 <- list(steps=out, dataToCentroidsMat=dataToCentroidsMat, mergeHistory=mergeHistory)
  return(out3)
}

While running, the function displays details of cluster labels and cluster centroids while it is aggregating all the data points into the final one cluster.

dat <- c(29,33,35,37,41,43,47,51,53,60,64,70)
out <- myHCL(dat)

## < K= 11 > 
## $data
##  [1] 29 33 35 37 41 43 47 51 53 60 64 70
## 
## $CLlabel
##  [1]  1  2  2  3  4  5  6  7  8  9 10 11
## 
## $centroids
##  [1] 29 34 37 41 43 47 51 53 60 64 70
## 
## < K= 10 > 
## $data
##  [1] 29 33 35 37 41 43 47 51 53 60 64 70
## 
## $CLlabel
##  [1]  1  2  2  3  4  4  5  6  7  8  9 10
## 
## $centroids
##  [1] 29 34 37 42 47 51 53 60 64 70
## 
## < K= 9 > 
## $data
##  [1] 29 33 35 37 41 43 47 51 53 60 64 70
## 
## $CLlabel
##  [1] 1 2 2 3 4 4 5 6 6 7 8 9
## 
## $centroids
## [1] 29 34 37 42 47 52 60 64 70
## 
## < K= 8 > 
## $data
##  [1] 29 33 35 37 41 43 47 51 53 60 64 70
## 
## $CLlabel
##  [1] 1 2 2 2 3 3 4 5 5 6 7 8
## 
## $centroids
## [1] 29 35 42 47 52 60 64 70
## 
## < K= 7 > 
## $data
##  [1] 29 33 35 37 41 43 47 51 53 60 64 70
## 
## $CLlabel
##  [1] 1 2 2 2 3 3 4 5 5 6 6 7
## 
## $centroids
## [1] 29 35 42 47 52 62 70
## 
## < K= 6 > 
## $data
##  [1] 29 33 35 37 41 43 47 51 53 60 64 70
## 
## $CLlabel
##  [1] 1 2 2 2 3 3 3 4 4 5 5 6
## 
## $centroids
## [1] 29.00000 35.00000 43.66667 52.00000 62.00000 70.00000
## 
## < K= 5 > 
## $data
##  [1] 29 33 35 37 41 43 47 51 53 60 64 70
## 
## $CLlabel
##  [1] 1 1 1 1 2 2 2 3 3 4 4 5
## 
## $centroids
## [1] 33.50000 43.66667 52.00000 62.00000 70.00000
## 
## < K= 4 > 
## $data
##  [1] 29 33 35 37 41 43 47 51 53 60 64 70
## 
## $CLlabel
##  [1] 1 1 1 1 2 2 2 3 3 4 4 4
## 
## $centroids
## [1] 33.50000 43.66667 52.00000 64.66667
## 
## < K= 3 > 
## $data
##  [1] 29 33 35 37 41 43 47 51 53 60 64 70
## 
## $CLlabel
##  [1] 1 1 1 1 2 2 2 2 2 3 3 3
## 
## $centroids
## [1] 33.50000 47.00000 64.66667
## 
## < K= 2 > 
## $data
##  [1] 29 33 35 37 41 43 47 51 53 60 64 70
## 
## $CLlabel
##  [1] 1 1 1 1 1 1 1 1 1 2 2 2
## 
## $centroids
## [1] 41.00000 64.66667
## 
## < K= 1 > 
## $data
##  [1] 29 33 35 37 41 43 47 51 53 60 64 70
## 
## $CLlabel
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1
## 
## $centroids
## [1] 46.91667

Output contains merging history and mappings between data points and their centroids to track the clustering results and generate a dendrogram later.

out$mergeHistory

Here the 1st column shows data points and the next columns show their centroids during the merging process of the HCL.

out$dataToCentroidsMat

##       [,1] [,2] [,3] [,4] [,5] [,6]     [,7]     [,8]     [,9]    [,10]
##  [1,]   29   29   29   29   29   29 29.00000 33.50000 33.50000 33.50000
##  [2,]   33   34   34   34   35   35 35.00000 33.50000 33.50000 33.50000
##  [3,]   35   34   34   34   35   35 35.00000 33.50000 33.50000 33.50000
##  [4,]   37   37   37   37   35   35 35.00000 33.50000 33.50000 33.50000
##  [5,]   41   41   42   42   42   42 43.66667 43.66667 43.66667 47.00000
##  [6,]   43   43   42   42   42   42 43.66667 43.66667 43.66667 47.00000
##  [7,]   47   47   47   47   47   47 43.66667 43.66667 43.66667 47.00000
##  [8,]   51   51   51   52   52   52 52.00000 52.00000 52.00000 47.00000
##  [9,]   53   53   53   52   52   52 52.00000 52.00000 52.00000 47.00000
## [10,]   60   60   60   60   60   62 62.00000 62.00000 64.66667 64.66667
## [11,]   64   64   64   64   64   62 62.00000 62.00000 64.66667 64.66667
## [12,]   70   70   70   70   70   70 70.00000 70.00000 64.66667 64.66667
##          [,11]    [,12]
##  [1,] 41.00000 46.91667
##  [2,] 41.00000 46.91667
##  [3,] 41.00000 46.91667
##  [4,] 41.00000 46.91667
##  [5,] 41.00000 46.91667
##  [6,] 41.00000 46.91667
##  [7,] 41.00000 46.91667
##  [8,] 41.00000 46.91667
##  [9,] 41.00000 46.91667
## [10,] 64.66667 46.91667
## [11,] 64.66667 46.91667
## [12,] 64.66667 46.91667

Single linkage method

minDistOfLists <- function(listA, listB){
  a <- unlist(listA)
  b <- unlist(listB)
  min0 <- Inf
  for (i in 1:length(a)){
    for (j in 1:length(b)){
      dist0 <- abs(a[i] - b[j])
      min0 <- min(min0, dist0)
    }
  }
  return(min0)
}

myHCL_mindist <- function(x){
  # Implementation of Hierarchical clustering of a 1-dim'l vector
  # Distance between clusters is defined by the shortest distance
  # (single linkage method)
  
  # Initialize
  x <- sort(x)
  n <- length(x)
  val1 <- val2 <- CLid1 <- CLid2 <- newCLid <- newCentroid <- NA
  mergeHistory <- data.frame(val1, val2, CLid1, CLid2, newCLid, newCentroid)
  mergeInd <- 1
  newCLid <- n+1                    # node IDs in the outcome dendrogram
  CLidvec <- c(1:n)                 # CLid's for original data    
  CLlabel <- c(1:n)                 # Initial cluster labels  
  K <- n                            # K: Num of clusters
  centroids <- x                    # Initial centroids = data vector
  centroidsLabel <- c(1:n)          # CLid's for centroids vector
  dataToCentroids <- x              # Map from data points to centroids at each step
  dataToCentroidsMat <- cbind(dataToCentroids)
  out <- vector("list", K-1)
  
  CLsets <- vector("list", n)
  for (i in 1:n){
    CLsets[[i]] = x[i]
  }
  
  while(K > 1){
    cat(paste0("< K= ", K-1, " > \n"))
    pDist <- matrix(0, K, K)
    # The minimum of the distances between any two points in each cluster
    for (i in c(1:(K-1))){
      for (j in c((i+1):K)){
        pDist[j,i] <- minDistOfLists(CLsets[[i]], CLsets[[j]])
      }
    }
    pDist <- pDist + t(pDist)
    diag(pDist) = Inf
    
    d_min <- min(pDist)
    ind <- which((as.matrix(pDist) == d_min), arr.ind=TRUE)[1, ]
    CLid1 <- centroidsLabel[ind[1]]
    CLid2 <- centroidsLabel[ind[2]]
    
    val1 <- centroids[ind[1]]; val2 <- centroids[ind[2]]
    ind1 <- which(dataToCentroids ==  val1)
    ind2 <- which(dataToCentroids == val2)
    
    newCentroid <- 
      (val1 * length(ind1) + val2 * length(ind2))/(length(ind1) + length(ind2))
      
    # Merging information
    mergeInfo <- c(val1, val2, CLid1, CLid2, newCLid, newCentroid)
    mergeHistory[mergeInd, ] <- mergeInfo
    
    # Merge
    centroids <- centroids[-ind]
    centroids <- c(centroids, newCentroid)
    ord <- order(centroids)
    centroids <- centroids[ord]
    
    centroidsLabel <- centroidsLabel[-ind]
    centroidsLabel <- c(centroidsLabel, newCLid)
    centroidsLabel <- centroidsLabel[ord]
    
    tmp1 <- which(CLidvec == CLid1)
    CLidvec[tmp1] <- newCLid
    tmp2 <- which(CLidvec == CLid2)
    CLidvec[tmp2] <- newCLid
    
    for (i in c(1:(K-1))){
      tmp <- centroidsLabel[i]
      CLlabel[which(CLidvec == tmp)] <- i
    }
    
    for (i in c(1:(K-1))){
      tmp <- which(CLlabel == i)
      dataToCentroids[tmp] <- centroids[i]
      CLsets[[i]] <- x[tmp]
    }
    
    # Update merging indexes
    newCLid <- newCLid + 1
    mergeInd <- mergeInd + 1
    
    # Output data.frame
    out1 <- list(data=x, CLlabel=CLlabel, centroids=centroids)
    print(out1)
    dataToCentroidsMat <- cbind(dataToCentroidsMat, dataToCentroids)
    out[[mergeInd-1]] <- out1
    K <- K-1
  }
  colnames(dataToCentroidsMat) <- NULL
  out3 <- list(steps=out, dataToCentroidsMat=dataToCentroidsMat, mergeHistory=mergeHistory)
  return(out3)
}

dat <- c(29,33,35,37,41,43,47,51,53,60,64,70)
out <- myHCL_mindist(dat)

## < K= 11 > 
## $data
##  [1] 29 33 35 37 41 43 47 51 53 60 64 70
## 
## $CLlabel
##  [1]  1  2  2  3  4  5  6  7  8  9 10 11
## 
## $centroids
##  [1] 29 34 37 41 43 47 51 53 60 64 70
## 
## < K= 10 > 
## $data
##  [1] 29 33 35 37 41 43 47 51 53 60 64 70
## 
## $CLlabel
##  [1]  1  2  2  2  3  4  5  6  7  8  9 10
## 
## $centroids
##  [1] 29 35 41 43 47 51 53 60 64 70
## 
## < K= 9 > 
## $data
##  [1] 29 33 35 37 41 43 47 51 53 60 64 70
## 
## $CLlabel
##  [1] 1 2 2 2 3 3 4 5 6 7 8 9
## 
## $centroids
## [1] 29 35 42 47 51 53 60 64 70
## 
## < K= 8 > 
## $data
##  [1] 29 33 35 37 41 43 47 51 53 60 64 70
## 
## $CLlabel
##  [1] 1 2 2 2 3 3 4 5 5 6 7 8
## 
## $centroids
## [1] 29 35 42 47 52 60 64 70
## 
## < K= 7 > 
## $data
##  [1] 29 33 35 37 41 43 47 51 53 60 64 70
## 
## $CLlabel
##  [1] 1 1 1 1 2 2 3 4 4 5 6 7
## 
## $centroids
## [1] 33.5 42.0 47.0 52.0 60.0 64.0 70.0
## 
## < K= 6 > 
## $data
##  [1] 29 33 35 37 41 43 47 51 53 60 64 70
## 
## $CLlabel
##  [1] 1 1 1 1 1 1 2 3 3 4 5 6
## 
## $centroids
## [1] 36.33333 47.00000 52.00000 60.00000 64.00000 70.00000
## 
## < K= 5 > 
## $data
##  [1] 29 33 35 37 41 43 47 51 53 60 64 70
## 
## $CLlabel
##  [1] 1 1 1 1 1 1 1 2 2 3 4 5
## 
## $centroids
## [1] 37.85714 52.00000 60.00000 64.00000 70.00000
## 
## < K= 4 > 
## $data
##  [1] 29 33 35 37 41 43 47 51 53 60 64 70
## 
## $CLlabel
##  [1] 1 1 1 1 1 1 1 1 1 2 3 4
## 
## $centroids
## [1] 41 60 64 70
## 
## < K= 3 > 
## $data
##  [1] 29 33 35 37 41 43 47 51 53 60 64 70
## 
## $CLlabel
##  [1] 1 1 1 1 1 1 1 1 1 2 2 3
## 
## $centroids
## [1] 41 62 70
## 
## < K= 2 > 
## $data
##  [1] 29 33 35 37 41 43 47 51 53 60 64 70
## 
## $CLlabel
##  [1] 1 1 1 1 1 1 1 1 1 2 2 2
## 
## $centroids
## [1] 41.00000 64.66667
## 
## < K= 1 > 
## $data
##  [1] 29 33 35 37 41 43 47 51 53 60 64 70
## 
## $CLlabel
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1
## 
## $centroids
## [1] 46.91667

For example, \(K=3\) clustering results are quite different in the two methods. Detailed output has the same format as in the above.

out$mergeHistory

out$dataToCentroidsMat

##       [,1] [,2] [,3] [,4] [,5] [,6]     [,7]     [,8] [,9] [,10]    [,11]
##  [1,]   29   29   29   29   29 33.5 36.33333 37.85714   41    41 41.00000
##  [2,]   33   34   35   35   35 33.5 36.33333 37.85714   41    41 41.00000
##  [3,]   35   34   35   35   35 33.5 36.33333 37.85714   41    41 41.00000
##  [4,]   37   37   35   35   35 33.5 36.33333 37.85714   41    41 41.00000
##  [5,]   41   41   41   42   42 42.0 36.33333 37.85714   41    41 41.00000
##  [6,]   43   43   43   42   42 42.0 36.33333 37.85714   41    41 41.00000
##  [7,]   47   47   47   47   47 47.0 47.00000 37.85714   41    41 41.00000
##  [8,]   51   51   51   51   52 52.0 52.00000 52.00000   41    41 41.00000
##  [9,]   53   53   53   53   52 52.0 52.00000 52.00000   41    41 41.00000
## [10,]   60   60   60   60   60 60.0 60.00000 60.00000   60    62 64.66667
## [11,]   64   64   64   64   64 64.0 64.00000 64.00000   64    62 64.66667
## [12,]   70   70   70   70   70 70.0 70.00000 70.00000   70    70 64.66667
##          [,12]
##  [1,] 46.91667
##  [2,] 46.91667
##  [3,] 46.91667
##  [4,] 46.91667
##  [5,] 46.91667
##  [6,] 46.91667
##  [7,] 46.91667
##  [8,] 46.91667
##  [9,] 46.91667
## [10,] 46.91667
## [11,] 46.91667
## [12,] 46.91667