library(pixmap)     # image processing (reading and plotting .pnm/.pgm images)
library(dendextend) # clustering / dendrogram utilities
library(clValid)    # calculating the Dunn index

1 Overview

Below I demonstrate two simple image processing applications in R: recognizing a blurred face (Case 1) and grouping similar pictures with hierarchical clustering (Case 2).

2 Case 1. Find Mister X.

Imagine we have a set of images of different people, plus a blurred version that we cannot recognize by a simple eyeball test. Below is a trivial instance of this problem, together with a function that recognizes who is depicted in the blurred picture.

2.1 Data description

Below is a set of pictures showing each person's face clearly, plus the blurred one that we want to recognize.

par(mfrow=c(2,2),
    bg = 'gainsboro')

directory = paste(getwd(),"/","images 1", sep = "")

images = list.files(directory)                                           # list of images 

for (i in images) {
  plot(read.pnm(paste(directory,"/",i, sep = "")),
       sub = i) # plotting images
}

The goal is to determine who Mister X (the black image) is.

2.2 Solution using Euclidean distance between normalized pictures.

images = images[images != "MrX.pnm"]            # Exclude MisterX from the list of images 

sim = function(names, directory, compare = "MrX.pnm", br = 0.01) {
  x = read.pnm(paste(directory, "/", compare, sep = ""))   # reading misterX image       
  dist = list()                                            # creating a list of distances
  for (n in names) {
    y = read.pnm(paste(directory,"/", n, sep = ""))        # reading images 
    y@grey = y@grey/norm(y@grey, type = "F")               # normalizing images
    dist[[n]] = norm(x@grey - y@grey, type = "F")          # populating the list of distances
  }
  par(mfrow=c(1,2), bg = 'gainsboro')
  x@grey = x@grey + br                                     # brighten Mr. X a little for display
  plot(read.pnm(paste(directory, "/", names(which.min(unlist(dist))),   # Who is Mister X?
                      sep = "")))
  plot(x)
  dist
}

sim(names = images, 
    directory = paste(getwd(),"/","images 1", sep = ""),
    br = 0.3)

## $bob.pgm
## [1] 2.218785
## 
## $chris.pgm
## [1] 2.244208
## 
## $don.pgm
## [1] 2.127421
## 
## $fred.pgm
## [1] 2.204757
## 
## $greg.pgm
## [1] 2.2059
## 
## $jim.pgm
## [1] 2.236136
## 
## $ted.pgm
## [1] 2.209724

Above is the list of Euclidean distances between Mr. X and each image in the comparison set. The blurred image turns out to be a picture of Don, who has the smallest distance.
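If only the name of the best match is needed, the same comparison can be wrapped into a small helper that skips the plotting. This is a minimal sketch under the same directory layout as above; the function name closest_match() is hypothetical, and, as in sim(), only the candidate images are normalized.

# Hypothetical helper: returns the file name whose grey matrix is closest
# (in Frobenius/Euclidean distance) to the blurred target image.
closest_match = function(names, directory, compare = "MrX.pnm") {
  x = read.pnm(file.path(directory, compare))       # blurred target image
  d = sapply(names, function(n) {
    y = read.pnm(file.path(directory, n))           # candidate image
    y@grey = y@grey/norm(y@grey, type = "F")        # normalizing the candidate
    norm(x@grey - y@grey, type = "F")               # distance to the target
  })
  names(which.min(d))                               # file name of the closest image
}

closest_match(names = images,
              directory = paste(getwd(), "/", "images 1", sep = ""))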

3 Case 2. Grouping similar pictures.

Imagine that you have many pictures of different people's faces and you want to group the pictures that show the same person. This can be solved with the clustering technique below.

3.1 Data description

directory = paste(getwd(),"/","images 2", sep = "")
names = list.files(directory) 
par(mfrow=c(2,4), bg = 'gainsboro')

images = list()
for (i in names) {
  images[[i]] = read.pnm(paste(directory, "/", i, sep = ""))   # storing in the list
  plot(images[[i]], sub = i)                                   # plotting images
}

As you can see, we have a set of pictures that we want to group based on the person depicted in each picture.

3.2 Preparing distance matrix

# Creating a list with normalized matrices of images 
norm_m = list()
for (i in 1:length(images)) {
  norm = as.matrix(images[[i]]@grey/norm(images[[i]]@grey, 
                                         type = "F"))
  name = names(images[i])
  norm_m[[name]] = norm
}

# Creating a distance matrix 
dist = matrix(0,nrow = length(norm_m), ncol = length(norm_m), byrow = TRUE)
for (i in 1:length(norm_m)) {
  for (j in 1:length(norm_m)) { 
    dist[i,j] = norm(norm_m[[i]] - norm_m[[j]], 
                     type = "F")
  }
}

# Naming the rows and columns 
rownames(dist) = substr(names, 1,2)
colnames(dist) = substr(names, 1,2)

# Converting a distance matrix to distance format
dist_cl = as.dist(dist)
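As a side note, the Frobenius norm of a difference of matrices equals the Euclidean distance between the flattened matrices, so the same distances can be obtained more compactly with dist(). Below is a minimal sketch reusing norm_m from above; it assumes, as the loop does, that all images have the same dimensions, and dist_cl2 is just an illustrative name.

# Alternative: flatten each normalized image into one row and let dist()
# compute the pairwise Euclidean distances (identical to the Frobenius
# distances computed in the double loop above).
flat = t(sapply(norm_m, as.vector))          # one row per image
rownames(flat) = substr(names(norm_m), 1, 2)
dist_cl2 = dist(flat)                        # same values as dist_cl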

3.3 Clustering analysis (Methods: Single, Ward.D, Average)

3.3.1 Method: Single

# Method "Single"
hcSingle = hclust(dist_cl, method = "single")
hcSingled = as.dendrogram(hcSingle)

# Plot hcd
plot(hcSingled, main = "Method Single")

# Add cluster rectangles 
rect.dendrogram(hcSingled, k = 5, border = "green")

clsingle = as.data.frame(cutree(hcSingled, 5))

# Plotting pictures from the same cluster
col = c('gainsboro', "floralwhite", "deepskyblue1", "darkolivegreen1", "darkslategray2")
for (i in 1:5) {
  cl = rownames(clsingle)[clsingle == i]
  par(mfrow = c(1, length(cl)), bg = col[i])
  for (p in cl) {
    plot(read.pnm(paste(directory, "/", p, ".pgm", sep = "")), sub = p)
  }
}

# Calculating Dunn Index 
memb_single = cutree(hcSingle, 5)
dunn_single = dunn(clusters = memb_single, Data = dist_cl)
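As a quick sanity check on the choice of k = 5, the same Dunn index call can be repeated for a few other cut levels of the single-linkage tree; a small illustrative sketch reusing the objects above:

# Dunn index of the single-linkage tree for several cluster counts
sapply(2:5, function(k) dunn(clusters = cutree(hcSingle, k), Data = dist_cl))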

3.3.2 Method: Ward.D

# Method "Ward"
hcWard = hclust(dist_cl,method="ward.D")
hcWardd = as.dendrogram(hcWard)

# Plot hcd
plot(hcWardd, main = "Method Ward")
clward = as.data.frame(cutree(hcWardd, 5)) 

# Add cluster rectangles 
rect.dendrogram(hcWardd, k = 5, border = "red")

# Plotting pictures from the same cluster
for (i in 1:5) {
  cl = rownames(clward)[clward == i]
  par(mfrow = c(1, length(cl)), bg = col[i])
  for (p in cl) {
    plot(read.pnm(paste(directory, "/", p, ".pgm", sep = "")), sub = p)
  }
}

# Calculating Dunn Index
memb_ward = cutree(hcWard, 5)
dunn_ward = dunn(clusters = memb_ward, Data = dist_cl)

3.3.3 Method: Average

# Method "Average"
hcAverage = hclust(dist_cl,method="average")
hcAveraged = as.dendrogram(hcAverage)

# Plot hcd
plot(hcAveraged, main = "Method Average")
claverage = as.data.frame(cutree(hcAveraged, 5))

# Add cluster rectangles 
rect.dendrogram(hcAveraged, k = 5, border = "orange")

# Plotting pictures from the same cluster
for (i in 1:5) {
  cl = rownames(claverage)[claverage == i]
  par(mfrow = c(1, length(cl)), bg = col[i])
  for (p in cl) {
    plot(read.pnm(paste(directory, "/", p, ".pgm", sep = "")), sub = p)
  }
}

# Calculating Dunn Index
memb_average = cutree(hcAverage, 5)
dunn_average = dunn(clusters = memb_average, Data = dist_cl)

3.4 Conclusion

res = rbind(Single = dunn_single, Average = dunn_average, Ward = dunn_ward)
colnames(res) = "Dunn Index"
res
##         Dunn Index
## Single   0.8843090
## Average  0.8843090
## Ward     0.7727617

The Dunn index is the ratio of the minimum inter-cluster distance to the maximum intra-cluster diameter, so higher values indicate a cleaner split. In the table above it is higher for the Single and Average linkages, but based on the eyeball test of the clustered pictures, Ward.D gives us the best split.
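For the eyeball comparison it can help to draw the three dendrograms side by side; a minimal sketch reusing the dendrogram objects created above:

# Side-by-side dendrograms for a visual comparison of the three linkage methods
par(mfrow = c(1, 3), bg = 'gainsboro')
plot(hcSingled, main = "Method Single")
plot(hcWardd, main = "Method Ward")
plot(hcAveraged, main = "Method Average")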