Gemstones colours clustering and image dimension reduction

Introduction

The purpose of this paper is to find how accurately you can cluster the colors of gemstones. Results of the clustering were used to create an “average color” for each type of gemstone. Clustering was done using CLARA method. For this research Alexandrite and Amazonite photos were used. In the second part of this paper I will test how will the results differ if we apply dimension reduction on photos of gemstones and then try to cluster them again.

Part 1 Files and libraries upload

First of all we need to upload our gemstones images (2 of each) and load needed libraries

library(jpeg)
library(rasterImage)

## Loading required package: plotrix

library(cluster)

image1 <- readJPEG("Amazonite/amazonite_18.jpg") 
image2 <- readJPEG("Alexandrite/alexandrite_3.jpg")
image3 <- readJPEG("Amazonite/amazonite_3.jpg") 
image4 <- readJPEG("Alexandrite/alexandrite_18.jpg")

Dimensions and data frame

Checking the dimensions and converting our gemstones photos to data frames of pixels

dm1 <- dim(image1)
dm2 <- dim(image2)
dm3 <- dim(image3)
dm4 <- dim(image4)

dm1

## [1] 168 168   3

dm2

## [1] 295 295   3

dm3

## [1] 260 260   3

dm4

## [1] 250 250   3

Now we need to change the images to data frames in order to perform clustering on them.

rgbImage1 <- data.frame(
  x=rep(1:dm1[2], each=dm1[1]),
  y=rep(dm1[1]:1, dm1[2]),
  r.value=as.vector(image1[,,1]),
  g.value=as.vector(image1[,,2]),
  b.value=as.vector(image1[,,3]))


plot(y ~ x, data=rgbImage1, main="Amazonite 1",
     col = rgb(rgbImage1[c("r.value", "g.value", "b.value")]),
     asp = 1, pch = ".")

rgbImage2 <- data.frame(
  x=rep(1:dm2[2], each=dm2[1]),
  y=rep(dm2[1]:1, dm2[2]),
  r.value=as.vector(image2[,,1]),
  g.value=as.vector(image2[,,2]),
  b.value=as.vector(image2[,,3]))

plot(y ~ x, data=rgbImage2, main="Alexandrite 1",
     col = rgb(rgbImage2[c("r.value", "g.value", "b.value")]),
     asp = 1, pch = ".")

rgbImage3 <- data.frame(
  x=rep(1:dm3[2], each=dm3[1]),
  y=rep(dm3[1]:1, dm3[2]),
  r.value=as.vector(image3[,,1]),
  g.value=as.vector(image3[,,2]),
  b.value=as.vector(image3[,,3]))


plot(y ~ x, data=rgbImage3, main="Amazonite 2",
     col = rgb(rgbImage3[c("r.value", "g.value", "b.value")]),
     asp = 1, pch = ".")

rgbImage4 <- data.frame(
  x=rep(1:dm4[2], each=dm4[1]),
  y=rep(dm4[1]:1, dm4[2]),
  r.value=as.vector(image4[,,1]),
  g.value=as.vector(image4[,,2]),
  b.value=as.vector(image4[,,3]))

plot(y ~ x, data=rgbImage4, main="Alexandrite 2",
     col = rgb(rgbImage4[c("r.value", "g.value", "b.value")]),
     asp = 1, pch = ".")

Optimal number of cluster using silhouette

Using silhouette we need to check what is the optimal no. of clusters. It checks the similarity of the object to the cluster it is assigned in comparison to other clusters. https://en.wikipedia.org/wiki/Silhouette_(clustering)

For Amazonite 1 looking at the graph it is clear that 2 clusters should be chosen

Same situation with Alexandrite 1 as average silhouette is about 0.9

As the purpose of the paper is to retreive true color of the gemstone, instead of picking 3 clusters, which as indicated on the graph has the highest average silhouette, I am going to pick 2 clusters.

For Alexandrite 2 situation is clear, so I pick 2 clusters as suggested by average silhouette.

CLARA

I am using CLARA to cluster the gemstones.

normal = rgbImage1[, c("r.value", "g.value", "b.value")]
clara1 <- clara(normal,2)
plot(silhouette(clara1))

colours1 <- rgb(clara1$medoids[clara1$clustering, ])
plot(y ~ x, data=rgbImage1, main="Amazonite 1",
     col = colours1, 
     asp = 1, pch = ".")

normal2 = rgbImage2[, c("r.value", "g.value", "b.value")]
clara2 <- clara(normal2,2)
plot(silhouette(clara2))

colours2 <- rgb(clara2$medoids[clara2$clustering, ])
plot(y ~ x, data=rgbImage2, main="Alexandrite 1",
     col = colours2, 
     asp = 1, pch = ".")

normal = rgbImage3[, c("r.value", "g.value", "b.value")]
clara3 <- clara(normal,2)
plot(silhouette(clara3))

colours3 <- rgb(clara3$medoids[clara3$clustering, ])
plot(y ~ x, data=rgbImage3, main="Amazonite 2",
     col = colours3, 
     asp = 1, pch = ".")

normal4 = rgbImage4[, c("r.value", "g.value", "b.value")]
clara4 <- clara(normal4,2)
plot(silhouette(clara4))

colours4 <- rgb(clara4$medoids[clara4$clustering, ])
plot(y ~ x, data=rgbImage4, main="Alexandrite 2",
     col = colours4, 
     asp = 1, pch = ".")

Colours of the gemstones

We move on to the main goal of the paper - to be able to distinguish an average colour for the specific gemstone. To do that I will take RGB values for clustered colors for every gemstone. Next I will get an average value for Red, Green and Blue.

Amazonites colours:

unique(colours1)

## [1] "#242424" "#8AA191"

rgb_a1 <- col2rgb(unique(colours1)[2])
rgb_a1

##       [,1]
## red    138
## green  161
## blue   145

unique(colours3)

## [1] "#D3CFD0" "#669396"

rgb_a2 <- col2rgb(unique(colours3)[2])
rgb_a2

##       [,1]
## red    102
## green  147
## blue   150

Average colour for Amazonites:

red1<- mean(c(rgb_a1[1,1], rgb_a2[1,1]))
red1

## [1] 120

green1<- mean(c(rgb_a1[2,1], rgb_a2[2,1]))
green1

## [1] 154

blue1<- round(mean(c(rgb_a1[3,1], rgb_a2[3,1])))
blue1

## [1] 148

## [1] "#789a94"

Alexandrite colours:

unique(colours1)

## [1] "#242424" "#8AA191"

rgb_b1 <- col2rgb(unique(colours2)[2])
rgb_b1

##       [,1]
## red     22
## green   88
## blue    86

unique(colours3)

## [1] "#D3CFD0" "#669396"

rgb_b2 <- col2rgb(unique(colours4)[2])
rgb_b2

##       [,1]
## red     66
## green   89
## blue    95

Average colour for a Alexandrite:

red2<- mean(c(rgb_b1[1,1], rgb_b2[1,1]))
red2

## [1] 44

green2<- round(mean(c(rgb_b1[2,1], rgb_b2[2,1])))
green2

## [1] 88

blue2<- round(mean(c(rgb_b1[3,1], rgb_b2[3,1])))
blue2

## [1] 90

## Results

Using clustering of an image we can distinguish specific colors. As shown in the analysis above we can obtain an average color of specific thing (here: gemstones). It can open path to other analysis such as assigning or classifying objects depending on color.

Part 2 - image dimension reduction

In this part I will try to find out how dimensional reduction methods affect silhouette in the case of image clustering. To test this i will use one of the gemstones photos from previous part.

Libraries and image

library(jpeg)
library(factoextra)

## Loading required package: ggplot2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(magick)

## Linking to ImageMagick 6.9.12.3
## Enabled features: cairo, fontconfig, freetype, heic, lcms, pango, raw, rsvg, webp
## Disabled features: fftw, ghostscript, x11

library(gridExtra)
library(ggplot2)
library(abind)

Stone <- readJPEG("Alexandrite/alexandrite_3.jpg")

plot(1, type = "n")

rasterImage(Stone, 0.6, 0.6, 1.4, 1.4)

Stone.rgb <- Stone[,,1] + Stone[,,2] + Stone[,,3]
Stone.bw <- Stone.rgb/max(Stone.rgb)

Black and white image

plot(1, type="n")   # plotting the rasterImage – black & white photo
rasterImage(Stone.bw, 0.6, 0.6, 1.4, 1.4)

PCA on each shade

On the graphs presented below we can see how much of variance specific eigen vector explains. This PCA is run for Red, Green and Blue shade separately

r<-Stone[,,1]   # individual matrix of R colour component
g<-Stone[,,2]
b<-Stone[,,3]

r.pca<-prcomp(r, center=FALSE, scale.=FALSE)    # PCA for R colour component
g.pca<-prcomp(g, center=FALSE, scale.=FALSE)
b.pca<-prcomp(b, center=FALSE, scale.=FALSE)
rgb.pca<-list(r.pca, g.pca, b.pca)  # merging all PCA into one object

f1<-fviz_eig(r.pca, main="Red", barfill="red", ncp=5, addlabels=TRUE)
f2<-fviz_eig(g.pca, main="Green", barfill="green", ncp=5, addlabels=TRUE)
f3<-fviz_eig(b.pca, main="Blue", barfill="blue", ncp=5, addlabels=TRUE)
grid.arrange(f1, f2, f3, ncol=3)

pp=10 # how many principal components should be included
Stone.pca2<-abind(r.pca$x[,1:pp] %*% t(r.pca$rotation[,1:pp]),
                  g.pca$x[,1:pp] %*% t(g.pca$rotation[,1:pp]),
                  b.pca$x[,1:pp] %*% t(b.pca$rotation[,1:pp]),
                  along=3)

For Red shade 1st eigen vector explains 85.7% of variance while 2nd 8.3%. For Green and Blue only first eigen vectors are above 5% threshold.

Photo sizes and compression ratio

vec<-seq.int(3, round(nrow(Stone)), length.out=9)
for(i in vec){
Stone.pca<-sapply(rgb.pca, function(j) {
    new.RGB<-j$x[,1:i] %*% t(j$rotation[,1:i])}, simplify="array")
assign(paste("photo_", round(i,0), sep=""), Stone.pca) # saving as object
writeJPEG(Stone.pca, paste("photo_", round(i,0), "_princ_comp.jpg", sep=""))
}

par(mfrow=c(3,3)) 
par(mar=c(1,1,1,1))
plot(image_read(get(paste("photo_", round(vec[1],0), sep=""))))
plot(image_read(get(paste("photo_", round(vec[2],0), sep=""))))
plot(image_read(get(paste("photo_", round(vec[3],0), sep=""))))
plot(image_read(get(paste("photo_", round(vec[4],0), sep=""))))
plot(image_read(get(paste("photo_", round(vec[5],0), sep=""))))
plot(image_read(get(paste("photo_", round(vec[6],0), sep=""))))
plot(image_read(get(paste("photo_", round(vec[7],0), sep=""))))
plot(image_read(get(paste("photo_", round(vec[8],0), sep=""))))
plot(image_read(get(paste("photo_", round(vec[9],0), sep=""))))

library(Metrics)

sizes<-matrix(0, nrow=9, ncol=4)
colnames(sizes)<-c("Number of PC", "Photo size", "Compression ratio", "MSE-Mean Squared Error")
sizes[,1]<-round(vec,0)
for(i in 1:9) {
path<-paste("photo_", round(vec[i],0), "_princ_comp.jpg", sep="")
sizes[i,2]<-file.info(path)$size 
photo_mse<-readJPEG(path)
sizes[i,4]<-mse(Stone, photo_mse) # from Metrics::
}
sizes[,3]<-round(as.numeric(sizes[,2])/as.numeric(sizes[9,2]),3)
sizes

##       Number of PC Photo size Compression ratio MSE-Mean Squared Error
##  [1,]            3      13417             1.236           2.137120e-01
##  [2,]           40      26442             2.435           7.528819e-02
##  [3,]           76      23444             2.159           4.535206e-02
##  [4,]          112      21264             1.958           2.648520e-02
##  [5,]          149      15076             1.388           3.378838e-03
##  [6,]          186      10895             1.003           7.639558e-05
##  [7,]          222      10858             1.000           4.667461e-05
##  [8,]          258      10858             1.000           4.667461e-05
##  [9,]          295      10858             1.000           4.667461e-05

Above we can see Photo size and Compression ratio for 9 different amounts of Principal Components. As we can see the compression ratio is the biggest for PC = 40.

Silhouette changes

Main goal of this part was to analyse how dimension reduction using PCA can affect silhouette. Below can be found a plot presenting average silhouette depending on the number of Principal Components.

par(mfrow=c(3,3))

imagePC3 <- readJPEG("photo_3_princ_comp.jpg")


dm5 <- dim(imagePC3)


rgbImagePC3 <- data.frame(
  x=rep(1:dm5[2], each=dm5[1]),
  y=rep(dm5[1]:1, dm5[2]),
  r.value=as.vector(imagePC3[,,1]),
  g.value=as.vector(imagePC3[,,2]),
  b.value=as.vector(imagePC3[,,3]))


#optimal number of clusters 
library(cluster)
n1<-c()                     # empty vector to save results
for (i in 1:10) {               # numer of clusters to consider
  cl1<-clara(rgbImagePC3[, c("r.value", "g.value", "b.value")], i)  
  n1[i]<-cl1$silinfo$avg.width      # saving silhouette to vector
}

plot(n1,
     main = "Alexandrite PC(3)",
     xlab = 'No. of clusters',
     ylab = 'Average silhouette')

#-------

imagePC40 <- readJPEG("photo_40_princ_comp.jpg")


dm6 <- dim(imagePC40)


rgbImagePC40 <- data.frame(
  x=rep(1:dm6[2], each=dm6[1]),
  y=rep(dm6[1]:1, dm6[2]),
  r.value=as.vector(imagePC40[,,1]),
  g.value=as.vector(imagePC40[,,2]),
  b.value=as.vector(imagePC40[,,3]))


#optimal number of clusters 
library(cluster)
n2<-c()                     # empty vector to save results
for (i in 1:10) {               # numer of clusters to consider
  cl2<-clara(rgbImagePC40[, c("r.value", "g.value", "b.value")], i) 
  n2[i]<-cl2$silinfo$avg.width      # saving silhouette to vector
}

plot(n2,
     main = "Alexandrite PC(40)",
     xlab = 'No. of clusters',
     ylab = 'Average silhouette')

#----------

imagePC76 <- readJPEG("photo_76_princ_comp.jpg")


dm7 <- dim(imagePC3)


rgbImagePC76 <- data.frame(
  x=rep(1:dm7[2], each=dm7[1]),
  y=rep(dm7[1]:1, dm7[2]),
  r.value=as.vector(imagePC76[,,1]),
  g.value=as.vector(imagePC76[,,2]),
  b.value=as.vector(imagePC76[,,3]))


#optimal number of clusters 
library(cluster)
n3<-c()                     # empty vector to save results
for (i in 1:10) {               # numer of clusters to consider
  cl3<-clara(rgbImagePC76[, c("r.value", "g.value", "b.value")], i) 
  n3[i]<-cl3$silinfo$avg.width      # saving silhouette to vector
}

plot(n3,
     main = "Alexandrite PC(76)",
     xlab = 'No. of clusters',
     ylab = 'Average silhouette')

#--------

imagePC112 <- readJPEG("photo_112_princ_comp.jpg")


dm8 <- dim(imagePC112)


rgbImagePC112 <- data.frame(
  x=rep(1:dm8[2], each=dm8[1]),
  y=rep(dm8[1]:1, dm8[2]),
  r.value=as.vector(imagePC112[,,1]),
  g.value=as.vector(imagePC112[,,2]),
  b.value=as.vector(imagePC112[,,3]))


#optimal number of clusters 
library(cluster)
n4<-c()                     # empty vector to save results
for (i in 1:10) {               # numer of clusters to consider
  cl4<-clara(rgbImagePC112[, c("r.value", "g.value", "b.value")], i)    
  n4[i]<-cl4$silinfo$avg.width      # saving silhouette to vector
}

plot(n4,
     main = "Alexandrite PC(112)",
     xlab = 'No. of clusters',
     ylab = 'Average silhouette')

#-------
imagePC149 <- readJPEG("photo_149_princ_comp.jpg")


dm9 <- dim(imagePC149)


rgbImagePC149 <- data.frame(
  x=rep(1:dm9[2], each=dm9[1]),
  y=rep(dm9[1]:1, dm9[2]),
  r.value=as.vector(imagePC149[,,1]),
  g.value=as.vector(imagePC149[,,2]),
  b.value=as.vector(imagePC149[,,3]))


#optimal number of clusters 
library(cluster)
n5<-c()                     # empty vector to save results
for (i in 1:10) {               # numer of clusters to consider
  cl5<-clara(rgbImagePC149[, c("r.value", "g.value", "b.value")], i)    
  n5[i]<-cl5$silinfo$avg.width      # saving silhouette to vector
}

plot(n5,
     main = "Alexandrite PC149",
     xlab = 'No. of clusters',
     ylab = 'Average silhouette')


#-----
imagePC186 <- readJPEG("photo_186_princ_comp.jpg")


dm10 <- dim(imagePC186)


rgbImagePC186 <- data.frame(
  x=rep(1:dm10[2], each=dm10[1]),
  y=rep(dm10[1]:1, dm10[2]),
  r.value=as.vector(imagePC186[,,1]),
  g.value=as.vector(imagePC186[,,2]),
  b.value=as.vector(imagePC186[,,3]))


#optimal number of clusters 
library(cluster)
n6<-c()                     # empty vector to save results
for (i in 1:10) {               # numer of clusters to consider
  cl6<-clara(rgbImagePC186[, c("r.value", "g.value", "b.value")], i)    
  n6[i]<-cl6$silinfo$avg.width      # saving silhouette to vector
}

plot(n6,
     main = "Alexandrite PC(186)",
     xlab = 'No. of clusters',
     ylab = 'Average silhouette')

#------
imagePC222 <- readJPEG("photo_222_princ_comp.jpg")


dm11 <- dim(imagePC222)


rgbImagePC222 <- data.frame(
  x=rep(1:dm11[2], each=dm11[1]),
  y=rep(dm11[1]:1, dm11[2]),
  r.value=as.vector(imagePC222[,,1]),
  g.value=as.vector(imagePC222[,,2]),
  b.value=as.vector(imagePC222[,,3]))


#optimal number of clusters 
library(cluster)
n6<-c()                     # empty vector to save results
for (i in 1:10) {               # numer of clusters to consider
  cl6<-clara(rgbImagePC222[, c("r.value", "g.value", "b.value")], i)    
  n6[i]<-cl6$silinfo$avg.width      # saving silhouette to vector
}

plot(n6,
     main = "Alexandrite PC(222)",
     xlab = 'No. of clusters',
     ylab = 'Average silhouette')


#-----
imagePC258 <- readJPEG("photo_258_princ_comp.jpg")


dm12 <- dim(imagePC258)


rgbImagePC258 <- data.frame(
  x=rep(1:dm12[2], each=dm12[1]),
  y=rep(dm12[1]:1, dm12[2]),
  r.value=as.vector(imagePC258[,,1]),
  g.value=as.vector(imagePC258[,,2]),
  b.value=as.vector(imagePC258[,,3]))


#optimal number of clusters 
library(cluster)
n7<-c()                     # empty vector to save results
for (i in 1:10) {               # numer of clusters to consider
  cl7<-clara(rgbImagePC258[, c("r.value", "g.value", "b.value")], i)    
  n7[i]<-cl7$silinfo$avg.width      # saving silhouette to vector
}

plot(n7,
     main = "Alexandrite PC(258)",
     xlab = 'No. of clusters',
     ylab = 'Average silhouette')

#----
imagePC295 <- readJPEG("photo_295_princ_comp.jpg")


dm12 <- dim(imagePC295)


rgbImagePC295 <- data.frame(
  x=rep(1:dm12[2], each=dm12[1]),
  y=rep(dm12[1]:1, dm12[2]),
  r.value=as.vector(imagePC295[,,1]),
  g.value=as.vector(imagePC295[,,2]),
  b.value=as.vector(imagePC295[,,3]))


#optimal number of clusters 
library(cluster)
n8<-c()                     # empty vector to save results
for (i in 1:10) {               # numer of clusters to consider
  cl8<-clara(rgbImagePC295[, c("r.value", "g.value", "b.value")], i)    
  n8[i]<-cl8$silinfo$avg.width      # saving silhouette to vector
}

plot(n8,
     main = "Alexandrite PC(295)",
     xlab = 'No. of clusters',
     ylab = 'Average silhouette')

Results

As presented on the graphs for PC = 3 silhouette differ visibly compared to regular photo (this graphs show optimal no. of clusters = 10, while for regular photo optimal no. of clusters = 2). Quite surprisingly for PC = 40 optimal no. of clusters = 2. For PC > 112 with PC increase the average silhouette graph gets more similar to the silhouette of the regular photo.