In this project we will explore how clustering works on images. I selected paintings that are representative of different art periods. Clustering will reveal which art styles are the easiest for the computer to analyze.
library(jpeg)
library(rasterImage)
library(imager)
library(cluster)
library(factoextra)
library(flexclust)
library(fpc)
library(clustertend)
library(cluster)
library(ClusterR)
library(tidyverse)
library(scatterplot3d)
library(rgl)
library(knitr)
# JPEG files of the paintings to analyze, in presentation order.
names <- c(
  "mona_lisa.jpg",
  "girl_with_pearl_earring.jpg",
  "wanderer.jpg",
  "the_fighting_temaraire.jpg",
  "starry_night.jpg",
  "marilyn_monroe.jpg",
  "fangor_2.jpg",
  "fangor_3.jpg"
)
The paintings we will analyze
# Draw every painting listed in `names` on a 2 x 4 grid, without axes.
# par() returns the settings it replaces; they are restored after the loop so
# the grid layout does not carry over to plots drawn later in the script.
old_par <- par(mfrow = c(2, 4))
for (name in names) {
  image <- load.image(name)
  plot(image, axes = FALSE)
}
par(old_par)
We need to convert each JPEG file into a table of pixels with three color channels - red, green and blue - one value per channel for every pixel. We define a function that converts a painting to the format appropriate for the clustering algorithms.
# Read a JPEG file and flatten it into one row per pixel.
#
# Args:
#   painting: path to a JPEG file, passed to readJPEG().
#
# Returns:
#   A data frame with the columns `x` (image column, counted from the left),
#   `y` (image row, flipped so that the top row gets the largest value) and
#   the `red`, `green` and `blue` channel values of each pixel.
rgb_modifier <- function(painting) {
  image <- readJPEG(painting)

  # A single-channel image is returned as a plain matrix; repeat it across
  # three channels so the indexing below also works for grayscale files.
  if (length(dim(image)) == 2) {
    image <- array(image, dim = c(dim(image), 3))
  }

  height <- dim(image)[1]
  width <- dim(image)[2]

  # as.vector() walks each channel column by column, so `x` varies slowest
  # and `y` cycles through the rows once per image column.
  data.frame(
    x = rep(seq_len(width), each = height),
    y = rep(rev(seq_len(height)), times = width),
    red = as.vector(image[, , 1]),
    green = as.vector(image[, , 2]),
    blue = as.vector(image[, , 3])
  )
}
Converting the images to RGB format and putting them together in a list.
# Convert every painting into its per-pixel RGB table in one pass ...
rgb_images <- lapply(
  c(
    "mona_lisa.jpg",
    "girl_with_pearl_earring.jpg",
    "wanderer.jpg",
    "the_fighting_temaraire.jpg",
    "starry_night.jpg",
    "marilyn_monroe.jpg",
    "fangor_2.jpg",
    "fangor_3.jpg"
  ),
  rgb_modifier
)

# ... and keep each table reachable under its own name as well.
mona_lisa <- rgb_images[[1]]
girl_with_pearl_earring <- rgb_images[[2]]
wanderer_above_sea_of_clouds <- rgb_images[[3]]
the_fighting_temaraire <- rgb_images[[4]]
the_starry_night <- rgb_images[[5]]
marilyn_monroe <- rgb_images[[6]]
fangor_1 <- rgb_images[[7]]
fangor_2 <- rgb_images[[8]]
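Now we define a function that runs the CLARA clustering algorithm on a painting for each candidate number of clusters (up to 10) and returns the average silhouette width obtained for each.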
# Score candidate cluster counts for one painting.
#
# Runs clara() on the red/green/blue columns for every k from 2 to `max_k`
# and records the average silhouette width of each clustering.
#
# Args:
#   painting: data frame with `red`, `green` and `blue` columns.
#   max_k: largest number of clusters to try (default 10, at least 2).
#
# Returns:
#   A numeric vector of length `max_k` whose k-th element is the average
#   silhouette width for k clusters. Element 1 is NA: a silhouette needs at
#   least two clusters, so clara() is not run for k = 1.
optimizer <- function(painting, max_k = 10) {
  stopifnot(is.numeric(max_k), length(max_k) == 1, max_k >= 2)

  pixels <- painting[, c("red", "green", "blue")]
  widths <- rep(NA_real_, max_k)
  for (k in 2:max_k) {
    widths[k] <- clara(pixels, k)$silinfo$avg.width
  }
  widths
}
Performing clustering on each painting
# Score every painting with optimizer(), in presentation order ...
silhouette_profiles <- lapply(
  list(
    mona_lisa,
    girl_with_pearl_earring,
    wanderer_above_sea_of_clouds,
    the_fighting_temaraire,
    the_starry_night,
    marilyn_monroe,
    fangor_1,
    fangor_2
  ),
  optimizer
)

# ... and expose each result under the painting's own name.
mona_lisa_clusters <- silhouette_profiles[[1]]
girl_with_pearl_earring_clusters <- silhouette_profiles[[2]]
wanderer_above_sea_of_clouds_clusters <- silhouette_profiles[[3]]
the_fighting_temaraire_clusters <- silhouette_profiles[[4]]
the_starry_night_clusters <- silhouette_profiles[[5]]
marilyn_monroe_clusters <- silhouette_profiles[[6]]
fangor_1_clusters <- silhouette_profiles[[7]]
fangor_2_clusters <- silhouette_profiles[[8]]
We derive the names of the paintings to use as plot titles, and group the per-painting silhouette scores in a list.
# Plot titles: the file names without their extension. Using
# file_path_sans_ext() instead of cutting a fixed four characters keeps this
# correct for extensions of any length (e.g. ".jpeg").
short_names <- tools::file_path_sans_ext(names)

# Silhouette scores of all paintings, in the same order as `names`.
clusters <- list(
  mona_lisa_clusters,
  girl_with_pearl_earring_clusters,
  wanderer_above_sea_of_clouds_clusters,
  the_fighting_temaraire_clusters,
  the_starry_night_clusters,
  marilyn_monroe_clusters,
  fangor_1_clusters,
  fangor_2_clusters
)
We want to find the number of clusters optimal for each painting. For example, for mona_lisa we can see that with two clusters, we score the highest average silhouette width, so we go with 2.
# Plot the average silhouette width against the number of clusters for each
# painting and remember the cluster count with the highest score.
optimal_cluster_number <- integer(length(clusters))
for (i in seq_along(clusters)) {
  cluster <- clusters[[i]]
  # Explicit axis labels: the defaults ("Index" and the variable name) do not
  # say what is being plotted.
  plot(cluster, type = "l", main = short_names[i], col = "navyblue",
       xlab = "number of clusters", ylab = "average silhouette width")
  # Horizontal guide lines every 0.05.
  abline(h = (1:20) / 20, lty = 1, col = "grey")
  points(cluster, pch = 21, bg = "red")
  # which.max() skips missing scores.
  optimal_cluster_number[i] <- which.max(cluster)
}
Now we display the silhouette plot of each painting, clustered with its optimal number of clusters.
# One silhouette plot per painting, clustered with its best-scoring number
# of clusters and titled with the painting's short name.
for (idx in seq_along(rgb_images)) {
  # Columns 3 to 5 of the pixel table are the inputs to clara().
  channel_values <- rgb_images[[idx]][, 3:5]
  fit <- clara(channel_values, optimal_cluster_number[idx])
  plot(silhouette(fit), main = short_names[idx])
}
Finally, we can present the paintings using the optimal number of clusters, and we can quickly draw some conclusions based on the results. Let’s take a look at a more modern-era painting, Marilyn Monroe by Andy Warhol. We used 5 clusters, and the painting is practically indistinguishable from the original. But if we look at the impressionistic Fighting Temeraire by William Turner, we see a much more degraded image, which makes sense since the painting has an average silhouette score of only 0.46. We can conclude that, for clustering, well-defined and clearly outlined colors work better than gradual color transitions. Interestingly, the algorithm has practically ignored the gradual transitions in Fangor’s circular paintings, which are the essence of the paintings and what makes them great to look at.
# Redraw each painting with every pixel replaced by the colour of its
# cluster medoid, and collect the medoid colours of each painting.
dominant_colors <- vector("list", length(rgb_images))
for (idx in seq_along(rgb_images)) {
  painting <- rgb_images[[idx]]
  fit <- clara(painting[, 3:5], optimal_cluster_number[idx])

  # Look up the medoid of each pixel's cluster and turn it into a colour.
  pixel_colors <- rgb(fit$medoids[fit$clustering, ])
  plot(painting$y ~ painting$x, col = pixel_colors, asp = 1)

  dominant_colors[[idx]] <- rgb(fit$medoids)
}
We can display which color represents each cluster and how prevalent each color is in the painting.
# One pie chart per painting: each slice is a medoid colour, sized by the
# share of pixels assigned to it.
for (idx in seq_along(rgb_images)) {
  fit <- clara(rgb_images[[idx]][, 3:5], optimal_cluster_number[idx])

  # Count the pixels per medoid colour; the table is keyed by colour string.
  pixel_counts <- table(rgb(fit$medoids[fit$clustering, ]))
  # as.vector() drops the colour names so the slices keep their default labels.
  shares <- as.vector(pixel_counts) / sum(pixel_counts) * 100

  pie(shares, col = names(pixel_counts))
}
We can also display the colors on a 3d graph with three dimensions - red, green and blue, and see where the colors that make up our paintings are located
# Place the medoid colours of each painting in red/green/blue space, one
# 3D scatter plot per painting.
for (idx in seq_along(rgb_images)) {
  fit <- clara(rgb_images[[idx]][, 3:5], optimal_cluster_number[idx])

  medoid_hex <- rgb(fit$medoids)
  # col2rgb() yields one column per colour with rows red/green/blue;
  # transpose so that each medoid becomes a row.
  medoid_rgb <- data.frame(t(col2rgb(medoid_hex)))

  plot3d(
    x = medoid_rgb$red, y = medoid_rgb$green, z = medoid_rgb$blue,
    type = "s",
    xlab = "red", ylab = "green", zlab = "blue",
    col = medoid_hex,
    main = short_names[idx]
  )

  # Pause five seconds between paintings (presumably so each scene can be
  # inspected before the next one is drawn).
  Sys.sleep(5)
}