EuroSAT Dataset

# Fix the RNG seed so that subsequent random draws are reproducible.
set.seed(123)

# Recursively collect the full path of every file ending in ".jpg" that lives
# under the EuroSAT_RGB directory.
euroSAT <- list.files(
  path = "EuroSAT_RGB",
  pattern = "\\.jpg$",
  recursive = TRUE,
  full.names = TRUE
)

Sampling and processing the images

# Draw 2000 image paths at random, without replacement.
sample_files <- sample(x = euroSAT, size = 2000, replace = FALSE)

# Read one JPEG file and return its pixels as a flat numeric vector.
#
# filepath: path of the JPEG file to read.
#
# A three-dimensional pixel array is collapsed to a single grayscale plane by
# averaging its first three channels; a two-dimensional result is used as is.
# The plane is then flattened column by column with as.vector().
process_image <- function(filepath) {
  pixels <- readJPEG(filepath)

  has_channels <- length(dim(pixels)) == 3
  if (has_channels) {
    # Unweighted mean of the first three colour planes.
    red <- pixels[, , 1]
    green <- pixels[, , 2]
    blue <- pixels[, , 3]
    pixels <- (red + green + blue) / 3
  }

  as.vector(pixels)
}
# Every image must flatten to the same number of pixels. Take that length from
# the first image so vapply() can enforce it: unlike sapply(), which silently
# returns a list when result lengths differ (and then fails obscurely inside
# t()), vapply() stops with a clear message naming the offending length.
n_pixels <- length(process_image(sample_files[1]))

# One row per image (row names are the file paths), one column per pixel.
image_matrix <- t(vapply(sample_files, process_image, numeric(n_pixels)))
df <- as.data.frame(image_matrix)

# The class label is the name of the folder that directly contains each image;
# basename(dirname()) avoids hard-coding the path separator.
true_labels <- basename(dirname(sample_files))

# Number of sampled images per class.
table(true_labels)
## true_labels
##           AnnualCrop               Forest HerbaceousVegetation 
##                  210                  228                  217 
##              Highway           Industrial              Pasture 
##                  207                  152                  159 
##        PermanentCrop          Residential                River 
##                  181                  232                  172 
##              SeaLake 
##                  242
# Centre and scale every pixel column, then report the dimensions of the
# resulting matrix.
df_scaled <- scale(df, center = TRUE, scale = TRUE)
dim(df_scaled)
## [1] 2000 4096

Assessing clusterability

# Hopkins statistic of the scaled pixel matrix, with n = 50 and the
# accompanying plot switched off.
get_clust_tendency(
  data = df_scaled,
  n = 50,
  graph = FALSE
)[["hopkins_stat"]]
## [1] 0.8141851

CLARA clustering

# Both diagnostics below are run on the first 500 rows of the scaled matrix.

# Method 1: Silhouette (Separation)
fviz_nbclust(
  x = df_scaled[seq_len(500), ],
  FUNcluster = clara,
  method = "silhouette"
) +
  labs(subtitle = "Silhouette Method")

# Method 2: Elbow Method (Compactness)
fviz_nbclust(
  x = df_scaled[seq_len(500), ],
  FUNcluster = clara,
  method = "wss"
) +
  labs(subtitle = "Elbow Method")

# Fit CLARA with k = 2 on the full scaled matrix, without the automatic plot.
clara_flex <- eclust(
  x = df_scaled,
  FUNcluster = "clara",
  k = 2,
  graph = FALSE
)

# Plot the cluster assignments, drawing observations as points only.
fviz_cluster(object = clara_flex, geom = "point")

# Silhouette plot for the fitted clustering.
fviz_silhouette(sil.obj = clara_flex)
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
##   cluster size ave.sil.width
## 1       1 1035          0.12
## 2       2  965          0.59

## Visualizing the clusters

# Cross-tabulate the cluster assignments against the known land-use labels,
# then convert to long form (columns Cluster, Truth, Freq) for plotting.
conf_mat <- table(Cluster = clara_flex$clustering, Truth = true_labels)
conf_df <- as.data.frame(conf_mat)

# One fixed fill colour per land-use class.
land_use_colors <- c(
  Forest = "forestgreen",
  SeaLake = "dodgerblue3",
  Industrial = "firebrick",
  Highway = "darkgrey",
  River = "blue",
  Pasture = "lawngreen",
  Residential = "purple",
  AnnualCrop = "gold",
  HerbaceousVegetation = "yellowgreen",
  PermanentCrop = "chartreuse4"
)

# Stacked bars normalised to 100 % per cluster: each bar shows the share of
# every true land-use class inside that cluster.
ggplot(conf_df, aes(x = factor(Cluster), y = Freq, fill = Truth)) +
  geom_col(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  scale_fill_manual(values = land_use_colors) +
  labs(
    title = "Cluster Purity Analysis",
    x = "Cluster ID",
    y = "Proportion of Land Use Types",
    fill = "True Label"
  ) +
  theme_minimal()