# Collect the full path of every JPEG below the dataset root, descending into
# sub-folders.
euroSAT <- list.files(
  path = "EuroSAT_RGB",
  pattern = "\\.jpg$",
  recursive = TRUE,
  full.names = TRUE
)
# Seed the RNG so that the same 2000 files are drawn on every run.
set.seed(123)
sample_files <- sample(euroSAT, size = 2000)
# Read one JPEG and return its pixel intensities as a flat numeric vector.
#
# filepath: path to the JPEG file, passed straight to readJPEG().
#
# A three-dimensional result (rows x columns x channels) is reduced to a
# single grayscale plane by averaging its first three channels; any other
# result is flattened unchanged.
process_image <- function(filepath) {
  pixels <- readJPEG(filepath)
  has_channels <- length(dim(pixels)) == 3
  if (!has_channels) {
    return(as.vector(pixels))
  }
  # Unweighted mean of channels 1-3
  gray <- (pixels[, , 1] + pixels[, , 2] + pixels[, , 3]) / 3
  as.vector(gray)
}
# Build the feature matrix: one row per sampled image, one column per pixel.
#
# vapply() is used instead of sapply() so the result shape is guaranteed: every
# image must yield exactly `n_pixels` values, otherwise vapply() stops with an
# explicit length error instead of silently returning a list that t() cannot
# handle. The expected length is taken from the first sampled image.
n_pixels <- length(process_image(sample_files[1]))
image_matrix <- t(vapply(sample_files, process_image, numeric(n_pixels)))
df <- as.data.frame(image_matrix)
# The class label is the name of the folder that directly contains each file.
true_labels <- basename(dirname(sample_files))
table(true_labels)
## true_labels
## AnnualCrop Forest HerbaceousVegetation
## 210 228 217
## Highway Industrial Pasture
## 207 152 159
## PermanentCrop Residential River
## 181 232 172
## SeaLake
## 242
# Centre each pixel column on its mean and divide by its standard deviation;
# scale() returns a numeric matrix.
df_scaled <- scale(df, center = TRUE, scale = TRUE)
dim(df_scaled)
## [1] 2000 4096
# Hopkins statistic computed by factoextra from n = 50 sampled points; the
# accompanying diagnostic plot is switched off.
get_clust_tendency(data = df_scaled, n = 50, graph = FALSE)$hopkins_stat
## [1] 0.8141851
# Choosing the number of clusters. Both diagnostics run CLARA on the first
# 500 rows of the scaled matrix only (presumably to keep the run time
# manageable -- confirm with the author).

# Silhouette criterion (how well separated the clusters are)
fviz_nbclust(
  df_scaled[seq_len(500), ],
  FUNcluster = clara,
  method = "silhouette"
) +
  labs(subtitle = "Silhouette Method")

# Elbow criterion on the within-cluster sum of squares (how compact they are)
fviz_nbclust(
  df_scaled[seq_len(500), ],
  FUNcluster = clara,
  method = "wss"
) +
  labs(subtitle = "Elbow Method")
# Partition all rows of the scaled matrix into two clusters with CLARA;
# eclust()'s own plot is suppressed because the plots are drawn explicitly
# below.
clara_flex <- eclust(
  df_scaled,
  FUNcluster = "clara",
  k = 2,
  graph = FALSE
)
# Cluster plot drawn with points only (no text labels).
fviz_cluster(clara_flex, geom = "point")
# Silhouette plot; also prints the per-cluster size and average width.
fviz_silhouette(clara_flex)
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## cluster size ave.sil.width
## 1 1 1035 0.12
## 2 2 965 0.59
## Visualizing the clusters
# Fill colour used for each land-use class in the purity plot.
land_use_colors <- c(
  Forest = "forestgreen",
  SeaLake = "dodgerblue3",
  Industrial = "firebrick",
  Highway = "darkgrey",
  River = "blue",
  Pasture = "lawngreen",
  Residential = "purple",
  AnnualCrop = "gold",
  HerbaceousVegetation = "yellowgreen",
  PermanentCrop = "chartreuse4"
)

# Cross-tabulate cluster assignment against the folder-derived label, then
# reshape to long form (columns Cluster, Truth, Freq) for plotting.
conf_mat <- table(Cluster = clara_flex$clustering, Truth = true_labels)
conf_df <- as.data.frame(conf_mat)

# One bar per cluster, stacked and normalised to 100% so the bars show the
# label composition of each cluster rather than its size.
ggplot(conf_df, aes(x = factor(Cluster), y = Freq, fill = Truth)) +
  geom_col(position = "fill") +
  scale_fill_manual(values = land_use_colors) +
  scale_y_continuous(labels = scales::percent) +
  labs(
    title = "Cluster Purity Analysis",
    x = "Cluster ID",
    y = "Proportion of Land Use Types",
    fill = "True Label"
  ) +
  theme_minimal()