Optimized Clustering Analysis with Selected Variables

Load Required Libraries

library(readxl)

## Warning: package 'readxl' was built under R version 4.4.2

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(cluster)
library(factoextra)

## Warning: package 'factoextra' was built under R version 4.4.2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(NbClust)
library(tidyr)
library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(hopkins)

## Warning: package 'hopkins' was built under R version 4.4.2

mydata <- read_xlsx("./mydata.xlsx")

## New names:
## • `Q22a_1` -> `Q22a_1...26`
## • `Q22b_1` -> `Q22b_1...27`
## • `Q22c_1` -> `Q22c_1...28`
## • `Q22a_1` -> `Q22a_1...32`
## • `Q22b_1` -> `Q22b_1...33`
## • `Q22c_1` -> `Q22c_1...34`

mydata <- mydata[-1, ]


head(mydata)

## # A tibble: 6 × 71
##   Q8    Q66   Q10   Q12   Q11   Q48   Q48_5_text Q14a  Q14b  Q14c  Q14d  Q14e 
##   <chr> <chr> <chr> <chr> <chr> <chr> <chr>      <chr> <chr> <chr> <chr> <chr>
## 1 1     3     3     6     2     -2    -2         0     1     0     0     0    
## 2 1     2     5     6     1     1     -2         0     1     0     0     0    
## 3 1     6     5     6     1     3     -2         0     1     0     1     0    
## 4 1     2     5     6     2     -2    -2         1     0     1     1     0    
## 5 1     2     2     6     2     -2    -2         0     1     0     0     0    
## 6 1     6     7     7     2     -2    -2         0     1     0     0     0    
## # ℹ 59 more variables: Q14e_text <chr>, Q15 <chr>, Q16 <chr>, Q17 <chr>,
## #   Q18 <chr>, Q25a <chr>, Q25b <chr>, Q25c <chr>, Q25d <chr>, Q25e <chr>,
## #   Q25f <chr>, Q25g <chr>, Q25h <chr>, Q22a_1...26 <dbl>, Q22b_1...27 <dbl>,
## #   Q22c_1...28 <dbl>, Q22d_1 <dbl>, Q22e_1 <dbl>, Q24 <chr>,
## #   Q22a_1...32 <dbl>, Q22b_1...33 <dbl>, Q22c_1...34 <dbl>, Q13 <chr>,
## #   Q20 <chr>, Q46 <chr>, Q1a_1 <dbl>, Q1b_1 <dbl>, Q1c_1 <dbl>, Q1d_1 <dbl>,
## #   Q1e_1 <dbl>, Q1f_1 <dbl>, Q2a_1 <dbl>, Q2b_1 <dbl>, Q2c_1 <dbl>, …

Load Data

# Load dataset from Excel file
mydata <- read_excel("mydata.xlsx")

## New names:
## • `Q22a_1` -> `Q22a_1...26`
## • `Q22b_1` -> `Q22b_1...27`
## • `Q22c_1` -> `Q22c_1...28`
## • `Q22a_1` -> `Q22a_1...32`
## • `Q22b_1` -> `Q22b_1...33`
## • `Q22c_1` -> `Q22c_1...34`

# Display first few rows
head(mydata)

## # A tibble: 6 × 71
##   Q8      Q66   Q10   Q12   Q11   Q48   Q48_5_text Q14a  Q14b  Q14c  Q14d  Q14e 
##   <chr>   <chr> <chr> <chr> <chr> <chr> <chr>      <chr> <chr> <chr> <chr> <chr>
## 1 Ker im… Raje… Raje… Zlah… Ali … Kate… Drugo:     Javn… Loka… Zdra… Manj… Drug…
## 2 1       3     3     6     2     -2    -2         0     1     0     0     0    
## 3 1       2     5     6     1     1     -2         0     1     0     0     0    
## 4 1       6     5     6     1     3     -2         0     1     0     1     0    
## 5 1       2     5     6     2     -2    -2         1     0     1     1     0    
## 6 1       2     2     6     2     -2    -2         0     1     0     0     0    
## # ℹ 59 more variables: Q14e_text <chr>, Q15 <chr>, Q16 <chr>, Q17 <chr>,
## #   Q18 <chr>, Q25a <chr>, Q25b <chr>, Q25c <chr>, Q25d <chr>, Q25e <chr>,
## #   Q25f <chr>, Q25g <chr>, Q25h <chr>, Q22a_1...26 <dbl>, Q22b_1...27 <dbl>,
## #   Q22c_1...28 <dbl>, Q22d_1 <dbl>, Q22e_1 <dbl>, Q24 <chr>,
## #   Q22a_1...32 <dbl>, Q22b_1...33 <dbl>, Q22c_1...34 <dbl>, Q13 <chr>,
## #   Q20 <chr>, Q46 <chr>, Q1a_1 <dbl>, Q1b_1 <dbl>, Q1c_1 <dbl>, Q1d_1 <dbl>,
## #   Q1e_1 <dbl>, Q1f_1 <dbl>, Q2a_1 <dbl>, Q2b_1 <dbl>, Q2c_1 <dbl>, …

colnames(mydata)

##  [1] "Q8"          "Q66"         "Q10"         "Q12"         "Q11"        
##  [6] "Q48"         "Q48_5_text"  "Q14a"        "Q14b"        "Q14c"       
## [11] "Q14d"        "Q14e"        "Q14e_text"   "Q15"         "Q16"        
## [16] "Q17"         "Q18"         "Q25a"        "Q25b"        "Q25c"       
## [21] "Q25d"        "Q25e"        "Q25f"        "Q25g"        "Q25h"       
## [26] "Q22a_1...26" "Q22b_1...27" "Q22c_1...28" "Q22d_1"      "Q22e_1"     
## [31] "Q24"         "Q22a_1...32" "Q22b_1...33" "Q22c_1...34" "Q13"        
## [36] "Q20"         "Q46"         "Q1a_1"       "Q1b_1"       "Q1c_1"      
## [41] "Q1d_1"       "Q1e_1"       "Q1f_1"       "Q2a_1"       "Q2b_1"      
## [46] "Q2c_1"       "Q3a_1"       "Q3b_1"       "Q3c_1"       "Q4a_1"      
## [51] "Q4b_1"       "Q4c_1"       "Q5a_1"       "Q5b_1"       "Q5c_1"      
## [56] "Q6a_1"       "Q6b_1"       "Q6c_1"       "Q7a_1"       "Q7b_1"      
## [61] "Q7c_1"       "Q39"         "Q40"         "Q37"         "Q41"        
## [66] "Q42"         "Q42_5_text"  "Q43"         "Q44"         "Q45"        
## [71] "Q45_10_text"

# Convert only selected variables to numeric
colnames(mydata)

##  [1] "Q8"          "Q66"         "Q10"         "Q12"         "Q11"        
##  [6] "Q48"         "Q48_5_text"  "Q14a"        "Q14b"        "Q14c"       
## [11] "Q14d"        "Q14e"        "Q14e_text"   "Q15"         "Q16"        
## [16] "Q17"         "Q18"         "Q25a"        "Q25b"        "Q25c"       
## [21] "Q25d"        "Q25e"        "Q25f"        "Q25g"        "Q25h"       
## [26] "Q22a_1...26" "Q22b_1...27" "Q22c_1...28" "Q22d_1"      "Q22e_1"     
## [31] "Q24"         "Q22a_1...32" "Q22b_1...33" "Q22c_1...34" "Q13"        
## [36] "Q20"         "Q46"         "Q1a_1"       "Q1b_1"       "Q1c_1"      
## [41] "Q1d_1"       "Q1e_1"       "Q1f_1"       "Q2a_1"       "Q2b_1"      
## [46] "Q2c_1"       "Q3a_1"       "Q3b_1"       "Q3c_1"       "Q4a_1"      
## [51] "Q4b_1"       "Q4c_1"       "Q5a_1"       "Q5b_1"       "Q5c_1"      
## [56] "Q6a_1"       "Q6b_1"       "Q6c_1"       "Q7a_1"       "Q7b_1"      
## [61] "Q7c_1"       "Q39"         "Q40"         "Q37"         "Q41"        
## [66] "Q42"         "Q42_5_text"  "Q43"         "Q44"         "Q45"        
## [71] "Q45_10_text"

sapply(mydata[, c("Q2a_1", "Q3a_1", "Q4a_1", "Q5a_1", "Q6a_1", "Q7a_1")], function(x) unique(x))

## $Q2a_1
## [1] NA  7  6  4  5  2  3  1
## 
## $Q3a_1
## [1] NA  7  2  6  5  4  3  1
## 
## $Q4a_1
## [1] NA  7  4  3  6  5  2
## 
## $Q5a_1
## [1] NA  7  3  6  4  2  5  1
## 
## $Q6a_1
## [1] NA  7  6  4  5  2  3
## 
## $Q7a_1
## [1] NA  7  3  2  6  4  1  5

mydata %>%
  select("Q2a_1", "Q3a_1", "Q4a_1", "Q5a_1", "Q6a_1", "Q7a_1") %>%

  mutate(across(c("Q2a_1", "Q3a_1", "Q4a_1", "Q5a_1", "Q6a_1", "Q7a_1"), 
                ~ ifelse(grepl("^[0-9\\.]+$", .), as.numeric(.), NA)))

## # A tibble: 153 × 6
##    Q2a_1 Q3a_1 Q4a_1 Q5a_1 Q6a_1 Q7a_1
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1    NA    NA    NA    NA    NA    NA
##  2     7     7     7     7     7     7
##  3     7     2     4     3     7     3
##  4     7     6     7     6     6     2
##  5     6     6     3     3     7     3
##  6     6     5     6     4     7     6
##  7     7     5     6     7     7     4
##  8     7     2     7     2     4     1
##  9     7     2     3     3     6     6
## 10     6     4     5     4     5     6
## # ℹ 143 more rows

mydata <- mydata %>%
  mutate(across(c("Q2a_1", "Q3a_1", "Q4a_1", "Q5a_1", "Q6a_1", "Q7a_1"), 
                ~ ifelse(grepl("^[0-9\\.]+$", .), as.numeric(.), NA))) %>%
  mutate(across(c("Q2a_1", "Q3a_1", "Q4a_1", "Q5a_1", "Q6a_1", "Q7a_1"), 
                ~ replace_na(., mean(., na.rm = TRUE))))


mydata <- mydata %>% mutate(across(c("Q2a_1", "Q3a_1", "Q4a_1", "Q5a_1", "Q6a_1", "Q7a_1"), ~as.numeric(as.character(.))))

Data Preprocessing (Select 5 Best Variables for Clustering)

# Ensure dplyr is loaded for select() and pipes (%>%)
library(dplyr)

# Select the best 5 variables for clustering
selected_vars <- mydata %>% 
  select("Q2a_1", "Q3a_1", "Q4a_1", "Q5a_1", "Q6a_1", "Q7a_1")  # These variables were chosen based on relevance

# Identify complete cases based on selected variables
complete_rows <- complete.cases(selected_vars)

# Filter both datasets to have the same rows
mydata_clean <- mydata[complete_rows, ]
clustering_vars_clean <- selected_vars[complete_rows, ]

# Standardize numeric variables
clustering_vars_clean <- scale(clustering_vars_clean)

# Convert to data frame
clustering_vars_clean <- as.data.frame(clustering_vars_clean)

# Display first few rows of standardized data
head(clustering_vars_clean)

##           Q2a_1         Q3a_1         Q4a_1         Q5a_1         Q6a_1
## 1 -6.465477e-16  1.667864e-15 -5.979446e-16 -2.004948e-15 -1.714299e-15
## 2  8.620437e-01  1.577227e+00  9.212563e-01  1.292048e+00  6.539584e-01
## 3  8.620437e-01 -1.552519e+00 -1.098421e+00 -9.653229e-01  6.539584e-01
## 4  8.620437e-01  9.512780e-01  9.212563e-01  7.277049e-01 -3.111064e-01
## 5  1.340957e-01  9.512780e-01 -1.771647e+00 -9.653229e-01  6.539584e-01
## 6  1.340957e-01  3.253288e-01  2.480305e-01 -4.009803e-01  6.539584e-01
##           Q7a_1
## 1  2.246063e-15
## 2  1.437447e+00
## 3 -5.856265e-01
## 4 -1.091395e+00
## 5 -5.856265e-01
## 6  9.316785e-01

selected_vars <- c("Q2a_1", "Q3a_1", "Q4a_1", "Q5a_1", "Q6a_1", "Q7a_1")

scaled_data <- scale(mydata[selected_vars])
head(scaled_data)

##              Q2a_1         Q3a_1         Q4a_1         Q5a_1         Q6a_1
## [1,] -6.465477e-16  1.667864e-15 -5.979446e-16 -2.004948e-15 -1.714299e-15
## [2,]  8.620437e-01  1.577227e+00  9.212563e-01  1.292048e+00  6.539584e-01
## [3,]  8.620437e-01 -1.552519e+00 -1.098421e+00 -9.653229e-01  6.539584e-01
## [4,]  8.620437e-01  9.512780e-01  9.212563e-01  7.277049e-01 -3.111064e-01
## [5,]  1.340957e-01  9.512780e-01 -1.771647e+00 -9.653229e-01  6.539584e-01
## [6,]  1.340957e-01  3.253288e-01  2.480305e-01 -4.009803e-01  6.539584e-01
##              Q7a_1
## [1,]  2.246063e-15
## [2,]  1.437447e+00
## [3,] -5.856265e-01
## [4,] -1.091395e+00
## [5,] -5.856265e-01
## [6,]  9.316785e-01

library(factoextra)

# Select only numeric columns from mydata
numeric_vars <- c("Q2a_1", "Q3a_1", "Q4a_1", "Q5a_1", "Q6a_1", "Q7a_1")

# Convert variables to numeric
mydata <- mydata %>%
  mutate(across(all_of(numeric_vars), ~ as.numeric(as.character(.))))

# Remove rows with NA values 
mydata_clean <- na.omit(mydata[numeric_vars])

# Check structure to ensure it's numeric
str(mydata_clean)

## tibble [153 × 6] (S3: tbl_df/tbl/data.frame)
##  $ Q2a_1: num [1:153] 5.82 7 7 7 6 ...
##  $ Q3a_1: num [1:153] 4.48 7 2 6 6 ...
##  $ Q4a_1: num [1:153] 5.63 7 4 7 3 ...
##  $ Q5a_1: num [1:153] 4.71 7 3 6 3 ...
##  $ Q6a_1: num [1:153] 6.32 7 7 6 7 ...
##  $ Q7a_1: num [1:153] 4.16 7 3 2 3 ...

# Compute clustering tendency
get_clust_tendency(mydata_clean, n = nrow(mydata_clean) - 1, graph = FALSE)

## $hopkins_stat
## [1] 0.6094304
## 
## $plot
## NULL

Determine Optimal Number of Clusters (Elbow & Silhouette)

# Determine optimal clusters using Elbow and Silhouette methods
fviz_nbclust(clustering_vars_clean, kmeans, method = "wss")

fviz_nbclust(clustering_vars_clean, kmeans, method = "silhouette")

# Load necessary libraries
library(dplyr)
library(cluster)
library(factoextra)  # For advanced visualization

# Select only the clustering variables
selected_vars <- c("Q2a_1", "Q3a_1", "Q4a_1", "Q5a_1", "Q6a_1", "Q7a_1")

# Ensure numeric conversion 
mydata <- mydata %>%
  mutate(across(all_of(selected_vars), ~ as.numeric(as.character(.)), .names = "clean_{col}"))

# Keep only the selected cleaned variables
clean_vars <- paste0("clean_", selected_vars)
df_cluster <- mydata[clean_vars]

# Handle missing values (replace NAs with column mean)
df_cluster <- df_cluster %>%
  mutate(across(everything(), ~ replace_na(., mean(., na.rm = TRUE))))

# Scale the data to standardize it
scaled_data <- scale(df_cluster)

# Compute the distance matrix (Euclidean)
dist_matrix <- dist(scaled_data, method = "euclidean")

# Perform hierarchical clustering using Ward's method
hc <- hclust(dist_matrix, method = "ward.D2")

# Plot the dendrogram
plot(hc, main = "Dendrogram of Hierarchical Clustering", xlab = "", sub = "", cex = 0.8)

fviz_dend(hc, rect = TRUE, k = 4)  # Adjust k (number of clusters)

## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Perform K-Means Clustering with Optimized Number of Clusters

set.seed(123)

optimal_clusters <- 4 
kmeans_result <- kmeans(scaled_data, centers = optimal_clusters, nstart = 25)

# Add cluster assignments to the dataset
mydata$cluster <- as.factor(kmeans_result$cluster)

# Print cluster centers
print(kmeans_result$centers)

##   clean_Q2a_1 clean_Q3a_1 clean_Q4a_1 clean_Q5a_1 clean_Q6a_1 clean_Q7a_1
## 1   0.3206802  0.01836667   0.3373215   0.4084058   0.4478662  -0.6143754
## 2   0.4153483  0.99395638   0.7070481   0.6250972   0.2591591   1.1500784
## 3   0.2120901 -1.03834614  -1.2667274  -1.2071840   0.2748258  -0.4591844
## 4  -1.2983181 -0.50253942  -0.4034782  -0.4555941  -1.3384334  -0.2266941

Assign Cluster Labels

# Ensure clusters are assigned only to cleaned data
mydata_clean$Cluster <- as.factor(kmeans_result$cluster)

# Display cluster sizes
table(mydata_clean$Cluster)

## 
##  1  2  3  4 
## 50 44 28 31

Visualizing Clusters

fviz_cluster(kmeans_result, data = clustering_vars_clean)

## Remove Outliers 
library(dplyr)

outlier_rows <- c(41, 139, 15, 71, 34, 140, 35)  

mydata <- mydata[-outlier_rows, ]

# Keep only the selected clustering variables
selected_vars <- c("Q2a_1", "Q3a_1", "Q4a_1", "Q5a_1", "Q6a_1", "Q7a_1")
mydata_clu <- mydata[selected_vars]

set.seed(123)  # Ensure reproducibility

# Define the number of clusters
optimal_clusters <- 4  

# Re-run K-Means on the updated data without outliers
kmeans_result <- kmeans(mydata_clu, centers = optimal_clusters, nstart = 25)

# Assign new clusters to the dataset
mydata$cluster <- as.factor(kmeans_result$cluster)

# Ensure clustering_vars_clean is the same as mydata_clu
clustering_vars_clean <- mydata_clu

# Run K-Means on the cleaned data
set.seed(123)
optimal_clusters <- 4
kmeans_result <- kmeans(clustering_vars_clean, centers = optimal_clusters, nstart = 25)

# Assign new clusters to the dataset
mydata$cluster <- as.factor(kmeans_result$cluster)

# Verify that kmeans_result$cluster has the same number of rows
print(length(kmeans_result$cluster))  # Should match nrow(clustering_vars_clean)

## [1] 146

# Plot the new clusters
fviz_cluster(kmeans_result, data = clustering_vars_clean)

Cluster Profiling

# Ensure both datasets are aligned
mydata_clean <- mydata_clean[row.names(mydata_clean) %in% row.names(clustering_vars_clean), ]

# Assign clusters correctly
cluster_profiles <- clustering_vars_clean %>%
  mutate(Cluster = kmeans_result$cluster) %>%  # Use kmeans_result directly
  group_by(Cluster) %>%
  summarise_all(mean)

# Display cluster profiles
print(cluster_profiles)

## # A tibble: 4 × 7
##   Cluster Q2a_1 Q3a_1 Q4a_1 Q5a_1 Q6a_1 Q7a_1
##     <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1       1  4.41  3.63  5.21  4.12  5.59  4.04
## 2       2  6.47  4.78  6.31  5.84  6.81  2.5 
## 3       3  6.33  5.90  6.47  5.80  6.59  6.27
## 4       4  6.22  2.84  4.38  2.53  6.59  2.75

Cluster Profile Plot

# Convert cluster profile data into a data frame
Figure <- as.data.frame(cluster_profiles)

# Add an ID column for each cluster
Figure$id <- seq_len(nrow(Figure))

# Identify only numeric columns for pivoting (excluding non-numeric ones)
numeric_cols <- names(Figure)[sapply(Figure, is.numeric) & names(Figure) != "id"]  

# Transform data into long format
Figure <- pivot_longer(Figure, cols = all_of(numeric_cols))

# Convert Cluster ID into a factor for better plotting
Figure$Group <- factor(Figure$id)  # Convert id into factor

# Generate cluster profile visualization
ggplot(Figure, aes(x = name, y = value)) +
  geom_hline(yintercept = 0) +  # Add a horizontal reference line at y = 0
  theme_bw() +  # Apply clean theme
  geom_point(aes(shape = Group, col = Group), size = 3) +  # Points with shape/colors by group
  geom_line(aes(group = id), linewidth = 1) +  # Lines connecting cluster averages
  ylab("Averages") +
  xlab("Cluster Variables") +
  ylim(-2.2, 2.2) +  # Adjust limits based on your data
  theme(axis.text.x = element_text(angle = 45, vjust = 0.50, size = 10))  # Rotate X-axis labels

## Warning: Removed 26 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 26 rows containing missing values or values outside the scale range
## (`geom_line()`).

## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?

Optimized Clustering Analysis with Selected Variables

Group 4

`4.2.2025`

Load Required Libraries

Load Data

Data Preprocessing (Select 5 Best Variables for Clustering)

Determine Optimal Number of Clusters (Elbow & Silhouette)

Perform K-Means Clustering with Optimized Number of Clusters

Assign Cluster Labels

Visualizing Clusters

Cluster Profiling

Cluster Profile Plot