Optimized Clustering Analysis with Selected Variables

Load Required Libraries

library(readxl)

## Warning: package 'readxl' was built under R version 4.4.2

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(cluster)
library(factoextra)

## Warning: package 'factoextra' was built under R version 4.4.2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(NbClust)
library(tidyr)
library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(hopkins)  # For Hopkins Statistic

## Warning: package 'hopkins' was built under R version 4.4.2

mydata <- read_xlsx("./mydata.xlsx")

## New names:
## • `Q22a_1` -> `Q22a_1...26`
## • `Q22b_1` -> `Q22b_1...27`
## • `Q22c_1` -> `Q22c_1...28`
## • `Q22a_1` -> `Q22a_1...32`
## • `Q22b_1` -> `Q22b_1...33`
## • `Q22c_1` -> `Q22c_1...34`

mydata <- mydata[-1, ]


head(mydata)

## # A tibble: 6 × 71
##   Q8    Q66   Q10   Q12   Q11   Q48   Q48_5_text Q14a  Q14b  Q14c  Q14d  Q14e 
##   <chr> <chr> <chr> <chr> <chr> <chr> <chr>      <chr> <chr> <chr> <chr> <chr>
## 1 1     3     3     6     2     -2    -2         0     1     0     0     0    
## 2 1     2     5     6     1     1     -2         0     1     0     0     0    
## 3 1     6     5     6     1     3     -2         0     1     0     1     0    
## 4 1     2     5     6     2     -2    -2         1     0     1     1     0    
## 5 1     2     2     6     2     -2    -2         0     1     0     0     0    
## 6 1     6     7     7     2     -2    -2         0     1     0     0     0    
## # ℹ 59 more variables: Q14e_text <chr>, Q15 <chr>, Q16 <chr>, Q17 <chr>,
## #   Q18 <chr>, Q25a <chr>, Q25b <chr>, Q25c <chr>, Q25d <chr>, Q25e <chr>,
## #   Q25f <chr>, Q25g <chr>, Q25h <chr>, Q22a_1...26 <dbl>, Q22b_1...27 <dbl>,
## #   Q22c_1...28 <dbl>, Q22d_1 <dbl>, Q22e_1 <dbl>, Q24 <chr>,
## #   Q22a_1...32 <dbl>, Q22b_1...33 <dbl>, Q22c_1...34 <dbl>, Q13 <chr>,
## #   Q20 <chr>, Q46 <chr>, Q1a_1 <dbl>, Q1b_1 <dbl>, Q1c_1 <dbl>, Q1d_1 <dbl>,
## #   Q1e_1 <dbl>, Q1f_1 <dbl>, Q2a_1 <dbl>, Q2b_1 <dbl>, Q2c_1 <dbl>, …

Load Data

# Load dataset from Excel file
mydata <- read_excel("mydata.xlsx")

## New names:
## • `Q22a_1` -> `Q22a_1...26`
## • `Q22b_1` -> `Q22b_1...27`
## • `Q22c_1` -> `Q22c_1...28`
## • `Q22a_1` -> `Q22a_1...32`
## • `Q22b_1` -> `Q22b_1...33`
## • `Q22c_1` -> `Q22c_1...34`

# Display first few rows
head(mydata)

## # A tibble: 6 × 71
##   Q8      Q66   Q10   Q12   Q11   Q48   Q48_5_text Q14a  Q14b  Q14c  Q14d  Q14e 
##   <chr>   <chr> <chr> <chr> <chr> <chr> <chr>      <chr> <chr> <chr> <chr> <chr>
## 1 Ker im… Raje… Raje… Zlah… Ali … Kate… Drugo:     Javn… Loka… Zdra… Manj… Drug…
## 2 1       3     3     6     2     -2    -2         0     1     0     0     0    
## 3 1       2     5     6     1     1     -2         0     1     0     0     0    
## 4 1       6     5     6     1     3     -2         0     1     0     1     0    
## 5 1       2     5     6     2     -2    -2         1     0     1     1     0    
## 6 1       2     2     6     2     -2    -2         0     1     0     0     0    
## # ℹ 59 more variables: Q14e_text <chr>, Q15 <chr>, Q16 <chr>, Q17 <chr>,
## #   Q18 <chr>, Q25a <chr>, Q25b <chr>, Q25c <chr>, Q25d <chr>, Q25e <chr>,
## #   Q25f <chr>, Q25g <chr>, Q25h <chr>, Q22a_1...26 <dbl>, Q22b_1...27 <dbl>,
## #   Q22c_1...28 <dbl>, Q22d_1 <dbl>, Q22e_1 <dbl>, Q24 <chr>,
## #   Q22a_1...32 <dbl>, Q22b_1...33 <dbl>, Q22c_1...34 <dbl>, Q13 <chr>,
## #   Q20 <chr>, Q46 <chr>, Q1a_1 <dbl>, Q1b_1 <dbl>, Q1c_1 <dbl>, Q1d_1 <dbl>,
## #   Q1e_1 <dbl>, Q1f_1 <dbl>, Q2a_1 <dbl>, Q2b_1 <dbl>, Q2c_1 <dbl>, …

# Convert only selected variables to numeric
mydata <- mydata %>% mutate(across(c(Q12, Q25b, Q37, Q41, Q46), ~as.numeric(as.character(.))))

## Warning: There were 5 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `across(c(Q12, Q25b, Q37, Q41, Q46),
##   ~as.numeric(as.character(.)))`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 4 remaining warnings.

Data Preprocessing (Select 5 Best Variables for Clustering)

# Ensure dplyr is loaded for select() and pipes (%>%)
library(dplyr)

# Select the best 5 variables for clustering
selected_vars <- mydata %>% 
  select(Q12, Q25b, Q37, Q41, Q46)  # These variables were chosen based on relevance

# Identify complete cases based on selected variables
complete_rows <- complete.cases(selected_vars)

# Filter both datasets to have the same rows
mydata_clean <- mydata[complete_rows, ]
clustering_vars_clean <- selected_vars[complete_rows, ]

# Standardize numeric variables
clustering_vars_clean <- scale(clustering_vars_clean)

# Convert to data frame
clustering_vars_clean <- as.data.frame(clustering_vars_clean)

# Display first few rows of standardized data
head(clustering_vars_clean)

##         Q12       Q25b        Q37        Q41        Q46
## 1 0.2785748 -0.2405025  0.9316137  0.1015277 -1.5614040
## 2 0.2785748  0.6511166 -1.3160892  0.1015277  0.3390732
## 3 0.2785748 -0.2405025 -0.1922377 -0.8629853 -1.9233996
## 4 0.2785748  0.6511166 -1.3160892  0.1015277  0.1128259
## 5 0.2785748  0.6511166  0.9316137  0.1015277  0.7915678
## 6 1.0214410  0.6511166 -0.1922377 -0.8629853 -1.4709050

Hopkins Statistic Test (To Assess Clustering Tendency)

# Perform Hopkins test
hopkins_value <- hopkins(clustering_vars_clean)
print(hopkins_value)

## [1] 0.6589873

Determine Optimal Number of Clusters (Elbow & Silhouette)

# Determine optimal clusters using Elbow and Silhouette methods
fviz_nbclust(clustering_vars_clean, kmeans, method = "wss")

fviz_nbclust(clustering_vars_clean, kmeans, method = "silhouette")

Perform K-Means Clustering with Optimized Number of Clusters

set.seed(123) # For reproducibility
k <- 3  # Adjust based on optimal number found

kmeans_result <- kmeans(clustering_vars_clean, centers = k, nstart = 25)

Assign Cluster Labels

# Ensure clusters are assigned only to cleaned data
mydata_clean$Cluster <- as.factor(kmeans_result$cluster)

# Display cluster sizes
table(mydata_clean$Cluster)

## 
##  1  2  3 
## 62 34 56

Visualizing Clusters

fviz_cluster(kmeans_result, data = clustering_vars_clean)

Cluster Profiling

# Compute mean values per cluster
cluster_profiles <- clustering_vars_clean %>%
  mutate(Cluster = mydata_clean$Cluster) %>%
  group_by(Cluster) %>%
  summarise_all(mean)

# Display cluster profiles
print(cluster_profiles)

## # A tibble: 3 × 6
##   Cluster      Q12   Q25b     Q37    Q41    Q46
##   <fct>      <dbl>  <dbl>   <dbl>  <dbl>  <dbl>
## 1 1        0.00300  0.479  0.823   0.615  0.426
## 2 2       -0.858   -1.45   0.0391 -0.693 -1.07 
## 3 3        0.517    0.349 -0.935  -0.260  0.180

Cluster Profile Plot

# Convert cluster profile data into a data frame
Figure <- as.data.frame(cluster_profiles)

# Add an ID column for each cluster
Figure$id <- seq_len(nrow(Figure))

# Identify only numeric columns for pivoting (excluding non-numeric ones)
numeric_cols <- names(Figure)[sapply(Figure, is.numeric) & names(Figure) != "id"]  

# Transform data into long format
Figure <- pivot_longer(Figure, cols = all_of(numeric_cols))

# Convert Cluster ID into a factor for better plotting
Figure$Group <- factor(Figure$id)  # Convert id into factor

# Generate cluster profile visualization
ggplot(Figure, aes(x = name, y = value)) +
  geom_hline(yintercept = 0) +  # Add a horizontal reference line at y = 0
  theme_bw() +  # Apply clean theme
  geom_point(aes(shape = Group, col = Group), size = 3) +  # Points with shape/colors by group
  geom_line(aes(group = id), linewidth = 1) +  # Lines connecting cluster averages
  ylab("Averages") +
  xlab("Cluster Variables") +
  ylim(-2.2, 2.2) +  # Adjust limits based on your data
  theme(axis.text.x = element_text(angle = 45, vjust = 0.50, size = 10))  # Rotate X-axis labels

Optimized Clustering Analysis with Selected Variables

Your Name

2025-02-03

Load Required Libraries

Load Data

Data Preprocessing (Select 5 Best Variables for Clustering)

Hopkins Statistic Test (To Assess Clustering Tendency)

Determine Optimal Number of Clusters (Elbow & Silhouette)

Perform K-Means Clustering with Optimized Number of Clusters

Assign Cluster Labels

Visualizing Clusters

Cluster Profiling

Cluster Profile Plot