Load Required Libraries
library(readxl)
## Warning: package 'readxl' was built under R version 4.4.2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(cluster)
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.4.2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(NbClust)
library(tidyr)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(hopkins) # For Hopkins Statistic
## Warning: package 'hopkins' was built under R version 4.4.2
mydata <- read_xlsx("./mydata.xlsx")
## New names:
## • `Q22a_1` -> `Q22a_1...26`
## • `Q22b_1` -> `Q22b_1...27`
## • `Q22c_1` -> `Q22c_1...28`
## • `Q22a_1` -> `Q22a_1...32`
## • `Q22b_1` -> `Q22b_1...33`
## • `Q22c_1` -> `Q22c_1...34`
mydata <- mydata[-1, ]
head(mydata)
## # A tibble: 6 × 71
## Q8 Q66 Q10 Q12 Q11 Q48 Q48_5_text Q14a Q14b Q14c Q14d Q14e
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 1 3 3 6 2 -2 -2 0 1 0 0 0
## 2 1 2 5 6 1 1 -2 0 1 0 0 0
## 3 1 6 5 6 1 3 -2 0 1 0 1 0
## 4 1 2 5 6 2 -2 -2 1 0 1 1 0
## 5 1 2 2 6 2 -2 -2 0 1 0 0 0
## 6 1 6 7 7 2 -2 -2 0 1 0 0 0
## # ℹ 59 more variables: Q14e_text <chr>, Q15 <chr>, Q16 <chr>, Q17 <chr>,
## # Q18 <chr>, Q25a <chr>, Q25b <chr>, Q25c <chr>, Q25d <chr>, Q25e <chr>,
## # Q25f <chr>, Q25g <chr>, Q25h <chr>, Q22a_1...26 <dbl>, Q22b_1...27 <dbl>,
## # Q22c_1...28 <dbl>, Q22d_1 <dbl>, Q22e_1 <dbl>, Q24 <chr>,
## # Q22a_1...32 <dbl>, Q22b_1...33 <dbl>, Q22c_1...34 <dbl>, Q13 <chr>,
## # Q20 <chr>, Q46 <chr>, Q1a_1 <dbl>, Q1b_1 <dbl>, Q1c_1 <dbl>, Q1d_1 <dbl>,
## # Q1e_1 <dbl>, Q1f_1 <dbl>, Q2a_1 <dbl>, Q2b_1 <dbl>, Q2c_1 <dbl>, …
Load Data
# Load dataset from Excel file
mydata <- read_excel("mydata.xlsx")
## New names:
## • `Q22a_1` -> `Q22a_1...26`
## • `Q22b_1` -> `Q22b_1...27`
## • `Q22c_1` -> `Q22c_1...28`
## • `Q22a_1` -> `Q22a_1...32`
## • `Q22b_1` -> `Q22b_1...33`
## • `Q22c_1` -> `Q22c_1...34`
# Display first few rows
head(mydata)
## # A tibble: 6 × 71
## Q8 Q66 Q10 Q12 Q11 Q48 Q48_5_text Q14a Q14b Q14c Q14d Q14e
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Ker im… Raje… Raje… Zlah… Ali … Kate… Drugo: Javn… Loka… Zdra… Manj… Drug…
## 2 1 3 3 6 2 -2 -2 0 1 0 0 0
## 3 1 2 5 6 1 1 -2 0 1 0 0 0
## 4 1 6 5 6 1 3 -2 0 1 0 1 0
## 5 1 2 5 6 2 -2 -2 1 0 1 1 0
## 6 1 2 2 6 2 -2 -2 0 1 0 0 0
## # ℹ 59 more variables: Q14e_text <chr>, Q15 <chr>, Q16 <chr>, Q17 <chr>,
## # Q18 <chr>, Q25a <chr>, Q25b <chr>, Q25c <chr>, Q25d <chr>, Q25e <chr>,
## # Q25f <chr>, Q25g <chr>, Q25h <chr>, Q22a_1...26 <dbl>, Q22b_1...27 <dbl>,
## # Q22c_1...28 <dbl>, Q22d_1 <dbl>, Q22e_1 <dbl>, Q24 <chr>,
## # Q22a_1...32 <dbl>, Q22b_1...33 <dbl>, Q22c_1...34 <dbl>, Q13 <chr>,
## # Q20 <chr>, Q46 <chr>, Q1a_1 <dbl>, Q1b_1 <dbl>, Q1c_1 <dbl>, Q1d_1 <dbl>,
## # Q1e_1 <dbl>, Q1f_1 <dbl>, Q2a_1 <dbl>, Q2b_1 <dbl>, Q2c_1 <dbl>, …
# Convert only selected variables to numeric
mydata <- mydata %>% mutate(across(c(Q12, Q25b, Q37, Q41, Q46), ~as.numeric(as.character(.))))
## Warning: There were 5 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `across(c(Q12, Q25b, Q37, Q41, Q46),
## ~as.numeric(as.character(.)))`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 4 remaining warnings.
Data Preprocessing (Select 5 Best Variables for Clustering)
# Ensure dplyr is loaded for select() and pipes (%>%)
library(dplyr)
# Select the best 5 variables for clustering
selected_vars <- mydata %>%
select(Q12, Q25b, Q37, Q41, Q46) # These variables were chosen based on relevance
# Identify complete cases based on selected variables
complete_rows <- complete.cases(selected_vars)
# Filter both datasets to have the same rows
mydata_clean <- mydata[complete_rows, ]
clustering_vars_clean <- selected_vars[complete_rows, ]
# Standardize numeric variables
clustering_vars_clean <- scale(clustering_vars_clean)
# Convert to data frame
clustering_vars_clean <- as.data.frame(clustering_vars_clean)
# Display first few rows of standardized data
head(clustering_vars_clean)
## Q12 Q25b Q37 Q41 Q46
## 1 0.2785748 -0.2405025 0.9316137 0.1015277 -1.5614040
## 2 0.2785748 0.6511166 -1.3160892 0.1015277 0.3390732
## 3 0.2785748 -0.2405025 -0.1922377 -0.8629853 -1.9233996
## 4 0.2785748 0.6511166 -1.3160892 0.1015277 0.1128259
## 5 0.2785748 0.6511166 0.9316137 0.1015277 0.7915678
## 6 1.0214410 0.6511166 -0.1922377 -0.8629853 -1.4709050
Hopkins Statistic Test (To Assess Clustering Tendency)
# Perform Hopkins test
hopkins_value <- hopkins(clustering_vars_clean)
print(hopkins_value)
## [1] 0.6589873
Determine Optimal Number of Clusters (Elbow & Silhouette)
# Determine optimal clusters using Elbow and Silhouette methods
fviz_nbclust(clustering_vars_clean, kmeans, method = "wss")

fviz_nbclust(clustering_vars_clean, kmeans, method = "silhouette")

Assign Cluster Labels
# Ensure clusters are assigned only to cleaned data
mydata_clean$Cluster <- as.factor(kmeans_result$cluster)
# Display cluster sizes
table(mydata_clean$Cluster)
##
## 1 2 3
## 62 34 56
Visualizing Clusters
fviz_cluster(kmeans_result, data = clustering_vars_clean)

Cluster Profiling
# Compute mean values per cluster
cluster_profiles <- clustering_vars_clean %>%
mutate(Cluster = mydata_clean$Cluster) %>%
group_by(Cluster) %>%
summarise_all(mean)
# Display cluster profiles
print(cluster_profiles)
## # A tibble: 3 × 6
## Cluster Q12 Q25b Q37 Q41 Q46
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 0.00300 0.479 0.823 0.615 0.426
## 2 2 -0.858 -1.45 0.0391 -0.693 -1.07
## 3 3 0.517 0.349 -0.935 -0.260 0.180
Cluster Profile Plot
# Convert cluster profile data into a data frame
Figure <- as.data.frame(cluster_profiles)
# Add an ID column for each cluster
Figure$id <- seq_len(nrow(Figure))
# Identify only numeric columns for pivoting (excluding non-numeric ones)
numeric_cols <- names(Figure)[sapply(Figure, is.numeric) & names(Figure) != "id"]
# Transform data into long format
Figure <- pivot_longer(Figure, cols = all_of(numeric_cols))
# Convert Cluster ID into a factor for better plotting
Figure$Group <- factor(Figure$id) # Convert id into factor
# Generate cluster profile visualization
ggplot(Figure, aes(x = name, y = value)) +
geom_hline(yintercept = 0) + # Add a horizontal reference line at y = 0
theme_bw() + # Apply clean theme
geom_point(aes(shape = Group, col = Group), size = 3) + # Points with shape/colors by group
geom_line(aes(group = id), linewidth = 1) + # Lines connecting cluster averages
ylab("Averages") +
xlab("Cluster Variables") +
ylim(-2.2, 2.2) + # Adjust limits based on your data
theme(axis.text.x = element_text(angle = 45, vjust = 0.50, size = 10)) # Rotate X-axis labels
