Load Required Libraries
library(readxl)
## Warning: package 'readxl' was built under R version 4.4.2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(cluster)
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.4.2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(NbClust)
library(tidyr)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(hopkins) # For Hopkins Statistic
## Warning: package 'hopkins' was built under R version 4.4.2
mydata <- read_xlsx("./mydata.xlsx")
## New names:
## • `Q22a_1` -> `Q22a_1...26`
## • `Q22b_1` -> `Q22b_1...27`
## • `Q22c_1` -> `Q22c_1...28`
## • `Q22a_1` -> `Q22a_1...32`
## • `Q22b_1` -> `Q22b_1...33`
## • `Q22c_1` -> `Q22c_1...34`
mydata <- mydata[-1, ]
head(mydata)
## # A tibble: 6 × 71
## Q8 Q66 Q10 Q12 Q11 Q48 Q48_5_text Q14a Q14b Q14c Q14d Q14e
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 1 3 3 6 2 -2 -2 0 1 0 0 0
## 2 1 2 5 6 1 1 -2 0 1 0 0 0
## 3 1 6 5 6 1 3 -2 0 1 0 1 0
## 4 1 2 5 6 2 -2 -2 1 0 1 1 0
## 5 1 2 2 6 2 -2 -2 0 1 0 0 0
## 6 1 6 7 7 2 -2 -2 0 1 0 0 0
## # ℹ 59 more variables: Q14e_text <chr>, Q15 <chr>, Q16 <chr>, Q17 <chr>,
## # Q18 <chr>, Q25a <chr>, Q25b <chr>, Q25c <chr>, Q25d <chr>, Q25e <chr>,
## # Q25f <chr>, Q25g <chr>, Q25h <chr>, Q22a_1...26 <dbl>, Q22b_1...27 <dbl>,
## # Q22c_1...28 <dbl>, Q22d_1 <dbl>, Q22e_1 <dbl>, Q24 <chr>,
## # Q22a_1...32 <dbl>, Q22b_1...33 <dbl>, Q22c_1...34 <dbl>, Q13 <chr>,
## # Q20 <chr>, Q46 <chr>, Q1a_1 <dbl>, Q1b_1 <dbl>, Q1c_1 <dbl>, Q1d_1 <dbl>,
## # Q1e_1 <dbl>, Q1f_1 <dbl>, Q2a_1 <dbl>, Q2b_1 <dbl>, Q2c_1 <dbl>, …
Load Data
# Load dataset from Excel file
mydata <- read_excel("mydata.xlsx")
## New names:
## • `Q22a_1` -> `Q22a_1...26`
## • `Q22b_1` -> `Q22b_1...27`
## • `Q22c_1` -> `Q22c_1...28`
## • `Q22a_1` -> `Q22a_1...32`
## • `Q22b_1` -> `Q22b_1...33`
## • `Q22c_1` -> `Q22c_1...34`
# Display first few rows
head(mydata)
## # A tibble: 6 × 71
## Q8 Q66 Q10 Q12 Q11 Q48 Q48_5_text Q14a Q14b Q14c Q14d Q14e
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Ker im… Raje… Raje… Zlah… Ali … Kate… Drugo: Javn… Loka… Zdra… Manj… Drug…
## 2 1 3 3 6 2 -2 -2 0 1 0 0 0
## 3 1 2 5 6 1 1 -2 0 1 0 0 0
## 4 1 6 5 6 1 3 -2 0 1 0 1 0
## 5 1 2 5 6 2 -2 -2 1 0 1 1 0
## 6 1 2 2 6 2 -2 -2 0 1 0 0 0
## # ℹ 59 more variables: Q14e_text <chr>, Q15 <chr>, Q16 <chr>, Q17 <chr>,
## # Q18 <chr>, Q25a <chr>, Q25b <chr>, Q25c <chr>, Q25d <chr>, Q25e <chr>,
## # Q25f <chr>, Q25g <chr>, Q25h <chr>, Q22a_1...26 <dbl>, Q22b_1...27 <dbl>,
## # Q22c_1...28 <dbl>, Q22d_1 <dbl>, Q22e_1 <dbl>, Q24 <chr>,
## # Q22a_1...32 <dbl>, Q22b_1...33 <dbl>, Q22c_1...34 <dbl>, Q13 <chr>,
## # Q20 <chr>, Q46 <chr>, Q1a_1 <dbl>, Q1b_1 <dbl>, Q1c_1 <dbl>, Q1d_1 <dbl>,
## # Q1e_1 <dbl>, Q1f_1 <dbl>, Q2a_1 <dbl>, Q2b_1 <dbl>, Q2c_1 <dbl>, …
colnames(mydata)
## [1] "Q8" "Q66" "Q10" "Q12" "Q11"
## [6] "Q48" "Q48_5_text" "Q14a" "Q14b" "Q14c"
## [11] "Q14d" "Q14e" "Q14e_text" "Q15" "Q16"
## [16] "Q17" "Q18" "Q25a" "Q25b" "Q25c"
## [21] "Q25d" "Q25e" "Q25f" "Q25g" "Q25h"
## [26] "Q22a_1...26" "Q22b_1...27" "Q22c_1...28" "Q22d_1" "Q22e_1"
## [31] "Q24" "Q22a_1...32" "Q22b_1...33" "Q22c_1...34" "Q13"
## [36] "Q20" "Q46" "Q1a_1" "Q1b_1" "Q1c_1"
## [41] "Q1d_1" "Q1e_1" "Q1f_1" "Q2a_1" "Q2b_1"
## [46] "Q2c_1" "Q3a_1" "Q3b_1" "Q3c_1" "Q4a_1"
## [51] "Q4b_1" "Q4c_1" "Q5a_1" "Q5b_1" "Q5c_1"
## [56] "Q6a_1" "Q6b_1" "Q6c_1" "Q7a_1" "Q7b_1"
## [61] "Q7c_1" "Q39" "Q40" "Q37" "Q41"
## [66] "Q42" "Q42_5_text" "Q43" "Q44" "Q45"
## [71] "Q45_10_text"
# Convert only selected variables to numeric
colnames(mydata)
## [1] "Q8" "Q66" "Q10" "Q12" "Q11"
## [6] "Q48" "Q48_5_text" "Q14a" "Q14b" "Q14c"
## [11] "Q14d" "Q14e" "Q14e_text" "Q15" "Q16"
## [16] "Q17" "Q18" "Q25a" "Q25b" "Q25c"
## [21] "Q25d" "Q25e" "Q25f" "Q25g" "Q25h"
## [26] "Q22a_1...26" "Q22b_1...27" "Q22c_1...28" "Q22d_1" "Q22e_1"
## [31] "Q24" "Q22a_1...32" "Q22b_1...33" "Q22c_1...34" "Q13"
## [36] "Q20" "Q46" "Q1a_1" "Q1b_1" "Q1c_1"
## [41] "Q1d_1" "Q1e_1" "Q1f_1" "Q2a_1" "Q2b_1"
## [46] "Q2c_1" "Q3a_1" "Q3b_1" "Q3c_1" "Q4a_1"
## [51] "Q4b_1" "Q4c_1" "Q5a_1" "Q5b_1" "Q5c_1"
## [56] "Q6a_1" "Q6b_1" "Q6c_1" "Q7a_1" "Q7b_1"
## [61] "Q7c_1" "Q39" "Q40" "Q37" "Q41"
## [66] "Q42" "Q42_5_text" "Q43" "Q44" "Q45"
## [71] "Q45_10_text"
sapply(mydata[, c("Q2a_1", "Q3a_1", "Q4a_1", "Q5a_1", "Q6a_1", "Q7a_1")], function(x) unique(x))
## $Q2a_1
## [1] NA 7 6 4 5 2 3 1
##
## $Q3a_1
## [1] NA 7 2 6 5 4 3 1
##
## $Q4a_1
## [1] NA 7 4 3 6 5 2
##
## $Q5a_1
## [1] NA 7 3 6 4 2 5 1
##
## $Q6a_1
## [1] NA 7 6 4 5 2 3
##
## $Q7a_1
## [1] NA 7 3 2 6 4 1 5
mydata %>%
select("Q2a_1", "Q3a_1", "Q4a_1", "Q5a_1", "Q6a_1", "Q7a_1") %>%
mutate(across(c("Q2a_1", "Q3a_1", "Q4a_1", "Q5a_1", "Q6a_1", "Q7a_1"),
~ ifelse(grepl("^[0-9\\.]+$", .), as.numeric(.), NA)))
## # A tibble: 153 × 6
## Q2a_1 Q3a_1 Q4a_1 Q5a_1 Q6a_1 Q7a_1
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 NA NA NA NA NA NA
## 2 7 7 7 7 7 7
## 3 7 2 4 3 7 3
## 4 7 6 7 6 6 2
## 5 6 6 3 3 7 3
## 6 6 5 6 4 7 6
## 7 7 5 6 7 7 4
## 8 7 2 7 2 4 1
## 9 7 2 3 3 6 6
## 10 6 4 5 4 5 6
## # ℹ 143 more rows
mydata <- mydata %>%
mutate(across(c("Q2a_1", "Q3a_1", "Q4a_1", "Q5a_1", "Q6a_1", "Q7a_1"),
~ ifelse(grepl("^[0-9\\.]+$", .), as.numeric(.), NA))) %>%
mutate(across(c("Q2a_1", "Q3a_1", "Q4a_1", "Q5a_1", "Q6a_1", "Q7a_1"),
~ replace_na(., mean(., na.rm = TRUE))))
mydata <- mydata %>% mutate(across(c("Q2a_1", "Q3a_1", "Q4a_1", "Q5a_1", "Q6a_1", "Q7a_1"), ~as.numeric(as.character(.))))
Data Preprocessing (Select 5 Best Variables for Clustering)
# Ensure dplyr is loaded for select() and pipes (%>%)
library(dplyr)
# Select the best 5 variables for clustering
selected_vars <- mydata %>%
select("Q2a_1", "Q3a_1", "Q4a_1", "Q5a_1", "Q6a_1", "Q7a_1") # These variables were chosen based on relevance
# Identify complete cases based on selected variables
complete_rows <- complete.cases(selected_vars)
# Filter both datasets to have the same rows
mydata_clean <- mydata[complete_rows, ]
clustering_vars_clean <- selected_vars[complete_rows, ]
# Standardize numeric variables
clustering_vars_clean <- scale(clustering_vars_clean)
# Convert to data frame
clustering_vars_clean <- as.data.frame(clustering_vars_clean)
# Display first few rows of standardized data
head(clustering_vars_clean)
## Q2a_1 Q3a_1 Q4a_1 Q5a_1 Q6a_1
## 1 -6.465477e-16 1.667864e-15 -5.979446e-16 -2.004948e-15 -1.714299e-15
## 2 8.620437e-01 1.577227e+00 9.212563e-01 1.292048e+00 6.539584e-01
## 3 8.620437e-01 -1.552519e+00 -1.098421e+00 -9.653229e-01 6.539584e-01
## 4 8.620437e-01 9.512780e-01 9.212563e-01 7.277049e-01 -3.111064e-01
## 5 1.340957e-01 9.512780e-01 -1.771647e+00 -9.653229e-01 6.539584e-01
## 6 1.340957e-01 3.253288e-01 2.480305e-01 -4.009803e-01 6.539584e-01
## Q7a_1
## 1 2.246063e-15
## 2 1.437447e+00
## 3 -5.856265e-01
## 4 -1.091395e+00
## 5 -5.856265e-01
## 6 9.316785e-01
Hopkins Statistic Test (To Assess Clustering Tendency)
# Perform Hopkins test
hopkins_value <- hopkins(clustering_vars_clean)
print(hopkins_value)
## [1] 0.9075074
Determine Optimal Number of Clusters (Elbow & Silhouette)
# Determine optimal clusters using Elbow and Silhouette methods
fviz_nbclust(clustering_vars_clean, kmeans, method = "wss")

fviz_nbclust(clustering_vars_clean, kmeans, method = "silhouette")

Assign Cluster Labels
# Ensure clusters are assigned only to cleaned data
mydata_clean$Cluster <- as.factor(kmeans_result$cluster)
# Display cluster sizes
table(mydata_clean$Cluster)
##
## 1 2 3
## 31 75 47
Visualizing Clusters
fviz_cluster(kmeans_result, data = clustering_vars_clean)

Cluster Profiling
# Compute mean values per cluster
cluster_profiles <- clustering_vars_clean %>%
mutate(Cluster = mydata_clean$Cluster) %>%
group_by(Cluster) %>%
summarise_all(mean)
# Display cluster profiles
print(cluster_profiles)
## # A tibble: 3 × 7
## Cluster Q2a_1 Q3a_1 Q4a_1 Q5a_1 Q6a_1 Q7a_1
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 -1.20 -0.301 -0.469 -0.219 -1.43 -0.0799
## 2 2 0.453 0.688 0.586 0.665 0.388 0.474
## 3 3 0.0721 -0.900 -0.626 -0.917 0.325 -0.704
Cluster Profile Plot
# Convert cluster profile data into a data frame
Figure <- as.data.frame(cluster_profiles)
# Add an ID column for each cluster
Figure$id <- seq_len(nrow(Figure))
# Identify only numeric columns for pivoting (excluding non-numeric ones)
numeric_cols <- names(Figure)[sapply(Figure, is.numeric) & names(Figure) != "id"]
# Transform data into long format
Figure <- pivot_longer(Figure, cols = all_of(numeric_cols))
# Convert Cluster ID into a factor for better plotting
Figure$Group <- factor(Figure$id) # Convert id into factor
# Generate cluster profile visualization
ggplot(Figure, aes(x = name, y = value)) +
geom_hline(yintercept = 0) + # Add a horizontal reference line at y = 0
theme_bw() + # Apply clean theme
geom_point(aes(shape = Group, col = Group), size = 3) + # Points with shape/colors by group
geom_line(aes(group = id), linewidth = 1) + # Lines connecting cluster averages
ylab("Averages") +
xlab("Cluster Variables") +
ylim(-2.2, 2.2) + # Adjust limits based on your data
theme(axis.text.x = element_text(angle = 45, vjust = 0.50, size = 10)) # Rotate X-axis labels

# Load necessary libraries
library(dplyr)
library(cluster)
library(factoextra) # For advanced visualization
# Select only the clustering variables
selected_vars <- c("Q2a_1", "Q3a_1", "Q4a_1", "Q5a_1", "Q6a_1", "Q7a_1")
# Ensure numeric conversion (if needed)
mydata <- mydata %>%
mutate(across(all_of(selected_vars), ~ as.numeric(as.character(.)), .names = "clean_{col}"))
# Keep only the selected cleaned variables
clean_vars <- paste0("clean_", selected_vars)
df_cluster <- mydata[clean_vars]
# Handle missing values (replace NAs with column mean)
df_cluster <- df_cluster %>%
mutate(across(everything(), ~ replace_na(., mean(., na.rm = TRUE))))
# Scale the data to standardize it
scaled_data <- scale(df_cluster)
# Compute the distance matrix (Euclidean)
dist_matrix <- dist(scaled_data, method = "euclidean")
# Perform hierarchical clustering using Ward's method
hc <- hclust(dist_matrix, method = "ward.D2")
# Plot the dendrogram
plot(hc, main = "Dendrogram of Hierarchical Clustering", xlab = "", sub = "", cex = 0.8)

# Optional: Enhanced dendrogram visualization
fviz_dend(hc, rect = TRUE, k = 3) # Adjust k (number of clusters)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
