library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(cluster)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Load the raw data from the CSV file in the working directory
data <- read.csv("cloud1.csv")
# Drop every row that has a missing value in any column, so that the rows of
# data_clean line up one-to-one with the rows passed to the clustering below
data_clean <- na.omit(data)
# Keep the numeric columns as clustering features.
# ID is excluded: it appears to be a row identifier rather than a measurement
# (TODO confirm against the data dictionary). Left in, it is scaled like any
# other feature and rows end up grouped by their position in the file.
data_numeric <- data_clean %>%
  select(where(is.numeric), -any_of("ID"))
# Fail early with a clear message instead of an obscure error inside kmeans()
stopifnot(
  "no complete rows left after na.omit()" = nrow(data_numeric) > 0,
  "no numeric feature columns to cluster on" = ncol(data_numeric) > 0
)
# Standardise each feature (centre to mean 0, scale to sd 1) so that no single
# column dominates the Euclidean distances used by k-means
data_scaled <- scale(data_numeric)
# Seed the RNG before the elbow scan: fviz_nbclust() runs kmeans() with random
# starting centres for every candidate k, so without a seed the curve changes
# from run to run.
set.seed(123)
# Elbow method: total within-cluster sum of squares ("wss") for each k
fviz_nbclust(data_scaled, kmeans, method = "wss") +
  labs(title = "Elbow Method for Optimal Number of Clusters")

# Re-seed right before the final fit so its result does not depend on how many
# random numbers the elbow scan consumed
set.seed(123)
# Number of clusters, chosen by reading the elbow plot above
k <- 3
# nstart = 25 tries 25 random initialisations and keeps the best solution
km_result <- kmeans(data_scaled, centers = k, nstart = 25)
# Attach each row's k-means assignment to the cleaned data as a factor column.
# data_clean and the matrix given to kmeans() have the same rows in the same
# order, so the labels align by position.
data_clustered <- mutate(
  data_clean,
  Cluster = as.factor(km_result$cluster)
)
# Draw the clusters; the scaled matrix supplies the coordinates for the plot
fviz_cluster(object = km_result, data = data_scaled) +
  labs(title = "K-means Clustering of Survey Data")

# Per-cluster mean of every numeric variable, one row per cluster.
# ID is left out: it appears to be a row identifier (TODO confirm), and the
# mean of an identifier describes nothing about a cluster.
# Output columns are named mean_<original column>.
cluster_summary <- data_clustered %>%
  group_by(Cluster) %>%
  summarise(
    across(where(is.numeric) & !any_of("ID"), mean, .names = "mean_{.col}")
  )
print(cluster_summary)
# The output below was recorded from a run made before ID was dropped from the
# summary, which is why it still shows a mean_ID column. Re-run to refresh it.
## # A tibble: 3 × 4
## Cluster mean_Knowledge mean_Visited mean_ID
## <fct> <dbl> <dbl> <dbl>
## 1 1 5.71 1 4.71
## 2 2 5.14 1 15.7
## 3 3 6.17 2 11.2