library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(cluster)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Load data
data <- read.csv("cloud1.csv")

# Remove every row with a missing value in any column. The same cleaned rows
# are clustered below, so cluster labels can be attached back to data_clean
# row-for-row.
data_clean <- na.omit(data)

# Select the numeric columns as clustering features, leaving out "ID".
# NOTE(review): "ID" is presumably a row/respondent identifier, not a
# measurement; clustering on it groups rows by their position in the file.
# any_of() makes the exclusion a no-op if the column is absent. Confirm.
data_numeric <- data_clean %>%
  select(where(is.numeric)) %>%
  select(-any_of("ID"))

# Fail early with a clear message instead of an obscure error inside kmeans()
if (nrow(data_numeric) == 0 || ncol(data_numeric) == 0) {
  stop("No complete rows or numeric feature columns left to cluster",
       call. = FALSE)
}

# Standardize each feature (center to mean 0, scale to sd 1) so that no
# variable dominates the Euclidean distances used by k-means
data_scaled <- scale(data_numeric)
# Elbow method to find the optimal number of clusters.
# k-means starts from random centers, so seed the RNG first; otherwise the
# within-cluster sum-of-squares curve changes from run to run.
set.seed(123)

# nstart is forwarded to kmeans() so each candidate k is fitted with the same
# number of random restarts as the final model
fviz_nbclust(data_scaled, kmeans, method = "wss", nstart = 25) +
  labs(title = "Elbow Method for Optimal Number of Clusters")

# Number of clusters, read off the elbow plot
k <- 3

# Fix the RNG state right before fitting so the k-means result is repeatable
set.seed(123)

# Fit k-means on the standardized features; 25 random starts are tried and
# the best solution is kept
km_result <- kmeans(x = data_scaled, centers = k, nstart = 25)

# Attach each row's cluster assignment (as a factor) to the cleaned data.
# This relies on data_scaled having been built from the rows of data_clean,
# in the same order.
data_clustered <- mutate(
  data_clean,
  Cluster = as.factor(km_result$cluster)
)
# Draw the fitted clusters in two dimensions (PCA-based view of the
# standardized features), with a descriptive title
fviz_cluster(object = km_result, data = data_scaled) +
  labs(
    title = "K-means Clustering of Survey Data"
  )

# Per-cluster mean of every numeric column; result columns are named
# mean_<column>
cluster_summary <- summarise(
  group_by(data_clustered, Cluster),
  across(where(is.numeric), list(mean = mean), .names = "{.fn}_{.col}")
)

# Show the summary table
print(cluster_summary)
## # A tibble: 3 × 4
##   Cluster mean_Knowledge mean_Visited mean_ID
##   <fct>            <dbl>        <dbl>   <dbl>
## 1 1                 5.71            1    4.71
## 2 2                 5.14            1   15.7 
## 3 3                 6.17            2   11.2