library(readr)
library(dplyr)
library(cluster)
# Load the survey export, skipping the first three lines of the file
# (presumably non-data header rows -- confirm against the raw CSV).
# `col_names = FALSE` is required here: without it readr consumes the first
# remaining line as the header, which silently drops one respondent and
# produces value-derived column names such as `1...1` and `21`.
# With it, columns are named X1, X2, ... in file order.
# NOTE: console output pasted further down in this script was captured before
# this fix (15 rows, value-derived names) and should be regenerated.
mydata <- read_csv(
  "Group1Survey.csv",
  skip = 3,
  col_names = FALSE,
  show_col_types = FALSE
)

# Prepend a sequential respondent ID so cluster membership can be reported
# per row later on.
mydata <- mutate(mydata, ID = row_number(), .before = 1)

# Inspect column types and a preview of the values.
str(mydata)
## tibble [15 × 14] (S3: tbl_df/tbl/data.frame)
## $ ID : int [1:15] 1 2 3 4 5 6 7 8 9 10 ...
## $ 1...1 : num [1:15] 1 1 1 1 1 1 1 1 1 1 ...
## $ 21 : num [1:15] 21 21 21 20 31 22 20 22 21 21 ...
## $ 1...3 : num [1:15] 1 2 2 1 1 2 1 1 1 1 ...
## $ 1...4 : num [1:15] 1 2 1 1 1 2 1 1 1 1 ...
## $ 1...5 : num [1:15] 1 1 1 1 1 1 1 1 1 1 ...
## $ 1...6 : num [1:15] 1 2 1 1 1 1 1 2 1 1 ...
## $ 1...7 : num [1:15] 1 1 1 1 2 2 2 1 1 1 ...
## $ 5...8 : num [1:15] 5 5 0 6 7 7 7 6 1 5 ...
## $ 1...9 : num [1:15] 1 3 1 1 1 2 1 2 1 1 ...
## $ 5...10: num [1:15] 4 10 0 6 2 8 3 7 2 5 ...
## $ 1...11: num [1:15] 2 2 3 1 2 3 2 1 2 3 ...
## $ 1...12: num [1:15] 1 1 1 1 1 1 1 2 1 1 ...
## $ 1...13: num [1:15] 1 1 1 1 1 1 1 1 1 1 ...
# Per-column descriptive statistics as a quick sanity check on the import.
mydata %>% summary()
## ID 1...1 21 1...3 1...4
## Min. : 1.0 Min. :0.0000 Min. :20.0 Min. :1.000 Min. :1.000
## 1st Qu.: 4.5 1st Qu.:1.0000 1st Qu.:21.0 1st Qu.:1.000 1st Qu.:1.000
## Median : 8.0 Median :1.0000 Median :21.0 Median :1.000 Median :1.000
## Mean : 8.0 Mean :0.9333 Mean :21.8 Mean :1.267 Mean :1.133
## 3rd Qu.:11.5 3rd Qu.:1.0000 3rd Qu.:22.0 3rd Qu.:1.500 3rd Qu.:1.000
## Max. :15.0 Max. :1.0000 Max. :31.0 Max. :2.000 Max. :2.000
## 1...5 1...6 1...7 5...8 1...9
## Min. :1 Min. :1.0 Min. :1.000 Min. :0.000 Min. :1.000
## 1st Qu.:1 1st Qu.:1.0 1st Qu.:1.000 1st Qu.:5.000 1st Qu.:1.000
## Median :1 Median :1.0 Median :1.000 Median :5.000 Median :1.000
## Mean :1 Mean :1.2 Mean :1.333 Mean :4.933 Mean :1.333
## 3rd Qu.:1 3rd Qu.:1.0 3rd Qu.:2.000 3rd Qu.:7.000 3rd Qu.:1.500
## Max. :1 Max. :2.0 Max. :2.000 Max. :8.000 Max. :3.000
## 5...10 1...11 1...12 1...13
## Min. : 0.000 Min. :1.000 Min. :1.000 Min. :1
## 1st Qu.: 2.500 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1
## Median : 5.000 Median :2.000 Median :1.000 Median :1
## Mean : 4.733 Mean :1.933 Mean :1.133 Mean :1
## 3rd Qu.: 6.500 3rd Qu.:2.500 3rd Qu.:1.000 3rd Qu.:1
## Max. :10.000 Max. :3.000 Max. :2.000 Max. :1
# Build the clustering input: drop the row identifier, treat every answer as
# a categorical level, then discard columns with a single distinct value
# because they cannot separate respondents.
analysis_data <- mydata %>%
  select(!ID) %>%
  mutate(across(everything(), ~ as.factor(.x))) %>%
  select(where(function(column) n_distinct(column) > 1))

# Pairwise Gower dissimilarities between respondents.
distance_matrix <- daisy(x = analysis_data, metric = "gower")

# Complete-linkage hierarchical clustering on those dissimilarities.
seg.hclust <- hclust(d = distance_matrix, method = "complete")

# Dendrogram of the fitted tree; the empty `sub` replaces the default
# subtitle.
plot(
  x = seg.hclust,
  sub = "",
  xlab = "Survey Respondents",
  main = "Hierarchical Clustering Dendrogram"
)

# Cut the dendrogram into three clusters and count the respondents in each.
groups.3 <- cutree(tree = seg.hclust, k = 3)
table(groups.3)
## groups.3
## 1 2 3
## 11 2 2

# Store each respondent's cluster label alongside the survey answers.
mydata$Cluster <- groups.3

# Respondent IDs belonging to each of the three clusters.
with(mydata, ID[Cluster == 1])
## [1] 1 3 4 5 7 8 9 10 12 13 14
with(mydata, ID[Cluster == 2])
## [1] 2 11
with(mydata, ID[Cluster == 3])
## [1] 6 15
# Per-cluster profile: cluster size plus the most frequent answer (mode) for
# every survey column.
# `Size` must be excluded from `across()`: columns created earlier in the same
# `summarise()` call are visible to it, so selecting `-ID` alone re-summarised
# `Size` and turned the count into a character string.
# The modes themselves are character values because they are taken from the
# names of each column's frequency table.
cluster_summary <- mydata %>%
  group_by(Cluster) %>%
  summarise(
    Size = n(),
    across(
      -c(ID, Size),
      ~ names(sort(table(.), decreasing = TRUE))[1]
    )
  )
cluster_summary
## # A tibble: 3 × 15
## Cluster Size `1...1` `21` `1...3` `1...4` `1...5` `1...6` `1...7` `5...8`
## <int> <int> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 1 11 1 21 1 1 1 1 1 5
## 2 2 2 1 21 2 1 1 2 1 5
## 3 3 2 1 22 1 1 1 1 1 0
## # ℹ 5 more variables: `1...9` <chr>, `5...10` <chr>, `1...11` <chr>,
## # `1...12` <chr>, `1...13` <chr>
# Persist both results: the respondent-level data with cluster labels and the
# per-cluster profile table. Row names are omitted from both files.
write.csv(
  x = mydata,
  file = "survey_results_with_clusters.csv",
  row.names = FALSE
)
write.csv(
  x = cluster_summary,
  file = "cluster_summary.csv",
  row.names = FALSE
)