library(readr)
library(dplyr)
library(cluster)
# Load the survey export, skipping the first three lines of the file
# (presumably preamble -- confirm against the raw CSV).
#
# `col_names = FALSE` is needed because the first line after the skip holds a
# respondent's answers, not a header. With readr's default
# (`col_names = TRUE`) that row was turned into column names such as `1...1`,
# `21` and `5...8`, and the respondent was silently dropped from the data.
# Columns are now auto-named X1, X2, ...; the code visible in this script
# only refers to `ID` and `Cluster` by name, so nothing else needs to change.
# Console output recorded before this fix shows one row fewer and the old
# value-derived column names.
mydata <- read_csv(
  "Group1Survey.csv",
  skip = 3,
  col_names = FALSE,
  show_col_types = FALSE
)

# Add a sequential respondent identifier as the first column.
mydata <- mydata %>%
  mutate(ID = row_number(), .before = 1)

# Inspect the column types and the first few values of the loaded survey data.
str(object = mydata)
## tibble [15 × 14] (S3: tbl_df/tbl/data.frame)
##  $ ID    : int [1:15] 1 2 3 4 5 6 7 8 9 10 ...
##  $ 1...1 : num [1:15] 1 1 1 1 1 1 1 1 1 1 ...
##  $ 21    : num [1:15] 21 21 21 20 31 22 20 22 21 21 ...
##  $ 1...3 : num [1:15] 1 2 2 1 1 2 1 1 1 1 ...
##  $ 1...4 : num [1:15] 1 2 1 1 1 2 1 1 1 1 ...
##  $ 1...5 : num [1:15] 1 1 1 1 1 1 1 1 1 1 ...
##  $ 1...6 : num [1:15] 1 2 1 1 1 1 1 2 1 1 ...
##  $ 1...7 : num [1:15] 1 1 1 1 2 2 2 1 1 1 ...
##  $ 5...8 : num [1:15] 5 5 0 6 7 7 7 6 1 5 ...
##  $ 1...9 : num [1:15] 1 3 1 1 1 2 1 2 1 1 ...
##  $ 5...10: num [1:15] 4 10 0 6 2 8 3 7 2 5 ...
##  $ 1...11: num [1:15] 2 2 3 1 2 3 2 1 2 3 ...
##  $ 1...12: num [1:15] 1 1 1 1 1 1 1 2 1 1 ...
##  $ 1...13: num [1:15] 1 1 1 1 1 1 1 1 1 1 ...
# Per-column summary statistics for the loaded survey data.
summary(object = mydata)
##        ID           1...1              21           1...3           1...4      
##  Min.   : 1.0   Min.   :0.0000   Min.   :20.0   Min.   :1.000   Min.   :1.000  
##  1st Qu.: 4.5   1st Qu.:1.0000   1st Qu.:21.0   1st Qu.:1.000   1st Qu.:1.000  
##  Median : 8.0   Median :1.0000   Median :21.0   Median :1.000   Median :1.000  
##  Mean   : 8.0   Mean   :0.9333   Mean   :21.8   Mean   :1.267   Mean   :1.133  
##  3rd Qu.:11.5   3rd Qu.:1.0000   3rd Qu.:22.0   3rd Qu.:1.500   3rd Qu.:1.000  
##  Max.   :15.0   Max.   :1.0000   Max.   :31.0   Max.   :2.000   Max.   :2.000  
##      1...5       1...6         1...7           5...8           1...9      
##  Min.   :1   Min.   :1.0   Min.   :1.000   Min.   :0.000   Min.   :1.000  
##  1st Qu.:1   1st Qu.:1.0   1st Qu.:1.000   1st Qu.:5.000   1st Qu.:1.000  
##  Median :1   Median :1.0   Median :1.000   Median :5.000   Median :1.000  
##  Mean   :1   Mean   :1.2   Mean   :1.333   Mean   :4.933   Mean   :1.333  
##  3rd Qu.:1   3rd Qu.:1.0   3rd Qu.:2.000   3rd Qu.:7.000   3rd Qu.:1.500  
##  Max.   :1   Max.   :2.0   Max.   :2.000   Max.   :8.000   Max.   :3.000  
##      5...10           1...11          1...12          1...13 
##  Min.   : 0.000   Min.   :1.000   Min.   :1.000   Min.   :1  
##  1st Qu.: 2.500   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1  
##  Median : 5.000   Median :2.000   Median :1.000   Median :1  
##  Mean   : 4.733   Mean   :1.933   Mean   :1.133   Mean   :1  
##  3rd Qu.: 6.500   3rd Qu.:2.500   3rd Qu.:1.000   3rd Qu.:1  
##  Max.   :10.000   Max.   :3.000   Max.   :2.000   Max.   :1
# Build the clustering input in three steps:
#   1. drop the respondent identifier,
#   2. treat every remaining answer column as categorical (factor),
#   3. discard columns with a single distinct value.
analysis_data <- select(mydata, -ID)
analysis_data <- mutate(analysis_data, across(everything(), as.factor))
analysis_data <- select(
  analysis_data,
  where(function(answers) n_distinct(answers) > 1)
)

# Pairwise Gower dissimilarities between respondents.
distance_matrix <- daisy(x = analysis_data, metric = "gower")

# Hierarchical clustering with complete linkage on the dissimilarity matrix.
seg.hclust <- hclust(d = distance_matrix, method = "complete")

# Draw the dendrogram; `sub = ""` leaves the subtitle blank.
plot(
  x = seg.hclust,
  sub = "",
  xlab = "Survey Respondents",
  main = "Hierarchical Clustering Dendrogram"
)

# Cut the dendrogram into three clusters and count the respondents in each.
groups.3 <- cutree(tree = seg.hclust, k = 3)

table(groups.3)
## groups.3
##  1  2  3 
## 11  2  2
# Store each respondent's cluster membership alongside their answers.
mydata$Cluster <- groups.3

# Respondent IDs belonging to each of the three clusters.
mydata[["ID"]][groups.3 == 1L]
##  [1]  1  3  4  5  7  8  9 10 12 13 14
mydata[["ID"]][groups.3 == 2L]
## [1]  2 11
mydata[["ID"]][groups.3 == 3L]
## [1]  6 15
# Profile each cluster: its size plus the most frequent answer (the mode) for
# every survey column.
#
# `Size` is excluded from the `across()` selection on purpose. `across()` also
# sees columns created earlier in the same `summarise()` call, so with
# `across(-ID, ...)` the mode lambda was applied to `Size` as well and turned
# the count into a character string. `Cluster` is the grouping column and is
# never touched by `across()`.
#
# The mode is returned as a character string (it is taken from the names of
# the frequency table); on ties the value that `sort()` places first wins.
cluster_summary <- mydata %>%
  group_by(Cluster) %>%
  summarise(
    Size = n(),
    across(
      -c(ID, Size),
      ~ names(sort(table(.), decreasing = TRUE))[1]
    )
  )

cluster_summary
## # A tibble: 3 × 15
##   Cluster  Size `1...1` `21`  `1...3` `1...4` `1...5` `1...6` `1...7` `5...8`
##     <int> <int> <chr>   <chr> <chr>   <chr>   <chr>   <chr>   <chr>   <chr>  
## 1       1    11 1       21    1       1       1       1       1       5      
## 2       2     2 1       21    2       1       1       2       1       5      
## 3       3     2 1       22    1       1       1       1       1       0      
## # ℹ 5 more variables: `1...9` <chr>, `5...10` <chr>, `1...11` <chr>,
## #   `1...12` <chr>, `1...13` <chr>
# Export the respondent-level data (with cluster assignments) and the
# per-cluster profile to CSV files in the working directory.
write.csv(
  x = mydata,
  file = "survey_results_with_clusters.csv",
  row.names = FALSE
)
write.csv(
  x = cluster_summary,
  file = "cluster_summary.csv",
  row.names = FALSE
)