Load Packages

library(readr)
library(dplyr)
library(ggplot2)

Import Data

# Read the raw survey export; column-type messages are suppressed.
mydata <- read_csv("Survey_Results.csv", show_col_types = FALSE)

# Drop the first data row when it merely repeats the column label
# (presumably a second header row in the export -- confirm against the file).
# isTRUE() keeps the check from erroring when the file has no data rows or
# when the first Progress value is NA.
if (isTRUE(mydata$Progress[1] == "Progress")) {
  mydata <- mydata[-1, ]
}

# Add a sequential respondent ID as the first column so rows can be traced
# after filtering and clustering.
mydata <- mydata %>%
  mutate(ID = row_number(), .before = 1)

# Inspect the column types of the imported data.
str(mydata)
## tibble [34 × 11] (S3: tbl_df/tbl/data.frame)
##  $ ID              : int [1:34] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Progress        : chr [1:34] "100" "100" "100" "100" ...
##  $ Duration_seconds: chr [1:34] "88" "54" "125" "55" ...
##  $ Q2              : chr [1:34] "Yes" "Yes" "Yes" "Yes" ...
##  $ Q3              : chr [1:34] "Female" "Female" "Female" "Female" ...
##  $ Q4              : chr [1:34] "Less than 1 year" "1 - 5 years" "1 - 5 years" "6 - 10 years" ...
##  $ Q5              : chr [1:34] "I love my job and I love people." "I love my job and I love people." "I love my job and I love people." "I don't love all people, but I am good at what I do." ...
##  $ Q6              : chr [1:34] "No" "No" "No" "No" ...
##  $ Q7              : chr [1:34] "Local" "Chain" "Chain" "Local" ...
##  $ Q8              : chr [1:34] "Yes" "Yes" "No" "Yes" ...
##  $ Q9              : chr [1:34] "Tip goes up" "Tip goes up" "Makes no difference" "Tip goes up" ...
# Column-wise summary. Every column except ID is character at this point, so
# only length/class/mode are reported for those columns.
summary(mydata)
##        ID          Progress         Duration_seconds        Q2           
##  Min.   : 1.00   Length:34          Length:34          Length:34         
##  1st Qu.: 9.25   Class :character   Class :character   Class :character  
##  Median :17.50   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :17.50                                                           
##  3rd Qu.:25.75                                                           
##  Max.   :34.00                                                           
##       Q3                 Q4                 Q5                 Q6           
##  Length:34          Length:34          Length:34          Length:34         
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##       Q7                 Q8                 Q9           
##  Length:34          Length:34          Length:34         
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
## 

Prepare Data

# Build the clustering input: every survey column except ID, treated as a
# categorical variable. Columns with a single observed (non-missing) value are
# dropped because they cannot separate respondents.
analysis_data <- mydata %>%
  select(-ID) %>%
  mutate(across(everything(), as.factor)) %>%
  select(where(~ n_distinct(na.omit(.)) > 1))

# Flag complete responses once and apply the same mask to both tables.
# Tibbles do not keep the original row names when rows are removed, so
# rownames() of an na.omit() result is just 1..n and cannot be used to look the
# surviving rows up in mydata. The shared logical mask guarantees that row i of
# analysis_data and row i of mydata_clean describe the same respondent.
complete_rows <- complete.cases(analysis_data)
analysis_data <- analysis_data[complete_rows, ]
mydata_clean <- mydata[complete_rows, ]

# Expand the factors into numeric indicator (dummy) columns, no intercept.
cluster_matrix <- model.matrix(~ . - 1, data = analysis_data)

# Remove indicator columns that are constant after filtering (they cannot be
# scaled). drop = FALSE keeps the result a matrix even if one column remains.
varying_cols <- apply(cluster_matrix, 2, sd) > 0
cluster_matrix <- cluster_matrix[, varying_cols, drop = FALSE]

Scale Data

# Standardise each indicator column (scale() centres and scales columns by
# default) so that all columns contribute on a comparable scale to distances.
scaled_data <- scale(cluster_matrix)

Hierarchical Clustering

# Pairwise distances between respondents (rows of the scaled matrix);
# dist() uses Euclidean distance by default.
distance_matrix <- dist(scaled_data)

# Agglomerative hierarchical clustering with complete linkage.
seg.hclust <- hclust(distance_matrix, method = "complete")

# Draw the dendrogram to judge how many clusters are reasonable.
plot(seg.hclust, main = "Hierarchical Clustering Dendrogram")

Cluster Membership

# Cut the dendrogram into three clusters; one cluster label per clustered row.
groups.3 <- cutree(seg.hclust, k = 3)

# Number of observations in each cluster.
table(groups.3)
## groups.3
##  1  2  3 
## 18 10  2
# Attach the cluster label to the respondent-level data. This relies on
# mydata_clean having one row per clustered observation, in the same order.
mydata_clean$Cluster <- groups.3

# Respondent IDs belonging to each of the three clusters.
mydata_clean$ID[groups.3 == 1]
##  [1]  1  2  4  5  8  9 13 14 17 19 20 22 23 24 25 27 28 29
mydata_clean$ID[groups.3 == 2]
##  [1]  3  6 10 12 15 16 18 21 26 30
mydata_clean$ID[groups.3 == 3]
## [1]  7 11

Cluster Summary

# Profile each cluster: for every column, list the distinct values observed
# among the cluster's members as a single comma-separated string.
cluster_summary <- summarise(
  group_by(mydata_clean, Cluster),
  across(everything(), function(values) paste(unique(values), collapse = ", "))
)

# Print the per-cluster profile.
cluster_summary
## # A tibble: 3 × 12
##   Cluster ID       Progress Duration_seconds Q2    Q3    Q4    Q5    Q6    Q7   
##     <int> <chr>    <chr>    <chr>            <chr> <chr> <chr> <chr> <chr> <chr>
## 1       1 1, 2, 4… 100      88, 54, 55, 200… Yes   Fema… Less… I lo… No, … Loca…
## 2       2 3, 6, 1… 100      125, 48, 455, 1… Yes   Fema… 1 - … I lo… No, … Chai…
## 3       3 7, 11    100      117, 28          Yes   Fema… Less… I lo… No    Loca…
## # ℹ 2 more variables: Q8 <chr>, Q9 <chr>

Export Results

# Save the respondent-level table (including the Cluster column) and the
# per-cluster profile; row names are omitted from both files.
write.csv(
  x = mydata_clean,
  file = "survey_results_with_clusters.csv",
  row.names = FALSE
)
write.csv(
  x = cluster_summary,
  file = "cluster_summary.csv",
  row.names = FALSE
)

Discussion Questions

1. How many observations do we have in each cluster?

# Count how many observations carry each cluster label.
table(groups.3)
## groups.3
##  1  2  3 
## 18 10  2

Discussion Questions for you

  1. We can look at the medians (or means) for the variables in each cluster. Why is this important?

Answer: Comparing the mean or median of each variable across clusters shows how the clusters differ and helps identify patterns in behavior and preferences. This information can then be turned into customer profiles for marketing purposes.

  1. Do you think the mean or the median should be used when analyzing the differences among clusters? Why?

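Answer: For comparing clusters, the median is usually better than the mean because it is not pulled by extreme values. This matters most for small clusters, where a single unusual response can shift the mean substantially.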

  1. Now we need to understand the common characteristics of each cluster. Our goal is to build a targeting strategy using the profiles of each cluster. What summary measures of each cluster are appropriate in a descriptive sense?

Answer: Key behavioral variables are appropriate, such as whether customers would recommend the store, whether they would come back again, and whether they prefer delivery or pick-up. Demographics are also helpful for identifying our customers and target audiences.

  1. Any major differences between K-means clustering and Hierarchical clustering? Which one do you like better? Why? You may refer to the assigned readings.

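Answer: K-means requires the number of clusters to be chosen in advance, whereas hierarchical clustering builds a dendrogram that can be cut at any number of clusters afterwards. Hierarchical clustering is better for understanding the data, while K-means is better for executing segmentation at scale. I prefer hierarchical clustering, because our dataset is small and we are still figuring things out.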