Load Required Libraries

library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(ggfortify)

Import Data

# Load the customer segmentation data set from CSV into a tibble.
mydata <- read_csv(file = "customer_segmentation.csv")
## Rows: 22 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): ID, CS_helpful, Recommend, Come_again, All_Products, Profesionalis...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Standardize Data

# Standardize every variable (all columns except the leading ID column) to
# mean 0 and standard deviation 1 so that no single variable dominates the
# distance calculation.
use <- scale(mydata[, -1], center = TRUE, scale = TRUE)

# Pairwise Euclidean distances between the standardized observations.
dist_matrix <- dist(use)

# Cluster on these distances directly. Calling dist() a second time on
# as.matrix(dist_matrix) would treat each row of the distance matrix as a
# feature vector and measure distances between distance profiles instead of
# distances between observations.
d <- dist_matrix

Hierarchical Clustering and Dendrogram

# Build the hierarchical cluster tree from the distance object `d`
# (hclust() uses complete linkage when no method is given) and draw the
# resulting dendrogram.
seg.hclust <- hclust(d = d)
plot(x = seg.hclust, main = "Hierarchical Clustering Dendrogram")

Cut Tree Into 3 Clusters

# Cut the dendrogram into three groups, then count the members of each group.
groups.3 <- cutree(tree = seg.hclust, k = 3)
table(groups.3)
## groups.3
##  1  2  3 
## 17  2  3

List IDs per Cluster

# IDs of the observations assigned to each of the three hierarchical clusters.
mydata[["ID"]][groups.3 == 1]
##  [1]  1  2  3  6  7  8  9 10 11 12 13 14 15 16 17 18 21
mydata[["ID"]][groups.3 == 2]
## [1]  4 22
mydata[["ID"]][groups.3 == 3]
## [1]  5 19 20

Aggregate Stats by Cluster

# Mean of every variable (ID column excluded) within each cluster.
aggregate(mydata[, -1], by = list(groups.3), FUN = mean)
##   Group.1 CS_helpful Recommend Come_again All_Products Profesionalism
## 1       1   1.294118  1.117647   1.235294     1.823529       1.235294
## 2       2   3.000000  2.500000   1.500000     3.000000       1.500000
## 3       3   2.333333  1.666667   2.666667     3.000000       2.333333
##   Limitation Online_grocery delivery  Pick_up Find_items other_shops   Gender
## 1   1.352941       2.235294 2.235294 2.705882   1.294118    2.647059 1.176471
## 2   2.000000       3.000000 3.000000 2.500000   2.000000    1.500000 1.000000
## 3   2.000000       2.000000 3.000000 1.000000   2.000000    3.000000 2.000000
##        Age Education
## 1 2.411765  3.117647
## 2 2.500000  5.000000
## 3 2.666667  2.333333
# Median of every variable (ID column excluded) within each cluster.
aggregate(mydata[, -1], by = list(groups.3), FUN = median)
##   Group.1 CS_helpful Recommend Come_again All_Products Profesionalism
## 1       1          1       1.0        1.0            2            1.0
## 2       2          3       2.5        1.5            3            1.5
## 3       3          2       1.0        3.0            3            2.0
##   Limitation Online_grocery delivery Pick_up Find_items other_shops Gender Age
## 1          1              2        2     3.0          1         2.0      1 2.0
## 2          2              3        3     2.5          2         1.5      1 2.5
## 3          1              2        3     1.0          2         3.0      2 2.0
##   Education
## 1         2
## 2         5
## 3         2
# Keep the per-cluster means in an object so they can be exported.
cluster_means <- aggregate(mydata[, -1], by = list(groups.3), FUN = mean)

Export Cluster Results

# Save the cluster assignments and the per-cluster means as CSV files.
write.csv(x = groups.3, file = "clusterID.csv")
write.csv(x = cluster_means, file = "cluster_means.csv")

K-means Clustering, PCA, and Cluster Visualization

# k-means picks its starting centers at random, so fix the RNG seed to make
# the cluster assignments reproducible from run to run.
set.seed(123)

# Partition the observations (ID column excluded) into 3 clusters.
# nstart = 25 tries 25 random sets of starting centers and keeps the solution
# with the lowest total within-cluster sum of squares, which guards against
# settling in a poor local optimum.
fit <- kmeans(mydata[, -1], centers = 3, iter.max = 1000, nstart = 25)

# Number of observations in each k-means cluster.
table(fit$cluster)
## 
##  1  2  3 
##  7  5 10
# Bar chart of the k-means cluster sizes.
barplot(height = table(fit$cluster), col = "#336699")

# Principal components of the variables (ID column excluded), with each
# variable scaled to unit variance before the decomposition.
pca <- prcomp(mydata[, -1], scale. = TRUE)

# Convert the PCA result to a data frame (its PC1/PC2 columns are plotted
# below) and attach the k-means cluster label of every observation.
pca_data <- fortify(pca) %>%
  mutate(col = fit$cluster)

# Scatter plot of the first two principal components, filled by k-means cluster.
ggplot(data = pca_data) +
  geom_point(
    mapping = aes(x = PC1, y = PC2, fill = factor(col)),
    shape = 21,
    size = 3,
    colour = "#7f7f7f"
  ) +
  theme_bw(base_family = "Helvetica")

# Plot of the k-means fit with a frame drawn around each cluster.
autoplot(fit, data = mydata[, -1], frame = TRUE, frame.type = "norm")

Variance Explained by PCA

# Variance captured by each principal component (squared standard deviation)
# and each component's share of the total variance.
pca.var <- pca$sdev^2
pve <- prop.table(pca.var)

# Proportion of variance explained by each component.
plot(
  pve,
  type = "b",
  ylim = c(0, 1),
  xlab = "Principal Component",
  ylab = "Proportion of Variance Explained"
)

# Running total of the variance explained as components are added.
plot(
  cumsum(pve),
  type = "b",
  ylim = c(0, 1),
  xlab = "Principal Component",
  ylab = "Cumulative Proportion of Variance Explained"
)

# Save the PCA data frame (with cluster labels) as a CSV file.
write.csv(x = pca_data, file = "pca_data.csv")

Discussion Questions

1. How many observations do we have in each cluster?

Answer: According to the table(groups.3) output, the clusters differ considerably in size: Cluster 1 contains 17 observations, Cluster 2 contains 2, and Cluster 3 contains 3 (22 in total).

2. Why are the medians or means for the variables in each cluster important?

Answer: They help identify the defining characteristics of each cluster, revealing behavioral or preference trends useful for targeted marketing.

3. Should we use mean or median for analyzing cluster differences? Why?

Answer: Median is preferred for skewed data as it’s less sensitive to outliers, while mean works better with normally distributed variables.

4. What summary measures are appropriate in a descriptive sense for targeting strategy?

Answer: Mean or median values per variable for each cluster offer key insights for identifying the interests, demographics, or behaviors of target segments.

5. K-means vs Hierarchical Clustering: Which do you prefer?

Answer: K-means is computationally faster and great for large datasets, while hierarchical clustering provides a visual hierarchy. Choice depends on dataset size and need for visualization.

6. Keyword search on “cluster analysis” job titles:

Answer: Over 1,000 relevant job titles such as Data Analyst, Marketing Analyst, Customer Insights Manager, and Machine Learning Specialist involve clustering analysis skills.

Advanced: Should we use mydata or mydata[,-1] with aggregate?

Answer: mydata[,-1] should be used to exclude the ID column from statistical summaries as it’s not a variable of interest.

PCA-Based Discussion

1. What can we learn from PCA results?

Answer: PCA reveals which variables contribute most to variance. This helps in dimensionality reduction and understanding customer feature relationships for marketing insights (James et al., ISLR, p.404-405).

2. Interpret the PCA graphs:

Answer: PC1 and PC2 show major directions of variance. Observations clustered together share similar traits. Loadings and biplots help identify influential variables (ISLR, p.404).

References