library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(ggfortify)
# Load the customer-segmentation survey into a tibble; the first column is the
# respondent ID and the remaining columns are numeric survey variables.
mydata <- read_csv(file = "customer_segmentation.csv")
## Rows: 22 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): ID, CS_helpful, Recommend, Come_again, All_Products, Profesionalis...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Hierarchical clustering of respondents into three segments.
#
# Drop the ID column and standardize every remaining variable (mean 0, sd 1)
# so that no single survey item dominates the distance computation.
use <- scale(mydata[, -1], center = TRUE, scale = TRUE)

# Pairwise Euclidean distances between respondents.
dist_matrix <- dist(use)

# Fix: this used to be `dist(as.matrix(dist_matrix))`, which computes distances
# between the ROWS OF THE DISTANCE MATRIX rather than between respondents.
# hclust() needs the respondent-level dissimilarities directly. `d` is kept as
# an alias so any later code that refers to it still works.
# NOTE: printed output recorded further down in this document was produced with
# the old double-distance computation and should be regenerated.
d <- dist_matrix

# Agglomerative clustering with hclust()'s default linkage method.
seg.hclust <- hclust(d)
plot(seg.hclust, main = "Hierarchical Clustering Dendrogram")

# Cut the tree into three groups and report the size of each group.
groups.3 <- cutree(seg.hclust, k = 3)
table(groups.3)
## groups.3
## 1 2 3
## 17 2 3
# Respondent IDs assigned to hierarchical cluster 1.
mydata[["ID"]][groups.3 == 1]
## [1] 1 2 3 6 7 8 9 10 11 12 13 14 15 16 17 18 21
# Respondent IDs assigned to hierarchical cluster 2.
mydata[["ID"]][groups.3 == 2]
## [1] 4 22
# Respondent IDs assigned to hierarchical cluster 3.
mydata[["ID"]][groups.3 == 3]
## [1] 5 19 20
# Per-cluster mean of every survey variable (ID column excluded).
aggregate(x = mydata[, -1], by = list(groups.3), FUN = mean)
## Group.1 CS_helpful Recommend Come_again All_Products Profesionalism
## 1 1 1.294118 1.117647 1.235294 1.823529 1.235294
## 2 2 3.000000 2.500000 1.500000 3.000000 1.500000
## 3 3 2.333333 1.666667 2.666667 3.000000 2.333333
## Limitation Online_grocery delivery Pick_up Find_items other_shops Gender
## 1 1.352941 2.235294 2.235294 2.705882 1.294118 2.647059 1.176471
## 2 2.000000 3.000000 3.000000 2.500000 2.000000 1.500000 1.000000
## 3 2.000000 2.000000 3.000000 1.000000 2.000000 3.000000 2.000000
## Age Education
## 1 2.411765 3.117647
## 2 2.500000 5.000000
## 3 2.666667 2.333333
# Per-cluster median of every survey variable (ID column excluded).
aggregate(x = mydata[, -1], by = list(groups.3), FUN = median)
## Group.1 CS_helpful Recommend Come_again All_Products Profesionalism
## 1 1 1 1.0 1.0 2 1.0
## 2 2 3 2.5 1.5 3 1.5
## 3 3 2 1.0 3.0 3 2.0
## Limitation Online_grocery delivery Pick_up Find_items other_shops Gender Age
## 1 1 2 2 3.0 1 2.0 1 2.0
## 2 2 3 3 2.5 2 1.5 1 2.5
## 3 1 2 3 1.0 2 3.0 2 2.0
## Education
## 1 2
## 2 5
## 3 2
# Keep the per-cluster means in a variable so they can be exported.
cluster_means <- aggregate(x = mydata[, -1], by = list(groups.3), FUN = mean)

# Export the cluster assignment vector and the cluster profile table as CSV.
write.csv(groups.3, file = "clusterID.csv")
write.csv(cluster_means, file = "cluster_means.csv")
# K-means clustering of respondents into three segments (ID column excluded).
#
# kmeans() chooses its starting centers at random, so without a fixed seed the
# cluster labels and sizes can change from run to run. Fix the seed, and use
# several random starts so the best of them is kept rather than whichever
# single start happened to be drawn.
# NOTE: the cluster sizes recorded below in this document came from an
# unseeded single-start run and may differ after re-running.
# NOTE(review): the hierarchical clustering earlier in this file works on
# standardized data while this call uses the raw columns - confirm intended.
set.seed(123)
fit <- kmeans(mydata[, -1], centers = 3, iter.max = 1000, nstart = 25)

# Number of respondents assigned to each k-means cluster.
table(fit$cluster)
##
## 1 2 3
## 7 5 10
# Bar chart of the k-means cluster sizes.
barplot(table(fit$cluster), col = "#336699")

# Principal component analysis on the non-ID columns, each scaled to unit
# variance. The argument is spelled `scale.` in full; the previous
# `scale = TRUE` only worked through partial argument matching.
pca <- prcomp(mydata[, -1], scale. = TRUE)

# Turn the PCA result into a data frame and attach each respondent's k-means
# cluster label in a column named `col`.
pca_data <- mutate(fortify(pca), col = fit$cluster)

# Scatter plot of the first two principal components, filled by cluster.
ggplot(pca_data) +
  geom_point(
    aes(x = PC1, y = PC2, fill = factor(col)),
    size = 3, col = "#7f7f7f", shape = 21
  ) +
  theme_bw(base_family = "Helvetica")

# ggfortify's view of the k-means fit, with a frame drawn around each cluster.
autoplot(fit, data = mydata[, -1], frame = TRUE, frame.type = "norm")
# Variance of each principal component and its share of the total variance.
pca.var <- pca$sdev^2
pve <- pca.var / sum(pca.var)

# Scree plot: proportion of variance explained by each component.
plot(
  pve,
  type = "b",
  ylim = c(0, 1),
  xlab = "Principal Component",
  ylab = "Proportion of Variance Explained"
)

# Running total of the variance explained by the first k components.
plot(
  cumsum(pve),
  type = "b",
  ylim = c(0, 1),
  xlab = "Principal Component",
  ylab = "Cumulative Proportion of Variance Explained"
)

# Export the PCA data frame (including the cluster label column) as CSV.
write.csv(pca_data, file = "pca_data.csv")
Answer: Each cluster contains a different number of observations, as shown by the table(groups.3) output above: Cluster 1 = 17, Cluster 2 = 2, Cluster 3 = 3.
Answer: They help identify the defining characteristics of each cluster, revealing behavioral or preference trends useful for targeted marketing.
Answer: Median is preferred for skewed data as it’s less sensitive to outliers, while mean works better with normally distributed variables.
Answer: Mean or median values per variable for each cluster offer key insights for identifying the interests, demographics, or behaviors of target segments.
Answer: K-means is computationally faster and great for large datasets, while hierarchical clustering provides a visual hierarchy. Choice depends on dataset size and need for visualization.
Answer: Over 1,000 relevant job titles such as Data Analyst, Marketing Analyst, Customer Insights Manager, and Machine Learning Specialist involve clustering analysis skills.
`mydata` or `mydata[,-1]` with aggregate?
Answer: `mydata[,-1]` should be used to exclude the ID column from statistical summaries, as it’s not a variable of interest.
Answer: PCA reveals which variables contribute most to variance. This helps in dimensionality reduction and understanding customer feature relationships for marketing insights (James et al., ISLR, p.404-405).
Answer: PC1 and PC2 show major directions of variance. Observations clustered together share similar traits. Loadings and biplots help identify influential variables (ISLR, p.404).