library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(ggfortify)
library(cluster)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Load the cleaned survey export; the first column is Respondent_ID and the
# remaining columns are the numeric question responses.
data <- read_csv(file = "Book1_cleaned.csv")
## Rows: 3 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (7): Respondent_ID, Q1_Use_MS365, Q2_Use_GoogleMore, Q3_Used_Amazon_Appl...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Keep only respondents who answered every question.
data <- data[complete.cases(data), ]

# Standardise the survey answers (everything except the ID in column 1).
# scale() divides each column by its standard deviation, so a question that
# every respondent answered identically would become NaN. Such a column cannot
# separate respondents, so it is dropped before scaling. Cluster membership is
# unaffected; merge heights are no longer inflated by dist() compensating for
# the unusable column.
responses <- data[, -1]
varies <- vapply(responses, function(x) isTRUE(var(x) > 0), logical(1))
scaled_data <- scale(responses[, varies, drop = FALSE])

# Ward hierarchical clustering on the pairwise distances between respondents.
dist_matrix <- dist(scaled_data)
clustering <- hclust(dist_matrix, method = "ward.D2")
plot(clustering, main = "Dendrogram")

# Cut the tree into three clusters and show how many respondents each holds.
clusters <- cutree(clustering, k = 3)
table(clusters)
## clusters
## 1 2 3
## 1 1 1
# Attach each respondent's cluster label to the survey data.
data$Cluster <- clusters

# Question columns only: drop the ID (column 1) and the Cluster label itself.
# Passing Cluster to aggregate() as a value column as well as the grouping key
# yields a duplicate "Cluster" column in every summary table.
question_cols <- setdiff(names(data)[-1], "Cluster")

# Per-cluster mean of every question.
cluster_means <- aggregate(
  data[, question_cols],
  by = list(Cluster = data$Cluster),
  FUN = mean
)
cluster_means
## Cluster Q1_Use_MS365 Q2_Use_GoogleMore Q3_Used_Amazon_Apple
## 1 1 1 0 0
## 2 2 1 1 0
## 3 3 0 1 1
## Q4_Pref_GoogleDocs Q5_Pref_Teams Q6_Switched_Platform
## 1 1 1 0
## 2 1 0 1
## 3 1 0 1

# Per-cluster median of every question.
aggregate(
  data[, question_cols],
  by = list(Cluster = data$Cluster),
  FUN = median
)
## Cluster Q1_Use_MS365 Q2_Use_GoogleMore Q3_Used_Amazon_Apple
## 1 1 1 0 0
## 2 2 1 1 0
## 3 3 0 1 1
## Q4_Pref_GoogleDocs Q5_Pref_Teams Q6_Switched_Platform
## 1 1 1 0
## 2 1 0 1
## 3 1 0 1

# Persist the cluster assignments and the per-cluster means. The means table
# is reused rather than recomputed so the file matches what was printed above.
write.csv(clusters, "clusterID.csv")
write.csv(cluster_means, "cluster_means.csv")
# Build the PCA input from the question columns only. The ID (column 1) and
# the Cluster label are excluded by name, so this does not depend on where
# the Cluster column sits in `data`.
predictors <- data[, setdiff(names(data)[-1], "Cluster"), drop = FALSE]

# Remove constant (zero-variance) columns before PCA: prcomp(scale. = TRUE)
# cannot rescale them. vapply() always returns a logical vector, whereas
# sapply() changes its return type on empty input.
constant_cols <- vapply(predictors, function(x) var(x) == 0, logical(1))
predictors_clean <- predictors[, !constant_cols, drop = FALSE]

# Run PCA on the standardised predictors and report variance explained.
pca <- prcomp(predictors_clean, scale. = TRUE)
summary(pca)
## Importance of components:
## PC1 PC2 PC3
## Standard deviation 1.9552 1.0850 1.36e-16
## Proportion of Variance 0.7646 0.2354 0.00e+00
## Cumulative Proportion 0.7646 1.0000 1.00e+00
# Scree plot of the component variances, drawn as a line.
screeplot(pca, type = "lines", main = "Scree Plot")

# Biplot of the observations and variable loadings from the PCA fit.
biplot(pca, scale = 0)
Answer: Each cluster’s size is displayed in the `table(clusters)` output. This helps us see how the respondents are distributed across the clusters.
Answer: It helps us identify behavioral patterns and target specific groups with personalized strategies.
Answer: Median is more robust to outliers, while mean is preferred if the data is symmetrically distributed.
Answer: Cluster means or medians across survey responses give insight into the characteristics of each segment.
Answer: K-means is faster for large datasets. Hierarchical is visual and ideal for small to medium datasets with clear subgroup structures.
Answer: Hundreds of job listings in analytics, marketing, and data science mention cluster analysis as a required skill.
`data` or `data[,-1]`?
Answer: Use `data[,-1]` to exclude the ID column from the analysis, since it is not a meaningful feature.
Answer: PCA helps us identify which variables explain the most variance and reveal hidden structure among customer traits. (James et al., 2013)
Answer: Points close together are similar. The first few PCs explain most of the variation. Loadings reveal which variables drive that variation. (James et al., 2013, pp. 404–405)