Load Required Libraries

library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(ggfortify)
library(cluster)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

Import Cleaned Data

# Load the cleaned survey export; readr reports the parsed column types below.
data <- read_csv(file = "Book1_cleaned.csv")
## Rows: 3 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (7): Respondent_ID, Q1_Use_MS365, Q2_Use_GoogleMore, Q3_Used_Amazon_Appl...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Keep only the rows in which every column is non-missing.
data <- data[complete.cases(data), , drop = FALSE]

Standardize and Cluster

# Standardize the survey answers and build a Ward hierarchical clustering.
# Column 1 (the respondent ID) is dropped because it is not a feature.
features <- data[, -1]
# scale() divides each column by its standard deviation, so a constant
# (zero-variance) column would become NaN and dist() would then silently
# drop it and rescale the distances. Remove such columns explicitly instead.
varies <- vapply(features, function(col) length(unique(col)) > 1, logical(1))
scaled_data <- scale(features[, varies, drop = FALSE])
# Euclidean distances between respondents (the dist() default).
dist_matrix <- dist(scaled_data)
clustering <- hclust(dist_matrix, method = "ward.D2")
plot(clustering, main = "Dendrogram")

Cut into 3 Clusters

# Cut the dendrogram so that three groups remain; one label per respondent.
clusters <- cutree(clustering, k = 3)
# Count how many respondents fall into each group.
table(clusters)
## clusters
## 1 2 3 
## 1 1 1
# Store each respondent's cluster label next to their answers.
data[["Cluster"]] <- clusters

Cluster Summary Stats

# Per-cluster mean and median of every survey item.
# The respondent ID (column 1) and the Cluster label are both left out of the
# summarised columns: Cluster is already the grouping key, and aggregating it
# as well only produced a second, redundant "Cluster" column in the result.
feature_cols <- setdiff(names(data)[-1], "Cluster")
aggregate(data[, feature_cols], by = list(Cluster = data$Cluster), FUN = mean)
##   Cluster Q1_Use_MS365 Q2_Use_GoogleMore Q3_Used_Amazon_Apple
## 1       1            1                 0                    0
## 2       2            1                 1                    0
## 3       3            0                 1                    1
##   Q4_Pref_GoogleDocs Q5_Pref_Teams Q6_Switched_Platform
## 1                  1             1                    0
## 2                  1             0                    1
## 3                  1             0                    1
aggregate(data[, feature_cols], by = list(Cluster = data$Cluster), FUN = median)
##   Cluster Q1_Use_MS365 Q2_Use_GoogleMore Q3_Used_Amazon_Apple
## 1       1            1                 0                    0
## 2       2            1                 1                    0
## 3       3            0                 1                    1
##   Q4_Pref_GoogleDocs Q5_Pref_Teams Q6_Switched_Platform
## 1                  1             1                    0
## 2                  1             0                    1
## 3                  1             0                    1

Export Results

# Save each respondent's cluster label.
write.csv(clusters, "clusterID.csv")
# Save the per-cluster means of the survey items. The Cluster label is
# excluded from the averaged columns because it is already the grouping key;
# averaging it too would write a duplicate "Cluster" column to the file.
feature_cols <- setdiff(names(data)[-1], "Cluster")
cluster_means <- aggregate(
  data[, feature_cols],
  by = list(Cluster = data$Cluster),
  FUN = mean
)
write.csv(cluster_means, "cluster_means.csv")

PCA Analysis (with Constant Column Fix)

# Remove constant (zero-variance) columns before PCA, because
# prcomp(scale. = TRUE) cannot rescale a column that has no variance.
# The ID (column 1) and the Cluster label are excluded by name, so the
# selection does not depend on Cluster being the last column.
predictors <- data[, setdiff(names(data)[-1], "Cluster")]
# A column is constant when it holds at most one distinct value; this avoids
# comparing a floating-point variance to zero, and vapply() guarantees a
# logical vector whatever the input.
constant_cols <- vapply(
  predictors,
  function(x) length(unique(x)) <= 1,
  logical(1)
)
predictors_clean <- predictors[, !constant_cols, drop = FALSE]

# Principal component analysis of the non-constant predictors, with each
# column rescaled to unit variance before the decomposition.
pca <- prcomp(x = predictors_clean, scale. = TRUE)
print(summary(pca))
## Importance of components:
##                           PC1    PC2      PC3
## Standard deviation     1.9552 1.0850 1.36e-16
## Proportion of Variance 0.7646 0.2354 0.00e+00
## Cumulative Proportion  0.7646 1.0000 1.00e+00
# Scree plot: the variance of each component, drawn as a line.
screeplot(pca, type = "lines", main = "Scree Plot")

# Biplot of the respondents and the variable loadings.
biplot(pca, scale = 0)

Discussion Questions

1. How many observations do we have in each cluster?

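Answer: The table(clusters) output shows the size of each cluster, i.e. how respondents are distributed across the segments. Here each of the three clusters contains exactly one respondent, because the cleaned dataset has only three rows.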

2. Why is it important to examine the means or medians of each cluster?

Answer: It helps us identify behavioral patterns and target specific groups with personalized strategies.

3. Should we use mean or median to analyze cluster differences?

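Answer: The median is more robust to outliers, while the mean is preferred when the data are symmetrically distributed. If the survey items are coded 0/1, as the values shown above suggest, the mean of an item is simply the proportion of respondents in the cluster who answered 1, which makes it the more informative summary here.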

4. What summary measures are good for targeting strategy?

Answer: Cluster means or medians across survey responses give insight into the characteristics of each segment.

5. K-means vs Hierarchical Clustering: Which is better?

Answer: K-means is faster for large datasets. Hierarchical is visual and ideal for small to medium datasets with clear subgroup structures.

6. How many jobs use “cluster analysis”?

Answer: Hundreds of job listings in analytics, marketing, and data science mention cluster analysis as a required skill.

Advanced: Should we use data or data[,-1]?

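Answer: Use data[,-1] to exclude the ID column from analysis since it’s not a meaningful feature. Once the Cluster column has been added to data, data[,-1] also contains Cluster, so drop that column too (for example with data[,-c(1, ncol(data))]) when computing feature summaries.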

PCA Q1: What questions can this analysis answer?

Answer: PCA helps us identify which variables explain the most variance and reveal hidden structure among customer traits. (James et al., 2013)

PCA Q2: How do we interpret PCA plots?

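Answer: Points close together are similar. The first few PCs explain most of the variation; here PC1 and PC2 account for 76.5% and 23.5% of the variance, so together they capture all of it. Loadings reveal which variables drive that variation. (ISLR p.404–405)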