library(data.table)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)
library(ggplot2)
library(radiant.data)
## Loading required package: magrittr
## Loading required package: lubridate
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
## Loading required package: tidyr
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:magrittr':
##
## extract
##
## Attaching package: 'radiant.data'
## The following objects are masked from 'package:lubridate':
##
## month, wday
## The following object is masked from 'package:magrittr':
##
## set_attr
## The following object is masked from 'package:ggplot2':
##
## diamonds
## The following objects are masked from 'package:data.table':
##
## month, wday
## The following object is masked from 'package:base':
##
## date
# Data Loading ----
# Fetch the Mall Customers CSV once and reuse the local copy on later runs.
url <- "https://raw.githubusercontent.com/tanishq21/Mall-Customers/main/Mall_Customers.csv"
destfile <- "Mall_Customers.csv"
if (!file.exists(destfile)) {
  # Download to a temporary file first and only copy it into place on success.
  # Writing straight to `destfile` would leave a partial file behind after an
  # interrupted download, and the file.exists() check above would then accept
  # that truncated file on every later run.
  tmpfile <- tempfile(fileext = ".csv")
  status <- tryCatch(
    download.file(url, tmpfile, mode = "wb"),
    error = function(e) {
      unlink(tmpfile)
      stop("Download of ", url, " failed: ", conditionMessage(e), call. = FALSE)
    }
  )
  # download.file() reports success as 0; anything else is a failed transfer.
  if (status != 0L || !file.copy(tmpfile, destfile, overwrite = TRUE)) {
    unlink(c(tmpfile, destfile))
    stop("Could not save ", url, " to ", destfile, call. = FALSE)
  }
  unlink(tmpfile)
}
dt <- fread(destfile)
message("Loaded ", nrow(dt), " rows × ", ncol(dt), " cols from URL source")
## Loaded 200 rows × 5 cols from URL source
# Processing ----
# Drop every row that has a missing value in any column.
# copy() guarantees dt_clean is an independent table: data.table's na.omit()
# may hand back the input object itself when no rows are removed, in which
# case a later `:=` on dt_clean would also modify `dt` by reference.
dt_clean <- copy(na.omit(dt))
message("After na.omit: ", nrow(dt_clean), " rows")
## After na.omit: 200 rows
# Numeric features used for clustering and PCA.
num_cols <- c(
  "Age",
  "Annual Income (k$)",
  "Spending Score (1-100)"
)
# Select those columns by name, then centre and scale each one with scale()'s
# defaults so they contribute comparably to the distance calculations.
num_data <- dt_clean[, .SD, .SDcols = num_cols]
num_scaled <- scale(num_data)
# Clustering ----
# Seed before the elbow plot as well: fviz_nbclust() fits kmeans() once per
# candidate k, each fit starting from randomly chosen centres, so without a
# seed the within-cluster sum-of-squares curve changes from run to run.
set.seed(386)
fviz_nbclust(num_scaled, kmeans, method = "wss")

# Re-seed immediately before the final fit so the chosen solution does not
# depend on how many random numbers the elbow plot consumed.
set.seed(386)
k <- 6
# nstart = 25 tries 25 random initialisations and keeps the best one.
km <- kmeans(num_scaled, centers = k, nstart = 25)
# Attach the cluster label to each customer (adds the column by reference).
dt_clean[, segment := factor(km$cluster)]
# Visualization ----
# PCA on the already standardised matrix, so no further centring or scaling.
pca_out <- prcomp(num_scaled, center = FALSE, scale. = FALSE)

# First two principal-component scores plus the k-means segment of each row.
pca_df <- data.frame(
  PC1 = pca_out$x[, 1],
  PC2 = pca_out$x[, 2],
  segment = dt_clean$segment
)

# Build the labels from `k` so they cannot drift from the fitted model, and
# use the same "k=" notation in both plots (the subtitle previously read
# "n=6", which reads like a sample size).
ggplot(pca_df, aes(PC1, PC2, color = segment)) +
  geom_point(alpha = 0.7, size = 2) +
  labs(
    title = "PCA of Mall Customers",
    subtitle = paste0("k-means segments (k=", k, ")")
  ) +
  theme_minimal()

# factoextra's cluster plot of the same fit; stand = FALSE because the data
# were standardised upstream.
fviz_cluster(
  km,
  data = num_scaled,
  geom = "point",
  stand = FALSE,
  ellipse = TRUE,
  show.clust.cent = TRUE
) +
  labs(title = paste0("Mall Customers Cluster Plot (k=", k, ")")) +
  theme_minimal()

# PCA Loadings ----
# Turn the rotation matrix into a data frame, with the variable names moved
# from the row names into a "Variable" column, then print it as a table.
loadings <- pca_out$rotation %>%
  as.data.frame() %>%
  rownames_to_column(var = "Variable")
knitr::kable(
  loadings,
  caption = "PCA Loadings for Each Variable and Principal Component"
)
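## Table: PCA Loadings for Each Variable and Principal Component
##
## | Variable               |        PC1 |        PC2 |        PC3 |
## |:-----------------------|-----------:|-----------:|-----------:|
## | Age                    |  0.7063823 |  0.0301412 | -0.7071884 |
## | Annual Income (k$)     | -0.0480240 |  0.9988316 | -0.0053979 |
## | Spending Score (1-100) | -0.7061995 | -0.0377750 | -0.7070045 |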