library(data.table)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)
library(ggplot2)
library(radiant.data)
## Loading required package: magrittr
## Loading required package: lubridate
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
## Loading required package: tidyr
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:magrittr':
## 
##     extract
## 
## Attaching package: 'radiant.data'
## The following objects are masked from 'package:lubridate':
## 
##     month, wday
## The following object is masked from 'package:magrittr':
## 
##     set_attr
## The following object is masked from 'package:ggplot2':
## 
##     diamonds
## The following objects are masked from 'package:data.table':
## 
##     month, wday
## The following object is masked from 'package:base':
## 
##     date
# Data Loading ----
# Fetch the CSV once and cache it in the working directory; later runs read
# the cached copy instead of downloading again.
url <- "https://raw.githubusercontent.com/tanishq21/Mall-Customers/main/Mall_Customers.csv"
destfile <- "Mall_Customers.csv"
if (!file.exists(destfile)) {
  # download.file() can fail either by signalling an error or by returning a
  # non-zero status, and may leave a partial file behind in both cases.
  # Remove it so the file.exists() check above never mistakes a truncated
  # download for a valid cache on the next run.
  status <- tryCatch(
    download.file(url, destfile, mode = "wb"),
    error = function(e) e
  )
  failed <- inherits(status, "error") || !identical(as.integer(status), 0L)
  if (failed) {
    unlink(destfile)
    reason <- if (inherits(status, "error")) {
      conditionMessage(status)
    } else {
      paste("exit status", status)
    }
    stop("Download of ", url, " failed: ", reason, call. = FALSE)
  }
}
dt <- fread(destfile)
message("Loaded ", nrow(dt), " rows × ", ncol(dt), " cols from URL source")
## Loaded 200 rows × 5 cols from URL source
# Processing ----
# Drop every row that has an NA in any column.
# copy() gives `dt_clean` its own storage: na.omit() on a data.table can hand
# back the input object itself when there is nothing to drop, in which case
# the by-reference `:=` applied to `dt_clean` later in this script would also
# add the column to `dt`.
dt_clean <- copy(na.omit(dt))
message("After na.omit: ", nrow(dt_clean), " rows")
## After na.omit: 200 rows
# Feature columns used for clustering.
num_cols <- c("Age", "Annual Income (k$)", "Spending Score (1-100)")
num_data <- dt_clean[, num_cols, with = FALSE]
# scale() with its defaults centres each column and divides it by its
# standard deviation, putting the three features on a comparable scale.
num_scaled <- scale(num_data)

# Clustering ----
# Elbow plot (total within-cluster sum of squares) to help choose k.
# kmeans() starts from randomly chosen centres, so seed the RNG first;
# without this the curve can differ from run to run.
set.seed(386)
fviz_nbclust(num_scaled, kmeans, method = "wss")

# Re-seed immediately before the final fit so its result does not depend on
# how many random draws the elbow plot consumed.
set.seed(386)
k <- 6
km <- kmeans(num_scaled, centers = k, nstart = 25)
# Attach the cluster assignment to the cleaned data (added by reference).
dt_clean[, segment := factor(km$cluster)]

# Visualization ----
# `num_scaled` is the output of scale(), so prcomp() is told not to centre or
# scale again.
pca_out <- prcomp(num_scaled, center = FALSE, scale. = FALSE)
# Keep the first two principal-component scores for a 2-D scatter plot.
pca_df <- as.data.frame(pca_out$x[, 1:2])
names(pca_df) <- c("PC1", "PC2")
pca_df$segment <- dt_clean$segment

# The subtitle is built from `k` so the label cannot drift from the number of
# clusters that was actually fitted.
ggplot(pca_df, aes(PC1, PC2, color = segment)) +
  geom_point(alpha = 0.7, size = 2) +
  labs(
    title = "PCA of Mall Customers",
    subtitle = paste0("k-means segments (k=", k, ")")
  ) +
  theme_minimal()

# Cluster plot drawn by factoextra from the fitted k-means object.
# stand = FALSE because `num_scaled` has already been passed through scale().
# The title is built from `k` so it stays correct if the cluster count changes.
fviz_cluster(km, data = num_scaled,
             geom = "point",
             stand = FALSE,
             ellipse = TRUE,
             show.clust.cent = TRUE) +
  labs(title = paste0("Mall Customers Cluster Plot (k=", k, ")")) +
  theme_minimal()

# PCA Loadings ----
# Build the loadings table with base R: one row per input variable, the
# variable name in a leading `Variable` column, then one column per principal
# component. This avoids relying on rownames_to_column(), a tibble helper;
# tibble is never attached explicitly in this script, so that function is
# presumably only in scope through another package's re-export.
loadings <- data.frame(
  Variable = rownames(pca_out$rotation),
  pca_out$rotation,
  row.names = NULL,        # plain 1..n row names instead of variable names
  check.names = FALSE,
  stringsAsFactors = FALSE
)
knitr::kable(loadings, caption = "PCA Loadings for Each Variable and Principal Component")
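## Table: PCA Loadings for Each Variable and Principal Component
##
## | Variable               |        PC1 |        PC2 |        PC3 |
## |:-----------------------|-----------:|-----------:|-----------:|
## | Age                    |  0.7063823 |  0.0301412 | -0.7071884 |
## | Annual Income (k$)     | -0.0480240 |  0.9988316 | -0.0053979 |
## | Spending Score (1-100) | -0.7061995 | -0.0377750 | -0.7070045 |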