Market Segmentation

library(data.table)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:data.table':
## 
##     between, first, last

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(factoextra)

## Loading required package: ggplot2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(cluster)
library(ggplot2)
library(radiant.data)

## Loading required package: magrittr

## Loading required package: lubridate

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

## Loading required package: tidyr

## 
## Attaching package: 'tidyr'

## The following object is masked from 'package:magrittr':
## 
##     extract

## 
## Attaching package: 'radiant.data'

## The following objects are masked from 'package:lubridate':
## 
##     month, wday

## The following object is masked from 'package:magrittr':
## 
##     set_attr

## The following object is masked from 'package:ggplot2':
## 
##     diamonds

## The following objects are masked from 'package:data.table':
## 
##     month, wday

## The following object is masked from 'package:base':
## 
##     date

# Data loading ----
# Fetch the Mall Customers CSV once and cache it in the working directory;
# later runs read the cached copy instead of downloading again.
url <- "https://raw.githubusercontent.com/tanishq21/Mall-Customers/main/Mall_Customers.csv"
destfile <- "Mall_Customers.csv"
if (!file.exists(destfile)) {
  # Download to a temporary file first: writing straight to `destfile` would
  # let an interrupted transfer leave a truncated file behind, which the
  # `file.exists()` guard above would then accept as a valid cache forever.
  tmpfile <- tempfile(fileext = ".csv")
  status <- download.file(url, tmpfile, mode = "wb")
  copied <- status == 0L && file.copy(tmpfile, destfile, overwrite = TRUE)
  unlink(tmpfile)
  if (!copied) {
    # Remove any partially copied cache file before failing loudly.
    unlink(destfile)
    stop("Failed to download ", url, call. = FALSE)
  }
}
dt <- fread(destfile)
message("Loaded ", nrow(dt), " rows × ", ncol(dt), " cols from URL source")

## Loaded 200 rows × 5 cols from URL source

# Processing ----
# Drop every row that has a missing value in any column, then report how many
# rows are left.
dt_clean <- na.omit(dt)
message(sprintf("After na.omit: %d rows", nrow(dt_clean)))

## After na.omit: 200 rows

# Numeric features used for clustering. `scale()` with its defaults centres
# each column and divides it by its standard deviation.
num_cols <- c("Age", "Annual Income (k$)", "Spending Score (1-100)")
num_data <- dt_clean[, .SD, .SDcols = num_cols]
num_scaled <- scale(num_data)

# Clustering ----
# Elbow (within-cluster sum of squares) diagnostic for choosing k.
# k-means starts from random centres, so seed the RNG *before* the diagnostic
# as well; otherwise the elbow curve changes from run to run. `nstart` is
# forwarded to kmeans() so the diagnostic uses the same multi-start setting as
# the final fit below.
set.seed(386)
fviz_nbclust(num_scaled, kmeans, method = "wss", nstart = 25)

# Re-seed immediately before the final fit so the segment assignment does not
# depend on how many random numbers the diagnostic above consumed.
set.seed(386)
k <- 6
km <- kmeans(num_scaled, centers = k, nstart = 25)

# Attach the cluster label to each customer as a factor column (by reference).
dt_clean[, segment := factor(km$cluster)]

# Visualization ----
# `num_scaled` is already centred and scaled, so PCA is run on it as-is.
pca_out <- prcomp(num_scaled, center = FALSE, scale. = FALSE)

# Scores on the first two principal components, tagged with the k-means segment.
pca_df <- as.data.frame(pca_out$x[, 1:2])
names(pca_df) <- c("PC1", "PC2")
pca_df$segment <- dt_clean$segment

# The cluster count in the labels is taken from `k` rather than hard-coded, so
# the plots cannot drift out of sync when `k` is changed.
ggplot(pca_df, aes(PC1, PC2, color = segment)) +
  geom_point(alpha = 0.7, size = 2) +
  labs(
    title = "PCA of Mall Customers",
    subtitle = paste0("k-means segments (n=", k, ")")
  ) +
  theme_minimal()

# factoextra's cluster plot of the same fit; `stand = FALSE` because the data
# passed in has already been standardised.
fviz_cluster(km, data = num_scaled,
             geom = "point",
             stand = FALSE,
             ellipse = TRUE,
             show.clust.cent = TRUE) +
  labs(title = paste0("Mall Customers Cluster Plot (k=", k, ")")) +
  theme_minimal()

# PCA loadings ----
# Turn the rotation matrix into a table whose first column names the original
# variable and whose remaining columns are the per-component loadings.
rotation <- pca_out$rotation
loadings <- data.frame(
  Variable = rownames(rotation),
  rotation,
  row.names = NULL,        # plain 1..n row names, so kable prints no row labels
  check.names = FALSE,
  stringsAsFactors = FALSE
)
knitr::kable(loadings, caption = "PCA Loadings for Each Variable and Principal Component")

PCA Loadings for Each Variable and Principal Component
Variable	PC1	PC2	PC3
Age	0.7063823	0.0301412	-0.7071884
Annual Income (k$)	-0.0480240	0.9988316	-0.0053979
Spending Score (1-100)	-0.7061995	-0.0377750	-0.7070045

Market Segmentation

Alicia Lanz

2025-07-03