Initial Variables:

ID: Unique Identification for customer Sex: Sex of customer (0 = Male, 1 = Female) Marital.status: Indication of whether customer is Single or Non-Single (divorce/separated/married/widowed) Age: Age of customer Education: Education Level (0=Other/Unknown,1=High School,2=University,3=Graduate School) Income: Self Reported Annual Income in US Dollars Occupation: Occupation Tier Level (0=Unemployed/Unskilled,1=Skilled Employee,2=Management/Self Employed/ Highly Qualified Employee) Settlement.size: Size of the city that the customer lives in (0=Small,1=Mid-sized,2=Large)

library(tidyverse)
library(skimr)
library(stats)

library(factoextra)
mall <- read.csv("mall_data.csv") %>% select(-ID)

mall$Sex <- factor(mall$Sex, levels=c(0,1))
mall$Marital.status <- factor(mall$Marital.status,levels=c(0,1))
mall$Education <- factor(mall$Education,levels=c(0,1,2,3))
mall$Occupation <- factor(mall$Occupation,levels=c(0,1,2))
mall$Settlement.size <- factor(mall$Settlement.size,levels=c(0,1,2))


AgeOldMin <- min(mall$Age)
AgeOldMax <- max(mall$Age)

IncomeOldMin <- min(mall$Income)
IncomeOldMax <- max(mall$Income)
skim(mall)
Data summary
Name mall
Number of rows 2000
Number of columns 7
_______________________
Column type frequency:
factor 5
numeric 2
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
Sex 0 1 FALSE 2 0: 1086, 1: 914
Marital.status 0 1 FALSE 2 0: 1007, 1: 993
Education 0 1 FALSE 4 1: 1386, 2: 291, 0: 287, 3: 36
Occupation 0 1 FALSE 3 1: 1113, 0: 633, 2: 254
Settlement.size 0 1 FALSE 3 0: 989, 1: 544, 2: 467

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Age 0 1 35.91 11.72 18 27.00 33.0 42.0 76 ▇▇▃▂▁
Income 0 1 120954.42 38108.82 35832 97663.25 115548.5 138072.2 309364 ▂▇▂▁▁
mall %>%
  purrr::keep(is.numeric) %>% 
  gather() %>% 
  ggplot(aes(value)) +
    facet_wrap(~ key, scales = "free") +
    geom_histogram(fill = "darkorange") + ggthemes::theme_fivethirtyeight()

Categorizing Age due to strong right skewness and high variability

mall <- mall %>%
  mutate(age_category = cut(mall$Age, breaks = c(18, 35, 55, Inf),
          labels = c("Young Adult", "Middle-Aged Adult", "Senior Adult"), 
          include.lowest = TRUE))
ggplot(mall, aes(x=age_category, y=Income, fill=age_category)) + geom_boxplot()

ggplot(mall, aes(x=Settlement.size, fill=Settlement.size)) + geom_bar()

SexVector <- mall$Sex
mall <- mall %>% select(-age_category, -Sex)
Normalizing the dataset
normalizer <- function(x){
  return ((x-min(x))/(max(x)-min(x)))
}

#mall$Sex <- as.double(mall$Sex)
mall$Marital.status <- as.double(mall$Marital.status)
mall$Age <- as.double(mall$Age)
mall$Education <- as.double(mall$Education)
mall$Income <- as.double(mall$Income)
mall$Occupation <- as.double(mall$Occupation)
mall$Settlement.size <- as.double(mall$Settlement.size)

mall <- as.data.frame(lapply(mall, normalizer))
set.seed(0)

mallCluster <- kmeans(mall, center=3, nstart=20)

mallCluster
## K-means clustering with 3 clusters of sizes 993, 532, 475
## 
## Cluster means:
##   Marital.status       Age Education    Income Occupation Settlement.size
## 1              1 0.2654096 0.4212823 0.3008839  0.3957704       0.3298087
## 2              0 0.3765880 0.2763158 0.3985590  0.6334586       0.7556391
## 3              0 0.3234846 0.2666667 0.2349125  0.1694737       0.0200000
## 
## Clustering vector:
##    [1] 2 1 3 2 2 3 2 2 1 1 1 1 1 3 1 1 3 2 1 2 2 2 3 2 1 3 1 1 2 3 1 2 2 3 1 3 2
##   [38] 2 2 2 2 1 3 1 1 1 3 1 3 1 3 2 2 3 2 3 2 1 1 1 1 2 2 2 1 2 1 1 2 1 1 3 2 1
##   [75] 2 2 2 2 3 1 1 2 1 1 3 1 1 2 2 2 2 2 1 2 2 2 1 2 2 2 1 3 1 2 1 2 2 2 1 1 1
##  [112] 1 2 1 1 2 1 1 1 1 1 1 3 3 1 2 1 2 2 1 1 2 3 2 1 1 2 3 2 1 1 1 2 2 1 2 1 1
##  [149] 2 2 3 2 2 1 2 1 2 1 2 3 2 1 2 2 2 2 1 1 1 1 3 1 1 3 2 1 1 2 2 3 1 1 3 2 2
##  [186] 2 1 3 1 1 2 3 2 1 2 1 1 1 2 1 3 1 2 1 3 2 3 2 1 2 2 1 2 2 3 1 2 2 1 1 1 1
##  [223] 2 2 2 2 2 2 2 2 2 3 3 1 3 3 2 1 1 2 1 2 2 1 1 2 2 3 2 1 1 1 1 2 2 3 3 1 1
##  [260] 1 2 1 2 1 3 3 3 1 1 2 2 1 2 2 1 2 1 1 1 2 2 2 3 2 1 1 2 2 1 1 1 1 1 2 1 1
##  [297] 1 3 2 2 1 1 3 3 3 1 1 1 1 1 3 3 1 1 3 2 3 1 2 1 1 1 2 3 1 3 2 1 2 2 2 1 1
##  [334] 1 2 1 1 1 2 1 1 1 1 2 3 1 2 1 3 2 1 1 2 2 2 3 2 2 1 1 2 2 1 1 3 3 2 1 2 1
##  [371] 2 2 1 2 1 1 1 2 2 1 2 1 1 2 3 1 1 2 2 1 1 1 2 1 2 2 1 1 1 1 1 1 2 3 3 1 2
##  [408] 1 1 1 1 2 2 3 1 2 3 1 1 1 1 2 3 2 1 1 3 2 2 1 3 2 2 1 2 1 1 1 3 1 2 1 2 3
##  [445] 1 1 1 3 1 1 2 1 2 2 2 1 2 3 1 2 2 2 1 2 2 2 1 2 1 3 1 1 3 2 1 1 3 2 3 2 1
##  [482] 3 1 1 2 2 2 1 3 2 2 1 1 3 3 1 2 3 1 1 1 2 1 3 1 3 2 2 1 2 2 2 2 1 3 1 3 1
##  [519] 2 2 2 1 2 1 1 2 1 3 1 3 3 1 1 1 2 1 1 1 2 1 1 3 3 3 3 2 1 2 1 2 1 3 1 2 1
##  [556] 3 1 1 3 3 2 3 1 2 2 1 1 2 3 1 1 3 1 1 3 1 1 1 1 1 2 3 1 3 1 1 3 3 1 2 1 1
##  [593] 1 1 2 1 1 3 3 2 1 1 1 1 1 2 2 1 2 3 1 1 1 1 1 2 2 3 1 1 2 2 1 1 2 2 2 2 2
##  [630] 3 1 2 2 1 1 3 1 2 1 3 1 3 2 1 1 2 1 1 2 3 2 3 2 2 3 2 3 2 1 2 1 1 2 2 1 2
##  [667] 2 1 3 3 3 2 2 1 3 1 2 1 3 1 1 2 1 1 3 2 2 2 2 2 1 1 2 3 2 3 3 1 2 2 1 3 1
##  [704] 2 1 1 2 1 1 3 2 1 2 3 2 2 2 2 2 1 2 1 1 1 1 1 1 1 1 1 2 3 1 1 1 1 1 3 2 1
##  [741] 2 3 2 2 2 3 1 1 3 2 1 1 1 1 2 1 3 2 3 2 1 1 3 2 1 1 1 1 2 3 1 1 2 2 3 1 2
##  [778] 1 2 1 2 2 3 1 1 3 3 2 2 2 1 2 1 1 2 1 2 1 1 3 2 1 1 3 1 2 1 2 2 1 1 1 2 2
##  [815] 2 1 1 1 1 1 1 1 3 3 1 1 1 2 2 2 2 1 2 1 1 2 1 1 3 1 3 3 1 3 2 3 2 1 3 3 1
##  [852] 3 3 3 2 1 1 3 1 2 2 1 1 3 3 1 1 2 2 1 1 2 2 1 2 1 2 2 2 3 2 1 2 1 1 1 2 2
##  [889] 2 3 2 2 3 2 2 2 1 1 1 2 2 1 2 1 2 2 3 1 1 1 1 1 1 1 3 1 3 2 1 2 1 2 1 2 1
##  [926] 2 1 2 3 3 3 1 2 2 1 2 1 1 2 2 2 3 2 3 1 1 2 2 1 2 1 2 1 1 1 1 3 3 3 2 1 1
##  [963] 2 3 1 1 3 1 1 3 2 3 1 2 1 1 1 2 3 1 2 2 1 2 3 1 1 1 2 3 3 3 1 2 2 1 1 2 3
## [1000] 2 1 2 1 1 1 1 2 1 1 1 2 1 1 1 1 1 2 2 1 2 1 1 2 1 1 1 1 2 1 2 2 1 2 2 1 1
## [1037] 2 2 1 1 1 1 1 1 2 1 1 1 1 1 1 2 1 2 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 2 1 1 2
## [1074] 1 1 1 1 2 1 1 1 2 1 1 1 1 1 2 2 1 2 1 1 2 2 1 2 2 1 1 2 1 2 1 2 2 2 2 1 2
## [1111] 1 2 2 1 1 2 2 2 1 2 2 2 2 2 1 2 2 1 2 2 1 3 2 2 2 1 2 2 2 2 1 1 2 2 2 2 2
## [1148] 1 1 2 1 2 2 1 1 1 2 2 2 1 1 2 2 2 1 2 2 2 1 2 2 2 3 1 2 2 2 1 2 3 3 2 2 1
## [1185] 2 3 2 3 2 3 2 1 2 3 1 3 2 2 2 2 1 2 2 2 1 2 2 2 3 2 1 2 1 1 2 2 2 2 2 2 2
## [1222] 1 3 2 2 1 2 2 2 2 1 1 2 1 2 1 1 2 2 1 1 2 1 2 2 1 3 3 1 2 1 2 2 1 2 2 2 3
## [1259] 1 2 2 1 1 2 2 1 2 2 1 2 2 1 2 1 2 2 3 1 2 1 3 2 2 2 1 1 2 1 2 2 2 2 2 2 3
## [1296] 2 2 2 2 2 3 3 1 3 3 1 1 1 1 1 1 1 3 1 1 1 1 1 1 3 1 3 1 1 1 1 1 1 3 2 1 1
## [1333] 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 3 1 2 3 1 1 1 1 1 3 1 1 1 1 1 1 1 3 1 2 1 3
## [1370] 3 1 1 1 1 1 1 1 3 1 3 1 3 1 1 1 3 1 3 1 1 1 1 1 1 1 1 3 1 1 1 1 1 3 1 1 1
## [1407] 3 1 1 3 1 3 1 1 1 2 1 3 1 1 1 1 1 1 1 1 2 1 3 1 1 1 3 3 1 3 1 2 1 1 1 1 1
## [1444] 1 1 3 1 3 3 3 1 2 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 2 3 1 1 1 1 3 1 1 1 1 1 3
## [1481] 3 1 3 3 1 1 3 1 1 1 2 3 1 1 3 1 1 1 3 1 3 1 3 1 1 1 1 3 2 1 1 3 1 1 1 3 3
## [1518] 1 1 1 1 1 1 1 1 1 3 1 2 3 1 1 1 1 1 3 1 1 1 1 1 1 1 3 1 2 1 3 3 1 1 1 1 1
## [1555] 1 1 3 1 3 1 3 1 1 1 3 1 3 1 1 1 1 1 1 1 1 3 1 1 1 1 1 3 1 1 1 3 1 1 3 1 3
## [1592] 1 1 1 2 1 3 1 1 1 1 3 3 1 1 1 1 3 3 1 1 1 1 3 3 3 1 1 3 1 3 1 1 3 3 3 1 3
## [1629] 3 3 3 1 1 1 3 1 3 3 3 1 1 3 3 3 3 1 3 3 1 1 1 3 3 3 1 3 3 1 1 1 3 1 3 3 3
## [1666] 1 1 1 1 1 1 1 3 3 1 3 3 1 1 3 3 1 3 1 3 3 3 3 1 1 1 3 3 1 3 3 1 3 1 1 3 3
## [1703] 3 1 3 3 3 1 1 1 3 1 1 3 3 1 1 3 3 1 1 1 3 3 3 3 3 3 1 3 1 1 1 3 3 1 3 1 1
## [1740] 1 3 3 3 3 3 1 3 1 3 3 1 1 3 3 1 1 3 3 3 1 3 1 3 1 3 3 1 3 3 1 1 3 3 1 3 3
## [1777] 1 3 1 3 1 1 1 3 3 3 1 3 1 3 3 3 3 1 1 1 1 3 1 1 3 3 1 1 1 3 3 1 1 3 1 3 3
## [1814] 3 3 3 3 1 3 1 3 1 1 1 1 1 3 1 1 3 1 3 1 1 1 1 3 1 3 3 1 3 1 1 3 1 3 1 1 1
## [1851] 1 1 3 3 1 1 1 1 3 3 1 1 1 1 3 3 3 1 1 3 1 3 1 1 3 3 3 1 3 3 3 3 1 1 1 3 1
## [1888] 3 3 3 1 1 3 3 3 3 1 3 3 1 1 1 3 3 3 1 3 3 1 1 1 3 1 3 3 3 1 1 1 1 1 1 1 3
## [1925] 3 1 3 3 1 1 3 3 1 3 1 3 3 3 3 1 1 1 3 3 1 3 3 1 3 1 1 3 3 3 1 3 3 3 1 1 1
## [1962] 3 1 1 3 3 1 1 3 3 1 1 1 3 3 3 3 3 3 1 3 1 1 1 3 3 1 3 1 1 1 3 3 3 3 3 1 3
## [1999] 1 3
## 
## Within cluster sum of squares by cluster:
## [1] 328.97426 120.49547  68.92505
##  (between_SS / total_SS =  58.0 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Finding best ‘k’ for K-means clustering

clustering_models <- 1:15 %>% map(function(x){kmeans(mall, x)})

clustering_models %>% 
  map_dbl("tot.withinss") %>% 
  as_data_frame() %>% 
  rename(total_ss = value) %>% 
  mutate(k = 1:15) %>%
  ggplot(aes(k, total_ss)) + 
  geom_point() + 
  geom_line(color = "red") + 
  ggtitle("Total Sum of Within-Cluster Squared Error")

fviz_nbclust(mall, kmeans, method = "wss")

Choosing 8 as number of clusters, reverting the dataset to original values, and appending the computed clusters.
  • Reversed values mathematically so as to save space in the repository.
mallCluster <- kmeans(mall, center=4, nstart=20)

mall$Age = (mall$Age * (AgeOldMax - AgeOldMin)) + AgeOldMin
mall$Income = (mall$Income * (IncomeOldMax - IncomeOldMin)) + IncomeOldMin

#mall$Sex <- factor(mall$Sex, levels=c(0,1))
mall$Marital.status <- factor(mall$Marital.status, levels=c(0,1),labels=c("Single", "Non-Single"))

mall$Education <- as.character(mall$Education) %>% as.factor()
mall$Education <- factor(mall$Education,levels=c('0','0.333333333333333','0.666666666666667','1'),labels=c("Other", "HighSchool", "Undergraduate","Graduate"))

mall$Occupation <- factor(mall$Occupation, levels=c(0,0.5,1), labels=c("Unskilled", "Skilled", "Managuerial"))

mall$Settlement.size <- factor(mall$Settlement.size, levels=c(0.0,0.5,1.0), labels=c("Small","Mid-Size","Large"))




# Appending clusters to data
mall$Clusters <- as.factor(mallCluster$cluster)

# Appending `Sex` variable back to data
mall$Sex <- SexVector

table(mall$Clusters)
## 
##   1   2   3   4 
## 534 459 475 532

Exploratory Data Analysis on Clusters

ggplot(mall, aes(x=Clusters, fill=Occupation)) + geom_bar(position = "dodge")

ggplot(mall, aes(x=Clusters, fill=Sex)) + geom_bar(position = "dodge") +
  scale_fill_manual(values=c("#619CFF","#F8766D"))

ggplot(mall, aes(x=Clusters, fill=Education)) + geom_bar(position = "dodge")

ggplot(mall, aes(x=Clusters, fill=Settlement.size)) + geom_bar(position = "dodge")

CHECK IF ADDING LEVELSS TO THE VARIABLES DOES ANYTHING TO CHANGE HOW THE ELBOW PLOT SHOWS UP.

Cluster 1:

Cluster 2:

Cluster 3:

Cluster 4: