ID: Unique Identification for customer Sex: Sex of customer (0 = Male, 1 = Female) Marital.status: Indication of whether customer is Single or Non-Single (divorce/separated/married/widowed) Age: Age of customer Education: Education Level (0=Other/Unknown,1=High School,2=University,3=Graduate School) Income: Self Reported Annual Income in US Dollars Occupation: Occupation Tier Level (0=Unemployed/Unskilled,1=Skilled Employee,2=Management/Self Employed/ Highly Qualified Employee) Settlement.size: Size of the city that the customer lives in (0=Small,1=Mid-sized,2=Large)
library(tidyverse)
library(skimr)
library(stats)
library(factoextra)
mall <- read.csv("mall_data.csv") %>% select(-ID)
mall$Sex <- factor(mall$Sex, levels=c(0,1))
mall$Marital.status <- factor(mall$Marital.status,levels=c(0,1))
mall$Education <- factor(mall$Education,levels=c(0,1,2,3))
mall$Occupation <- factor(mall$Occupation,levels=c(0,1,2))
mall$Settlement.size <- factor(mall$Settlement.size,levels=c(0,1,2))
AgeOldMin <- min(mall$Age)
AgeOldMax <- max(mall$Age)
IncomeOldMin <- min(mall$Income)
IncomeOldMax <- max(mall$Income)
skim(mall)
| Name | mall |
| Number of rows | 2000 |
| Number of columns | 7 |
| _______________________ | |
| Column type frequency: | |
| factor | 5 |
| numeric | 2 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| Sex | 0 | 1 | FALSE | 2 | 0: 1086, 1: 914 |
| Marital.status | 0 | 1 | FALSE | 2 | 0: 1007, 1: 993 |
| Education | 0 | 1 | FALSE | 4 | 1: 1386, 2: 291, 0: 287, 3: 36 |
| Occupation | 0 | 1 | FALSE | 3 | 1: 1113, 0: 633, 2: 254 |
| Settlement.size | 0 | 1 | FALSE | 3 | 0: 989, 1: 544, 2: 467 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Age | 0 | 1 | 35.91 | 11.72 | 18 | 27.00 | 33.0 | 42.0 | 76 | ▇▇▃▂▁ |
| Income | 0 | 1 | 120954.42 | 38108.82 | 35832 | 97663.25 | 115548.5 | 138072.2 | 309364 | ▂▇▂▁▁ |
mall %>%
purrr::keep(is.numeric) %>%
gather() %>%
ggplot(aes(value)) +
facet_wrap(~ key, scales = "free") +
geom_histogram(fill = "darkorange") + ggthemes::theme_fivethirtyeight()
Categorizing Age due to strong right skewness and high
variability
mall <- mall %>%
mutate(age_category = cut(mall$Age, breaks = c(18, 35, 55, Inf),
labels = c("Young Adult", "Middle-Aged Adult", "Senior Adult"),
include.lowest = TRUE))
ggplot(mall, aes(x=age_category, y=Income, fill=age_category)) + geom_boxplot()
ggplot(mall, aes(x=Settlement.size, fill=Settlement.size)) + geom_bar()
age_category after having used it for
exploratory data analysis.sex variable as it’ll influence the results
substantially due to close distance (Euclidean).SexVector <- mall$Sex
mall <- mall %>% select(-age_category, -Sex)
normalizer <- function(x){
return ((x-min(x))/(max(x)-min(x)))
}
#mall$Sex <- as.double(mall$Sex)
mall$Marital.status <- as.double(mall$Marital.status)
mall$Age <- as.double(mall$Age)
mall$Education <- as.double(mall$Education)
mall$Income <- as.double(mall$Income)
mall$Occupation <- as.double(mall$Occupation)
mall$Settlement.size <- as.double(mall$Settlement.size)
mall <- as.data.frame(lapply(mall, normalizer))
set.seed(0)
mallCluster <- kmeans(mall, center=3, nstart=20)
mallCluster
## K-means clustering with 3 clusters of sizes 993, 532, 475
##
## Cluster means:
## Marital.status Age Education Income Occupation Settlement.size
## 1 1 0.2654096 0.4212823 0.3008839 0.3957704 0.3298087
## 2 0 0.3765880 0.2763158 0.3985590 0.6334586 0.7556391
## 3 0 0.3234846 0.2666667 0.2349125 0.1694737 0.0200000
##
## Clustering vector:
## [1] 2 1 3 2 2 3 2 2 1 1 1 1 1 3 1 1 3 2 1 2 2 2 3 2 1 3 1 1 2 3 1 2 2 3 1 3 2
## [38] 2 2 2 2 1 3 1 1 1 3 1 3 1 3 2 2 3 2 3 2 1 1 1 1 2 2 2 1 2 1 1 2 1 1 3 2 1
## [75] 2 2 2 2 3 1 1 2 1 1 3 1 1 2 2 2 2 2 1 2 2 2 1 2 2 2 1 3 1 2 1 2 2 2 1 1 1
## [112] 1 2 1 1 2 1 1 1 1 1 1 3 3 1 2 1 2 2 1 1 2 3 2 1 1 2 3 2 1 1 1 2 2 1 2 1 1
## [149] 2 2 3 2 2 1 2 1 2 1 2 3 2 1 2 2 2 2 1 1 1 1 3 1 1 3 2 1 1 2 2 3 1 1 3 2 2
## [186] 2 1 3 1 1 2 3 2 1 2 1 1 1 2 1 3 1 2 1 3 2 3 2 1 2 2 1 2 2 3 1 2 2 1 1 1 1
## [223] 2 2 2 2 2 2 2 2 2 3 3 1 3 3 2 1 1 2 1 2 2 1 1 2 2 3 2 1 1 1 1 2 2 3 3 1 1
## [260] 1 2 1 2 1 3 3 3 1 1 2 2 1 2 2 1 2 1 1 1 2 2 2 3 2 1 1 2 2 1 1 1 1 1 2 1 1
## [297] 1 3 2 2 1 1 3 3 3 1 1 1 1 1 3 3 1 1 3 2 3 1 2 1 1 1 2 3 1 3 2 1 2 2 2 1 1
## [334] 1 2 1 1 1 2 1 1 1 1 2 3 1 2 1 3 2 1 1 2 2 2 3 2 2 1 1 2 2 1 1 3 3 2 1 2 1
## [371] 2 2 1 2 1 1 1 2 2 1 2 1 1 2 3 1 1 2 2 1 1 1 2 1 2 2 1 1 1 1 1 1 2 3 3 1 2
## [408] 1 1 1 1 2 2 3 1 2 3 1 1 1 1 2 3 2 1 1 3 2 2 1 3 2 2 1 2 1 1 1 3 1 2 1 2 3
## [445] 1 1 1 3 1 1 2 1 2 2 2 1 2 3 1 2 2 2 1 2 2 2 1 2 1 3 1 1 3 2 1 1 3 2 3 2 1
## [482] 3 1 1 2 2 2 1 3 2 2 1 1 3 3 1 2 3 1 1 1 2 1 3 1 3 2 2 1 2 2 2 2 1 3 1 3 1
## [519] 2 2 2 1 2 1 1 2 1 3 1 3 3 1 1 1 2 1 1 1 2 1 1 3 3 3 3 2 1 2 1 2 1 3 1 2 1
## [556] 3 1 1 3 3 2 3 1 2 2 1 1 2 3 1 1 3 1 1 3 1 1 1 1 1 2 3 1 3 1 1 3 3 1 2 1 1
## [593] 1 1 2 1 1 3 3 2 1 1 1 1 1 2 2 1 2 3 1 1 1 1 1 2 2 3 1 1 2 2 1 1 2 2 2 2 2
## [630] 3 1 2 2 1 1 3 1 2 1 3 1 3 2 1 1 2 1 1 2 3 2 3 2 2 3 2 3 2 1 2 1 1 2 2 1 2
## [667] 2 1 3 3 3 2 2 1 3 1 2 1 3 1 1 2 1 1 3 2 2 2 2 2 1 1 2 3 2 3 3 1 2 2 1 3 1
## [704] 2 1 1 2 1 1 3 2 1 2 3 2 2 2 2 2 1 2 1 1 1 1 1 1 1 1 1 2 3 1 1 1 1 1 3 2 1
## [741] 2 3 2 2 2 3 1 1 3 2 1 1 1 1 2 1 3 2 3 2 1 1 3 2 1 1 1 1 2 3 1 1 2 2 3 1 2
## [778] 1 2 1 2 2 3 1 1 3 3 2 2 2 1 2 1 1 2 1 2 1 1 3 2 1 1 3 1 2 1 2 2 1 1 1 2 2
## [815] 2 1 1 1 1 1 1 1 3 3 1 1 1 2 2 2 2 1 2 1 1 2 1 1 3 1 3 3 1 3 2 3 2 1 3 3 1
## [852] 3 3 3 2 1 1 3 1 2 2 1 1 3 3 1 1 2 2 1 1 2 2 1 2 1 2 2 2 3 2 1 2 1 1 1 2 2
## [889] 2 3 2 2 3 2 2 2 1 1 1 2 2 1 2 1 2 2 3 1 1 1 1 1 1 1 3 1 3 2 1 2 1 2 1 2 1
## [926] 2 1 2 3 3 3 1 2 2 1 2 1 1 2 2 2 3 2 3 1 1 2 2 1 2 1 2 1 1 1 1 3 3 3 2 1 1
## [963] 2 3 1 1 3 1 1 3 2 3 1 2 1 1 1 2 3 1 2 2 1 2 3 1 1 1 2 3 3 3 1 2 2 1 1 2 3
## [1000] 2 1 2 1 1 1 1 2 1 1 1 2 1 1 1 1 1 2 2 1 2 1 1 2 1 1 1 1 2 1 2 2 1 2 2 1 1
## [1037] 2 2 1 1 1 1 1 1 2 1 1 1 1 1 1 2 1 2 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 2 1 1 2
## [1074] 1 1 1 1 2 1 1 1 2 1 1 1 1 1 2 2 1 2 1 1 2 2 1 2 2 1 1 2 1 2 1 2 2 2 2 1 2
## [1111] 1 2 2 1 1 2 2 2 1 2 2 2 2 2 1 2 2 1 2 2 1 3 2 2 2 1 2 2 2 2 1 1 2 2 2 2 2
## [1148] 1 1 2 1 2 2 1 1 1 2 2 2 1 1 2 2 2 1 2 2 2 1 2 2 2 3 1 2 2 2 1 2 3 3 2 2 1
## [1185] 2 3 2 3 2 3 2 1 2 3 1 3 2 2 2 2 1 2 2 2 1 2 2 2 3 2 1 2 1 1 2 2 2 2 2 2 2
## [1222] 1 3 2 2 1 2 2 2 2 1 1 2 1 2 1 1 2 2 1 1 2 1 2 2 1 3 3 1 2 1 2 2 1 2 2 2 3
## [1259] 1 2 2 1 1 2 2 1 2 2 1 2 2 1 2 1 2 2 3 1 2 1 3 2 2 2 1 1 2 1 2 2 2 2 2 2 3
## [1296] 2 2 2 2 2 3 3 1 3 3 1 1 1 1 1 1 1 3 1 1 1 1 1 1 3 1 3 1 1 1 1 1 1 3 2 1 1
## [1333] 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 3 1 2 3 1 1 1 1 1 3 1 1 1 1 1 1 1 3 1 2 1 3
## [1370] 3 1 1 1 1 1 1 1 3 1 3 1 3 1 1 1 3 1 3 1 1 1 1 1 1 1 1 3 1 1 1 1 1 3 1 1 1
## [1407] 3 1 1 3 1 3 1 1 1 2 1 3 1 1 1 1 1 1 1 1 2 1 3 1 1 1 3 3 1 3 1 2 1 1 1 1 1
## [1444] 1 1 3 1 3 3 3 1 2 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 2 3 1 1 1 1 3 1 1 1 1 1 3
## [1481] 3 1 3 3 1 1 3 1 1 1 2 3 1 1 3 1 1 1 3 1 3 1 3 1 1 1 1 3 2 1 1 3 1 1 1 3 3
## [1518] 1 1 1 1 1 1 1 1 1 3 1 2 3 1 1 1 1 1 3 1 1 1 1 1 1 1 3 1 2 1 3 3 1 1 1 1 1
## [1555] 1 1 3 1 3 1 3 1 1 1 3 1 3 1 1 1 1 1 1 1 1 3 1 1 1 1 1 3 1 1 1 3 1 1 3 1 3
## [1592] 1 1 1 2 1 3 1 1 1 1 3 3 1 1 1 1 3 3 1 1 1 1 3 3 3 1 1 3 1 3 1 1 3 3 3 1 3
## [1629] 3 3 3 1 1 1 3 1 3 3 3 1 1 3 3 3 3 1 3 3 1 1 1 3 3 3 1 3 3 1 1 1 3 1 3 3 3
## [1666] 1 1 1 1 1 1 1 3 3 1 3 3 1 1 3 3 1 3 1 3 3 3 3 1 1 1 3 3 1 3 3 1 3 1 1 3 3
## [1703] 3 1 3 3 3 1 1 1 3 1 1 3 3 1 1 3 3 1 1 1 3 3 3 3 3 3 1 3 1 1 1 3 3 1 3 1 1
## [1740] 1 3 3 3 3 3 1 3 1 3 3 1 1 3 3 1 1 3 3 3 1 3 1 3 1 3 3 1 3 3 1 1 3 3 1 3 3
## [1777] 1 3 1 3 1 1 1 3 3 3 1 3 1 3 3 3 3 1 1 1 1 3 1 1 3 3 1 1 1 3 3 1 1 3 1 3 3
## [1814] 3 3 3 3 1 3 1 3 1 1 1 1 1 3 1 1 3 1 3 1 1 1 1 3 1 3 3 1 3 1 1 3 1 3 1 1 1
## [1851] 1 1 3 3 1 1 1 1 3 3 1 1 1 1 3 3 3 1 1 3 1 3 1 1 3 3 3 1 3 3 3 3 1 1 1 3 1
## [1888] 3 3 3 1 1 3 3 3 3 1 3 3 1 1 1 3 3 3 1 3 3 1 1 1 3 1 3 3 3 1 1 1 1 1 1 1 3
## [1925] 3 1 3 3 1 1 3 3 1 3 1 3 3 3 3 1 1 1 3 3 1 3 3 1 3 1 1 3 3 3 1 3 3 3 1 1 1
## [1962] 3 1 1 3 3 1 1 3 3 1 1 1 3 3 3 3 3 3 1 3 1 1 1 3 3 1 3 1 1 1 3 3 3 3 3 1 3
## [1999] 1 3
##
## Within cluster sum of squares by cluster:
## [1] 328.97426 120.49547 68.92505
## (between_SS / total_SS = 58.0 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
clustering_models <- 1:15 %>% map(function(x){kmeans(mall, x)})
clustering_models %>%
map_dbl("tot.withinss") %>%
as_data_frame() %>%
rename(total_ss = value) %>%
mutate(k = 1:15) %>%
ggplot(aes(k, total_ss)) +
geom_point() +
geom_line(color = "red") +
ggtitle("Total Sum of Within-Cluster Squared Error")
fviz_nbclust(mall, kmeans, method = "wss")
mallCluster <- kmeans(mall, center=4, nstart=20)
mall$Age = (mall$Age * (AgeOldMax - AgeOldMin)) + AgeOldMin
mall$Income = (mall$Income * (IncomeOldMax - IncomeOldMin)) + IncomeOldMin
#mall$Sex <- factor(mall$Sex, levels=c(0,1))
mall$Marital.status <- factor(mall$Marital.status, levels=c(0,1),labels=c("Single", "Non-Single"))
mall$Education <- as.character(mall$Education) %>% as.factor()
mall$Education <- factor(mall$Education,levels=c('0','0.333333333333333','0.666666666666667','1'),labels=c("Other", "HighSchool", "Undergraduate","Graduate"))
mall$Occupation <- factor(mall$Occupation, levels=c(0,0.5,1), labels=c("Unskilled", "Skilled", "Managuerial"))
mall$Settlement.size <- factor(mall$Settlement.size, levels=c(0.0,0.5,1.0), labels=c("Small","Mid-Size","Large"))
# Appending clusters to data
mall$Clusters <- as.factor(mallCluster$cluster)
# Appending `Sex` variable back to data
mall$Sex <- SexVector
table(mall$Clusters)
##
## 1 2 3 4
## 534 459 475 532
ggplot(mall, aes(x=Clusters, fill=Occupation)) + geom_bar(position = "dodge")
ggplot(mall, aes(x=Clusters, fill=Sex)) + geom_bar(position = "dodge") +
scale_fill_manual(values=c("#619CFF","#F8766D"))
ggplot(mall, aes(x=Clusters, fill=Education)) + geom_bar(position = "dodge")
ggplot(mall, aes(x=Clusters, fill=Settlement.size)) + geom_bar(position = "dodge")
CHECK IF ADDING LEVELSS TO THE VARIABLES DOES ANYTHING TO CHANGE HOW THE ELBOW PLOT SHOWS UP.
Cluster 1:
Cluster 2:
Cluster 3:
Cluster 4: