library(tidyverse)
library(ggthemr)
library(miscset)
library(recipes)
library(gridExtra)
# Load the mall customer data set.
# NOTE(review): the path is machine-specific; adjust it to wherever the CSV
# is stored.
customerdata <- read_csv("~/Documents/Mall_Customers.csv")
## Parsed with column specification:
## cols(
## CustomerID = col_double(),
## Gender = col_character(),
## Age = col_double(),
## `Annual Income (k$)` = col_double(),
## `Spending Score (1-100)` = col_double()
## )
# Give the two non-syntactic columns plain names and turn every character
# column into a factor. Double columns are already numeric, so they need no
# conversion.
customerdata <- customerdata %>%
  rename(
    Annual_Income = `Annual Income (k$)`,
    Spending_Score = `Spending Score (1-100)`
  ) %>%
  mutate(across(where(is.character), as.factor))
# Inspect the resulting column types.
str(customerdata)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 200 obs. of 5 variables:
## $ CustomerID : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 2 1 1 1 1 1 1 2 1 ...
## $ Age : num 19 21 20 23 31 22 35 23 64 30 ...
## $ Annual_Income : num 15 15 16 16 17 17 18 18 19 19 ...
## $ Spending_Score: num 39 81 6 77 40 76 6 94 3 72 ...
# Use the "flat" ggthemr palette for the plots that follow.
ggthemr("flat", type = "outer")
## Warning: New theme missing the following elements: axis.ticks.length.x,
## axis.ticks.length.x.top, axis.ticks.length.x.bottom, axis.ticks.length.y,
## axis.ticks.length.y.left, axis.ticks.length.y.right
# One density plot per numeric variable, filled by gender and stacked in a
# single column. geom_density() has no `bins` parameter (it only produced an
# "unknown parameters" warning), so it is not passed here.
ggplotGrid(
  lapply(
    c("Age", "Annual_Income", "Spending_Score"),
    function(col) {
      # `.data[[col]]` looks the column up by its string name; the x label is
      # set explicitly so the axis still shows the column name.
      ggplot(customerdata, aes(x = .data[[col]])) +
        geom_density(aes(fill = Gender), alpha = 0.7) +
        labs(x = col)
    }
  ),
  ncol = 1
)

# Switch to the "pale" ggthemr palette.
ggthemr("pale")
## Warning: New theme missing the following elements: axis.ticks.length.x,
## axis.ticks.length.x.top, axis.ticks.length.x.bottom, axis.ticks.length.y,
## axis.ticks.length.y.left, axis.ticks.length.y.right
# Bar chart of the number of customers per gender. geom_bar() counts the rows
# for each x value itself, so the data does not need to be grouped first.
gender_count <- ggplot(customerdata, aes(x = Gender, fill = Gender)) +
  geom_bar(alpha = 0.7) +
  theme_bw()
gender_count

# Spending score and annual income by gender, shown side by side.
# Every customer contributes one stacked segment, so the height of a bar is
# the SUM of the variable over all customers of that gender.
# NOTE(review): sums scale with group size; if the aim is to compare typical
# customers, a mean or a boxplot may be the better choice - confirm intent.
gender_spending <- ggplot(
  customerdata,
  aes(x = Gender, y = Spending_Score, fill = Gender)
) +
  geom_col(alpha = 0.7)
gender_income <- ggplot(
  customerdata,
  aes(x = Gender, y = Annual_Income, fill = Gender)
) +
  geom_col(alpha = 0.7)
# Place both charts in a single row.
grid.arrange(gender_spending, gender_income, ncol = 2)

# Preprocessing recipe trained on the full data set: drop the ID column and
# one-hot encode Gender into columns named Gender_<level>.
data_prep <- recipe(~ ., data = customerdata) %>%
  step_rm(CustomerID) %>%
  step_dummy(
    Gender,
    one_hot = TRUE,
    naming = partial(dummy_names, sep = "_")
  ) %>%
  prep()
# Apply the trained recipe to the same data.
data_prep_done <- recipes::bake(data_prep, new_data = customerdata)
# Pairwise correlations between all prepared columns.
cor(data_prep_done)
## Age Annual_Income Spending_Score Gender_Female
## Age 1.00000000 -0.012398043 -0.327226846 -0.06086739
## Annual_Income -0.01239804 1.000000000 0.009902848 -0.05640981
## Spending_Score -0.32722685 0.009902848 1.000000000 0.05810874
## Gender_Female -0.06086739 -0.056409810 0.058108739 1.00000000
## Gender_Male 0.06086739 0.056409810 -0.058108739 -1.00000000
## Gender_Male
## Age 0.06086739
## Annual_Income 0.05640981
## Spending_Score -0.05810874
## Gender_Female -1.00000000
## Gender_Male 1.00000000
head(customerdata)
## # A tibble: 6 x 5
## CustomerID Gender Age Annual_Income Spending_Score
## <dbl> <fct> <dbl> <dbl> <dbl>
## 1 1 Male 19 15 39
## 2 2 Male 21 15 81
## 3 3 Female 20 16 6
## 4 4 Female 23 16 77
## 5 5 Female 31 17 40
## 6 6 Female 22 17 76
# Cluster on age and spending score only. The columns are picked by name so
# the selection does not silently change if the column order does.
k_variable <- customerdata[, c("Age", "Spending_Score")]
# k-means starts from random centres. Fix the seed so the curve is
# reproducible, and use several random starts so each k reports a good
# solution rather than whatever a single start happened to find.
set.seed(123)
tot.withinss <- vapply(
  1:10,
  function(k) kmeans(k_variable, centers = k, nstart = 25)$tot.withinss,
  numeric(1)
)
# Elbow plot: total within-cluster sum of squares for k = 1..10. The x-axis
# title is no longer blanked by theme(), so the label set by xlab() is shown.
ggplot(
  data.frame(k = seq_along(tot.withinss), tot.withinss = tot.withinss),
  aes(x = k, y = tot.withinss)
) +
  geom_point() +
  geom_line() +
  scale_x_continuous(breaks = seq_along(tot.withinss)) +
  ylab("Within-cluster Sum of Squares") +
  xlab("Number of Clusters") +
  ggtitle("Elbow K Estimation")

# Final model with 4 clusters. The seed is fixed and several random starts are
# used so the cluster assignment and numbering are the same on every run;
# without this the labels 1-4 can be permuted between runs.
set.seed(123)
customerClusters <- kmeans(k_variable, centers = 4, nstart = 25)
customerClusters
## K-means clustering with 4 clusters of sizes 57, 48, 48, 47
##
## Cluster means:
## Age Spending_Score
## 1 30.17544 82.35088
## 2 55.70833 48.22917
## 3 43.29167 15.02083
## 4 27.61702 49.14894
##
## Clustering vector:
## [1] 4 1 3 1 4 1 3 1 3 1 3 1 3 1 3 1 4 4 3 1 4 1 3 1 3 1 3 4 3 1 3 1 3 1 3 1 3
## [38] 1 3 1 2 1 2 4 3 4 2 4 4 4 2 4 4 2 2 2 2 2 4 2 2 4 2 2 2 4 2 2 4 4 2 2 2 2
## [75] 2 4 2 4 4 2 2 4 2 2 4 2 2 4 4 2 2 4 2 4 4 4 2 4 2 4 4 2 2 4 2 4 2 2 2 2 2
## [112] 4 4 4 4 4 2 2 2 2 4 4 4 1 4 1 2 1 3 1 3 1 4 1 3 1 3 1 3 1 3 1 4 1 3 1 2 1
## [149] 3 1 3 1 3 1 3 1 3 1 3 1 2 1 3 1 3 1 3 1 3 4 3 1 3 1 3 1 3 1 3 1 3 1 3 1 4
## [186] 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1
##
## Within cluster sum of squares by cluster:
## [1] 6165.228 5694.396 10102.896 6203.064
## (between_SS / total_SS = 83.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Back to the "flat" ggthemr palette for the cluster plot.
ggthemr("flat", type="outer")
## Warning: New theme missing the following elements: axis.ticks.length.x,
## axis.ticks.length.x.top, axis.ticks.length.x.bottom, axis.ticks.length.y,
## axis.ticks.length.y.left, axis.ticks.length.y.right
# Scatter plot of age against spending score, coloured by k-means cluster.
# The cluster assignment is attached as a column so the aesthetic mapping only
# refers to columns of the plotted data.
ggplot(
  mutate(k_variable, cluster = factor(customerClusters$cluster)),
  aes(x = Age, y = Spending_Score, color = cluster)
) +
  geom_point(size = 3) +
  scale_x_continuous(breaks = seq(15, 70, by = 5)) +
  # Legend labels are derived from the cluster ids, so they stay correct for
  # any number of clusters.
  scale_color_discrete(name = " ", labels = function(x) paste("Cluster", x)) +
  theme_bw() +
  ggtitle("Mall Customer Segments", subtitle = "K-means Clustering")

Summary
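- Cluster 1: Young, with a fairly high spending score.
- Cluster 2: Older, with a fairly high spending score.
- Cluster 3: Low spending score; a few of these customers are young, but most are older.
- Cluster 4: Very high spending score; most of these customers are still young, between 18 and 40 years old.
- Note: k-means numbers its clusters arbitrarily, so the labels here may not match the numbering in the printed output above, where the young, very-high-spending group appears as cluster 1.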