Customer Segmentation

library(tidyverse)
library(ggthemr)
library(miscset)
library(recipes)
library(gridExtra)

# Load the mall customer records from the CSV file into a tibble.
customerdata <- read_csv(file = "~/Documents/Mall_Customers.csv")

## Parsed with column specification:
## cols(
##   CustomerID = col_double(),
##   Gender = col_character(),
##   Age = col_double(),
##   `Annual Income (k$)` = col_double(),
##   `Spending Score (1-100)` = col_double()
## )

# Give the two back-ticked CSV headers syntactic names, then convert every
# character column to a factor.
# The former `mutate_if(is.double, as.numeric)` step is dropped: double columns
# are already numeric, so it changed nothing. `across(where(...))` replaces the
# superseded `mutate_if()`.
customerdata <- customerdata %>%
  rename(
    Annual_Income = `Annual Income (k$)`,
    Spending_Score = `Spending Score (1-100)`
  ) %>%
  mutate(across(where(is.character), as.factor))

# Inspect the resulting column types.
str(customerdata)

## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 200 obs. of  5 variables:
##  $ CustomerID    : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Gender        : Factor w/ 2 levels "Female","Male": 2 2 1 1 1 1 1 1 2 1 ...
##  $ Age           : num  19 21 20 23 31 22 35 23 64 30 ...
##  $ Annual_Income : num  15 15 16 16 17 17 18 18 19 19 ...
##  $ Spending_Score: num  39 81 6 77 40 76 6 94 3 72 ...

# Activate the ggthemr "flat" palette (type "outer") for the plots that follow.
ggthemr(palette = "flat", type = "outer")

## Warning: New theme missing the following elements: axis.ticks.length.x,
## axis.ticks.length.x.top, axis.ticks.length.x.bottom, axis.ticks.length.y,
## axis.ticks.length.y.left, axis.ticks.length.y.right

# Draw one density plot each for Age, Annual_Income and Spending_Score, filled
# by Gender, and arrange them in a single column.
# `geom_density()` has no `bins` argument (ggplot2 warned that it was ignored),
# so it is omitted. The deprecated `aes_string()` is replaced by the `.data`
# pronoun, with the x label set explicitly to the column name.
ggplotGrid(
  lapply(
    c("Age", "Annual_Income", "Spending_Score"),
    function(col) {
      ggplot(customerdata, aes(x = .data[[col]], fill = Gender)) +
        geom_density(alpha = 0.7) +
        labs(x = col)
    }
  ),
  ncol = 1
)

## Warning: Ignoring unknown parameters: bins

## Warning: Ignoring unknown parameters: bins

## Warning: Ignoring unknown parameters: bins

# Switch the ggthemr palette to "pale" for the plots that follow.
ggthemr(palette = "pale")

## Warning: New theme missing the following elements: axis.ticks.length.x,
## axis.ticks.length.x.top, axis.ticks.length.x.bottom, axis.ticks.length.y,
## axis.ticks.length.y.left, axis.ticks.length.y.right

# Bar chart of the number of customers per gender.
# `geom_bar()` counts the rows for each x value itself, so the former
# `group_by(Gender)` had no effect on the plot and is removed.
gender_count <- ggplot(customerdata, aes(x = Gender, fill = Gender)) +
  geom_bar(alpha = 0.7) +
  theme_bw()

gender_count

# Bars of Spending_Score and Annual_Income by gender.
# Every customer row is drawn at its own value and stacked, so each bar's
# height is the TOTAL of the variable over that gender's customers, not a mean.
# The former `select()` / `group_by()` steps did not affect the plots and are
# removed; `geom_col()` is the direct form of `geom_bar(stat = "identity")`.
gender_spending <- ggplot(
  customerdata,
  aes(x = Gender, y = Spending_Score, fill = Gender)
) +
  geom_col(alpha = 0.7)

gender_income <- ggplot(
  customerdata,
  aes(x = Gender, y = Annual_Income, fill = Gender)
) +
  geom_col(alpha = 0.7)

# Show the two plots side by side in one row.
grid.arrange(gender_spending, gender_income, ncol = 2)

# Preprocessing recipe trained on customerdata: drop the CustomerID column and
# one-hot encode Gender into dummy columns whose names are joined with "_".
data_prep <- recipe(~ ., data = customerdata) %>%
  step_rm(CustomerID) %>%
  step_dummy(
    Gender,
    one_hot = TRUE,
    naming = partial(dummy_names, sep = "_")
  ) %>%
  prep()

# Apply the trained recipe to the same data.
data_prep_done <- recipes::bake(data_prep, new_data = customerdata)

# Pairwise correlations between all columns of the baked data.
cor(data_prep_done)

##                        Age Annual_Income Spending_Score Gender_Female
## Age             1.00000000  -0.012398043   -0.327226846   -0.06086739
## Annual_Income  -0.01239804   1.000000000    0.009902848   -0.05640981
## Spending_Score -0.32722685   0.009902848    1.000000000    0.05810874
## Gender_Female  -0.06086739  -0.056409810    0.058108739    1.00000000
## Gender_Male     0.06086739   0.056409810   -0.058108739   -1.00000000
##                Gender_Male
## Age             0.06086739
## Annual_Income   0.05640981
## Spending_Score -0.05810874
## Gender_Female  -1.00000000
## Gender_Male     1.00000000

# Preview the first six rows of the cleaned data.
head(customerdata, n = 6)

## # A tibble: 6 x 5
##   CustomerID Gender   Age Annual_Income Spending_Score
##        <dbl> <fct>  <dbl>         <dbl>          <dbl>
## 1          1 Male      19            15             39
## 2          2 Male      21            15             81
## 3          3 Female    20            16              6
## 4          4 Female    23            16             77
## 5          5 Female    31            17             40
## 6          6 Female    22            17             76

# Cluster on Age and Spending_Score only. The columns are picked by name so the
# selection does not silently change if the column order does.
k_variable <- customerdata[, c("Age", "Spending_Score")]

# Total within-cluster sum of squares for k = 1..10.
# k-means starts from random centres, so the seed is fixed and several restarts
# are used per k to make the elbow curve reproducible between runs.
set.seed(123)
tot.withinss <- vapply(
  seq_len(10),
  function(k) kmeans(k_variable, centers = k, nstart = 25)$tot.withinss,
  numeric(1)
)

# Elbow plot. The k values are stored in the plotted data frame instead of
# being generated inside aes(). The previous
# `theme(axis.title.x.bottom = element_blank())` hid the x-axis title set by
# xlab(), so it is removed.
ggplot(
  data.frame(k = seq_along(tot.withinss), tot.withinss = tot.withinss),
  aes(x = k, y = tot.withinss)
) +
  geom_point() +
  geom_line() +
  scale_x_continuous(breaks = seq_along(tot.withinss)) +
  ylab("Within-cluster Sum of Squares") +
  xlab("Number of Clusters") +
  ggtitle("Elbow K Estimation")

# Final 4-cluster k-means fit on Age and Spending_Score.
# The seed is fixed and 25 random starts are tried, so the fit (and therefore
# the cluster numbering used in the write-up) is the same on every run.
set.seed(123)
customerClusters <- kmeans(k_variable, centers = 4, nstart = 25)
customerClusters

## K-means clustering with 4 clusters of sizes 57, 48, 48, 47
## 
## Cluster means:
##        Age Spending_Score
## 1 30.17544       82.35088
## 2 55.70833       48.22917
## 3 43.29167       15.02083
## 4 27.61702       49.14894
## 
## Clustering vector:
##   [1] 4 1 3 1 4 1 3 1 3 1 3 1 3 1 3 1 4 4 3 1 4 1 3 1 3 1 3 4 3 1 3 1 3 1 3 1 3
##  [38] 1 3 1 2 1 2 4 3 4 2 4 4 4 2 4 4 2 2 2 2 2 4 2 2 4 2 2 2 4 2 2 4 4 2 2 2 2
##  [75] 2 4 2 4 4 2 2 4 2 2 4 2 2 4 4 2 2 4 2 4 4 4 2 4 2 4 4 2 2 4 2 4 2 2 2 2 2
## [112] 4 4 4 4 4 2 2 2 2 4 4 4 1 4 1 2 1 3 1 3 1 4 1 3 1 3 1 3 1 3 1 4 1 3 1 2 1
## [149] 3 1 3 1 3 1 3 1 3 1 3 1 2 1 3 1 3 1 3 1 3 4 3 1 3 1 3 1 3 1 3 1 3 1 3 1 4
## [186] 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1
## 
## Within cluster sum of squares by cluster:
## [1]  6165.228  5694.396 10102.896  6203.064
##  (between_SS / total_SS =  83.6 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

# Re-activate the ggthemr "flat" palette (type "outer") for the cluster plot.
ggthemr(palette = "flat", type = "outer")

## Warning: New theme missing the following elements: axis.ticks.length.x,
## axis.ticks.length.x.top, axis.ticks.length.x.bottom, axis.ticks.length.y,
## axis.ticks.length.y.left, axis.ticks.length.y.right

# Scatter plot of Age against Spending_Score, coloured by k-means cluster.
# The cluster assignment is added to the plotted data as a factor so the colour
# aesthetic no longer reaches into an external vector, and the legend labels
# are derived from the cluster ids instead of being hard-coded for four
# clusters. Also fixes the "Segmens" typo in the title and drops the no-op
# min()/max() wrappers around the axis-break constants.
ggplot(
  mutate(k_variable, cluster = factor(customerClusters$cluster)),
  aes(x = Age, y = Spending_Score, color = cluster)
) +
  geom_point(size = 3) +
  scale_x_continuous(breaks = seq(15, 70, by = 5)) +
  scale_color_discrete(
    name = " ",
    labels = function(id) paste("Cluster", id)
  ) +
  theme_bw() +
  ggtitle("Mall Customer Segments", subtitle = "K-means Clustering")

Summary

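Cluster 1: Young customers with a quite high spending score.
Cluster 2: Older customers with a quite high spending score.
Cluster 3: Low spending score; a few of these customers are young, but most are older.
Cluster 4: Very high spending score; most of these customers are still young, aged between 18 and 40.
Note: k-means numbers its clusters arbitrarily, so these numbers may not match the cluster numbers in the output above; match each description to a cluster by its centre (mean Age and Spending_Score).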

Customer Segmentation

Suardi Sulaiman

Summary