Data Science in Marketing: Customer Segmentation

Read the data

# Load the tab-separated customer data set from its public URL.
pelanggan <- read.csv(
  file = "https://storage.googleapis.com/dqlab-dataset/customer_segments.txt",
  sep = "\t"
)
# Print only the gender, age, profession and residence-type columns.
pelanggan[, c("Jenis.Kelamin", "Umur", "Profesi", "Tipe.Residen")]

##    Jenis.Kelamin Umur          Profesi Tipe.Residen
## 1           Pria   58       Wiraswasta       Sector
## 2         Wanita   14          Pelajar      Cluster
## 3           Pria   48     Professional      Cluster
## 4           Pria   53     Professional      Cluster
## 5         Wanita   41       Wiraswasta      Cluster
## 6         Wanita   24     Professional      Cluster
## 7           Pria   64       Wiraswasta       Sector
## 8           Pria   52     Professional      Cluster
## 9         Wanita   29     Professional       Sector
## 10          Pria   33     Professional      Cluster
## 11        Wanita   50     Professional       Sector
## 12        Wanita   49     Professional       Sector
## 13        Wanita   64       Wiraswasta      Cluster
## 14          Pria   60       Wiraswasta      Cluster
## 15        Wanita   20       Wiraswasta      Cluster
## 16          Pria   35     Professional      Cluster
## 17        Wanita   32 Ibu Rumah Tangga      Cluster
## 18        Wanita   63 Ibu Rumah Tangga      Cluster
## 19        Wanita   32       Wiraswasta      Cluster
## 20        Wanita   16          Pelajar       Sector
## 21        Wanita   38       Wiraswasta      Cluster
## 22        Wanita   52     Professional      Cluster
## 23          Pria   34     Professional      Cluster
## 24        Wanita   39       Wiraswasta      Cluster
## 25        Wanita   29       Wiraswasta       Sector
## 26        Wanita   55     Professional      Cluster
## 27        Wanita   35       Wiraswasta      Cluster
## 28        Wanita   40 Ibu Rumah Tangga      Cluster
## 29        Wanita   56     Professional      Cluster
## 30        Wanita   46 Ibu Rumah Tangga       Sector
## 31        Wanita   19        Mahasiswa      Cluster
## 32        Wanita   47       Wiraswasta       Sector
## 33        Wanita   19        Mahasiswa      Cluster
## 34        Wanita   21       Wiraswasta       Sector
## 35        Wanita   39     Professional       Sector
## 36        Wanita   30       Wiraswasta      Cluster
## 37        Wanita   25     Professional       Sector
## 38        Wanita   46       Wiraswasta       Sector
## 39        Wanita   20     Professional      Cluster
## 40        Wanita   14          Pelajar       Sector
## 41        Wanita   24 Ibu Rumah Tangga      Cluster
## 42        Wanita   26       Wiraswasta      Cluster
## 43        Wanita   31     Professional      Cluster
## 44        Wanita   18       Wiraswasta      Cluster
## 45        Wanita   22     Professional      Cluster
## 46        Wanita   25       Wiraswasta       Sector
## 47        Wanita   55 Ibu Rumah Tangga      Cluster
## 48        Wanita   45       Wiraswasta       Sector
## 49        Wanita   33 Ibu Rumah Tangga       Sector
## 50        Wanita   55       Wiraswasta       Sector

Vector to Save Field Names

# Names of the columns to display, kept in a vector so they can be reused.
field_yang_digunakan <- c("Jenis.Kelamin", "Umur", "Profesi")
# Load the tab-separated customer data set into `pelanggan`.
pelanggan <- read.csv(
  file = "https://storage.googleapis.com/dqlab-dataset/customer_segments.txt",
  sep = "\t"
)
# Print only the columns listed in `field_yang_digunakan`.
pelanggan[, field_yang_digunakan]

##    Jenis.Kelamin Umur          Profesi
## 1           Pria   58       Wiraswasta
## 2         Wanita   14          Pelajar
## 3           Pria   48     Professional
## 4           Pria   53     Professional
## 5         Wanita   41       Wiraswasta
## 6         Wanita   24     Professional
## 7           Pria   64       Wiraswasta
## 8           Pria   52     Professional
## 9         Wanita   29     Professional
## 10          Pria   33     Professional
## 11        Wanita   50     Professional
## 12        Wanita   49     Professional
## 13        Wanita   64       Wiraswasta
## 14          Pria   60       Wiraswasta
## 15        Wanita   20       Wiraswasta
## 16          Pria   35     Professional
## 17        Wanita   32 Ibu Rumah Tangga
## 18        Wanita   63 Ibu Rumah Tangga
## 19        Wanita   32       Wiraswasta
## 20        Wanita   16          Pelajar
## 21        Wanita   38       Wiraswasta
## 22        Wanita   52     Professional
## 23          Pria   34     Professional
## 24        Wanita   39       Wiraswasta
## 25        Wanita   29       Wiraswasta
## 26        Wanita   55     Professional
## 27        Wanita   35       Wiraswasta
## 28        Wanita   40 Ibu Rumah Tangga
## 29        Wanita   56     Professional
## 30        Wanita   46 Ibu Rumah Tangga
## 31        Wanita   19        Mahasiswa
## 32        Wanita   47       Wiraswasta
## 33        Wanita   19        Mahasiswa
## 34        Wanita   21       Wiraswasta
## 35        Wanita   39     Professional
## 36        Wanita   30       Wiraswasta
## 37        Wanita   25     Professional
## 38        Wanita   46       Wiraswasta
## 39        Wanita   20     Professional
## 40        Wanita   14          Pelajar
## 41        Wanita   24 Ibu Rumah Tangga
## 42        Wanita   26       Wiraswasta
## 43        Wanita   31     Professional
## 44        Wanita   18       Wiraswasta
## 45        Wanita   22     Professional
## 46        Wanita   25       Wiraswasta
## 47        Wanita   55 Ibu Rumah Tangga
## 48        Wanita   45       Wiraswasta
## 49        Wanita   33 Ibu Rumah Tangga
## 50        Wanita   55       Wiraswasta

Data conversion with data.matrix

# Convert the categorical text columns to a numeric matrix of integer codes.
pelanggan_matrix <- data.matrix(
  pelanggan[, c("Jenis.Kelamin", "Profesi", "Tipe.Residen")]
)

Combining Conversion Results

# Load the raw customer data and build the numeric encoding of its
# categorical columns.
pelanggan <- read.csv(
  file = "https://storage.googleapis.com/dqlab-dataset/customer_segments.txt",
  sep = "\t"
)
pelanggan_matrix <- data.matrix(
  pelanggan[, c("Jenis.Kelamin", "Profesi", "Tipe.Residen")]
)
# Append the encoded columns to the original data. Their names clash with the
# existing ones, so data.frame() stores them with a ".1" suffix.
pelanggan <- data.frame(pelanggan, pelanggan_matrix)

pelanggan

##    Customer_ID        Nama.Pelanggan Jenis.Kelamin Umur          Profesi
## 1     CUST-001          Budi Anggara          Pria   58       Wiraswasta
## 2     CUST-002      Shirley Ratuwati        Wanita   14          Pelajar
## 3     CUST-003          Agus Cahyono          Pria   48     Professional
## 4     CUST-004      Antonius Winarta          Pria   53     Professional
## 5     CUST-005   Ibu Sri Wahyuni, IR        Wanita   41       Wiraswasta
## 6     CUST-006       Rosalina Kurnia        Wanita   24     Professional
## 7     CUST-007         Cahyono, Agus          Pria   64       Wiraswasta
## 8     CUST-008        Danang Santosa          Pria   52     Professional
## 9     CUST-009 Elisabeth Suryadinata        Wanita   29     Professional
## 10    CUST-010        Mario Setiawan          Pria   33     Professional
## 11    CUST-011        Maria Suryawan        Wanita   50     Professional
## 12    CUST-012       Erliana Widjaja        Wanita   49     Professional
## 13    CUST-013          Cahaya Putri        Wanita   64       Wiraswasta
## 14    CUST-014        Mario Setiawan          Pria   60       Wiraswasta
## 15    CUST-015      Shirley Ratuwati        Wanita   20       Wiraswasta
## 16    CUST-016          Bambang Rudi          Pria   35     Professional
## 17    CUST-017             Yuni Sari        Wanita   32 Ibu Rumah Tangga
## 18    CUST-018           Nelly Halim        Wanita   63 Ibu Rumah Tangga
## 19    CUST-019          Mega Pranoto        Wanita   32       Wiraswasta
## 20    CUST-020        Irene Novianto        Wanita   16          Pelajar
## 21    CUST-021      Lestari Fabianto        Wanita   38       Wiraswasta
## 22    CUST-022          Novita Purba        Wanita   52     Professional
## 23    CUST-023       Denny Amiruddin          Pria   34     Professional
## 24    CUST-024         Putri Ginting        Wanita   39       Wiraswasta
## 25    CUST-025        Julia Setiawan        Wanita   29       Wiraswasta
## 26    CUST-026     Christine Winarto        Wanita   55     Professional
## 27    CUST-027         Grace Mulyati        Wanita   35       Wiraswasta
## 28    CUST-028         Adeline Huang        Wanita   40 Ibu Rumah Tangga
## 29    CUST-029          Tia Hartanti        Wanita   56     Professional
## 30    CUST-030        Rosita Saragih        Wanita   46 Ibu Rumah Tangga
## 31    CUST-031         Eviana Handry        Wanita   19        Mahasiswa
## 32    CUST-032       Chintya Winarni        Wanita   47       Wiraswasta
## 33    CUST-033       Cecilia Kusnadi        Wanita   19        Mahasiswa
## 34    CUST-034        Deasy Arisandi        Wanita   21       Wiraswasta
## 35    CUST-035               Ida Ayu        Wanita   39     Professional
## 36    CUST-036        Ni Made Suasti        Wanita   30       Wiraswasta
## 37    CUST-037      Felicia Tandiono        Wanita   25     Professional
## 38    CUST-038          Agatha Salim        Wanita   46       Wiraswasta
## 39    CUST-039          Gina Hidayat        Wanita   20     Professional
## 40    CUST-040        Irene Darmawan        Wanita   14          Pelajar
## 41    CUST-041      Shinta Aritonang        Wanita   24 Ibu Rumah Tangga
## 42    CUST-042          Yuliana Wati        Wanita   26       Wiraswasta
## 43    CUST-043          Yenna Sumadi        Wanita   31     Professional
## 44    CUST-044                  Anna        Wanita   18       Wiraswasta
## 45    CUST-045        Rismawati Juni        Wanita   22     Professional
## 46    CUST-046          Elfira Surya        Wanita   25       Wiraswasta
## 47    CUST-047           Mira Kurnia        Wanita   55 Ibu Rumah Tangga
## 48    CUST-048      Maria Hutagalung        Wanita   45       Wiraswasta
## 49    CUST-049       Josephine Wahab        Wanita   33 Ibu Rumah Tangga
## 50    CUST-050        Lianna Nugraha        Wanita   55       Wiraswasta
##    Tipe.Residen NilaiBelanjaSetahun Jenis.Kelamin.1 Profesi.1 Tipe.Residen.1
## 1        Sector             9497927               1         5              2
## 2       Cluster             2722700               2         3              1
## 3       Cluster             5286429               1         4              1
## 4       Cluster             5204498               1         4              1
## 5       Cluster            10615206               2         5              1
## 6       Cluster             5215541               2         4              1
## 7        Sector             9837260               1         5              2
## 8       Cluster             5223569               1         4              1
## 9        Sector             5993218               2         4              2
## 10      Cluster             5257448               1         4              1
## 11       Sector             5987367               2         4              2
## 12       Sector             5941914               2         4              2
## 13      Cluster             9333168               2         5              1
## 14      Cluster             9471615               1         5              1
## 15      Cluster            10365668               2         5              1
## 16      Cluster             5262521               1         4              1
## 17      Cluster             5677762               2         1              1
## 18      Cluster             5340690               2         1              1
## 19      Cluster            10884508               2         5              1
## 20       Sector             2896845               2         3              2
## 21      Cluster             9222070               2         5              1
## 22      Cluster             5298157               2         4              1
## 23      Cluster             5239290               1         4              1
## 24      Cluster            10259572               2         5              1
## 25       Sector            10721998               2         5              2
## 26      Cluster             5269392               2         4              1
## 27      Cluster             9114159               2         5              1
## 28      Cluster             6631680               2         1              1
## 29      Cluster             5271845               2         4              1
## 30       Sector             5020976               2         1              2
## 31      Cluster             3042773               2         2              1
## 32       Sector            10663179               2         5              2
## 33      Cluster             3047926               2         2              1
## 34       Sector             9759822               2         5              2
## 35       Sector             5962575               2         4              2
## 36      Cluster             9678994               2         5              1
## 37       Sector             5972787               2         4              2
## 38       Sector            10477127               2         5              2
## 39      Cluster             5257775               2         4              1
## 40       Sector             2861855               2         3              2
## 41      Cluster             6820976               2         1              1
## 42      Cluster             9880607               2         5              1
## 43      Cluster             5268410               2         4              1
## 44      Cluster             9339737               2         5              1
## 45      Cluster             5211041               2         4              1
## 46       Sector            10099807               2         5              2
## 47      Cluster             6130724               2         1              1
## 48       Sector            10390732               2         5              2
## 49       Sector             4992585               2         1              2
## 50       Sector            10569316               2         5              2

Normalizing Shopping Value

# Load the data and append the integer-encoded categorical columns.
pelanggan <- read.csv(
  file = "https://storage.googleapis.com/dqlab-dataset/customer_segments.txt",
  sep = "\t"
)
pelanggan_matrix <- data.matrix(
  pelanggan[, c("Jenis.Kelamin", "Profesi", "Tipe.Residen")]
)
pelanggan <- data.frame(pelanggan, pelanggan_matrix)
# Rescale the yearly shopping value to millions.
pelanggan[["NilaiBelanjaSetahun"]] <- pelanggan[["NilaiBelanjaSetahun"]] / 1e6

Creating the Master Data

# Load the tab-separated customer data set.
pelanggan <- read.csv("https://storage.googleapis.com/dqlab-dataset/customer_segments.txt", sep = "\t")
# Encode the categorical text columns as integer codes and append them;
# data.frame() stores the clashing names with a ".1" suffix.
pelanggan_matrix <- data.matrix(pelanggan[c("Jenis.Kelamin", "Profesi", "Tipe.Residen")])
pelanggan <- data.frame(pelanggan, pelanggan_matrix)
# Rescale the yearly shopping value to millions.
pelanggan$NilaiBelanjaSetahun <- pelanggan$NilaiBelanjaSetahun / 1000000
# Master data: one lookup table per categorical column, pairing each original
# label with the integer code that now represents it.
Profesi <- unique(pelanggan[c("Profesi", "Profesi.1")])
Jenis.Kelamin <- unique(pelanggan[c("Jenis.Kelamin", "Jenis.Kelamin.1")])
Tipe.Residen <- unique(pelanggan[c("Tipe.Residen", "Tipe.Residen.1")])

K-means is an algorithm that divides data into a number of partitions in a simple way: it measures how close each point is to a number of average (mean) values, one per cluster.

Two key concepts also give k-means its name:

- The desired number of partitions, represented by the letter k.
- The “distance” of each point to the observed cluster mean values, represented by means.

The kmeans Function

# Part of Data Preparation
# Load the tab-separated customer data set.
pelanggan <- read.csv("https://storage.googleapis.com/dqlab-dataset/customer_segments.txt", sep = "\t")
# Encode the categorical text columns as integer codes and append them;
# data.frame() stores the clashing names with a ".1" suffix.
pelanggan_matrix <- data.matrix(pelanggan[c("Jenis.Kelamin", "Profesi", "Tipe.Residen")])
pelanggan <- data.frame(pelanggan, pelanggan_matrix)
# Master data: lookup tables from each original label to its integer code.
Profesi <- unique(pelanggan[c("Profesi", "Profesi.1")])
Jenis.Kelamin <- unique(pelanggan[c("Jenis.Kelamin", "Jenis.Kelamin.1")])
Tipe.Residen <- unique(pelanggan[c("Tipe.Residen", "Tipe.Residen.1")])
# Legacy alias: this residence-type table used to be stored under the
# misleading name Tipe.Profesi; kept so any existing reference still works.
Tipe.Profesi <- Tipe.Residen
# Rescale the yearly shopping value to millions.
pelanggan$NilaiBelanjaSetahun <- pelanggan$NilaiBelanjaSetahun / 1000000
# Numeric columns used as clustering input.
field_yang_digunakan <- c("Jenis.Kelamin.1", "Umur", "Profesi.1", "Tipe.Residen.1", "NilaiBelanjaSetahun")
# Part of K-Means
# Fix the random seed so the random starting centers are reproducible.
set.seed(1)
# Form 5 clusters, trying 25 random starts, and keep the result in `segmentasi`.
segmentasi <- kmeans(x = pelanggan[field_yang_digunakan], centers = 5, nstart = 25)
# Show the clustering result.
segmentasi

## K-means clustering with 5 clusters of sizes 12, 10, 5, 14, 9
## 
## Cluster means:
##   Jenis.Kelamin.1     Umur Profesi.1 Tipe.Residen.1 NilaiBelanjaSetahun
## 1            1.75 31.58333  3.916667       1.250000            7.330958
## 2            1.70 52.50000  3.800000       1.300000            6.018321
## 3            1.40 61.80000  4.200000       1.400000            8.696132
## 4            2.00 20.07143  3.571429       1.357143            5.901089
## 5            2.00 42.33333  4.000000       1.555556            8.804791
## 
## Clustering vector:
##  [1] 3 4 2 2 5 4 3 2 1 1 2 2 3 3 4 1 1 3 1 4 5 2 1 5 1 2 1 5 2 5 4 5 4 4 5 1 4 5
## [39] 4 4 4 1 1 4 4 4 2 5 1 2
## 
## Within cluster sum of squares by cluster:
## [1] 174.85164 108.49735  58.21123 316.73367 171.67372
##  (between_SS / total_SS =  92.4 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Vector Clustering Results Analysis

# Cluster number assigned to each customer, in row order.

segmentasi[["cluster"]]

##  [1] 3 4 2 2 5 4 3 2 1 1 2 2 3 3 4 1 1 3 1 4 5 2 1 5 1 2 1 5 2 5 4 5 4 4 5 1 4 5
## [39] 4 4 4 1 1 4 4 4 2 5 1 2

# Store each customer's cluster number as a new `cluster` column.
pelanggan[["cluster"]] <- segmentasi[["cluster"]]

# Inspect the structure of the enriched data frame.
str(object = pelanggan)

## 'data.frame':    50 obs. of  11 variables:
##  $ Customer_ID        : chr  "CUST-001" "CUST-002" "CUST-003" "CUST-004" ...
##  $ Nama.Pelanggan     : chr  "Budi Anggara" "Shirley Ratuwati" "Agus Cahyono" "Antonius Winarta" ...
##  $ Jenis.Kelamin      : chr  "Pria" "Wanita" "Pria" "Pria" ...
##  $ Umur               : int  58 14 48 53 41 24 64 52 29 33 ...
##  $ Profesi            : chr  "Wiraswasta" "Pelajar" "Professional" "Professional" ...
##  $ Tipe.Residen       : chr  "Sector" "Cluster" "Cluster" "Cluster" ...
##  $ NilaiBelanjaSetahun: num  9.5 2.72 5.29 5.2 10.62 ...
##  $ Jenis.Kelamin.1    : int  1 2 1 1 2 2 1 1 2 1 ...
##  $ Profesi.1          : int  5 3 4 4 5 4 5 4 4 4 ...
##  $ Tipe.Residen.1     : int  2 1 1 1 1 1 2 1 2 1 ...
##  $ cluster            : int  3 4 2 2 5 4 3 2 1 1 ...

Analysis of Cluster Size Results

# Result analysis
# Row positions of the customers assigned to cluster 2
which(pelanggan[["cluster"]] == 2)

##  [1]  3  4  8 11 12 22 26 29 47 50

# Number of customers assigned to cluster 2.
sum(pelanggan[["cluster"]] == 2, na.rm = TRUE)

## [1] 10

Viewing Data on the N-th Cluster

# Part of Data Preparation
# Load the tab-separated customer data set.
pelanggan <- read.csv("https://storage.googleapis.com/dqlab-dataset/customer_segments.txt", sep = "\t")
# Encode the categorical text columns as integer codes and append them;
# data.frame() stores the clashing names with a ".1" suffix.
pelanggan_matrix <- data.matrix(pelanggan[c("Jenis.Kelamin", "Profesi", "Tipe.Residen")])
pelanggan <- data.frame(pelanggan, pelanggan_matrix)
# Master data: lookup tables from each original label to its integer code.
Profesi <- unique(pelanggan[c("Profesi", "Profesi.1")])
Jenis.Kelamin <- unique(pelanggan[c("Jenis.Kelamin", "Jenis.Kelamin.1")])
Tipe.Residen <- unique(pelanggan[c("Tipe.Residen", "Tipe.Residen.1")])
# Legacy alias: this residence-type table used to be stored under the
# misleading name Tipe.Profesi; kept so any existing reference still works.
Tipe.Profesi <- Tipe.Residen
# Rescale the yearly shopping value to millions.
pelanggan$NilaiBelanjaSetahun <- pelanggan$NilaiBelanjaSetahun / 1000000
# Numeric columns used as clustering input.
field_yang_digunakan <- c("Jenis.Kelamin.1", "Umur", "Profesi.1", "Tipe.Residen.1", "NilaiBelanjaSetahun")
# Part of K-Means
# Fix the random seed so the random starting centers are reproducible.
set.seed(1)
segmentasi <- kmeans(x = pelanggan[field_yang_digunakan], centers = 5, nstart = 25)
# Store each customer's cluster number next to the customer data.
pelanggan$cluster <- segmentasi$cluster
# Analysis of result
# Show the customers assigned to cluster 3.
pelanggan[which(pelanggan$cluster == 3), ]

##    Customer_ID Nama.Pelanggan Jenis.Kelamin Umur          Profesi Tipe.Residen
## 1     CUST-001   Budi Anggara          Pria   58       Wiraswasta       Sector
## 7     CUST-007  Cahyono, Agus          Pria   64       Wiraswasta       Sector
## 13    CUST-013   Cahaya Putri        Wanita   64       Wiraswasta      Cluster
## 14    CUST-014 Mario Setiawan          Pria   60       Wiraswasta      Cluster
## 18    CUST-018    Nelly Halim        Wanita   63 Ibu Rumah Tangga      Cluster
##    NilaiBelanjaSetahun Jenis.Kelamin.1 Profesi.1 Tipe.Residen.1 cluster
## 1             9.497927               1         5              2       3
## 7             9.837260               1         5              2       3
## 13            9.333168               2         5              1       3
## 14            9.471615               1         5              1       3
## 18            5.340690               2         1              1       3

# Show the customers assigned to cluster 4.
pelanggan[pelanggan[["cluster"]] %in% 4, ]

##    Customer_ID   Nama.Pelanggan Jenis.Kelamin Umur          Profesi
## 2     CUST-002 Shirley Ratuwati        Wanita   14          Pelajar
## 6     CUST-006  Rosalina Kurnia        Wanita   24     Professional
## 15    CUST-015 Shirley Ratuwati        Wanita   20       Wiraswasta
## 20    CUST-020   Irene Novianto        Wanita   16          Pelajar
## 31    CUST-031    Eviana Handry        Wanita   19        Mahasiswa
## 33    CUST-033  Cecilia Kusnadi        Wanita   19        Mahasiswa
## 34    CUST-034   Deasy Arisandi        Wanita   21       Wiraswasta
## 37    CUST-037 Felicia Tandiono        Wanita   25     Professional
## 39    CUST-039     Gina Hidayat        Wanita   20     Professional
## 40    CUST-040   Irene Darmawan        Wanita   14          Pelajar
## 41    CUST-041 Shinta Aritonang        Wanita   24 Ibu Rumah Tangga
## 44    CUST-044             Anna        Wanita   18       Wiraswasta
## 45    CUST-045   Rismawati Juni        Wanita   22     Professional
## 46    CUST-046     Elfira Surya        Wanita   25       Wiraswasta
##    Tipe.Residen NilaiBelanjaSetahun Jenis.Kelamin.1 Profesi.1 Tipe.Residen.1
## 2       Cluster            2.722700               2         3              1
## 6       Cluster            5.215541               2         4              1
## 15      Cluster           10.365668               2         5              1
## 20       Sector            2.896845               2         3              2
## 31      Cluster            3.042773               2         2              1
## 33      Cluster            3.047926               2         2              1
## 34       Sector            9.759822               2         5              2
## 37       Sector            5.972787               2         4              2
## 39      Cluster            5.257775               2         4              1
## 40       Sector            2.861855               2         3              2
## 41      Cluster            6.820976               2         1              1
## 44      Cluster            9.339737               2         5              1
## 45      Cluster            5.211041               2         4              1
## 46       Sector           10.099807               2         5              2
##    cluster
## 2        4
## 6        4
## 15       4
## 20       4
## 31       4
## 33       4
## 34       4
## 37       4
## 39       4
## 40       4
## 41       4
## 44       4
## 45       4
## 46       4

# Show the customers assigned to cluster 5.
subset(pelanggan, cluster == 5)

##    Customer_ID      Nama.Pelanggan Jenis.Kelamin Umur          Profesi
## 5     CUST-005 Ibu Sri Wahyuni, IR        Wanita   41       Wiraswasta
## 21    CUST-021    Lestari Fabianto        Wanita   38       Wiraswasta
## 24    CUST-024       Putri Ginting        Wanita   39       Wiraswasta
## 28    CUST-028       Adeline Huang        Wanita   40 Ibu Rumah Tangga
## 30    CUST-030      Rosita Saragih        Wanita   46 Ibu Rumah Tangga
## 32    CUST-032     Chintya Winarni        Wanita   47       Wiraswasta
## 35    CUST-035             Ida Ayu        Wanita   39     Professional
## 38    CUST-038        Agatha Salim        Wanita   46       Wiraswasta
## 48    CUST-048    Maria Hutagalung        Wanita   45       Wiraswasta
##    Tipe.Residen NilaiBelanjaSetahun Jenis.Kelamin.1 Profesi.1 Tipe.Residen.1
## 5       Cluster           10.615206               2         5              1
## 21      Cluster            9.222070               2         5              1
## 24      Cluster           10.259572               2         5              1
## 28      Cluster            6.631680               2         1              1
## 30       Sector            5.020976               2         1              2
## 32       Sector           10.663179               2         5              2
## 35       Sector            5.962575               2         4              2
## 38       Sector           10.477127               2         5              2
## 48       Sector           10.390732               2         5              2
##    cluster
## 5        5
## 21       5
## 24       5
## 28       5
## 30       5
## 32       5
## 35       5
## 38       5
## 48       5

Analysis of Cluster Means Results

# Part of Data Preparation
# Load the tab-separated customer data set.
pelanggan <- read.csv("https://storage.googleapis.com/dqlab-dataset/customer_segments.txt", sep = "\t")
# Encode the categorical text columns as integer codes and append them;
# data.frame() stores the clashing names with a ".1" suffix.
pelanggan_matrix <- data.matrix(pelanggan[c("Jenis.Kelamin", "Profesi", "Tipe.Residen")])
pelanggan <- data.frame(pelanggan, pelanggan_matrix)
# Master data: lookup tables from each original label to its integer code.
Profesi <- unique(pelanggan[c("Profesi", "Profesi.1")])
Jenis.Kelamin <- unique(pelanggan[c("Jenis.Kelamin", "Jenis.Kelamin.1")])
Tipe.Residen <- unique(pelanggan[c("Tipe.Residen", "Tipe.Residen.1")])
# Legacy alias: this residence-type table used to be stored under the
# misleading name Tipe.Profesi; kept so any existing reference still works.
Tipe.Profesi <- Tipe.Residen
# Rescale the yearly shopping value to millions.
pelanggan$NilaiBelanjaSetahun <- pelanggan$NilaiBelanjaSetahun / 1000000
# Numeric columns used as clustering input.
field_yang_digunakan <- c("Jenis.Kelamin.1", "Umur", "Profesi.1", "Tipe.Residen.1", "NilaiBelanjaSetahun")
# Part of K-Means
# Fix the random seed so the random starting centers are reproducible.
set.seed(1)
segmentasi <- kmeans(x = pelanggan[field_yang_digunakan], centers = 5, nstart = 25)
# Store each customer's cluster number next to the customer data.
pelanggan$cluster <- segmentasi$cluster
# Analysis of the result
# Show the mean of every input column for each cluster.
segmentasi$centers

##   Jenis.Kelamin.1     Umur Profesi.1 Tipe.Residen.1 NilaiBelanjaSetahun
## 1            1.75 31.58333  3.916667       1.250000            7.330958
## 2            1.70 52.50000  3.800000       1.300000            6.018321
## 3            1.40 61.80000  4.200000       1.400000            8.696132
## 4            2.00 20.07143  3.571429       1.357143            5.901089
## 5            2.00 42.33333  4.000000       1.555556            8.804791

Sum of Squares Results Analysis

# Part of Data Preparation
# Load the tab-separated customer data set.
pelanggan <- read.csv("https://storage.googleapis.com/dqlab-dataset/customer_segments.txt", sep = "\t")
# Encode the categorical text columns as integer codes and append them;
# data.frame() stores the clashing names with a ".1" suffix.
pelanggan_matrix <- data.matrix(pelanggan[c("Jenis.Kelamin", "Profesi", "Tipe.Residen")])
pelanggan <- data.frame(pelanggan, pelanggan_matrix)
# Master data: lookup tables from each original label to its integer code.
Profesi <- unique(pelanggan[c("Profesi", "Profesi.1")])
Jenis.Kelamin <- unique(pelanggan[c("Jenis.Kelamin", "Jenis.Kelamin.1")])
Tipe.Residen <- unique(pelanggan[c("Tipe.Residen", "Tipe.Residen.1")])
# Legacy alias: this residence-type table used to be stored under the
# misleading name Tipe.Profesi; kept so any existing reference still works.
Tipe.Profesi <- Tipe.Residen
# Rescale the yearly shopping value to millions.
pelanggan$NilaiBelanjaSetahun <- pelanggan$NilaiBelanjaSetahun / 1000000
# Numeric columns used as clustering input.
field_yang_digunakan <- c("Jenis.Kelamin.1", "Umur", "Profesi.1", "Tipe.Residen.1", "NilaiBelanjaSetahun")
# Run k-means with 2 clusters so its sums of squares can be compared with a
# 5-cluster run; the seed makes the random starts reproducible.
set.seed(1)
kmeans(x = pelanggan[field_yang_digunakan], centers = 2, nstart = 25)

## K-means clustering with 2 clusters of sizes 23, 27
## 
## Cluster means:
##   Jenis.Kelamin.1     Umur Profesi.1 Tipe.Residen.1 NilaiBelanjaSetahun
## 1        1.739130 51.17391  3.913043       1.434783            7.551518
## 2        1.888889 25.85185  3.777778       1.296296            6.659586
## 
## Clustering vector:
##  [1] 1 2 1 1 1 2 1 1 2 2 1 1 1 1 2 2 2 1 2 2 2 1 2 1 2 1 2 1 1 1 2 1 2 2 1 2 2 1
## [39] 2 2 2 2 2 2 2 2 1 1 2 1
## 
## Within cluster sum of squares by cluster:
## [1] 1492.481 1524.081
##  (between_SS / total_SS =  72.6 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

# Same clustering input with 5 clusters; reset the seed first so the random
# starts are reproducible.
set.seed(seed = 1)
kmeans(
  pelanggan[field_yang_digunakan],
  centers = 5,
  nstart = 25
)

## K-means clustering with 5 clusters of sizes 12, 10, 5, 14, 9
## 
## Cluster means:
##   Jenis.Kelamin.1     Umur Profesi.1 Tipe.Residen.1 NilaiBelanjaSetahun
## 1            1.75 31.58333  3.916667       1.250000            7.330958
## 2            1.70 52.50000  3.800000       1.300000            6.018321
## 3            1.40 61.80000  4.200000       1.400000            8.696132
## 4            2.00 20.07143  3.571429       1.357143            5.901089
## 5            2.00 42.33333  4.000000       1.555556            8.804791
## 
## Clustering vector:
##  [1] 3 4 2 2 5 4 3 2 1 1 2 2 3 3 4 1 1 3 1 4 5 2 1 5 1 2 1 5 2 5 4 5 4 4 5 1 4 5
## [39] 4 4 4 1 1 4 4 4 2 5 1 2
## 
## Within cluster sum of squares by cluster:
## [1] 174.85164 108.49735  58.21123 316.73367 171.67372
##  (between_SS / total_SS =  92.4 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Available Components

# Part of Data Preparation
# Load the tab-separated customer data set.
pelanggan <- read.csv("https://storage.googleapis.com/dqlab-dataset/customer_segments.txt", sep = "\t")
# Encode the categorical text columns as integer codes and append them;
# data.frame() stores the clashing names with a ".1" suffix.
pelanggan_matrix <- data.matrix(pelanggan[c("Jenis.Kelamin", "Profesi", "Tipe.Residen")])
pelanggan <- data.frame(pelanggan, pelanggan_matrix)
# Master data: lookup tables from each original label to its integer code.
Profesi <- unique(pelanggan[c("Profesi", "Profesi.1")])
Jenis.Kelamin <- unique(pelanggan[c("Jenis.Kelamin", "Jenis.Kelamin.1")])
Tipe.Residen <- unique(pelanggan[c("Tipe.Residen", "Tipe.Residen.1")])
# Legacy alias: this residence-type table used to be stored under the
# misleading name Tipe.Profesi; kept so any existing reference still works.
Tipe.Profesi <- Tipe.Residen
# Rescale the yearly shopping value to millions.
pelanggan$NilaiBelanjaSetahun <- pelanggan$NilaiBelanjaSetahun / 1000000
# Numeric columns used as clustering input.
field_yang_digunakan <- c("Jenis.Kelamin.1", "Umur", "Profesi.1", "Tipe.Residen.1", "NilaiBelanjaSetahun")
# Part of K-Means
# Fix the random seed so the random starting centers are reproducible.
set.seed(1)
segmentasi <- kmeans(x = pelanggan[field_yang_digunakan], centers = 5, nstart = 25)
# Within-cluster sum of squares, one value per cluster.
segmentasi$withinss

## [1] 174.85164 108.49735  58.21123 316.73367 171.67372

# Cluster number assigned to each customer, in row order.
segmentasi[["cluster"]]

##  [1] 3 4 2 2 5 4 3 2 1 1 2 2 3 3 4 1 1 3 1 4 5 2 1 5 1 2 1 5 2 5 4 5 4 4 5 1 4 5
## [39] 4 4 4 1 1 4 4 4 2 5 1 2

# Total within-cluster sum of squares across all clusters.
segmentasi[["tot.withinss"]]

## [1] 829.9676

By analyzing this output, we can attach the cluster numbers to the original data. We also learn how close the data points within each cluster are to one another, which gives us a basis for determining the optimal number of clusters.

Among the information returned by the kmeans function, the Sum of Squares (SS), often called the Sum of Squared Errors (SSE), is an important basis for determining the optimal number of clusters.

Theoretically, here are some things we can observe about the SS:

- The fewer clusters are generated, the greater the SS value.
- Conversely, the more clusters are produced, the smaller the SS value.
- Because the measure is quadratic, a significant difference between cluster combinations makes the difference in SS values even greater.
- As the number of clusters increases, the difference between successive SS values becomes smaller.

Simulated Number of Clusters and SS

# Part of Data Preparation
# Load the tab-separated customer data set.
pelanggan <- read.csv("https://storage.googleapis.com/dqlab-dataset/customer_segments.txt", sep = "\t")
# Encode the categorical text columns as integer codes and append them;
# data.frame() stores the clashing names with a ".1" suffix.
pelanggan_matrix <- data.matrix(pelanggan[c("Jenis.Kelamin", "Profesi", "Tipe.Residen")])
pelanggan <- data.frame(pelanggan, pelanggan_matrix)
# Master data: lookup tables from each original label to its integer code.
Profesi <- unique(pelanggan[c("Profesi", "Profesi.1")])
Jenis.Kelamin <- unique(pelanggan[c("Jenis.Kelamin", "Jenis.Kelamin.1")])
Tipe.Residen <- unique(pelanggan[c("Tipe.Residen", "Tipe.Residen.1")])
# Legacy alias: this residence-type table used to be stored under the
# misleading name Tipe.Profesi; kept so any existing reference still works.
Tipe.Profesi <- Tipe.Residen
# Rescale the yearly shopping value to millions.
pelanggan$NilaiBelanjaSetahun <- pelanggan$NilaiBelanjaSetahun / 1000000
# Numeric columns used as clustering input.
field_yang_digunakan <- c("Jenis.Kelamin.1", "Umur", "Profesi.1", "Tipe.Residen.1", "NilaiBelanjaSetahun")
# Part of K-Means
# Fix the random seed so the random starting centers are reproducible.
set.seed(1)
# Total within-cluster sum of squares for k = 1..10. vapply() guarantees a
# numeric vector with exactly one value per k.
sse <- vapply(
  seq_len(10),
  function(param_k) {
    kmeans(pelanggan[field_yang_digunakan], param_k, nstart = 25)$tot.withinss
  },
  numeric(1)
)
sse

##  [1] 10990.9740  3016.5612  1550.8725  1064.4187   829.9676   625.1462
##  [7]   508.1568   431.6977   374.1095   325.7982

Elbow Effect Graph

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.2.2

# Part of Data Preparation
# Load the tab-separated customer data set.
pelanggan <- read.csv("https://storage.googleapis.com/dqlab-dataset/customer_segments.txt", sep = "\t")
# Encode the categorical text columns as integer codes and append them;
# data.frame() stores the clashing names with a ".1" suffix.
pelanggan_matrix <- data.matrix(pelanggan[c("Jenis.Kelamin", "Profesi", "Tipe.Residen")])
pelanggan <- data.frame(pelanggan, pelanggan_matrix)
# Master data: lookup tables from each original label to its integer code.
Profesi <- unique(pelanggan[c("Profesi", "Profesi.1")])
Jenis.Kelamin <- unique(pelanggan[c("Jenis.Kelamin", "Jenis.Kelamin.1")])
Tipe.Residen <- unique(pelanggan[c("Tipe.Residen", "Tipe.Residen.1")])
# Legacy alias: this residence-type table used to be stored under the
# misleading name Tipe.Profesi; kept so any existing reference still works.
Tipe.Profesi <- Tipe.Residen
# Rescale the yearly shopping value to millions.
pelanggan$NilaiBelanjaSetahun <- pelanggan$NilaiBelanjaSetahun / 1000000
# Numeric columns used as clustering input.
field_yang_digunakan <- c("Jenis.Kelamin.1", "Umur", "Profesi.1", "Tipe.Residen.1", "NilaiBelanjaSetahun")
# Part of K-Means
# Largest number of clusters to simulate; it drives both the simulation and
# the plot so the two cannot drift apart.
jumlah_cluster_max <- 10
# Fix the random seed so the random starting centers are reproducible.
set.seed(1)
# Total within-cluster sum of squares for every k from 1 to jumlah_cluster_max.
sse <- vapply(
  seq_len(jumlah_cluster_max),
  function(param_k) {
    kmeans(pelanggan[field_yang_digunakan], param_k, nstart = 25)$tot.withinss
  },
  numeric(1)
)

ssdata <- data.frame(cluster = seq_len(jumlah_cluster_max), sse = sse)
# Elbow plot: SSE against the number of clusters, each point labelled with
# its value rounded to two decimals.
ggplot(ssdata, aes(x = cluster, y = sse)) +
  geom_line(color = "red") +
  geom_point() +
  ylab("Within Cluster Sum of Squares") +
  xlab("Jumlah Cluster") +
  geom_text(aes(label = format(round(sse, 2), nsmall = 2)), hjust = -0.2, vjust = -0.5) +
  # `cluster` is numeric, so use a continuous scale with one break per k.
  # A discrete scale given numeric limits makes ggplot2 warn
  # "Continuous limits supplied to discrete scale".
  scale_x_continuous(breaks = seq_len(jumlah_cluster_max))

## Warning: Continuous limits supplied to discrete scale.
## ℹ Did you mean `limits = factor(...)` or `scale_*_continuous()`?

Naming Segments

# Reference table giving each of the five cluster numbers a segment name.
Segmen.Pelanggan <- data.frame(
  cluster = as.numeric(1:5),
  Nama.Segmen = c(
    "Silver Youth Gals",
    "Diamond Senior Member",
    "Gold Young Professional",
    "Diamond Professional",
    "Silver Mid Professional"
  )
)

Merging References

# Fields fed to k-means, and the table naming each resulting cluster.
field_yang_digunakan <- c(
  "Jenis.Kelamin.1", "Umur", "Profesi.1", "Tipe.Residen.1",
  "NilaiBelanjaSetahun"
)
Segmen.Pelanggan <- data.frame(
  cluster = as.numeric(1:5),
  Nama.Segmen = c(
    "Silver Youth Gals",
    "Diamond Senior Member",
    "Gold Young Professional",
    "Diamond Professional",
    "Silver Mid Professional"
  )
)

# Load the customer file and append integer-coded copies of the categorical
# columns (the copies are referenced below with a ".1" suffix).
pelanggan <- read.csv(
  "https://storage.googleapis.com/dqlab-dataset/customer_segments.txt",
  sep = "\t"
)
pelanggan_matrix <- data.matrix(
  pelanggan[c("Jenis.Kelamin", "Profesi", "Tipe.Residen")]
)
pelanggan <- data.frame(pelanggan, pelanggan_matrix)

# Label-to-code lookup tables for each categorical column.
Profesi <- unique(pelanggan[c("Profesi", "Profesi.1")])
Jenis.Kelamin <- unique(pelanggan[c("Jenis.Kelamin", "Jenis.Kelamin.1")])
Tipe.Residen <- unique(pelanggan[c("Tipe.Residen", "Tipe.Residen.1")])

# Yearly spending divided by one million.
pelanggan$NilaiBelanjaSetahun <- pelanggan$NilaiBelanjaSetahun / 1e6

# K-means with five centers; the seed is set right before the fit.
set.seed(1)
segmentasi <- kmeans(
  x = pelanggan[field_yang_digunakan],
  centers = 5,
  nstart = 25
)

# Bundle everything needed to classify new customers into one list.
Identitas.Cluster <- list(
  Profesi = Profesi,
  Jenis.Kelamin = Jenis.Kelamin,
  Tipe.Residen = Tipe.Residen,
  Segmentasi = segmentasi,
  Segmen.Pelanggan = Segmen.Pelanggan,
  field_yang_digunakan = field_yang_digunakan
)

Saving Objects in File Form

# Fields fed to k-means, and the table naming each resulting cluster.
field_yang_digunakan <- c(
  "Jenis.Kelamin.1", "Umur", "Profesi.1", "Tipe.Residen.1",
  "NilaiBelanjaSetahun"
)
Segmen.Pelanggan <- data.frame(
  cluster = as.numeric(1:5),
  Nama.Segmen = c(
    "Silver Youth Gals",
    "Diamond Senior Member",
    "Gold Young Professional",
    "Diamond Professional",
    "Silver Mid Professional"
  )
)

# Load the customer file and append integer-coded copies of the categorical
# columns (the copies are referenced below with a ".1" suffix).
pelanggan <- read.csv(
  "https://storage.googleapis.com/dqlab-dataset/customer_segments.txt",
  sep = "\t"
)
pelanggan_matrix <- data.matrix(
  pelanggan[c("Jenis.Kelamin", "Profesi", "Tipe.Residen")]
)
pelanggan <- data.frame(pelanggan, pelanggan_matrix)

# Label-to-code lookup tables for each categorical column.
Profesi <- unique(pelanggan[c("Profesi", "Profesi.1")])
Jenis.Kelamin <- unique(pelanggan[c("Jenis.Kelamin", "Jenis.Kelamin.1")])
Tipe.Residen <- unique(pelanggan[c("Tipe.Residen", "Tipe.Residen.1")])

# Yearly spending divided by one million.
pelanggan$NilaiBelanjaSetahun <- pelanggan$NilaiBelanjaSetahun / 1e6

# K-means with five centers; the seed is set right before the fit.
set.seed(1)
segmentasi <- kmeans(
  x = pelanggan[field_yang_digunakan],
  centers = 5,
  nstart = 25
)

# Bundle the lookups, the fitted model, the segment names and the field list,
# then write the bundle to disk so it can be reloaded later.
Identitas.Cluster <- list(
  Profesi = Profesi,
  Jenis.Kelamin = Jenis.Kelamin,
  Tipe.Residen = Tipe.Residen,
  Segmentasi = segmentasi,
  Segmen.Pelanggan = Segmen.Pelanggan,
  field_yang_digunakan = field_yang_digunakan
)
saveRDS(Identitas.Cluster, file = "cluster.rds")

New data

# One new customer record, using the same column names as the customer file.
databaru <- data.frame(
  Customer_ID = "CUST-100",
  Nama.Pelanggan = "Rudi Wilamar",
  Umur = 20,
  Jenis.Kelamin = "Wanita",
  Profesi = "Pelajar",
  Tipe.Residen = "Cluster",
  NilaiBelanjaSetahun = 3.5
)
print(databaru)

##   Customer_ID Nama.Pelanggan Umur Jenis.Kelamin Profesi Tipe.Residen
## 1    CUST-100   Rudi Wilamar   20        Wanita Pelajar      Cluster
##   NilaiBelanjaSetahun
## 1                 3.5

Loading the Clustering Object from a File

# Restore the saved clustering bundle from disk and display its contents.
Identitas.Cluster <- readRDS("cluster.rds")
print(Identitas.Cluster)

## $Profesi
##             Profesi Profesi.1
## 1        Wiraswasta         5
## 2           Pelajar         3
## 3      Professional         4
## 17 Ibu Rumah Tangga         1
## 31        Mahasiswa         2
## 
## $Jenis.Kelamin
##   Jenis.Kelamin Jenis.Kelamin.1
## 1          Pria               1
## 2        Wanita               2
## 
## $Tipe.Residen
##   Tipe.Residen Tipe.Residen.1
## 1       Sector              2
## 2      Cluster              1
## 
## $Segmentasi
## K-means clustering with 5 clusters of sizes 12, 10, 5, 14, 9
## 
## Cluster means:
##   Jenis.Kelamin.1     Umur Profesi.1 Tipe.Residen.1 NilaiBelanjaSetahun
## 1            1.75 31.58333  3.916667       1.250000            7.330958
## 2            1.70 52.50000  3.800000       1.300000            6.018321
## 3            1.40 61.80000  4.200000       1.400000            8.696132
## 4            2.00 20.07143  3.571429       1.357143            5.901089
## 5            2.00 42.33333  4.000000       1.555556            8.804791
## 
## Clustering vector:
##  [1] 3 4 2 2 5 4 3 2 1 1 2 2 3 3 4 1 1 3 1 4 5 2 1 5 1 2 1 5 2 5 4 5 4 4 5 1 4 5
## [39] 4 4 4 1 1 4 4 4 2 5 1 2
## 
## Within cluster sum of squares by cluster:
## [1] 174.85164 108.49735  58.21123 316.73367 171.67372
##  (between_SS / total_SS =  92.4 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"      
## 
## $Segmen.Pelanggan
##   cluster             Nama.Segmen
## 1       1       Silver Youth Gals
## 2       2   Diamond Senior Member
## 3       3 Gold Young Professional
## 4       4    Diamond Professional
## 5       5 Silver Mid Professional
## 
## $field_yang_digunakan
## [1] "Jenis.Kelamin.1"     "Umur"                "Profesi.1"          
## [4] "Tipe.Residen.1"      "NilaiBelanjaSetahun"

Merge with Data Reference

# New customer record (this example carries no Umur column).
databaru <- data.frame(
  Customer_ID = "CUST-100",
  Nama.Pelanggan = "Rudi Wilamar",
  Jenis.Kelamin = "Wanita",
  Profesi = "Pelajar",
  Tipe.Residen = "Cluster",
  NilaiBelanjaSetahun = 3.5
)
Identitas.Cluster <- readRDS("cluster.rds")

# Merge the record with each lookup table in turn (Profesi, then
# Jenis.Kelamin, then Tipe.Residen); each merge joins on the shared label
# column and adds the matching integer-code column.
databaru <- Reduce(
  merge,
  Identitas.Cluster[c("Profesi", "Jenis.Kelamin", "Tipe.Residen")],
  databaru
)
print(databaru)

##   Tipe.Residen Jenis.Kelamin Profesi Customer_ID Nama.Pelanggan
## 1      Cluster        Wanita Pelajar    CUST-100   Rudi Wilamar
##   NilaiBelanjaSetahun Profesi.1 Jenis.Kelamin.1 Tipe.Residen.1
## 1                 3.5         3               2              1

Defining the Cluster

# Build the new customer record to classify.
databaru <- data.frame(Customer_ID = "CUST-100", Nama.Pelanggan = "Rudi Wilamar", Umur = 32, Jenis.Kelamin = "Wanita", Profesi = "Pelajar", Tipe.Residen = "Cluster", NilaiBelanjaSetahun = 3.5)

# Restore the saved lookups, fitted k-means model and segment names.
Identitas.Cluster <- readRDS(file = "cluster.rds")

# Attach the integer codes for each categorical column.
databaru <- merge(databaru, Identitas.Cluster$Profesi)
databaru <- merge(databaru, Identitas.Cluster$Jenis.Kelamin)
databaru <- merge(databaru, Identitas.Cluster$Tipe.Residen)

# merge() drops the row when a category label is absent from a lookup table;
# fail with a clear message instead of computing distances on an empty frame.
if (nrow(databaru) != 1L) {
  stop(
    "databaru must match exactly one row of the reference tables; ",
    "check Profesi, Jenis.Kelamin and Tipe.Residen for unknown values.",
    call. = FALSE
  )
}

# Assign the record to the cluster whose center has the smallest squared
# Euclidean distance. Columns are selected by name so the record and the
# centers are aligned, and the number of clusters is taken from the model.
fields <- Identitas.Cluster$field_yang_digunakan
centers <- Identitas.Cluster$Segmentasi$centers[, fields, drop = FALSE]
new_point <- unlist(databaru[fields])
sq_dist <- rowSums(sweep(centers, 2, new_point)^2)
nearest <- unname(which.min(sq_dist))

# Look the segment up by cluster id rather than by row position.
segments <- Identitas.Cluster$Segmen.Pelanggan
segments[match(nearest, segments$cluster), ]

##   cluster       Nama.Segmen
## 1       1 Silver Youth Gals