#Load the tab-separated customer segmentation file into `pelanggan`
pelanggan <- read.csv(
  file = "https://storage.googleapis.com/dqlab-dataset/customer_segments.txt",
  sep = "\t"
)
#Print only the four descriptive columns listed below
pelanggan[, c("Jenis.Kelamin", "Umur", "Profesi", "Tipe.Residen")]
## Jenis.Kelamin Umur Profesi Tipe.Residen
## 1 Pria 58 Wiraswasta Sector
## 2 Wanita 14 Pelajar Cluster
## 3 Pria 48 Professional Cluster
## 4 Pria 53 Professional Cluster
## 5 Wanita 41 Wiraswasta Cluster
## 6 Wanita 24 Professional Cluster
## 7 Pria 64 Wiraswasta Sector
## 8 Pria 52 Professional Cluster
## 9 Wanita 29 Professional Sector
## 10 Pria 33 Professional Cluster
## 11 Wanita 50 Professional Sector
## 12 Wanita 49 Professional Sector
## 13 Wanita 64 Wiraswasta Cluster
## 14 Pria 60 Wiraswasta Cluster
## 15 Wanita 20 Wiraswasta Cluster
## 16 Pria 35 Professional Cluster
## 17 Wanita 32 Ibu Rumah Tangga Cluster
## 18 Wanita 63 Ibu Rumah Tangga Cluster
## 19 Wanita 32 Wiraswasta Cluster
## 20 Wanita 16 Pelajar Sector
## 21 Wanita 38 Wiraswasta Cluster
## 22 Wanita 52 Professional Cluster
## 23 Pria 34 Professional Cluster
## 24 Wanita 39 Wiraswasta Cluster
## 25 Wanita 29 Wiraswasta Sector
## 26 Wanita 55 Professional Cluster
## 27 Wanita 35 Wiraswasta Cluster
## 28 Wanita 40 Ibu Rumah Tangga Cluster
## 29 Wanita 56 Professional Cluster
## 30 Wanita 46 Ibu Rumah Tangga Sector
## 31 Wanita 19 Mahasiswa Cluster
## 32 Wanita 47 Wiraswasta Sector
## 33 Wanita 19 Mahasiswa Cluster
## 34 Wanita 21 Wiraswasta Sector
## 35 Wanita 39 Professional Sector
## 36 Wanita 30 Wiraswasta Cluster
## 37 Wanita 25 Professional Sector
## 38 Wanita 46 Wiraswasta Sector
## 39 Wanita 20 Professional Cluster
## 40 Wanita 14 Pelajar Sector
## 41 Wanita 24 Ibu Rumah Tangga Cluster
## 42 Wanita 26 Wiraswasta Cluster
## 43 Wanita 31 Professional Cluster
## 44 Wanita 18 Wiraswasta Cluster
## 45 Wanita 22 Professional Cluster
## 46 Wanita 25 Wiraswasta Sector
## 47 Wanita 55 Ibu Rumah Tangga Cluster
## 48 Wanita 45 Wiraswasta Sector
## 49 Wanita 33 Ibu Rumah Tangga Sector
## 50 Wanita 55 Wiraswasta Sector
#Read the tab-separated customer file into `pelanggan`
pelanggan <- read.csv(
  file = "https://storage.googleapis.com/dqlab-dataset/customer_segments.txt",
  sep = "\t"
)
#Column names to display, kept in `field_yang_digunakan`
field_yang_digunakan <- c("Jenis.Kelamin", "Umur", "Profesi")
#Print the selected columns of `pelanggan`
pelanggan[, field_yang_digunakan]
## Jenis.Kelamin Umur Profesi
## 1 Pria 58 Wiraswasta
## 2 Wanita 14 Pelajar
## 3 Pria 48 Professional
## 4 Pria 53 Professional
## 5 Wanita 41 Wiraswasta
## 6 Wanita 24 Professional
## 7 Pria 64 Wiraswasta
## 8 Pria 52 Professional
## 9 Wanita 29 Professional
## 10 Pria 33 Professional
## 11 Wanita 50 Professional
## 12 Wanita 49 Professional
## 13 Wanita 64 Wiraswasta
## 14 Pria 60 Wiraswasta
## 15 Wanita 20 Wiraswasta
## 16 Pria 35 Professional
## 17 Wanita 32 Ibu Rumah Tangga
## 18 Wanita 63 Ibu Rumah Tangga
## 19 Wanita 32 Wiraswasta
## 20 Wanita 16 Pelajar
## 21 Wanita 38 Wiraswasta
## 22 Wanita 52 Professional
## 23 Pria 34 Professional
## 24 Wanita 39 Wiraswasta
## 25 Wanita 29 Wiraswasta
## 26 Wanita 55 Professional
## 27 Wanita 35 Wiraswasta
## 28 Wanita 40 Ibu Rumah Tangga
## 29 Wanita 56 Professional
## 30 Wanita 46 Ibu Rumah Tangga
## 31 Wanita 19 Mahasiswa
## 32 Wanita 47 Wiraswasta
## 33 Wanita 19 Mahasiswa
## 34 Wanita 21 Wiraswasta
## 35 Wanita 39 Professional
## 36 Wanita 30 Wiraswasta
## 37 Wanita 25 Professional
## 38 Wanita 46 Wiraswasta
## 39 Wanita 20 Professional
## 40 Wanita 14 Pelajar
## 41 Wanita 24 Ibu Rumah Tangga
## 42 Wanita 26 Wiraswasta
## 43 Wanita 31 Professional
## 44 Wanita 18 Wiraswasta
## 45 Wanita 22 Professional
## 46 Wanita 25 Wiraswasta
## 47 Wanita 55 Ibu Rumah Tangga
## 48 Wanita 45 Wiraswasta
## 49 Wanita 33 Ibu Rumah Tangga
## 50 Wanita 55 Wiraswasta
#Data conversion to numeric
#Read the data once; the conversion below is computed from this fresh copy
pelanggan <- read.csv("https://storage.googleapis.com/dqlab-dataset/customer_segments.txt", sep = "\t")
#data.matrix() turns the three categorical columns into integer codes
#(the codes appear in the ".1" columns of the printed result)
pelanggan_matrix <- data.matrix(pelanggan[c("Jenis.Kelamin", "Profesi", "Tipe.Residen")])
#Combine the original data with the coded columns; data.frame() appends ".1"
#to the duplicated column names
pelanggan <- data.frame(pelanggan, pelanggan_matrix)
pelanggan
## Customer_ID Nama.Pelanggan Jenis.Kelamin Umur Profesi
## 1 CUST-001 Budi Anggara Pria 58 Wiraswasta
## 2 CUST-002 Shirley Ratuwati Wanita 14 Pelajar
## 3 CUST-003 Agus Cahyono Pria 48 Professional
## 4 CUST-004 Antonius Winarta Pria 53 Professional
## 5 CUST-005 Ibu Sri Wahyuni, IR Wanita 41 Wiraswasta
## 6 CUST-006 Rosalina Kurnia Wanita 24 Professional
## 7 CUST-007 Cahyono, Agus Pria 64 Wiraswasta
## 8 CUST-008 Danang Santosa Pria 52 Professional
## 9 CUST-009 Elisabeth Suryadinata Wanita 29 Professional
## 10 CUST-010 Mario Setiawan Pria 33 Professional
## 11 CUST-011 Maria Suryawan Wanita 50 Professional
## 12 CUST-012 Erliana Widjaja Wanita 49 Professional
## 13 CUST-013 Cahaya Putri Wanita 64 Wiraswasta
## 14 CUST-014 Mario Setiawan Pria 60 Wiraswasta
## 15 CUST-015 Shirley Ratuwati Wanita 20 Wiraswasta
## 16 CUST-016 Bambang Rudi Pria 35 Professional
## 17 CUST-017 Yuni Sari Wanita 32 Ibu Rumah Tangga
## 18 CUST-018 Nelly Halim Wanita 63 Ibu Rumah Tangga
## 19 CUST-019 Mega Pranoto Wanita 32 Wiraswasta
## 20 CUST-020 Irene Novianto Wanita 16 Pelajar
## 21 CUST-021 Lestari Fabianto Wanita 38 Wiraswasta
## 22 CUST-022 Novita Purba Wanita 52 Professional
## 23 CUST-023 Denny Amiruddin Pria 34 Professional
## 24 CUST-024 Putri Ginting Wanita 39 Wiraswasta
## 25 CUST-025 Julia Setiawan Wanita 29 Wiraswasta
## 26 CUST-026 Christine Winarto Wanita 55 Professional
## 27 CUST-027 Grace Mulyati Wanita 35 Wiraswasta
## 28 CUST-028 Adeline Huang Wanita 40 Ibu Rumah Tangga
## 29 CUST-029 Tia Hartanti Wanita 56 Professional
## 30 CUST-030 Rosita Saragih Wanita 46 Ibu Rumah Tangga
## 31 CUST-031 Eviana Handry Wanita 19 Mahasiswa
## 32 CUST-032 Chintya Winarni Wanita 47 Wiraswasta
## 33 CUST-033 Cecilia Kusnadi Wanita 19 Mahasiswa
## 34 CUST-034 Deasy Arisandi Wanita 21 Wiraswasta
## 35 CUST-035 Ida Ayu Wanita 39 Professional
## 36 CUST-036 Ni Made Suasti Wanita 30 Wiraswasta
## 37 CUST-037 Felicia Tandiono Wanita 25 Professional
## 38 CUST-038 Agatha Salim Wanita 46 Wiraswasta
## 39 CUST-039 Gina Hidayat Wanita 20 Professional
## 40 CUST-040 Irene Darmawan Wanita 14 Pelajar
## 41 CUST-041 Shinta Aritonang Wanita 24 Ibu Rumah Tangga
## 42 CUST-042 Yuliana Wati Wanita 26 Wiraswasta
## 43 CUST-043 Yenna Sumadi Wanita 31 Professional
## 44 CUST-044 Anna Wanita 18 Wiraswasta
## 45 CUST-045 Rismawati Juni Wanita 22 Professional
## 46 CUST-046 Elfira Surya Wanita 25 Wiraswasta
## 47 CUST-047 Mira Kurnia Wanita 55 Ibu Rumah Tangga
## 48 CUST-048 Maria Hutagalung Wanita 45 Wiraswasta
## 49 CUST-049 Josephine Wahab Wanita 33 Ibu Rumah Tangga
## 50 CUST-050 Lianna Nugraha Wanita 55 Wiraswasta
## Tipe.Residen NilaiBelanjaSetahun Jenis.Kelamin.1 Profesi.1 Tipe.Residen.1
## 1 Sector 9497927 1 5 2
## 2 Cluster 2722700 2 3 1
## 3 Cluster 5286429 1 4 1
## 4 Cluster 5204498 1 4 1
## 5 Cluster 10615206 2 5 1
## 6 Cluster 5215541 2 4 1
## 7 Sector 9837260 1 5 2
## 8 Cluster 5223569 1 4 1
## 9 Sector 5993218 2 4 2
## 10 Cluster 5257448 1 4 1
## 11 Sector 5987367 2 4 2
## 12 Sector 5941914 2 4 2
## 13 Cluster 9333168 2 5 1
## 14 Cluster 9471615 1 5 1
## 15 Cluster 10365668 2 5 1
## 16 Cluster 5262521 1 4 1
## 17 Cluster 5677762 2 1 1
## 18 Cluster 5340690 2 1 1
## 19 Cluster 10884508 2 5 1
## 20 Sector 2896845 2 3 2
## 21 Cluster 9222070 2 5 1
## 22 Cluster 5298157 2 4 1
## 23 Cluster 5239290 1 4 1
## 24 Cluster 10259572 2 5 1
## 25 Sector 10721998 2 5 2
## 26 Cluster 5269392 2 4 1
## 27 Cluster 9114159 2 5 1
## 28 Cluster 6631680 2 1 1
## 29 Cluster 5271845 2 4 1
## 30 Sector 5020976 2 1 2
## 31 Cluster 3042773 2 2 1
## 32 Sector 10663179 2 5 2
## 33 Cluster 3047926 2 2 1
## 34 Sector 9759822 2 5 2
## 35 Sector 5962575 2 4 2
## 36 Cluster 9678994 2 5 1
## 37 Sector 5972787 2 4 2
## 38 Sector 10477127 2 5 2
## 39 Cluster 5257775 2 4 1
## 40 Sector 2861855 2 3 2
## 41 Cluster 6820976 2 1 1
## 42 Cluster 9880607 2 5 1
## 43 Cluster 5268410 2 4 1
## 44 Cluster 9339737 2 5 1
## 45 Cluster 5211041 2 4 1
## 46 Sector 10099807 2 5 2
## 47 Cluster 6130724 2 1 1
## 48 Sector 10390732 2 5 2
## 49 Sector 4992585 2 1 2
## 50 Sector 10569316 2 5 2
#Data preparation: load the data and append integer codes for the
#categorical columns (done once; the result feeds every step below)
pelanggan <- read.csv("https://storage.googleapis.com/dqlab-dataset/customer_segments.txt", sep = "\t")
pelanggan_matrix <- data.matrix(pelanggan[c("Jenis.Kelamin", "Profesi", "Tipe.Residen")])
pelanggan <- data.frame(pelanggan, pelanggan_matrix)
#Normalizing value: express the annual spending in millions
#(presumably to keep it on a scale comparable to the other columns - confirm)
pelanggan$NilaiBelanjaSetahun <- pelanggan$NilaiBelanjaSetahun / 1000000
#Filling data master: one lookup table per categorical column, pairing each
#text label with its integer code
Profesi <- unique(pelanggan[c("Profesi", "Profesi.1")])
Jenis.Kelamin <- unique(pelanggan[c("Jenis.Kelamin", "Jenis.Kelamin.1")])
Tipe.Residen <- unique(pelanggan[c("Tipe.Residen", "Tipe.Residen.1")])
K-means is an algorithm that divides data into a number of partitions in a simple way: it assigns each point to the cluster whose average (mean) value is closest to it.
Two key concepts give k-means its name:
- The desired number of partitions, represented by the letter k.
- The “distance” (proximity) of each point to the mean value of each observed cluster, represented by "means".
#Part of Data Preparation
pelanggan <- read.csv("https://storage.googleapis.com/dqlab-dataset/customer_segments.txt", sep = "\t")
#Integer codes for the categorical columns, appended with a ".1" suffix
pelanggan_matrix <- data.matrix(pelanggan[c("Jenis.Kelamin", "Profesi", "Tipe.Residen")])
pelanggan <- data.frame(pelanggan, pelanggan_matrix)
#Lookup tables pairing each category label with its integer code
Profesi <- unique(pelanggan[c("Profesi", "Profesi.1")])
Jenis.Kelamin <- unique(pelanggan[c("Jenis.Kelamin", "Jenis.Kelamin.1")])
#The residence-type table is named after the column it describes
Tipe.Residen <- unique(pelanggan[c("Tipe.Residen", "Tipe.Residen.1")])
#Express the annual spending in millions
pelanggan$NilaiBelanjaSetahun <- pelanggan$NilaiBelanjaSetahun / 1000000
#Numeric columns passed to kmeans
field_yang_digunakan <- c("Jenis.Kelamin.1", "Umur", "Profesi.1", "Tipe.Residen.1", "NilaiBelanjaSetahun")
#Part of K-Means
#Fix the seed so the random starts give the same result on every run
set.seed(1)
#kmeans function to form 5 clusters with 25 random starts, saved in segmentasi
segmentasi <- kmeans(x = pelanggan[field_yang_digunakan], centers = 5, nstart = 25)
#Show the k-means result
segmentasi
## K-means clustering with 5 clusters of sizes 12, 10, 5, 14, 9
##
## Cluster means:
## Jenis.Kelamin.1 Umur Profesi.1 Tipe.Residen.1 NilaiBelanjaSetahun
## 1 1.75 31.58333 3.916667 1.250000 7.330958
## 2 1.70 52.50000 3.800000 1.300000 6.018321
## 3 1.40 61.80000 4.200000 1.400000 8.696132
## 4 2.00 20.07143 3.571429 1.357143 5.901089
## 5 2.00 42.33333 4.000000 1.555556 8.804791
##
## Clustering vector:
## [1] 3 4 2 2 5 4 3 2 1 1 2 2 3 3 4 1 1 3 1 4 5 2 1 5 1 2 1 5 2 5 4 5 4 4 5 1 4 5
## [39] 4 4 4 1 1 4 4 4 2 5 1 2
##
## Within cluster sum of squares by cluster:
## [1] 174.85164 108.49735 58.21123 316.73367 171.67372
## (between_SS / total_SS = 92.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
#Combining cluster results
#Print the cluster number assigned to every customer row
segmentasi[["cluster"]]
## [1] 3 4 2 2 5 4 3 2 1 1 2 2 3 3 4 1 1 3 1 4 5 2 1 5 1 2 1 5 2 5 4 5 4 4 5 1 4 5
## [39] 4 4 4 1 1 4 4 4 2 5 1 2
#Store the assignment as a new `cluster` column, then inspect the structure
pelanggan[["cluster"]] <- segmentasi[["cluster"]]
str(pelanggan)
## 'data.frame': 50 obs. of 11 variables:
## $ Customer_ID : chr "CUST-001" "CUST-002" "CUST-003" "CUST-004" ...
## $ Nama.Pelanggan : chr "Budi Anggara" "Shirley Ratuwati" "Agus Cahyono" "Antonius Winarta" ...
## $ Jenis.Kelamin : chr "Pria" "Wanita" "Pria" "Pria" ...
## $ Umur : int 58 14 48 53 41 24 64 52 29 33 ...
## $ Profesi : chr "Wiraswasta" "Pelajar" "Professional" "Professional" ...
## $ Tipe.Residen : chr "Sector" "Cluster" "Cluster" "Cluster" ...
## $ NilaiBelanjaSetahun: num 9.5 2.72 5.29 5.2 10.62 ...
## $ Jenis.Kelamin.1 : int 1 2 1 1 2 2 1 1 2 1 ...
## $ Profesi.1 : int 5 3 4 4 5 4 5 4 4 4 ...
## $ Tipe.Residen.1 : int 2 1 1 1 1 1 2 1 2 1 ...
## $ cluster : int 3 4 2 2 5 4 3 2 1 1 ...
#Analysis of result
#Row positions of the customers assigned to cluster 2
which(pelanggan[["cluster"]] == 2)
## [1] 3 4 8 11 12 22 26 29 47 50
#Number of customers in cluster 2
length(which(pelanggan[["cluster"]] == 2))
## [1] 10
#Part of Data Preparation
pelanggan <- read.csv("https://storage.googleapis.com/dqlab-dataset/customer_segments.txt", sep = "\t")
#Integer codes for the categorical columns, appended with a ".1" suffix
pelanggan_matrix <- data.matrix(pelanggan[c("Jenis.Kelamin", "Profesi", "Tipe.Residen")])
pelanggan <- data.frame(pelanggan, pelanggan_matrix)
#Lookup tables pairing each category label with its integer code
Profesi <- unique(pelanggan[c("Profesi", "Profesi.1")])
Jenis.Kelamin <- unique(pelanggan[c("Jenis.Kelamin", "Jenis.Kelamin.1")])
#The residence-type table is named after the column it describes
Tipe.Residen <- unique(pelanggan[c("Tipe.Residen", "Tipe.Residen.1")])
#Express the annual spending in millions
pelanggan$NilaiBelanjaSetahun <- pelanggan$NilaiBelanjaSetahun / 1000000
field_yang_digunakan <- c("Jenis.Kelamin.1", "Umur", "Profesi.1", "Tipe.Residen.1", "NilaiBelanjaSetahun")
#Part of K-Means
#Fixed seed so the cluster numbering is reproducible
set.seed(1)
segmentasi <- kmeans(x = pelanggan[field_yang_digunakan], centers = 5, nstart = 25)
pelanggan$cluster <- segmentasi$cluster
#Analysis of result
#View the customers in clusters 3 to 5, one cluster at a time (cluster 3 first)
pelanggan[which(pelanggan$cluster == 3), ]
## Customer_ID Nama.Pelanggan Jenis.Kelamin Umur Profesi Tipe.Residen
## 1 CUST-001 Budi Anggara Pria 58 Wiraswasta Sector
## 7 CUST-007 Cahyono, Agus Pria 64 Wiraswasta Sector
## 13 CUST-013 Cahaya Putri Wanita 64 Wiraswasta Cluster
## 14 CUST-014 Mario Setiawan Pria 60 Wiraswasta Cluster
## 18 CUST-018 Nelly Halim Wanita 63 Ibu Rumah Tangga Cluster
## NilaiBelanjaSetahun Jenis.Kelamin.1 Profesi.1 Tipe.Residen.1 cluster
## 1 9.497927 1 5 2 3
## 7 9.837260 1 5 2 3
## 13 9.333168 2 5 1 3
## 14 9.471615 1 5 1 3
## 18 5.340690 2 1 1 3
#All rows of `pelanggan` that were assigned to cluster 4
pelanggan[which(pelanggan[["cluster"]] == 4), ]
## Customer_ID Nama.Pelanggan Jenis.Kelamin Umur Profesi
## 2 CUST-002 Shirley Ratuwati Wanita 14 Pelajar
## 6 CUST-006 Rosalina Kurnia Wanita 24 Professional
## 15 CUST-015 Shirley Ratuwati Wanita 20 Wiraswasta
## 20 CUST-020 Irene Novianto Wanita 16 Pelajar
## 31 CUST-031 Eviana Handry Wanita 19 Mahasiswa
## 33 CUST-033 Cecilia Kusnadi Wanita 19 Mahasiswa
## 34 CUST-034 Deasy Arisandi Wanita 21 Wiraswasta
## 37 CUST-037 Felicia Tandiono Wanita 25 Professional
## 39 CUST-039 Gina Hidayat Wanita 20 Professional
## 40 CUST-040 Irene Darmawan Wanita 14 Pelajar
## 41 CUST-041 Shinta Aritonang Wanita 24 Ibu Rumah Tangga
## 44 CUST-044 Anna Wanita 18 Wiraswasta
## 45 CUST-045 Rismawati Juni Wanita 22 Professional
## 46 CUST-046 Elfira Surya Wanita 25 Wiraswasta
## Tipe.Residen NilaiBelanjaSetahun Jenis.Kelamin.1 Profesi.1 Tipe.Residen.1
## 2 Cluster 2.722700 2 3 1
## 6 Cluster 5.215541 2 4 1
## 15 Cluster 10.365668 2 5 1
## 20 Sector 2.896845 2 3 2
## 31 Cluster 3.042773 2 2 1
## 33 Cluster 3.047926 2 2 1
## 34 Sector 9.759822 2 5 2
## 37 Sector 5.972787 2 4 2
## 39 Cluster 5.257775 2 4 1
## 40 Sector 2.861855 2 3 2
## 41 Cluster 6.820976 2 1 1
## 44 Cluster 9.339737 2 5 1
## 45 Cluster 5.211041 2 4 1
## 46 Sector 10.099807 2 5 2
## cluster
## 2 4
## 6 4
## 15 4
## 20 4
## 31 4
## 33 4
## 34 4
## 37 4
## 39 4
## 40 4
## 41 4
## 44 4
## 45 4
## 46 4
#All rows of `pelanggan` that were assigned to cluster 5
pelanggan[which(pelanggan[["cluster"]] == 5), ]
## Customer_ID Nama.Pelanggan Jenis.Kelamin Umur Profesi
## 5 CUST-005 Ibu Sri Wahyuni, IR Wanita 41 Wiraswasta
## 21 CUST-021 Lestari Fabianto Wanita 38 Wiraswasta
## 24 CUST-024 Putri Ginting Wanita 39 Wiraswasta
## 28 CUST-028 Adeline Huang Wanita 40 Ibu Rumah Tangga
## 30 CUST-030 Rosita Saragih Wanita 46 Ibu Rumah Tangga
## 32 CUST-032 Chintya Winarni Wanita 47 Wiraswasta
## 35 CUST-035 Ida Ayu Wanita 39 Professional
## 38 CUST-038 Agatha Salim Wanita 46 Wiraswasta
## 48 CUST-048 Maria Hutagalung Wanita 45 Wiraswasta
## Tipe.Residen NilaiBelanjaSetahun Jenis.Kelamin.1 Profesi.1 Tipe.Residen.1
## 5 Cluster 10.615206 2 5 1
## 21 Cluster 9.222070 2 5 1
## 24 Cluster 10.259572 2 5 1
## 28 Cluster 6.631680 2 1 1
## 30 Sector 5.020976 2 1 2
## 32 Sector 10.663179 2 5 2
## 35 Sector 5.962575 2 4 2
## 38 Sector 10.477127 2 5 2
## 48 Sector 10.390732 2 5 2
## cluster
## 5 5
## 21 5
## 24 5
## 28 5
## 30 5
## 32 5
## 35 5
## 38 5
## 48 5
#Part of Data Preparation
pelanggan <- read.csv("https://storage.googleapis.com/dqlab-dataset/customer_segments.txt", sep = "\t")
#Integer codes for the categorical columns, appended with a ".1" suffix
pelanggan_matrix <- data.matrix(pelanggan[c("Jenis.Kelamin", "Profesi", "Tipe.Residen")])
pelanggan <- data.frame(pelanggan, pelanggan_matrix)
#Lookup tables pairing each category label with its integer code
Profesi <- unique(pelanggan[c("Profesi", "Profesi.1")])
Jenis.Kelamin <- unique(pelanggan[c("Jenis.Kelamin", "Jenis.Kelamin.1")])
#The residence-type table is named after the column it describes
Tipe.Residen <- unique(pelanggan[c("Tipe.Residen", "Tipe.Residen.1")])
#Express the annual spending in millions
pelanggan$NilaiBelanjaSetahun <- pelanggan$NilaiBelanjaSetahun / 1000000
field_yang_digunakan <- c("Jenis.Kelamin.1", "Umur", "Profesi.1", "Tipe.Residen.1", "NilaiBelanjaSetahun")
#Part of K-Means
#Fixed seed so the cluster numbering is reproducible
set.seed(1)
segmentasi <- kmeans(x = pelanggan[field_yang_digunakan], centers = 5, nstart = 25)
pelanggan$cluster <- segmentasi$cluster
#Analysis of the result
#View the mean of every input column for each cluster
segmentasi$centers
## Jenis.Kelamin.1 Umur Profesi.1 Tipe.Residen.1 NilaiBelanjaSetahun
## 1 1.75 31.58333 3.916667 1.250000 7.330958
## 2 1.70 52.50000 3.800000 1.300000 6.018321
## 3 1.40 61.80000 4.200000 1.400000 8.696132
## 4 2.00 20.07143 3.571429 1.357143 5.901089
## 5 2.00 42.33333 4.000000 1.555556 8.804791
#Part of Data Preparation
pelanggan <- read.csv("https://storage.googleapis.com/dqlab-dataset/customer_segments.txt", sep = "\t")
#Integer codes for the categorical columns, appended with a ".1" suffix
pelanggan_matrix <- data.matrix(pelanggan[c("Jenis.Kelamin", "Profesi", "Tipe.Residen")])
pelanggan <- data.frame(pelanggan, pelanggan_matrix)
#Lookup tables pairing each category label with its integer code
Profesi <- unique(pelanggan[c("Profesi", "Profesi.1")])
Jenis.Kelamin <- unique(pelanggan[c("Jenis.Kelamin", "Jenis.Kelamin.1")])
#The residence-type table is named after the column it describes
Tipe.Residen <- unique(pelanggan[c("Tipe.Residen", "Tipe.Residen.1")])
#Express the annual spending in millions
pelanggan$NilaiBelanjaSetahun <- pelanggan$NilaiBelanjaSetahun / 1000000
field_yang_digunakan <- c("Jenis.Kelamin.1", "Umur", "Profesi.1", "Tipe.Residen.1", "NilaiBelanjaSetahun")
#Comparing two kmeans runs, with 2 and 5 clusters respectively
#Reset the seed before each run so both start from the same random state
set.seed(1)
kmeans(x = pelanggan[field_yang_digunakan], centers = 2, nstart = 25)
## K-means clustering with 2 clusters of sizes 23, 27
##
## Cluster means:
## Jenis.Kelamin.1 Umur Profesi.1 Tipe.Residen.1 NilaiBelanjaSetahun
## 1 1.739130 51.17391 3.913043 1.434783 7.551518
## 2 1.888889 25.85185 3.777778 1.296296 6.659586
##
## Clustering vector:
## [1] 1 2 1 1 1 2 1 1 2 2 1 1 1 1 2 2 2 1 2 2 2 1 2 1 2 1 2 1 1 1 2 1 2 2 1 2 2 1
## [39] 2 2 2 2 2 2 2 2 1 1 2 1
##
## Within cluster sum of squares by cluster:
## [1] 1492.481 1524.081
## (between_SS / total_SS = 72.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
#Second run of the comparison: reset the seed, then fit 5 clusters
#with 25 random starts and print the result
set.seed(1)
kmeans(pelanggan[field_yang_digunakan], centers = 5, nstart = 25)
## K-means clustering with 5 clusters of sizes 12, 10, 5, 14, 9
##
## Cluster means:
## Jenis.Kelamin.1 Umur Profesi.1 Tipe.Residen.1 NilaiBelanjaSetahun
## 1 1.75 31.58333 3.916667 1.250000 7.330958
## 2 1.70 52.50000 3.800000 1.300000 6.018321
## 3 1.40 61.80000 4.200000 1.400000 8.696132
## 4 2.00 20.07143 3.571429 1.357143 5.901089
## 5 2.00 42.33333 4.000000 1.555556 8.804791
##
## Clustering vector:
## [1] 3 4 2 2 5 4 3 2 1 1 2 2 3 3 4 1 1 3 1 4 5 2 1 5 1 2 1 5 2 5 4 5 4 4 5 1 4 5
## [39] 4 4 4 1 1 4 4 4 2 5 1 2
##
## Within cluster sum of squares by cluster:
## [1] 174.85164 108.49735 58.21123 316.73367 171.67372
## (between_SS / total_SS = 92.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
#Part of Data Preparation
pelanggan <- read.csv("https://storage.googleapis.com/dqlab-dataset/customer_segments.txt", sep = "\t")
#Integer codes for the categorical columns, appended with a ".1" suffix
pelanggan_matrix <- data.matrix(pelanggan[c("Jenis.Kelamin", "Profesi", "Tipe.Residen")])
pelanggan <- data.frame(pelanggan, pelanggan_matrix)
#Lookup tables pairing each category label with its integer code
Profesi <- unique(pelanggan[c("Profesi", "Profesi.1")])
Jenis.Kelamin <- unique(pelanggan[c("Jenis.Kelamin", "Jenis.Kelamin.1")])
#The residence-type table is named after the column it describes
Tipe.Residen <- unique(pelanggan[c("Tipe.Residen", "Tipe.Residen.1")])
#Express the annual spending in millions
pelanggan$NilaiBelanjaSetahun <- pelanggan$NilaiBelanjaSetahun / 1000000
field_yang_digunakan <- c("Jenis.Kelamin.1", "Umur", "Profesi.1", "Tipe.Residen.1", "NilaiBelanjaSetahun")
#Part of K-Means
#Fixed seed so the cluster numbering is reproducible
set.seed(1)
segmentasi <- kmeans(x = pelanggan[field_yang_digunakan], centers = 5, nstart = 25)
#Within-cluster sum of squares, one value per cluster
segmentasi$withinss
## [1] 174.85164 108.49735 58.21123 316.73367 171.67372
#Cluster number assigned to each customer row
segmentasi[["cluster"]]
## [1] 3 4 2 2 5 4 3 2 1 1 2 2 3 3 4 1 1 3 1 4 5 2 1 5 1 2 1 5 2 5 4 5 4 4 5 1 4 5
## [39] 4 4 4 1 1 4 4 4 2 5 1 2
#Total within-cluster sum of squares over all clusters
segmentasi[["tot.withinss"]]
## [1] 829.9676
By analyzing this output, we can merge the cluster numbers back into the original data. We also learn how close the data points within each cluster are to one another, which gives us a basis for determining the optimal number of clusters.
Of the information returned by the kmeans function, the Sum of Squares (SS), often called the Sum of Squared Errors (SSE), is the most important basis for determining the optimal number of clusters.
Theoretically, here are some things we can observe about the SS:
- The fewer clusters are generated, the greater the SS value; conversely, the more clusters are produced, the smaller the SS value.
- Because the measure is quadratic, a significant difference between two cluster configurations shows up as an even larger difference in their SS values.
- As the number of clusters increases, the difference between successive SS values becomes smaller.
#Part of Data Preparation
pelanggan <- read.csv("https://storage.googleapis.com/dqlab-dataset/customer_segments.txt", sep = "\t")
#Integer codes for the categorical columns, appended with a ".1" suffix
pelanggan_matrix <- data.matrix(pelanggan[c("Jenis.Kelamin", "Profesi", "Tipe.Residen")])
pelanggan <- data.frame(pelanggan, pelanggan_matrix)
#Lookup tables pairing each category label with its integer code
Profesi <- unique(pelanggan[c("Profesi", "Profesi.1")])
Jenis.Kelamin <- unique(pelanggan[c("Jenis.Kelamin", "Jenis.Kelamin.1")])
#The residence-type table is named after the column it describes
Tipe.Residen <- unique(pelanggan[c("Tipe.Residen", "Tipe.Residen.1")])
#Express the annual spending in millions
pelanggan$NilaiBelanjaSetahun <- pelanggan$NilaiBelanjaSetahun / 1000000
field_yang_digunakan <- c("Jenis.Kelamin.1", "Umur", "Profesi.1", "Tipe.Residen.1", "NilaiBelanjaSetahun")
#Part of K-Means
#One seed for the whole sweep; the ten fits consume the random stream in order
set.seed(1)
#Total within-cluster sum of squares for k = 1..10.
#vapply() guarantees a numeric vector with exactly one value per k.
sse <- vapply(
  1:10,
  function(param_k) {
    kmeans(pelanggan[field_yang_digunakan], centers = param_k, nstart = 25)$tot.withinss
  },
  numeric(1)
)
sse
## [1] 10990.9740 3016.5612 1550.8725 1064.4187 829.9676 625.1462
## [7] 508.1568 431.6977 374.1095 325.7982
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.2
#Part of Data Preparation
pelanggan <- read.csv("https://storage.googleapis.com/dqlab-dataset/customer_segments.txt", sep = "\t")
#Integer codes for the categorical columns, appended with a ".1" suffix
pelanggan_matrix <- data.matrix(pelanggan[c("Jenis.Kelamin", "Profesi", "Tipe.Residen")])
pelanggan <- data.frame(pelanggan, pelanggan_matrix)
#Lookup tables pairing each category label with its integer code
Profesi <- unique(pelanggan[c("Profesi", "Profesi.1")])
Jenis.Kelamin <- unique(pelanggan[c("Jenis.Kelamin", "Jenis.Kelamin.1")])
#The residence-type table is named after the column it describes
Tipe.Residen <- unique(pelanggan[c("Tipe.Residen", "Tipe.Residen.1")])
#Express the annual spending in millions
pelanggan$NilaiBelanjaSetahun <- pelanggan$NilaiBelanjaSetahun / 1000000
field_yang_digunakan <- c("Jenis.Kelamin.1", "Umur", "Profesi.1", "Tipe.Residen.1", "NilaiBelanjaSetahun")
#Part of K-Means
set.seed(1)
#Largest k to try; used for the sweep, the plot data and the axis breaks
#so the three can never disagree
jumlah_cluster_max <- 10
#Total within-cluster sum of squares for k = 1..jumlah_cluster_max
sse <- vapply(
  seq_len(jumlah_cluster_max),
  function(param_k) {
    kmeans(pelanggan[field_yang_digunakan], centers = param_k, nstart = 25)$tot.withinss
  },
  numeric(1)
)
ssdata <- data.frame(cluster = seq_len(jumlah_cluster_max), sse)
#Elbow plot of SSE against the number of clusters.
#`cluster` is numeric, so a continuous x scale with one break per k is used;
#a discrete scale with numeric limits makes ggplot2 warn
#"Continuous limits supplied to discrete scale".
ggplot(ssdata, aes(x = cluster, y = sse)) +
  geom_line(color = "red") + geom_point() +
  ylab("Within Cluster Sum of Squares") + xlab("Jumlah Cluster") +
  geom_text(aes(label = format(round(sse, 2), nsmall = 2)), hjust = -0.2, vjust = -0.5) +
  scale_x_continuous(breaks = seq_len(jumlah_cluster_max))
#Segment name for each of the five cluster numbers.
#NOTE(review): names are tied to cluster numbers only; verify that each name
#still describes the centers of the kmeans run it is used with.
Segmen.Pelanggan <- data.frame(
  cluster = as.numeric(1:5),
  Nama.Segmen = c(
    "Silver Youth Gals",
    "Diamond Senior Member",
    "Gold Young Professional",
    "Diamond Professional",
    "Silver Mid Professional"
  )
)
#Read the tab-separated customer file into `pelanggan`
pelanggan <- read.csv(
  file = "https://storage.googleapis.com/dqlab-dataset/customer_segments.txt",
  sep = "\t"
)
#Append integer codes for the categorical columns (".1" suffix)
pelanggan_matrix <- data.matrix(pelanggan[, c("Jenis.Kelamin", "Profesi", "Tipe.Residen")])
pelanggan <- data.frame(pelanggan, pelanggan_matrix)
#Express the annual spending in millions
pelanggan[["NilaiBelanjaSetahun"]] <- pelanggan[["NilaiBelanjaSetahun"]] / 1000000
#Lookup tables pairing each category label with its integer code
Profesi <- unique(pelanggan[, c("Profesi", "Profesi.1")])
Jenis.Kelamin <- unique(pelanggan[, c("Jenis.Kelamin", "Jenis.Kelamin.1")])
Tipe.Residen <- unique(pelanggan[, c("Tipe.Residen", "Tipe.Residen.1")])
#Part of K-Means
field_yang_digunakan <- c(
  "Jenis.Kelamin.1", "Umur", "Profesi.1", "Tipe.Residen.1", "NilaiBelanjaSetahun"
)
#Seed is set immediately before the fit so the result is reproducible
set.seed(1)
segmentasi <- kmeans(pelanggan[field_yang_digunakan], centers = 5, nstart = 25)
#Segment name for each cluster number
Segmen.Pelanggan <- data.frame(
  cluster = c(1, 2, 3, 4, 5),
  Nama.Segmen = c(
    "Silver Youth Gals",
    "Diamond Senior Member",
    "Gold Young Professional",
    "Diamond Professional",
    "Silver Mid Professional"
  )
)
#Combining all assets into the Identitas.Cluster list
Identitas.Cluster <- list(
  Profesi = Profesi,
  Jenis.Kelamin = Jenis.Kelamin,
  Tipe.Residen = Tipe.Residen,
  Segmentasi = segmentasi,
  Segmen.Pelanggan = Segmen.Pelanggan,
  field_yang_digunakan = field_yang_digunakan
)
#Read the tab-separated customer file into `pelanggan`
pelanggan <- read.csv(
  file = "https://storage.googleapis.com/dqlab-dataset/customer_segments.txt",
  sep = "\t"
)
#Append integer codes for the categorical columns (".1" suffix)
pelanggan_matrix <- data.matrix(pelanggan[, c("Jenis.Kelamin", "Profesi", "Tipe.Residen")])
pelanggan <- data.frame(pelanggan, pelanggan_matrix)
#Express the annual spending in millions
pelanggan[["NilaiBelanjaSetahun"]] <- pelanggan[["NilaiBelanjaSetahun"]] / 1000000
#Lookup tables pairing each category label with its integer code
Profesi <- unique(pelanggan[, c("Profesi", "Profesi.1")])
Jenis.Kelamin <- unique(pelanggan[, c("Jenis.Kelamin", "Jenis.Kelamin.1")])
Tipe.Residen <- unique(pelanggan[, c("Tipe.Residen", "Tipe.Residen.1")])
#Part of K-Means
field_yang_digunakan <- c(
  "Jenis.Kelamin.1", "Umur", "Profesi.1", "Tipe.Residen.1", "NilaiBelanjaSetahun"
)
set.seed(1)
segmentasi <- kmeans(pelanggan[field_yang_digunakan], centers = 5, nstart = 25)
#Segment name for each cluster number
Segmen.Pelanggan <- data.frame(
  cluster = c(1, 2, 3, 4, 5),
  Nama.Segmen = c(
    "Silver Youth Gals",
    "Diamond Senior Member",
    "Gold Young Professional",
    "Diamond Professional",
    "Silver Mid Professional"
  )
)
#Bundle the lookup tables, the fitted model, the segment names and the
#input column names into one list
Identitas.Cluster <- list(
  Profesi = Profesi,
  Jenis.Kelamin = Jenis.Kelamin,
  Tipe.Residen = Tipe.Residen,
  Segmentasi = segmentasi,
  Segmen.Pelanggan = Segmen.Pelanggan,
  field_yang_digunakan = field_yang_digunakan
)
#Write the bundle to cluster.rds in the working directory
saveRDS(Identitas.Cluster, file = "cluster.rds")
#One new customer record, with spending already expressed in millions
databaru <- data.frame(
  Customer_ID = "CUST-100",
  Nama.Pelanggan = "Rudi Wilamar",
  Umur = 20,
  Jenis.Kelamin = "Wanita",
  Profesi = "Pelajar",
  Tipe.Residen = "Cluster",
  NilaiBelanjaSetahun = 3.5
)
databaru
## Customer_ID Nama.Pelanggan Umur Jenis.Kelamin Profesi Tipe.Residen
## 1 CUST-100 Rudi Wilamar 20 Wanita Pelajar Cluster
## NilaiBelanjaSetahun
## 1 3.5
#Reload the saved bundle from cluster.rds and print its contents
Identitas.Cluster <- readRDS("cluster.rds")
Identitas.Cluster
## $Profesi
## Profesi Profesi.1
## 1 Wiraswasta 5
## 2 Pelajar 3
## 3 Professional 4
## 17 Ibu Rumah Tangga 1
## 31 Mahasiswa 2
##
## $Jenis.Kelamin
## Jenis.Kelamin Jenis.Kelamin.1
## 1 Pria 1
## 2 Wanita 2
##
## $Tipe.Residen
## Tipe.Residen Tipe.Residen.1
## 1 Sector 2
## 2 Cluster 1
##
## $Segmentasi
## K-means clustering with 5 clusters of sizes 12, 10, 5, 14, 9
##
## Cluster means:
## Jenis.Kelamin.1 Umur Profesi.1 Tipe.Residen.1 NilaiBelanjaSetahun
## 1 1.75 31.58333 3.916667 1.250000 7.330958
## 2 1.70 52.50000 3.800000 1.300000 6.018321
## 3 1.40 61.80000 4.200000 1.400000 8.696132
## 4 2.00 20.07143 3.571429 1.357143 5.901089
## 5 2.00 42.33333 4.000000 1.555556 8.804791
##
## Clustering vector:
## [1] 3 4 2 2 5 4 3 2 1 1 2 2 3 3 4 1 1 3 1 4 5 2 1 5 1 2 1 5 2 5 4 5 4 4 5 1 4 5
## [39] 4 4 4 1 1 4 4 4 2 5 1 2
##
## Within cluster sum of squares by cluster:
## [1] 174.85164 108.49735 58.21123 316.73367 171.67372
## (between_SS / total_SS = 92.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
##
## $Segmen.Pelanggan
## cluster Nama.Segmen
## 1 1 Silver Youth Gals
## 2 2 Diamond Senior Member
## 3 3 Gold Young Professional
## 4 4 Diamond Professional
## 5 5 Silver Mid Professional
##
## $field_yang_digunakan
## [1] "Jenis.Kelamin.1" "Umur" "Profesi.1"
## [4] "Tipe.Residen.1" "NilaiBelanjaSetahun"
#New customer record for the merge demonstration.
#NOTE(review): this record has no Umur column, although Umur is one of the
#fields in Identitas.Cluster$field_yang_digunakan - confirm this is intended.
databaru <- data.frame(
  Customer_ID = "CUST-100",
  Nama.Pelanggan = "Rudi Wilamar",
  Jenis.Kelamin = "Wanita",
  Profesi = "Pelajar",
  Tipe.Residen = "Cluster",
  NilaiBelanjaSetahun = 3.5
)
Identitas.Cluster <- readRDS("cluster.rds")
#Enter the command for data merge: each merge joins on the shared label
#column and brings in the matching integer code column
databaru <- merge(x = databaru, y = Identitas.Cluster[["Profesi"]])
databaru <- merge(x = databaru, y = Identitas.Cluster[["Jenis.Kelamin"]])
databaru <- merge(x = databaru, y = Identitas.Cluster[["Tipe.Residen"]])
databaru
## Tipe.Residen Jenis.Kelamin Profesi Customer_ID Nama.Pelanggan
## 1 Cluster Wanita Pelajar CUST-100 Rudi Wilamar
## NilaiBelanjaSetahun Profesi.1 Jenis.Kelamin.1 Tipe.Residen.1
## 1 3.5 3 2 1
#Make new data
databaru <- data.frame(Customer_ID = "CUST-100", Nama.Pelanggan = "Rudi Wilamar", Umur = 32, Jenis.Kelamin = "Wanita", Profesi = "Pelajar", Tipe.Residen = "Cluster", NilaiBelanjaSetahun = 3.5)
Identitas.Cluster <- readRDS(file = "cluster.rds")
#Attach the integer code of each categorical value from the saved lookup tables
databaru <- merge(databaru, Identitas.Cluster$Profesi)
databaru <- merge(databaru, Identitas.Cluster$Jenis.Kelamin)
databaru <- merge(databaru, Identitas.Cluster$Tipe.Residen)
#merge() drops rows without a match, so an unknown label leaves zero rows.
#Every distance would then sum to 0 and cluster 1 would be reported silently,
#so stop instead.
if (nrow(databaru) != 1) {
  stop("databaru must match exactly one row of every reference table", call. = FALSE)
}
#Specify new data in which cluster
#Squared Euclidean distance from the new record to every cluster center;
#the number of centers is taken from the fitted model
pusat_cluster <- Identitas.Cluster$Segmentasi$centers
nilai_baru <- unlist(databaru[1, Identitas.Cluster$field_yang_digunakan])
jarak_kuadrat <- vapply(
  seq_len(nrow(pusat_cluster)),
  function(x) sum((nilai_baru - pusat_cluster[x, ])^2),
  numeric(1)
)
#Look the nearest cluster up by its number, not by row position
cluster_terdekat <- which.min(jarak_kuadrat)
Identitas.Cluster$Segmen.Pelanggan[Identitas.Cluster$Segmen.Pelanggan$cluster == cluster_terdekat, ]
## cluster Nama.Segmen
## 1 1 Silver Youth Gals