In this demo we are going to perform customer segmentation via clustering. A bank wants to launch three types of personal loans and would like to segment its existing credit card customers into three groups to which it can offer these products.
# Fix the random seed so that the randomly chosen starting centres used by
# kmeans() further down are the same on every run.
set.seed(seed = 111)

# Load the bank customer data set.
# NOTE(review): the path is absolute and machine-specific.
customer_data <- read.csv(
  file = "/home/user/Desktop/Analytcis with R/DataSets/BankCustomerData.csv"
)

# Preview the first six rows.
head(customer_data, n = 6L)
## age job marital education default balance housing loan contact day
## 1 58 management married tertiary no 2143 yes no unknown 5
## 2 44 technician single secondary no 29 yes no unknown 5
## 3 33 entrepreneur married secondary no 2 yes yes unknown 5
## 4 47 blue-collar married unknown no 1506 yes no unknown 5
## 5 33 unknown single unknown no 1 no no unknown 5
## 6 35 management married tertiary no 231 yes no unknown 5
## month duration campaign pdays previous poutcome term_deposit
## 1 may 261 1 -1 0 unknown no
## 2 may 151 1 -1 0 unknown no
## 3 may 76 1 -1 0 unknown no
## 4 may 92 1 -1 0 unknown no
## 5 may 198 1 -1 0 unknown no
## 6 may 139 1 -1 0 unknown no

# Inspect the column types: a mix of integer and character columns.
str(customer_data)
## 'data.frame': 42639 obs. of 17 variables:
## $ age : int 58 44 33 47 33 35 28 42 58 43 ...
## $ job : chr "management" "technician" "entrepreneur" "blue-collar" ...
## $ marital : chr "married" "single" "married" "married" ...
## $ education : chr "tertiary" "secondary" "secondary" "unknown" ...
## $ default : chr "no" "no" "no" "no" ...
## $ balance : int 2143 29 2 1506 1 231 447 2 121 593 ...
## $ housing : chr "yes" "yes" "yes" "yes" ...
## $ loan : chr "no" "no" "yes" "no" ...
## $ contact : chr "unknown" "unknown" "unknown" "unknown" ...
## $ day : int 5 5 5 5 5 5 5 5 5 5 ...
## $ month : chr "may" "may" "may" "may" ...
## $ duration : int 261 151 76 92 198 139 217 380 50 55 ...
## $ campaign : int 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : int -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : chr "unknown" "unknown" "unknown" "unknown" ...
## $ term_deposit: chr "no" "no" "no" "no" ...
Now let's run k-means with 3 clusters and no more than 10 iterations.
# First attempt: cluster the raw data frame as-is. This call is expected to
# fail (see the warning and error echoed below): the character columns become
# NA when the input is coerced to double.
cluster_up <- kmeans(x = customer_data, centers = 3, iter.max = 10)
## Warning in storage.mode(x) <- "double": NAs introduced by coercion
## Error in do_one(nmeth): NA/NaN/Inf in foreign function call (arg 1)
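We have hit an issue, so let's talk about this error. Why does it happen? kmeans() needs purely numeric input, but the data frame still contains character columns, which turn into NA when they are coerced to numbers. We need to do some data cleaning first…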
# Keep only the numeric columns, since kmeans() cannot use character data.
#  * is.numeric() accepts integer and double columns alike, whereas
#    is.integer() would silently discard any double-typed column.
#  * vapply(..., logical(1)) always returns a logical mask; sapply() returns
#    an empty list for a zero-column input, which cannot be used as an index.
#  * drop = FALSE keeps a data frame even when only one column qualifies.
customer_data_num <- customer_data[
  ,
  vapply(customer_data, is.numeric, logical(1)),
  drop = FALSE
]
# Presenter note: demonstrate another cleaning tactic by means of subset().
str(customer_data_num)
## 'data.frame': 42639 obs. of 7 variables:
## $ age : int 58 44 33 47 33 35 28 42 58 43 ...
## $ balance : int 2143 29 2 1506 1 231 447 2 121 593 ...
## $ day : int 5 5 5 5 5 5 5 5 5 5 ...
## $ duration: int 261 151 76 92 198 139 217 380 50 55 ...
## $ campaign: int 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : int -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ previous: int 0 0 0 0 0 0 0 0 0 0 ...

# Remove every row that contains a missing value.
customer_data_num <- na.omit(customer_data_num)

# Standardise the columns with scale() (see ?scale for the details) so that
# variables measured on large scales, such as balance, do not dominate the
# distance computation. The result is a numeric matrix, not a data frame.
customer_data_num <- scale(customer_data_num)
head(customer_data_num)
## age balance day duration campaign pdays previous
## 1 1.6873326 0.26934282 -1.308767 0.01951722 -0.5741665 -0.3821 -0.2148158
## 2 0.3148154 -0.43262416 -1.308767 -0.40624303 -0.5741665 -0.3821 -0.2148158
## 3 -0.7635910 -0.44158968 -1.308767 -0.69653410 -0.5741665 -0.3821 -0.2148158
## 4 0.6089262 0.05782297 -1.308767 -0.63460534 -0.5741665 -0.3821 -0.2148158
## 5 -0.7635910 -0.44192173 -1.308767 -0.22432728 -0.5741665 -0.3821 -0.2148158
## 6 -0.5675171 -0.36554879 -1.308767 -0.45268960 -0.5741665 -0.3821 -0.2148158
Now let’s run k-means clustering on the cleaned dataset.
# Second attempt: fit k-means with three centres and at most 10 iterations,
# this time on the cleaned, scaled numeric matrix.
cluster_up <- kmeans(x = customer_data_num, centers = 3, iter.max = 10)

# Inspect the fitted object: assignments, centres, sums of squares and sizes.
str(cluster_up)
## List of 9
## $ cluster : Named int [1:42639] 3 3 3 3 3 3 3 3 3 3 ...
## ..- attr(*, "names")= chr [1:42639] "1" "2" "3" "4" ...
## $ centers : num [1:3, 1:7] -0.04343 -0.10326 0.06987 0.03167 0.00751 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:3] "1" "2" "3"
## .. ..$ : chr [1:7] "age" "balance" "day" "duration" ...
## $ totss : num 298466
## $ withinss : num [1:3] 88982 54661 77716
## $ tot.withinss: num 221359
## $ betweenss : num 77107
## $ size : int [1:3] 18095 5366 19178
## $ iter : int 4
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"

# Append each row's cluster assignment as an extra column named ClusterNo.
customer_data_num <- cbind(
  customer_data_num,
  ClusterNo = cluster_up[["cluster"]]
)
head(customer_data_num, n = 6L)
## age balance day duration campaign pdays previous
## 1 1.6873326 0.26934282 -1.308767 0.01951722 -0.5741665 -0.3821 -0.2148158
## 2 0.3148154 -0.43262416 -1.308767 -0.40624303 -0.5741665 -0.3821 -0.2148158
## 3 -0.7635910 -0.44158968 -1.308767 -0.69653410 -0.5741665 -0.3821 -0.2148158
## 4 0.6089262 0.05782297 -1.308767 -0.63460534 -0.5741665 -0.3821 -0.2148158
## 5 -0.7635910 -0.44192173 -1.308767 -0.22432728 -0.5741665 -0.3821 -0.2148158
## 6 -0.5675171 -0.36554879 -1.308767 -0.45268960 -0.5741665 -0.3821 -0.2148158
## ClusterNo
## 1 3
## 2 3
## 3 3
## 4 3
## 5 3
## 6 3

# Disabled: alternative clustering with the mclust package.
# NOTE(review): customer_data_num now also holds the ClusterNo column, so it
# would be fed to Mclust() as a feature if these lines are re-enabled as-is.
#library(mclust)
#fit<-Mclust(customer_data_num)
#plot(fit)