Importing the Data
cities <- read.csv("cities.csv")
row.names(cities) <- cities$Kota
cities <- cities[,-1]
library(dplyr)
glimpse(cities)
## Rows: 12
## Columns: 5
## $ Pendapatan <int> 55, 61, 58, 67, 71, 76, 81, 56, 84, 88, 84, 90
## $ Pinjaman <dbl> 5.6, 8.0, 3.9, 5.5, 5.7, 7.6, 8.7, 7.1, 7.6, 6.5, 6.8, 8.0
## $ Dana.Hibah <int> 9, 7, 7, 7, 6, 8, 9, 6, 7, 8, 9, 9
## $ Konsumsi <int> 50, 62, 60, 64, 70, 80, 80, 86, 82, 86, 88, 90
## $ Produk <int> 25, 41, 32, 51, 42, 29, 57, 29, 46, 52, 61, 66
corrplot::corrplot(cor(cities), method = "color",
                   type = "upper", order = "original",
                   addCoef.col = "black", # add correlation coefficients
                   diag = FALSE)
Note that some variables are quite highly correlated, e.g. Konsumsi and Produk with Pendapatan. Hence, if we use Euclidean distance, we have to deal with this problem in advance. For instance, we could keep only one of the correlated variables in the clustering analysis.
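As an optional aside (a sketch only; the caret package and the 0.8 cutoff are assumptions, not part of the original analysis), highly correlated columns can also be flagged automatically:

library(caret)
# suggest columns to drop when pairwise correlation exceeds 0.8
findCorrelation(cor(cities), cutoff = 0.8, names = TRUE)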
names(cities)
## [1] "Pendapatan" "Pinjaman" "Dana.Hibah" "Konsumsi" "Produk"
# drop `Pendapatan`
cities2 <- cities[,-1]
corrplot::corrplot(cor(cities2), method = "color",
                   type = "upper", order = "original",
                   addCoef.col = "black", # add correlation coefficients
                   diag = FALSE)
K-Means Clustering
Besides multicollinearity, we should also be concerned about the different measurement scales of our variables. Therefore, we will standardize the variables before computing Euclidean distances.
cities2.norm <- scale(cities2)
summary(cities2.norm)
## Pinjaman Dana.Hibah Konsumsi Produk
## Min. :-2.0791 Min. :-1.4434 Min. :-1.8897 Min. :-1.41834
## 1st Qu.:-0.7842 1st Qu.:-0.5774 1st Qu.:-0.8624 1st Qu.:-0.95784
## Median : 0.1459 Median :-0.1443 Median : 0.3932 Median :-0.01842
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.6930 3rd Qu.: 1.1547 3rd Qu.: 0.8497 3rd Qu.: 0.66312
## Max. : 1.4225 Max. : 1.1547 Max. : 1.1541 Max. : 1.60254
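For clarity, scale() with its default arguments centers each column at its mean and divides by its standard deviation. A minimal manual equivalent (a sketch, not part of the original code) is:

# z-score standardization by hand; the values should match scale(cities2)
cities2.manual <- apply(cities2, 2, function(x) (x - mean(x)) / sd(x))
all.equal(as.numeric(cities2.manual), as.numeric(scale(cities2)))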
Determining the optimal number of clusters
library(factoextra)
fviz_nbclust(cities2.norm, kmeans, method = "wss") +
geom_vline(xintercept = 4, linetype = 2)
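The "wss" method plots the total within-cluster sum of squares against k and looks for an elbow. A hedged manual sketch of the same quantity (nstart = 25 and the range of k are assumptions used to stabilize the random starts) is:

# total within-cluster sum of squares for k = 1..8
set.seed(123)
wss <- sapply(1:8, function(k) kmeans(cities2.norm, centers = k, nstart = 25)$tot.withinss)
plot(1:8, wss, type = "b", xlab = "Number of clusters k", ylab = "Total WSS")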
We can use several functions to perform k-means clustering in R, among them the kmeans() function from the stats package and the eclust() function from the factoextra package. In this case, both yield exactly the same results.
set.seed(123)
km.res <- kmeans(cities2.norm, 4)
print(km.res)
## K-means clustering with 4 clusters of sizes 1, 4, 4, 3
##
## Cluster means:
## Pinjaman Dana.Hibah Konsumsi Produk
## 1 -0.8389266 1.1547005 -1.8896991 -1.4183407
## 2 0.5471260 0.9381942 0.8497305 1.0867805
## 3 0.6018386 -0.5773503 0.2029207 -0.5894403
## 4 -1.2523107 -0.8660254 -0.7736352 -0.1903401
##
## Clustering vector:
## Inazuma Sumeru Fontaine Natlan Snezhnaya Teyvat Guyot Sakala
## 1 3 4 4 4 3 2 3
## Gaia Mondstat Kagmaw Liyue
## 3 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 0.000000 3.154522 4.889090 2.810140
## (between_SS / total_SS = 75.3 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
<-eclust(cities2.norm, "kmeans", hc_metric="euclidean",k=4) km
#aggregate(cities2, by=list(cluster=km.res$cluster), mean)
aggregate(cities2, by=list(cluster=km$cluster), mean)
## cluster Pinjaman Dana.Hibah Konsumsi Produk
## 1 1 5.600000 9.000000 50.00000 25.00000
## 2 2 7.500000 8.750000 86.00000 59.00000
## 3 3 7.575000 7.000000 77.50000 36.25000
## 4 4 5.033333 6.666667 64.66667 41.66667
#km.res$cluster
km$cluster
## Inazuma Sumeru Fontaine Natlan Snezhnaya Teyvat Guyot Sakala
## 1 3 4 4 4 3 2 3
## Gaia Mondstat Kagmaw Liyue
## 3 2 2 2
#km.res$size
km$size
## [1] 1 4 4 3
#km.res$centers
km$centers
## Pinjaman Dana.Hibah Konsumsi Produk
## 1 -0.8389266 1.1547005 -1.8896991 -1.4183407
## 2 0.5471260 0.9381942 0.8497305 1.0867805
## 3 0.6018386 -0.5773503 0.2029207 -0.5894403
## 4 -1.2523107 -0.8660254 -0.7736352 -0.1903401
#fviz_cluster(km.res, data = cities2, repel = TRUE, ggtheme = theme_minimal())
fviz_cluster(km, data = cities2, repel = TRUE, ggtheme = theme_minimal())
K-Medoids Clustering
library(cluster)
fviz_nbclust(cities2.norm, FUNcluster=cluster::pam, method = "wss")+
theme_classic()
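As an optional cross-check not in the original analysis, the average silhouette width can also guide the choice of k for PAM:

# higher average silhouette width suggests a better k
fviz_nbclust(cities2.norm, FUNcluster = cluster::pam, method = "silhouette") +
  theme_classic()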
pam.res <- pam(cities2.norm, 4)
print(pam.res)
## Medoids:
## ID Pinjaman Dana.Hibah Konsumsi Produk
## Inazuma 1 -0.8389266 1.1547005 -1.8896991 -1.4183407
## Gaia 9 0.6200762 -0.5773503 0.5453494 0.1289401
## Natlan 4 -0.9118767 -0.5773503 -0.8243654 0.4973402
## Liyue 12 0.9118767 1.1547005 1.1541115 1.6025408
## Clustering vector:
## Inazuma Sumeru Fontaine Natlan Snezhnaya Teyvat Guyot Sakala
## 1 2 3 3 3 2 4 2
## Gaia Mondstat Kagmaw Liyue
## 2 2 4 4
## Objective function:
## build swap
## 0.9289732 0.9289732
##
## Available components:
## [1] "medoids" "id.med" "clustering" "objective" "isolation"
## [6] "clusinfo" "silinfo" "diss" "call" "data"
<-eclust(cities2.norm, "pam", hc_metric="euclidean",k=4) pam
#pam.res$medoids
pam$medoids
## Pinjaman Dana.Hibah Konsumsi Produk
## Inazuma -0.8389266 1.1547005 -1.8896991 -1.4183407
## Gaia 0.6200762 -0.5773503 0.5453494 0.1289401
## Natlan -0.9118767 -0.5773503 -0.8243654 0.4973402
## Liyue 0.9118767 1.1547005 1.1541115 1.6025408
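The medoids above are on the standardized scale; as a small sketch, the same cities can be looked up in the unstandardized data through the medoid row names:

# medoid cities in the original units
cities2[rownames(pam$medoids), ]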
#pam.res$clustering
pam$clustering
## Inazuma Sumeru Fontaine Natlan Snezhnaya Teyvat Guyot Sakala
## 1 2 3 3 3 2 4 2
## Gaia Mondstat Kagmaw Liyue
## 2 2 4 4
fviz_cluster(pam,
repel = TRUE, # Avoid label overplotting (slow)
ggtheme = theme_classic())
Clustering Evaluation
CH Index
library(clusterSim)
(ch.km <- index.G1(cities2, cl = km$cluster))
## [1] 8.290769
(ch.pam <- index.G1(cities2, cl = pam$cluster))
## [1] 6.482316
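index.G1() implements the Calinski-Harabasz index, (BSS/(k-1)) / (WSS/(n-k)). The values above use the unstandardized cities2; as a hedged sanity check, the same formula applied to the kmeans() decomposition should reproduce index.G1() on the standardized data:

# CH index from the kmeans sums of squares (standardized data)
n <- nrow(cities2.norm); k <- 4
(km.res$betweenss / (k - 1)) / (km.res$tot.withinss / (n - k))
index.G1(cities2.norm, cl = km.res$cluster)  # should agree with the line above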
DB Index
(db.km <- index.DB(cities2, cl = km$cluster)$DB)
## [1] 1.018532
(db.pam <- index.DB(cities2, cl = pam$cluster)$DB)
## [1] 1.026754
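index.DB() returns a list rather than just the scalar extracted above; inspecting the full object (a quick sketch) shows the pieces that enter the index, where lower DB values indicate better-separated clusters:

db.full <- index.DB(cities2, cl = km$cluster)
str(db.full)  # $DB is the overall index used above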
ASW
library(fpc)
(sil.km <- fviz_silhouette(km))
## cluster size ave.sil.width
## 1 1 1 0.00
## 2 2 4 0.45
## 3 3 4 0.24
## 4 4 3 0.27
(sil.pam <- fviz_silhouette(pam))
## cluster size ave.sil.width
## 1 1 1 0.00
## 2 2 5 0.15
## 3 3 3 0.27
## 4 4 3 0.53
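The tables above come from fviz_silhouette(); the overall average silhouette width can be cross-checked directly with cluster::silhouette() (a sketch, assuming Euclidean distances on the standardized data, as used by eclust()):

# overall average silhouette width for the k-means partition
sil <- cluster::silhouette(km$cluster, dist(cities2.norm))
mean(sil[, "sil_width"])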
clust.eval <- data.frame(
  Method = c("K-Means", "K-Medoids"),
  CH = c(ch.km, ch.pam),
  DB = c(db.km, db.pam),
  ASW = c(colMeans(sil.km$data[3]), colMeans(sil.pam$data[3]))
)
clust.eval
## Method CH DB ASW
## 1 K-Means 8.290769 1.018532 0.2963321
## 2 K-Medoids 6.482316 1.026754 0.2648073
Based on the three cluster validation indices (higher CH, lower DB, and higher ASW are better), we could consider the k-means result to be the better clustering. The following are the variable means of each cluster.
cities2$cluster <- km$cluster
cities2 %>%
  group_by(cluster) %>%
  summarise_all(list(mean))
## # A tibble: 4 × 5
## cluster Pinjaman Dana.Hibah Konsumsi Produk
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 5.6 9 50 25
## 2 2 7.5 8.75 86 59
## 3 3 7.57 7 77.5 36.2
## 4 4 5.03 6.67 64.7 41.7
Exercise
Customer Marketing Data Set
<-read.csv("https://raw.githubusercontent.com/raoy/data/master/Customer%20Data%20v.2.csv")
cust#cust<-read.csv("Customer Data v.2.csv")
glimpse(cust)
## Rows: 8,636
## Columns: 18
## $ CUST_ID <chr> "C10001", "C10002", "C10003", "C10005…
## $ BALANCE <dbl> 40.90075, 3202.46742, 2495.14886, 817…
## $ BALANCE_FREQUENCY <dbl> 0.818182, 0.909091, 1.000000, 1.00000…
## $ PURCHASES <dbl> 95.40, 0.00, 773.17, 16.00, 1333.28, …
## $ ONEOFF_PURCHASES <dbl> 0.00, 0.00, 773.17, 16.00, 0.00, 6402…
## $ INSTALLMENTS_PURCHASES <dbl> 95.40, 0.00, 0.00, 0.00, 1333.28, 688…
## $ CASH_ADVANCE <dbl> 0.0000, 6442.9455, 0.0000, 0.0000, 0.…
## $ PURCHASES_FREQUENCY <dbl> 0.166667, 0.000000, 1.000000, 0.08333…
## $ ONEOFF_PURCHASES_FREQUENCY <dbl> 0.000000, 0.000000, 1.000000, 0.08333…
## $ PURCHASES_INSTALLMENTS_FREQUENCY <dbl> 0.083333, 0.000000, 0.000000, 0.00000…
## $ CASH_ADVANCE_FREQUENCY <dbl> 0.000000, 0.250000, 0.000000, 0.00000…
## $ CASH_ADVANCE_TRX <int> 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ PURCHASES_TRX <int> 2, 0, 12, 1, 8, 64, 12, 5, 3, 12, 6, …
## $ CREDIT_LIMIT <dbl> 1000, 7000, 7500, 1200, 1800, 13500, …
## $ PAYMENTS <dbl> 201.8021, 4103.0326, 622.0667, 678.33…
## $ MINIMUM_PAYMENTS <dbl> 139.50979, 1072.34022, 627.28479, 244…
## $ PRC_FULL_PAYMENT <dbl> 0.000000, 0.222222, 0.000000, 0.00000…
## $ TENURE <int> 12, 12, 12, 12, 12, 12, 12, 12, 12, 1…
cust2 <- cust[,-1]
names(cust2) <- paste("X", 1:17)
corrplot::corrplot(cor(cust2), addCoef.col = 1, number.cex = 0.5,
                   diag = FALSE, type = "lower")
boxplot(scale(cust2), col="lightblue", las=2, pch=16, cex=0.5)
TASKS
Based on your exploratory data analysis, conduct an appropriate clustering procedure on the customer data set and report the results in front of your class.
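A minimal starting point (a sketch only; the choice of k, the handling of outliers and any missing values, and whether k-means is even appropriate are all left to your analysis) could look like:

# standardize, inspect candidate k, then fit one candidate clustering
cust2.norm <- scale(cust2)
fviz_nbclust(cust2.norm, kmeans, method = "wss")
set.seed(123)
cust.km <- kmeans(cust2.norm, centers = 4, nstart = 25)  # k = 4 is only a placeholder
fviz_cluster(cust.km, data = cust2.norm, geom = "point")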