Importing the Data

cities<-read.csv("cities.csv")
row.names(cities)<-cities$Kota
cities<-cities[,-1]
library(dplyr)
glimpse(cities)
## Rows: 12
## Columns: 5
## $ Pendapatan <int> 55, 61, 58, 67, 71, 76, 81, 56, 84, 88, 84, 90
## $ Pinjaman   <dbl> 5.6, 8.0, 3.9, 5.5, 5.7, 7.6, 8.7, 7.1, 7.6, 6.5, 6.8, 8.0
## $ Dana.Hibah <int> 9, 7, 7, 7, 6, 8, 9, 6, 7, 8, 9, 9
## $ Konsumsi   <int> 50, 62, 60, 64, 70, 80, 80, 86, 82, 86, 88, 90
## $ Produk     <int> 25, 41, 32, 51, 42, 29, 57, 29, 46, 52, 61, 66
corrplot::corrplot(cor(cities), method = "color",  
         type = "upper", order = "original", 
         addCoef.col = "black", # Add coefficient of correlation
         diag = FALSE, 
         )

Noted that some variables are quite highly correlated, i.e Konsumsi and Produk towards Pendapatan. Hence, if we are using Euclidean distance, we will have to take care of this problem in advance. For instance, we could select one the correlated variables to be included in the clustering analysis.

names(cities)
## [1] "Pendapatan" "Pinjaman"   "Dana.Hibah" "Konsumsi"   "Produk"
# drop `Pendapatan`
cities2<-cities[,-1]
corrplot::corrplot(cor(cities2), method = "color",  
         type = "upper", order = "original", 
         addCoef.col = "black", # Add coefficient of correlation
         diag = FALSE, 
         )

K-Means Clustering

Other than multicollinearities, we should also concern about the different measurement scales of our variables. Therefore, we will normalize the variables before using Euclidean distance.

cities2.norm<-scale(cities2)
summary(cities2.norm)
##     Pinjaman         Dana.Hibah         Konsumsi           Produk        
##  Min.   :-2.0791   Min.   :-1.4434   Min.   :-1.8897   Min.   :-1.41834  
##  1st Qu.:-0.7842   1st Qu.:-0.5774   1st Qu.:-0.8624   1st Qu.:-0.95784  
##  Median : 0.1459   Median :-0.1443   Median : 0.3932   Median :-0.01842  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 0.6930   3rd Qu.: 1.1547   3rd Qu.: 0.8497   3rd Qu.: 0.66312  
##  Max.   : 1.4225   Max.   : 1.1547   Max.   : 1.1541   Max.   : 1.60254

Determining the optimal number of clusters

library(factoextra)
fviz_nbclust(cities2.norm, kmeans, method = "wss") +
geom_vline(xintercept = 4, linetype = 2)

We could use several function to proceed the k-means clustering in R, some of them are kmeans() function from stats package, and eclust() function from factoextra package. In this case, both clusters exactly yield the same results.

set.seed(123)
km.res <- kmeans(cities2.norm, 4)
print(km.res)
## K-means clustering with 4 clusters of sizes 1, 4, 4, 3
## 
## Cluster means:
##     Pinjaman Dana.Hibah   Konsumsi     Produk
## 1 -0.8389266  1.1547005 -1.8896991 -1.4183407
## 2  0.5471260  0.9381942  0.8497305  1.0867805
## 3  0.6018386 -0.5773503  0.2029207 -0.5894403
## 4 -1.2523107 -0.8660254 -0.7736352 -0.1903401
## 
## Clustering vector:
##   Inazuma    Sumeru  Fontaine    Natlan Snezhnaya    Teyvat     Guyot    Sakala 
##         1         3         4         4         4         3         2         3 
##      Gaia  Mondstat    Kagmaw     Liyue 
##         3         2         2         2 
## 
## Within cluster sum of squares by cluster:
## [1] 0.000000 3.154522 4.889090 2.810140
##  (between_SS / total_SS =  75.3 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
km<-eclust(cities2.norm, "kmeans", hc_metric="euclidean",k=4)

#aggregate(cities2, by=list(cluster=km.res$cluster), mean)
aggregate(cities2, by=list(cluster=km$cluster), mean)
##   cluster Pinjaman Dana.Hibah Konsumsi   Produk
## 1       1 5.600000   9.000000 50.00000 25.00000
## 2       2 7.500000   8.750000 86.00000 59.00000
## 3       3 7.575000   7.000000 77.50000 36.25000
## 4       4 5.033333   6.666667 64.66667 41.66667
#km.res$cluster
km$cluster
##   Inazuma    Sumeru  Fontaine    Natlan Snezhnaya    Teyvat     Guyot    Sakala 
##         1         3         4         4         4         3         2         3 
##      Gaia  Mondstat    Kagmaw     Liyue 
##         3         2         2         2
#km.res$size
km$size
## [1] 1 4 4 3
#km.res$centers
km$centers
##     Pinjaman Dana.Hibah   Konsumsi     Produk
## 1 -0.8389266  1.1547005 -1.8896991 -1.4183407
## 2  0.5471260  0.9381942  0.8497305  1.0867805
## 3  0.6018386 -0.5773503  0.2029207 -0.5894403
## 4 -1.2523107 -0.8660254 -0.7736352 -0.1903401
#fviz_cluster(km.res, data = cities2, repel = TRUE, ggtheme = theme_minimal())
fviz_cluster(km, data = cities2, repel = TRUE, ggtheme = theme_minimal())

K-Medoids Clustering

library(cluster)
fviz_nbclust(cities2.norm, FUNcluster=cluster::pam, method = "wss")+
theme_classic()

pam.res <- pam(cities2.norm, 4)
print(pam.res)
## Medoids:
##         ID   Pinjaman Dana.Hibah   Konsumsi     Produk
## Inazuma  1 -0.8389266  1.1547005 -1.8896991 -1.4183407
## Gaia     9  0.6200762 -0.5773503  0.5453494  0.1289401
## Natlan   4 -0.9118767 -0.5773503 -0.8243654  0.4973402
## Liyue   12  0.9118767  1.1547005  1.1541115  1.6025408
## Clustering vector:
##   Inazuma    Sumeru  Fontaine    Natlan Snezhnaya    Teyvat     Guyot    Sakala 
##         1         2         3         3         3         2         4         2 
##      Gaia  Mondstat    Kagmaw     Liyue 
##         2         2         4         4 
## Objective function:
##     build      swap 
## 0.9289732 0.9289732 
## 
## Available components:
##  [1] "medoids"    "id.med"     "clustering" "objective"  "isolation" 
##  [6] "clusinfo"   "silinfo"    "diss"       "call"       "data"
pam<-eclust(cities2.norm, "pam", hc_metric="euclidean",k=4)

#pam.res$medoids
pam$medoids
##           Pinjaman Dana.Hibah   Konsumsi     Produk
## Inazuma -0.8389266  1.1547005 -1.8896991 -1.4183407
## Gaia     0.6200762 -0.5773503  0.5453494  0.1289401
## Natlan  -0.9118767 -0.5773503 -0.8243654  0.4973402
## Liyue    0.9118767  1.1547005  1.1541115  1.6025408
#pam.res$clustering
pam$clustering
##   Inazuma    Sumeru  Fontaine    Natlan Snezhnaya    Teyvat     Guyot    Sakala 
##         1         2         3         3         3         2         4         2 
##      Gaia  Mondstat    Kagmaw     Liyue 
##         2         2         4         4
fviz_cluster(pam, 
             repel = TRUE, # Avoid label overplotting (slow)
             ggtheme = theme_classic())

Clustering Evaluation

CH Index

library(clusterSim)
(ch.km<-index.G1(cities2, cl=km$cluster))
## [1] 8.290769
(ch.pam<-index.G1(cities2, cl=pam$cluster))
## [1] 6.482316

DB Index

(db.km<-index.DB(cities2, cl=km$cluster)$DB)
## [1] 1.018532
(db.pam<-index.DB(cities2, cl=pam$cluster)$DB)
## [1] 1.026754

ASW

library(fpc)
(sil.km<-fviz_silhouette(km))
##   cluster size ave.sil.width
## 1       1    1          0.00
## 2       2    4          0.45
## 3       3    4          0.24
## 4       4    3          0.27

(sil.pam<-fviz_silhouette(pam))
##   cluster size ave.sil.width
## 1       1    1          0.00
## 2       2    5          0.15
## 3       3    3          0.27
## 4       4    3          0.53

clust.eval<-data.frame(
  Method=c("K-Means", "K-Medoids"),
  CH=c(ch.km, ch.pam),
  DB=c(db.km, db.pam),
  ASW=c(colMeans(sil.km$data[3]), colMeans(sil.pam$data[3]))
)
clust.eval
##      Method       CH       DB       ASW
## 1   K-Means 8.290769 1.018532 0.2963321
## 2 K-Medoids 6.482316 1.026754 0.2648073

Based on the three cluster validation indexes, we could consider to use the results from k-means clustering as the best clustering. The following is variable means of each clusters.

cities2$cluster<-km$cluster

cities2 %>% 
  group_by(cluster) %>% 
  summarise_all(list(mean))
## # A tibble: 4 × 5
##   cluster Pinjaman Dana.Hibah Konsumsi Produk
##     <int>    <dbl>      <dbl>    <dbl>  <dbl>
## 1       1     5.6        9        50     25  
## 2       2     7.5        8.75     86     59  
## 3       3     7.57       7        77.5   36.2
## 4       4     5.03       6.67     64.7   41.7

Exercise

Customer Marketing Data Set

cust<-read.csv("https://raw.githubusercontent.com/raoy/data/master/Customer%20Data%20v.2.csv")
#cust<-read.csv("Customer Data v.2.csv")
glimpse(cust)
## Rows: 8,636
## Columns: 18
## $ CUST_ID                          <chr> "C10001", "C10002", "C10003", "C10005…
## $ BALANCE                          <dbl> 40.90075, 3202.46742, 2495.14886, 817…
## $ BALANCE_FREQUENCY                <dbl> 0.818182, 0.909091, 1.000000, 1.00000…
## $ PURCHASES                        <dbl> 95.40, 0.00, 773.17, 16.00, 1333.28, …
## $ ONEOFF_PURCHASES                 <dbl> 0.00, 0.00, 773.17, 16.00, 0.00, 6402…
## $ INSTALLMENTS_PURCHASES           <dbl> 95.40, 0.00, 0.00, 0.00, 1333.28, 688…
## $ CASH_ADVANCE                     <dbl> 0.0000, 6442.9455, 0.0000, 0.0000, 0.…
## $ PURCHASES_FREQUENCY              <dbl> 0.166667, 0.000000, 1.000000, 0.08333…
## $ ONEOFF_PURCHASES_FREQUENCY       <dbl> 0.000000, 0.000000, 1.000000, 0.08333…
## $ PURCHASES_INSTALLMENTS_FREQUENCY <dbl> 0.083333, 0.000000, 0.000000, 0.00000…
## $ CASH_ADVANCE_FREQUENCY           <dbl> 0.000000, 0.250000, 0.000000, 0.00000…
## $ CASH_ADVANCE_TRX                 <int> 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ PURCHASES_TRX                    <int> 2, 0, 12, 1, 8, 64, 12, 5, 3, 12, 6, …
## $ CREDIT_LIMIT                     <dbl> 1000, 7000, 7500, 1200, 1800, 13500, …
## $ PAYMENTS                         <dbl> 201.8021, 4103.0326, 622.0667, 678.33…
## $ MINIMUM_PAYMENTS                 <dbl> 139.50979, 1072.34022, 627.28479, 244…
## $ PRC_FULL_PAYMENT                 <dbl> 0.000000, 0.222222, 0.000000, 0.00000…
## $ TENURE                           <int> 12, 12, 12, 12, 12, 12, 12, 12, 12, 1…
cust2<-cust[,-1]
names(cust2)<-paste("X", 1:17)

corrplot::corrplot(cor(cust2), addCoef.col=1,number.cex=0.5,
                   diag=F, type="lower")

boxplot(scale(cust2), col="lightblue", las=2, pch=16, cex=0.5)

TASKS

Depends on the exploration data analysis, conduct appropriate clustering procedure on the customer data set, and report the results in front of your class.