1. Project introduction

In this project, I used clustering methods in grouping kiva.org loans for 3 ASEAN countries (Vietnam, Cambodia and Philippines). The project includes 6 parts:
- Data processing
- Measure the clustering tendency
- Optimal number of clusters
- Data clustering
- Assessing clustering quality  - Analysis of clusters.

2. Data processing

#load data from kivaAsean.csv file
kiva <- read.csv('kivaAsean.csv', stringsAsFactors = F)
head(kiva,10)
##     loanId     country loanTerm loanAmount  sectorName
## 1  2090948 Philippines        8        225 Agriculture
## 2  2078038     Vietnam       20       1300 Agriculture
## 3  2076105 Philippines       14        425 Agriculture
## 4  2081979 Philippines        8        625 Agriculture
## 5  2082098 Philippines       13        425 Agriculture
## 6  2090796     Vietnam       20       1300 Agriculture
## 7  2090968     Vietnam       17        875 Agriculture
## 8  2081247 Philippines        8        175 Agriculture
## 9  2085674 Philippines        8        150 Agriculture
## 10 2084995 Philippines        8        250 Agriculture
#Check data structure and whether there are NA values
summary(kiva)
##      loanId          country             loanTerm       loanAmount    
##  Min.   :2056887   Length:2718        Min.   : 6.00   Min.   : 125.0  
##  1st Qu.:2074364   Class :character   1st Qu.: 8.00   1st Qu.: 250.0  
##  Median :2079178   Mode  :character   Median :11.00   Median : 375.0  
##  Mean   :2079931                      Mean   :12.12   Mean   : 485.4  
##  3rd Qu.:2086031                      3rd Qu.:14.00   3rd Qu.: 618.8  
##  Max.   :2096738                      Max.   :50.00   Max.   :4900.0  
##   sectorName       
##  Length:2718       
##  Class :character  
##  Mode  :character  
##                    
##                    
## 

I divided the data into train and test sets, the train set accounts for 80% of sample size

set.seed(284)
trainIndex <- sample(nrow(kiva), floor(nrow(kiva)*0.8))
kiva$train <- F
kiva$train[trainIndex] <- T
kiva_train <- kiva[kiva$train == T,][,c(3,4)]
kiva_test <- kiva[kiva$train == F,][,c(3,4)]

I standardized data before clustering

kiva_train_z <- as.data.frame(lapply(kiva_train, scale))
kiva_test_z <-  as.data.frame(lapply(kiva_test, scale))
head(kiva_train_z)
##     loanTerm loanAmount
## 1 -0.7953911 -0.7205967
## 2  1.4876062  2.2496762
## 3  0.3461076 -0.1679878
## 4  0.1558578 -0.1679878
## 5  1.4876062  2.2496762
## 6  0.9168569  1.0753823

3. Measure Clustering Tendency

get_clust_tendency(kiva_train_z, 2, graph=TRUE, gradient=list(low="red", mid="white", high="blue"), seed = 123)
## $hopkins_stat
## [1] 0.9633565
## 
## $plot

## {-}
The Hopkins statistic is 0.9633565, close to 1.Also, we can see blue blocks clearly in the graph. These mean the train dataset is significantly clusterable.

4. Optimal number of clusters

WSS method

opt_km1 <- fviz_nbclust(kiva_train,kmeans,method = "wss") + ggtitle("k-means")
opt_km2 <- fviz_nbclust(kiva_train,pam,method = "wss") + ggtitle("PAM")
opt_km3 <- fviz_nbclust(kiva_train,clara,method = "wss") + ggtitle("CLARA")
grid.arrange(opt_km1,opt_km2, opt_km3, ncol=2, top = "Optimal number of clusters using Within-Cluster-Sum of Squared Errors method")

Silhouette method

opt_km1 <- fviz_nbclust(kiva_train,kmeans,method = "silhouette") + ggtitle("k-means")
opt_km2 <- fviz_nbclust(kiva_train, pam ,method = "silhouette") + ggtitle("PAM")
opt_km3 <- fviz_nbclust(kiva_train,clara,method = "silhouette") + ggtitle("CLARA")
grid.arrange(opt_km1,opt_km2, opt_km3, ncol=2, top = "Optimal number of clusters using Silhouette method")


Using both WSS and Silhouette method in k-means, PAM and CLARA, we can see the train dataset can be divided into 2 clusters effectively.

5. Clustering data

K-means

c_km3<-eclust(kiva_train_z, "kmeans", k= 3, graph = F)
c_km2<-eclust(kiva_train_z, "kmeans", k= 2, graph = F)
plotkm2 <- fviz_cluster(c_km2, geom = c("point")) + ggtitle('K-means with 2 clusters')
plotkm3 <- fviz_cluster(c_km3, geom = c("point")) + ggtitle('K-means with 3 clusters')
grid.arrange(arrangeGrob(plotkm2,plotkm3, ncol=2 , top = "Clustering using K-means"))

PAM

c_pam3<-eclust(kiva_train_z, "pam", k= 3, hc_metric = 'euclidean', graph = F)
c_pam2<-eclust(kiva_train_z, "pam", k= 2, hc_metric = 'euclidean', graph = F)
plotpam2 <- fviz_cluster(c_pam2, geom = c("point")) + ggtitle('PAM with 2 clusters')
plotpam3 <- fviz_cluster(c_pam3, geom = c("point")) + ggtitle('PAM with 3 clusters')
grid.arrange(arrangeGrob(plotpam2,plotpam3, ncol=2 , top = "Clustering using PAM"))

CLARA

c_clara2<-eclust(kiva_train_z, "clara", k= 2, hc_metric = 'euclidean', graph = F)
c_clara3<-eclust(kiva_train_z, "clara", k= 3, hc_metric = 'euclidean', graph = F)
plotclara2 <- fviz_cluster(c_clara2, geom = c("point")) + ggtitle('CLARA with 2 clusters')
plotclara3 <- fviz_cluster(c_clara3, geom = c("point")) + ggtitle('CLARA with 3 clusters')
grid.arrange(arrangeGrob(plotclara2,plotclara3, ncol=2 , top = "Clustering using CLARA"))

Via 3 clustering charts, we can see the largest group contains loans with both higher amount and longer loan term compared to other groups. I would investigate groups in the next part. ## {-}

Hierachical clustering

d1 <- dist(kiva_train_z, method = "euclidean")
cluster_hierarchical <- hclust(d1, method = "complete" )
sub_grp_2 <- cutree(cluster_hierarchical, k = 2)
plot(cluster_hierarchical,)
rect.hclust(cluster_hierarchical, k=2)

Stripes for k-means

d1<-cclust(kiva_train, 2, dist="euclidean")
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
stripes(d1)

Looking at the stripes graph, we can see cluster 1 is much more disperse compared to the cluster 2. Whilr there are outliers in the cluster 1, cluster 2 is much more concentrated.

##Measure the clustering quality {.tabset} ###Silhouette index

slh_km <- fviz_silhouette(c_km2) + ggtitle("k-means")
##   cluster size ave.sil.width
## 1       1 1953          0.76
## 2       2  221          0.32
slh_pam <- fviz_silhouette(c_pam2)+ ggtitle("PAM")
##   cluster size ave.sil.width
## 1       1 1905          0.75
## 2       2  269          0.25
slh_clara <- fviz_silhouette(c_clara2) + ggtitle("CLARA")
##   cluster size ave.sil.width
## 1       1 1898          0.75
## 2       2  276          0.24
grid.arrange(arrangeGrob(slh_km,slh_pam,slh_clara, ncol=2 , top = "Silhoette statistic"))


The average silhoutte width is nearly 0.7 in both k-means, PAM and CLARA. The clustering was quite right.
### Shadow index

c_km32 <- cclust(kiva_train_z, dist = 'euclidean', k = 3)
plot(shadow(c_km32))

## {-}

Calinski-Harabasz & Duda-Hart

round(calinhara(kiva_train_z, c_km3$cluster),2)
## [1] 2055.81
round(calinhara(kiva_train_z, c_km2$cluster),2)
## [1] 2726.12

The Calinski-Harabasz index in the case of 2 clusters was higher compared to 3 clusters. This again confirm we should partition data into 2 groups. ## Forecast clustering

c1 = kcca(kiva_train_z, k=2, kccaFamily("kmeans"))
# do the prediction
pred_train <- predict(c1)
pred_test <- predict(c1, newdata=kiva_test_z)
# visualize the output
image(c1)
points(kiva_train_z, col=pred_train, pch=19, cex=0.3)
points(kiva_test_z, col=pred_test, pch=22, bg="orange")


According to the graph, we can see test points are clustered into 2 groups like train points.

Analysis of clusters.

# By countries
t_full1 <- with(kiva,table(kiva[kiva$train == T,]$country)) %>% prop.table()
t_11 <- with(kiva,table(kiva[kiva$train == T,][c_km2$cluster == 1,]$country)) %>% prop.table()
t_12 <- with(kiva,table(kiva[kiva$train == T,][c_km2$cluster == 2,]$country)) %>% prop.table()
t_full1 <- merge(t_full1, t_11, by = 'Var1')
t_full1 <- merge(t_full1, t_12, by = 'Var1')
colnames(t_full1) <- c('country','full_train','cluster1','cluster2')
t_full1
##       country full_train    cluster1    cluster2
## 1    Cambodia 0.07727691 0.022017409 0.565610860
## 2 Philippines 0.87580497 0.973886329 0.009049774
## 3     Vietnam 0.04691812 0.004096262 0.425339367
# By sectors
t_full2 <- with(kiva,table(kiva[kiva$train == T,]$sectorName)) %>% prop.table()
t_21 <- with(kiva,table(kiva[kiva$train == T,][c_km2$cluster == 1,]$sectorName)) %>% prop.table()
t_22 <- with(kiva,table(kiva[kiva$train == T,][c_km2$cluster == 2,]$sectorName)) %>% prop.table()
t_full2 <- merge(t_full2, t_21, by = 'Var1')
t_full2 <- merge(t_full2, t_22, by = 'Var1')
colnames(t_full2) <- c('loanSector','full_train','cluster1','cluster2')
t_full2
##        loanSector   full_train     cluster1    cluster2
## 1     Agriculture 0.3012879485 0.3138760881 0.190045249
## 2        Clothing 0.0275988960 0.0302099334 0.004524887
## 3    Construction 0.0032198712 0.0005120328 0.027149321
## 4       Education 0.0009199632 0.0005120328 0.004524887
## 5            Food 0.3615455382 0.3681515617 0.303167421
## 6         Housing 0.0381784729 0.0035842294 0.343891403
## 7    Personal Use 0.0041398344 0.0020481311 0.022624434
## 8          Retail 0.2460901564 0.2667690732 0.063348416
## 9        Services 0.0087396504 0.0092165899 0.004524887
## 10 Transportation 0.0059797608 0.0025601639 0.036199095

The Cluster1 are mainly loans in Philippines while cluster2 are mostly loans in Vietnam and Cambodia. This means loans in Philippines have higher loan term and loan amount compared to 2 other countries

Regarding loan categories, loan for food and agriculture are key parts in both clusters because 3 countries are developing countries who have high agriculture proportion in economics outcome. However, in cluster1, we can see a large proportion of retail loans while cluster2 have a high proportion of housing loans.

Conclusions