Database investigation

kiva <- read.csv('kivaAsean.csv', stringsAsFactors = F)
head(kiva,10)
##     loanId     country loanTerm loanAmount  sectorName
## 1  2090948 Philippines        8        225 Agriculture
## 2  2078038     Vietnam       20       1300 Agriculture
## 3  2076105 Philippines       14        425 Agriculture
## 4  2081979 Philippines        8        625 Agriculture
## 5  2082098 Philippines       13        425 Agriculture
## 6  2090796     Vietnam       20       1300 Agriculture
## 7  2090968     Vietnam       17        875 Agriculture
## 8  2081247 Philippines        8        175 Agriculture
## 9  2085674 Philippines        8        150 Agriculture
## 10 2084995 Philippines        8        250 Agriculture
summary(kiva)
##      loanId          country             loanTerm       loanAmount    
##  Min.   :2056887   Length:2718        Min.   : 6.00   Min.   : 125.0  
##  1st Qu.:2074364   Class :character   1st Qu.: 8.00   1st Qu.: 250.0  
##  Median :2079178   Mode  :character   Median :11.00   Median : 375.0  
##  Mean   :2079931                      Mean   :12.12   Mean   : 485.4  
##  3rd Qu.:2086031                      3rd Qu.:14.00   3rd Qu.: 618.8  
##  Max.   :2096738                      Max.   :50.00   Max.   :4900.0  
##   sectorName       
##  Length:2718       
##  Class :character  
##  Mode  :character  
##                    
##                    
## 

Divide the data set into train and test sets, train set accounts for 80% of sample size

set.seed(284)
trainIndex <- sample(nrow(kiva), floor(nrow(kiva)*0.8))
kiva$train <- F
kiva$train[trainIndex] <- T
kiva_train <- kiva[kiva$train == T,][,c(3,4)]
kiva_test <- kiva[kiva$train == F,][,c(3,4)]

Standardize data sets

## standardize data
kiva_train_z <- as.data.frame(lapply(kiva_train, scale))
kiva_test_z <-  as.data.frame(lapply(kiva_test, scale))
head(kiva_train_z)
##     loanTerm loanAmount
## 1 -0.7953911 -0.7205967
## 2  1.4876062  2.2496762
## 3  0.3461076 -0.1679878
## 4  0.1558578 -0.1679878
## 5  1.4876062  2.2496762
## 6  0.9168569  1.0753823

Assess Clustering Tendency

get_clust_tendency(kiva_train_z, 2, graph=TRUE, gradient=list(low="red", mid="white", high="blue"), seed = 123)
## $hopkins_stat
## [1] 0.9633565
## 
## $plot

Optimal number of clusters

WSS method

opt_km1 <- fviz_nbclust(kiva_train,kmeans,method = "wss") + ggtitle("k-means")
opt_km2 <- fviz_nbclust(kiva_train,pam,method = "wss") + ggtitle("PAM")
opt_km3 <- fviz_nbclust(kiva_train,clara,method = "wss") + ggtitle("CLARA")
grid.arrange(opt_km1,opt_km2, opt_km3, ncol=2, top = "Optimal number of clusters using Within-Cluster-Sum of Squared Errors method")

###Silhouette method

opt_km1 <- fviz_nbclust(kiva_train,kmeans,method = "silhouette") + ggtitle("k-means")
opt_km2 <- fviz_nbclust(kiva_train, pam ,method = "silhouette") + ggtitle("PAM")
opt_km3 <- fviz_nbclust(kiva_train,clara,method = "silhouette") + ggtitle("CLARA")
grid.arrange(opt_km1,opt_km2, opt_km3, ncol=2, top = "Optimal number of clusters using Silhouette method")

## {-}

Clustering data

K-means

c_km3<-eclust(kiva_train_z, "kmeans", k= 3)
c_km2<-eclust(kiva_train_z, "kmeans", k= 2)
plotkm2 <- fviz_cluster(c_km2, geom = c("point")) + ggtitle('K-means with 2 clusters')
plotkm3 <- fviz_cluster(c_km3, geom = c("point")) + ggtitle('K-means with 3 clusters')
grid.arrange(arrangeGrob(plotkm2,plotkm3, ncol=2 , top = "Clustering using K-means"))

PAM

c_pam3<-eclust(kiva_train_z, "pam", k= 3, hc_metric = 'euclidean')
c_pam2<-eclust(kiva_train_z, "pam", k= 2, hc_metric = 'euclidean')
plotpam2 <- fviz_cluster(c_pam2, geom = c("point")) + ggtitle('PAM with 2 clusters')
plotpam3 <- fviz_cluster(c_pam3, geom = c("point")) + ggtitle('PAM with 3 clusters')
grid.arrange(arrangeGrob(plotpam2,plotpam3, ncol=2 , top = "Clustering using PAM"))

CLARA

c_clara2<-eclust(kiva_train_z, "clara", k= 2, hc_metric = 'euclidean')
c_clara3<-eclust(kiva_train_z, "clara", k= 3, hc_metric = 'euclidean')
plotclara2 <- fviz_cluster(c_clara2, geom = c("point")) + ggtitle('CLARA with 2 clusters')
plotclara3 <- fviz_cluster(c_clara3, geom = c("point")) + ggtitle('CLARA with 3 clusters')
grid.arrange(arrangeGrob(plotclara2,plotclara3, ncol=2 , top = "Clustering using CLARA"))

##Hierachical clustering

d1 <- dist(kiva_train_z, method = "euclidean")
cluster_hierarchical <- hclust(d1, method = "complete" )
sub_grp_2 <- cutree(cluster_hierarchical, k = 2) # cutting by number of clusters
plot(cluster_hierarchical,)
rect.hclust(cluster_hierarchical, k=3)

##Measure the clustering quality {.tabset}

###Silhouette index

slh_km <- fviz_silhouette(c_km3) + ggtitle("k-means")
##   cluster size ave.sil.width
## 1       1 1067          0.50
## 2       2  171          0.29
## 3       3  936          0.30
slh_pam <- fviz_silhouette(c_pam3)+ ggtitle("PAM")
##   cluster size ave.sil.width
## 1       1  821          0.47
## 2       2  262          0.19
## 3       3 1091          0.40
slh_clara <- fviz_silhouette(c_clara3) + ggtitle("CLARA")
##   cluster size ave.sil.width
## 1       1  839          0.44
## 2       2  276          0.16
## 3       3 1059          0.43
grid.arrange(arrangeGrob(slh_km,slh_pam,slh_clara, ncol=2 , top = "Silhoette statistic"))

###Shadow index

c_km32 <- cclust(kiva_train_z, dist = 'euclidean', k = 3)
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
plot(shadow(c_km32))

## {-}

Calinski-Harabasz & Duda-Hart

round(calinhara(kiva_train_z, c_km3$cluster),2)
## [1] 2055.81
round(calinhara(kiva_train_z, c_km2$cluster),2)
## [1] 2726.12

Forecast clustering

c1 = kcca(kiva_train_z, k=3, kccaFamily("kmeans"))
# do the prediction
pred_train <- predict(c1)
pred_test <- predict(c1, newdata=kiva_test_z)
# visualize the output
image(c1)
points(kiva_train_z, col=pred_train, pch=19, cex=0.3)
points(kiva_test_z, col=pred_test, pch=22, bg="orange")

Conclusions