kiva <- read.csv('kivaAsean.csv', stringsAsFactors = F)
head(kiva,10)
## loanId country loanTerm loanAmount sectorName
## 1 2090948 Philippines 8 225 Agriculture
## 2 2078038 Vietnam 20 1300 Agriculture
## 3 2076105 Philippines 14 425 Agriculture
## 4 2081979 Philippines 8 625 Agriculture
## 5 2082098 Philippines 13 425 Agriculture
## 6 2090796 Vietnam 20 1300 Agriculture
## 7 2090968 Vietnam 17 875 Agriculture
## 8 2081247 Philippines 8 175 Agriculture
## 9 2085674 Philippines 8 150 Agriculture
## 10 2084995 Philippines 8 250 Agriculture
summary(kiva)
## loanId country loanTerm loanAmount
## Min. :2056887 Length:2718 Min. : 6.00 Min. : 125.0
## 1st Qu.:2074364 Class :character 1st Qu.: 8.00 1st Qu.: 250.0
## Median :2079178 Mode :character Median :11.00 Median : 375.0
## Mean :2079931 Mean :12.12 Mean : 485.4
## 3rd Qu.:2086031 3rd Qu.:14.00 3rd Qu.: 618.8
## Max. :2096738 Max. :50.00 Max. :4900.0
## sectorName
## Length:2718
## Class :character
## Mode :character
##
##
##
Divide the data set into train and test sets, train set accounts for 80% of sample size
set.seed(284)
trainIndex <- sample(nrow(kiva), floor(nrow(kiva)*0.8))
kiva$train <- F
kiva$train[trainIndex] <- T
kiva_train <- kiva[kiva$train == T,][,c(3,4)]
kiva_test <- kiva[kiva$train == F,][,c(3,4)]
Standardize data sets
## standardize data
kiva_train_z <- as.data.frame(lapply(kiva_train, scale))
kiva_test_z <- as.data.frame(lapply(kiva_test, scale))
head(kiva_train_z)
## loanTerm loanAmount
## 1 -0.7953911 -0.7205967
## 2 1.4876062 2.2496762
## 3 0.3461076 -0.1679878
## 4 0.1558578 -0.1679878
## 5 1.4876062 2.2496762
## 6 0.9168569 1.0753823
get_clust_tendency(kiva_train_z, 2, graph=TRUE, gradient=list(low="red", mid="white", high="blue"), seed = 123)
## $hopkins_stat
## [1] 0.9633565
##
## $plot
opt_km1 <- fviz_nbclust(kiva_train,kmeans,method = "wss") + ggtitle("k-means")
opt_km2 <- fviz_nbclust(kiva_train,pam,method = "wss") + ggtitle("PAM")
opt_km3 <- fviz_nbclust(kiva_train,clara,method = "wss") + ggtitle("CLARA")
grid.arrange(opt_km1,opt_km2, opt_km3, ncol=2, top = "Optimal number of clusters using Within-Cluster-Sum of Squared Errors method")
###Silhouette method
opt_km1 <- fviz_nbclust(kiva_train,kmeans,method = "silhouette") + ggtitle("k-means")
opt_km2 <- fviz_nbclust(kiva_train, pam ,method = "silhouette") + ggtitle("PAM")
opt_km3 <- fviz_nbclust(kiva_train,clara,method = "silhouette") + ggtitle("CLARA")
grid.arrange(opt_km1,opt_km2, opt_km3, ncol=2, top = "Optimal number of clusters using Silhouette method")
## {-}
c_km3<-eclust(kiva_train_z, "kmeans", k= 3)
c_km2<-eclust(kiva_train_z, "kmeans", k= 2)
plotkm2 <- fviz_cluster(c_km2, geom = c("point")) + ggtitle('K-means with 2 clusters')
plotkm3 <- fviz_cluster(c_km3, geom = c("point")) + ggtitle('K-means with 3 clusters')
grid.arrange(arrangeGrob(plotkm2,plotkm3, ncol=2 , top = "Clustering using K-means"))
c_pam3<-eclust(kiva_train_z, "pam", k= 3, hc_metric = 'euclidean')
c_pam2<-eclust(kiva_train_z, "pam", k= 2, hc_metric = 'euclidean')
plotpam2 <- fviz_cluster(c_pam2, geom = c("point")) + ggtitle('PAM with 2 clusters')
plotpam3 <- fviz_cluster(c_pam3, geom = c("point")) + ggtitle('PAM with 3 clusters')
grid.arrange(arrangeGrob(plotpam2,plotpam3, ncol=2 , top = "Clustering using PAM"))
c_clara2<-eclust(kiva_train_z, "clara", k= 2, hc_metric = 'euclidean')
c_clara3<-eclust(kiva_train_z, "clara", k= 3, hc_metric = 'euclidean')
plotclara2 <- fviz_cluster(c_clara2, geom = c("point")) + ggtitle('CLARA with 2 clusters')
plotclara3 <- fviz_cluster(c_clara3, geom = c("point")) + ggtitle('CLARA with 3 clusters')
grid.arrange(arrangeGrob(plotclara2,plotclara3, ncol=2 , top = "Clustering using CLARA"))
##Hierachical clustering
d1 <- dist(kiva_train_z, method = "euclidean")
cluster_hierarchical <- hclust(d1, method = "complete" )
sub_grp_2 <- cutree(cluster_hierarchical, k = 2) # cutting by number of clusters
plot(cluster_hierarchical,)
rect.hclust(cluster_hierarchical, k=3)
##Measure the clustering quality {.tabset}
###Silhouette index
slh_km <- fviz_silhouette(c_km3) + ggtitle("k-means")
## cluster size ave.sil.width
## 1 1 1067 0.50
## 2 2 171 0.29
## 3 3 936 0.30
slh_pam <- fviz_silhouette(c_pam3)+ ggtitle("PAM")
## cluster size ave.sil.width
## 1 1 821 0.47
## 2 2 262 0.19
## 3 3 1091 0.40
slh_clara <- fviz_silhouette(c_clara3) + ggtitle("CLARA")
## cluster size ave.sil.width
## 1 1 839 0.44
## 2 2 276 0.16
## 3 3 1059 0.43
grid.arrange(arrangeGrob(slh_km,slh_pam,slh_clara, ncol=2 , top = "Silhoette statistic"))
###Shadow index
c_km32 <- cclust(kiva_train_z, dist = 'euclidean', k = 3)
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
plot(shadow(c_km32))
## {-}
round(calinhara(kiva_train_z, c_km3$cluster),2)
## [1] 2055.81
round(calinhara(kiva_train_z, c_km2$cluster),2)
## [1] 2726.12
c1 = kcca(kiva_train_z, k=3, kccaFamily("kmeans"))
# do the prediction
pred_train <- predict(c1)
pred_test <- predict(c1, newdata=kiva_test_z)
# visualize the output
image(c1)
points(kiva_train_z, col=pred_train, pch=19, cex=0.3)
points(kiva_test_z, col=pred_test, pch=22, bg="orange")