Cluster Analysis

library(cluster) #conduct cluster analysis
library(compareGroups) #build descriptive statistic tables

## Loading required package: Hmisc

## Loading required package: lattice

## Loading required package: survival

## Loading required package: Formula

## Loading required package: ggplot2

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units

## Loading required package: gdata

## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.

##

## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.

## 
## Attaching package: 'gdata'

## The following object is masked from 'package:Hmisc':
## 
##     combine

## The following object is masked from 'package:stats':
## 
##     nobs

## The following object is masked from 'package:utils':
## 
##     object.size

## The following object is masked from 'package:base':
## 
##     startsWith

## Loading required package: xtable

## 
## Attaching package: 'xtable'

## The following objects are masked from 'package:Hmisc':
## 
##     label, label<-

## Loading required package: SNPassoc

## Loading required package: haplo.stats

## Loading required package: mvtnorm

## Loading required package: parallel

library(mlbench) #contains the dataset
library(NbClust) #cluster validity measures
library(sparcl) #colored dendrogram

# Sonar data set (mines vs. rocks): load it and report its dimensions.
data("Sonar", package = "mlbench")
dim(Sonar)

## [1] 208  61

# Preview the first six observations.
head(Sonar, n = 6L)

##       V1     V2     V3     V4     V5     V6     V7     V8     V9    V10
## 1 0.0200 0.0371 0.0428 0.0207 0.0954 0.0986 0.1539 0.1601 0.3109 0.2111
## 2 0.0453 0.0523 0.0843 0.0689 0.1183 0.2583 0.2156 0.3481 0.3337 0.2872
## 3 0.0262 0.0582 0.1099 0.1083 0.0974 0.2280 0.2431 0.3771 0.5598 0.6194
## 4 0.0100 0.0171 0.0623 0.0205 0.0205 0.0368 0.1098 0.1276 0.0598 0.1264
## 5 0.0762 0.0666 0.0481 0.0394 0.0590 0.0649 0.1209 0.2467 0.3564 0.4459
## 6 0.0286 0.0453 0.0277 0.0174 0.0384 0.0990 0.1201 0.1833 0.2105 0.3039
##      V11    V12    V13    V14    V15    V16    V17    V18    V19    V20
## 1 0.1609 0.1582 0.2238 0.0645 0.0660 0.2273 0.3100 0.2999 0.5078 0.4797
## 2 0.4918 0.6552 0.6919 0.7797 0.7464 0.9444 1.0000 0.8874 0.8024 0.7818
## 3 0.6333 0.7060 0.5544 0.5320 0.6479 0.6931 0.6759 0.7551 0.8929 0.8619
## 4 0.0881 0.1992 0.0184 0.2261 0.1729 0.2131 0.0693 0.2281 0.4060 0.3973
## 5 0.4152 0.3952 0.4256 0.4135 0.4528 0.5326 0.7306 0.6193 0.2032 0.4636
## 6 0.2988 0.4250 0.6343 0.8198 1.0000 0.9988 0.9508 0.9025 0.7234 0.5122
##      V21    V22    V23    V24    V25    V26    V27    V28    V29    V30
## 1 0.5783 0.5071 0.4328 0.5550 0.6711 0.6415 0.7104 0.8080 0.6791 0.3857
## 2 0.5212 0.4052 0.3957 0.3914 0.3250 0.3200 0.3271 0.2767 0.4423 0.2028
## 3 0.7974 0.6737 0.4293 0.3648 0.5331 0.2413 0.5070 0.8533 0.6036 0.8514
## 4 0.2741 0.3690 0.5556 0.4846 0.3140 0.5334 0.5256 0.2520 0.2090 0.3559
## 5 0.4148 0.4292 0.5730 0.5399 0.3161 0.2285 0.6995 1.0000 0.7262 0.4724
## 6 0.2074 0.3985 0.5890 0.2872 0.2043 0.5782 0.5389 0.3750 0.3411 0.5067
##      V31    V32    V33    V34    V35    V36    V37    V38    V39    V40
## 1 0.1307 0.2604 0.5121 0.7547 0.8537 0.8507 0.6692 0.6097 0.4943 0.2744
## 2 0.3788 0.2947 0.1984 0.2341 0.1306 0.4182 0.3835 0.1057 0.1840 0.1970
## 3 0.8512 0.5045 0.1862 0.2709 0.4232 0.3043 0.6116 0.6756 0.5375 0.4719
## 4 0.6260 0.7340 0.6120 0.3497 0.3953 0.3012 0.5408 0.8814 0.9857 0.9167
## 5 0.5103 0.5459 0.2881 0.0981 0.1951 0.4181 0.4604 0.3217 0.2828 0.2430
## 6 0.5580 0.4778 0.3299 0.2198 0.1407 0.2856 0.3807 0.4158 0.4054 0.3296
##      V41    V42    V43    V44    V45    V46    V47    V48    V49    V50
## 1 0.0510 0.2834 0.2825 0.4256 0.2641 0.1386 0.1051 0.1343 0.0383 0.0324
## 2 0.1674 0.0583 0.1401 0.1628 0.0621 0.0203 0.0530 0.0742 0.0409 0.0061
## 3 0.4647 0.2587 0.2129 0.2222 0.2111 0.0176 0.1348 0.0744 0.0130 0.0106
## 4 0.6121 0.5006 0.3210 0.3202 0.4295 0.3654 0.2655 0.1576 0.0681 0.0294
## 5 0.1979 0.2444 0.1847 0.0841 0.0692 0.0528 0.0357 0.0085 0.0230 0.0046
## 6 0.2707 0.2650 0.0723 0.1238 0.1192 0.1089 0.0623 0.0494 0.0264 0.0081
##      V51    V52    V53    V54    V55    V56    V57    V58    V59    V60
## 1 0.0232 0.0027 0.0065 0.0159 0.0072 0.0167 0.0180 0.0084 0.0090 0.0032
## 2 0.0125 0.0084 0.0089 0.0048 0.0094 0.0191 0.0140 0.0049 0.0052 0.0044
## 3 0.0033 0.0232 0.0166 0.0095 0.0180 0.0244 0.0316 0.0164 0.0095 0.0078
## 4 0.0241 0.0121 0.0036 0.0150 0.0085 0.0073 0.0050 0.0044 0.0040 0.0117
## 5 0.0156 0.0031 0.0054 0.0105 0.0110 0.0015 0.0072 0.0048 0.0107 0.0094
## 6 0.0104 0.0045 0.0014 0.0038 0.0013 0.0089 0.0057 0.0027 0.0051 0.0062
##   Class
## 1     R
## 2     R
## 3     R
## 4     R
## 5     R
## 6     R

# Remove the Class label column so only the numeric V* columns remain,
# then inspect the resulting structure.
Sonar[["Class"]] <- NULL
str(Sonar)

## 'data.frame':    208 obs. of  60 variables:
##  $ V1 : num  0.02 0.0453 0.0262 0.01 0.0762 0.0286 0.0317 0.0519 0.0223 0.0164 ...
##  $ V2 : num  0.0371 0.0523 0.0582 0.0171 0.0666 0.0453 0.0956 0.0548 0.0375 0.0173 ...
##  $ V3 : num  0.0428 0.0843 0.1099 0.0623 0.0481 ...
##  $ V4 : num  0.0207 0.0689 0.1083 0.0205 0.0394 ...
##  $ V5 : num  0.0954 0.1183 0.0974 0.0205 0.059 ...
##  $ V6 : num  0.0986 0.2583 0.228 0.0368 0.0649 ...
##  $ V7 : num  0.154 0.216 0.243 0.11 0.121 ...
##  $ V8 : num  0.16 0.348 0.377 0.128 0.247 ...
##  $ V9 : num  0.3109 0.3337 0.5598 0.0598 0.3564 ...
##  $ V10: num  0.211 0.287 0.619 0.126 0.446 ...
##  $ V11: num  0.1609 0.4918 0.6333 0.0881 0.4152 ...
##  $ V12: num  0.158 0.655 0.706 0.199 0.395 ...
##  $ V13: num  0.2238 0.6919 0.5544 0.0184 0.4256 ...
##  $ V14: num  0.0645 0.7797 0.532 0.2261 0.4135 ...
##  $ V15: num  0.066 0.746 0.648 0.173 0.453 ...
##  $ V16: num  0.227 0.944 0.693 0.213 0.533 ...
##  $ V17: num  0.31 1 0.6759 0.0693 0.7306 ...
##  $ V18: num  0.3 0.887 0.755 0.228 0.619 ...
##  $ V19: num  0.508 0.802 0.893 0.406 0.203 ...
##  $ V20: num  0.48 0.782 0.862 0.397 0.464 ...
##  $ V21: num  0.578 0.521 0.797 0.274 0.415 ...
##  $ V22: num  0.507 0.405 0.674 0.369 0.429 ...
##  $ V23: num  0.433 0.396 0.429 0.556 0.573 ...
##  $ V24: num  0.555 0.391 0.365 0.485 0.54 ...
##  $ V25: num  0.671 0.325 0.533 0.314 0.316 ...
##  $ V26: num  0.641 0.32 0.241 0.533 0.229 ...
##  $ V27: num  0.71 0.327 0.507 0.526 0.7 ...
##  $ V28: num  0.808 0.277 0.853 0.252 1 ...
##  $ V29: num  0.679 0.442 0.604 0.209 0.726 ...
##  $ V30: num  0.386 0.203 0.851 0.356 0.472 ...
##  $ V31: num  0.131 0.379 0.851 0.626 0.51 ...
##  $ V32: num  0.26 0.295 0.504 0.734 0.546 ...
##  $ V33: num  0.512 0.198 0.186 0.612 0.288 ...
##  $ V34: num  0.7547 0.2341 0.2709 0.3497 0.0981 ...
##  $ V35: num  0.854 0.131 0.423 0.395 0.195 ...
##  $ V36: num  0.851 0.418 0.304 0.301 0.418 ...
##  $ V37: num  0.669 0.384 0.612 0.541 0.46 ...
##  $ V38: num  0.61 0.106 0.676 0.881 0.322 ...
##  $ V39: num  0.494 0.184 0.537 0.986 0.283 ...
##  $ V40: num  0.274 0.197 0.472 0.917 0.243 ...
##  $ V41: num  0.051 0.167 0.465 0.612 0.198 ...
##  $ V42: num  0.2834 0.0583 0.2587 0.5006 0.2444 ...
##  $ V43: num  0.282 0.14 0.213 0.321 0.185 ...
##  $ V44: num  0.4256 0.1628 0.2222 0.3202 0.0841 ...
##  $ V45: num  0.2641 0.0621 0.2111 0.4295 0.0692 ...
##  $ V46: num  0.1386 0.0203 0.0176 0.3654 0.0528 ...
##  $ V47: num  0.1051 0.053 0.1348 0.2655 0.0357 ...
##  $ V48: num  0.1343 0.0742 0.0744 0.1576 0.0085 ...
##  $ V49: num  0.0383 0.0409 0.013 0.0681 0.023 0.0264 0.0507 0.0285 0.0777 0.0092 ...
##  $ V50: num  0.0324 0.0061 0.0106 0.0294 0.0046 0.0081 0.0159 0.0178 0.0439 0.0198 ...
##  $ V51: num  0.0232 0.0125 0.0033 0.0241 0.0156 0.0104 0.0195 0.0052 0.0061 0.0118 ...
##  $ V52: num  0.0027 0.0084 0.0232 0.0121 0.0031 0.0045 0.0201 0.0081 0.0145 0.009 ...
##  $ V53: num  0.0065 0.0089 0.0166 0.0036 0.0054 0.0014 0.0248 0.012 0.0128 0.0223 ...
##  $ V54: num  0.0159 0.0048 0.0095 0.015 0.0105 0.0038 0.0131 0.0045 0.0145 0.0179 ...
##  $ V55: num  0.0072 0.0094 0.018 0.0085 0.011 0.0013 0.007 0.0121 0.0058 0.0084 ...
##  $ V56: num  0.0167 0.0191 0.0244 0.0073 0.0015 0.0089 0.0138 0.0097 0.0049 0.0068 ...
##  $ V57: num  0.018 0.014 0.0316 0.005 0.0072 0.0057 0.0092 0.0085 0.0065 0.0032 ...
##  $ V58: num  0.0084 0.0049 0.0164 0.0044 0.0048 0.0027 0.0143 0.0047 0.0093 0.0035 ...
##  $ V59: num  0.009 0.0052 0.0095 0.004 0.0107 0.0051 0.0036 0.0048 0.0059 0.0056 ...
##  $ V60: num  0.0032 0.0044 0.0078 0.0117 0.0094 0.0062 0.0103 0.0053 0.0022 0.004 ...

# Per-column five-number summary plus mean.
summary(object = Sonar)

##        V1                V2                V3                V4         
##  Min.   :0.00150   Min.   :0.00060   Min.   :0.00150   Min.   :0.00580  
##  1st Qu.:0.01335   1st Qu.:0.01645   1st Qu.:0.01895   1st Qu.:0.02438  
##  Median :0.02280   Median :0.03080   Median :0.03430   Median :0.04405  
##  Mean   :0.02916   Mean   :0.03844   Mean   :0.04383   Mean   :0.05389  
##  3rd Qu.:0.03555   3rd Qu.:0.04795   3rd Qu.:0.05795   3rd Qu.:0.06450  
##  Max.   :0.13710   Max.   :0.23390   Max.   :0.30590   Max.   :0.42640  
##        V5                V6                V7               V8         
##  Min.   :0.00670   Min.   :0.01020   Min.   :0.0033   Min.   :0.00550  
##  1st Qu.:0.03805   1st Qu.:0.06703   1st Qu.:0.0809   1st Qu.:0.08042  
##  Median :0.06250   Median :0.09215   Median :0.1070   Median :0.11210  
##  Mean   :0.07520   Mean   :0.10457   Mean   :0.1217   Mean   :0.13480  
##  3rd Qu.:0.10028   3rd Qu.:0.13412   3rd Qu.:0.1540   3rd Qu.:0.16960  
##  Max.   :0.40100   Max.   :0.38230   Max.   :0.3729   Max.   :0.45900  
##        V9               V10              V11              V12        
##  Min.   :0.00750   Min.   :0.0113   Min.   :0.0289   Min.   :0.0236  
##  1st Qu.:0.09703   1st Qu.:0.1113   1st Qu.:0.1293   1st Qu.:0.1335  
##  Median :0.15225   Median :0.1824   Median :0.2248   Median :0.2490  
##  Mean   :0.17800   Mean   :0.2083   Mean   :0.2360   Mean   :0.2502  
##  3rd Qu.:0.23342   3rd Qu.:0.2687   3rd Qu.:0.3016   3rd Qu.:0.3312  
##  Max.   :0.68280   Max.   :0.7106   Max.   :0.7342   Max.   :0.7060  
##       V13              V14              V15              V16        
##  Min.   :0.0184   Min.   :0.0273   Min.   :0.0031   Min.   :0.0162  
##  1st Qu.:0.1661   1st Qu.:0.1752   1st Qu.:0.1646   1st Qu.:0.1963  
##  Median :0.2640   Median :0.2811   Median :0.2817   Median :0.3047  
##  Mean   :0.2733   Mean   :0.2966   Mean   :0.3202   Mean   :0.3785  
##  3rd Qu.:0.3513   3rd Qu.:0.3862   3rd Qu.:0.4529   3rd Qu.:0.5357  
##  Max.   :0.7131   Max.   :0.9970   Max.   :1.0000   Max.   :0.9988  
##       V17              V18              V19              V20        
##  Min.   :0.0349   Min.   :0.0375   Min.   :0.0494   Min.   :0.0656  
##  1st Qu.:0.2059   1st Qu.:0.2421   1st Qu.:0.2991   1st Qu.:0.3506  
##  Median :0.3084   Median :0.3683   Median :0.4350   Median :0.5425  
##  Mean   :0.4160   Mean   :0.4523   Mean   :0.5048   Mean   :0.5630  
##  3rd Qu.:0.6594   3rd Qu.:0.6791   3rd Qu.:0.7314   3rd Qu.:0.8093  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##       V21              V22              V23              V24        
##  Min.   :0.0512   Min.   :0.0219   Min.   :0.0563   Min.   :0.0239  
##  1st Qu.:0.3997   1st Qu.:0.4069   1st Qu.:0.4502   1st Qu.:0.5407  
##  Median :0.6177   Median :0.6649   Median :0.6997   Median :0.6985  
##  Mean   :0.6091   Mean   :0.6243   Mean   :0.6470   Mean   :0.6727  
##  3rd Qu.:0.8170   3rd Qu.:0.8320   3rd Qu.:0.8486   3rd Qu.:0.8722  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##       V25              V26              V27              V28        
##  Min.   :0.0240   Min.   :0.0921   Min.   :0.0481   Min.   :0.0284  
##  1st Qu.:0.5258   1st Qu.:0.5442   1st Qu.:0.5319   1st Qu.:0.5348  
##  Median :0.7211   Median :0.7545   Median :0.7456   Median :0.7319  
##  Mean   :0.6754   Mean   :0.6999   Mean   :0.7022   Mean   :0.6940  
##  3rd Qu.:0.8737   3rd Qu.:0.8938   3rd Qu.:0.9171   3rd Qu.:0.9003  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##       V29              V30              V31              V32        
##  Min.   :0.0144   Min.   :0.0613   Min.   :0.0482   Min.   :0.0404  
##  1st Qu.:0.4637   1st Qu.:0.4114   1st Qu.:0.3456   1st Qu.:0.2814  
##  Median :0.6808   Median :0.6071   Median :0.4904   Median :0.4296  
##  Mean   :0.6421   Mean   :0.5809   Mean   :0.5045   Mean   :0.4390  
##  3rd Qu.:0.8521   3rd Qu.:0.7352   3rd Qu.:0.6420   3rd Qu.:0.5803  
##  Max.   :1.0000   Max.   :1.0000   Max.   :0.9657   Max.   :0.9306  
##       V33              V34              V35              V36        
##  Min.   :0.0477   Min.   :0.0212   Min.   :0.0223   Min.   :0.0080  
##  1st Qu.:0.2579   1st Qu.:0.2176   1st Qu.:0.1794   1st Qu.:0.1543  
##  Median :0.3912   Median :0.3510   Median :0.3127   Median :0.3211  
##  Mean   :0.4172   Mean   :0.4032   Mean   :0.3926   Mean   :0.3848  
##  3rd Qu.:0.5561   3rd Qu.:0.5961   3rd Qu.:0.5934   3rd Qu.:0.5565  
##  Max.   :1.0000   Max.   :0.9647   Max.   :1.0000   Max.   :1.0000  
##       V37              V38              V39              V40        
##  Min.   :0.0351   Min.   :0.0383   Min.   :0.0371   Min.   :0.0117  
##  1st Qu.:0.1601   1st Qu.:0.1743   1st Qu.:0.1740   1st Qu.:0.1865  
##  Median :0.3063   Median :0.3127   Median :0.2835   Median :0.2781  
##  Mean   :0.3638   Mean   :0.3397   Mean   :0.3258   Mean   :0.3112  
##  3rd Qu.:0.5189   3rd Qu.:0.4405   3rd Qu.:0.4349   3rd Qu.:0.4244  
##  Max.   :0.9497   Max.   :1.0000   Max.   :0.9857   Max.   :0.9297  
##       V41              V42              V43              V44        
##  Min.   :0.0360   Min.   :0.0056   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.1631   1st Qu.:0.1589   1st Qu.:0.1552   1st Qu.:0.1269  
##  Median :0.2595   Median :0.2451   Median :0.2225   Median :0.1777  
##  Mean   :0.2893   Mean   :0.2783   Mean   :0.2465   Mean   :0.2141  
##  3rd Qu.:0.3875   3rd Qu.:0.3842   3rd Qu.:0.3245   3rd Qu.:0.2717  
##  Max.   :0.8995   Max.   :0.8246   Max.   :0.7733   Max.   :0.7762  
##       V45               V46               V47               V48         
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.09448   1st Qu.:0.06855   1st Qu.:0.06425   1st Qu.:0.04512  
##  Median :0.14800   Median :0.12135   Median :0.10165   Median :0.07810  
##  Mean   :0.19723   Mean   :0.16063   Mean   :0.12245   Mean   :0.09142  
##  3rd Qu.:0.23155   3rd Qu.:0.20037   3rd Qu.:0.15443   3rd Qu.:0.12010  
##  Max.   :0.70340   Max.   :0.72920   Max.   :0.55220   Max.   :0.33390  
##       V49               V50               V51                V52          
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.000000   Min.   :0.000800  
##  1st Qu.:0.02635   1st Qu.:0.01155   1st Qu.:0.008425   1st Qu.:0.007275  
##  Median :0.04470   Median :0.01790   Median :0.013900   Median :0.011400  
##  Mean   :0.05193   Mean   :0.02042   Mean   :0.016069   Mean   :0.013420  
##  3rd Qu.:0.06853   3rd Qu.:0.02527   3rd Qu.:0.020825   3rd Qu.:0.016725  
##  Max.   :0.19810   Max.   :0.08250   Max.   :0.100400   Max.   :0.070900  
##       V53                V54                V55         
##  Min.   :0.000500   Min.   :0.001000   Min.   :0.00060  
##  1st Qu.:0.005075   1st Qu.:0.005375   1st Qu.:0.00415  
##  Median :0.009550   Median :0.009300   Median :0.00750  
##  Mean   :0.010709   Mean   :0.010941   Mean   :0.00929  
##  3rd Qu.:0.014900   3rd Qu.:0.014500   3rd Qu.:0.01210  
##  Max.   :0.039000   Max.   :0.035200   Max.   :0.04470  
##       V56                V57               V58          
##  Min.   :0.000400   Min.   :0.00030   Min.   :0.000300  
##  1st Qu.:0.004400   1st Qu.:0.00370   1st Qu.:0.003600  
##  Median :0.006850   Median :0.00595   Median :0.005800  
##  Mean   :0.008222   Mean   :0.00782   Mean   :0.007949  
##  3rd Qu.:0.010575   3rd Qu.:0.01043   3rd Qu.:0.010350  
##  Max.   :0.039400   Max.   :0.03550   Max.   :0.044000  
##       V59                V60          
##  Min.   :0.000100   Min.   :0.000600  
##  1st Qu.:0.003675   1st Qu.:0.003100  
##  Median :0.006400   Median :0.005300  
##  Mean   :0.007941   Mean   :0.006507  
##  3rd Qu.:0.010325   3rd Qu.:0.008525  
##  Max.   :0.036400   Max.   :0.043900

# Standardise every column (centre, then divide by the standard deviation)
# and keep the result as a data frame.
df <- as.data.frame(scale(Sonar, center = TRUE, scale = TRUE))
str(df)

## 'data.frame':    208 obs. of  60 variables:
##  $ V1 : num  -0.399 0.702 -0.129 -0.834 2.046 ...
##  $ V2 : num  -0.0406 0.4206 0.5996 -0.6473 0.8545 ...
##  $ V3 : num  -0.0269 1.0531 1.7193 0.4806 0.1111 ...
##  $ V4 : num  -0.713 0.323 1.169 -0.718 -0.311 ...
##  $ V5 : num  0.364 0.776 0.4 -0.985 -0.292 ...
##  $ V6 : num  -0.101 2.601 2.088 -1.147 -0.671 ...
##  $ V7 : num  0.5204 1.519 1.964 -0.1933 -0.0137 ...
##  $ V8 : num  0.2971 2.5049 2.8455 -0.0845 1.3141 ...
##  $ V9 : num  1.123 1.315 3.225 -0.998 1.507 ...
##  $ V10: num  0.0211 0.5873 3.0587 -0.609 1.768 ...
##  $ V11: num  -0.566 1.927 2.994 -1.115 1.35 ...
##  $ V12: num  -0.657 2.891 3.254 -0.364 1.035 ...
##  $ V13: num  -0.351 2.97 1.994 -1.808 1.08 ...
##  $ V14: num  -1.411 2.937 1.431 -0.428 0.711 ...
##  $ V15: num  -1.237 2.075 1.595 -0.717 0.645 ...
##  $ V16: num  -0.65 2.432 1.352 -0.711 0.662 ...
##  $ V17: num  -0.402 2.215 0.986 -1.315 1.193 ...
##  $ V18: num  -0.583 1.664 1.158 -0.857 0.638 ...
##  $ V19: num  0.0116 1.1535 1.5043 -0.383 -1.1691 ...
##  $ V20: num  -0.317 0.833 1.138 -0.631 -0.379 ...
##  $ V21: num  -0.119 -0.341 0.731 -1.299 -0.753 ...
##  $ V22: num  -0.458 -0.856 0.193 -0.998 -0.762 ...
##  $ V23: num  -0.856 -1.004 -0.87 -0.365 -0.296 ...
##  $ V24: num  -0.492 -1.176 -1.287 -0.786 -0.555 ...
##  $ V25: num  -0.0177 -1.4307 -0.5811 -1.4756 -1.4671 ...
##  $ V26: num  -0.246 -1.601 -1.933 -0.702 -1.987 ...
##  $ V27: num  0.0336 -1.5267 -0.7944 -0.7187 -0.0108 ...
##  $ V28: num  0.481 -1.759 0.672 -1.864 1.29 ...
##  $ V29: num  0.154 -0.832 -0.16 -1.803 0.35 ...
##  $ V30: num  -0.884 -1.713 1.225 -1.019 -0.492 ...
##  $ V31: num  -1.7467 -0.5873 1.6203 0.5679 0.0272 ...
##  $ V32: num  -0.838 -0.677 0.307 1.383 0.501 ...
##  $ V33: num  0.459 -1.06 -1.119 0.943 -0.625 ...
##  $ V34: num  1.52 -0.731 -0.572 -0.232 -1.32 ...
##  $ V35: num  1.7795 -1.011 0.1182 0.0105 -0.762 ...
##  $ V36: num  1.764 0.126 -0.305 -0.317 0.126 ...
##  $ V37: num  1.2729 0.0821 1.0328 0.7377 0.4026 ...
##  $ V38: num  1.268 -1.0985 1.5774 2.5437 -0.0843 ...
##  $ V39: num  0.846 -0.712 1.063 3.315 -0.216 ...
##  $ V40: num  -0.206 -0.639 0.899 3.389 -0.382 ...
##  $ V41: num  -1.392 -0.712 1.025 1.887 -0.534 ...
##  $ V42: num  0.0303 -1.3038 -0.1161 1.3175 -0.2009 ...
##  $ V43: num  0.259 -0.766 -0.242 0.536 -0.445 ...
##  $ V44: num  1.587 -0.385 0.061 0.796 -0.975 ...
##  $ V45: num  0.441 -0.8912 0.0915 1.5318 -0.8444 ...
##  $ V46: num  -0.164 -1.048 -1.068 1.529 -0.805 ...
##  $ V47: num  -0.2 -0.799 0.142 1.645 -0.998 ...
##  $ V48: num  0.687 -0.276 -0.273 1.06 -1.329 ...
##  $ V49: num  -0.379 -0.307 -1.083 0.45 -0.805 ...
##  $ V50: num  0.876 -1.048 -0.719 0.657 -1.158 ...
##  $ V51: num  0.594 -0.297 -1.063 0.669 -0.039 ...
##  $ V52: num  -1.113 -0.521 1.015 -0.137 -1.071 ...
##  $ V53: num  -0.596 -0.256 0.834 -1.007 -0.752 ...
##  $ V54: num  0.6793 -0.8411 -0.1974 0.556 -0.0604 ...
##  $ V55: num  -0.2949 0.0155 1.2288 -0.1115 0.2412 ...
##  $ V56: num  1.478 1.896 2.82 -0.161 -1.172 ...
##  $ V57: num  1.76 1.068 4.11 -0.487 -0.107 ...
##  $ V58: num  0.0697 -0.4713 1.3062 -0.5486 -0.4867 ...
##  $ V59: num  0.171 -0.443 0.252 -0.638 0.446 ...
##  $ V60: num  -0.657 -0.419 0.257 1.032 0.575 ...

# Score complete-linkage clusterings of the scaled sonar data for 5 to 10
# clusters, using every index NbClust provides.
numComplete <- NbClust(
  data = df,
  distance = "euclidean",
  method = "complete",
  min.nc = 5,
  max.nc = 10,
  index = "all"
)

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
##

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 4 proposed 5 as the best number of clusters 
## * 2 proposed 6 as the best number of clusters 
## * 11 proposed 8 as the best number of clusters 
## * 6 proposed 10 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  8 
##  
##  
## *******************************************************************

# Number of clusters preferred by each index, with the index value.
numComplete[["Best.nc"]]

##                    KL      CH Hartigan    CCC    Scott       Marriot
## Number_clusters 8.000  5.0000   8.0000  8.000   8.0000  6.000000e+00
## Value_Index     4.344 20.2751   7.7286 -5.804 373.6376 3.433609e+100
##                   TrCovW   TraceW Friedman   Rubin Cindex      DB
## Number_clusters    8.000   8.0000   8.0000  8.0000  5.000 10.0000
## Value_Index     7130.026 304.5389  38.5919 -0.0561  0.454  1.4647
##                 Silhouette   Duda PseudoT2   Beale Ratkowsky     Ball
## Number_clusters     10.000 8.0000   8.0000  8.0000    5.0000   6.0000
## Value_Index          0.141 1.1488  -0.1295 -2.7705    0.2317 354.4423
##                 PtBiserial Frey McClain    Dunn Hubert SDindex Dindex
## Number_clusters    10.0000    4  5.0000 10.0000      0   10.00      0
## Value_Index         0.5436   NA  1.2119  0.2721      0    0.49      0
##                    SDbw
## Number_clusters 10.0000
## Value_Index      0.6152

# Complete-linkage hierarchical clustering on Euclidean distances.
dis <- dist(df, method = "euclidean")
hc <- hclust(dis, method = "complete")
plot(hc, labels = FALSE, hang = -1, main = "Complete-Linkage")

# Cut the tree into 8 clusters and draw the dendrogram coloured by them.
comp8 <- cutree(hc, k = 8)
ColorDendrogram(hc, y = comp8, main = "Complete", branchlength = 50)

# Cluster sizes.
table(comp8)

## comp8
##   1   2   3   4   5   6   7   8 
## 111  50   5  18   6   9   6   3

# Glass Identification Database: load it and report its dimensions.
data("Glass", package = "mlbench")
dim(Glass)

## [1] 214  10

# Distinct glass types recorded in the Type factor.
levels(Glass[["Type"]])

## [1] "1" "2" "3" "5" "6" "7"

# Preview the first six observations.
head(Glass, n = 6L)

##        RI    Na   Mg   Al    Si    K   Ca Ba   Fe Type
## 1 1.52101 13.64 4.49 1.10 71.78 0.06 8.75  0 0.00    1
## 2 1.51761 13.89 3.60 1.36 72.73 0.48 7.83  0 0.00    1
## 3 1.51618 13.53 3.55 1.54 72.99 0.39 7.78  0 0.00    1
## 4 1.51766 13.21 3.69 1.29 72.61 0.57 8.22  0 0.00    1
## 5 1.51742 13.27 3.62 1.24 73.08 0.55 8.07  0 0.00    1
## 6 1.51596 12.79 3.61 1.62 72.97 0.64 8.07  0 0.26    1

# Column types and a few leading values.
str(object = Glass)

## 'data.frame':    214 obs. of  10 variables:
##  $ RI  : num  1.52 1.52 1.52 1.52 1.52 ...
##  $ Na  : num  13.6 13.9 13.5 13.2 13.3 ...
##  $ Mg  : num  4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
##  $ Al  : num  1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
##  $ Si  : num  71.8 72.7 73 72.6 73.1 ...
##  $ K   : num  0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
##  $ Ca  : num  8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
##  $ Ba  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Fe  : num  0 0 0 0 0 0.26 0 0 0 0.11 ...
##  $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...

# Standardise the numeric measurements; column 10 (Type) is left out.
df2 <- as.data.frame(scale(Glass[, -10], center = TRUE, scale = TRUE))
str(df2)

## 'data.frame':    214 obs. of  9 variables:
##  $ RI: num  0.871 -0.249 -0.72 -0.232 -0.311 ...
##  $ Na: num  0.284 0.59 0.15 -0.242 -0.169 ...
##  $ Mg: num  1.252 0.635 0.6 0.697 0.649 ...
##  $ Al: num  -0.691 -0.17 0.19 -0.31 -0.41 ...
##  $ Si: num  -1.1244 0.1021 0.4378 -0.0528 0.554 ...
##  $ K : num  -0.6701 -0.0262 -0.1641 0.1118 0.0812 ...
##  $ Ca: num  -0.145 -0.792 -0.827 -0.518 -0.623 ...
##  $ Ba: num  -0.352 -0.352 -0.352 -0.352 -0.352 ...
##  $ Fe: num  -0.585 -0.585 -0.585 -0.585 -0.585 ...

# Number of observations per glass type.
table(Glass[["Type"]])

## 
##  1  2  3  5  6  7 
## 70 76 17 13  9 29

# Hierarchical clustering: score complete-linkage solutions with 2 to 6
# clusters on the scaled glass data, using every index NbClust provides.
numComplete <- NbClust(
  data = df2,
  distance = "euclidean",
  method = "complete",
  min.nc = 2,
  max.nc = 6,
  index = "all"
)

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
##

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 6 proposed 2 as the best number of clusters 
## * 9 proposed 3 as the best number of clusters 
## * 2 proposed 4 as the best number of clusters 
## * 1 proposed 5 as the best number of clusters 
## * 5 proposed 6 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************

# Number of clusters preferred by each index, with the index value.
numComplete[["Best.nc"]]

##                     KL      CH Hartigan     CCC   Scott      Marriot
## Number_clusters 3.0000  3.0000   3.0000  2.0000   4.000 4.000000e+00
## Value_Index     6.1495 33.0364  15.3975 -6.9247 301.587 9.787967e+16
##                   TrCovW   TraceW Friedman   Rubin Cindex     DB
## Number_clusters     3.00   3.0000   3.0000  3.0000 2.0000 6.0000
## Value_Index     13673.73 113.0336  60.6448 -0.0703 0.2421 0.8198
##                 Silhouette   Duda PseudoT2  Beale Ratkowsky     Ball
## Number_clusters     5.0000 2.0000   2.0000 2.0000    3.0000   3.0000
## Value_Index         0.4095 0.8541  33.4887 1.0208    0.2641 350.3589
##                 PtBiserial Frey McClain   Dunn Hubert SDindex Dindex
## Number_clusters     6.0000    1   2.000 6.0000      0  6.0000      0
## Value_Index         0.5954   NA   0.096 0.1573      0  0.8283      0
##                   SDbw
## Number_clusters 6.0000
## Value_Index     0.5434

# Complete-linkage hierarchical clustering on Euclidean distances.
dis <- dist(df2, method = "euclidean")
hc <- hclust(dis, method = "complete")
plot(hc, labels = FALSE, hang = -1, main = "Complete-Linkage")

# Cut the tree into 3 clusters and draw the dendrogram coloured by them.
comp3 <- cutree(hc, k = 3)
ColorDendrogram(hc, y = comp3, main = "Complete", branchlength = 30)

# Cluster sizes.
table(comp3)

## comp3
##   1   2   3 
## 192  16   6

# Cross-tabulate the complete-linkage clusters (rows) against Type (columns).
table(comp3, Glass[["Type"]])

##      
## comp3  1  2  3  5  6  7
##     1 70 64 17  6  9 26
##     2  0 12  0  4  0  0
##     3  0  0  0  3  0  3

# Repeat the cluster-number search with ward.D2 linkage. The result is not
# assigned, so the whole returned object is printed.
NbClust(
  data = df2,
  diss = NULL,
  distance = "euclidean",
  method = "ward.D2",
  min.nc = 2,
  max.nc = 6,
  index = "all"
)

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
##

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 3 proposed 2 as the best number of clusters 
## * 3 proposed 3 as the best number of clusters 
## * 8 proposed 4 as the best number of clusters 
## * 1 proposed 5 as the best number of clusters 
## * 6 proposed 6 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  4 
##  
##  
## *******************************************************************

## $All.index
##       KL      CH Hartigan     CCC     Scott      Marriot   TrCovW
## 2 1.1170 54.1540  36.9793 -0.9405  375.2761 9.566690e+16 55386.41
## 3 0.7489 50.0525  38.3118 -2.1169  544.6034 9.756835e+16 36071.60
## 4 1.1922 51.9503  32.7478  0.3654  928.3092 2.887279e+16 27154.77
## 5 0.9506 52.9722  34.2352  3.1077 1216.6658 1.172503e+16 22534.94
## 6 2.0614 55.8977  20.2704  7.4254 1357.4165 8.746518e+15 15016.14
##      TraceW Friedman  Rubin Cindex     DB Silhouette   Duda Pseudot2
## 2 1526.9503 143.1121 1.2554 0.2195 1.5776     0.4499 0.7967  46.7061
## 3 1300.1623 295.6671 1.4744 0.1954 1.3824     0.4522 0.5146  25.4717
## 4 1100.3660 345.5944 1.7421 0.2253 0.9734     0.4523 0.8050  41.9052
## 5  951.9214 372.3038 2.0138 0.2024 1.2483     0.2590 0.7049  51.0711
## 6  817.9390 411.5637 2.3437 0.1772 1.1955     0.2822 0.7056  38.3794
##    Beale Ratkowsky     Ball Ptbiserial    Frey McClain   Dunn Hubert
## 2 1.5243    0.2694 763.4752     0.5201 -0.2385  0.1638 0.1457 0.0016
## 3 5.4627    0.2961 433.3874     0.6562 -1.3430  0.2119 0.1105 0.0018
## 4 1.4462    0.3098 275.0915     0.6714  4.7399  0.2107 0.1287 0.0018
## 5 2.4933    0.3108 190.3843     0.4812  0.8382  0.7687 0.0673 0.0018
## 6 2.4781    0.3075 136.3232     0.4516  0.2068  1.2365 0.0361 0.0019
##   SDindex Dindex   SDbw
## 2  2.2887 2.2087 1.5473
## 3  2.2862 2.0601 1.5639
## 4  1.4944 1.9451 0.8433
## 5  1.9771 1.7571 0.7711
## 6  1.9133 1.5899 0.7484
## 
## $All.CriticalValues
##   CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2         0.8235            39.2304       0.1338
## 3         0.6621            13.7821       0.0000
## 4         0.8205            37.8498       0.1630
## 5         0.8000            30.4920       0.0080
## 6         0.7808            25.8208       0.0086
## 
## $Best.nc
##                     KL      CH Hartigan    CCC    Scott     Marriot
## Number_clusters 6.0000  6.0000   6.0000 6.0000   4.0000 4.00000e+00
## Value_Index     2.0614 55.8977  13.9648 7.4254 383.7058 5.15478e+16
##                   TrCovW  TraceW Friedman Rubin Cindex     DB Silhouette
## Number_clusters     3.00  4.0000   3.0000 4.000 6.0000 4.0000     4.0000
## Value_Index     19314.81 51.3517 152.5551 0.004 0.1772 0.9734     0.4523
##                 Duda PseudoT2  Beale Ratkowsky     Ball PtBiserial Frey
## Number_clusters   NA       NA 2.0000    5.0000   3.0000     4.0000    1
## Value_Index       NA       NA 1.5243    0.3108 330.0877     0.6714   NA
##                 McClain   Dunn Hubert SDindex Dindex   SDbw
## Number_clusters  2.0000 2.0000      0  4.0000      0 6.0000
## Value_Index      0.1638 0.1457      0  1.4944      0 0.7484
## 
## $Best.partition
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   2   2 
## 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 
##   2   1   2   2   2   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 
##   1   1   1   1   2   2   1   1   1   1   1   1   1   1   1   1   1   1 
## 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 
##   1   3   1   1   1   1   1   1   2   4   4   2   1   1   1   1   1   1 
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 
##   1   1   1   1   3   3   3   1   1   3   1   3   3   3   3   3   3   3 
## 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 
##   3   3   3   1   3   3   3   3   3   3   3   3   3   3   3   3

# Ward (ward.D2) linkage on the existing distance object `dis`.
hcWard <- hclust(dis, method = "ward.D2")
plot(hcWard, labels = FALSE, main = "Ward's-Linkage")

# Cut into 4 clusters and cross-tabulate them against Type.
ward4 <- cutree(hcWard, k = 4)
table(ward4, Glass[["Type"]])

##      
## ward4  1  2  3  5  6  7
##     1 70 68 17  8  8  4
##     2  0  8  0  2  0  0
##     3  0  0  0  1  1 25
##     4  0  0  0  2  0  0

# Compare the two partitions: complete-linkage rows vs. Ward columns.
table(comp3, ward4, deparse.level = 1)

##      ward4
## comp3   1   2   3   4
##     1 169   0  23   0
##     2   6  10   0   0
##     3   0   0   4   2

# Mean of each measurement (Type excluded) within the complete-linkage clusters.
aggregate(x = Glass[, -10], by = list(comp3), FUN = mean)

##   Group.1       RI       Na       Mg       Al       Si         K        Ca
## 1       1 1.517975 13.44609 2.899375 1.414167 72.74359 0.4296875  8.754062
## 2       2 1.524340 12.82500 0.541875 1.392500 72.03375 0.3212500 12.410000
## 3       3 1.514918 13.73833 1.523333 2.568333 71.33167 3.1216667  6.241667
##          Ba         Fe
## 1 0.1348958 0.05130208
## 2 0.2287500 0.14687500
## 3 1.3166667 0.00000000

# Mean of each measurement (Type excluded) within the Ward clusters.
aggregate(x = Glass[, -10], by = list(ward4), FUN = mean)

##   Group.1       RI       Na        Mg       Al       Si         K
## 1       1 1.518276 13.26034 3.2143429 1.325886 72.66789 0.4849714
## 2       2 1.526123 12.66200 0.1010000 1.199000 71.88500 0.2030000
## 3       3 1.516459 14.66963 0.4062963 2.190000 72.97741 0.2611111
## 4       4 1.513185 13.01000 0.0000000 3.030000 70.59000 6.2100000
##          Ca         Ba         Fe
## 1  8.834000 0.01548571 0.06348571
## 2 13.437000 0.31500000 0.07000000
## 3  8.243704 1.17037037 0.01444444
## 4  6.945000 0.00000000 0.00000000

# Draw the two RI boxplots side by side. par() returns the previous settings;
# they are restored at the end so later plots are not forced into the 1 x 2
# layout.
old_par <- par(mfrow = c(1, 2))

# Store both cluster assignments on Glass so the formula interface can use
# them.
# NOTE(review): Glass gains two columns here, so positional subsets such as
# Glass[, -10] evaluated after this point will include them.
Glass$comp_cluster <- comp3
Glass$ward_cluster <- ward4
boxplot(RI ~ comp_cluster, data = Glass, main = "RI by Complete Linkage")
boxplot(RI ~ ward_cluster, data = Glass, main = "RI by Ward's Linkage")

# Put the graphics parameters back the way they were.
par(old_par)

# K-means clustering: evaluate NbClust's validity indices on df2 for two to
# four clusters. The call is left unassigned so the result is auto-printed.
NbClust(data = df2, min.nc = 2, max.nc = 4, method = "kmeans")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
##

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 3 proposed 2 as the best number of clusters 
## * 10 proposed 3 as the best number of clusters 
## * 10 proposed 4 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************

## $All.index
##       KL      CH Hartigan     CCC    Scott      Marriot   TrCovW   TraceW
## 2 0.6241 42.5981  31.4956 -3.8304 241.7117 1.785743e+17 55872.41 1596.257
## 3 0.3291 40.0215  53.8473 -5.5533 506.3398 1.166707e+17 45133.01 1389.785
## 4 2.0350 51.1953  31.7592  0.0528 853.9172 4.087528e+16 28322.72 1107.221
##   Friedman  Rubin Cindex     DB Silhouette   Duda Pseudot2   Beale
## 2 124.4273 1.2009 0.2278 1.9306     0.3033 1.1084  -4.5004 -0.5741
## 3 176.0639 1.3794 0.2608 1.4944     0.3227 6.0797 -53.4732 -2.5086
## 4 294.5052 1.7314 0.1928 1.5092     0.2804 2.8459 -50.5917 -3.7867
##   Ratkowsky     Ball Ptbiserial    Frey McClain   Dunn Hubert SDindex
## 2    0.2234 798.1285     0.3338 -1.3753  0.3362 0.0584 0.0011  1.4670
## 3    0.2527 463.2615     0.4520  0.9951  0.3243 0.0718 0.0017  2.2238
## 4    0.3156 276.8053     0.4453  0.5656  1.0597 0.0504 0.0014  1.3994
##   Dindex   SDbw
## 2 2.2814 1.0414
## 3 2.1995 1.9700
## 4 1.7433 0.8216
## 
## $All.CriticalValues
##   CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2         0.7148            18.3576            1
## 3         0.0985           585.8822            1
## 4         0.6927            34.6028            1
## 
## $Best.nc
##                    KL      CH Hartigan    CCC    Scott      Marriot
## Number_clusters 4.000  4.0000   3.0000 4.0000   4.0000  3.00000e+00
## Value_Index     2.035 51.1953  22.3517 0.0528 347.5775 -1.38918e+16
##                   TrCovW  TraceW Friedman  Rubin Cindex     DB Silhouette
## Number_clusters     4.00   3.000   4.0000 3.0000 4.0000 3.0000     3.0000
## Value_Index     16810.29 -76.091 118.4413 0.1736 0.1928 1.4944     0.3227
##                   Duda PseudoT2   Beale Ratkowsky    Ball PtBiserial Frey
## Number_clusters 2.0000   2.0000  2.0000    4.0000   3.000      3.000    1
## Value_Index     1.1084  -4.5004 -0.5741    0.3156 334.867      0.452   NA
##                 McClain   Dunn Hubert SDindex Dindex   SDbw
## Number_clusters  3.0000 3.0000      0  4.0000      0 4.0000
## Value_Index      0.3243 0.0718      0  1.3994      0 0.8216
## 
## $Best.partition
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
##   3   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   3 
##  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36 
##   2   2   2   3   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
##  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54 
##   2   2   3   3   2   2   2   3   2   2   2   3   3   2   3   2   2   2 
##  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72 
##   2   2   2   2   2   2   2   2   3   3   3   3   3   3   3   3   2   2 
##  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
##  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   3   3   3   3   3 
## 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 
##   3   2   3   3   3   2   2   2   2   2   2   2   2   2   2   2   2   2 
## 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 
##   2   3   3   3   3   3   2   2   2   2   2   2   2   2   2   2   2   2 
## 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 
##   2   2   2   2   2   2   2   3   2   2   2   2   2   3   2   2   2   2 
## 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 
##   3   1   2   3   3   2   2   3   3   1   1   3   3   3   2   2   2   2 
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 
##   2   2   2   3   2   2   2   3   3   3   2   2   2   2   2   2   2   2 
## 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2

# Seed the RNG so the random starting centres are reproducible, then fit
# k-means with 3 centres using 25 random starts and show the cluster sizes.
set.seed(123)
km <- kmeans(x = df2, centers = 3, nstart = 25)
table(km$cluster)

## 
##   1   2   3 
##  45  32 137

# Print the fitted cluster centres (one row per cluster, one column per
# variable of df2).
km[["centers"]]

##           RI          Na         Mg         Al         Si           K
## 1  1.4606888 -0.09043613 -0.3972366 -0.6970535 -0.7718375 -0.38937157
## 2 -0.6459528  1.38028977 -1.5680171  1.5380586  0.2049626  0.17078016
## 3 -0.3289088 -0.29269815  0.4967313 -0.1302954  0.2056488  0.08800552
##           Ca         Ba          Fe
## 1  1.3701447 -0.1884750  0.25647565
## 2 -0.4535000  1.6817596 -0.46000045
## 3 -0.3441205 -0.3309119  0.02320153

# Attach the k-means assignment to the data frame so it can be used as the
# grouping variable in the boxplot formula.
Glass$km_cluster <- km$cluster

# Compare Na across clusters for k-means and Ward's linkage side by side.
# The 1 x 2 layout is requested here explicitly rather than relying on a
# par() call made much earlier in the script, which is lost whenever the
# graphics state is reset in between. The previous settings are restored
# once both panels are drawn.
old_par <- par(mfrow = c(1, 2))
boxplot(Na ~ km_cluster, data = Glass, main = "Na Content, K-Means")
boxplot(Na ~ ward_cluster, data = Glass, main = "Na Content, Ward's")
par(old_par)

# Cross-tabulate the k-means cluster assignments against the Type column of
# Glass.
table(km$cluster, Glass$Type)

##    
##      1  2  3  5  6  7
##   1 16 15  3  8  1  2
##   2  0  0  0  3  3 26
##   3 54 61 14  2  5  1

# Clustering with mixed data
# Add two factor columns to df2: the glass Type, and `sandy`, which is
# "High" where Si is greater than zero and "Low" otherwise. Then inspect
# the resulting structure.
df2[["Type"]] <- as.factor(Glass$Type)
df2[["sandy"]] <- as.factor(ifelse(df2$Si > 0, "High", "Low"))
str(df2)

## 'data.frame':    214 obs. of  11 variables:
##  $ RI   : num  0.871 -0.249 -0.72 -0.232 -0.311 ...
##  $ Na   : num  0.284 0.59 0.15 -0.242 -0.169 ...
##  $ Mg   : num  1.252 0.635 0.6 0.697 0.649 ...
##  $ Al   : num  -0.691 -0.17 0.19 -0.31 -0.41 ...
##  $ Si   : num  -1.1244 0.1021 0.4378 -0.0528 0.554 ...
##  $ K    : num  -0.6701 -0.0262 -0.1641 0.1118 0.0812 ...
##  $ Ca   : num  -0.145 -0.792 -0.827 -0.518 -0.623 ...
##  $ Ba   : num  -0.352 -0.352 -0.352 -0.352 -0.352 ...
##  $ Fe   : num  -0.585 -0.585 -0.585 -0.585 -0.585 ...
##  $ Type : Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ sandy: Factor w/ 2 levels "High","Low": 2 1 1 2 1 1 1 1 2 1 ...

# Gower dissimilarities over all columns of df2 (numeric and factor).
disMat <- daisy(x = df2, metric = "gower")

# Partition around medoids into 3 clusters from the precomputed
# dissimilarities, then show the cluster sizes.
set.seed(123)
pamFit <- pam(x = disMat, k = 3)
table(pamFit$clustering)

## 
##  1  2  3 
## 84 94 36

# Cross-tabulate the PAM cluster assignments against the Type column of
# Glass.
table(pamFit$clustering, Glass$Type)

##    
##      1  2  3  5  6  7
##   1 26 35 10  6  3  4
##   2 44 37  7  3  3  0
##   3  0  4  0  4  3 25

# Record the PAM assignment on df2, describe every other column by cluster
# with compareGroups, and print the resulting summary table.
df2[["cluster"]] <- pamFit$clustering
group <- compareGroups(cluster ~ ., data = df2)
clustab <- createTable(group)
clustab

## 
## --------Summary descriptives table by 'cluster'---------
## 
## _________________________________________________________ 
##               1            2            3       p.overall 
##              N=84         N=94         N=36               
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## RI       0.58 (1.15)  -0.41 (0.42) -0.28 (1.07)  <0.001   
## Na       0.19 (0.84)  -0.50 (0.56) 0.88 (1.42)   <0.001   
## Mg       0.23 (0.86)  0.47 (0.30)  -1.76 (0.42)  <0.001   
## Al       -0.21 (1.09) -0.14 (0.45) 0.86 (1.34)   <0.001   
## Si       -0.91 (0.85) 0.47 (0.36)  0.91 (0.77)   <0.001   
## K        0.09 (1.44)  0.08 (0.23)  -0.41 (0.89)   0.027   
## Ca       0.23 (1.17)  -0.33 (0.43) 0.31 (1.34)   <0.001   
## Ba       -0.11 (0.98) -0.34 (0.05) 1.15 (1.45)   <0.001   
## Fe       0.18 (1.17)  -0.02 (0.94) -0.37 (0.55)   0.020   
## Type:                                            <0.001   
##     1     26 (31.0%)   44 (46.8%)   0 (0.00%)             
##     2     35 (41.7%)   37 (39.4%)   4 (11.1%)             
##     3     10 (11.9%)   7 (7.45%)    0 (0.00%)             
##     5     6 (7.14%)    3 (3.19%)    4 (11.1%)             
##     6     3 (3.57%)    3 (3.19%)    3 (8.33%)             
##     7     4 (4.76%)    0 (0.00%)    25 (69.4%)            
## sandy:                                           <0.001   
##     High  1 (1.19%)    93 (98.9%)   35 (97.2%)            
##     Low   83 (98.8%)   1 (1.06%)    1 (2.78%)             
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯

# Write the descriptive table to "Glass_clusters.csv"; the path is relative,
# so the file lands in the current working directory.
export2csv(clustab,file="Glass_clusters.csv")

Cluster Analysis

Kushan De Silva

August 5, 2017