Clustering

In order to cluster a data set, one should look carefully at the statistical properties of the data so a suitable method is chosen for the purpose. Here we have faithful data:

library("ggplot2")
library("dplyr")
theme_set(theme_classic())
options(scipen = 999)

h <- faithful %>%
    ggplot(aes(x = waiting)) +
    geom_histogram(aes(waiting, ..density..), binwidth = 1, colour = "black", fill = "white")
h

Histogram of the data

d <- ggplot(faithful, aes(x = waiting)) +
  geom_density() +
  geom_vline(xintercept = 53, col = "red", size = 2) + 
  geom_vline(xintercept = 80, col = "blue", size = 2)
d

Clustering the data using normalmixEM() with 2 clusters

library("mixtools")
plot_mix_comps <- function(x, mu, sigma, lam) {
  lam * dnorm(x, mu, sigma)
}

wait <- faithful$waiting
mixmdl <- normalmixEM(wait, k = 2)
## number of iterations= 18
mixmdl
## $x
##   [1] 79 54 74 62 85 55 88 85 51 85 54 84 78 47 83 52 62 84 52 79 51 47 78 69 74
##  [26] 83 55 76 78 79 73 77 66 80 74 52 48 80 59 90 80 58 84 58 73 83 64 53 82 59
##  [51] 75 90 54 80 54 83 71 64 77 81 59 84 48 82 60 92 78 78 65 73 82 56 79 71 62
##  [76] 76 60 78 76 83 75 82 70 65 73 88 76 80 48 86 60 90 50 78 63 72 84 75 51 82
## [101] 62 88 49 83 81 47 84 52 86 81 75 59 89 79 59 81 50 85 59 87 53 69 77 56 88
## [126] 81 45 82 55 90 45 83 56 89 46 82 51 86 53 79 81 60 82 77 76 59 80 49 96 53
## [151] 77 77 65 81 71 70 81 93 53 89 45 86 58 78 66 76 63 88 52 93 49 57 77 68 81
## [176] 81 73 50 85 74 55 77 83 83 51 78 84 46 83 55 81 57 76 84 77 81 87 77 51 78
## [201] 60 82 91 53 78 46 77 84 49 83 71 80 49 75 64 76 53 94 55 76 50 82 54 75 78
## [226] 79 78 78 70 79 70 54 86 50 90 54 54 77 79 64 75 47 86 63 85 82 57 82 67 74
## [251] 54 83 73 73 88 80 71 83 56 79 78 84 58 83 43 60 75 81 46 90 46 74
## 
## $lambda
## [1] 0.3608851 0.6391149
## 
## $mu
## [1] 54.61482 80.09104
## 
## $sigma
## [1] 5.871192 5.867755
## 
## $loglik
## [1] -1034.002
## 
## $posterior
##                    comp.1           comp.2
##   [1,] 0.0001030668945107 0.99989693310549
##   [2,] 0.9999093255335648 0.00009067446644
##   [3,] 0.0041351226521375 0.99586487734786
##   [4,] 0.9673784477343398 0.03262155226566
##   [5,] 0.0000012232123967 0.99999877678760
##   [6,] 0.9998099834107058 0.00019001658929
##   [7,] 0.0000001333125130 0.99999986668749
##   [8,] 0.0000012232123967 0.99999877678760
##   [9,] 0.9999901512177107 0.00000984878229
##  [10,] 0.0000012232123967 0.99999877678760
##  [11,] 0.9999093255335648 0.00009067446644
##  [12,] 0.0000025609905097 0.99999743900949
##  [13,] 0.0002158069830015 0.99978419301700
##  [14,] 0.9999994898908655 0.00000051010913
##  [15,] 0.0000053620171042 0.99999463798290
##  [16,] 0.9999793570477835 0.00002064295222
##  [17,] 0.9673784477343398 0.03262155226566
##  [18,] 0.0000025609905097 0.99999743900949
##  [19,] 0.9999793570477835 0.00002064295222
##  [20,] 0.0001030668945107 0.99989693310549
##  [21,] 0.9999901512177107 0.00000984878229
##  [22,] 0.9999994898908655 0.00000051010913
##  [23,] 0.0002158069830015 0.99978419301700
##  [24,] 0.1434028502569841 0.85659714974302
##  [25,] 0.0041351226521375 0.99586487734786
##  [26,] 0.0000053620171042 0.99999463798290
##  [27,] 0.9998099834107058 0.00019001658929
##  [28,] 0.0009457647791660 0.99905423522083
##  [29,] 0.0002158069830015 0.99978419301700
##  [30,] 0.0001030668945107 0.99989693310549
##  [31,] 0.0086217705543211 0.99137822944568
##  [32,] 0.0004518278488303 0.99954817215117
##  [33,] 0.6061527480036107 0.39384725199639
##  [34,] 0.0000492223189208 0.99995077768108
##  [35,] 0.0041351226521375 0.99586487734786
##  [36,] 0.9999793570477835 0.00002064295222
##  [37,] 0.9999989306598338 0.00000106934017
##  [38,] 0.0000492223189208 0.99995077768108
##  [39,] 0.9963479555021389 0.00365204449786
##  [40,] 0.0000000304221919 0.99999996957781
##  [41,] 0.0000492223189208 0.99995077768108
##  [42,] 0.9982538745754911 0.00174612542451
##  [43,] 0.0000025609905097 0.99999743900949
##  [44,] 0.9982538745754911 0.00174612542451
##  [45,] 0.0086217705543211 0.99137822944568
##  [46,] 0.0000053620171042 0.99999463798290
##  [47,] 0.8710556253628252 0.12894437463717
##  [48,] 0.9999567345551037 0.00004326544490
##  [49,] 0.0000112269517542 0.99998877304825
##  [50,] 0.9963479555021389 0.00365204449786
##  [51,] 0.0019786704397116 0.99802132956029
##  [52,] 0.0000000304221919 0.99999996957781
##  [53,] 0.9999093255335648 0.00009067446644
##  [54,] 0.0000492223189208 0.99995077768108
##  [55,] 0.9999093255335648 0.00009067446644
##  [56,] 0.0000053620171042 0.99999463798290
##  [57,] 0.0367517331895390 0.96324826681046
##  [58,] 0.8710556253628252 0.12894437463717
##  [59,] 0.0004518278488303 0.99954817215117
##  [60,] 0.0000235075569790 0.99997649244302
##  [61,] 0.9963479555021389 0.00365204449786
##  [62,] 0.0000025609905097 0.99999743900949
##  [63,] 0.9999989306598338 0.00000106934017
##  [64,] 0.0000112269517542 0.99998877304825
##  [65,] 0.9923778425714656 0.00762215742853
##  [66,] 0.0000000069433510 0.99999999305665
##  [67,] 0.0002158069830015 0.99978419301700
##  [68,] 0.0002158069830015 0.99978419301700
##  [69,] 0.7632769048350551 0.23672309516494
##  [70,] 0.0086217705543211 0.99137822944568
##  [71,] 0.0000112269517542 0.99998877304825
##  [72,] 0.9996018598292480 0.00039814017075
##  [73,] 0.0001030668945107 0.99989693310549
##  [74,] 0.0367517331895390 0.96324826681046
##  [75,] 0.9673784477343398 0.03262155226566
##  [76,] 0.0009457647791660 0.99905423522083
##  [77,] 0.9923778425714656 0.00762215742853
##  [78,] 0.0002158069830015 0.99978419301700
##  [79,] 0.0009457647791660 0.99905423522083
##  [80,] 0.0000053620171042 0.99999463798290
##  [81,] 0.0019786704397116 0.99802132956029
##  [82,] 0.0000112269517542 0.99998877304825
##  [83,] 0.0740050898190411 0.92599491018096
##  [84,] 0.7632769048350551 0.23672309516494
##  [85,] 0.0086217705543211 0.99137822944568
##  [86,] 0.0000001333125130 0.99999986668749
##  [87,] 0.0009457647791660 0.99905423522083
##  [88,] 0.0000492223189208 0.99995077768108
##  [89,] 0.9999989306598338 0.00000106934017
##  [90,] 0.0000005842654980 0.99999941573450
##  [91,] 0.9923778425714656 0.00762215742853
##  [92,] 0.0000000304221919 0.99999996957781
##  [93,] 0.9999953013180387 0.00000469868196
##  [94,] 0.0002158069830015 0.99978419301700
##  [95,] 0.9340081346369977 0.06599186536300
##  [96,] 0.0178896256044028 0.98211037439560
##  [97,] 0.0000025609905097 0.99999743900949
##  [98,] 0.0019786704397116 0.99802132956029
##  [99,] 0.9999901512177107 0.00000984878229
## [100,] 0.0000112269517542 0.99998877304825
## [101,] 0.9673784477343398 0.03262155226566
## [102,] 0.0000001333125130 0.99999986668749
## [103,] 0.9999977584231796 0.00000224157682
## [104,] 0.0000053620171042 0.99999463798290
## [105,] 0.0000235075569790 0.99997649244302
## [106,] 0.9999994898908655 0.00000051010913
## [107,] 0.0000025609905097 0.99999743900949
## [108,] 0.9999793570477835 0.00002064295222
## [109,] 0.0000005842654980 0.99999941573450
## [110,] 0.0000235075569790 0.99997649244302
## [111,] 0.0019786704397116 0.99802132956029
## [112,] 0.9963479555021389 0.00365204449786
## [113,] 0.0000000636829735 0.99999993631703
## [114,] 0.0001030668945107 0.99989693310549
## [115,] 0.9963479555021389 0.00365204449786
## [116,] 0.0000235075569790 0.99997649244302
## [117,] 0.9999953013180387 0.00000469868196
## [118,] 0.0000012232123967 0.99999877678760
## [119,] 0.9963479555021389 0.00365204449786
## [120,] 0.0000002790829016 0.99999972091710
## [121,] 0.9999567345551037 0.00004326544490
## [122,] 0.1434028502569841 0.85659714974302
## [123,] 0.0004518278488303 0.99954817215117
## [124,] 0.9996018598292480 0.00039814017075
## [125,] 0.0000001333125130 0.99999986668749
## [126,] 0.0000235075569790 0.99997649244302
## [127,] 0.9999998839318857 0.00000011606811
## [128,] 0.0000112269517542 0.99998877304825
## [129,] 0.9998099834107058 0.00019001658929
## [130,] 0.0000000304221919 0.99999996957781
## [131,] 0.9999998839318857 0.00000011606811
## [132,] 0.0000053620171042 0.99999463798290
## [133,] 0.9996018598292480 0.00039814017075
## [134,] 0.0000000636829735 0.99999993631703
## [135,] 0.9999997566701279 0.00000024332987
## [136,] 0.0000112269517542 0.99998877304825
## [137,] 0.9999901512177107 0.00000984878229
## [138,] 0.0000005842654980 0.99999941573450
## [139,] 0.9999567345551037 0.00004326544490
## [140,] 0.0001030668945107 0.99989693310549
## [141,] 0.0000235075569790 0.99997649244302
## [142,] 0.9923778425714656 0.00762215742853
## [143,] 0.0000112269517542 0.99998877304825
## [144,] 0.0004518278488303 0.99954817215117
## [145,] 0.0009457647791660 0.99905423522083
## [146,] 0.9963479555021389 0.00365204449786
## [147,] 0.0000492223189208 0.99995077768108
## [148,] 0.9999977584231796 0.00000224157682
## [149,] 0.0000000003618291 0.99999999963817
## [150,] 0.9999567345551037 0.00004326544490
## [151,] 0.0004518278488303 0.99954817215117
## [152,] 0.0004518278488303 0.99954817215117
## [153,] 0.7632769048350551 0.23672309516494
## [154,] 0.0000235075569790 0.99997649244302
## [155,] 0.0367517331895390 0.96324826681046
## [156,] 0.0740050898190411 0.92599491018096
## [157,] 0.0000235075569790 0.99997649244302
## [158,] 0.0000000033172681 0.99999999668273
## [159,] 0.9999567345551037 0.00004326544490
## [160,] 0.0000000636829735 0.99999993631703
## [161,] 0.9999998839318857 0.00000011606811
## [162,] 0.0000005842654980 0.99999941573450
## [163,] 0.9982538745754911 0.00174612542451
## [164,] 0.0002158069830015 0.99978419301700
## [165,] 0.6061527480036107 0.39384725199639
## [166,] 0.0009457647791660 0.99905423522083
## [167,] 0.9340081346369977 0.06599186536300
## [168,] 0.0000001333125130 0.99999986668749
## [169,] 0.9999793570477835 0.00002064295222
## [170,] 0.0000000033172681 0.99999999668273
## [171,] 0.9999977584231796 0.00000224157682
## [172,] 0.9991659987204796 0.00083400127952
## [173,] 0.0004518278488303 0.99954817215117
## [174,] 0.2596379810965197 0.74036201890348
## [175,] 0.0000235075569790 0.99997649244302
## [176,] 0.0000235075569790 0.99997649244302
## [177,] 0.0086217705543211 0.99137822944568
## [178,] 0.9999953013180387 0.00000469868196
## [179,] 0.0000012232123967 0.99999877678760
## [180,] 0.0041351226521375 0.99586487734786
## [181,] 0.9998099834107058 0.00019001658929
## [182,] 0.0004518278488303 0.99954817215117
## [183,] 0.0000053620171042 0.99999463798290
## [184,] 0.0000053620171042 0.99999463798290
## [185,] 0.9999901512177107 0.00000984878229
## [186,] 0.0002158069830015 0.99978419301700
## [187,] 0.0000025609905097 0.99999743900949
## [188,] 0.9999997566701279 0.00000024332987
## [189,] 0.0000053620171042 0.99999463798290
## [190,] 0.9998099834107058 0.00019001658929
## [191,] 0.0000235075569790 0.99997649244302
## [192,] 0.9991659987204796 0.00083400127952
## [193,] 0.0009457647791660 0.99905423522083
## [194,] 0.0000025609905097 0.99999743900949
## [195,] 0.0004518278488303 0.99954817215117
## [196,] 0.0000235075569790 0.99997649244302
## [197,] 0.0000002790829016 0.99999972091710
## [198,] 0.0004518278488303 0.99954817215117
## [199,] 0.9999901512177107 0.00000984878229
## [200,] 0.0002158069830015 0.99978419301700
## [201,] 0.9923778425714656 0.00762215742853
## [202,] 0.0000112269517542 0.99998877304825
## [203,] 0.0000000145335741 0.99999998546643
## [204,] 0.9999567345551037 0.00004326544490
## [205,] 0.0002158069830015 0.99978419301700
## [206,] 0.9999997566701279 0.00000024332987
## [207,] 0.0004518278488303 0.99954817215117
## [208,] 0.0000025609905097 0.99999743900949
## [209,] 0.9999977584231796 0.00000224157682
## [210,] 0.0000053620171042 0.99999463798290
## [211,] 0.0367517331895390 0.96324826681046
## [212,] 0.0000492223189208 0.99995077768108
## [213,] 0.9999977584231796 0.00000224157682
## [214,] 0.0019786704397116 0.99802132956029
## [215,] 0.8710556253628252 0.12894437463717
## [216,] 0.0009457647791660 0.99905423522083
## [217,] 0.9999567345551037 0.00004326544490
## [218,] 0.0000000015849180 0.99999999841508
## [219,] 0.9998099834107058 0.00019001658929
## [220,] 0.0009457647791660 0.99905423522083
## [221,] 0.9999953013180387 0.00000469868196
## [222,] 0.0000112269517542 0.99998877304825
## [223,] 0.9999093255335648 0.00009067446644
## [224,] 0.0019786704397116 0.99802132956029
## [225,] 0.0002158069830015 0.99978419301700
## [226,] 0.0001030668945107 0.99989693310549
## [227,] 0.0002158069830015 0.99978419301700
## [228,] 0.0002158069830015 0.99978419301700
## [229,] 0.0740050898190411 0.92599491018096
## [230,] 0.0001030668945107 0.99989693310549
## [231,] 0.0740050898190411 0.92599491018096
## [232,] 0.9999093255335648 0.00009067446644
## [233,] 0.0000005842654980 0.99999941573450
## [234,] 0.9999953013180387 0.00000469868196
## [235,] 0.0000000304221919 0.99999996957781
## [236,] 0.9999093255335648 0.00009067446644
## [237,] 0.9999093255335648 0.00009067446644
## [238,] 0.0004518278488303 0.99954817215117
## [239,] 0.0001030668945107 0.99989693310549
## [240,] 0.8710556253628252 0.12894437463717
## [241,] 0.0019786704397116 0.99802132956029
## [242,] 0.9999994898908655 0.00000051010913
## [243,] 0.0000005842654980 0.99999941573450
## [244,] 0.9340081346369977 0.06599186536300
## [245,] 0.0000012232123967 0.99999877678760
## [246,] 0.0000112269517542 0.99998877304825
## [247,] 0.9991659987204796 0.00083400127952
## [248,] 0.0000112269517542 0.99998877304825
## [249,] 0.4235155663186868 0.57648443368131
## [250,] 0.0041351226521375 0.99586487734786
## [251,] 0.9999093255335648 0.00009067446644
## [252,] 0.0000053620171042 0.99999463798290
## [253,] 0.0086217705543211 0.99137822944568
## [254,] 0.0086217705543211 0.99137822944568
## [255,] 0.0000001333125130 0.99999986668749
## [256,] 0.0000492223189208 0.99995077768108
## [257,] 0.0367517331895390 0.96324826681046
## [258,] 0.0000053620171042 0.99999463798290
## [259,] 0.9996018598292480 0.00039814017075
## [260,] 0.0001030668945107 0.99989693310549
## [261,] 0.0002158069830015 0.99978419301700
## [262,] 0.0000025609905097 0.99999743900949
## [263,] 0.9982538745754911 0.00174612542451
## [264,] 0.0000053620171042 0.99999463798290
## [265,] 0.9999999735939427 0.00000002640606
## [266,] 0.9923778425714656 0.00762215742853
## [267,] 0.0019786704397116 0.99802132956029
## [268,] 0.0000235075569790 0.99997649244302
## [269,] 0.9999997566701279 0.00000024332987
## [270,] 0.0000000304221919 0.99999996957781
## [271,] 0.9999997566701279 0.00000024332987
## [272,] 0.0041351226521375 0.99586487734786
## 
## $all.loglik
##  [1] -1558.279 -1036.608 -1034.116 -1034.036 -1034.015 -1034.006 -1034.003
##  [8] -1034.002 -1034.002 -1034.002 -1034.002 -1034.002 -1034.002 -1034.002
## [15] -1034.002 -1034.002 -1034.002 -1034.002 -1034.002
## 
## $restarts
## [1] 0
## 
## $ft
## [1] "normalmixEM"
## 
## attr(,"class")
## [1] "mixEM"
clusters <- max.col(mixmdl$posterior)
dd <- data.frame(wait=wait, clusters=clusters)
dd$clusters <- as.factor(dd$clusters)
ggplot(dd, aes(x=1:length(wait), y=wait, col=clusters))+
  geom_point(size=3)+
  scale_color_discrete()+
  xlab('Index')

Estimate final density based on the clustering

 data.frame(x = mixmdl$x) %>%
  ggplot() +
  geom_histogram(aes(x, ..density..), binwidth = 1, colour = "black", fill = "white") +
  stat_function(geom = "line", fun = plot_mix_comps,
                args = list(mixmdl$mu[1], mixmdl$sigma[1], lam = mixmdl$lambda[1]),
                colour = "red", lwd = 1.5) +
  stat_function(geom = "line", fun = plot_mix_comps,
                args = list(mixmdl$mu[2], mixmdl$sigma[2], lam = mixmdl$lambda[2]),
                colour = "blue", lwd = 1.5) +
  ylab("Density")

Look at the final numerical results

post.df <- as.data.frame(cbind(x = mixmdl$x, mixmdl$posterior, clusters=clusters))
head(post.df, 10)  # Retrieve first 10 rows
##     x          comp.1         comp.2 clusters
## 1  79 0.0001030668945 0.999896933105        2
## 2  54 0.9999093255336 0.000090674466        1
## 3  74 0.0041351226521 0.995864877348        2
## 4  62 0.9673784477343 0.032621552266        1
## 5  85 0.0000012232124 0.999998776788        2
## 6  55 0.9998099834107 0.000190016589        1
## 7  88 0.0000001333125 0.999999866687        2
## 8  85 0.0000012232124 0.999998776788        2
## 9  51 0.9999901512177 0.000009848782        1
## 10 85 0.0000012232124 0.999998776788        2

distribution of each cluster in the data set

post.df %>%
  ggplot(aes(x = factor(clusters))) +
  geom_bar() +
  xlab("Component") +
  ylab("Number of Data Points") 

table(post.df$clusters)
## 
##   1   2 
##  99 173
table(post.df$clusters)/nrow(post.df)
## 
##         1         2 
## 0.3639706 0.6360294

follow me on RPubs for more!