In order to cluster a data set, one should look carefully at the statistical properties of the data so a suitable method is chosen for the purpose. Here we have faithful data:
library("ggplot2")
library("dplyr")
theme_set(theme_classic())
options(scipen = 999)
h <- faithful %>%
ggplot(aes(x = waiting)) +
geom_histogram(aes(waiting, ..density..), binwidth = 1, colour = "black", fill = "white")
h
d <- ggplot(faithful, aes(x = waiting)) +
geom_density() +
geom_vline(xintercept = 53, col = "red", size = 2) +
geom_vline(xintercept = 80, col = "blue", size = 2)
d
library("mixtools")
plot_mix_comps <- function(x, mu, sigma, lam) {
lam * dnorm(x, mu, sigma)
}
wait <- faithful$waiting
mixmdl <- normalmixEM(wait, k = 2)
## number of iterations= 18
mixmdl
## $x
## [1] 79 54 74 62 85 55 88 85 51 85 54 84 78 47 83 52 62 84 52 79 51 47 78 69 74
## [26] 83 55 76 78 79 73 77 66 80 74 52 48 80 59 90 80 58 84 58 73 83 64 53 82 59
## [51] 75 90 54 80 54 83 71 64 77 81 59 84 48 82 60 92 78 78 65 73 82 56 79 71 62
## [76] 76 60 78 76 83 75 82 70 65 73 88 76 80 48 86 60 90 50 78 63 72 84 75 51 82
## [101] 62 88 49 83 81 47 84 52 86 81 75 59 89 79 59 81 50 85 59 87 53 69 77 56 88
## [126] 81 45 82 55 90 45 83 56 89 46 82 51 86 53 79 81 60 82 77 76 59 80 49 96 53
## [151] 77 77 65 81 71 70 81 93 53 89 45 86 58 78 66 76 63 88 52 93 49 57 77 68 81
## [176] 81 73 50 85 74 55 77 83 83 51 78 84 46 83 55 81 57 76 84 77 81 87 77 51 78
## [201] 60 82 91 53 78 46 77 84 49 83 71 80 49 75 64 76 53 94 55 76 50 82 54 75 78
## [226] 79 78 78 70 79 70 54 86 50 90 54 54 77 79 64 75 47 86 63 85 82 57 82 67 74
## [251] 54 83 73 73 88 80 71 83 56 79 78 84 58 83 43 60 75 81 46 90 46 74
##
## $lambda
## [1] 0.3608851 0.6391149
##
## $mu
## [1] 54.61482 80.09104
##
## $sigma
## [1] 5.871192 5.867755
##
## $loglik
## [1] -1034.002
##
## $posterior
## comp.1 comp.2
## [1,] 0.0001030668945107 0.99989693310549
## [2,] 0.9999093255335648 0.00009067446644
## [3,] 0.0041351226521375 0.99586487734786
## [4,] 0.9673784477343398 0.03262155226566
## [5,] 0.0000012232123967 0.99999877678760
## [6,] 0.9998099834107058 0.00019001658929
## [7,] 0.0000001333125130 0.99999986668749
## [8,] 0.0000012232123967 0.99999877678760
## [9,] 0.9999901512177107 0.00000984878229
## [10,] 0.0000012232123967 0.99999877678760
## [11,] 0.9999093255335648 0.00009067446644
## [12,] 0.0000025609905097 0.99999743900949
## [13,] 0.0002158069830015 0.99978419301700
## [14,] 0.9999994898908655 0.00000051010913
## [15,] 0.0000053620171042 0.99999463798290
## [16,] 0.9999793570477835 0.00002064295222
## [17,] 0.9673784477343398 0.03262155226566
## [18,] 0.0000025609905097 0.99999743900949
## [19,] 0.9999793570477835 0.00002064295222
## [20,] 0.0001030668945107 0.99989693310549
## [21,] 0.9999901512177107 0.00000984878229
## [22,] 0.9999994898908655 0.00000051010913
## [23,] 0.0002158069830015 0.99978419301700
## [24,] 0.1434028502569841 0.85659714974302
## [25,] 0.0041351226521375 0.99586487734786
## [26,] 0.0000053620171042 0.99999463798290
## [27,] 0.9998099834107058 0.00019001658929
## [28,] 0.0009457647791660 0.99905423522083
## [29,] 0.0002158069830015 0.99978419301700
## [30,] 0.0001030668945107 0.99989693310549
## [31,] 0.0086217705543211 0.99137822944568
## [32,] 0.0004518278488303 0.99954817215117
## [33,] 0.6061527480036107 0.39384725199639
## [34,] 0.0000492223189208 0.99995077768108
## [35,] 0.0041351226521375 0.99586487734786
## [36,] 0.9999793570477835 0.00002064295222
## [37,] 0.9999989306598338 0.00000106934017
## [38,] 0.0000492223189208 0.99995077768108
## [39,] 0.9963479555021389 0.00365204449786
## [40,] 0.0000000304221919 0.99999996957781
## [41,] 0.0000492223189208 0.99995077768108
## [42,] 0.9982538745754911 0.00174612542451
## [43,] 0.0000025609905097 0.99999743900949
## [44,] 0.9982538745754911 0.00174612542451
## [45,] 0.0086217705543211 0.99137822944568
## [46,] 0.0000053620171042 0.99999463798290
## [47,] 0.8710556253628252 0.12894437463717
## [48,] 0.9999567345551037 0.00004326544490
## [49,] 0.0000112269517542 0.99998877304825
## [50,] 0.9963479555021389 0.00365204449786
## [51,] 0.0019786704397116 0.99802132956029
## [52,] 0.0000000304221919 0.99999996957781
## [53,] 0.9999093255335648 0.00009067446644
## [54,] 0.0000492223189208 0.99995077768108
## [55,] 0.9999093255335648 0.00009067446644
## [56,] 0.0000053620171042 0.99999463798290
## [57,] 0.0367517331895390 0.96324826681046
## [58,] 0.8710556253628252 0.12894437463717
## [59,] 0.0004518278488303 0.99954817215117
## [60,] 0.0000235075569790 0.99997649244302
## [61,] 0.9963479555021389 0.00365204449786
## [62,] 0.0000025609905097 0.99999743900949
## [63,] 0.9999989306598338 0.00000106934017
## [64,] 0.0000112269517542 0.99998877304825
## [65,] 0.9923778425714656 0.00762215742853
## [66,] 0.0000000069433510 0.99999999305665
## [67,] 0.0002158069830015 0.99978419301700
## [68,] 0.0002158069830015 0.99978419301700
## [69,] 0.7632769048350551 0.23672309516494
## [70,] 0.0086217705543211 0.99137822944568
## [71,] 0.0000112269517542 0.99998877304825
## [72,] 0.9996018598292480 0.00039814017075
## [73,] 0.0001030668945107 0.99989693310549
## [74,] 0.0367517331895390 0.96324826681046
## [75,] 0.9673784477343398 0.03262155226566
## [76,] 0.0009457647791660 0.99905423522083
## [77,] 0.9923778425714656 0.00762215742853
## [78,] 0.0002158069830015 0.99978419301700
## [79,] 0.0009457647791660 0.99905423522083
## [80,] 0.0000053620171042 0.99999463798290
## [81,] 0.0019786704397116 0.99802132956029
## [82,] 0.0000112269517542 0.99998877304825
## [83,] 0.0740050898190411 0.92599491018096
## [84,] 0.7632769048350551 0.23672309516494
## [85,] 0.0086217705543211 0.99137822944568
## [86,] 0.0000001333125130 0.99999986668749
## [87,] 0.0009457647791660 0.99905423522083
## [88,] 0.0000492223189208 0.99995077768108
## [89,] 0.9999989306598338 0.00000106934017
## [90,] 0.0000005842654980 0.99999941573450
## [91,] 0.9923778425714656 0.00762215742853
## [92,] 0.0000000304221919 0.99999996957781
## [93,] 0.9999953013180387 0.00000469868196
## [94,] 0.0002158069830015 0.99978419301700
## [95,] 0.9340081346369977 0.06599186536300
## [96,] 0.0178896256044028 0.98211037439560
## [97,] 0.0000025609905097 0.99999743900949
## [98,] 0.0019786704397116 0.99802132956029
## [99,] 0.9999901512177107 0.00000984878229
## [100,] 0.0000112269517542 0.99998877304825
## [101,] 0.9673784477343398 0.03262155226566
## [102,] 0.0000001333125130 0.99999986668749
## [103,] 0.9999977584231796 0.00000224157682
## [104,] 0.0000053620171042 0.99999463798290
## [105,] 0.0000235075569790 0.99997649244302
## [106,] 0.9999994898908655 0.00000051010913
## [107,] 0.0000025609905097 0.99999743900949
## [108,] 0.9999793570477835 0.00002064295222
## [109,] 0.0000005842654980 0.99999941573450
## [110,] 0.0000235075569790 0.99997649244302
## [111,] 0.0019786704397116 0.99802132956029
## [112,] 0.9963479555021389 0.00365204449786
## [113,] 0.0000000636829735 0.99999993631703
## [114,] 0.0001030668945107 0.99989693310549
## [115,] 0.9963479555021389 0.00365204449786
## [116,] 0.0000235075569790 0.99997649244302
## [117,] 0.9999953013180387 0.00000469868196
## [118,] 0.0000012232123967 0.99999877678760
## [119,] 0.9963479555021389 0.00365204449786
## [120,] 0.0000002790829016 0.99999972091710
## [121,] 0.9999567345551037 0.00004326544490
## [122,] 0.1434028502569841 0.85659714974302
## [123,] 0.0004518278488303 0.99954817215117
## [124,] 0.9996018598292480 0.00039814017075
## [125,] 0.0000001333125130 0.99999986668749
## [126,] 0.0000235075569790 0.99997649244302
## [127,] 0.9999998839318857 0.00000011606811
## [128,] 0.0000112269517542 0.99998877304825
## [129,] 0.9998099834107058 0.00019001658929
## [130,] 0.0000000304221919 0.99999996957781
## [131,] 0.9999998839318857 0.00000011606811
## [132,] 0.0000053620171042 0.99999463798290
## [133,] 0.9996018598292480 0.00039814017075
## [134,] 0.0000000636829735 0.99999993631703
## [135,] 0.9999997566701279 0.00000024332987
## [136,] 0.0000112269517542 0.99998877304825
## [137,] 0.9999901512177107 0.00000984878229
## [138,] 0.0000005842654980 0.99999941573450
## [139,] 0.9999567345551037 0.00004326544490
## [140,] 0.0001030668945107 0.99989693310549
## [141,] 0.0000235075569790 0.99997649244302
## [142,] 0.9923778425714656 0.00762215742853
## [143,] 0.0000112269517542 0.99998877304825
## [144,] 0.0004518278488303 0.99954817215117
## [145,] 0.0009457647791660 0.99905423522083
## [146,] 0.9963479555021389 0.00365204449786
## [147,] 0.0000492223189208 0.99995077768108
## [148,] 0.9999977584231796 0.00000224157682
## [149,] 0.0000000003618291 0.99999999963817
## [150,] 0.9999567345551037 0.00004326544490
## [151,] 0.0004518278488303 0.99954817215117
## [152,] 0.0004518278488303 0.99954817215117
## [153,] 0.7632769048350551 0.23672309516494
## [154,] 0.0000235075569790 0.99997649244302
## [155,] 0.0367517331895390 0.96324826681046
## [156,] 0.0740050898190411 0.92599491018096
## [157,] 0.0000235075569790 0.99997649244302
## [158,] 0.0000000033172681 0.99999999668273
## [159,] 0.9999567345551037 0.00004326544490
## [160,] 0.0000000636829735 0.99999993631703
## [161,] 0.9999998839318857 0.00000011606811
## [162,] 0.0000005842654980 0.99999941573450
## [163,] 0.9982538745754911 0.00174612542451
## [164,] 0.0002158069830015 0.99978419301700
## [165,] 0.6061527480036107 0.39384725199639
## [166,] 0.0009457647791660 0.99905423522083
## [167,] 0.9340081346369977 0.06599186536300
## [168,] 0.0000001333125130 0.99999986668749
## [169,] 0.9999793570477835 0.00002064295222
## [170,] 0.0000000033172681 0.99999999668273
## [171,] 0.9999977584231796 0.00000224157682
## [172,] 0.9991659987204796 0.00083400127952
## [173,] 0.0004518278488303 0.99954817215117
## [174,] 0.2596379810965197 0.74036201890348
## [175,] 0.0000235075569790 0.99997649244302
## [176,] 0.0000235075569790 0.99997649244302
## [177,] 0.0086217705543211 0.99137822944568
## [178,] 0.9999953013180387 0.00000469868196
## [179,] 0.0000012232123967 0.99999877678760
## [180,] 0.0041351226521375 0.99586487734786
## [181,] 0.9998099834107058 0.00019001658929
## [182,] 0.0004518278488303 0.99954817215117
## [183,] 0.0000053620171042 0.99999463798290
## [184,] 0.0000053620171042 0.99999463798290
## [185,] 0.9999901512177107 0.00000984878229
## [186,] 0.0002158069830015 0.99978419301700
## [187,] 0.0000025609905097 0.99999743900949
## [188,] 0.9999997566701279 0.00000024332987
## [189,] 0.0000053620171042 0.99999463798290
## [190,] 0.9998099834107058 0.00019001658929
## [191,] 0.0000235075569790 0.99997649244302
## [192,] 0.9991659987204796 0.00083400127952
## [193,] 0.0009457647791660 0.99905423522083
## [194,] 0.0000025609905097 0.99999743900949
## [195,] 0.0004518278488303 0.99954817215117
## [196,] 0.0000235075569790 0.99997649244302
## [197,] 0.0000002790829016 0.99999972091710
## [198,] 0.0004518278488303 0.99954817215117
## [199,] 0.9999901512177107 0.00000984878229
## [200,] 0.0002158069830015 0.99978419301700
## [201,] 0.9923778425714656 0.00762215742853
## [202,] 0.0000112269517542 0.99998877304825
## [203,] 0.0000000145335741 0.99999998546643
## [204,] 0.9999567345551037 0.00004326544490
## [205,] 0.0002158069830015 0.99978419301700
## [206,] 0.9999997566701279 0.00000024332987
## [207,] 0.0004518278488303 0.99954817215117
## [208,] 0.0000025609905097 0.99999743900949
## [209,] 0.9999977584231796 0.00000224157682
## [210,] 0.0000053620171042 0.99999463798290
## [211,] 0.0367517331895390 0.96324826681046
## [212,] 0.0000492223189208 0.99995077768108
## [213,] 0.9999977584231796 0.00000224157682
## [214,] 0.0019786704397116 0.99802132956029
## [215,] 0.8710556253628252 0.12894437463717
## [216,] 0.0009457647791660 0.99905423522083
## [217,] 0.9999567345551037 0.00004326544490
## [218,] 0.0000000015849180 0.99999999841508
## [219,] 0.9998099834107058 0.00019001658929
## [220,] 0.0009457647791660 0.99905423522083
## [221,] 0.9999953013180387 0.00000469868196
## [222,] 0.0000112269517542 0.99998877304825
## [223,] 0.9999093255335648 0.00009067446644
## [224,] 0.0019786704397116 0.99802132956029
## [225,] 0.0002158069830015 0.99978419301700
## [226,] 0.0001030668945107 0.99989693310549
## [227,] 0.0002158069830015 0.99978419301700
## [228,] 0.0002158069830015 0.99978419301700
## [229,] 0.0740050898190411 0.92599491018096
## [230,] 0.0001030668945107 0.99989693310549
## [231,] 0.0740050898190411 0.92599491018096
## [232,] 0.9999093255335648 0.00009067446644
## [233,] 0.0000005842654980 0.99999941573450
## [234,] 0.9999953013180387 0.00000469868196
## [235,] 0.0000000304221919 0.99999996957781
## [236,] 0.9999093255335648 0.00009067446644
## [237,] 0.9999093255335648 0.00009067446644
## [238,] 0.0004518278488303 0.99954817215117
## [239,] 0.0001030668945107 0.99989693310549
## [240,] 0.8710556253628252 0.12894437463717
## [241,] 0.0019786704397116 0.99802132956029
## [242,] 0.9999994898908655 0.00000051010913
## [243,] 0.0000005842654980 0.99999941573450
## [244,] 0.9340081346369977 0.06599186536300
## [245,] 0.0000012232123967 0.99999877678760
## [246,] 0.0000112269517542 0.99998877304825
## [247,] 0.9991659987204796 0.00083400127952
## [248,] 0.0000112269517542 0.99998877304825
## [249,] 0.4235155663186868 0.57648443368131
## [250,] 0.0041351226521375 0.99586487734786
## [251,] 0.9999093255335648 0.00009067446644
## [252,] 0.0000053620171042 0.99999463798290
## [253,] 0.0086217705543211 0.99137822944568
## [254,] 0.0086217705543211 0.99137822944568
## [255,] 0.0000001333125130 0.99999986668749
## [256,] 0.0000492223189208 0.99995077768108
## [257,] 0.0367517331895390 0.96324826681046
## [258,] 0.0000053620171042 0.99999463798290
## [259,] 0.9996018598292480 0.00039814017075
## [260,] 0.0001030668945107 0.99989693310549
## [261,] 0.0002158069830015 0.99978419301700
## [262,] 0.0000025609905097 0.99999743900949
## [263,] 0.9982538745754911 0.00174612542451
## [264,] 0.0000053620171042 0.99999463798290
## [265,] 0.9999999735939427 0.00000002640606
## [266,] 0.9923778425714656 0.00762215742853
## [267,] 0.0019786704397116 0.99802132956029
## [268,] 0.0000235075569790 0.99997649244302
## [269,] 0.9999997566701279 0.00000024332987
## [270,] 0.0000000304221919 0.99999996957781
## [271,] 0.9999997566701279 0.00000024332987
## [272,] 0.0041351226521375 0.99586487734786
##
## $all.loglik
## [1] -1558.279 -1036.608 -1034.116 -1034.036 -1034.015 -1034.006 -1034.003
## [8] -1034.002 -1034.002 -1034.002 -1034.002 -1034.002 -1034.002 -1034.002
## [15] -1034.002 -1034.002 -1034.002 -1034.002 -1034.002
##
## $restarts
## [1] 0
##
## $ft
## [1] "normalmixEM"
##
## attr(,"class")
## [1] "mixEM"
clusters <- max.col(mixmdl$posterior)
dd <- data.frame(wait=wait, clusters=clusters)
dd$clusters <- as.factor(dd$clusters)
ggplot(dd, aes(x=1:length(wait), y=wait, col=clusters))+
geom_point(size=3)+
scale_color_discrete()+
xlab('Index')
data.frame(x = mixmdl$x) %>%
ggplot() +
geom_histogram(aes(x, ..density..), binwidth = 1, colour = "black", fill = "white") +
stat_function(geom = "line", fun = plot_mix_comps,
args = list(mixmdl$mu[1], mixmdl$sigma[1], lam = mixmdl$lambda[1]),
colour = "red", lwd = 1.5) +
stat_function(geom = "line", fun = plot_mix_comps,
args = list(mixmdl$mu[2], mixmdl$sigma[2], lam = mixmdl$lambda[2]),
colour = "blue", lwd = 1.5) +
ylab("Density")
post.df <- as.data.frame(cbind(x = mixmdl$x, mixmdl$posterior, clusters=clusters))
head(post.df, 10) # Retrieve first 10 rows
## x comp.1 comp.2 clusters
## 1 79 0.0001030668945 0.999896933105 2
## 2 54 0.9999093255336 0.000090674466 1
## 3 74 0.0041351226521 0.995864877348 2
## 4 62 0.9673784477343 0.032621552266 1
## 5 85 0.0000012232124 0.999998776788 2
## 6 55 0.9998099834107 0.000190016589 1
## 7 88 0.0000001333125 0.999999866687 2
## 8 85 0.0000012232124 0.999998776788 2
## 9 51 0.9999901512177 0.000009848782 1
## 10 85 0.0000012232124 0.999998776788 2
post.df %>%
ggplot(aes(x = factor(clusters))) +
geom_bar() +
xlab("Component") +
ylab("Number of Data Points")
table(post.df$clusters)
##
## 1 2
## 99 173
table(post.df$clusters)/nrow(post.df)
##
## 1 2
## 0.3639706 0.6360294