1 Paquete

El paquete necesario para crear datos usados en clustering es clusteringdatasets y la instalación se puede hacer usando el siguiente código.

# Install devtools only when it is missing. requireNamespace() checks that the
# package is available without attaching it, which is sufficient here because
# install_github() is called through `devtools::`.
if (!requireNamespace("devtools", quietly = TRUE)) install.packages("devtools")
# force = TRUE is passed so the package is installed from GitHub on every run.
devtools::install_github("elbamos/clusteringdatasets", force = TRUE)

Cargando el paquete.

library(clusteringdatasets)

2 Gotas o Blobs

Para hacer gotas o blobs se usa la siguiente función.

# Call to make_blobs() spelling out every argument used in this document.
# NOTE(review): the values shown are presumably the package defaults; confirm
# against ?make_blobs.
make_blobs(n_samples = 100, n_features = 2, centers = 3,
  cluster_std = 1, center_box = c(-10, 10), shuffle = TRUE)

2.1 Ejemplo

Crear 1000 observaciones repartidas en 3 grupos, cada una con dos variables, usando como centros los vectores de medias definidos en la matriz medias.

# Matrix of means passed as `centers` below: one row per group, one column per
# variable.
medias <- rbind(
  c(0, 0),
  c(3, 1),
  c(2, 3)
)

# Generate 1000 two-variable samples using the rows of `medias` as centers.
blobs <- make_blobs(centers = medias, n_features = 2, n_samples = 1000)

# Scatter plot of the samples; each label indexes one of three rainbow colours.
group_colors <- rainbow(3)
plot(blobs$samples, col = group_colors[blobs$labels])

2.2 Ejemplo

Crear 10 observaciones de 3 grupos cada uno con 3 variables y usando los vectores de medias.

# Matrix of means passed as `centers` below: one row per group, one column per
# variable.
medias <- rbind(
  c(0, 0, 0),
  c(1, 1, 0),
  c(1, 1, 1)
)

# Generate 10 three-variable samples using the rows of `medias` as centers,
# then print the result (its `samples` and `labels` components are shown below).
blobs <- make_blobs(centers = medias, n_features = 3, n_samples = 10)
print(blobs)
## $samples
##             [,1]        [,2]        [,3]
##  [1,] -0.4921498 -0.41712455  1.50333632
##  [2,]  0.8606919  1.64510869  1.11452831
##  [3,] -0.2404758  1.43739233 -0.57638507
##  [4,] -0.6796860 -1.62739298  0.06563251
##  [5,]  1.5592125  2.44386438  2.39827294
##  [6,] -0.2938455  2.28756286 -0.55289562
##  [7,]  1.9123192  1.47654203 -0.55703259
##  [8,]  0.0959438  2.45646794  1.07661910
##  [9,] -1.0876149 -1.70068738  0.80432385
## [10,]  0.2735340  0.06982408  0.33965061
## 
## $labels
##  [1] 1 2 2 1 3 2 1 3 1 3

3 Lunas o Moons

Para hacer lunas se usa la siguiente función.

# Call to make_moons() spelling out every argument used in this document.
# NOTE(review): the values shown are presumably the package defaults; confirm
# against ?make_moons.
make_moons(n_samples = 100, shuffle = TRUE, noise = NA)

3.1 Ejemplo

Crear 1000 observaciones de dos grupos con formas de luna.

# Generate 1000 moon-shaped samples without shuffling, with noise = 0.15.
moons <- make_moons(noise = 0.15, shuffle = FALSE, n_samples = 1000)

# Scatter plot of the samples; each label indexes one of two rainbow colours.
moon_colors <- rainbow(2)
plot(moons$samples, col = moon_colors[moons$labels])

4 Birch

# Load the three Birch datasets in one call.
data(birch1, birch2, birch3)

# One row of three panels; only the top margin is kept, for the panel title.
par(mfrow = c(1, 3), mar = c(0, 0, 1, 0))

# Each dataset is drawn with very small points and with both axes suppressed.
plot(
  birch1,
  main = "birch1", cex = 0.0000005,
  xlab = "", ylab = NULL,
  xaxt = "n", yaxt = "n"
)
plot(
  birch2,
  main = "birch2", cex = 0.0000005,
  xlab = NULL, ylab = NULL,
  xaxt = "n", yaxt = "n"
)
plot(
  birch3,
  main = "birch3", cex = 0.0000005,
  xlab = NULL, ylab = NULL,
  xaxt = "n", yaxt = "n"
)

5 S Sets

The S-sets are useful for testing how an algorithm handles cluster overlap.

# Load the four S-sets by name.
s_sets <- c("s1", "s2", "s3", "s4")
data(list = s_sets)

# 2 x 2 grid of panels; only the top margin is kept, for the panel title.
par(mfrow = c(2, 2), mar = c(0, 0, 1, 0))

# Plot the first two columns of each set, coloured by that set's own labels.
for (set_name in s_sets) {
  pts <- get(set_name)
  plot(
    pts[, 1:2],
    cex = 0.0001, col = pts$labels, main = set_name,
    xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
  )
}

6 A Sets

# Load the three A-sets by name.
a_sets <- c("a1", "a2", "a3")
data(list = a_sets)

# One row of three panels; only the top margin is kept, for the panel title.
par(mfrow = c(1, 3), mar = c(0, 0, 1, 0))

# Plot the first two columns of each A-set.
# The points are coloured with the dataset's OWN labels. Colouring a1/a2/a3
# with s1/s2/s3 labels (as this chunk used to) mixes labels from unrelated
# datasets and only works if the S-sets have already been loaded.
for (set_name in a_sets) {
  pts <- get(set_name)
  # NOTE(review): whether the A-sets ship a `labels` column is not visible
  # here; when the column is absent the default plotting colour is used.
  point_col <- if ("labels" %in% colnames(pts)) pts[, "labels"] else par("col")
  plot(
    pts[, 1:2],
    cex = 0.0001, col = point_col, main = set_name,
    xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
  )
}

7 Shapesets

# Shape datasets, listed in the order they are loaded and plotted.
shape_sets <- c(
  "Aggregation", "spiral", "D31", "Compound",
  "pathbased", "jain", "flame", "R15"
)
data(list = shape_sets)

# 3 x 3 grid of panels (eight are used); only the top margin is kept.
par(mfrow = c(3, 3), mar = c(0, 0, 1, 0))

# Plot the first two columns of each dataset, coloured by its `label` column.
for (set_name in shape_sets) {
  pts <- get(set_name)
  plot(
    pts[, 1:2],
    cex = 0.1, col = pts$label, main = set_name,
    xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
  )
}

8 Chameleon

# 2 x 2 grid of panels; only the top margin is kept, for the panel title.
par(mfrow = c(2, 2), mar = c(0, 0, 1, 0))

# Load the four Chameleon datasets by name.
chameleon_sets <- c("t48k", "t58k", "t710k", "t88k")
data(list = chameleon_sets)

# Draw each dataset as-is with very small points and no axes.
for (set_name in chameleon_sets) {
  plot(
    get(set_name),
    cex = 0.001, main = set_name,
    xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
  )
}

9 Neural Gas

# Neural Gas datasets to display. "Ring" was already commented out of this
# list; the reason is not visible here.
# The vector is not called `names`, to avoid shadowing base::names().
gas_sets <- c(
  "Circle", "Complex1", "Complex2", "Complex3", "Complex4", "Discrete",
  "HiLoDensity", "JumpingRectangle", "MovingJumpingRectangle",
  "MovingRectangle", "Rectangle", "RMouseRectangle"
) # "Ring"
data(list = gas_sets)

# 3 x 5 grid of panels (twelve are used); only the top margin is kept.
par(mfrow = c(3, 5), mar = c(0, 0, 1, 0))

# get() looks each dataset up by name; this replaces eval(parse(text = ...)),
# which evaluates arbitrary text as code just to resolve a variable name.
for (nm in gas_sets) {
  plot(
    get(nm),
    cex = 0.01, main = nm,
    xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
  )
}

10 Non-Convex

# Non-convex datasets to display.
# The vector is not called `names`, to avoid shadowing base::names().
non_convex_sets <- c("cross", "d4", "face", "pie", "ring2", "sincos")
data(list = non_convex_sets)

# 2 x 3 grid of panels; only the top margin is kept, for the panel title.
par(mfrow = c(2, 3), mar = c(0, 0, 1, 0))

# get() looks each dataset up by name; this replaces eval(parse(text = ...)),
# which evaluates arbitrary text as code just to resolve a variable name.
for (nm in non_convex_sets) {
  plot(
    get(nm),
    cex = 0.1, main = nm,
    xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
  )
}

11 Locations

# Load the two location datasets.
data("mopsifinland")
data("mopsijoensu")

# One row of two panels; only the top margin is kept, for the panel title.
par(mfrow = c(1, 2), mar = c(0, 0, 1, 0))

# Plot the first two columns of each dataset.
plot(mopsifinland[, 1:2], cex = 0.01, main = "mopsifinland",
     xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n")
# The title names the dataset actually drawn in this panel (it previously
# repeated "mopsifinland").
plot(mopsijoensu[, 1:2], cex = 0.05, main = "mopsijoensu",
     xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n")

12 High Dimensional Datasets

The package contains three sets of high-dimensional data. The visualizations below were made using my largeVis package to reduce each dataset to two dimensions, and the colors are the result of applying the hdbscan function within the package.

12.0.1 UCI Datasets

library(largeVis)
library(clusteringdatasets)
library(ggplot2)
# Load the six UCI datasets.
data(glass)
data(wdbc)
data(breast)
data(yeast)
data(wine)
data(thyroid)
toproc <- list(glass, wdbc, breast, yeast, wine, thyroid)

# One slot per dataset, preallocated instead of growing the lists in the loop.
vises <- vector("list", length(toproc))
clusters <- vector("list", length(toproc))

# seq_along() is safe for an empty list, unlike 1:length().
for (i in seq_along(toproc)) {
  # Standardize the columns, then transpose before handing the data to largeVis.
  dat <- t(scale(as.matrix(toproc[[i]])))

  # K is 50 when `dat` has fewer than 50000 columns and 100 otherwise.
  k_vis <- if (ncol(dat) < 50000) 50 else 100
  vis <- largeVis(dat, K = k_vis, verbose = TRUE)

  neighbors <- randomProjectionTreeSearch(dat, K = 50)
  edges <- buildEdgeMatrix(data = dat, neighbors = neighbors)
  # str() prints the structure itself; wrapping it in print() only added a
  # stray "NULL" line to the output.
  str(edges)

  cluster <- hdbscan(edges = edges, neighbors = neighbors, K = 5,
                     minPts = 10, verbose = TRUE)
  vises[[i]] <- vis
  clusters[[i]] <- cluster
}
library(ggplot2)
# Load the stored results shipped in the package's extdata directory.
# NOTE(review): these files presumably provide the `vises` and `clusters`
# lists read below; confirm their contents.
load(system.file("extdata/vises.Rda", package = "clusteringdatasets"))
load(system.file("extdata/clusters.Rda", package = "clusteringdatasets"))

# Panel titles, in the same order as the entries of `vises` / `clusters`.
# The vector is not called `names`, to avoid shadowing base::names().
dataset_names <- c("glass", "wdbc", "breast", "yeast", "wine", "thyroid")

# 2 x 3 grid of panels; only the top margin is kept, for the panel title.
par(mfrow = c(2, 3), mar = c(0, 0, 1, 0))

# seq_along() is safe for an empty vector, unlike 1:length().
for (i in seq_along(dataset_names)) {
  # The coordinates are transposed into a two-column data frame (x, y).
  df <- data.frame(t(vises[[i]]$coords))
  colnames(df) <- c("x", "y")
  df$label <- clusters[[i]]$clusters
  # A single plot() call: the former if/else on the number of distinct labels
  # ran exactly the same call in both branches.
  plot(df[, 1:2], cex = 0.005, col = df$label, main = dataset_names[i],
       xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n")
}

13 KDDCUP04Bio

# Load the kddcup04bio dataset and the largeVis package.
data("kddcup04bio")
library(largeVis)
# NOTE(review): this reads a file from the current working directory; it is not
# clear from this file which objects it provides or whether it is still needed.
load("./kddvis.Rda")
# Standardize the columns, then transpose before handing the data to largeVis.
dat <- t(scale(as.matrix(kddcup04bio)))
vis <- largeVis(dat, K = 50, n_trees = 50, tree_threshold = 50, max_iter = 2, verbose = TRUE)
# NOTE(review): `vis` computed above is not used below; the plot reads `kdvis`,
# which is presumably provided by this load() call — confirm.
load(system.file("extdata/kdvis.Rda", package = "clusteringdatasets"))
# Single panel; only the top margin is kept, for the title.
par(mfrow = c(1, 1), mar = c(0,0,1,0))
# NOTE(review): `df` is not defined in this section. `df$label` presumably comes
# from a data frame left over from earlier code, so the colours may not belong
# to kddcup04bio — confirm the intended label source.
plot(kdvis[, 1:2], cex = 0.0001, col = df$label, main = "kddcup04bio", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n', 
         xlim = c(-20, 18), ylim = c(-20, 30))