El paquete necesario para crear datos usados en clustering es clusteringdatasets y la instalación se puede hacer usando el siguiente código.
# Install devtools only when it is missing. requireNamespace() checks for the
# package without attaching it, which is sufficient here because
# install_github() is called through the devtools:: prefix.
if (!requireNamespace("devtools", quietly = TRUE)) {
  install.packages("devtools")
}
# Install clusteringdatasets from GitHub (force = TRUE kept from the original
# call).
devtools::install_github("elbamos/clusteringdatasets", force = TRUE)
Cargando el paquete.
library(clusteringdatasets)
Para hacer gotas o blobs se usa la siguiente función.
# Generic form of the make_blobs() call, one explicit argument per line.
make_blobs(
  n_samples = 100,
  n_features = 2,
  centers = 3,
  cluster_std = 1,
  center_box = c(-10, 10),
  shuffle = TRUE
)
Crear 1000 observaciones de 3 grupos, cada una con dos variables, usando los vectores de medias.
# Matrix of means passed as `centers`: one row per group, one column per
# variable.
medias <- rbind(
  c(0, 0),
  c(3, 1),
  c(2, 3)
)
blobs <- make_blobs(n_samples = 1000, n_features = 2, centers = medias)
# Scatter plot of the samples, each point coloured by its label.
plot(x = blobs$samples, col = rainbow(n = 3)[blobs$labels])
Crear 10 observaciones de 3 grupos cada uno con 3 variables y usando los vectores de medias.
# Matrix of means passed as `centers`: one row per group, three variables each.
medias <- rbind(
  c(0, 0, 0),
  c(1, 1, 0),
  c(1, 1, 1)
)
blobs <- make_blobs(n_samples = 10, n_features = 3, centers = medias)
# Print the result: a list with `samples` and `labels` components.
blobs
## $samples
## [,1] [,2] [,3]
## [1,] -0.4921498 -0.41712455 1.50333632
## [2,] 0.8606919 1.64510869 1.11452831
## [3,] -0.2404758 1.43739233 -0.57638507
## [4,] -0.6796860 -1.62739298 0.06563251
## [5,] 1.5592125 2.44386438 2.39827294
## [6,] -0.2938455 2.28756286 -0.55289562
## [7,] 1.9123192 1.47654203 -0.55703259
## [8,] 0.0959438 2.45646794 1.07661910
## [9,] -1.0876149 -1.70068738 0.80432385
## [10,] 0.2735340 0.06982408 0.33965061
##
## $labels
## [1] 1 2 2 1 3 2 1 3 1 3
Para hacer lunas se usa la siguiente función.
# Generic form of the make_moons() call, one explicit argument per line.
make_moons(
  n_samples = 100,
  shuffle = TRUE,
  noise = NA
)
Crear 1000 observaciones de dos grupos con formas de luna.
# Generate the moon-shaped sample without shuffling and with noise = 0.15.
moons <- make_moons(
  n_samples = 1000,
  shuffle = FALSE,
  noise = 0.15
)
# Scatter plot of the samples, each point coloured by its label.
plot(x = moons$samples, col = rainbow(n = 2)[moons$labels])
# Load the three birch data sets in a single call.
data(list = c("birch1", "birch2", "birch3"))
# One row of three panels; only the top margin is non-zero.
par(mfrow = c(1, 3), mar = c(0, 0, 1, 0))
# Draw each data set with very small points and no axes.
plot(
  birch1,
  main = "birch1", cex = 0.0000005,
  xlab = "", ylab = NULL, xaxt = "n", yaxt = "n"
)
plot(
  birch2,
  main = "birch2", cex = 0.0000005,
  xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
)
plot(
  birch3,
  main = "birch3", cex = 0.0000005,
  xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
)
The S-sets are useful for testing how an algorithm handles cluster overlap.
# 2 x 2 grid of panels; only the top margin is non-zero.
par(mfrow = c(2, 2), mar = c(0, 0, 1, 0))
# data() returns the names it was asked to load, so a single vector drives
# both the loading and the plotting. Each set is drawn from its first two
# columns and coloured by its own `labels` column.
invisible(lapply(
  data(list = c("s1", "s2", "s3", "s4")),
  function(set_name) {
    s_set <- get(set_name)
    plot(
      s_set[, 1:2],
      cex = 0.0001, col = s_set$labels, main = set_name,
      xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
    )
  }
))
# Load the three A-sets.
data(list = c("a1", "a2", "a3"))
# One row of three panels; only the top margin is non-zero.
par(mfrow = c(1, 3), mar = c(0, 0, 1, 0))
# Colour each A-set by its own labels. The colours used to come from
# s1/s2/s3, which are different data sets and made this section depend on the
# S-sets having been loaded first.
# NOTE(review): assumes the A-sets carry a `labels` column like the S-sets do;
# if they do not, `$labels` is NULL and the points should simply be drawn in
# the default colour - confirm against the package data.
plot(
  a1[, 1:2],
  cex = 0.0001, col = a1$labels, main = "a1",
  xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
)
plot(
  a2[, 1:2],
  cex = 0.0001, col = a2$labels, main = "a2",
  xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
)
plot(
  a3[, 1:2],
  cex = 0.0001, col = a3$labels, main = "a3",
  xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
)
# 3 x 3 grid of panels (eight of them are used); only the top margin is
# non-zero.
par(mfrow = c(3, 3), mar = c(0, 0, 1, 0))
# data() returns the names it was asked to load, so a single vector drives
# both the loading and the plotting. Each set is drawn from its first two
# columns and coloured by its own `label` column.
invisible(lapply(
  data(list = c(
    "Aggregation", "spiral", "D31", "Compound",
    "pathbased", "jain", "flame", "R15"
  )),
  function(set_name) {
    shape_set <- get(set_name)
    plot(
      shape_set[, 1:2],
      cex = 0.1, col = shape_set$label, main = set_name,
      xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
    )
  }
))
# 2 x 2 grid of panels; only the top margin is non-zero.
par(mfrow = c(2, 2), mar = c(0, 0, 1, 0))
# Load the four data sets in a single call.
data(list = c("t48k", "t58k", "t710k", "t88k"))
# Draw each data set with small points and no axes.
plot(
  t48k,
  main = "t48k", cex = 0.001,
  xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
)
plot(
  t58k,
  main = "t58k", cex = 0.001,
  xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
)
plot(
  t710k,
  main = "t710k", cex = 0.001,
  xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
)
plot(
  t88k,
  main = "t88k", cex = 0.001,
  xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
)
# Names of the data sets to load and plot ("Ring" stays disabled, as in the
# original list).
names <- c(
  "Circle", "Complex1", "Complex2", "Complex3", "Complex4", "Discrete",
  "HiLoDensity", "JumpingRectangle", "MovingJumpingRectangle",
  "MovingRectangle", "Rectangle", "RMouseRectangle"
) # "Ring"
data(list = names)
# 3 x 5 grid of panels; only the top margin is non-zero.
par(mfrow = c(3, 5), mar = c(0, 0, 1, 0))
for (nm in names) {
  # get() looks the object up by name directly instead of building and
  # evaluating R code from a string.
  plot(
    get(nm),
    cex = 0.01, main = nm,
    xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
  )
}
# Names of the data sets to load and plot.
names <- c("cross", "d4", "face", "pie", "ring2", "sincos")
data(list = names)
# 2 x 3 grid of panels; only the top margin is non-zero.
par(mfrow = c(2, 3), mar = c(0, 0, 1, 0))
for (nm in names) {
  # get() looks the object up by name directly instead of building and
  # evaluating R code from a string.
  plot(
    get(nm),
    cex = 0.1, main = nm,
    xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
  )
}
# Load the two mopsi data sets.
data("mopsifinland")
data("mopsijoensu")
# One row of two panels; only the top margin is non-zero.
par(mfrow = c(1, 2), mar = c(0, 0, 1, 0))
plot(
  mopsifinland[, 1:2],
  cex = 0.01, main = "mopsifinland",
  xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
)
# The title now names the data set actually drawn; it used to repeat
# "mopsifinland".
plot(
  mopsijoensu[, 1:2],
  cex = 0.05, main = "mopsijoensu",
  xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
)
The package contains several sets of high-dimensional data. The visualizations below were made using my largeVis package to reduce each dataset to two dimensions, and the colors are the result of applying the hdbscan function within the package.
library(largeVis)
library(clusteringdatasets)
library(ggplot2)
# Load the data sets to be embedded and clustered.
data(glass)
data(wdbc)
data(breast)
data(yeast)
data(wine)
data(thyroid)
toproc <- list(glass, wdbc, breast, yeast, wine, thyroid)
# Preallocate one slot per data set instead of growing the lists in the loop.
vises <- vector("list", length(toproc))
clusters <- vector("list", length(toproc))
# seq_along() is safe even when the list is empty (1:length() would yield
# c(1, 0)).
for (i in seq_along(toproc)) {
  # Scale the columns of the data, then transpose the result.
  dat <- t(scale(as.matrix(toproc[[i]])))
  # K = 50 below 50000 columns (after the transpose), K = 100 otherwise.
  if (ncol(dat) < 50000) {
    vis <- largeVis(dat, K = 50, verbose = TRUE)
  } else {
    vis <- largeVis(dat, K = 100, verbose = TRUE)
  }
  neighbors <- randomProjectionTreeSearch(dat, K = 50)
  edges <- buildEdgeMatrix(data = dat, neighbors = neighbors)
  # str() prints the structure itself; wrapping it in print() only added a
  # stray "NULL" line to the output.
  str(edges)
  cluster <- hdbscan(
    edges = edges, neighbors = neighbors,
    K = 5, minPts = 10, verbose = TRUE
  )
  vises[[i]] <- vis
  clusters[[i]] <- cluster
}
library(ggplot2)
# Restore the saved results shipped in the package's extdata directory.
# NOTE(review): presumably these files define `vises` and `clusters`, which
# the loop below reads - confirm the object names stored in the .Rda files.
load(system.file("extdata/vises.Rda", package = "clusteringdatasets"))
load(system.file("extdata/clusters.Rda", package = "clusteringdatasets"))
names <- c("glass", "wdbc", "breast", "yeast", "wine", "thyroid")
# 2 x 3 grid of panels; only the top margin is non-zero.
par(mfrow = c(2, 3), mar = c(0, 0, 1, 0))
# seq_along() is safe even when the vector is empty (1:length() would yield
# c(1, 0)).
for (i in seq_along(names)) {
  # Transpose the coordinates into a two-column data frame.
  df <- data.frame(t(vises[[i]]$coords))
  colnames(df) <- c("x", "y")
  df$label <- clusters[[i]]$clusters
  # The former if/else on the number of distinct labels had two identical
  # branches, so a single plot call is equivalent.
  plot(
    df[, 1:2],
    cex = 0.005, col = df$label, main = names[i],
    xlab = NULL, ylab = NULL, xaxt = "n", yaxt = "n"
  )
}
# Load the kddcup04bio data set.
data("kddcup04bio")
library(largeVis)
# NOTE(review): this restores "./kddvis.Rda" from the working directory, while
# the plot below reads `kdvis`, restored from the package's extdata further
# down - confirm which file is intended and whether this one is needed.
load("./kddvis.Rda")
# Scale the columns, transpose, and compute a largeVis embedding.
dat <- t(scale(as.matrix(kddcup04bio)))
vis <- largeVis(dat, K = 50, n_trees = 50, tree_threshold = 50, max_iter = 2, verbose = TRUE)
# NOTE(review): `vis` is not referenced again in the visible code; the plot
# uses `kdvis`, presumably a saved result restored by this load() - verify.
load(system.file("extdata/kdvis.Rda", package = "clusteringdatasets"))
# Single panel; only the top margin is non-zero.
par(mfrow = c(1, 1), mar = c(0,0,1,0))
# NOTE(review): `df` is not defined in this section. If it is the data frame
# left over from the preceding plotting loop, these colours belong to a
# different data set - confirm the intended source of the labels.
plot(kdvis[, 1:2], cex = 0.0001, col = df$label, main = "kddcup04bio", xlab = NULL, ylab = NULL, xaxt='n',yaxt = 'n',
xlim = c(-20, 18), ylim = c(-20, 30))