A continuación se desarrolla un análisis de conglomerados sobre el conjunto de datos iris y se utiliza la librería Plotly con el fin de representar cuatro variables en un mismo gráfico.

Construcción de data

library(car)
## Loading required package: carData
# Work on a plain data.frame copy of the built-in iris data set.
df <- data.frame(iris)
# Show a random handful of rows as a quick look at the data.
car::some(df)
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 3            4.7         3.2          1.3         0.2     setosa
## 11           5.4         3.7          1.5         0.2     setosa
## 12           4.8         3.4          1.6         0.2     setosa
## 15           5.8         4.0          1.2         0.2     setosa
## 53           6.9         3.1          4.9         1.5 versicolor
## 69           6.2         2.2          4.5         1.5 versicolor
## 79           6.0         2.9          4.5         1.5 versicolor
## 92           6.1         3.0          4.6         1.4 versicolor
## 140          6.9         3.1          5.4         2.1  virginica
## 148          6.5         3.0          5.2         2.0  virginica
# Split iris into its four numeric measurements and its species labels.
iris.2 <- iris[, names(iris) != "Species"]
species <- iris[["Species"]]
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Copy of df in which the Species factor uses short codes
# (setosa -> "st", versicolor -> "vs", virginica -> "vg").
di <- dplyr::mutate(
  df,
  Species = dplyr::recode(
    Species,
    setosa = "st",
    versicolor = "vs",
    virginica = "vg"
  )
)
# Scatter-plot matrix of the four measurements, one colour per species.
# The factor is converted to its integer codes explicitly so the colours
# (palette entries 1, 2, 3) match the legend entries drawn below.
pairs(di[, 1:4], col = as.integer(df$Species), lower.panel = NULL)

# Allow drawing outside the plot region only while the legend is added;
# par() returns the previous settings, which are restored afterwards.
op <- par(xpd = TRUE)
legend(x = 0.05, y = 0.4, cex = 2,
       legend = levels(df$Species),
       fill = seq_along(levels(df$Species)))
par(op)

Análisis de conglomerados por el método k-means.

# Fix the RNG so the random starts used by k-means are reproducible.
set.seed(20)
# Partition the four measurements into 3 clusters using 10 random starts.
k.means.fit <- kmeans(x = di[, 1:4], centers = 3, nstart = 10)
k.means.fit
## K-means clustering with 3 clusters of sizes 50, 62, 38
## 
## Cluster means:
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1     5.006000    3.428000     1.462000    0.246000
## 2     5.901613    2.748387     4.393548    1.433871
## 3     6.850000    3.073684     5.742105    2.071053
## 
## Clustering vector:
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [75] 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3 3 2 3 3 3 3
## [112] 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 3 3 3 3 2 3 3 3 2 3 3 3 2 3
## [149] 3 2
## 
## Within cluster sum of squares by cluster:
## [1] 15.15100 39.82097 23.87947
##  (between_SS / total_SS =  88.4 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# Coordinates of the three cluster centres.
k.means.fit[["centers"]]
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1     5.006000    3.428000     1.462000    0.246000
## 2     5.901613    2.748387     4.393548    1.433871
## 3     6.850000    3.073684     5.742105    2.071053
# Fault indicator stored in the fitted object.
k.means.fit[["ifault"]]
## [1] 0
# Cluster number of every observation, cross-tabulated against the species.
grupos <- k.means.fit[["cluster"]]
table(di[["Species"]], grupos)
##     grupos
##       1  2  3
##   st 50  0  0
##   vs  0 48  2
##   vg  0 14 36
# Name each cluster after the species that is most frequent inside it.
# Cluster numbers returned by kmeans() are arbitrary, so the labels are
# derived from the cross-tabulation rather than hard-coded.
conteo <- table(grupos, di$Species)
etiquetas <- colnames(conteo)[apply(conteo, 1, which.max)]
dif <- data.frame(di, grupos = etiquetas[grupos], stringsAsFactors = FALSE)
# Rows: label assigned to the cluster; columns: true species.
table(dif$grupos, dif$Species)
##     
##      st vs vg
##   st 50  0  0
##   vg  0  2 36
##   vs  0 48 14

Presentación de análisis de conglomerados en gráfico 2D.

#######visualización
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Standardise the four measurements and use the species code of each
# observation as its row name.
d2 <- scale(di[, 1:4])
rownames(d2) <- as.character(di$Species)
# wss: total within-cluster sum of squares, evaluated for up to 15 clusters
fviz_nbclust(
  x = d2,
  FUNcluster = kmeans,
  method = "wss",
  k.max = 15,
  diss = get_dist(d2, method = "euclidean"),
  nstart = 50
)

# Reproducible k-means (3 clusters, 50 random starts) on the scaled data.
set.seed(123)
d2f <- data.frame(d2)
km_clusters <- kmeans(d2f, centers = 3, nstart = 50)

# factoextra functions use the row names of the data frame that holds the
# data as observation identifiers, which makes it possible to add labels
# to the plots.
fviz_cluster(
  object = km_clusters,
  data = d2f,
  show.clust.cent = TRUE,
  ellipse.type = "euclid",
  star.plot = TRUE,
  repel = TRUE,
  pointsize = 0.5,
  outlier.color = "darkred"
) +
  labs(title = "Resultados clustering K-means") +
  theme_bw() +
  theme(legend.position = "none")

Representación de data iris mediante gráfico 3D

En los ejes X, Y y Z se presentan las variables longitud de sépalo, ancho de sépalo y longitud de pétalo, respectivamente. La variable ancho de pétalo se representa mediante el tamaño de los puntos. El primer gráfico corresponde a los datos originales y el segundo a los datos escalados.

Datos originales

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# 3D scatter of the original measurements: x = sepal length,
# y = sepal width, z = petal length. Marker size follows petal width and
# colour follows the species code.
plot_ly(data = di,
        x = di$Sepal.Length, y = di$Sepal.Width, z = di$Petal.Length,
        size = di$Petal.Width, color = di$Species, symbols = di$Species) %>%
  layout(
    scene = list(
      xaxis = list(title = "Longitud sepalo"),
      yaxis = list(title = "Ancho sepalo"),
      # z carries Petal.Length, so the axis is titled as petal length.
      zaxis = list(title = "Longitud petalo")
    )
  )
## No trace type specified:
##   Based on info supplied, a 'scatter3d' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter3d
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

Datos escalados

# 3D scatter of the scaled measurements: x = sepal length,
# y = sepal width, z = petal length. Marker size follows the scaled petal
# width (multiplied by 10) and colour follows the species code from di.
plot_ly(data = d2f,
        x = d2f$Sepal.Length, y = d2f$Sepal.Width, z = d2f$Petal.Length,
        size = d2f$Petal.Width * 10, color = di$Species,
        symbols = di$Species) %>%
  layout(
    scene = list(
      xaxis = list(title = "Longitud sepalo"),
      yaxis = list(title = "Ancho sepalo"),
      # z carries Petal.Length, so the axis is titled as petal length.
      zaxis = list(title = "Longitud petalo")
    )
  )
## No trace type specified:
##   Based on info supplied, a 'scatter3d' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter3d
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.