Clusterización

#Instalación de paquetes y librerías
library(ggplot2)
library(car)

## Loading required package: carData

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:car':
## 
##     recode

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(cluster)
library(simstudy)
library(data.table)

## 
## Attaching package: 'data.table'

## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ✔ readr     2.1.4

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ data.table::between() masks dplyr::between()
## ✖ dplyr::filter()       masks stats::filter()
## ✖ data.table::first()   masks dplyr::first()
## ✖ lubridate::hour()     masks data.table::hour()
## ✖ lubridate::isoweek()  masks data.table::isoweek()
## ✖ dplyr::lag()          masks stats::lag()
## ✖ data.table::last()    masks dplyr::last()
## ✖ lubridate::mday()     masks data.table::mday()
## ✖ lubridate::minute()   masks data.table::minute()
## ✖ lubridate::month()    masks data.table::month()
## ✖ lubridate::quarter()  masks data.table::quarter()
## ✖ dplyr::recode()       masks car::recode()
## ✖ lubridate::second()   masks data.table::second()
## ✖ purrr::some()         masks car::some()
## ✖ purrr::transpose()    masks data.table::transpose()
## ✖ lubridate::wday()     masks data.table::wday()
## ✖ lubridate::week()     masks data.table::week()
## ✖ lubridate::yday()     masks data.table::yday()
## ✖ lubridate::year()     masks data.table::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(PerformanceAnalytics)

## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## 
## The following objects are masked from 'package:data.table':
## 
##     first, last
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## 
## Attaching package: 'PerformanceAnalytics'
## 
## The following object is masked from 'package:graphics':
## 
##     legend

library(corrr)

#Datos
x <- c(4, 4.5, 4, 7.5, 7, 6, 5, 5.5, 5, 6)
y <- c(4, 4.5, 4, 7.5, 7, 6, 5, 5.5, 5, 6) + 4
z <- c(5, 5.5, 4.8, 5.4, 4.7, 5.6, 5.3, 5.5, 5.2, 4.8)
datos <- data.frame(punto = rep(c("x", "y", "z"), each = 10),
                    respuesta = c(x,y,z),predictor = 1:10)
datos

#Visualización datos
ggplot(data = datos, aes(x = as.factor(predictor), y = respuesta, 
                         colour = punto)) +  geom_path(aes(group = punto)) +
  geom_point() +  labs(x = "predictor") +   theme_bw() +
  theme(legend.position = "bottom")

x1=matrix(c(x,y))
x2=matrix(c(x,z))
x3=matrix(c(y,z))
sum(dist(x1, method = "euclidean",diag=F,upper=F,p=2),1)

## [1] 528

sum(dist(x2, method = "euclidean",diag=F,upper=F,p=2),1)

## [1] 180.3

sum(dist(x3, method = "euclidean",diag=F,upper=F,p=2),1)

## [1] 509.3

R=matrix(c(x,y,z),ncol=3)
dcor=1-cor(R)
dist(x = rbind(x,y,z), method = "euclidean")

##          x        y
## y 12.64911         
## z  3.75100 13.98821

dcor <- 1 - cor(x = cbind(x,y,z), method = "pearson")
dcor

##           x         y         z
## x 0.0000000 0.0000000 0.9466303
## y 0.0000000 0.0000000 0.9466303
## z 0.9466303 0.9466303 0.0000000

#Análisis exploratorio
df <- data.frame(iris)
iris.2 <- iris[,-5]
species <- iris[,5]

di=data.frame(df) %>% 
  mutate(Species=dplyr::recode(Species,
                               setosa="st",
                               versicolor="vs",
                               virginica="vg"))  
pairs(di[,1:4], col = species,lower.panel = NULL)
par(xpd = TRUE)
legend(x = 0.05, y = 0.4, cex = 2,
       legend=as.character(levels(species)),
       fill = unique(species))

par(xpd = NA)

#Método de K-means
set.seed(20)
k.means.fit <-kmeans(di[,1:4], 3, nstart = 10)
k.means.fit

## K-means clustering with 3 clusters of sizes 50, 62, 38
## 
## Cluster means:
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1     5.006000    3.428000     1.462000    0.246000
## 2     5.901613    2.748387     4.393548    1.433871
## 3     6.850000    3.073684     5.742105    2.071053
## 
## Clustering vector:
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [75] 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3 3 2 3 3 3 3
## [112] 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 3 3 3 3 2 3 3 3 2 3 3 3 2 3
## [149] 3 2
## 
## Within cluster sum of squares by cluster:
## [1] 15.15100 39.82097 23.87947
##  (between_SS / total_SS =  88.4 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

k.means.fit$centers

##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1     5.006000    3.428000     1.462000    0.246000
## 2     5.901613    2.748387     4.393548    1.433871
## 3     6.850000    3.073684     5.742105    2.071053

k.means.fit$ifault

## [1] 0

grupos=k.means.fit$cluster
table(di$Species,grupos) #Matriz de confusión

##     grupos
##       1  2  3
##   st 50  0  0
##   vs  0 48  2
##   vg  0 14 36

dif=data.frame(di,grupos)
dif=data.frame(dif) %>% 
  mutate(grupos=dplyr::recode(grupos,
                              "3"="st",
                              "2"="vs",
                              "1"="vg")) 
table(dif$grupos,dif$Species)

##     
##      st vs vg
##   st  0  2 36
##   vg 50  0  0
##   vs  0 48 14

#Combinación K-means y PCA
d2 <- scale(di[,1:4])
rownames(d2) <- di$Species
fviz_nbclust(x = d2, FUNcluster = kmeans, method = "wss", k.max = 15, 
             diss = get_dist(d2, method = "euclidean"), nstart = 50)

x|set.seed(123)

## logical(0)

d2f=data.frame(d2)
km_clusters <- kmeans(x = d2f, centers = 3, nstart = 50)

Las funciones del paquete factoextra emplean el nombre de las filas deldataframe que contiene los datos como identificador de las observaciones.Esto permite añadir labels a los gráficos.

fviz_cluster(object = km_clusters, data = d2f, show.clust.cent = TRUE,
             ellipse.type = "euclid", star.plot = TRUE, repel = TRUE,
             pointsize=0.5,outlier.color="darkred") +
  labs(title = "Resultados clustering K-means") +
  theme_bw() +  theme(legend.position = "none")

require(cluster)
pam.res <- pam(d2f, 3)

# Visualización
fviz_cluster(pam.res, geom = "point", ellipse.type = "norm",
             show.clust.cent = TRUE,star.plot = TRUE)+
  labs(title = "Resultados clustering K-means")+ theme_bw()

#Biplot PCA y K-Means para medir representatividad
data(iris)
# PCA
pca <- prcomp(iris[,-5], scale=TRUE)
df.pca <- pca$x
# Cluster over the three first PCA dimensions
kc <- kmeans(df.pca[,1:3], 3)
fviz_pca_biplot(pca, label="var", habillage=as.factor(kc$cluster)) +
  labs(color=NULL) + ggtitle("") +
  theme(text = element_text(size = 15),
        panel.background = element_blank(), 
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.line = element_line(colour = "black"),
        legend.key = element_rect(fill = "white"))

#K-Medioides con algoritmo PAM
fviz_nbclust(x = d2f[,1:4], FUNcluster = pam, method = "wss", k.max = 15,
             diss = dist(d2, method = "manhattan"))

set.seed(123)
pam_clusters <- pam(x = d2f[,1:4], k = 3, metric = "manhattan")
pam_clusters$medoids

##       Sepal.Length Sepal.Width Petal.Length Petal.Width
## st.7    -1.0184372   0.7861738   -1.2791040  -1.3110521
## vg.16    0.7930124  -0.1315388    0.9868021   0.7880307
## vs.44   -0.2938574  -0.8198233    0.2503826   0.1320673

fviz_cluster(object = pam_clusters, data = d2f[,1:4], 
             ellipse.type = "t",repel = TRUE) +
  theme_bw() +   labs(title = "Resultados clustering PAM") +
  theme(legend.position = "none")

medoids <- prcomp(d2f[,1:4])$x
medoids <- medoids[rownames(pam_clusters$medoids), c("PC1", "PC2")]
medoids <- as.data.frame(medoids)
colnames(medoids) <- c("x", "y")

# Creación del gráfico
fviz_cluster(object = pam_clusters, data = d2f[,1:4], ellipse.type = "t",
             repel = TRUE) +  theme_bw() +
  # Se resaltan las observaciones que actúan como medoids
  geom_point(data = medoids, color = "firebrick", size = 2) +
  labs(title = "Resultados clustering PAM") +
  theme(legend.position = "none")

#Método no jerárquico CLARA basado en simulación y muestreo
set.seed(555)
dt1 <- genCorData(200, mu = c(37.88,16.79, 11.74,21.4,34.04),
                  sigma = 1, rho = 0.5, corstr = "cs" )

#Con la función rho se generan datos aleatorios CORRELACIONADOS, y no dependientes como se generan con los paquetes básicos.
dt2 <- genCorData(200, mu = c(35.94,17.90, 11.81,21.77,32.86),
                  sigma = 1, rho = 0.7, corstr = "cs" )
dac <- rbind(dt1,dt2)
DDA=gl(2,200,400,labels=c("DDA224","DDA231"))
dfc=data.frame(DDA,dac)
colnames(dfc) <- c("DDA","id","L", "a","b","C","h")
dfc=dfc[,-2]
head(dfc)

options(digits = 3)
dfc %>%
  split(.$DDA) %>% 
  map(select, -c(DDA)) %>% 
  map(cor)

## $DDA224
##       L     a     b     C     h
## L 1.000 0.468 0.415 0.535 0.417
## a 0.468 1.000 0.463 0.505 0.454
## b 0.415 0.463 1.000 0.391 0.437
## C 0.535 0.505 0.391 1.000 0.526
## h 0.417 0.454 0.437 0.526 1.000
## 
## $DDA231
##       L     a     b     C     h
## L 1.000 0.664 0.681 0.699 0.680
## a 0.664 1.000 0.714 0.683 0.735
## b 0.681 0.714 1.000 0.686 0.696
## C 0.699 0.683 0.686 1.000 0.707
## h 0.680 0.735 0.696 0.707 1.000

clara_clusters <- clara(x = dfc, k = 2, metric = "manhattan", stand = TRUE,
                        samples = 60, pamLike = TRUE)
clara_clusters$sample

##  [1]   2   9  16  25  26  33  37  41  61  87  90  94  95 107 110 115 116 141 142
## [20] 146 148 150 159 161 183 193 204 222 223 228 244 249 251 253 256 257 282 284
## [39] 296 302 304 314 325 380

clara_clusters$medoids

##      DDA    L  a    b    C    h
## [1,]   1 37.5 17 11.7 21.2 34.4
## [2,]   2 36.1 18 11.8 21.9 32.6

clara_clusters$i.med

## [1]  94 302

clara_clusters$clustering

##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [75] 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [186] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [223] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [260] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [297] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [334] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [371] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2

table(clara_clusters$clustering)

## 
##   1   2 
## 197 203

fviz_cluster(object = clara_clusters, ellipse.type = "t", geom = "point",
             pointsize = 1.5) +  theme_bw() +
  labs(title = "Resultados clustering CLARA") +
  theme(legend.position = "none")

medoides <- prcomp(dfc[,-1])$x
medoides <- medoides[rownames(pam_clusters$medoides), c("PC1", "PC2")]
medoides <- as.data.frame(medoides)
colnames(medoides) <- c("x", "y")

# Creación del gráfico
fviz_cluster(object = clara_clusters, data = d2f[,-1], ellipse.type = "t",
             repel = TRUE,pointsize = 0.5) +  theme_bw() +
  # Se resaltan las observaciones que actúan como medoids
  geom_point(data = medoides, color = "firebrick", size = 1,) +
  labs(title = "Resultados clustering PAM") +
  theme(legend.position = "none")

#Biplot CLARA con PCA
# PCA
pca <- prcomp(dfc[,-1], scale=TRUE)
df.pca <- pca$x
# Cluster over the three first PCA dimensions
CLA <- clara(df.pca[,1:3], k = 2, metric = "manhattan", stand = TRUE,
             samples = 60, pamLike = TRUE)
fviz_pca_biplot(pca, label="var", habillage=as.factor(CLA$cluster)) +
  labs(color=NULL) + ggtitle("") +
  theme(text = element_text(size = 15),
        panel.background = element_blank(), 
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.line = element_line(colour = "black"),
        legend.key = element_rect(fill = "white"))

#Métodos de clasificación Jerárquica
datj <- scale(dfc[,-1])
rownames(datj) <- dfc[,1]
# Matriz de distancias euclideas
mat_dist <- dist(x = datj, method = "euclidean")
# Dendrogramas con linkage complete y average
hc_euclidea_complete <- hclust(d = mat_dist, method = "complete")
hc_euclidea_average  <- hclust(d = mat_dist, method = "average")
hc_euclidea_single  <- hclust(d = mat_dist, method = "single")
hc_euclidea_ward.D2  <- hclust(d = mat_dist, method = "ward.D2")
hc_euclidea_median  <- hclust(d = mat_dist, method = "median")
hc_euclidea_centroid  <- hclust(d = mat_dist, method = "centroid")
hc_euclidea_mcquitty  <- hclust(d = mat_dist, method = "mcquitty")
cor(x = mat_dist, cophenetic(hc_euclidea_complete))

## [1] 0.486

cor(x = mat_dist, cophenetic(hc_euclidea_average))

## [1] 0.618

cor(x = mat_dist, cophenetic(hc_euclidea_single))

## [1] 0.435

cor(x = mat_dist, cophenetic(hc_euclidea_ward.D2))

## [1] 0.502

cor(x = mat_dist, cophenetic(hc_euclidea_median))

## [1] 0.381

cor(x = mat_dist, cophenetic(hc_euclidea_centroid)) ## Tiene la mayor correlación

## [1] 0.538

cor(x = mat_dist, cophenetic(hc_euclidea_mcquitty))

## [1] 0.481

set.seed(101)
hc_euclidea_av <- hclust(d = dist(x = datj, method = "euclidean"),
                         method = "centroid")
fviz_dend(x = hc_euclidea_av, k = 2, cex = 0.5,
          k_colors = c("red","green"),color_labels_by_k = T,
          lwd = 0.2,type = "c",label_cols = rainbow(400),
          rect_lty = "lightblue") +
  geom_hline(yintercept = 3.65, linetype = "dashed") +
  labs(title = "Herarchical clustering",
       subtitle = "Distancia euclídea, Centroide, k=2")

## Warning in get_col(col, k): Length of color vector was shorter than the number
## of clusters - color vector was recycled

## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: Removed 1 rows containing missing values (`geom_hline()`).

Clusterización

Marcelo Callao Pimentel

15/07/2023