library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
df=data.frame(iris)
iris.2 <- iris[,-5]
species <- iris[,5]
di=data.frame(df) %>%
mutate(Species=dplyr::recode(Species,
setosa="st",
versicolor="vs",
virginica="vg"))
pairs(di[,1:4], col = species,lower.panel = NULL)
par(xpd = TRUE)
legend(x = 0.05, y = 0.4, cex = 2,
legend=as.character(levels(species)),
fill = unique(species))

par(xpd = NA)
set.seed(20)
k.means.fit <- kmeans(df[,1:4], 3)
k.means.fit$centers
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.006000 3.428000 1.462000 0.246000
## 2 5.901613 2.748387 4.393548 1.433871
## 3 6.850000 3.073684 5.742105 2.071053
k.means.fit$ifault
## [1] 0
grupos=k.means.fit$cluster
table(di$Species,grupos) #matriz de confusión (puedo hacerlo porque conozco las variedades)
## grupos
## 1 2 3
## st 50 0 0
## vs 0 48 2
## vg 0 14 36
dif=data.frame(di,grupos)
dif=data.frame(dif) %>%
mutate(grupos=dplyr::recode(grupos,
"3"="st",
"2"="vs",
"1"="vg"))
table(dif$grupos,dif$Species)
##
## st vs vg
## st 0 2 36
## vg 50 0 0
## vs 0 48 14
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
d2 <- scale(di[,1:4])
rownames(d2) <- di$Species
#www:for total within sum of square
fviz_nbclust(x = d2, FUNcluster = kmeans, method = "wss", k.max = 15,
diss = get_dist(d2, method = "euclidean"), nstart = 50)

set.seed(123)
d2f=data.frame(d2)
km_clusters <- kmeans(x = d2f, centers = 3, nstart = 50)
# Las funciones del paquete factoextra emplean el nombre de las filas del
# dataframe que contiene los datos como identificador de las observaciones.
# Esto permite añadir labels a los gráficos.
fviz_cluster(object = km_clusters, data = d2f, show.clust.cent = TRUE,
ellipse.type = "euclid", star.plot = TRUE, repel = TRUE,
pointsize=0.5,outlier.color="darkred") +
labs(title = "Resultados clustering K-means") +
theme_bw() + theme(legend.position = "none")

require(cluster)
## Loading required package: cluster
pam.res <- pam(d2f, 3)
# Visualización
fviz_cluster(pam.res, geom = "point", ellipse.type = "norm",
show.clust.cent = TRUE,star.plot = TRUE)+
labs(title = "Resultados clustering K-means")+ theme_bw()

# Load iris dataset
data(iris)
# PCA
pca <- prcomp(iris[,-5], scale=TRUE)
df.pca <- pca$x
# Cluster over the three first PCA dimensions
kc <- kmeans(df.pca[,1:3], 4)
fviz_pca_biplot(pca, label="var", habillage=as.factor(kc$cluster)) +
labs(color=NULL) + ggtitle("") +
theme(text = element_text(size = 15),
panel.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black"),
legend.key = element_rect(fill = "white"))

library(cluster)
fviz_nbclust(x = d2f[,1:4], FUNcluster = pam, method = "wss", k.max = 15,
diss = dist(d2, method = "manhattan"))

set.seed(123)
pam_clusters <- pam(x = d2f[,1:4], k = 4, metric = "manhattan")
pam_clusters
## Medoids:
## ID Sepal.Length Sepal.Width Petal.Length Petal.Width
## st.7 8 -1.0184372 0.7861738 -1.27910398 -1.3110521
## vg.12 113 1.1553023 -0.1315388 0.98680210 1.1816087
## vs.28 79 0.1891958 -0.3609670 0.42032558 0.3944526
## vs.19 70 -0.2938574 -1.2786796 0.08043967 -0.1303181
## Clustering vector:
## st st.1 st.2 st.3 st.4 st.5 st.6 st.7 st.8 st.9 st.10 st.11 st.12
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## st.13 st.14 st.15 st.16 st.17 st.18 st.19 st.20 st.21 st.22 st.23 st.24 st.25
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## st.26 st.27 st.28 st.29 st.30 st.31 st.32 st.33 st.34 st.35 st.36 st.37 st.38
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## st.39 st.40 st.41 st.42 st.43 st.44 st.45 st.46 st.47 st.48 st.49 vs vs.1
## 1 1 1 1 1 1 1 1 1 1 1 2 3
## vs.2 vs.3 vs.4 vs.5 vs.6 vs.7 vs.8 vs.9 vs.10 vs.11 vs.12 vs.13 vs.14
## 2 4 3 3 3 4 3 4 4 3 4 3 3
## vs.15 vs.16 vs.17 vs.18 vs.19 vs.20 vs.21 vs.22 vs.23 vs.24 vs.25 vs.26 vs.27
## 3 3 4 3 4 3 3 3 3 3 3 3 2
## vs.28 vs.29 vs.30 vs.31 vs.32 vs.33 vs.34 vs.35 vs.36 vs.37 vs.38 vs.39 vs.40
## 3 4 4 4 4 3 3 3 3 4 3 4 4
## vs.41 vs.42 vs.43 vs.44 vs.45 vs.46 vs.47 vs.48 vs.49 vg vg.1 vg.2 vg.3
## 3 4 4 4 3 3 3 4 3 2 3 2 2
## vg.4 vg.5 vg.6 vg.7 vg.8 vg.9 vg.10 vg.11 vg.12 vg.13 vg.14 vg.15 vg.16
## 2 2 4 2 2 2 2 2 2 4 3 2 2
## vg.17 vg.18 vg.19 vg.20 vg.21 vg.22 vg.23 vg.24 vg.25 vg.26 vg.27 vg.28 vg.29
## 2 2 3 2 3 2 3 2 2 3 3 2 2
## vg.30 vg.31 vg.32 vg.33 vg.34 vg.35 vg.36 vg.37 vg.38 vg.39 vg.40 vg.41 vg.42
## 2 2 2 3 3 2 2 2 3 2 2 2 3
## vg.43 vg.44 vg.45 vg.46 vg.47 vg.48 vg.49
## 2 2 2 3 2 2 3
## Objective function:
## build swap
## 1.316741 1.220666
##
## Available components:
## [1] "medoids" "id.med" "clustering" "objective" "isolation"
## [6] "clusinfo" "silinfo" "diss" "call" "data"
pam_clusters$medoids
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## st.7 -1.0184372 0.7861738 -1.27910398 -1.3110521
## vg.12 1.1553023 -0.1315388 0.98680210 1.1816087
## vs.28 0.1891958 -0.3609670 0.42032558 0.3944526
## vs.19 -0.2938574 -1.2786796 0.08043967 -0.1303181
fviz_cluster(object = pam_clusters, data = d2f[,1:4],
ellipse.type = "t",repel = TRUE) +
theme_bw() + labs(title = "Resultados clustering PAM") +
theme(legend.position = "none")

medoids <- prcomp(d2f[,1:4])$x
medoids <- medoids[rownames(pam_clusters$medoids), c("PC1", "PC2")]
medoids <- as.data.frame(medoids)
colnames(medoids) <- c("x", "y")
# Creación del gráfico
fviz_cluster(object = pam_clusters, data = d2f[,1:4], ellipse.type = "t",
repel = TRUE) + theme_bw() +
# Se resaltan las observaciones que actúan como medoids
geom_point(data = medoids, color = "firebrick", size = 2) +
labs(title = "Resultados clustering PAM") +
theme(legend.position = "none")

### método clara
library(simstudy)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
set.seed(555)
dt1 <- genCorData(200, mu = c(37.88,16.79, 11.74,21.4,34.04),
sigma = 1, rho = 0.5, corstr = "cs" )
dt2 <- genCorData(200, mu = c(35.94,17.90, 11.81,21.77,32.86),
sigma = 1, rho = 0.7, corstr = "cs" )
dac <- rbind(dt1,dt2)
DDA=gl(2,200,400,labels=c("DDA224","DDA231"))
dfc=data.frame(DDA,dac)
View(dfc)
colnames(dfc) <- c("DDA","id","L", "a","b","C","h")
dfc=dfc[,-2]
head(dfc)
## DDA L a b C h
## 1 DDA224 36.71324 16.59757 11.88936 20.80325 34.02769
## 2 DDA224 38.31073 15.76697 13.00676 21.46628 35.65491
## 3 DDA224 38.66201 17.00284 12.14174 22.81913 34.63488
## 4 DDA224 37.53979 17.02302 14.26060 21.62405 34.72869
## 5 DDA224 38.97227 15.98841 12.11759 20.35450 34.15341
## 6 DDA224 38.61491 17.14862 11.28327 22.67647 34.46416
dac <- rbind(dt1,dt2)
DDA=gl(2,200,400,labels=c("DDA224","DDA231"))
dfc=data.frame(DDA,dac)
colnames(dfc) <- c("DDA","id","L", "a","b","C","h")
dfc=dfc[,-2] # quito id
head(dfc)
## DDA L a b C h
## 1 DDA224 36.71324 16.59757 11.88936 20.80325 34.02769
## 2 DDA224 38.31073 15.76697 13.00676 21.46628 35.65491
## 3 DDA224 38.66201 17.00284 12.14174 22.81913 34.63488
## 4 DDA224 37.53979 17.02302 14.26060 21.62405 34.72869
## 5 DDA224 38.97227 15.98841 12.11759 20.35450 34.15341
## 6 DDA224 38.61491 17.14862 11.28327 22.67647 34.46416
options(digits = 3)
library(tidyverse)
## -- Attaching packages ------------------------------------------------ tidyverse 1.3.0 --
## v tibble 3.0.1 v purrr 0.3.3
## v tidyr 1.1.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts --------------------------------------------------- tidyverse_conflicts() --
## x data.table::between() masks dplyr::between()
## x dplyr::filter() masks stats::filter()
## x data.table::first() masks dplyr::first()
## x dplyr::lag() masks stats::lag()
## x data.table::last() masks dplyr::last()
## x car::recode() masks dplyr::recode()
## x purrr::some() masks car::some()
## x purrr::transpose() masks data.table::transpose()
dfc %>%
split(.$DDA) %>%
map(select,-c(DDA)) %>%
map(cor)
## $DDA224
## L a b C h
## L 1.000 0.468 0.415 0.535 0.417
## a 0.468 1.000 0.463 0.505 0.454
## b 0.415 0.463 1.000 0.391 0.437
## C 0.535 0.505 0.391 1.000 0.526
## h 0.417 0.454 0.437 0.526 1.000
##
## $DDA231
## L a b C h
## L 1.000 0.664 0.681 0.699 0.680
## a 0.664 1.000 0.714 0.683 0.735
## b 0.681 0.714 1.000 0.686 0.696
## C 0.699 0.683 0.686 1.000 0.707
## h 0.680 0.735 0.696 0.707 1.000
library(cluster)
library(factoextra)
clara_clusters <- clara(x = dfc, k = 2, metric = "manhattan", stand = TRUE,
samples = 60, pamLike = TRUE)
clara_clusters$sample
## [1] 2 9 16 25 26 33 37 41 61 87 90 94 95 107 110 115 116 141 142
## [20] 146 148 150 159 161 183 193 204 222 223 228 244 249 251 253 256 257 282 284
## [39] 296 302 304 314 325 380
clara_clusters$medoids
## DDA L a b C h
## [1,] 1 37.5 17 11.7 21.2 34.4
## [2,] 2 36.1 18 11.8 21.9 32.6
clara_clusters$i.med
## [1] 94 302
clara_clusters$clustering
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [75] 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [186] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [223] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [260] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [297] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [334] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [371] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
table(clara_clusters$clustering)
##
## 1 2
## 197 203
fviz_cluster(object = clara_clusters, ellipse.type = "t", geom = "point",
pointsize = 1.5) + theme_bw() +
labs(title = "Resultados clustering CLARA") +
theme(legend.position = "none")

medoides <- prcomp(dfc[,-1])$x
medoides <- medoides[rownames(pam_clusters$medoides), c("PC1", "PC2")]
medoides <- as.data.frame(medoides)
colnames(medoides) <- c("x", "y")
# Creación del gráfico
fviz_cluster(object = clara_clusters, data = d2f[,-1], ellipse.type = "t",
repel = TRUE,pointsize = 0.5) + theme_bw() +
geom_point(data = medoides, color = "firebrick", size = 1,) +
labs(title = "Resultados clustering PAM") +
theme(legend.position = "none")

#Hacer para el método clara el BIplot, para saber cual de las variables quedo mejor representadas.
pcacl <- prcomp(dfc[1:200,2:6], scale=TRUE)
df.pca1 <- pcacl$x
clara_clusterspca <- clara(x = df.pca1, k = 2, metric = "manhattan", stand = TRUE,
samples = 60, pamLike = TRUE)
clara_clusterspca$sample
## [1] "4" "8" "12" "14" "19" "27" "33" "41" "43" "44" "53" "55"
## [13] "56" "58" "62" "63" "70" "79" "87" "107" "115" "120" "121" "129"
## [25] "130" "135" "138" "139" "145" "149" "153" "154" "159" "165" "167" "171"
## [37] "173" "174" "176" "179" "181" "183" "185" "199"
clara_clusterspca$medoids
## PC1 PC2 PC3 PC4 PC5
## 154 -0.718 0.317 -0.213 0.172 0.413
## 165 0.626 -0.224 -0.058 -0.161 -0.429
clara_clusterspca$i.med
## [1] 154 165
clara_clusterspca$clustering
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 1 2 2 1 2 2 2 2 2 2 2 2 2 1 2 1 1 1 1 1
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 1 1 2 1 1 2 2 2 1 2 2 2 1 2 1 1 1 1 2 1
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 2 2 2 1 2 2 1 1 2 2 2 1 1 1 1 2 1 1 1 1
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 1 2 2 1 1 1 1 2 2 2 1 1 2 2 2 1 2 2 1 1
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
## 2 1 2 2 2 1 2 2 1 2 2 1 2 1 2 2 2 1 1 1
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
## 2 1 2 2 1 2 1 1 2 2 2 2 1 1 2 2 1 1 1 1
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
## 2 2 2 2 2 1 1 1 1 1 1 1 2 2 2 1 2 2 2 1
## 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
## 1 2 2 2 1 1 1 2 1 1 1 1 1 1 2 2 1 2 2 2
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## 2 2 2 2 2 2 2 2 2 2 2 1 2 2 1 2 2 1 2 1
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
## 2 2 2 1 1 1 1 2 1 1 1 1 1 2 2 1 2 1 2 1
table(clara_clusters$clustering)
##
## 1 2
## 197 203
fviz_pca_biplot(pcacl, label="var", habillage=as.factor(clara_clusterspca$clustering)) +
labs(color=NULL) + ggtitle("") +
theme(text = element_text(size = 15),
panel.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black"),
legend.key = element_rect(fill = "white"))

#segundo muestreo
pcacl2 <- prcomp(dfc[201:400,2:6], scale=TRUE)
df.pca2 <- pcacl$x
clara_clusterspca2 <- clara(x = df.pca2, k = 2, metric = "manhattan", stand = TRUE,
samples = 60, pamLike = TRUE)
clara_clusterspca2$sample
## [1] "4" "8" "12" "14" "19" "27" "33" "41" "43" "44" "53" "55"
## [13] "56" "58" "62" "63" "70" "79" "87" "107" "115" "120" "121" "129"
## [25] "130" "135" "138" "139" "145" "149" "153" "154" "159" "165" "167" "171"
## [37] "173" "174" "176" "179" "181" "183" "185" "199"
clara_clusterspca2$medoids
## PC1 PC2 PC3 PC4 PC5
## 154 -0.718 0.317 -0.213 0.172 0.413
## 165 0.626 -0.224 -0.058 -0.161 -0.429
clara_clusterspca2$i.med
## [1] 154 165
clara_clusterspca2$clustering
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 1 2 2 1 2 2 2 2 2 2 2 2 2 1 2 1 1 1 1 1
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 1 1 2 1 1 2 2 2 1 2 2 2 1 2 1 1 1 1 2 1
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 2 2 2 1 2 2 1 1 2 2 2 1 1 1 1 2 1 1 1 1
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 1 2 2 1 1 1 1 2 2 2 1 1 2 2 2 1 2 2 1 1
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
## 2 1 2 2 2 1 2 2 1 2 2 1 2 1 2 2 2 1 1 1
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
## 2 1 2 2 1 2 1 1 2 2 2 2 1 1 2 2 1 1 1 1
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
## 2 2 2 2 2 1 1 1 1 1 1 1 2 2 2 1 2 2 2 1
## 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
## 1 2 2 2 1 1 1 2 1 1 1 1 1 1 2 2 1 2 2 2
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## 2 2 2 2 2 2 2 2 2 2 2 1 2 2 1 2 2 1 2 1
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
## 2 2 2 1 1 1 1 2 1 1 1 1 1 2 2 1 2 1 2 1
table(clara_clusterspca2$clustering)
##
## 1 2
## 94 106
fviz_pca_biplot(pcacl2, label="var", habillage=as.factor(clara_clusterspca2$clustering)) +
labs(color=NULL) + ggtitle("") +
theme(text = element_text(size = 15),
panel.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black"),
legend.key = element_rect(fill = "white"))
